New Full Text Index using contrib/fulltextindex which now able to processing Traditional Chinese characters(Big5 encoding)

From: "eggli" <egg(dot)li(at)msa(dot)hinet(dot)net>
To: pgsql-patches(at)postgresql(dot)org
Subject: New Full Text Index using contrib/fulltextindex which now able to processing Traditional Chinese characters(Big5 encoding)
Date: 2002-07-09 10:58:00
Message-ID: agefko$24p1$1@news.hub.org
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-patches

Hi, all, I found that contrib/fulltextindex is unable to process multibyte
characters, so I tried to make it suit for my mother language as Chinese, I
believe it's able to process Unicode by wcrok(), but I'm lazy to complete
it.;)

USAGE:

Just replace the fti.c from this mail on contrib/fulltextindex and re-make
it.

And the indexing/query way is most the same.

CODE:
/* The difference: breakup() and issleadbyte()

#include "postgres.h"

#include <ctype.h>

#include "executor/spi.h"
#include "commands/trigger.h"

/*
* Trigger function accepts variable number of arguments:
*
* 1. relation in which to store the substrings
* 2. fields to extract substrings from
*
* The relation in which to insert *must* have the following layout:
*
* string varchar(#)
* id oid
*
* where # is the largest size of the varchar columns being indexed
*
* Example:
*
* -- Create the SQL function based on the compiled shared object
* create function fti() returns opaque as
* '/usr/local/pgsql/lib/contrib/fti.so' language 'C';
*
* -- Create the FTI table
* create table product_fti (string varchar(255), id oid);
*
* -- Create an index to assist string matches
* create index product_fti_string_idx on product_fti (string);
*
* -- Create an index to assist trigger'd deletes
* create index product_fti_id_idx on product_fti (id);
*
* -- Create an index on the product oid column to assist joins
* -- between the fti table and the product table
* create index product_oid_idx on product (oid);
*
* -- Create the trigger to perform incremental changes to the full text
index.
* create trigger product_fti_trig after update or insert or delete on
product
* for each row execute procedure fti(product_fti, title, artist);
* ^^^^^^^^^^^
* table where full text index is stored
* ^^^^^^^^^^^^^
* columns to index in the base table
*
* After populating 'product', try something like:
*
* SELECT DISTINCT(p.*) FROM product p, product_fti f1, product_fti f2 WHERE
* f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND
p.oid=f2.id;
*
* To check that your indicies are being used correctly, make sure you
* EXPLAIN SELECT ... your test query above.
*
* CHANGELOG
* ---------
*
* august 3 2001
* Extended fti function to accept more than one column as a
* parameter and all specified columns are indexed. Changed
* all uses of sprintf to snprintf. Made error messages more
* consistent.
*
* march 4 1998 Changed breakup() to return less substrings. Only breakup
* in word parts which are in turn shortened from the start
* of the word (ie. word, ord, rd)
* Did allocation of substring buffer outside of breakup()
*
* oct. 5 1997, fixed a bug in string breakup (where there are more nonalpha
* characters between words then 1).
*
* oct 4-5 1997 implemented the thing, at least the basic functionallity
* of it all....
*
* TODO
* ----
*
* prevent generating duplicate words for an oid in the fti table
* save a plan for deletes
* create a function that will make the index *after* we have populated
* the main table (probably first delete all contents to be sure there's
* nothing in it, then re-populate the fti-table)
*
* can we do something with operator overloading or a seperate function
* that can build the final query automatigally?
*/

#define MAX_FTI_QUERY_LENGTH 8192

extern Datum fti(PG_FUNCTION_ARGS);
static char *breakup(char *, char *);
static bool is_stopword(char *);
static bool isleadbyte(unsigned char ch);
static bool new_tuple = false;

#ifdef USE_STOP_WORDS

/* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! */
char *StopWords[] = { /* list of words to skip in indexing */
"no",
"the",
"yes"
};
#endif /* USE_STOP_WORDS */

/* stuff for caching query-plans, stolen from contrib/spi/\*.c */
typedef struct
{
char *ident;
int nplans;
void **splan;
} EPlan;

static EPlan *InsertPlans = NULL;
static EPlan *DeletePlans = NULL;
static int nInsertPlans = 0;
static int nDeletePlans = 0;

static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans);

/***********************************************************************/
PG_FUNCTION_INFO_V1(fti);

Datum
fti(PG_FUNCTION_ARGS)
{
TriggerData *trigdata;
Trigger *trigger; /* to get trigger name */
int nargs; /* # of arguments */
char **args; /* arguments */
char *relname; /* triggered relation name */
Relation rel; /* triggered relation */
char *indexname; /* name of table for substrings */
HeapTuple rettuple = NULL;
TupleDesc tupdesc; /* tuple description */
bool isinsert = false;
bool isdelete = false;
int ret;
char query[MAX_FTI_QUERY_LENGTH];
Oid oid;

/*
* FILE *debug;
*/

/*
* debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered
* function\n"); fflush(debug);
*/

if (!CALLED_AS_TRIGGER(fcinfo))
elog(ERROR, "Full Text Indexing: Not fired by trigger manager");

/* It's safe to cast now that we've checked */
trigdata = (TriggerData *) fcinfo->context;

if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
elog(ERROR, "Full Text Indexing: Can't process STATEMENT events");
if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
elog(ERROR, "Full Text Indexing: Must be fired AFTER event");

if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
isinsert = true;
if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
{
isdelete = true;
isinsert = true;
}
if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event))
isdelete = true;

trigger = trigdata->tg_trigger;
rel = trigdata->tg_relation;
relname = SPI_getrelname(rel);
rettuple = trigdata->tg_trigtuple;
if (isdelete && isinsert) /* is an UPDATE */
rettuple = trigdata->tg_newtuple;

if ((ret = SPI_connect()) < 0)
elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n",
ret);

nargs = trigger->tgnargs;
if (nargs < 2)
elog(ERROR, "Full Text Indexing: Trigger must have at least 2
arguments\n");

args = trigger->tgargs;
indexname = args[0];
tupdesc = rel->rd_att; /* what the tuple looks like (?) */

/* get oid of current tuple, needed by all, so place here */
oid = rettuple->t_data->t_oid;
if (!OidIsValid(oid))
elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid");

if (isdelete)
{
void *pplan;
Oid *argtypes;
Datum values[1];
EPlan *plan;
int i;

snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname);
for (i = 1; i < nargs; i++)
snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);

plan = find_plan(query, &DeletePlans, &nDeletePlans);
if (plan->nplans <= 0)
{
argtypes = (Oid *) palloc(sizeof(Oid));

argtypes[0] = OIDOID;

snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1",
indexname);
pplan = SPI_prepare(query, 1, argtypes);
if (!pplan)
elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in delete");
pplan = SPI_saveplan(pplan);
if (pplan == NULL)
elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in
delete");

plan->splan = (void **) malloc(sizeof(void *));
*(plan->splan) = pplan;
plan->nplans = 1;
}

values[0] = oid;

ret = SPI_execp(*(plan->splan), values, NULL, 0);
if (ret != SPI_OK_DELETE)
elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in
delete");
}

if (isinsert)
{
char *substring;
char *column;
void *pplan;
Oid *argtypes;
Datum values[2];
int colnum;
struct varlena *data;
EPlan *plan;
int i;
char *buff;
char *string;

snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname);
for (i = 1; i < nargs; i++)
snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);

plan = find_plan(query, &InsertPlans, &nInsertPlans);

/* no plan yet, so allocate mem for argtypes */
if (plan->nplans <= 0)
{
argtypes = (Oid *) palloc(2 * sizeof(Oid));

argtypes[0] = VARCHAROID; /* create table t_name (string
* varchar, */
argtypes[1] = OIDOID; /* id oid); */

/* prepare plan to gain speed */
snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string, id) VALUES
($1, $2)",
indexname);
pplan = SPI_prepare(query, 2, argtypes);
if (!pplan)
elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL in insert");

pplan = SPI_saveplan(pplan);
if (pplan == NULL)
elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in
insert");

plan->splan = (void **) malloc(sizeof(void *));
*(plan->splan) = pplan;
plan->nplans = 1;
}

/* prepare plan for query */
for (i = 0; i < nargs - 1; i++)
{
colnum = SPI_fnumber(tupdesc, args[i + 1]);
if (colnum == SPI_ERROR_NOATTRIBUTE)
elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not
found", args[i + 1], indexname);

/* Get the char* representation of the column */
column = SPI_getvalue(rettuple, tupdesc, colnum);

/* make sure we don't try to index NULL's */
if (column)
{
string = column;
while (*string != '\0')
{
*string = tolower((unsigned char) *string);
string++;
}

data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1);
buff = palloc(strlen(column) + 1);
/* saves lots of calls in while-loop and in breakup() */

new_tuple = true;

while ((substring = breakup(column, buff)))
{
int l;

l = strlen(substring);

data->vl_len = l + sizeof(int32);
memcpy(VARDATA(data), substring, l);
values[0] = PointerGetDatum(data);
values[1] = oid;

ret = SPI_execp(*(plan->splan), values, NULL, 0);
if (ret != SPI_OK_INSERT)
elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in
insert");
}
pfree(buff);
pfree(data);
}
}
}

SPI_finish();
return PointerGetDatum(rettuple);
}

static char *
breakup(char *string, char *substring)
{
static char *last_start;
static char *cur_pos;
static char *string_end;

if (new_tuple)
{
string_end = &string[strlen(string)-1];
cur_pos = last_start = &string[0];
new_tuple = false;/* don't initialize this next time */
}

while (cur_pos <= string_end)/* don't read after end of 'string' */
{
if (isleadbyte((unsigned)*cur_pos )) {
/* Bingo, got a Big-5 word (2 bytes) */
cur_pos += 2;
memcpy(substring, last_start, cur_pos - last_start);
substring[cur_pos - last_start] = '\0';
if (!is_stopword(substring))
return substring;
} else if (isalnum((unsigned char) *cur_pos)) {
/* Houston, we have a substring! :) */
cur_pos++;
memcpy(substring, last_start, cur_pos - last_start);
substring[cur_pos - last_start] = '\0';
if (!is_stopword(substring))
return substring;
} else {
last_start = cur_pos + 1;
cur_pos = last_start;
}
}

return NULL;/* we've processed all of 'string' */
}

bool isleadbyte(unsigned char ch)
{
if ((ch >= 0xA1) && (ch <= 0xFE))
return true;

if ((ch >= 0x8E) && (ch <= 0xA0))
return true;

if ((ch >= 0x81) && (ch <= 0x8D))
return true;

return false;
}
/* copied from src/backend/parser/keywords.c and adjusted for our
situation*/
static bool
is_stopword(char *text)
{
#ifdef USE_STOP_WORDS
char **StopLow; /* for list of stop-words */
char **StopHigh;
char **StopMiddle;
int difference;

StopLow = &StopWords[0]; /* initialize stuff for binary search */
StopHigh = endof(StopWords);

/* Loop invariant: *StopLow <= text < *StopHigh */

while (StopLow < StopHigh)
{
StopMiddle = StopLow + (StopHigh - StopLow) / 2;
difference = strcmp(*StopMiddle, text);
if (difference == 0)
return (true);
else if (difference < 0)
StopLow = StopMiddle + 1;
else
StopHigh = StopMiddle;
}
#endif /* USE_STOP_WORDS */

return (false);
}

/* for caching of query plans, stolen from contrib/spi/\*.c */
static EPlan *
find_plan(char *ident, EPlan ** eplan, int *nplans)
{
EPlan *newp;
int i;

if (*nplans > 0)
{
for (i = 0; i < *nplans; i++)
{
if (strcmp((*eplan)[i].ident, ident) == 0)
break;
}
if (i != *nplans)
return (*eplan + i);
*eplan = (EPlan *) realloc(*eplan, (i + 1) * sizeof(EPlan));
newp = *eplan + i;
}
else
{
newp = *eplan = (EPlan *) malloc(sizeof(EPlan));
(*nplans) = i = 0;
}

newp->ident = (char *) malloc(strlen(ident) + 1);
strcpy(newp->ident, ident);
newp->nplans = 0;
newp->splan = NULL;
(*nplans)++;

return (newp);
}

Responses

Browse pgsql-patches by date

  From Date Subject
Next Message J. R. Nield 2002-07-09 19:56:27 First group of logging changes for PITR
Previous Message Andrew Sullivan 2002-07-08 16:44:18 Re: Some Solaris notes, and an invitation