Quick Links

Re: New Full Text Index using contrib/fulltextindex which now able to processing Traditional Chinese characters(Big5 encoding)

From:	"Christopher Kings-Lynne" <chriskl(at)familyhealth(dot)com(dot)au>
To:	"eggli" <anti_spawn(at)msa(dot)hinet(dot)net>, <pgsql-patches(at)postgresql(dot)org>
Subject:	Re: New Full Text Index using contrib/fulltextindex which now able to processing Traditional Chinese characters(Big5 encoding)
Date:	2002-07-10 03:03:53
Message-ID:	GNELIHDDFBOCMGBFGEFOKEAPCDAA.chriskl@familyhealth.com.au
Views:	Whole Thread \| Raw Message \| Download mbox \| Resend email
Thread:
Lists:	pgsql-patches

Hi eggli,

I'm currently working on another patch that has been submitted for
fulltextindex. I will try to merge both yours and Florian's changes for the
fulltextindex module and get it into CVS for 7.3.

Cheers,

Chris

> -----Original Message-----
> From: pgsql-patches-owner(at)postgresql(dot)org
> [mailto:pgsql-patches-owner(at)postgresql(dot)org]On Behalf Of eggli
> Sent: Tuesday, 9 July 2002 6:58 PM
> To: pgsql-patches(at)postgresql(dot)org
> Subject: [PATCHES] New Full Text Index using contrib/fulltextindex which
> now able to processing Traditional Chinese characters(Big5 encoding)
>
>
> Hi, all, I found that contrib/fulltextindex is unable to process multibyte
> characters, so I tried to make it suit for my mother language as
> Chinese, I
> believe it's able to process Unicode by wcrok(), but I'm lazy to complete
> it.;)
>
> USAGE:
>
> Just replace the fti.c from this mail on contrib/fulltextindex and re-make
> it.
>
> And the indexing/query way is most the same.
>
> CODE:
> /* The difference: breakup() and issleadbyte()
>
> #include "postgres.h"
>
> #include <ctype.h>
>
> #include "executor/spi.h"
> #include "commands/trigger.h"
>
> /*
> * Trigger function accepts variable number of arguments:
> *
> * 1. relation in which to store the substrings
> * 2. fields to extract substrings from
> *
> * The relation in which to insert *must* have the following layout:
> *
> * string varchar(#)
> * id oid
> *
> * where # is the largest size of the varchar columns being indexed
> *
> * Example:
> *
> * -- Create the SQL function based on the compiled shared object
> * create function fti() returns opaque as
> * '/usr/local/pgsql/lib/contrib/fti.so' language 'C';
> *
> * -- Create the FTI table
> * create table product_fti (string varchar(255), id oid);
> *
> * -- Create an index to assist string matches
> * create index product_fti_string_idx on product_fti (string);
> *
> * -- Create an index to assist trigger'd deletes
> * create index product_fti_id_idx on product_fti (id);
> *
> * -- Create an index on the product oid column to assist joins
> * -- between the fti table and the product table
> * create index product_oid_idx on product (oid);
> *
> * -- Create the trigger to perform incremental changes to the full text
> index.
> * create trigger product_fti_trig after update or insert or delete on
> product
> * for each row execute procedure fti(product_fti, title, artist);
> * ^^^^^^^^^^^
> * table where full text index is stored
> * ^^^^^^^^^^^^^
> * columns to index in the base table
> *
> * After populating 'product', try something like:
> *
> * SELECT DISTINCT(p.*) FROM product p, product_fti f1,
> product_fti f2 WHERE
> * f1.string ~ '^slippery' AND f2.string ~ '^wet' AND p.oid=f1.id AND
> p.oid=f2.id;
> *
> * To check that your indicies are being used correctly, make sure you
> * EXPLAIN SELECT ... your test query above.
> *
> * CHANGELOG
> * ---------
> *
> * august 3 2001
> * Extended fti function to accept more than one column as a
> * parameter and all specified columns are indexed. Changed
> * all uses of sprintf to snprintf. Made error messages more
> * consistent.
> *
> * march 4 1998 Changed breakup() to return less substrings. Only breakup
> * in word parts which are in turn shortened from the start
> * of the word (ie. word, ord, rd)
> * Did allocation of substring buffer outside of breakup()
> *
> * oct. 5 1997, fixed a bug in string breakup (where there are
> more nonalpha
> * characters between words then 1).
> *
> * oct 4-5 1997 implemented the thing, at least the basic functionallity
> * of it all....
> *
> * TODO
> * ----
> *
> * prevent generating duplicate words for an oid in the fti table
> * save a plan for deletes
> * create a function that will make the index *after* we have populated
> * the main table (probably first delete all contents to be sure there's
> * nothing in it, then re-populate the fti-table)
> *
> * can we do something with operator overloading or a seperate function
> * that can build the final query automatigally?
> */
>
> #define MAX_FTI_QUERY_LENGTH 8192
>
> extern Datum fti(PG_FUNCTION_ARGS);
> static char *breakup(char *, char *);
> static bool is_stopword(char *);
> static bool isleadbyte(unsigned char ch);
> static bool new_tuple = false;
>
>
> #ifdef USE_STOP_WORDS
>
> /* THIS LIST MUST BE IN SORTED ORDER, A BINARY SEARCH IS USED!!!! */
> char *StopWords[] = { /* list of words to skip in indexing */
> "no",
> "the",
> "yes"
> };
> #endif /* USE_STOP_WORDS */
>
> /* stuff for caching query-plans, stolen from contrib/spi/\*.c */
> typedef struct
> {
> char *ident;
> int nplans;
> void **splan;
> } EPlan;
>
> static EPlan *InsertPlans = NULL;
> static EPlan *DeletePlans = NULL;
> static int nInsertPlans = 0;
> static int nDeletePlans = 0;
>
> static EPlan *find_plan(char *ident, EPlan ** eplan, int *nplans);
>
> /***********************************************************************/
> PG_FUNCTION_INFO_V1(fti);
>
> Datum
> fti(PG_FUNCTION_ARGS)
> {
> TriggerData *trigdata;
> Trigger *trigger; /* to get trigger name */
> int nargs; /* # of arguments */
> char **args; /* arguments */
> char *relname; /* triggered relation name */
> Relation rel; /* triggered relation */
> char *indexname; /* name of table for substrings */
> HeapTuple rettuple = NULL;
> TupleDesc tupdesc; /* tuple description */
> bool isinsert = false;
> bool isdelete = false;
> int ret;
> char query[MAX_FTI_QUERY_LENGTH];
> Oid oid;
>
> /*
> * FILE *debug;
> */
>
> /*
> * debug = fopen("/dev/xconsole", "w"); fprintf(debug, "FTI: entered
> * function\n"); fflush(debug);
> */
>
> if (!CALLED_AS_TRIGGER(fcinfo))
> elog(ERROR, "Full Text Indexing: Not fired by trigger manager");
>
> /* It's safe to cast now that we've checked */
> trigdata = (TriggerData *) fcinfo->context;
>
> if (TRIGGER_FIRED_FOR_STATEMENT(trigdata->tg_event))
> elog(ERROR, "Full Text Indexing: Can't process STATEMENT events");
> if (TRIGGER_FIRED_BEFORE(trigdata->tg_event))
> elog(ERROR, "Full Text Indexing: Must be fired AFTER event");
>
> if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event))
> isinsert = true;
> if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event))
> {
> isdelete = true;
> isinsert = true;
> }
> if (TRIGGER_FIRED_BY_DELETE(trigdata->tg_event))
> isdelete = true;
>
> trigger = trigdata->tg_trigger;
> rel = trigdata->tg_relation;
> relname = SPI_getrelname(rel);
> rettuple = trigdata->tg_trigtuple;
> if (isdelete && isinsert) /* is an UPDATE */
> rettuple = trigdata->tg_newtuple;
>
> if ((ret = SPI_connect()) < 0)
> elog(ERROR, "Full Text Indexing: SPI_connect: Failed, returned %d\n",
> ret);
>
> nargs = trigger->tgnargs;
> if (nargs < 2)
> elog(ERROR, "Full Text Indexing: Trigger must have at least 2
> arguments\n");
>
> args = trigger->tgargs;
> indexname = args[0];
> tupdesc = rel->rd_att; /* what the tuple looks like (?) */
>
> /* get oid of current tuple, needed by all, so place here */
> oid = rettuple->t_data->t_oid;
> if (!OidIsValid(oid))
> elog(ERROR, "Full Text Indexing: Oid of current tuple is invalid");
>
> if (isdelete)
> {
> void *pplan;
> Oid *argtypes;
> Datum values[1];
> EPlan *plan;
> int i;
>
> snprintf(query, MAX_FTI_QUERY_LENGTH, "D%s", indexname);
> for (i = 1; i < nargs; i++)
> snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
>
> plan = find_plan(query, &DeletePlans, &nDeletePlans);
> if (plan->nplans <= 0)
> {
> argtypes = (Oid *) palloc(sizeof(Oid));
>
> argtypes[0] = OIDOID;
>
> snprintf(query, MAX_FTI_QUERY_LENGTH, "DELETE FROM %s WHERE id = $1",
> indexname);
> pplan = SPI_prepare(query, 1, argtypes);
> if (!pplan)
> elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL
> in delete");
> pplan = SPI_saveplan(pplan);
> if (pplan == NULL)
> elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in
> delete");
>
> plan->splan = (void **) malloc(sizeof(void *));
> *(plan->splan) = pplan;
> plan->nplans = 1;
> }
>
> values[0] = oid;
>
> ret = SPI_execp(*(plan->splan), values, NULL, 0);
> if (ret != SPI_OK_DELETE)
> elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in
> delete");
> }
>
> if (isinsert)
> {
> char *substring;
> char *column;
> void *pplan;
> Oid *argtypes;
> Datum values[2];
> int colnum;
> struct varlena *data;
> EPlan *plan;
> int i;
> char *buff;
> char *string;
>
> snprintf(query, MAX_FTI_QUERY_LENGTH, "I%s", indexname);
> for (i = 1; i < nargs; i++)
> snprintf(query, MAX_FTI_QUERY_LENGTH, "%s$%s", query, args[i]);
>
> plan = find_plan(query, &InsertPlans, &nInsertPlans);
>
> /* no plan yet, so allocate mem for argtypes */
> if (plan->nplans <= 0)
> {
> argtypes = (Oid *) palloc(2 * sizeof(Oid));
>
> argtypes[0] = VARCHAROID; /* create table t_name (string
> * varchar, */
> argtypes[1] = OIDOID; /* id oid); */
>
> /* prepare plan to gain speed */
> snprintf(query, MAX_FTI_QUERY_LENGTH, "INSERT INTO %s (string,
> id) VALUES
> ($1, $2)",
> indexname);
> pplan = SPI_prepare(query, 2, argtypes);
> if (!pplan)
> elog(ERROR, "Full Text Indexing: SPI_prepare: Returned NULL
> in insert");
>
> pplan = SPI_saveplan(pplan);
> if (pplan == NULL)
> elog(ERROR, "Full Text Indexing: SPI_saveplan: Returned NULL in
> insert");
>
> plan->splan = (void **) malloc(sizeof(void *));
> *(plan->splan) = pplan;
> plan->nplans = 1;
> }
>
> /* prepare plan for query */
> for (i = 0; i < nargs - 1; i++)
> {
> colnum = SPI_fnumber(tupdesc, args[i + 1]);
> if (colnum == SPI_ERROR_NOATTRIBUTE)
> elog(ERROR, "Full Text Indexing: SPI_fnumber: Column '%s' of '%s' not
> found", args[i + 1], indexname);
>
> /* Get the char* representation of the column */
> column = SPI_getvalue(rettuple, tupdesc, colnum);
>
> /* make sure we don't try to index NULL's */
> if (column)
> {
> string = column;
> while (*string != '\0')
> {
> *string = tolower((unsigned char) *string);
> string++;
> }
>
> data = (struct varlena *) palloc(sizeof(int32) + strlen(column) +1);
> buff = palloc(strlen(column) + 1);
> /* saves lots of calls in while-loop and in breakup() */
>
> new_tuple = true;
>
> while ((substring = breakup(column, buff)))
> {
> int l;
>
> l = strlen(substring);
>
> data->vl_len = l + sizeof(int32);
> memcpy(VARDATA(data), substring, l);
> values[0] = PointerGetDatum(data);
> values[1] = oid;
>
> ret = SPI_execp(*(plan->splan), values, NULL, 0);
> if (ret != SPI_OK_INSERT)
> elog(ERROR, "Full Text Indexing: SPI_execp: Error executing plan in
> insert");
> }
> pfree(buff);
> pfree(data);
> }
> }
> }
>
> SPI_finish();
> return PointerGetDatum(rettuple);
> }
>
> static char *
> breakup(char *string, char *substring)
> {
> static char *last_start;
> static char *cur_pos;
> static char *string_end;
>
> if (new_tuple)
> {
> string_end = &string[strlen(string)-1];
> cur_pos = last_start = &string[0];
> new_tuple = false;/* don't initialize this next time */
> }
>
> while (cur_pos <= string_end)/* don't read after end of 'string' */
> {
> if (isleadbyte((unsigned)*cur_pos )) {
> /* Bingo, got a Big-5 word (2 bytes) */
> cur_pos += 2;
> memcpy(substring, last_start, cur_pos - last_start);
> substring[cur_pos - last_start] = '\0';
> if (!is_stopword(substring))
> return substring;
> } else if (isalnum((unsigned char) *cur_pos)) {
> /* Houston, we have a substring! :) */
> cur_pos++;
> memcpy(substring, last_start, cur_pos - last_start);
> substring[cur_pos - last_start] = '\0';
> if (!is_stopword(substring))
> return substring;
> } else {
> last_start = cur_pos + 1;
> cur_pos = last_start;
> }
> }
>
> return NULL;/* we've processed all of 'string' */
> }
>
> bool isleadbyte(unsigned char ch)
> {
> if ((ch >= 0xA1) && (ch <= 0xFE))
> return true;
>
> if ((ch >= 0x8E) && (ch <= 0xA0))
> return true;
>
> if ((ch >= 0x81) && (ch <= 0x8D))
> return true;
>
> return false;
> }
> /* copied from src/backend/parser/keywords.c and adjusted for our
> situation*/
> static bool
> is_stopword(char *text)
> {
> #ifdef USE_STOP_WORDS
> char **StopLow; /* for list of stop-words */
> char **StopHigh;
> char **StopMiddle;
> int difference;
>
> StopLow = &StopWords[0]; /* initialize stuff for binary search */
> StopHigh = endof(StopWords);
>
> /* Loop invariant: *StopLow <= text < *StopHigh */
>
> while (StopLow < StopHigh)
> {
> StopMiddle = StopLow + (StopHigh - StopLow) / 2;
> difference = strcmp(*StopMiddle, text);
> if (difference == 0)
> return (true);
> else if (difference < 0)
> StopLow = StopMiddle + 1;
> else
> StopHigh = StopMiddle;
> }
> #endif /* USE_STOP_WORDS */
>
> return (false);
> }
>
> /* for caching of query plans, stolen from contrib/spi/\*.c */
> static EPlan *
> find_plan(char *ident, EPlan ** eplan, int *nplans)
> {
> EPlan *newp;
> int i;
>
> if (*nplans > 0)
> {
> for (i = 0; i < *nplans; i++)
> {
> if (strcmp((*eplan)[i].ident, ident) == 0)
> break;
> }
> if (i != *nplans)
> return (*eplan + i);
> *eplan = (EPlan *) realloc(*eplan, (i + 1) * sizeof(EPlan));
> newp = *eplan + i;
> }
> else
> {
> newp = *eplan = (EPlan *) malloc(sizeof(EPlan));
> (*nplans) = i = 0;
> }
>
> newp->ident = (char *) malloc(strlen(ident) + 1);
> strcpy(newp->ident, ident);
> newp->nplans = 0;
> newp->splan = NULL;
> (*nplans)++;
>
> return (newp);
> }
>
>
>
>
>
>
> ---------------------------(end of broadcast)---------------------------
> TIP 4: Don't 'kill -9' the postmaster
>

In response to

New Full Text Index using contrib/fulltextindex which now able to processing Traditional Chinese characters(Big5 encoding) at 2002-07-09 10:58:00 from eggli

Browse pgsql-patches by date

	From	Date	Subject
Next Message	Bruce Momjian	2002-07-10 04:05:13	Re: implementing query timeout
Previous Message	Christopher Kings-Lynne	2002-07-10 02:13:18	Re: Between Node