From ae97174555cbaf059d2df45df3c80d7e67007fdd Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sun, 9 Jun 2019 21:38:42 +0200 Subject: [PATCH 04/10] Add opclass parameters to GiST tsvector_ops --- doc/src/sgml/textsearch.sgml | 9 +- src/backend/utils/adt/tsgistidx.c | 269 ++++++++++++++------------ src/include/catalog/pg_amproc.dat | 5 +- src/include/catalog/pg_proc.dat | 19 +- src/test/regress/expected/tsearch.out | 176 +++++++++++++++++ src/test/regress/sql/tsearch.sql | 45 +++++ 6 files changed, 392 insertions(+), 131 deletions(-) diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 40888a4d20..54b796ecf1 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -3637,7 +3637,7 @@ SELECT plainto_tsquery('supernovae stars'); text search - CREATE INDEX name ON table USING GIST (column); + CREATE INDEX name ON table USING GIST (column [ { DEFAULT | tsvector_ops } (siglen = number) ] ); @@ -3645,6 +3645,8 @@ SELECT plainto_tsquery('supernovae stars'); Creates a GiST (Generalized Search Tree)-based index. The column can be of tsvector or tsquery type. + Optional integer parameter siglen determines + signature length in bytes (see below for details). @@ -3668,7 +3670,10 @@ SELECT plainto_tsquery('supernovae stars'); to check the actual table row to eliminate such false matches. (PostgreSQL does this automatically when needed.) GiST indexes are lossy because each document is represented in the - index by a fixed-length signature. The signature is generated by hashing + index by a fixed-length signature. Signature length in bytes is determined + by the value of the optional integer parameter siglen. + Default signature length (when siglen is not specified) is + 124 bytes, maximal length is 484 bytes. The signature is generated by hashing each word into a single bit in an n-bit string, with all these bits OR-ed together to produce an n-bit document signature. 
When two words hash to the same bit position there will be a false match. If all words in diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c index 4f256260fd..91661cf8b8 100644 --- a/src/backend/utils/adt/tsgistidx.c +++ b/src/backend/utils/adt/tsgistidx.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/gist.h" +#include "access/reloptions.h" #include "access/tuptoaster.h" #include "port/pg_bitutils.h" #include "tsearch/ts_utils.h" @@ -22,17 +23,22 @@ #include "utils/pg_crc.h" -#define SIGLENINT 31 /* >121 => key will toast, so it will not work - * !!! */ +#define SIGLEN_DEFAULT (31 * 4) +#define SIGLEN_MAX (121 * 4) /* key will toast, so it will not work !!! */ -#define SIGLEN ( sizeof(int32) * SIGLENINT ) -#define SIGLENBIT (SIGLEN * BITS_PER_BYTE) +#define SIGLENBIT(siglen) ((siglen) * BITS_PER_BYTE) + +/* tsvector_ops opclass options */ +typedef struct GistTsVectorOptions +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int siglen; /* signature length */ +} GistTsVectorOptions; -typedef char BITVEC[SIGLEN]; typedef char *BITVECP; -#define LOOPBYTE \ - for(i=0;i> (i) & 0x01 ) @@ -40,8 +46,8 @@ typedef char *BITVECP; #define SETBIT(x,i) GETBYTE(x,i) |= ( 0x01 << ( (i) % BITS_PER_BYTE ) ) #define GETBIT(x,i) ( (GETBYTE(x,i) >> ( (i) % BITS_PER_BYTE )) & 0x01 ) -#define HASHVAL(val) (((unsigned int)(val)) % SIGLENBIT) -#define HASH(sign, val) SETBIT((sign), HASHVAL(val)) +#define HASHVAL(val, siglen) (((unsigned int)(val)) % SIGLENBIT(siglen)) +#define HASH(sign, val, siglen) SETBIT((sign), HASHVAL(val, siglen)) #define GETENTRY(vec,pos) ((SignTSVector *) DatumGetPointer((vec)->vector[(pos)].key)) @@ -65,13 +71,14 @@ typedef struct #define ISALLTRUE(x) ( ((SignTSVector*)(x))->flag & ALLISTRUE ) #define GTHDRSIZE ( VARHDRSZ + sizeof(int32) ) -#define CALCGTSIZE(flag, len) ( GTHDRSIZE + ( ( (flag) & ARRKEY ) ? ((len)*sizeof(int32)) : (((flag) & ALLISTRUE) ? 
0 : SIGLEN) ) ) +#define CALCGTSIZE(flag, len) ( GTHDRSIZE + ( ( (flag) & ARRKEY ) ? ((len)*sizeof(int32)) : (((flag) & ALLISTRUE) ? 0 : (len)) ) ) #define GETSIGN(x) ( (BITVECP)( (char*)(x)+GTHDRSIZE ) ) +#define GETSIGLEN(x)( VARSIZE(x) - GTHDRSIZE ) #define GETARR(x) ( (int32*)( (char*)(x)+GTHDRSIZE ) ) #define ARRNELEM(x) ( ( VARSIZE(x) - GTHDRSIZE )/sizeof(int32) ) -static int32 sizebitvec(BITVECP sign); +static int32 sizebitvec(BITVECP sign, int siglen); Datum gtsvectorin(PG_FUNCTION_ARGS) @@ -102,9 +109,10 @@ gtsvectorout(PG_FUNCTION_ARGS) sprintf(outbuf, ARROUTSTR, (int) ARRNELEM(key)); else { - int cnttrue = (ISALLTRUE(key)) ? SIGLENBIT : sizebitvec(GETSIGN(key)); + int siglen = GETSIGLEN(key); + int cnttrue = (ISALLTRUE(key)) ? SIGLENBIT(siglen) : sizebitvec(GETSIGN(key), siglen); - sprintf(outbuf, SINGOUTSTR, cnttrue, (int) SIGLENBIT - cnttrue); + sprintf(outbuf, SINGOUTSTR, cnttrue, (int) SIGLENBIT(siglen) - cnttrue); } PG_FREE_IF_COPY(key, 0); @@ -148,36 +156,49 @@ uniqueint(int32 *a, int32 l) } static void -makesign(BITVECP sign, SignTSVector *a) +makesign(BITVECP sign, SignTSVector *a, int siglen) { int32 k, len = ARRNELEM(a); int32 *ptr = GETARR(a); - MemSet((void *) sign, 0, sizeof(BITVEC)); + MemSet((void *) sign, 0, siglen); for (k = 0; k < len; k++) - HASH(sign, ptr[k]); + HASH(sign, ptr[k], siglen); +} + +static SignTSVector * +gtsvector_alloc(int flag, int len, BITVECP sign) +{ + int size = CALCGTSIZE(flag, len); + SignTSVector *res = palloc(size); + + SET_VARSIZE(res, size); + res->flag = flag; + + if ((flag & (SIGNKEY | ALLISTRUE)) == SIGNKEY && sign) + memcpy(GETSIGN(res), sign, len); + + return res; } + Datum gtsvector_compress(PG_FUNCTION_ARGS) { GISTENTRY *entry = (GISTENTRY *) PG_GETARG_POINTER(0); + int siglen = ((GistTsVectorOptions *) PG_GETARG_POINTER(1))->siglen; GISTENTRY *retval = entry; if (entry->leafkey) { /* tsvector */ - SignTSVector *res; TSVector val = DatumGetTSVector(entry->key); + SignTSVector *res = 
gtsvector_alloc(ARRKEY, val->size, NULL); int32 len; int32 *arr; WordEntry *ptr = ARRPTR(val); char *words = STRPTR(val); - len = CALCGTSIZE(ARRKEY, val->size); - res = (SignTSVector *) palloc(len); - SET_VARSIZE(res, len); - res->flag = ARRKEY; arr = GETARR(res); len = val->size; while (len--) @@ -208,13 +229,9 @@ gtsvector_compress(PG_FUNCTION_ARGS) /* make signature, if array is too long */ if (VARSIZE(res) > TOAST_INDEX_TARGET) { - SignTSVector *ressign; + SignTSVector *ressign = gtsvector_alloc(SIGNKEY, siglen, NULL); - len = CALCGTSIZE(SIGNKEY, 0); - ressign = (SignTSVector *) palloc(len); - SET_VARSIZE(ressign, len); - ressign->flag = SIGNKEY; - makesign(GETSIGN(ressign), res); + makesign(GETSIGN(ressign), res, siglen); res = ressign; } @@ -226,22 +243,17 @@ gtsvector_compress(PG_FUNCTION_ARGS) else if (ISSIGNKEY(DatumGetPointer(entry->key)) && !ISALLTRUE(DatumGetPointer(entry->key))) { - int32 i, - len; + int32 i; SignTSVector *res; BITVECP sign = GETSIGN(DatumGetPointer(entry->key)); - LOOPBYTE + LOOPBYTE(siglen) { if ((sign[i] & 0xff) != 0xff) PG_RETURN_POINTER(retval); } - len = CALCGTSIZE(SIGNKEY | ALLISTRUE, 0); - res = (SignTSVector *) palloc(len); - SET_VARSIZE(res, len); - res->flag = SIGNKEY | ALLISTRUE; - + res = gtsvector_alloc(SIGNKEY | ALLISTRUE, siglen, sign); retval = (GISTENTRY *) palloc(sizeof(GISTENTRY)); gistentryinit(*retval, PointerGetDatum(res), entry->rel, entry->page, @@ -315,12 +327,14 @@ checkcondition_arr(void *checkval, QueryOperand *val, ExecPhraseData *data) static bool checkcondition_bit(void *checkval, QueryOperand *val, ExecPhraseData *data) { + void *key = (SignTSVector *) checkval; + /* * we are not able to find a prefix in signature tree */ if (val->prefix) return true; - return GETBIT(checkval, HASHVAL(val->valcrc)); + return GETBIT(GETSIGN(key), HASHVAL(val->valcrc, GETSIGLEN(key))); } Datum @@ -347,7 +361,7 @@ gtsvector_consistent(PG_FUNCTION_ARGS) /* since signature is lossy, cannot specify CALC_NOT here */ 
PG_RETURN_BOOL(TS_execute(GETQUERY(query), - (void *) GETSIGN(key), + key, TS_EXEC_PHRASE_NO_POS, checkcondition_bit)); } @@ -365,7 +379,7 @@ gtsvector_consistent(PG_FUNCTION_ARGS) } static int32 -unionkey(BITVECP sbase, SignTSVector *add) +unionkey(BITVECP sbase, SignTSVector *add, int siglen) { int32 i; @@ -376,7 +390,9 @@ unionkey(BITVECP sbase, SignTSVector *add) if (ISALLTRUE(add)) return 1; - LOOPBYTE + Assert(GETSIGLEN(add) == siglen); + + LOOPBYTE(siglen) sbase[i] |= sadd[i]; } else @@ -384,7 +400,7 @@ unionkey(BITVECP sbase, SignTSVector *add) int32 *ptr = GETARR(add); for (i = 0; i < ARRNELEM(add); i++) - HASH(sbase, ptr[i]); + HASH(sbase, ptr[i], siglen); } return 0; } @@ -395,30 +411,24 @@ gtsvector_union(PG_FUNCTION_ARGS) { GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); int *size = (int *) PG_GETARG_POINTER(1); - BITVEC base; - int32 i, - len; - int32 flag = 0; - SignTSVector *result; + int siglen = ((GistTsVectorOptions *) PG_GETARG_POINTER(2))->siglen; + SignTSVector *result = gtsvector_alloc(SIGNKEY, siglen, NULL); + BITVECP base = GETSIGN(result); + int32 i; + + memset(base, 0, siglen); - MemSet((void *) base, 0, sizeof(BITVEC)); for (i = 0; i < entryvec->n; i++) { - if (unionkey(base, GETENTRY(entryvec, i))) + if (unionkey(base, GETENTRY(entryvec, i), siglen)) { - flag = ALLISTRUE; + result->flag |= ALLISTRUE; + SET_VARSIZE(result, CALCGTSIZE(result->flag, siglen)); break; } } - flag |= SIGNKEY; - len = CALCGTSIZE(flag, 0); - result = (SignTSVector *) palloc(len); - *size = len; - SET_VARSIZE(result, len); - result->flag = flag; - if (!ISALLTRUE(result)) - memcpy((void *) GETSIGN(result), (void *) base, sizeof(BITVEC)); + *size = VARSIZE(result); PG_RETURN_POINTER(result); } @@ -429,6 +439,7 @@ gtsvector_same(PG_FUNCTION_ARGS) SignTSVector *a = (SignTSVector *) PG_GETARG_POINTER(0); SignTSVector *b = (SignTSVector *) PG_GETARG_POINTER(1); bool *result = (bool *) PG_GETARG_POINTER(2); + int siglen = ((GistTsVectorOptions *) 
PG_GETARG_POINTER(3))->siglen; if (ISSIGNKEY(a)) { /* then b also ISSIGNKEY */ @@ -444,8 +455,10 @@ gtsvector_same(PG_FUNCTION_ARGS) BITVECP sa = GETSIGN(a), sb = GETSIGN(b); + Assert(GETSIGLEN(a) == siglen && GETSIGLEN(b) == siglen); + *result = true; - LOOPBYTE + LOOPBYTE(siglen) { if (sa[i] != sb[i]) { @@ -482,19 +495,19 @@ gtsvector_same(PG_FUNCTION_ARGS) } static int32 -sizebitvec(BITVECP sign) +sizebitvec(BITVECP sign, int siglen) { - return pg_popcount(sign, SIGLEN); + return pg_popcount(sign, siglen); } static int -hemdistsign(BITVECP a, BITVECP b) +hemdistsign(BITVECP a, BITVECP b, int siglen) { int i, diff, dist = 0; - LOOPBYTE + LOOPBYTE(siglen) { diff = (unsigned char) (a[i] ^ b[i]); /* Using the popcount functions here isn't likely to win */ @@ -506,17 +519,22 @@ hemdistsign(BITVECP a, BITVECP b) static int hemdist(SignTSVector *a, SignTSVector *b) { + int siglena = GETSIGLEN(a); + int siglenb = GETSIGLEN(b); + if (ISALLTRUE(a)) { if (ISALLTRUE(b)) return 0; else - return SIGLENBIT - sizebitvec(GETSIGN(b)); + return SIGLENBIT(siglenb) - sizebitvec(GETSIGN(b), siglenb); } else if (ISALLTRUE(b)) - return SIGLENBIT - sizebitvec(GETSIGN(a)); + return SIGLENBIT(siglena) - sizebitvec(GETSIGN(a), siglena); - return hemdistsign(GETSIGN(a), GETSIGN(b)); + Assert(siglena == siglenb); + + return hemdistsign(GETSIGN(a), GETSIGN(b), siglena); } Datum @@ -525,6 +543,7 @@ gtsvector_penalty(PG_FUNCTION_ARGS) GISTENTRY *origentry = (GISTENTRY *) PG_GETARG_POINTER(0); /* always ISSIGNKEY */ GISTENTRY *newentry = (GISTENTRY *) PG_GETARG_POINTER(1); float *penalty = (float *) PG_GETARG_POINTER(2); + int siglen = ((GistTsVectorOptions *) PG_GETARG_POINTER(3))->siglen; SignTSVector *origval = (SignTSVector *) DatumGetPointer(origentry->key); SignTSVector *newval = (SignTSVector *) DatumGetPointer(newentry->key); BITVECP orig = GETSIGN(origval); @@ -533,14 +552,22 @@ gtsvector_penalty(PG_FUNCTION_ARGS) if (ISARRKEY(newval)) { - BITVEC sign; + BITVECP sign = palloc(siglen); - 
makesign(sign, newval); + makesign(sign, newval, siglen); if (ISALLTRUE(origval)) - *penalty = ((float) (SIGLENBIT - sizebitvec(sign))) / (float) (SIGLENBIT + 1); + { + int siglenbit = SIGLENBIT(siglen); + + *penalty = + (float) (siglenbit - sizebitvec(sign, siglen)) / + (float) (siglenbit + 1); + } else - *penalty = hemdistsign(sign, orig); + *penalty = hemdistsign(sign, orig, siglen); + + pfree(sign); } else *penalty = hemdist(origval, newval); @@ -550,19 +577,19 @@ gtsvector_penalty(PG_FUNCTION_ARGS) typedef struct { bool allistrue; - BITVEC sign; + BITVECP sign; } CACHESIGN; static void -fillcache(CACHESIGN *item, SignTSVector *key) +fillcache(CACHESIGN *item, SignTSVector *key, int siglen) { item->allistrue = false; if (ISARRKEY(key)) - makesign(item->sign, key); + makesign(item->sign, key, siglen); else if (ISALLTRUE(key)) item->allistrue = true; else - memcpy((void *) item->sign, (void *) GETSIGN(key), sizeof(BITVEC)); + memcpy((void *) item->sign, (void *) GETSIGN(key), siglen); } #define WISH_F(a,b,c) (double)( -(double)(((a)-(b))*((a)-(b))*((a)-(b)))*(c) ) @@ -586,19 +613,19 @@ comparecost(const void *va, const void *vb) static int -hemdistcache(CACHESIGN *a, CACHESIGN *b) +hemdistcache(CACHESIGN *a, CACHESIGN *b, int siglen) { if (a->allistrue) { if (b->allistrue) return 0; else - return SIGLENBIT - sizebitvec(b->sign); + return SIGLENBIT(siglen) - sizebitvec(b->sign, siglen); } else if (b->allistrue) - return SIGLENBIT - sizebitvec(a->sign); + return SIGLENBIT(siglen) - sizebitvec(a->sign, siglen); - return hemdistsign(a->sign, b->sign); + return hemdistsign(a->sign, b->sign, siglen); } Datum @@ -606,6 +633,7 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) { GistEntryVector *entryvec = (GistEntryVector *) PG_GETARG_POINTER(0); GIST_SPLITVEC *v = (GIST_SPLITVEC *) PG_GETARG_POINTER(1); + int siglen = ((GistTsVectorOptions *) PG_GETARG_POINTER(2))->siglen; OffsetNumber k, j; SignTSVector *datum_l, @@ -625,6 +653,7 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) BITVECP 
ptr; int i; CACHESIGN *cache; + char *cache_sign; SPLITCOST *costvector; maxoff = entryvec->n - 2; @@ -633,16 +662,22 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) v->spl_right = (OffsetNumber *) palloc(nbytes); cache = (CACHESIGN *) palloc(sizeof(CACHESIGN) * (maxoff + 2)); - fillcache(&cache[FirstOffsetNumber], GETENTRY(entryvec, FirstOffsetNumber)); + cache_sign = palloc(siglen * (maxoff + 2)); + + for (j = 0; j < maxoff + 2; j++) + cache[j].sign = &cache_sign[siglen * j]; + + fillcache(&cache[FirstOffsetNumber], GETENTRY(entryvec, FirstOffsetNumber), + siglen); for (k = FirstOffsetNumber; k < maxoff; k = OffsetNumberNext(k)) { for (j = OffsetNumberNext(k); j <= maxoff; j = OffsetNumberNext(j)) { if (k == FirstOffsetNumber) - fillcache(&cache[j], GETENTRY(entryvec, j)); + fillcache(&cache[j], GETENTRY(entryvec, j), siglen); - size_waste = hemdistcache(&(cache[j]), &(cache[k])); + size_waste = hemdistcache(&(cache[j]), &(cache[k]), siglen); if (size_waste > waste) { waste = size_waste; @@ -664,44 +699,21 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) } /* form initial .. 
*/ - if (cache[seed_1].allistrue) - { - datum_l = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); - SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); - datum_l->flag = SIGNKEY | ALLISTRUE; - } - else - { - datum_l = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY, 0)); - SET_VARSIZE(datum_l, CALCGTSIZE(SIGNKEY, 0)); - datum_l->flag = SIGNKEY; - memcpy((void *) GETSIGN(datum_l), (void *) cache[seed_1].sign, sizeof(BITVEC)); - } - if (cache[seed_2].allistrue) - { - datum_r = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); - SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY | ALLISTRUE, 0)); - datum_r->flag = SIGNKEY | ALLISTRUE; - } - else - { - datum_r = (SignTSVector *) palloc(CALCGTSIZE(SIGNKEY, 0)); - SET_VARSIZE(datum_r, CALCGTSIZE(SIGNKEY, 0)); - datum_r->flag = SIGNKEY; - memcpy((void *) GETSIGN(datum_r), (void *) cache[seed_2].sign, sizeof(BITVEC)); - } - + datum_l = gtsvector_alloc(SIGNKEY | (cache[seed_1].allistrue ? ALLISTRUE : 0), + siglen, cache[seed_1].sign); + datum_r = gtsvector_alloc(SIGNKEY | (cache[seed_2].allistrue ? ALLISTRUE : 0), + siglen, cache[seed_2].sign); union_l = GETSIGN(datum_l); union_r = GETSIGN(datum_r); maxoff = OffsetNumberNext(maxoff); - fillcache(&cache[maxoff], GETENTRY(entryvec, maxoff)); + fillcache(&cache[maxoff], GETENTRY(entryvec, maxoff), siglen); /* sort before ... 
*/ costvector = (SPLITCOST *) palloc(sizeof(SPLITCOST) * maxoff); for (j = FirstOffsetNumber; j <= maxoff; j = OffsetNumberNext(j)) { costvector[j - 1].pos = j; - size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j])); - size_beta = hemdistcache(&(cache[seed_2]), &(cache[j])); + size_alpha = hemdistcache(&(cache[seed_1]), &(cache[j]), siglen); + size_beta = hemdistcache(&(cache[seed_2]), &(cache[j]), siglen); costvector[j - 1].cost = Abs(size_alpha - size_beta); } qsort((void *) costvector, maxoff, sizeof(SPLITCOST), comparecost); @@ -727,36 +739,34 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) if (ISALLTRUE(datum_l) && cache[j].allistrue) size_alpha = 0; else - size_alpha = SIGLENBIT - sizebitvec( - (cache[j].allistrue) ? GETSIGN(datum_l) : GETSIGN(cache[j].sign) - ); + size_alpha = SIGLENBIT(siglen) - + sizebitvec((cache[j].allistrue) ? GETSIGN(datum_l) : GETSIGN(cache[j].sign), siglen); } else - size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l)); + size_alpha = hemdistsign(cache[j].sign, GETSIGN(datum_l), siglen); if (ISALLTRUE(datum_r) || cache[j].allistrue) { if (ISALLTRUE(datum_r) && cache[j].allistrue) size_beta = 0; else - size_beta = SIGLENBIT - sizebitvec( - (cache[j].allistrue) ? GETSIGN(datum_r) : GETSIGN(cache[j].sign) - ); + size_beta = SIGLENBIT(siglen) - + sizebitvec((cache[j].allistrue) ? 
GETSIGN(datum_r) : GETSIGN(cache[j].sign), siglen); } else - size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r)); + size_beta = hemdistsign(cache[j].sign, GETSIGN(datum_r), siglen); if (size_alpha < size_beta + WISH_F(v->spl_nleft, v->spl_nright, 0.1)) { if (ISALLTRUE(datum_l) || cache[j].allistrue) { if (!ISALLTRUE(datum_l)) - MemSet((void *) GETSIGN(datum_l), 0xff, sizeof(BITVEC)); + MemSet((void *) GETSIGN(datum_l), 0xff, siglen); } else { ptr = cache[j].sign; - LOOPBYTE + LOOPBYTE(siglen) union_l[i] |= ptr[i]; } *left++ = j; @@ -767,12 +777,12 @@ gtsvector_picksplit(PG_FUNCTION_ARGS) if (ISALLTRUE(datum_r) || cache[j].allistrue) { if (!ISALLTRUE(datum_r)) - MemSet((void *) GETSIGN(datum_r), 0xff, sizeof(BITVEC)); + MemSet((void *) GETSIGN(datum_r), 0xff, siglen); } else { ptr = cache[j].sign; - LOOPBYTE + LOOPBYTE(siglen) union_r[i] |= ptr[i]; } *right++ = j; @@ -799,3 +809,20 @@ gtsvector_consistent_oldsig(PG_FUNCTION_ARGS) { return gtsvector_consistent(fcinfo); } + +Datum +gtsvector_options(PG_FUNCTION_ARGS) +{ + Datum raw_options = PG_GETARG_DATUM(0); + bool validate = PG_GETARG_BOOL(1); + relopt_int siglen = + { {"siglen", "signature length", 0, 0, 6, RELOPT_TYPE_INT }, + SIGLEN_DEFAULT, 1, SIGLEN_MAX }; + relopt_gen *optgen[] = { &siglen.gen }; + int offsets[] = { offsetof(GistTsVectorOptions, siglen) }; + GistTsVectorOptions *options = + parseAndFillLocalRelOptions(raw_options, optgen, offsets, 1, + sizeof(GistTsVectorOptions), validate); + + PG_RETURN_POINTER(options); +} diff --git a/src/include/catalog/pg_amproc.dat b/src/include/catalog/pg_amproc.dat index 020b7413cc..5ceee11ab1 100644 --- a/src/include/catalog/pg_amproc.dat +++ b/src/include/catalog/pg_amproc.dat @@ -458,7 +458,7 @@ amproc => 'gist_circle_distance' }, { amprocfamily => 'gist/tsvector_ops', amproclefttype => 'tsvector', amprocrighttype => 'tsvector', amprocnum => '1', - amproc => 'gtsvector_consistent(internal,tsvector,int2,oid,internal)' }, + amproc => 
'gtsvector_consistent(internal,tsvector,int2,oid,internal,internal)' }, { amprocfamily => 'gist/tsvector_ops', amproclefttype => 'tsvector', amprocrighttype => 'tsvector', amprocnum => '2', amproc => 'gtsvector_union' }, @@ -476,6 +476,9 @@ amproc => 'gtsvector_picksplit' }, { amprocfamily => 'gist/tsvector_ops', amproclefttype => 'tsvector', amprocrighttype => 'tsvector', amprocnum => '7', amproc => 'gtsvector_same' }, +{ amprocfamily => 'gist/tsvector_ops', amproclefttype => 'tsvector', + amprocrighttype => 'tsvector', amprocnum => '10', + amproc => 'gtsvector_options' }, { amprocfamily => 'gist/tsquery_ops', amproclefttype => 'tsquery', amprocrighttype => 'tsquery', amprocnum => '1', amproc => 'gtsquery_consistent(internal,tsquery,int2,oid,internal)' }, diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 87335248a0..82b51fc1bb 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -8569,30 +8569,35 @@ { oid => '3648', descr => 'GiST tsvector support', proname => 'gtsvector_compress', prorettype => 'internal', - proargtypes => 'internal', prosrc => 'gtsvector_compress' }, + proargtypes => 'internal internal', prosrc => 'gtsvector_compress' }, { oid => '3649', descr => 'GiST tsvector support', proname => 'gtsvector_decompress', prorettype => 'internal', - proargtypes => 'internal', prosrc => 'gtsvector_decompress' }, + proargtypes => 'internal internal', prosrc => 'gtsvector_decompress' }, { oid => '3650', descr => 'GiST tsvector support', proname => 'gtsvector_picksplit', prorettype => 'internal', - proargtypes => 'internal internal', prosrc => 'gtsvector_picksplit' }, + proargtypes => 'internal internal internal', prosrc => 'gtsvector_picksplit' }, { oid => '3651', descr => 'GiST tsvector support', proname => 'gtsvector_union', prorettype => 'gtsvector', - proargtypes => 'internal internal', prosrc => 'gtsvector_union' }, + proargtypes => 'internal internal internal', prosrc => 'gtsvector_union' }, 
{ oid => '3652', descr => 'GiST tsvector support', proname => 'gtsvector_same', prorettype => 'internal', - proargtypes => 'gtsvector gtsvector internal', prosrc => 'gtsvector_same' }, + proargtypes => 'gtsvector gtsvector internal internal', + prosrc => 'gtsvector_same' }, { oid => '3653', descr => 'GiST tsvector support', proname => 'gtsvector_penalty', prorettype => 'internal', - proargtypes => 'internal internal internal', prosrc => 'gtsvector_penalty' }, + proargtypes => 'internal internal internal internal', + prosrc => 'gtsvector_penalty' }, { oid => '3654', descr => 'GiST tsvector support', proname => 'gtsvector_consistent', prorettype => 'bool', - proargtypes => 'internal tsvector int2 oid internal', + proargtypes => 'internal tsvector int2 oid internal internal', prosrc => 'gtsvector_consistent' }, { oid => '3790', descr => 'GiST tsvector support (obsolete)', proname => 'gtsvector_consistent', prorettype => 'bool', proargtypes => 'internal gtsvector int4 oid internal', prosrc => 'gtsvector_consistent_oldsig' }, +{ oid => '3998', descr => 'GiST tsvector support', + proname => 'gtsvector_options', prorettype => 'internal', + proargtypes => 'internal bool', prosrc => 'gtsvector_options' }, { oid => '3656', descr => 'GIN tsvector support', proname => 'gin_extract_tsvector', prorettype => 'internal', diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index 6f61acc1ed..a1873dc722 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -260,6 +260,182 @@ SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme'; 508 (1 row) +-- Test siglen parameter of GiST tsvector_ops +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(foo=1)); +ERROR: unrecognized parameter "foo" +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=0)); +ERROR: value 0 out of bounds for option "siglen" +DETAIL: Valid values are between "1" and "484". 
+CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=485)); +ERROR: value 485 out of bounds for option "siglen" +DETAIL: Valid values are between "1" and "484". +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=100,foo='bar')); +ERROR: unrecognized parameter "foo" +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=100, siglen = 200)); +ERROR: parameter "siglen" specified more than once +CREATE INDEX wowidx2 ON test_tsvector USING gist (a tsvector_ops(siglen=1)); +\d test_tsvector + Table "public.test_tsvector" + Column | Type | Collation | Nullable | Default +--------+----------+-----------+----------+--------- + t | text | | | + a | tsvector | | | +Indexes: + "wowidx" gist (a) + "wowidx2" gist (a tsvector_ops (siglen='1')) + +DROP INDEX wowidx; +EXPLAIN (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on test_tsvector + Recheck Cond: (a @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on wowidx2 + Index Cond: (a @@ '''wr'' | ''qh'''::tsquery) +(5 rows) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; + count +------- + 158 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh'; + count +------- + 17 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt'; + count +------- + 6 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt'; + count +------- + 98 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; + count +------- + 23 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; + count +------- + 39 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; + count +------- + 494 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); + count +------- + 158 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme'; + count +------- + 0 
+(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme'; + count +------- + 508 +(1 row) + +DROP INDEX wowidx2; +CREATE INDEX wowidx ON test_tsvector USING gist (a DEFAULT(siglen=484)); +\d test_tsvector + Table "public.test_tsvector" + Column | Type | Collation | Nullable | Default +--------+----------+-----------+----------+--------- + t | text | | | + a | tsvector | | | +Indexes: + "wowidx" gist (a tsvector_ops (siglen='484')) + +EXPLAIN (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on test_tsvector + Recheck Cond: (a @@ '''wr'' | ''qh'''::tsquery) + -> Bitmap Index Scan on wowidx + Index Cond: (a @@ '''wr'' | ''qh'''::tsquery) +(5 rows) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; + count +------- + 158 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh'; + count +------- + 17 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt'; + count +------- + 6 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt'; + count +------- + 98 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; + count +------- + 23 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; + count +------- + 39 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; + count +------- + 494 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); + count +------- + 158 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme'; + count +------- + 0 +(1 row) + +SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme'; + count +------- + 508 +(1 row) + RESET enable_seqscan; RESET enable_indexscan; RESET enable_bitmapscan; diff --git a/src/test/regress/sql/tsearch.sql b/src/test/regress/sql/tsearch.sql index 637bfb3012..9aed780d0c 100644 --- a/src/test/regress/sql/tsearch.sql +++ 
b/src/test/regress/sql/tsearch.sql @@ -87,6 +87,51 @@ SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme'; SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme'; +-- Test siglen parameter of GiST tsvector_ops +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(foo=1)); +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=0)); +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=485)); +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=100,foo='bar')); +CREATE INDEX wowidx1 ON test_tsvector USING gist (a tsvector_ops(siglen=100, siglen = 200)); + +CREATE INDEX wowidx2 ON test_tsvector USING gist (a tsvector_ops(siglen=1)); + +\d test_tsvector + +DROP INDEX wowidx; + +EXPLAIN (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; + +SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; +SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh'; +SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt'; +SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt'; +SELECT count(*) FROM test_tsvector WHERE a @@ '(eq&yt)|(wr&qh)'; +SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); +SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme'; +SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme'; + +DROP INDEX wowidx2; + +CREATE INDEX wowidx ON test_tsvector USING gist (a DEFAULT(siglen=484)); + +\d test_tsvector + +EXPLAIN (costs off) SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; + +SELECT count(*) FROM test_tsvector WHERE a @@ 'wr|qh'; +SELECT count(*) FROM test_tsvector WHERE a @@ 'wr&qh'; +SELECT count(*) FROM test_tsvector WHERE a @@ 'eq&yt'; +SELECT count(*) FROM test_tsvector WHERE a @@ 'eq|yt'; +SELECT count(*) FROM test_tsvector WHERE a @@ 
'(eq&yt)|(wr&qh)'; +SELECT count(*) FROM test_tsvector WHERE a @@ '(eq|yt)&(wr|qh)'; +SELECT count(*) FROM test_tsvector WHERE a @@ 'w:*|q:*'; +SELECT count(*) FROM test_tsvector WHERE a @@ any ('{wr,qh}'); +SELECT count(*) FROM test_tsvector WHERE a @@ 'no_such_lexeme'; +SELECT count(*) FROM test_tsvector WHERE a @@ '!no_such_lexeme'; + RESET enable_seqscan; RESET enable_indexscan; RESET enable_bitmapscan; -- 2.20.1