diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile index e20a4a2..ba728eb 100644 --- a/src/backend/tsearch/Makefile +++ b/src/backend/tsearch/Makefile @@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \ OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ dict_ispell.o regis.o spell.o \ - to_tsany.o ts_typanalyze.o ts_utils.o + to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c new file mode 100644 index 0000000..0fadc60 --- /dev/null +++ b/src/backend/tsearch/ts_selfuncs.c @@ -0,0 +1,320 @@ +/*------------------------------------------------------------------------- + * + * ts_selfuncs.c + * Selectivity functions for text search types. + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" /* for check_stack_depth() */ +#include "utils/memutils.h" +#include "utils/builtins.h" +#include "utils/syscache.h" +#include "utils/lsyscache.h" +#include "utils/selfuncs.h" +#include "catalog/pg_type.h" +#include "catalog/pg_statistic.h" +#include "nodes/nodes.h" +#include "tsearch/ts_type.h" + +/* lookup table type for binary searching through MCELEMs */ +typedef struct +{ + Datum element; + float4 frequency; +} TextFreq; + +/* type of keys for bsearch()ing through an array of TextFreqs */ +typedef struct +{ + char *lexeme; + int length; +} LexemeKey; + +static int +compare_lexeme_textfreq(const void *e1, const void *e2); + +static Selectivity +tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup, + int length, float4 minfreq); +static Selectivity +mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers); +static double +tsquerysel(VariableStatData *vardata, Datum constval); + + +/* TSQuery traversal function */ +static Selectivity +tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup, + int length, float4 minfreq) +{ + LexemeKey key; + TextFreq *searchres; + Selectivity s1, s2; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (item->type == QI_VAL) + { + QueryOperand *oper = (QueryOperand *) item; + + /* + * Prepare the key for bsearch(). + */ + key.lexeme = operand + oper->distance; + key.length = oper->length; + + searchres = (TextFreq *) bsearch(&key, lookup, length, + sizeof(TextFreq), compare_lexeme_textfreq); + + if (searchres) + { + /* + * The element is in MCELEM. Return precise selectivity (or at + * least as precise, as ANALYZE could find out). + */ + return (Selectivity) searchres->frequency; + } + else + { + /* + * The element is not in MCELEM. Punt, but assert that the + * selectivity cannot be more than minfreq / 2. + */ + return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2); + } + } + + /* Current TSQuery node is an operator */ + switch (item->operator.oper) + { + case OP_NOT: + return 1.0 - tsquery_opr_selec(item + 1, operand, lookup, + length, minfreq); + + case OP_AND: + return + tsquery_opr_selec(item + 1, operand, lookup, length, minfreq) * + tsquery_opr_selec(item + item->operator.left, operand, lookup, + length, minfreq); + + case OP_OR: + s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq); + s2 = tsquery_opr_selec(item + item->operator.left, operand, lookup, + length, minfreq); + return s1 + s2 - s1 * s2; + + default: + elog(ERROR, "unrecognized operator: %d", item->operator.oper); + } + + /* never reached, keep compiler happy */ + return (Selectivity) DEFAULT_TS_SEL; +} + +/* + * Traverse the tsquery preorder, calculating selectivity as: + * + * selec(left_oper) * selec(right_oper) in AND nodes, + * + * selec(left_oper) + selec(right_oper) - + * selec(left_oper) * selec(right_oper) in OR nodes, + * + * 1 - select(oper) in NOT nodes + * + * freq[val] in VAL nodes, if the value is in MCELEM + * min(freq[MCELEM]) / 2 in VAL nodes, if it is not + * + * + * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use + * binary search for determining freq[MCELEM]. + */ +static Selectivity +mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers) +{ + float4 minfreq; + TextFreq *lookup; + Selectivity selec; + int i; + + /* Grab the lowest frequency */ + + /* + XXX This is no longer easy, need to go through the whole list. Plug it + with 2.0 for the moment (so that minfreq / 2 is 1.0) + + minfreq = numbers[nnumbers - 1]; + */ + minfreq = 2.0; + + /* Construct the array for binary search */ + Assert(nmcelem == nnumbers); + lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem); + for (i = 0; i < nmcelem; i++) + { + lookup[i].element = mcelem[i]; + lookup[i].frequency = numbers[i]; + } + + selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup, + nmcelem, minfreq); + + pfree(lookup); + + return selec; +} + +/* + * bsearch() comparator for a lexeme (non-NULL terminated string with length) + * and a TextFreq. Use length, then byte-for-byte comparision, because that's + * how ANALYZE code sorted data before storing it in a statistic tuple. + * See ts_typanalyze.c for details. + */ +static int +compare_lexeme_textfreq(const void *e1, const void *e2) +{ + const LexemeKey *key; + const TextFreq *t; + text *element; + int len1, + len2; + + key = (const LexemeKey *) e1; + t = (const TextFreq *) e2; + + len1 = key->length; + len2 = VARSIZE_ANY_EXHDR(t->element); + + /* Compare lengths first, possibly avoiding pointless detoasting */ + if (len1 > len2) + return 1; + else if (len1 < len2) + return -1; + + /* Fall back on byte-for-byte comparision, detoasting the Datum first */ + element = DatumGetTextP(t->element); + + return strncmp(key->lexeme, VARDATA_ANY(element), len1); +} + +/* + * tssel -- Selectivity of "@@" + */ +Datum +tssel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + /* We skip arg #2, which is the operator OID - it's gonna be "@@" */ + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + VariableStatData vardata; + Node *other; + bool varonleft; + Selectivity selec; + + /* + * If expression is not variable op something or something op variable, + * then punt and return a default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + { + PG_RETURN_FLOAT8(DEFAULT_TS_SEL); + } + + /* + * Can't do anything useful if the something is not a constant, either. + */ + if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_TS_SEL); + } + + /* The "@@" operator is strict, so might cope with NULL right away */ + if (((Const *) other)->constisnull) { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8((float8) 0.0); + } + + /* + * OK, there's a Var and a Const we're dealing with here. We need the Var + * to be a TSVector (or else we don't have any useful statistic for it). + */ + + if (vardata.vartype == TSVECTOROID) + { + /* tsvector @@ tsquery or the other way around */ + Assert(((Const *) other)->consttype == TSQUERYOID); + + selec = tsquerysel(&vardata, ((Const *) other)->constvalue); + } + else + { + /* The Var is something we don't have useful statistic for */ + selec = DEFAULT_TS_SEL; + } + + ReleaseVariableStats(vardata); + + PG_RETURN_FLOAT8((float8) selec); +} + +static Selectivity +tsquerysel(VariableStatData *vardata, Datum constval) +{ + Selectivity selec; + + if (HeapTupleIsValid(vardata->statsTuple)) + { + TSQuery query; + Form_pg_statistic stats; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + + /* The caller made sure the const is a TSQuery, so get it now */ + query = DatumGetTSQuery(constval); + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + + /* MCELEM will be an array of Text elements for a tsvector column */ + if (get_attstatsslot(vardata->statsTuple, + TEXTOID, -1, + STATISTIC_KIND_MCELEM, InvalidOid, + &values, &nvalues, + &numbers, &nnumbers)) + { + /* + * There is a most-common-elements slot for the tsvector Var, so + * use that. + */ + + selec = mcelem_tsquery_selec(query, values, nvalues, + numbers, nnumbers); + free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers); + } + else + { + /* No most-common-elements slot */ + selec = (Selectivity) DEFAULT_TS_SEL; + } + } + else + { + selec = (Selectivity) DEFAULT_TS_SEL; + } + + return selec; +} diff --git a/src/backend/tsearch/ts_typanalyze.c b/src/backend/tsearch/ts_typanalyze.c index 965e758..201c4ab 100644 --- a/src/backend/tsearch/ts_typanalyze.c +++ b/src/backend/tsearch/ts_typanalyze.c @@ -43,8 +43,9 @@ static void compute_tsvector_stats(VacAttrStats *stats, static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current); static uint32 lexeme_hash(const void *key, Size keysize); static int lexeme_match(const void *key1, const void *key2, Size keysize); -static int trackitem_compare_desc(const void *e1, const void *e2); - +static int lexeme_compare(const void *key1, const void *key2); +static int trackitem_compare_frequencies_desc(const void *e1, const void *e2); +static int trackitem_compare_lexemes(const void *e1, const void *e2); /* * ts_typanalyze -- a custom typanalyze function for tsvector columns @@ -273,7 +274,7 @@ compute_tsvector_stats(VacAttrStats *stats, Assert(i == track_len); qsort(sort_table, track_len, sizeof(TrackItem *), - trackitem_compare_desc); + trackitem_compare_frequencies_desc); /* Suppress any single-occurrence items */ while (track_len > 0) @@ -287,6 +288,22 @@ compute_tsvector_stats(VacAttrStats *stats, if (num_mcelem > track_len) num_mcelem = track_len; + /* + * We want to store statistics sorted on the lexeme value using first + * length, then byte-for-byte comparision. The reason for doing length + * comparision first is that we don't care about the ordering as long + * as it's consistent and comparing lengths first gives us a chance to + * avoid a strncmp() call. + * + * This is different from what we do with scalar statistics -- they get + * sorted on frequencies. The rationale is that we usually search + * through most common elements looking for a specific value, so we can + * grab its frequency. When values are presorted we can employ binary + * search for that. See ts_selfuncs.c for a real usage scenario. + */ + qsort(sort_table, num_mcelem, sizeof(TrackItem *), + trackitem_compare_lexemes); + /* Generate MCELEM slot entry */ if (num_mcelem > 0) { @@ -379,25 +396,49 @@ lexeme_hash(const void *key, Size keysize) static int lexeme_match(const void *key1, const void *key2, Size keysize) { - const LexemeHashKey *d1 = (const LexemeHashKey *) key1; - const LexemeHashKey *d2 = (const LexemeHashKey *) key2; + /* The keysize parameter is superfluous, the keys store their lengths */ + return lexeme_compare(key1, key2); +} - /* The lexemes need to have the same length, and be memcmp-equal */ - if (d1->length == d2->length && - memcmp(d1->lexeme, d2->lexeme, d1->length) == 0) - return 0; - else +/* + * Comparision function for lexemes. + */ +static int +lexeme_compare(const void *key1, const void *key2) +{ + const LexemeHashKey *d1 = (const LexemeHashKey *) key1; + const LexemeHashKey *d2 = (const LexemeHashKey *) key2; + + /* First, compare by length */ + if (d1->length > d2->length) return 1; + else if (d1->length < d2->length) + return -1; + else + /* Lengths are equal, do a byte-for-byte comparision */ + return strncmp(d1->lexeme, d2->lexeme, d1->length); } /* - * qsort() comparator for TrackItems - LC style (descending sort) + * qsort() comparator for sorting TrackItems on frequencies (descending sort) */ static int -trackitem_compare_desc(const void *e1, const void *e2) +trackitem_compare_frequencies_desc(const void *e1, const void *e2) { const TrackItem * const *t1 = (const TrackItem * const *) e1; const TrackItem * const *t2 = (const TrackItem * const *) e2; return (*t2)->frequency - (*t1)->frequency; } + +/* + * qsort() comparator for sorting TrackItem on lexemes + */ +static int +trackitem_compare_lexemes(const void *e1, const void *e2) +{ + const TrackItem * const *t1 = (const TrackItem * const *) e1; + const TrackItem * const *t2 = (const TrackItem * const *) e2; + + return lexeme_compare(&(*t1)->key, &(*t2)->key); +} diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index 1c7f37d..451805d 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -915,10 +915,10 @@ DATA(insert OID = 3630 ( "<>" PGNSP PGUID b f f 3614 3614 16 3630 3629 ts DATA(insert OID = 3631 ( ">=" PGNSP PGUID b f f 3614 3614 16 3628 3627 tsvector_ge scalargtsel scalargtjoinsel )); DATA(insert OID = 3632 ( ">" PGNSP PGUID b f f 3614 3614 16 3627 3628 tsvector_gt scalargtsel scalargtjoinsel )); DATA(insert OID = 3633 ( "||" PGNSP PGUID b f f 3614 3614 3614 0 0 tsvector_concat - - )); -DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615 16 3637 0 ts_match_vq contsel contjoinsel )); -DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614 16 3636 0 ts_match_qv contsel contjoinsel )); -DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16 3661 0 ts_match_vq contsel contjoinsel )); -DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16 3660 0 ts_match_qv contsel contjoinsel )); +DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615 16 3637 0 ts_match_vq tssel contjoinsel )); +DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614 16 3636 0 ts_match_qv tssel contjoinsel )); +DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16 3661 0 ts_match_vq tssel contjoinsel )); +DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16 3660 0 ts_match_qv tssel contjoinsel )); DATA(insert OID = 3674 ( "<" PGNSP PGUID b f f 3615 3615 16 3679 3678 tsquery_lt scalarltsel scalarltjoinsel )); DATA(insert OID = 3675 ( "<=" PGNSP PGUID b f f 3615 3615 16 3678 3679 tsquery_le scalarltsel scalarltjoinsel )); DATA(insert OID = 3676 ( "=" PGNSP PGUID b t f 3615 3615 16 3676 3677 tsquery_eq eqsel eqjoinsel )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 16ccb55..ef24f40 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4321,6 +4321,8 @@ DESCR("GiST tsquery support"); DATA(insert OID = 3701 ( gtsquery_consistent PGNSP PGUID 12 1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ gtsquery_consistent _null_ _null_ _null_ )); DESCR("GiST tsquery support"); +DATA(insert OID = 3687 ( tssel PGNSP PGUID 12 1 0 0 f f t f s 4 701 "2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ )); +DESCR("restriction selectivity of tsvector @@ tsquery"); DATA(insert OID = 3688 ( ts_typanalyze PGNSP PGUID 12 1 0 0 f f t f s 1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ )); DESCR("tsvector typanalyze"); diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h index 06ca1f9..b253327 100644 --- a/src/include/tsearch/ts_type.h +++ b/src/include/tsearch/ts_type.h @@ -155,6 +155,8 @@ extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS); extern Datum ts_typanalyze(PG_FUNCTION_ARGS); +extern Datum tssel(PG_FUNCTION_ARGS); + /* * TSQuery @@ -291,4 +293,17 @@ extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS); extern Datum tsq_mcontains(PG_FUNCTION_ARGS); extern Datum tsq_mcontained(PG_FUNCTION_ARGS); +/* + * The default text search selectivity is chosen to be smll enough to + * encourage indexscans for typical table densities. See selfuncs.h and + * DEFAULT_EQ_SEL for details. + */ +#define DEFAULT_TS_SEL 0.005 + +/* + * operator restriction function for tsvector @@ tsquery and + * tsquery @@ tsvector + */ +extern Datum tssel(PG_FUNCTION_ARGS); + #endif /* _PG_TSTYPE_H_ */