*** a/doc/src/sgml/catalogs.sgml --- b/doc/src/sgml/catalogs.sgml *************** *** 6664,6669 **** --- 6664,6671 ---- A list of the frequencies of the most common values or elements, i.e., number of occurrences of each divided by total number of rows. (NULL when most_common_vals is.) + For some datatypes such as tsvector, it can also store some + additional information, i.e. be longer than the most_common_vals array. *** a/src/backend/tsearch/Makefile --- b/src/backend/tsearch/Makefile *************** *** 19,25 **** DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \ OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ dict_ispell.o regis.o spell.o \ ! to_tsany.o ts_typanalyze.o ts_utils.o include $(top_srcdir)/src/backend/common.mk --- 19,25 ---- OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ dict_ispell.o regis.o spell.o \ ! to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o include $(top_srcdir)/src/backend/common.mk *** /dev/null --- b/src/backend/tsearch/ts_selfuncs.c *************** *** 0 **** --- 1,323 ---- + /*------------------------------------------------------------------------- + * + * ts_selfuncs.c + * Selectivity functions for text search types. + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + #include "postgres.h" + + #include "miscadmin.h" /* for check_stack_depth() */ + #include "utils/memutils.h" + #include "utils/builtins.h" + #include "utils/syscache.h" + #include "utils/lsyscache.h" + #include "utils/selfuncs.h" + #include "catalog/pg_type.h" + #include "catalog/pg_statistic.h" + #include "nodes/nodes.h" + #include "tsearch/ts_type.h" + + /* lookup table type for binary searching through MCELEMs */ + typedef struct + { + Datum element; + float4 frequency; + } TextFreq; + + /* type of keys for bsearch()ing through an array of TextFreqs */ + typedef struct + { + char *lexeme; + int length; + } LexemeKey; + + static double + tsquerysel(VariableStatData *vardata, Datum constval); + static Selectivity + mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers); + static Selectivity + tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup, + int length, float4 minfreq); + static int + compare_lexeme_textfreq(const void *e1, const void *e2); + + /* + * tssel -- Selectivity of "@@" + */ + Datum + tssel(PG_FUNCTION_ARGS) + { + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + /* We skip arg #2, which is the operator OID - it's gonna be "@@" */ + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + VariableStatData vardata; + Node *other; + bool varonleft; + Selectivity selec; + + /* + * If expression is not variable op something or something op variable, + * then punt and return a default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + { + PG_RETURN_FLOAT8(DEFAULT_TS_SEL); + } + + /* + * Can't do anything useful if the something is not a constant, either. + */ + if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_TS_SEL); + } + + /* The "@@" operator is strict, so might cope with NULL right away */ + if (((Const *) other)->constisnull) { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8((float8) 0.0); + } + + /* + * OK, there's a Var and a Const we're dealing with here. We need the Var + * to be a TSVector (or else we don't have any useful statistic for it). + */ + + if (vardata.vartype == TSVECTOROID) + { + /* tsvector @@ tsquery or the other way around */ + Assert(((Const *) other)->consttype == TSQUERYOID); + + selec = tsquerysel(&vardata, ((Const *) other)->constvalue); + } + else + { + /* The Var is something we don't have useful statistic for */ + selec = DEFAULT_TS_SEL; + } + + ReleaseVariableStats(vardata); + + PG_RETURN_FLOAT8((float8) selec); + } + + static Selectivity + tsquerysel(VariableStatData *vardata, Datum constval) + { + Selectivity selec; + + if (HeapTupleIsValid(vardata->statsTuple)) + { + TSQuery query; + Form_pg_statistic stats; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + + /* The caller made sure the const is a TSQuery, so get it now */ + query = DatumGetTSQuery(constval); + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + + /* MCELEM will be an array of Text elements for a tsvector column */ + if (get_attstatsslot(vardata->statsTuple, + TEXTOID, -1, + STATISTIC_KIND_MCELEM, InvalidOid, + &values, &nvalues, + &numbers, &nnumbers)) + { + /* + * There is a most-common-elements slot for the tsvector Var, so + * use that. + */ + + selec = mcelem_tsquery_selec(query, values, nvalues, + numbers, nnumbers); + free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers); + } + else + { + /* No most-common-elements slot */ + selec = (Selectivity) DEFAULT_TS_SEL; + } + } + else + { + selec = (Selectivity) DEFAULT_TS_SEL; + } + + return selec; + } + + /* + * Traverse the tsquery preorder, calculating selectivity as: + * + * selec(left_oper) * selec(right_oper) in AND nodes, + * + * selec(left_oper) + selec(right_oper) - + * selec(left_oper) * selec(right_oper) in OR nodes, + * + * 1 - select(oper) in NOT nodes + * + * freq[val] in VAL nodes, if the value is in MCELEM + * min(freq[MCELEM]) / 2 in VAL nodes, if it is not + * + * + * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use + * binary search for determining freq[MCELEM]. + */ + static Selectivity + mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers) + { + float4 minfreq; + TextFreq *lookup; + Selectivity selec; + int i; + + /* + * Grab the lowest frequency. compute_tsvector_stats() stored it for us in + * the one before the last cell of the Numbers array. See ts_typanalyze.c + */ + minfreq = numbers[nnumbers - 2]; + + /* + * Construct the array for binary search. There should be two more Numbers + * than Values, because the last two cells are taken for minimal and + * maximal frequency. + */ + Assert(nmcelem == nnumbers - 2); + lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem); + for (i = 0; i < nmcelem; i++) + { + lookup[i].element = mcelem[i]; + lookup[i].frequency = numbers[i]; + } + + selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup, + nmcelem, minfreq); + + pfree(lookup); + + return selec; + } + + /* TSQuery traversal function */ + static Selectivity + tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup, + int length, float4 minfreq) + { + LexemeKey key; + TextFreq *searchres; + Selectivity s1, s2; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (item->type == QI_VAL) + { + QueryOperand *oper = (QueryOperand *) item; + + /* + * Prepare the key for bsearch(). + */ + key.lexeme = operand + oper->distance; + key.length = oper->length; + + searchres = (TextFreq *) bsearch(&key, lookup, length, + sizeof(TextFreq), compare_lexeme_textfreq); + + if (searchres) + { + /* + * The element is in MCELEM. Return precise selectivity (or at + * least as precise, as ANALYZE could find out). + */ + return (Selectivity) searchres->frequency; + } + else + { + /* + * The element is not in MCELEM. Punt, but assert that the + * selectivity cannot be more than minfreq / 2. + */ + return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2); + } + } + + /* Current TSQuery node is an operator */ + switch (item->operator.oper) + { + case OP_NOT: + return 1.0 - tsquery_opr_selec(item + 1, operand, lookup, + length, minfreq); + + case OP_AND: + return + tsquery_opr_selec(item + 1, operand, lookup, length, minfreq) * + tsquery_opr_selec(item + item->operator.left, operand, lookup, + length, minfreq); + + case OP_OR: + s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq); + s2 = tsquery_opr_selec(item + item->operator.left, operand, lookup, + length, minfreq); + return s1 + s2 - s1 * s2; + + default: + elog(ERROR, "unrecognized operator: %d", item->operator.oper); + } + + /* never reached, keep compiler happy */ + return (Selectivity) DEFAULT_TS_SEL; + } + + /* + * bsearch() comparator for a lexeme (non-NULL terminated string with length) + * and a TextFreq. Use length, then byte-for-byte comparision, because that's + * how ANALYZE code sorted data before storing it in a statistic tuple. + * See ts_typanalyze.c for details. + */ + static int + compare_lexeme_textfreq(const void *e1, const void *e2) + { + const LexemeKey *key; + const TextFreq *t; + text *element; + int len1, + len2; + + key = (const LexemeKey *) e1; + t = (const TextFreq *) e2; + + /* + * The text Datum came from an array, so it cannot be compressed + * or stored out-of-line -- it's safe to use VARSIZE_ANY*. + */ + Assert(!VARATT_IS_COMPRESSED(t->element) && !VARATT_IS_EXTERNAL(t->element)); + + len1 = key->length; + len2 = VARSIZE_ANY_EXHDR(t->element); + + /* Compare lengths first, possibly avoiding a strncmp call */ + if (len1 > len2) + return 1; + else if (len1 < len2) + return -1; + + /* Fall back on byte-for-byte comparision */ + element = DatumGetTextP(t->element); + return strncmp(key->lexeme, VARDATA_ANY(element), len1); + } *** a/src/backend/tsearch/ts_typanalyze.c --- b/src/backend/tsearch/ts_typanalyze.c *************** *** 43,50 **** static void compute_tsvector_stats(VacAttrStats *stats, static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current); static uint32 lexeme_hash(const void *key, Size keysize); static int lexeme_match(const void *key1, const void *key2, Size keysize); ! static int trackitem_compare_desc(const void *e1, const void *e2); ! /* * ts_typanalyze -- a custom typanalyze function for tsvector columns --- 43,51 ---- static void prune_lexemes_hashtable(HTAB *lexemes_tab, int b_current); static uint32 lexeme_hash(const void *key, Size keysize); static int lexeme_match(const void *key1, const void *key2, Size keysize); ! static int lexeme_compare(const void *key1, const void *key2); ! static int trackitem_compare_frequencies_desc(const void *e1, const void *e2); ! static int trackitem_compare_lexemes(const void *e1, const void *e2); /* * ts_typanalyze -- a custom typanalyze function for tsvector columns *************** *** 247,252 **** compute_tsvector_stats(VacAttrStats *stats, --- 248,254 ---- int i; TrackItem **sort_table; int track_len; + int minfreq, maxfreq; stats->stats_valid = true; /* Do the simple null-frac and average width stats */ *************** *** 273,279 **** compute_tsvector_stats(VacAttrStats *stats, Assert(i == track_len); qsort(sort_table, track_len, sizeof(TrackItem *), ! trackitem_compare_desc); /* Suppress any single-occurrence items */ while (track_len > 0) --- 275,281 ---- Assert(i == track_len); qsort(sort_table, track_len, sizeof(TrackItem *), ! trackitem_compare_frequencies_desc); /* Suppress any single-occurrence items */ while (track_len > 0) *************** *** 287,292 **** compute_tsvector_stats(VacAttrStats *stats, --- 289,314 ---- if (num_mcelem > track_len) num_mcelem = track_len; + /* Grab the minimal and maximal frequencies that will get stored */ + minfreq = sort_table[num_mcelem]->frequency; + maxfreq = sort_table[0]->frequency; + + /* + * We want to store statistics sorted on the lexeme value using first + * length, then byte-for-byte comparision. The reason for doing length + * comparision first is that we don't care about the ordering as long + * as it's consistent and comparing lengths first gives us a chance to + * avoid a strncmp() call. + * + * This is different from what we do with scalar statistics -- they get + * sorted on frequencies. The rationale is that we usually search + * through most common elements looking for a specific value, so we can + * grab its frequency. When values are presorted we can employ binary + * search for that. See ts_selfuncs.c for a real usage scenario. + */ + qsort(sort_table, num_mcelem, sizeof(TrackItem *), + trackitem_compare_lexemes); + /* Generate MCELEM slot entry */ if (num_mcelem > 0) { *************** *** 296,303 **** compute_tsvector_stats(VacAttrStats *stats, /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); ! mcelem_freqs = (float4 *) palloc(num_mcelem * sizeof(float4)); for (i = 0; i < num_mcelem; i++) { --- 318,332 ---- /* Must copy the target values into anl_context */ old_context = MemoryContextSwitchTo(stats->anl_context); + + /* + * We sorted statistics on the lexeme value, but we want to be + * able to reach the minimal and maximal frequency without goind + * through all the values. We keep those two extra frequencies in + * two extra cells in mcelem_freqs. + */ mcelem_values = (Datum *) palloc(num_mcelem * sizeof(Datum)); ! mcelem_freqs = (float4 *) palloc((num_mcelem + 2) * sizeof(float4)); for (i = 0; i < num_mcelem; i++) { *************** *** 306,319 **** compute_tsvector_stats(VacAttrStats *stats, mcelem_values[i] = PointerGetDatum(cstring_to_text_with_len(item->key.lexeme, item->key.length)); ! mcelem_freqs[i] = (double) item->frequency / (double) nonnull_cnt; } MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCELEM; stats->staop[0] = TextEqualOperator; stats->stanumbers[0] = mcelem_freqs; ! stats->numnumbers[0] = num_mcelem; stats->stavalues[0] = mcelem_values; stats->numvalues[0] = num_mcelem; /* We are storing text values */ --- 335,353 ---- mcelem_values[i] = PointerGetDatum(cstring_to_text_with_len(item->key.lexeme, item->key.length)); ! mcelem_freqs[i] = ! (double) item->frequency / ! (double) nonnull_cnt; } + mcelem_freqs[i++] = (double) minfreq / (double) nonnull_cnt; + mcelem_freqs[i] = (double) maxfreq / (double) nonnull_cnt; MemoryContextSwitchTo(old_context); stats->stakind[0] = STATISTIC_KIND_MCELEM; stats->staop[0] = TextEqualOperator; stats->stanumbers[0] = mcelem_freqs; ! /* See above comment about two extra frequency fields */ ! stats->numnumbers[0] = num_mcelem + 2; stats->stavalues[0] = mcelem_values; stats->numvalues[0] = num_mcelem; /* We are storing text values */ *************** *** 379,403 **** lexeme_hash(const void *key, Size keysize) static int lexeme_match(const void *key1, const void *key2, Size keysize) { ! const LexemeHashKey *d1 = (const LexemeHashKey *) key1; ! const LexemeHashKey *d2 = (const LexemeHashKey *) key2; ! /* The lexemes need to have the same length, and be memcmp-equal */ ! if (d1->length == d2->length && ! memcmp(d1->lexeme, d2->lexeme, d1->length) == 0) ! return 0; ! else return 1; } /* ! * qsort() comparator for TrackItems - LC style (descending sort) */ static int ! trackitem_compare_desc(const void *e1, const void *e2) { const TrackItem * const *t1 = (const TrackItem * const *) e1; const TrackItem * const *t2 = (const TrackItem * const *) e2; return (*t2)->frequency - (*t1)->frequency; } --- 413,461 ---- static int lexeme_match(const void *key1, const void *key2, Size keysize) { ! /* The keysize parameter is superfluous, the keys store their lengths */ ! return lexeme_compare(key1, key2); ! } ! /* ! * Comparision function for lexemes. ! */ ! static int ! lexeme_compare(const void *key1, const void *key2) ! { ! const LexemeHashKey *d1 = (const LexemeHashKey *) key1; ! const LexemeHashKey *d2 = (const LexemeHashKey *) key2; ! ! /* First, compare by length */ ! if (d1->length > d2->length) return 1; + else if (d1->length < d2->length) + return -1; + else + /* Lengths are equal, do a byte-for-byte comparision */ + return strncmp(d1->lexeme, d2->lexeme, d1->length); } /* ! * qsort() comparator for sorting TrackItems on frequencies (descending sort) */ static int ! trackitem_compare_frequencies_desc(const void *e1, const void *e2) { const TrackItem * const *t1 = (const TrackItem * const *) e1; const TrackItem * const *t2 = (const TrackItem * const *) e2; return (*t2)->frequency - (*t1)->frequency; } + + /* + * qsort() comparator for sorting TrackItem on lexemes + */ + static int + trackitem_compare_lexemes(const void *e1, const void *e2) + { + const TrackItem * const *t1 = (const TrackItem * const *) e1; + const TrackItem * const *t2 = (const TrackItem * const *) e2; + + return lexeme_compare(&(*t1)->key, &(*t2)->key); + } *** a/src/include/catalog/pg_operator.h --- b/src/include/catalog/pg_operator.h *************** *** 915,924 **** DATA(insert OID = 3630 ( "<>" PGNSP PGUID b f f 3614 3614 16 3630 3629 ts DATA(insert OID = 3631 ( ">=" PGNSP PGUID b f f 3614 3614 16 3628 3627 tsvector_ge scalargtsel scalargtjoinsel )); DATA(insert OID = 3632 ( ">" PGNSP PGUID b f f 3614 3614 16 3627 3628 tsvector_gt scalargtsel scalargtjoinsel )); DATA(insert OID = 3633 ( "||" PGNSP PGUID b f f 3614 3614 3614 0 0 tsvector_concat - - )); ! DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615 16 3637 0 ts_match_vq contsel contjoinsel )); ! DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614 16 3636 0 ts_match_qv contsel contjoinsel )); ! DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16 3661 0 ts_match_vq contsel contjoinsel )); ! DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16 3660 0 ts_match_qv contsel contjoinsel )); DATA(insert OID = 3674 ( "<" PGNSP PGUID b f f 3615 3615 16 3679 3678 tsquery_lt scalarltsel scalarltjoinsel )); DATA(insert OID = 3675 ( "<=" PGNSP PGUID b f f 3615 3615 16 3678 3679 tsquery_le scalarltsel scalarltjoinsel )); DATA(insert OID = 3676 ( "=" PGNSP PGUID b t f 3615 3615 16 3676 3677 tsquery_eq eqsel eqjoinsel )); --- 915,924 ---- DATA(insert OID = 3631 ( ">=" PGNSP PGUID b f f 3614 3614 16 3628 3627 tsvector_ge scalargtsel scalargtjoinsel )); DATA(insert OID = 3632 ( ">" PGNSP PGUID b f f 3614 3614 16 3627 3628 tsvector_gt scalargtsel scalargtjoinsel )); DATA(insert OID = 3633 ( "||" PGNSP PGUID b f f 3614 3614 3614 0 0 tsvector_concat - - )); ! DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615 16 3637 0 ts_match_vq tssel contjoinsel )); ! DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614 16 3636 0 ts_match_qv tssel contjoinsel )); ! DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16 3661 0 ts_match_vq tssel contjoinsel )); ! DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16 3660 0 ts_match_qv tssel contjoinsel )); DATA(insert OID = 3674 ( "<" PGNSP PGUID b f f 3615 3615 16 3679 3678 tsquery_lt scalarltsel scalarltjoinsel )); DATA(insert OID = 3675 ( "<=" PGNSP PGUID b f f 3615 3615 16 3678 3679 tsquery_le scalarltsel scalarltjoinsel )); DATA(insert OID = 3676 ( "=" PGNSP PGUID b t f 3615 3615 16 3676 3677 tsquery_eq eqsel eqjoinsel )); *** a/src/include/catalog/pg_proc.h --- b/src/include/catalog/pg_proc.h *************** *** 4434,4439 **** DESCR("GiST tsquery support"); --- 4434,4441 ---- DATA(insert OID = 3701 ( gtsquery_consistent PGNSP PGUID 12 1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ gtsquery_consistent _null_ _null_ _null_ )); DESCR("GiST tsquery support"); + DATA(insert OID = 3687 ( tssel PGNSP PGUID 12 1 0 0 f f t f s 4 701 "2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ )); + DESCR("restriction selectivity of tsvector @@ tsquery"); DATA(insert OID = 3688 ( ts_typanalyze PGNSP PGUID 12 1 0 0 f f t f s 1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ )); DESCR("tsvector typanalyze"); *** a/src/include/tsearch/ts_type.h --- b/src/include/tsearch/ts_type.h *************** *** 155,160 **** extern Datum ts_rankcd_wttf(PG_FUNCTION_ARGS); --- 155,162 ---- extern Datum ts_typanalyze(PG_FUNCTION_ARGS); + extern Datum tssel(PG_FUNCTION_ARGS); + /* * TSQuery *************** *** 291,294 **** extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS); --- 293,309 ---- extern Datum tsq_mcontains(PG_FUNCTION_ARGS); extern Datum tsq_mcontained(PG_FUNCTION_ARGS); + /* + * The default text search selectivity is chosen to be smll enough to + * encourage indexscans for typical table densities. See selfuncs.h and + * DEFAULT_EQ_SEL for details. + */ + #define DEFAULT_TS_SEL 0.005 + + /* + * operator restriction function for tsvector @@ tsquery and + * tsquery @@ tsvector + */ + extern Datum tssel(PG_FUNCTION_ARGS); + #endif /* _PG_TSTYPE_H_ */