diff --git a/src/backend/tsearch/Makefile b/src/backend/tsearch/Makefile index e20a4a2..ba728eb 100644 --- a/src/backend/tsearch/Makefile +++ b/src/backend/tsearch/Makefile @@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \ OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o \ dict_simple.o dict_synonym.o dict_thesaurus.o \ dict_ispell.o regis.o spell.o \ - to_tsany.o ts_typanalyze.o ts_utils.o + to_tsany.o ts_typanalyze.o ts_selfuncs.o ts_utils.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c new file mode 100644 index 0000000..4b86072 --- /dev/null +++ b/src/backend/tsearch/ts_selfuncs.c @@ -0,0 +1,299 @@ +/*------------------------------------------------------------------------- + * + * ts_selfuncs.c + * Selectivity functions for text search types. + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" /* for check_stack_depth() */ +#include "utils/memutils.h" +#include "utils/builtins.h" +#include "utils/syscache.h" +#include "utils/lsyscache.h" +#include "utils/selfuncs.h" +#include "catalog/pg_type.h" +#include "catalog/pg_statistic.h" +#include "nodes/nodes.h" +#include "tsearch/ts_type.h" + +/* lookup table type for binary searching through MCELEMs */ +typedef struct +{ + Datum element; + float4 frequency; +} TextFreq; + +static int +compare_textfreq(const void *e1, const void *e2); +static Selectivity +tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup, + int length, float4 minfreq); +static Selectivity +mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers); +static double +tsquerysel(VariableStatData *vardata, Datum constval); + + +/* TSQuery traversal function */ +static Selectivity +tsquery_opr_selec(QueryItem *item, char *operand, TextFreq *lookup, + int length, float4 minfreq) +{ + TextFreq key, + *searchres; + Selectivity s1, s2; + + /* since this function recurses, it could be driven to stack overflow */ + check_stack_depth(); + + if (item->type == QI_VAL) + { + QueryOperand *oper = (QueryOperand *) item; + + /* + * Prepare the key for bsearch(). No need to initialize key.frequency, + * because sorting is only on key.element. + */ + key.element = PointerGetDatum( + cstring_to_text_with_len(operand + oper->distance, oper->length)); + + searchres = (TextFreq *) bsearch(&key, lookup, length, + sizeof(TextFreq), compare_textfreq); + if (searchres) + { + /* + * The element is in MCELEM. Return precise selectivity (or at + * least as precise, as ANALYZE could find out). + */ + return (Selectivity) searchres->frequency; + } + else + { + /* + * The element is not in MCELEM. Punt, but assure that the + * selectivity cannot be more than minfreq / 2. + */ + return (Selectivity) Min(DEFAULT_TS_SEL, minfreq / 2); + } + } + + /* Current TSQuery node is an operator */ + switch (item->operator.oper) + { + case OP_NOT: + return 1.0 - tsquery_opr_selec(item + 1, operand, lookup, + length, minfreq); + + case OP_AND: + return + tsquery_opr_selec(item + 1, operand, lookup, length, minfreq) * + tsquery_opr_selec(item + item->operator.left, operand, lookup, + length, minfreq); + + case OP_OR: + s1 = tsquery_opr_selec(item + 1, operand, lookup, length, minfreq); + s2 = tsquery_opr_selec(item + item->operator.left, operand, lookup, + length, minfreq); + return s1 + s2 - s1 * s2; + + default: + elog(ERROR, "unrecognized operator: %d", item->operator.oper); + } + + /* never reached, keep compiler happy */ + return (Selectivity) DEFAULT_TS_SEL; +} + +/* + * Traverse the tsquery preorder, calculating selectivity as: + * + * selec(left_oper) * selec(right_oper) in AND nodes, + * + * selec(left_oper) + selec(right_oper) - + * selec(left_oper) * selec(right_oper) in OR nodes, + * + * 1 - select(oper) in NOT nodes + * + * freq[val] in VAL nodes, if the value is in MCELEM + * min(freq[MCELEM]) / 2 in VAL nodes, if it is not + * + * + * Implementation-wise, we sort the MCELEM array to use binary + * search on it. + */ +static Selectivity +mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, + float4 *numbers, int nnumbers) +{ + float4 minfreq; + TextFreq *lookup; + Selectivity selec; + MemoryContext opr_context, + old_context; + int i; + + /* Grab the lowest frequency */ + minfreq = numbers[nnumbers - 1]; + + /* + * Compute selectivity in child context, because we will possibly do a lot + * of QueryOperand to text transformations, which palloc() memory + */ + opr_context = AllocSetContextCreate(CurrentMemoryContext, + "Text search restriction", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + old_context = MemoryContextSwitchTo(opr_context); + + + /* Construct the array to be qsort()'ed */ + Assert(nmcelem == nnumbers); + lookup = (TextFreq *) palloc(sizeof(TextFreq) * nmcelem); + for (i = 0; i < nmcelem; i++) + { + lookup[i].element = mcelem[i]; + lookup[i].frequency = numbers[i]; + } + + qsort(lookup, nmcelem, sizeof(TextFreq), compare_textfreq); + + selec = tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), lookup, + nmcelem, minfreq); + + MemoryContextSwitchTo(old_context); + MemoryContextDelete(opr_context); + + return selec; +} + +static int +compare_textfreq(const void *e1, const void *e2) +{ + const TextFreq *t1 = (const TextFreq *) e1; + const TextFreq *t2 = (const TextFreq *) e2; + + return DatumGetInt32(DirectFunctionCall2(bttextcmp, + t1->element, t2->element)); +} + +/* + * tssel -- Selectivity of "@@" + */ +Datum +tssel(PG_FUNCTION_ARGS) +{ + PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0); + /* We skip arg #2, which is the operator OID - it's gonna be "@@" */ + List *args = (List *) PG_GETARG_POINTER(2); + int varRelid = PG_GETARG_INT32(3); + VariableStatData vardata; + Node *other; + bool varonleft; + Selectivity selec; + + /* + * If expression is not variable op something or something op variable, + * then punt and return a default estimate. + */ + if (!get_restriction_variable(root, args, varRelid, + &vardata, &other, &varonleft)) + { + PG_RETURN_FLOAT8(DEFAULT_TS_SEL); + } + + /* + * Can't do anything useful if the something is not a constant, either. + */ + if (!IsA(other, Const)) + { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8(DEFAULT_TS_SEL); + } + + /* The "@@" operator is strict, so might cope with NULL right away */ + if (((Const *) other)->constisnull) { + ReleaseVariableStats(vardata); + PG_RETURN_FLOAT8((float8) 0.0); + } + + /* + * OK, there's a Var and a Const we're dealing with here. We need the Var + * to be a TSVector (or else we don't have any useful statistic for it). + */ + + if (vardata.vartype == TSVECTOROID) + { + /* tsvector @@ tsquery or the other way around */ + Assert(((Const *) other)->consttype == TSQUERYOID); + + selec = tsquerysel(&vardata, ((Const *) other)->constvalue); + } + else + { + /* The Var is something we don't have useful statistic for */ + selec = (Selectivity) DEFAULT_TS_SEL; + } + + ReleaseVariableStats(vardata); + + PG_RETURN_FLOAT8((float8) selec); +} + +static Selectivity +tsquerysel(VariableStatData *vardata, Datum constval) +{ + Selectivity selec; + + if (HeapTupleIsValid(vardata->statsTuple)) + { + TSQuery query; + Form_pg_statistic stats; + Datum *values; + int nvalues; + float4 *numbers; + int nnumbers; + + /* The caller made sure the const is a TSQuery, so get it now */ + query = DatumGetTSQuery(constval); + + stats = (Form_pg_statistic) GETSTRUCT(vardata->statsTuple); + + /* MCELEM will be an array of Text elements for a tsvector column */ + if (get_attstatsslot(vardata->statsTuple, + TEXTOID, -1, + STATISTIC_KIND_MCELEM, InvalidOid, + &values, &nvalues, + &numbers, &nnumbers)) + { + /* + * There is a most-common-elements slot for the tsvector Var, so + * use that. + */ + + selec = mcelem_tsquery_selec(query, values, nvalues, + numbers, nnumbers); + free_attstatsslot(TEXTOID, values, nvalues, numbers, nnumbers); + } + else + { + /* No most-common-elements slot */ + selec = (Selectivity) DEFAULT_TS_SEL; + } + } + else + { + selec = (Selectivity) DEFAULT_TS_SEL; + } + + return selec; +} diff --git a/src/include/catalog/pg_operator.h b/src/include/catalog/pg_operator.h index 1c7f37d..451805d 100644 --- a/src/include/catalog/pg_operator.h +++ b/src/include/catalog/pg_operator.h @@ -915,10 +915,10 @@ DATA(insert OID = 3630 ( "<>" PGNSP PGUID b f f 3614 3614 16 3630 3629 ts DATA(insert OID = 3631 ( ">=" PGNSP PGUID b f f 3614 3614 16 3628 3627 tsvector_ge scalargtsel scalargtjoinsel )); DATA(insert OID = 3632 ( ">" PGNSP PGUID b f f 3614 3614 16 3627 3628 tsvector_gt scalargtsel scalargtjoinsel )); DATA(insert OID = 3633 ( "||" PGNSP PGUID b f f 3614 3614 3614 0 0 tsvector_concat - - )); -DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615 16 3637 0 ts_match_vq contsel contjoinsel )); -DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614 16 3636 0 ts_match_qv contsel contjoinsel )); -DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16 3661 0 ts_match_vq contsel contjoinsel )); -DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16 3660 0 ts_match_qv contsel contjoinsel )); +DATA(insert OID = 3636 ( "@@" PGNSP PGUID b f f 3614 3615 16 3637 0 ts_match_vq tssel contjoinsel )); +DATA(insert OID = 3637 ( "@@" PGNSP PGUID b f f 3615 3614 16 3636 0 ts_match_qv tssel contjoinsel )); +DATA(insert OID = 3660 ( "@@@" PGNSP PGUID b f f 3614 3615 16 3661 0 ts_match_vq tssel contjoinsel )); +DATA(insert OID = 3661 ( "@@@" PGNSP PGUID b f f 3615 3614 16 3660 0 ts_match_qv tssel contjoinsel )); DATA(insert OID = 3674 ( "<" PGNSP PGUID b f f 3615 3615 16 3679 3678 tsquery_lt scalarltsel scalarltjoinsel )); DATA(insert OID = 3675 ( "<=" PGNSP PGUID b f f 3615 3615 16 3678 3679 tsquery_le scalarltsel scalarltjoinsel )); DATA(insert OID = 3676 ( "=" PGNSP PGUID b t f 3615 3615 16 3676 3677 tsquery_eq eqsel eqjoinsel )); diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 16ccb55..ef24f40 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -4321,6 +4321,8 @@ DESCR("GiST tsquery support"); DATA(insert OID = 3701 ( gtsquery_consistent PGNSP PGUID 12 1 0 0 f f t f i 5 16 "2281 2281 23 26 2281" _null_ _null_ _null_ gtsquery_consistent _null_ _null_ _null_ )); DESCR("GiST tsquery support"); +DATA(insert OID = 3687 ( tssel PGNSP PGUID 12 1 0 0 f f t f s 4 701 "2281 26 2281 23" _null_ _null_ _null_ tssel _null_ _null_ _null_ )); +DESCR("restriction selectivity of tsvector @@ tsquery"); DATA(insert OID = 3688 ( ts_typanalyze PGNSP PGUID 12 1 0 0 f f t f s 1 16 "2281" _null_ _null_ _null_ ts_typanalyze _null_ _null_ _null_ )); DESCR("tsvector typanalyze"); diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h index 06ca1f9..6208432 100644 --- a/src/include/tsearch/ts_type.h +++ b/src/include/tsearch/ts_type.h @@ -291,4 +291,17 @@ extern Datum tsquery_rewrite_query(PG_FUNCTION_ARGS); extern Datum tsq_mcontains(PG_FUNCTION_ARGS); extern Datum tsq_mcontained(PG_FUNCTION_ARGS); +/* + * The default text search selectivity is chosen to be smll enough to + * encourage indexscans for typical table densities. See selfuncs.h and + * DEFAULT_EQ_SEL for details. + */ +#define DEFAULT_TS_SEL 0.005 + +/* + * operator restriction function for tsvector @@ tsquery and + * tsquery @@ tsvector + */ +extern Datum tssel(PG_FUNCTION_ARGS); + #endif /* _PG_TSTYPE_H_ */