From 7957f0cc7bd96981fcbc34b4f5eb5948538769ae Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Thu, 5 Mar 2020 22:36:03 +0100 Subject: [PATCH 3/3] Support for extended statistics on expressions --- src/backend/commands/statscmds.c | 190 +++++-- src/backend/nodes/copyfuncs.c | 14 + src/backend/nodes/equalfuncs.c | 13 + src/backend/nodes/outfuncs.c | 12 + src/backend/optimizer/util/plancat.c | 40 ++ src/backend/parser/gram.y | 31 +- src/backend/parser/parse_agg.c | 10 + src/backend/parser/parse_expr.c | 6 + src/backend/parser/parse_func.c | 3 + src/backend/parser/parse_utilcmd.c | 89 ++- src/backend/statistics/dependencies.c | 159 +++++- src/backend/statistics/extended_stats.c | 532 +++++++++++++++++- src/backend/statistics/mcv.c | 17 +- src/backend/statistics/mvdistinct.c | 51 +- src/backend/tcop/utility.c | 16 +- src/backend/utils/adt/ruleutils.c | 59 ++ src/backend/utils/adt/selfuncs.c | 11 + src/bin/psql/describe.c | 1 + src/include/catalog/pg_statistic_ext.h | 3 + src/include/nodes/nodes.h | 1 + src/include/nodes/parsenodes.h | 16 + src/include/nodes/pathnodes.h | 1 + src/include/parser/parse_node.h | 1 + src/include/parser/parse_utilcmd.h | 2 + .../statistics/extended_stats_internal.h | 13 +- 25 files changed, 1191 insertions(+), 100 deletions(-) diff --git a/src/backend/commands/statscmds.c b/src/backend/commands/statscmds.c index 988cdba6f5..56559a1e91 100644 --- a/src/backend/commands/statscmds.c +++ b/src/backend/commands/statscmds.c @@ -29,6 +29,8 @@ #include "commands/comment.h" #include "commands/defrem.h" #include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/optimizer.h" #include "statistics/statistics.h" #include "utils/builtins.h" #include "utils/fmgroids.h" @@ -42,6 +44,7 @@ static char *ChooseExtendedStatisticName(const char *name1, const char *name2, const char *label, Oid namespaceid); static char *ChooseExtendedStatisticNameAddition(List *exprs); +static bool CheckMutability(Expr *expr); /* qsort comparator for the attnums in 
CreateStatistics */ @@ -62,6 +65,7 @@ ObjectAddress CreateStatistics(CreateStatsStmt *stmt) { int16 attnums[STATS_MAX_DIMENSIONS]; + int nattnums = 0; int numcols = 0; char *namestr; NameData stxname; @@ -74,6 +78,8 @@ CreateStatistics(CreateStatsStmt *stmt) Datum datavalues[Natts_pg_statistic_ext_data]; bool datanulls[Natts_pg_statistic_ext_data]; int2vector *stxkeys; + List *stxexprs = NIL; + Datum exprsDatum; Relation statrel; Relation datarel; Relation rel = NULL; @@ -192,56 +198,95 @@ CreateStatistics(CreateStatsStmt *stmt) foreach(cell, stmt->exprs) { Node *expr = (Node *) lfirst(cell); - ColumnRef *cref; - char *attname; + StatsElem *selem; HeapTuple atttuple; Form_pg_attribute attForm; TypeCacheEntry *type; - if (!IsA(expr, ColumnRef)) + if (!IsA(expr, StatsElem)) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("only simple column references are allowed in CREATE STATISTICS"))); - cref = (ColumnRef *) expr; + selem = (StatsElem *) expr; - if (list_length(cref->fields) != 1) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("only simple column references are allowed in CREATE STATISTICS"))); - attname = strVal((Value *) linitial(cref->fields)); - - atttuple = SearchSysCacheAttName(relid, attname); - if (!HeapTupleIsValid(atttuple)) - ereport(ERROR, - (errcode(ERRCODE_UNDEFINED_COLUMN), - errmsg("column \"%s\" does not exist", - attname))); - attForm = (Form_pg_attribute) GETSTRUCT(atttuple); - - /* Disallow use of system attributes in extended stats */ - if (attForm->attnum <= 0) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("statistics creation on system columns is not supported"))); - - /* Disallow data types without a less-than operator */ - type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); - if (type->lt_opr == InvalidOid) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator 
class", - attname, format_type_be(attForm->atttypid)))); - - /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ - if (numcols >= STATS_MAX_DIMENSIONS) - ereport(ERROR, - (errcode(ERRCODE_TOO_MANY_COLUMNS), - errmsg("cannot have more than %d columns in statistics", - STATS_MAX_DIMENSIONS))); - - attnums[numcols] = attForm->attnum; - numcols++; - ReleaseSysCache(atttuple); + if (selem->name) /* column reference */ + { + char *attname; + attname = selem->name; + + atttuple = SearchSysCacheAttName(relid, attname); + if (!HeapTupleIsValid(atttuple)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_COLUMN), + errmsg("column \"%s\" does not exist", + attname))); + attForm = (Form_pg_attribute) GETSTRUCT(atttuple); + + /* Disallow use of system attributes in extended stats */ + if (attForm->attnum <= 0) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("statistics creation on system columns is not supported"))); + + /* Disallow data types without a less-than operator */ + type = lookup_type_cache(attForm->atttypid, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("column \"%s\" cannot be used in statistics because its type %s has no default btree operator class", + attname, format_type_be(attForm->atttypid)))); + + /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ + if (numcols >= STATS_MAX_DIMENSIONS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot have more than %d columns in statistics", + STATS_MAX_DIMENSIONS))); + + attnums[nattnums] = attForm->attnum; + nattnums++; + numcols++; + ReleaseSysCache(atttuple); + } + else /* expression */ + { + Node *expr = selem->expr; + TypeCacheEntry *type; + Oid atttype; + + Assert(expr != NULL); + + /* + * An expression using mutable functions is probably wrong, + * since if you aren't going to get the same result for the + * same data every time, it's not clear what the index entries + * 
mean at all. + */ + if (CheckMutability((Expr *) expr)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), + errmsg("functions in statistics expression must be marked IMMUTABLE"))); + + /* Disallow data types without a less-than operator */ + atttype = exprType(expr); + type = lookup_type_cache(atttype, TYPECACHE_LT_OPR); + if (type->lt_opr == InvalidOid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("expression cannot be used in statistics because its type %s has no default btree operator class", + format_type_be(atttype)))); + + /* Make sure no more than STATS_MAX_DIMENSIONS columns are used */ + if (numcols >= STATS_MAX_DIMENSIONS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_COLUMNS), + errmsg("cannot have more than %d columns in statistics", + STATS_MAX_DIMENSIONS))); + + numcols++; + + stxexprs = lappend(stxexprs, expr); + } } /* @@ -258,13 +303,13 @@ CreateStatistics(CreateStatsStmt *stmt) * it does not hurt (it does not affect the efficiency, unlike for * indexes, for example). */ - qsort(attnums, numcols, sizeof(int16), compare_int16); + qsort(attnums, nattnums, sizeof(int16), compare_int16); /* * Check for duplicates in the list of columns. The attnums are sorted so * just check consecutive elements. */ - for (i = 1; i < numcols; i++) + for (i = 1; i < nattnums; i++) { if (attnums[i] == attnums[i - 1]) ereport(ERROR, @@ -273,7 +318,7 @@ CreateStatistics(CreateStatsStmt *stmt) } /* Form an int2vector representation of the sorted column list */ - stxkeys = buildint2vector(attnums, numcols); + stxkeys = buildint2vector(attnums, nattnums); /* * Parse the statistics kinds. 
@@ -325,6 +370,18 @@ CreateStatistics(CreateStatsStmt *stmt) Assert(ntypes > 0 && ntypes <= lengthof(types)); stxkind = construct_array(types, ntypes, CHAROID, 1, true, TYPALIGN_CHAR); + /* convert the expressions (if any) to a text datum */ + if (stxexprs != NIL) + { + char *exprsString; + + exprsString = nodeToString(stxexprs); + exprsDatum = CStringGetTextDatum(exprsString); + pfree(exprsString); + } + else + exprsDatum = (Datum) 0; + statrel = table_open(StatisticExtRelationId, RowExclusiveLock); /* @@ -344,6 +401,15 @@ CreateStatistics(CreateStatsStmt *stmt) values[Anum_pg_statistic_ext_stxkeys - 1] = PointerGetDatum(stxkeys); values[Anum_pg_statistic_ext_stxkind - 1] = PointerGetDatum(stxkind); + values[Anum_pg_statistic_ext_stxexprs - 1] = exprsDatum; + if (exprsDatum == (Datum) 0) + nulls[Anum_pg_statistic_ext_stxexprs - 1] = true; + + /* + * FIXME add dependencies on anything mentioned in the expressions, + * see recordDependencyOnSingleRelExpr in index_create + */ + /* insert it into pg_statistic_ext */ htup = heap_form_tuple(statrel->rd_att, values, nulls); CatalogTupleInsert(statrel, htup); @@ -387,7 +453,7 @@ CreateStatistics(CreateStatsStmt *stmt) */ ObjectAddressSet(myself, StatisticExtRelationId, statoid); - for (i = 0; i < numcols; i++) + for (i = 0; i < nattnums; i++) { ObjectAddressSubSet(parentobject, RelationRelationId, relid, attnums[i]); recordDependencyOn(&myself, &parentobject, DEPENDENCY_AUTO); @@ -722,14 +788,14 @@ ChooseExtendedStatisticNameAddition(List *exprs) buf[0] = '\0'; foreach(lc, exprs) { - ColumnRef *cref = (ColumnRef *) lfirst(lc); + StatsElem *selem = (StatsElem *) lfirst(lc); const char *name; /* It should be one of these, but just skip if it happens not to be */ - if (!IsA(cref, ColumnRef)) + if (!IsA(selem, StatsElem)) continue; - name = strVal((Value *) linitial(cref->fields)); + name = selem->name ? selem->name : "expr"; /* expression elements have a NULL name, use a fixed label */ if (buflen > 0) buf[buflen++] = '_'; /* insert _ between names */ @@ -745,3 +811,29 @@ 
ChooseExtendedStatisticNameAddition(List *exprs) } return pstrdup(buf); } + +/* + * CheckMutability + * Test whether given expression is mutable + */ +static bool +CheckMutability(Expr *expr) +{ + /* + * First run the expression through the planner. This has a couple of + * important consequences. First, function default arguments will get + * inserted, which may affect volatility (consider "default now()"). + * Second, inline-able functions will get inlined, which may allow us to + * conclude that the function is really less volatile than it's marked. As + * an example, polymorphic functions must be marked with the most volatile + * behavior that they have for any input type, but once we inline the + * function we may be able to conclude that it's not so volatile for the + * particular input type we're dealing with. + * + * We assume here that expression_planner() won't scribble on its input. + */ + expr = expression_planner(expr); + + /* Now we can search for non-immutable functions */ + return contain_mutable_functions((Node *) expr); +} diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index e04c33e4ad..fee5d3b086 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2883,6 +2883,17 @@ _copyIndexElem(const IndexElem *from) return newnode; } +static StatsElem * +_copyStatsElem(const StatsElem *from) +{ + StatsElem *newnode = makeNode(StatsElem); + + COPY_STRING_FIELD(name); + COPY_NODE_FIELD(expr); + + return newnode; +} + static ColumnDef * _copyColumnDef(const ColumnDef *from) { @@ -5566,6 +5577,9 @@ copyObjectImpl(const void *from) case T_IndexElem: retval = _copyIndexElem(from); break; + case T_StatsElem: + retval = _copyStatsElem(from); + break; case T_ColumnDef: retval = _copyColumnDef(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 5b1ba143b1..956420cce9 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -2569,6 +2569,16 
@@ _equalIndexElem(const IndexElem *a, const IndexElem *b) return true; } + +static bool +_equalStatsElem(const StatsElem *a, const StatsElem *b) +{ + COMPARE_STRING_FIELD(name); + COMPARE_NODE_FIELD(expr); + + return true; +} + static bool _equalColumnDef(const ColumnDef *a, const ColumnDef *b) { @@ -3662,6 +3672,9 @@ equal(const void *a, const void *b) case T_IndexElem: retval = _equalIndexElem(a, b); break; + case T_StatsElem: + retval = _equalStatsElem(a, b); + break; case T_ColumnDef: retval = _equalColumnDef(a, b); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index e084c3f069..dabf62ed55 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2873,6 +2873,15 @@ _outIndexElem(StringInfo str, const IndexElem *node) WRITE_ENUM_FIELD(nulls_ordering, SortByNulls); } +static void +_outStatsElem(StringInfo str, const StatsElem *node) +{ + WRITE_NODE_TYPE("STATSELEM"); + + WRITE_STRING_FIELD(name); + WRITE_NODE_FIELD(expr); +} + static void _outQuery(StringInfo str, const Query *node) { @@ -4179,6 +4188,9 @@ outNode(StringInfo str, const void *obj) case T_IndexElem: _outIndexElem(str, obj); break; + case T_StatsElem: + _outStatsElem(str, obj); + break; case T_Query: _outQuery(str, obj); break; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index d82fc5ab8b..01130c5779 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -34,6 +34,7 @@ #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" #include "nodes/supportnodes.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" @@ -1304,6 +1305,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) HeapTuple dtup; Bitmapset *keys = NULL; int i; + List *exprs = NIL; htup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statOid)); if (!HeapTupleIsValid(htup)) @@ -1322,6 +1324,41 @@ 
get_relation_statistics(RelOptInfo *rel, Relation relation) for (i = 0; i < staForm->stxkeys.dim1; i++) keys = bms_add_member(keys, staForm->stxkeys.values[i]); + /* + * preprocess expression (if any) + * + * FIXME we probably need to cache the result somewhere + */ + { + bool isnull; + Datum datum; + + /* decode expression (if any) */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxexprs, &isnull); + + if (!isnull) + { + char *exprsString; + + exprsString = TextDatumGetCString(datum); + exprs = (List *) stringToNode(exprsString); + pfree(exprsString); + + /* + * Run the expressions through eval_const_expressions. This is not just an + * optimization, but is necessary, because the planner will be comparing + * them to similarly-processed qual clauses, and may fail to detect valid + * matches without this. We must not use canonicalize_qual, however, + * since these aren't qual expressions. + */ + exprs = (List *) eval_const_expressions(NULL, (Node *) exprs); + + /* May as well fix opfuncids too */ + fix_opfuncids((Node *) exprs); + } + } + /* add one StatisticExtInfo for each kind built */ if (statext_is_kind_built(dtup, STATS_EXT_NDISTINCT)) { @@ -1331,6 +1368,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) info->rel = rel; info->kind = STATS_EXT_NDISTINCT; info->keys = bms_copy(keys); + info->exprs = exprs; stainfos = lappend(stainfos, info); } @@ -1343,6 +1381,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) info->rel = rel; info->kind = STATS_EXT_DEPENDENCIES; info->keys = bms_copy(keys); + info->exprs = exprs; stainfos = lappend(stainfos, info); } @@ -1355,6 +1394,7 @@ get_relation_statistics(RelOptInfo *rel, Relation relation) info->rel = rel; info->kind = STATS_EXT_MCV; info->keys = bms_copy(keys); + info->exprs = exprs; stainfos = lappend(stainfos, info); } diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 96e7fdbcfe..90204b5768 100644 --- a/src/backend/parser/gram.y +++ 
b/src/backend/parser/gram.y @@ -225,6 +225,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); WindowDef *windef; JoinExpr *jexpr; IndexElem *ielem; + StatsElem *selem; Alias *alias; RangeVar *range; IntoClause *into; @@ -386,7 +387,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); old_aggr_definition old_aggr_list oper_argtypes RuleActionList RuleActionMulti opt_column_list columnList opt_name_list - sort_clause opt_sort_clause sortby_list index_params + sort_clause opt_sort_clause sortby_list index_params stats_params opt_include opt_c_include index_including_params name_list role_list from_clause from_list opt_array_bounds qualified_name_list any_name any_name_list type_name_list @@ -494,6 +495,7 @@ static Node *makeRecursiveViewSelect(char *relname, List *aliases, Node *query); %type func_alias_clause %type sortby %type index_elem +%type stats_param %type table_ref %type joined_table %type relation_expr @@ -3982,7 +3984,7 @@ ExistingIndex: USING INDEX index_name { $$ = $3; } CreateStatsStmt: CREATE STATISTICS any_name - opt_name_list ON expr_list FROM from_list + opt_name_list ON stats_params FROM from_list { CreateStatsStmt *n = makeNode(CreateStatsStmt); n->defnames = $3; @@ -3994,7 +3996,7 @@ CreateStatsStmt: $$ = (Node *)n; } | CREATE STATISTICS IF_P NOT EXISTS any_name - opt_name_list ON expr_list FROM from_list + opt_name_list ON stats_params FROM from_list { CreateStatsStmt *n = makeNode(CreateStatsStmt); n->defnames = $6; @@ -4007,6 +4009,29 @@ CreateStatsStmt: } ; +stats_params: stats_param { $$ = list_make1($1); } + | stats_params ',' stats_param { $$ = lappend($1, $3); } + ; + +stats_param: ColId + { + $$ = makeNode(StatsElem); + $$->name = $1; + $$->expr = NULL; + } + | func_expr_windowless + { + $$ = makeNode(StatsElem); + $$->name = NULL; + $$->expr = $1; + } + | '(' a_expr ')' + { + $$ = makeNode(StatsElem); + $$->name = NULL; + $$->expr = $2; + } + ; 
/***************************************************************************** * diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c index f1cc5479e4..169a31bf37 100644 --- a/src/backend/parser/parse_agg.c +++ b/src/backend/parser/parse_agg.c @@ -484,6 +484,13 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr) else err = _("grouping operations are not allowed in index predicates"); + break; + case EXPR_KIND_STATS_EXPRESSION: + if (isAgg) + err = _("aggregate functions are not allowed in statistics expressions"); + else + err = _("grouping operations are not allowed in statistics expressions"); + break; case EXPR_KIND_ALTER_COL_TRANSFORM: if (isAgg) @@ -906,6 +913,9 @@ transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc, case EXPR_KIND_INDEX_EXPRESSION: err = _("window functions are not allowed in index expressions"); break; + case EXPR_KIND_STATS_EXPRESSION: + err = _("window functions are not allowed in stats expressions"); + break; case EXPR_KIND_INDEX_PREDICATE: err = _("window functions are not allowed in index predicates"); break; diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index 831db4af95..6ddd839654 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -564,6 +564,7 @@ transformColumnRef(ParseState *pstate, ColumnRef *cref) case EXPR_KIND_FUNCTION_DEFAULT: case EXPR_KIND_INDEX_EXPRESSION: case EXPR_KIND_INDEX_PREDICATE: + case EXPR_KIND_STATS_EXPRESSION: case EXPR_KIND_ALTER_COL_TRANSFORM: case EXPR_KIND_EXECUTE_PARAMETER: case EXPR_KIND_TRIGGER_WHEN: @@ -1913,6 +1914,9 @@ transformSubLink(ParseState *pstate, SubLink *sublink) case EXPR_KIND_INDEX_PREDICATE: err = _("cannot use subquery in index predicate"); break; + case EXPR_KIND_STATS_EXPRESSION: + err = _("cannot use subquery in statistics expression"); + break; case EXPR_KIND_ALTER_COL_TRANSFORM: err = _("cannot use subquery in transform expression"); break; @@ -3543,6 +3547,8 @@ 
ParseExprKindName(ParseExprKind exprKind) return "index expression"; case EXPR_KIND_INDEX_PREDICATE: return "index predicate"; + case EXPR_KIND_STATS_EXPRESSION: + return "statistics expression"; case EXPR_KIND_ALTER_COL_TRANSFORM: return "USING"; case EXPR_KIND_EXECUTE_PARAMETER: diff --git a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index 9c3b6ad916..cffc276de0 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -2495,6 +2495,9 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location) case EXPR_KIND_INDEX_PREDICATE: err = _("set-returning functions are not allowed in index predicates"); break; + case EXPR_KIND_STATS_EXPRESSION: + err = _("set-returning functions are not allowed in stats expressions"); + break; case EXPR_KIND_ALTER_COL_TRANSFORM: err = _("set-returning functions are not allowed in transform expressions"); break; diff --git a/src/backend/parser/parse_utilcmd.c b/src/backend/parser/parse_utilcmd.c index af77f1890f..f63068e5fc 100644 --- a/src/backend/parser/parse_utilcmd.c +++ b/src/backend/parser/parse_utilcmd.c @@ -1747,14 +1747,15 @@ generateClonedExtStatsStmt(RangeVar *heapRel, Oid heapRelid, /* Determine which columns the statistics are on */ for (i = 0; i < statsrec->stxkeys.dim1; i++) { - ColumnRef *cref = makeNode(ColumnRef); + StatsElem *selem = makeNode(StatsElem); AttrNumber attnum = statsrec->stxkeys.values[i]; - cref->fields = list_make1(makeString(get_attname(heapRelid, - attnum, false))); - cref->location = -1; + selem->name = get_attname(heapRelid, attnum, false); + selem->expr = NULL; - def_names = lappend(def_names, cref); + /* FIXME handle expressions properly */ + + def_names = lappend(def_names, selem); } /* finally, build the output node */ @@ -2699,6 +2700,84 @@ transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString) return stmt; } +/* + * transformStatsStmt - parse analysis for CREATE STATISTICS + * + * To avoid race conditions, it's 
important that this function rely only on + * the passed-in relid (and not on stmt->relation) to determine the target + * relation. + */ +CreateStatsStmt * +transformStatsStmt(Oid relid, CreateStatsStmt *stmt, const char *queryString) +{ + ParseState *pstate; + RangeTblEntry *rte; + ListCell *l; + Relation rel; + + /* Nothing to do if statement already transformed. */ + if (stmt->transformed) + return stmt; + + /* + * We must not scribble on the passed-in CreateStatsStmt, so copy it. (This is + * overkill, but easy.) + */ + stmt = copyObject(stmt); + + /* Set up pstate */ + pstate = make_parsestate(NULL); + pstate->p_sourcetext = queryString; + + /* + * Put the parent table into the rtable so that the expressions can refer + * to its fields without qualification. Caller is responsible for locking + * relation, but we still need to open it. + */ + rel = relation_open(relid, NoLock); + rte = addRangeTableEntryForRelation(pstate, rel, + AccessShareLock, + NULL, false, true); + + /* no to join list, yes to namespaces */ + addRTEtoQuery(pstate, rte, false, true, true); + + /* take care of any expressions */ + foreach(l, stmt->exprs) + { + StatsElem *selem = (StatsElem *) lfirst(l); + + if (selem->expr) + { + /* Now do parse transformation of the expression */ + selem->expr = transformExpr(pstate, selem->expr, + EXPR_KIND_STATS_EXPRESSION); + + /* We have to fix its collations too */ + assign_expr_collations(pstate, selem->expr); + } + } + + /* + * Check that only the base rel is mentioned. (This should be dead code + * now that add_missing_from is history.) 
+ */ + if (list_length(pstate->p_rtable) != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_COLUMN_REFERENCE), + errmsg("statistics expressions can refer only to the table being referenced"))); + + free_parsestate(pstate); + + /* Close relation, the caller keeps whatever lock it already holds */ + table_close(rel, NoLock); + + /* Mark statement as successfully transformed */ + stmt->transformed = true; + + return stmt; +} + /* * transformRuleStmt - diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c index e2f6c5bb97..76afb0ea2a 100644 --- a/src/backend/statistics/dependencies.c +++ b/src/backend/statistics/dependencies.c @@ -69,8 +69,10 @@ static void generate_dependencies(DependencyGenerator state); static DependencyGenerator DependencyGenerator_init(int n, int k); static void DependencyGenerator_free(DependencyGenerator state); static AttrNumber *DependencyGenerator_next(DependencyGenerator state); -static double dependency_degree(int numrows, HeapTuple *rows, int k, - AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs); +static double dependency_degree(int numrows, HeapTuple *rows, + Datum *exprvals, bool *exprnulls, int nexprs, int k, + AttrNumber *dependency, VacAttrStats **stats, + Bitmapset *attrs); static bool dependency_is_fully_matched(MVDependency *dependency, Bitmapset *attnums); static bool dependency_implies_attribute(MVDependency *dependency, @@ -213,8 +215,8 @@ DependencyGenerator_next(DependencyGenerator state) * the last one. */ static double -dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, - VacAttrStats **stats, Bitmapset *attrs) +dependency_degree(int numrows, HeapTuple *rows, Datum *exprvals, bool *exprnulls, + int nexprs, int k, AttrNumber *dependency, VacAttrStats **stats, Bitmapset *attrs) { int i, nitems; @@ -283,8 +285,8 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, * descriptor. 
For now that assumption holds, but it might change in the * future for example if we support statistics on multiple tables. */ - items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc, - mss, k, attnums_dep); + items = build_sorted_items(numrows, &nitems, rows, exprvals, exprnulls, + nexprs, stats[0]->tupDesc, mss, k, attnums_dep); /* * Walk through the sorted array, split it into rows according to the @@ -354,7 +356,9 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency, * (c) -> b */ MVDependencies * -statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, +statext_dependencies_build(int numrows, HeapTuple *rows, + Datum *exprvals, bool *exprnulls, + Bitmapset *attrs, List *exprs, VacAttrStats **stats) { int i, @@ -365,6 +369,15 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, /* result */ MVDependencies *dependencies = NULL; + /* + * Copy the bitmapset and add fake attnums representing expressions, + * starting above MaxHeapAttributeNumber. + */ + attrs = bms_copy(attrs); + + for (i = 1; i <= list_length(exprs); i++) + attrs = bms_add_member(attrs, MaxHeapAttributeNumber + i); + /* * Transform the bms into an array, to make accessing i-th member easier. 
*/ @@ -392,7 +405,9 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, MVDependency *d; /* compute how valid the dependency seems */ - degree = dependency_degree(numrows, rows, k, dependency, stats, attrs); + degree = dependency_degree(numrows, rows, exprvals, exprnulls, + list_length(exprs), k, dependency, + stats, attrs); /* * if the dependency seems entirely invalid, don't store it @@ -435,6 +450,8 @@ statext_dependencies_build(int numrows, HeapTuple *rows, Bitmapset *attrs, DependencyGenerator_free(DependencyGenerator); } + pfree(attrs); + return dependencies; } @@ -914,6 +931,128 @@ find_strongest_dependency(MVDependencies **dependencies, int ndependencies, return strongest; } +/* + * Similar to dependency_is_compatible_clause, but don't enforce that the + * expression is a simple Var. + */ +static bool +dependency_clause_matches_expression(Node *clause, Index relid, List *statlist) +{ + List *vars; + ListCell *lc, *lc2; + + RestrictInfo *rinfo = (RestrictInfo *) clause; + Node *clause_expr; + + if (!IsA(rinfo, RestrictInfo)) + return false; + + /* Pseudoconstants are not interesting (they couldn't contain a Var) */ + if (rinfo->pseudoconstant) + return false; + + /* Clauses referencing multiple, or no, varnos are incompatible */ + if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + return false; + + if (is_opclause(rinfo->clause)) + { + /* If it's an opclause, check for Var = Const or Const = Var. */ + OpExpr *expr = (OpExpr *) rinfo->clause; + + /* Only expressions with two arguments are candidates. */ + if (list_length(expr->args) != 2) + return false; + + /* Make sure non-selected argument is a pseudoconstant. 
*/ + if (is_pseudo_constant_clause(lsecond(expr->args))) + clause_expr = linitial(expr->args); + else if (is_pseudo_constant_clause(linitial(expr->args))) + clause_expr = lsecond(expr->args); + else + return false; + + /* + * If it's not an "=" operator, just ignore the clause, as it's not + * compatible with functional dependencies. + * + * This tests the operator's restriction estimator (get_oprrest) against + * F_EQSEL, not the operator itself (a bit awkward, but well ...). + * + * XXX this is pretty dubious; probably it'd be better to check btree + * or hash opclass membership, so as not to be fooled by custom + * selectivity functions, and to be more consistent with decisions + * elsewhere in the planner. + */ + if (get_oprrest(expr->opno) != F_EQSEL) + return false; + + /* OK to proceed with checking "clause_expr" */ + } + else if (is_notclause(rinfo->clause)) + { + /* + * "NOT x" can be interpreted as "x = false", so get the argument and + * proceed with checking it as the candidate expression. + */ + clause_expr = (Node *) get_notclausearg(rinfo->clause); + } + else + { + /* + * A boolean expression "x" can be interpreted as "x = true", so + * proceed with checking the whole clause as the candidate expression. + */ + clause_expr = (Node *) rinfo->clause; + } + + /* + * We may ignore any RelabelType node above the operand. (There won't be + * more than one, since eval_const_expressions has been applied already.) 
+ */ + if (IsA(clause_expr, RelabelType)) + clause_expr = (Node *) ((RelabelType *) clause_expr)->arg; + + vars = pull_var_clause(clause_expr, 0); + + elog(DEBUG2, "nvars = %d", list_length(vars)); + + foreach (lc, vars) + { + Var *var = (Var *) lfirst(lc); + + /* Ensure Var is from the correct relation */ + if (var->varno != relid) + return false; + + /* We also better ensure the Var is from the current level */ + if (var->varlevelsup != 0) + return false; + + /* Also ignore system attributes (we don't allow stats on those) */ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + return false; + } + + foreach (lc, statlist) + { + StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); + + foreach (lc2, info->exprs) + { + Node *expr = (Node *) lfirst(lc2); + + if (equal(clause_expr, expr)) + { + elog(DEBUG2, "match"); + return true; + } + } + } + + return false; +} + /* * dependencies_clauselist_selectivity * Return the estimated selectivity of (a subset of) the given clauses @@ -982,8 +1121,10 @@ dependencies_clauselist_selectivity(PlannerInfo *root, Node *clause = (Node *) lfirst(l); AttrNumber attnum; + dependency_clause_matches_expression(clause, rel->relid, rel->statlist); + if (!bms_is_member(listidx, *estimatedclauses) && - dependency_is_compatible_clause(clause, rel->relid, &attnum)) + dependency_is_compatible_clause(clause, rel->relid, &attnum)) { list_attnums[listidx] = bms_make_singleton(attnum); clauses_attnums = bms_add_member(clauses_attnums, attnum); diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 1872cd4529..9f70db7377 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -24,6 +24,7 @@ #include "catalog/pg_collation.h" #include "catalog/pg_statistic_ext.h" #include "catalog/pg_statistic_ext_data.h" +#include "executor/executor.h" #include "commands/progress.h" #include "miscadmin.h" #include "nodes/nodeFuncs.h" @@ -65,11 +66,12 @@ typedef 
struct StatExtEntry Bitmapset *columns; /* attribute numbers covered by the object */ List *types; /* 'char' list of enabled statistic kinds */ int stattarget; /* statistics target (-1 for default) */ + List *exprs; /* expressions */ } StatExtEntry; static List *fetch_statentries_for_relation(Relation pg_statext, Oid relid); -static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, +static VacAttrStats **lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs, int nvacatts, VacAttrStats **vacatts); static void statext_store(Oid relid, MVNDistinct *ndistinct, MVDependencies *dependencies, @@ -130,11 +132,15 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, ListCell *lc2; int stattarget; + /* evaluated expressions */ + Datum *exprvals = NULL; + bool *exprnulls = NULL; + /* * Check if we can build these stats based on the column analyzed. If * not, report this fact (except in autovacuum) and move on. */ - stats = lookup_var_attr_stats(onerel, stat->columns, + stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs, natts, vacattrstats); if (!stats) { @@ -150,8 +156,8 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, } /* check allowed number of dimensions */ - Assert(bms_num_members(stat->columns) >= 2 && - bms_num_members(stat->columns) <= STATS_MAX_DIMENSIONS); + Assert(bms_num_members(stat->columns) + list_length(stat->exprs) >= 2 && + bms_num_members(stat->columns) + list_length(stat->exprs) <= STATS_MAX_DIMENSIONS); /* compute statistics target for this statistics */ stattarget = statext_compute_stattarget(stat->stattarget, @@ -166,6 +172,78 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, if (stattarget == 0) continue; + if (stat->exprs) + { + int i; + int idx; + TupleTableSlot *slot; + EState *estate; + ExprContext *econtext; + List *exprstates = NIL; + + /* + * Need an EState for evaluation of index expressions and + * partial-index predicates. 
Create it in the per-index context to be + * sure it gets cleaned up at the bottom of the loop. + */ + estate = CreateExecutorState(); + econtext = GetPerTupleExprContext(estate); + /* Need a slot to hold the current heap tuple, too */ + slot = MakeSingleTupleTableSlot(RelationGetDescr(onerel), + &TTSOpsHeapTuple); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* Compute and save index expression values */ + exprvals = (Datum *) palloc(numrows * list_length(stat->exprs) * sizeof(Datum)); + exprnulls = (bool *) palloc(numrows * list_length(stat->exprs) * sizeof(bool)); + + /* Set up expression evaluation state */ + exprstates = ExecPrepareExprList(stat->exprs, estate); + + idx = 0; + for (i = 0; i < numrows; i++) + { + /* + * Reset the per-tuple context each time, to reclaim any cruft + * left behind by evaluating the predicate or index expressions. + */ + ResetExprContext(econtext); + + /* Set up for predicate or expression evaluation */ + ExecStoreHeapTuple(rows[i], slot, false); + + foreach (lc2, exprstates) + { + Datum datum; + bool isnull; + ExprState *exprstate = (ExprState *) lfirst(lc2); + + datum = ExecEvalExprSwitchContext(exprstate, + GetPerTupleExprContext(estate), + &isnull); + if (isnull) + { + exprvals[idx] = (Datum) 0; + exprnulls[idx] = true; + } + else + { + exprvals[idx] = (Datum) datum; + exprnulls[idx] = false; + } + + idx++; + } + } + + ExecDropSingleTupleTableSlot(slot); + FreeExecutorState(estate); + + elog(WARNING, "idx = %d", idx); + } + /* compute statistic of each requested type */ foreach(lc2, stat->types) { @@ -173,13 +251,19 @@ BuildRelationExtStatistics(Relation onerel, double totalrows, if (t == STATS_EXT_NDISTINCT) ndistinct = statext_ndistinct_build(totalrows, numrows, rows, - stat->columns, stats); + exprvals, exprnulls, + stat->columns, stat->exprs, + stats); else if (t == STATS_EXT_DEPENDENCIES) dependencies = statext_dependencies_build(numrows, rows, - 
stat->columns, stats); + exprvals, exprnulls, + stat->columns, + stat->exprs, stats); else if (t == STATS_EXT_MCV) - mcv = statext_mcv_build(numrows, rows, stat->columns, stats, - totalrows, stattarget); + mcv = statext_mcv_build(numrows, rows, + exprvals, exprnulls, + stat->columns, stat->exprs, + stats, totalrows, stattarget); } /* store the statistics in the catalog */ @@ -240,7 +324,7 @@ ComputeExtStatisticsRows(Relation onerel, * analyzed. If not, ignore it (don't report anything, we'll do that * during the actual build BuildRelationExtStatistics). */ - stats = lookup_var_attr_stats(onerel, stat->columns, + stats = lookup_var_attr_stats(onerel, stat->columns, stat->exprs, natts, vacattrstats); if (!stats) @@ -387,6 +471,7 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) ArrayType *arr; char *enabled; Form_pg_statistic_ext staForm; + List *exprs = NIL; entry = palloc0(sizeof(StatExtEntry)); staForm = (Form_pg_statistic_ext) GETSTRUCT(htup); @@ -418,6 +503,34 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) entry->types = lappend_int(entry->types, (int) enabled[i]); } + /* decode expression (if any) */ + datum = SysCacheGetAttr(STATEXTOID, htup, + Anum_pg_statistic_ext_stxexprs, &isnull); + + if (!isnull) + { + char *exprsString; + + exprsString = TextDatumGetCString(datum); + exprs = (List *) stringToNode(exprsString); + + pfree(exprsString); + + /* + * Run the expressions through eval_const_expressions. This is not just an + * optimization, but is necessary, because the planner will be comparing + * them to similarly-processed qual clauses, and may fail to detect valid + * matches without this. We must not use canonicalize_qual, however, + * since these aren't qual expressions. 
+ */ + exprs = (List *) eval_const_expressions(NULL, (Node *) exprs); + + /* May as well fix opfuncids too */ + fix_opfuncids((Node *) exprs); + } + + entry->exprs = exprs; + result = lappend(result, entry); } @@ -426,6 +539,89 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) return result; } + +/* + * examine_attribute -- pre-analysis of a single statistics expression + * + * Determine whether the expression is analyzable; if so, create and initialize + * a VacAttrStats struct for it. If not, return NULL. + * + * 'expr' is the expression tree whose values are to be analyzed; there is no + * table column behind it, so a zero-filled placeholder attribute is used. + */ +static VacAttrStats * +examine_attribute(Node *expr) +{ + HeapTuple typtuple; + VacAttrStats *stats; + int i; + bool ok; + + /* + * Create the VacAttrStats struct. There is no pg_attribute tuple for an + * expression, so a zero-filled stand-in is allocated below. + */ + stats = (VacAttrStats *) palloc0(sizeof(VacAttrStats)); + + /* fake the attribute */ + stats->attr = (Form_pg_attribute) palloc0(ATTRIBUTE_FIXED_PART_SIZE); + stats->attr->attstattarget = -1; + + /* + * Take the type, typmod and collation from the expression tree itself; + * a statistics expression has no table column to read them from. (Unlike + * the expression-index case there is no opclass storage type to worry + * about. NOTE(review): this function looks adapted from ANALYZE's + * examine_attribute --- confirm and keep the two in sync.) + * It's not clear whether anyone will care about the + * typmod, but we store that too just in case. 
+ */ + stats->attrtypid = exprType(expr); + stats->attrtypmod = exprTypmod(expr); + stats->attrcollid = exprCollation(expr); + + typtuple = SearchSysCacheCopy1(TYPEOID, + ObjectIdGetDatum(stats->attrtypid)); + if (!HeapTupleIsValid(typtuple)) + elog(ERROR, "cache lookup failed for type %u", stats->attrtypid); + stats->attrtype = (Form_pg_type) GETSTRUCT(typtuple); + // stats->anl_context = anl_context; + stats->tupattnum = InvalidAttrNumber; + + /* + * The fields describing the stats->stavalues[n] element types default to + * the type of the data being analyzed, but the type-specific typanalyze + * function can change them if it wants to store something else. + */ + for (i = 0; i < STATISTIC_NUM_SLOTS; i++) + { + stats->statypid[i] = stats->attrtypid; + stats->statyplen[i] = stats->attrtype->typlen; + stats->statypbyval[i] = stats->attrtype->typbyval; + stats->statypalign[i] = stats->attrtype->typalign; + } + + /* + * Call the type-specific typanalyze function. If none is specified, use + * std_typanalyze(). + */ + if (OidIsValid(stats->attrtype->typanalyze)) + ok = DatumGetBool(OidFunctionCall1(stats->attrtype->typanalyze, + PointerGetDatum(stats))); + else + ok = std_typanalyze(stats); + + if (!ok || stats->compute_stats == NULL || stats->minrows <= 0) + { + heap_freetuple(typtuple); + pfree(stats->attr); + pfree(stats); + return NULL; + } + + return stats; +} + /* * Using 'vacatts' of size 'nvacatts' as input data, return a newly built * VacAttrStats array which includes only the items corresponding to @@ -434,15 +630,18 @@ fetch_statentries_for_relation(Relation pg_statext, Oid relid) * to the caller that the stats should not be built. 
*/ static VacAttrStats ** -lookup_var_attr_stats(Relation rel, Bitmapset *attrs, +lookup_var_attr_stats(Relation rel, Bitmapset *attrs, List *exprs, int nvacatts, VacAttrStats **vacatts) { int i = 0; int x = -1; + int natts; VacAttrStats **stats; + ListCell *lc; - stats = (VacAttrStats **) - palloc(bms_num_members(attrs) * sizeof(VacAttrStats *)); + natts = bms_num_members(attrs) + list_length(exprs); + + stats = (VacAttrStats **) palloc(natts * sizeof(VacAttrStats *)); /* lookup VacAttrStats info for the requested columns (same attnum) */ while ((x = bms_next_member(attrs, x)) >= 0) @@ -476,6 +675,19 @@ lookup_var_attr_stats(Relation rel, Bitmapset *attrs, */ Assert(!stats[i]->attr->attisdropped); + i++; + } + + foreach (lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + + stats[i] = examine_attribute(expr); + if (!stats[i]) /* expression is not analyzable, give up like for a missing column */ + { + pfree(stats); + return NULL; + } i++; } @@ -740,8 +952,10 @@ build_attnums_array(Bitmapset *attrs, int *numattrs) * can simply pfree the return value to release all of it. 
*/ SortItem * -build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, - MultiSortSupport mss, int numattrs, AttrNumber *attnums) +build_sorted_items(int numrows, int *nitems, HeapTuple *rows, + Datum *exprvals, bool *exprnulls, int nexprs, + TupleDesc tdesc, MultiSortSupport mss, + int numattrs, AttrNumber *attnums) { int i, j, @@ -789,7 +1003,16 @@ build_sorted_items(int numrows, int *nitems, HeapTuple *rows, TupleDesc tdesc, Datum value; bool isnull; - value = heap_getattr(rows[i], attnums[j], tdesc, &isnull); + if (attnums[j] <= MaxHeapAttributeNumber) + value = heap_getattr(rows[i], attnums[j], tdesc, &isnull); + else + { + int expridx = (attnums[j] - MaxHeapAttributeNumber - 1); + int idx = i * nexprs + expridx; + + value = exprvals[idx]; + isnull = exprnulls[idx]; + } /* * If this is a varlena value, check if it's too wide and if yes @@ -1110,6 +1333,168 @@ statext_is_compatible_clause_internal(PlannerInfo *root, Node *clause, return false; } + + +/* + * statext_extract_clause_internal + * Determines if the clause is compatible with MCV lists. + * + * Does the heavy lifting of actually inspecting the clauses for + * statext_is_compatible_clause. It needs to be split like this because + * of recursion. The attnums bitmap is an input/output parameter collecting + * attribute numbers from all compatible clauses (recursively). 
+ */ +static List * +statext_extract_clause_internal(PlannerInfo *root, Node *clause, Index relid) +{ + List *result = NIL; + + /* Look inside any binary-compatible relabeling (as in examine_variable) */ + if (IsA(clause, RelabelType)) + clause = (Node *) ((RelabelType *) clause)->arg; + + /* plain Var references (boolean Vars or recursive checks) */ + if (IsA(clause, Var)) + { + Var *var = (Var *) clause; + + /* Ensure var is from the correct relation */ + if (var->varno != relid) + return NIL; + + /* we also better ensure the Var is from the current level */ + if (var->varlevelsup > 0) + return NIL; + + /* Also skip system attributes (we don't allow stats on those). */ + if (!AttrNumberIsForUserDefinedAttr(var->varattno)) + return NIL; + + // *attnums = bms_add_member(*attnums, var->varattno); + + result = lappend(result, clause); + + return result; + } + + /* (Var op Const) or (Const op Var) */ + if (is_opclause(clause)) + { + RangeTblEntry *rte = root->simple_rte_array[relid]; + OpExpr *expr = (OpExpr *) clause; + Var *var; + Var *var2 = NULL; + + /* Only expressions with two arguments are considered compatible. */ + if (list_length(expr->args) != 2) + return NIL; + + /* Check if the expression the right shape (one Var, one Const) */ + if ((!examine_opclause_expression(expr, &var, NULL, NULL)) && + (!examine_opclause_expression2(expr, &var, &var2))) + return NIL; + + /* + * If it's not one of the supported operators ("=", "<", ">", etc.), + * just ignore the clause, as it's not compatible with MCV lists. + * + * This uses the function for estimating selectivity, not the operator + * directly (a bit awkward, but well ...). 
+ */ + switch (get_oprrest(expr->opno)) + { + case F_EQSEL: + case F_NEQSEL: + case F_SCALARLTSEL: + case F_SCALARLESEL: + case F_SCALARGTSEL: + case F_SCALARGESEL: + /* supported, will continue with inspection of the Var */ + break; + + default: + /* other estimators are considered unknown/unsupported */ + return NIL; + } + + /* + * If there are any securityQuals on the RTE from security barrier + * views or RLS policies, then the user may not have access to all the + * table's data, and we must check that the operator is leak-proof. + * + * If the operator is leaky, then we must ignore this clause for the + * purposes of estimating with MCV lists, otherwise the operator might + * reveal values from the MCV list that the user doesn't have + * permission to see. + */ + if (rte->securityQuals != NIL && + !get_func_leakproof(get_opcode(expr->opno))) + return NIL; + + result = lappend(result, var); + + if (var2) + result = lappend(result, var2); + + return result; + } + + /* AND/OR/NOT clause */ + if (is_andclause(clause) || + is_orclause(clause) || + is_notclause(clause)) + { + /* + * AND/OR/NOT-clauses are supported if all sub-clauses are supported + * + * Perhaps we could improve this by handling mixed cases, when some of + * the clauses are supported and some are not. Selectivity for the + * supported subclauses would be computed using extended statistics, + * and the remaining clauses would be estimated using the traditional + * algorithm (product of selectivities). + * + * It however seems overly complex, and in a way we already do that + * because if we reject the whole clause as unsupported here, it will + * be eventually passed to clauselist_selectivity() which does exactly + * this (split into supported/unsupported clauses etc). + */ + BoolExpr *expr = (BoolExpr *) clause; + ListCell *lc; + + foreach(lc, expr->args) + { + /* + * Had we found incompatible clause in the arguments, treat the + * whole clause as incompatible. 
+ */ + List *sub = statext_extract_clause_internal(root, (Node *) lfirst(lc), relid); + + if (sub == NIL) + return NIL; + result = list_concat(result, sub); + } + + return result; + } + + /* Var IS NULL */ + if (IsA(clause, NullTest)) + { + NullTest *nt = (NullTest *) clause; + + /* + * Only simple (Var IS NULL) expressions supported for now. Maybe we + * could use examine_variable to fix this? + */ + if (!IsA(nt->arg, Var)) + return NIL; + + return statext_extract_clause_internal(root, (Node *) (nt->arg), relid); + } + + return NIL; +} + /* * statext_is_compatible_clause * Determines if the clause is compatible with MCV lists. @@ -1184,6 +1569,51 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, return true; } +/* + * statext_extract_clause + * Extract expressions from an MCV-compatible clause; NIL if unsupported. + * + * Currently, we only support three types of clauses: + * + * (a) OpExprs of the form (Var op Const), or (Const op Var), where the op + * is one of ("=", "<", ">", ">=", "<=") + * + * (b) (Var IS [NOT] NULL) + * + * (c) combinations using AND/OR/NOT + * + * In the future, the range of supported clauses may be expanded to more + * complex cases, for example (Var op Var). + */ +static List * +statext_extract_clause(PlannerInfo *root, Node *clause, Index relid) +{ + RestrictInfo *rinfo = (RestrictInfo *) clause; + List *exprs; + + if (!IsA(rinfo, RestrictInfo)) + return NIL; + + /* Pseudoconstants are not really interesting here. */ + if (rinfo->pseudoconstant) + return NIL; + + /* clauses referencing multiple varnos are incompatible */ + if (bms_membership(rinfo->clause_relids) != BMS_SINGLETON) + return NIL; + + /* Check the clause and determine what attributes it references. 
*/ + exprs = statext_extract_clause_internal(root, (Node *) rinfo->clause, relid); + + if (!exprs) + return NULL; + + /* FIXME do the same ACL check as in statext_is_compatible_clause */ + + /* If we reach here, the clause is OK */ + return exprs; +} + /* * statext_mcv_clauselist_selectivity * Estimate clauses using the best multi-column statistics. @@ -1246,7 +1676,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli bool is_or) { ListCell *l; - Bitmapset **list_attnums; + Bitmapset **list_attnums; /* attnums extracted from the clause */ + bool *exact_clauses; /* covered as-is by at least one statistic */ int listidx; Selectivity sel = 1.0; @@ -1257,6 +1688,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * list_length(clauses)); + exact_clauses = (bool *) palloc(sizeof(bool) * list_length(clauses)); + /* * Pre-process the clauses list to extract the attnums seen in each item. * We need to determine if there's any clauses which will be useful for @@ -1274,11 +1707,76 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli Node *clause = (Node *) lfirst(l); Bitmapset *attnums = NULL; + /* the clause is considered incompatible by default */ + list_attnums[listidx] = NULL; + + /* and it's also not covered exactly by the statistic */ + exact_clauses[listidx] = false; + + /* + * First see if the clause is simple enough to be covered directly + * by the attributes. If not, see if there's at least one statistic + * object using the expression as-is. 
+ */ if (!bms_is_member(listidx, *estimatedclauses) && statext_is_compatible_clause(root, clause, rel->relid, &attnums)) + /* simple expression, covered through attnum(s) */ list_attnums[listidx] = attnums; else - list_attnums[listidx] = NULL; + { + ListCell *lc; + + List *exprs = statext_extract_clause(root, clause, rel->relid); + + /* complex expression, search for statistic */ + foreach(lc, rel->statlist) + { + ListCell *lc2; + StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc); + bool all_found = (exprs != NIL); /* nothing extracted => nothing can match */ + + /* have we already found all expressions in a statistic? */ + Assert(!exact_clauses[listidx]); + + /* no expressions */ + if (!info->exprs) + continue; + + foreach (lc2, exprs) + { + Node *expr = (Node *) lfirst(lc2); + ListCell *lc3; + bool expr_found = false; + + /* + * Walk the statistic's expressions and see whether this expression + * extracted from the clause is covered by the statistics object. + */ + foreach (lc3, info->exprs) + { + if (equal(expr, (Node *) lfirst(lc3))) + { + expr_found = true; + break; + } + } + + /* one uncovered expression disqualifies this statistic */ + if (!expr_found) + { + all_found = false; + break; + } + } + + /* stop looking for another statistic */ + if (all_found) + { + exact_clauses[listidx] = true; + break; + } + } + } listidx++; } diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 97d3083451..9334504714 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -180,7 +180,9 @@ get_mincount_for_mcv_list(int samplerows, double totalrows) * */ MCVList * -statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, +statext_mcv_build(int numrows, HeapTuple *rows, + Datum *exprvals, bool *exprnulls, + Bitmapset *attrs, List *exprs, VacAttrStats **stats, double totalrows, int stattarget) { int i, @@ -194,13 +196,23 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, MCVList *mcvlist = NULL; MultiSortSupport mss; + /* + * Copy the bitmapset and add fake attnums representing expressions, + * starting 
above MaxHeapAttributeNumber. + */ + attrs = bms_copy(attrs); + + for (i = 1; i <= list_length(exprs); i++) + attrs = bms_add_member(attrs, MaxHeapAttributeNumber + i); + attnums = build_attnums_array(attrs, &numattrs); /* comparator for all the columns */ mss = build_mss(stats, numattrs); /* sort the rows */ - items = build_sorted_items(numrows, &nitems, rows, stats[0]->tupDesc, + items = build_sorted_items(numrows, &nitems, rows, exprvals, exprnulls, + list_length(exprs), stats[0]->tupDesc, mss, numattrs, attnums); if (!items) @@ -337,6 +349,7 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, pfree(items); pfree(groups); + pfree(attrs); return mcvlist; } diff --git a/src/backend/statistics/mvdistinct.c b/src/backend/statistics/mvdistinct.c index 977d6f3e2e..dd874c7a04 100644 --- a/src/backend/statistics/mvdistinct.c +++ b/src/backend/statistics/mvdistinct.c @@ -37,8 +37,10 @@ #include "utils/typcache.h" static double ndistinct_for_combination(double totalrows, int numrows, - HeapTuple *rows, VacAttrStats **stats, - int k, int *combination); + HeapTuple *rows, Datum *exprvals, + bool *exprnulls, int nexprs, + VacAttrStats **stats, int k, + int *combination); static double estimate_ndistinct(double totalrows, int numrows, int d, int f1); static int n_choose_k(int n, int k); static int num_combinations(int n); @@ -84,14 +86,26 @@ static void generate_combinations(CombinationGenerator *state); */ MVNDistinct * statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats) + Datum *exprvals, bool *exprnulls, + Bitmapset *attrs, List *exprs, + VacAttrStats **stats) { MVNDistinct *result; + int i; int k; int itemcnt; - int numattrs = bms_num_members(attrs); + int numattrs = bms_num_members(attrs) + list_length(exprs); int numcombs = num_combinations(numattrs); + /* + * Copy the bitmapset and add fake attnums representing expressions, + * starting above MaxHeapAttributeNumber. 
+ */ + attrs = bms_copy(attrs); + + for (i = 1; i <= list_length(exprs); i++) + attrs = bms_add_member(attrs, MaxHeapAttributeNumber + i); + result = palloc(offsetof(MVNDistinct, items) + numcombs * sizeof(MVNDistinctItem)); result->magic = STATS_NDISTINCT_MAGIC; @@ -114,10 +128,18 @@ statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, item->attrs = NULL; for (j = 0; j < k; j++) - item->attrs = bms_add_member(item->attrs, - stats[combination[j]]->attr->attnum); + { + if (combination[j] <= MaxHeapAttributeNumber) + item->attrs = bms_add_member(item->attrs, + stats[combination[j]]->attr->attnum); + else + item->attrs = bms_add_member(item->attrs, combination[j]); + } + item->ndistinct = ndistinct_for_combination(totalrows, numrows, rows, + exprvals, exprnulls, + list_length(exprs), stats, k, combination); itemcnt++; @@ -428,6 +450,7 @@ pg_ndistinct_send(PG_FUNCTION_ARGS) */ static double ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, + Datum *exprvals, bool *exprnulls, int nexprs, VacAttrStats **stats, int k, int *combination) { int i, @@ -481,11 +504,17 @@ ndistinct_for_combination(double totalrows, int numrows, HeapTuple *rows, /* accumulate all the data for this dimension into the arrays */ for (j = 0; j < numrows; j++) { - items[j].values[i] = - heap_getattr(rows[j], - colstat->attr->attnum, - colstat->tupDesc, - &items[j].isnull[i]); + if (combination[i] <= MaxHeapAttributeNumber) + items[j].values[i] = + heap_getattr(rows[j], + colstat->attr->attnum, + colstat->tupDesc, + &items[j].isnull[i]); + else + { + items[j].values[i] = exprvals[j * nexprs + combination[i] - MaxHeapAttributeNumber - 1]; + items[j].isnull[i] = exprnulls[j * nexprs + combination[i] - MaxHeapAttributeNumber - 1]; + } } } diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index 1b460a2612..8c36f516e1 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -1781,7 +1781,21 @@ ProcessUtilitySlow(ParseState 
*pstate, break; case T_CreateStatsStmt: - address = CreateStatistics((CreateStatsStmt *) parsetree); + { + Oid relid; + CreateStatsStmt *stmt = (CreateStatsStmt *) parsetree; + RangeVar *rel = (RangeVar *) linitial(stmt->relations); + + relid = RangeVarGetRelidExtended(rel, ShareLock, + 0, + RangeVarCallbackOwnsRelation, + NULL); + + /* Run parse analysis ... */ + stmt = transformStatsStmt(relid, stmt, queryString); + + address = CreateStatistics(stmt); + } break; case T_AlterStatsStmt: diff --git a/src/backend/utils/adt/ruleutils.c b/src/backend/utils/adt/ruleutils.c index 5e63238f03..e811a54667 100644 --- a/src/backend/utils/adt/ruleutils.c +++ b/src/backend/utils/adt/ruleutils.c @@ -1524,6 +1524,9 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) bool dependencies_enabled; bool mcv_enabled; int i; + List *context; + ListCell *lc; + List *exprs = NIL; statexttup = SearchSysCache1(STATEXTOID, ObjectIdGetDatum(statextid)); @@ -1616,6 +1619,62 @@ pg_get_statisticsobj_worker(Oid statextid, bool missing_ok) appendStringInfoString(&buf, quote_identifier(attname)); } + /* deparse expressions */ + + { + bool isnull; + Datum datum; + + /* decode expression (if any) */ + datum = SysCacheGetAttr(STATEXTOID, statexttup, + Anum_pg_statistic_ext_stxexprs, &isnull); + + if (!isnull) + { + char *exprsString; + + exprsString = TextDatumGetCString(datum); + exprs = (List *) stringToNode(exprsString); + pfree(exprsString); + + /* + * Run the expressions through eval_const_expressions. This is not just an + * optimization, but is necessary, because the planner will be comparing + * them to similarly-processed qual clauses, and may fail to detect valid + * matches without this. We must not use canonicalize_qual, however, + * since these aren't qual expressions. 
+ */ + exprs = (List *) eval_const_expressions(NULL, (Node *) exprs); + + /* May as well fix opfuncids too */ + fix_opfuncids((Node *) exprs); + } + } + + context = deparse_context_for(get_relation_name(statextrec->stxrelid), + statextrec->stxrelid); + + foreach (lc, exprs) + { + Node *expr = (Node *) lfirst(lc); + char *str; + int prettyFlags = PRETTYFLAG_INDENT; + + str = deparse_expression_pretty(expr, context, false, false, + prettyFlags, 0); + + if (colno > 0) + appendStringInfoString(&buf, ", "); + + /* Need parens if it's not a bare function call */ + if (looks_like_function(expr)) + appendStringInfoString(&buf, str); + else + appendStringInfo(&buf, "(%s)", str); + + colno++; + } + appendStringInfo(&buf, " FROM %s", generate_relation_name(statextrec->stxrelid, NIL)); diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 0be26fe037..7574a5395a 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -3082,6 +3082,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, double this_srf_multiplier; VariableStatData vardata; List *varshere; + Relids varnos; ListCell *l2; /* is expression in this grouping set? */ @@ -3149,6 +3150,16 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows, continue; } + /* + * Are all the variables from the same relation? If yes, search for + * an extended statistic matching this expression exactly. 
+ */ + varnos = pull_varnos((Node *) varshere); + if (bms_membership(varnos) == BMS_SINGLETON) + { + // FIXME try to match it to expressions in mvdistinct stats + } + /* * Else add variables to varinfos list */ diff --git a/src/bin/psql/describe.c b/src/bin/psql/describe.c index f3c7eb96fa..92c2deb1ba 100644 --- a/src/bin/psql/describe.c +++ b/src/bin/psql/describe.c @@ -2671,6 +2671,7 @@ describeOneTableDetails(const char *schemaname, /* print any extended statistics */ if (pset.sversion >= 100000) { + /* FIXME improve this with printing expressions the statistics is defined on */ printfPQExpBuffer(&buf, "SELECT oid, " "stxrelid::pg_catalog.regclass, " diff --git a/src/include/catalog/pg_statistic_ext.h b/src/include/catalog/pg_statistic_ext.h index e9491a0a87..dd0f41cd14 100644 --- a/src/include/catalog/pg_statistic_ext.h +++ b/src/include/catalog/pg_statistic_ext.h @@ -52,6 +52,9 @@ CATALOG(pg_statistic_ext,3381,StatisticExtRelationId) #ifdef CATALOG_VARLEN char stxkind[1] BKI_FORCE_NOT_NULL; /* statistics kinds requested * to build */ + pg_node_tree stxexprs; /* expression trees for stats attributes that + * are not simple column references; one for + * each zero entry in stxkeys[] */ #endif } FormData_pg_statistic_ext; diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index baced7eec0..72f6534ceb 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -448,6 +448,7 @@ typedef enum NodeTag T_TypeName, T_ColumnDef, T_IndexElem, + T_StatsElem, T_Constraint, T_DefElem, T_RangeTblEntry, diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index da0706add5..74e5a855ca 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -2807,8 +2807,24 @@ typedef struct CreateStatsStmt List *relations; /* rels to build stats on (list of RangeVar) */ char *stxcomment; /* comment to apply to stats, or NULL */ bool if_not_exists; /* do nothing if stats name already exists */ + bool transformed; 
/* true when transformStatsStmt is finished */ } CreateStatsStmt; +/* + * StatsElem - statistics parameters (used in CREATE STATISTICS) + * + * For a plain attribute, 'name' is the name of the referenced table column + * and 'expr' is NULL. For an expression, 'name' is NULL and 'expr' is the + * expression tree. + */ +typedef struct StatsElem +{ + NodeTag type; + char *name; /* name of attribute to build statistics on, or NULL */ + Node *expr; /* expression to build statistics on, or NULL */ +} StatsElem; + + /* ---------------------- * Alter Statistics Statement * ---------------------- diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 0ceb809644..7e9aeb409b 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -885,6 +885,7 @@ typedef struct StatisticExtInfo RelOptInfo *rel; /* back-link to statistic's table */ char kind; /* statistic kind of this entry */ Bitmapset *keys; /* attnums of the columns covered */ + List *exprs; /* expressions covered (NIL if none) */ } StatisticExtInfo; /* diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index d25819aa28..82e5190964 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -69,6 +69,7 @@ typedef enum ParseExprKind EXPR_KIND_FUNCTION_DEFAULT, /* default parameter value for function */ EXPR_KIND_INDEX_EXPRESSION, /* index expression */ EXPR_KIND_INDEX_PREDICATE, /* index predicate */ + EXPR_KIND_STATS_EXPRESSION, /* extended statistics expression */ EXPR_KIND_ALTER_COL_TRANSFORM, /* transform expr in ALTER COLUMN TYPE */ EXPR_KIND_EXECUTE_PARAMETER, /* parameter value in EXECUTE */ EXPR_KIND_TRIGGER_WHEN, /* WHEN condition in CREATE TRIGGER */ diff --git a/src/include/parser/parse_utilcmd.h b/src/include/parser/parse_utilcmd.h index 1a5e0b83a7..43247186b0 100644 --- a/src/include/parser/parse_utilcmd.h +++ b/src/include/parser/parse_utilcmd.h @@ -26,6 +26,8 @@ extern AlterTableStmt *transformAlterTableStmt(Oid relid, AlterTableStmt *stmt, List 
**afterStmts); extern IndexStmt *transformIndexStmt(Oid relid, IndexStmt *stmt, const char *queryString); +extern CreateStatsStmt *transformStatsStmt(Oid relid, CreateStatsStmt *stmt, + const char *queryString); extern void transformRuleStmt(RuleStmt *stmt, const char *queryString, List **actions, Node **whereClause); extern List *transformCreateSchemaStmt(CreateSchemaStmt *stmt); diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index 804089bc57..b159ea0313 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -59,17 +59,23 @@ typedef struct SortItem extern MVNDistinct *statext_ndistinct_build(double totalrows, int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats); + Datum *exprvals, bool *exprnulls, + Bitmapset *attrs, List *exprs, + VacAttrStats **stats); extern bytea *statext_ndistinct_serialize(MVNDistinct *ndistinct); extern MVNDistinct *statext_ndistinct_deserialize(bytea *data); extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats); + Datum *exprvals, bool *exprnulls, + Bitmapset *attrs, List *exprs, + VacAttrStats **stats); extern bytea *statext_dependencies_serialize(MVDependencies *dependencies); extern MVDependencies *statext_dependencies_deserialize(bytea *data); extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows, - Bitmapset *attrs, VacAttrStats **stats, + Datum *exprvals, bool *exprnulls, + Bitmapset *attrs, List *exprs, + VacAttrStats **stats, double totalrows, int stattarget); extern bytea *statext_mcv_serialize(MCVList *mcv, VacAttrStats **stats); extern MCVList *statext_mcv_deserialize(bytea *data); @@ -93,6 +99,7 @@ extern void *bsearch_arg(const void *key, const void *base, extern AttrNumber *build_attnums_array(Bitmapset *attrs, int *numattrs); extern SortItem *build_sorted_items(int numrows, int *nitems, HeapTuple 
*rows, + Datum *exprvals, bool *exprnulls, int nexprs, TupleDesc tdesc, MultiSortSupport mss, int numattrs, AttrNumber *attnums); -- 2.21.1