From d7f639b6150fe9fd179066af2a536465d877842a Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Mon, 2 Dec 2019 23:02:17 +0100 Subject: [PATCH 1/3] Support using extended stats for parts of OR clauses --- src/backend/optimizer/path/clausesel.c | 109 +++++++++++++++--- src/backend/statistics/extended_stats.c | 45 +++++++- src/backend/statistics/mcv.c | 5 +- .../statistics/extended_stats_internal.h | 3 +- src/include/statistics/statistics.h | 3 +- src/test/regress/expected/stats_ext.out | 3 +- src/test/regress/sql/stats_ext.sql | 1 - 7 files changed, 138 insertions(+), 31 deletions(-) diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index a3ebe10592..8c1a404ce2 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -92,7 +92,7 @@ clauselist_selectivity(PlannerInfo *root, */ s1 *= statext_clauselist_selectivity(root, clauses, varRelid, jointype, sjinfo, rel, - &estimatedclauses); + &estimatedclauses, false); } /* @@ -104,6 +104,89 @@ clauselist_selectivity(PlannerInfo *root, estimatedclauses); } +/* + * clauselist_selectivity_or - + * Compute the selectivity of an implicitly-ORed list of boolean + * expression clauses. The list can be empty, in which case 0.0 + * must be returned. List elements may be either RestrictInfos + * or bare expression clauses --- the former is preferred since + * it allows caching of results. + * + * See clause_selectivity() for the meaning of the additional parameters. + * + * The basic approach is to apply extended statistics first, on as many + * clauses as possible, in order to capture cross-column dependencies etc. + * The remaining clauses are then estimated using regular statistics tracked + * for individual columns. This is done by simply passing each remaining + * clause to clause_selectivity and then combining the selectivities using + * the regular formula (s1+s2 - s1*s2). 
+ */ +static Selectivity +clauselist_selectivity_or(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo) +{ + ListCell *lc; + Selectivity s1 = 0.0; + RelOptInfo *rel; + Bitmapset *estimatedclauses = NULL; + int listidx; + + /* + * Determine if these clauses reference a single relation. If so, and if + * it has extended statistics, try to apply those. + */ + rel = find_single_rel_for_clauses(root, clauses); + if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) + { + /* + * Estimate as many clauses as possible using extended statistics. + * + * 'estimatedclauses' tracks the 0-based list position index of + * clauses that we've estimated using extended statistics, and that + * should be ignored. + * + * XXX We can't multiply with current value, because for OR clauses + * we start with 0.0, so we simply assign to s1 directly. + */ + s1 = statext_clauselist_selectivity(root, clauses, varRelid, + jointype, sjinfo, rel, + &estimatedclauses, true); + } + + /* + * Selectivities of the remaining clauses for an OR clause are computed + * as s1+s2 - s1*s2 to account for the probable overlap of selected tuple + * sets. The clauses estimated using extended statistics are effectively + * treated as a single clause. + * + * XXX is this too conservative? + */ + listidx = -1; + foreach(lc, clauses) + { + Selectivity s2; + + listidx++; + + /* skip already estimated clauses */ + if (bms_is_member(listidx, estimatedclauses)) + continue; + + s2 = clause_selectivity(root, + (Node *) lfirst(lc), + varRelid, + jointype, + sjinfo); + + s1 = s1 + s2 - s1 * s2; + } + + return s1; +} + /* * clauselist_selectivity_simple - * Compute the selectivity of an implicitly-ANDed list of boolean @@ -735,24 +818,14 @@ clause_selectivity(PlannerInfo *root, else if (is_orclause(clause)) { /* - * Selectivities for an OR clause are computed as s1+s2 - s1*s2 to - * account for the probable overlap of selected tuple sets. 
- * - * XXX is this too conservative? + * Almost the same thing as clauselist_selectivity, but with the + * clauses connected by OR. */ - ListCell *arg; - - s1 = 0.0; - foreach(arg, ((BoolExpr *) clause)->args) - { - Selectivity s2 = clause_selectivity(root, - (Node *) lfirst(arg), - varRelid, - jointype, - sjinfo); - - s1 = s1 + s2 - s1 * s2; - } + s1 = clauselist_selectivity_or(root, + ((BoolExpr *) clause)->args, + varRelid, + jointype, + sjinfo); } else if (is_opclause(clause) || IsA(clause, DistinctExpr)) { diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 03e69d057f..24ece6f99c 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -1225,7 +1225,8 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, static Selectivity statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, - RelOptInfo *rel, Bitmapset **estimatedclauses) + RelOptInfo *rel, Bitmapset **estimatedclauses, + bool is_or) { ListCell *l; Bitmapset **list_attnums; @@ -1317,8 +1318,32 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli * columns/clauses. We'll then use the various selectivities computed from * MCV list to improve it. */ - simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, - jointype, sjinfo, NULL); + if (is_or) + { + ListCell *lc; + Selectivity s1 = 0.0, + s2; + + /* + * Selectivities of OR clauses are computed as s1+s2 - s1*s2 to account + * for the probable overlap of selected tuple sets. 
+ */ + foreach(lc, stat_clauses) + { + s2 = clause_selectivity(root, + (Node *) lfirst(lc), + varRelid, + jointype, + sjinfo); + + s1 = s1 + s2 - s1 * s2; + } + + simple_sel = s1; + } + else + simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, + jointype, sjinfo, NULL); /* * Now compute the multi-column estimate from the MCV list, along with the @@ -1326,7 +1351,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli */ mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, jointype, sjinfo, rel, - &mcv_basesel, &mcv_totalsel); + &mcv_basesel, &mcv_totalsel, is_or); /* Estimated selectivity of values not covered by MCV matches */ other_sel = simple_sel - mcv_basesel; @@ -1354,13 +1379,21 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli Selectivity statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, - RelOptInfo *rel, Bitmapset **estimatedclauses) + RelOptInfo *rel, Bitmapset **estimatedclauses, + bool is_or) { Selectivity sel; /* First, try estimating clauses using a multivariate MCV list. */ sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype, - sjinfo, rel, estimatedclauses); + sjinfo, rel, estimatedclauses, is_or); + + /* + * Functional dependencies only work for clauses connected by AND, so for + * OR clauses we're done. 
+ */ + if (is_or) + return sel; /* * Then, apply functional dependencies on the remaining clauses by calling diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 87e232fdd4..3f42713aa2 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -1795,7 +1795,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, RelOptInfo *rel, - Selectivity *basesel, Selectivity *totalsel) + Selectivity *basesel, Selectivity *totalsel, + bool is_or) { int i; MCVList *mcv; @@ -1808,7 +1809,7 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, mcv = statext_mcv_load(stat->statOid); /* build a match bitmap for the clauses */ - matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false); + matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, is_or); /* sum frequencies for all the matching MCV items */ *basesel = 0.0; diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index b512ee908a..5171895bba 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -107,6 +107,7 @@ extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root, SpecialJoinInfo *sjinfo, RelOptInfo *rel, Selectivity *basesel, - Selectivity *totalsel); + Selectivity *totalsel, + bool is_or); #endif /* EXTENDED_STATS_INTERNAL_H */ diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h index f5d9b6c73a..e18c9a6539 100644 --- a/src/include/statistics/statistics.h +++ b/src/include/statistics/statistics.h @@ -116,7 +116,8 @@ extern Selectivity statext_clauselist_selectivity(PlannerInfo *root, JoinType jointype, SpecialJoinInfo *sjinfo, RelOptInfo *rel, - Bitmapset **estimatedclauses); + Bitmapset **estimatedclauses, + bool is_or); extern bool has_stats_of_kind(List *stats, char 
requiredkind); extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind, Bitmapset **clause_attnums, diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 61237dfb11..5344b70cf4 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -648,11 +648,10 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' 200 | 200 (1 row) --- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute) SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); estimated | actual -----------+-------- - 343 | 200 + 200 | 200 (1 row) -- check change of unrelated column type does not reset the MCV statistics diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 84f13e8814..fa989fccb0 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -400,7 +400,6 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a <= 4 AND b < SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1'); --- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute) SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); -- check change of unrelated column type does not reset the MCV statistics -- 2.21.1