From 4d1b7a071ff7ea27bb991f7999133eb8d34f4720 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sun, 8 Mar 2020 23:26:50 +0100 Subject: [PATCH 1/2] Improve estimation of OR clauses using extended statistics Until now, OR clauses were estimated using extended statistics only when the whole clause (all the arguments) are compatible. If even just one argument was found to be incompatible, the whole clause was estimated ignoring extended statistics. Estimation errors for OR clauses tend to be fairly mild, so this was considered acceptable, but it may become an issue for OR clauses with more complex arguments, etc. This commit relaxes the restriction, using mostly the same logic as AND clauses. We first apply extended statistics to as many arguments as possible, and then use the (s1 + s2 - s1 * s2) formula to factor in the remaining clauses. The OR clause is still considered incompatible, though. If any argument is unsupported or references variable not covered by the statistics, the whole OR clause is incompatible. The consequence is that e.g. clauses (a = 1) AND (b = 1 OR c = 1 OR d = 1) can't be estimated by statistics on (a,b,c) because the OR clause also references "d". So we'll estimate each of the AND arguments separately, and the extended statistics will be used only to estimate the OR clause. This may be solved by creating statistics including the "d" column, but the issue applies to cases where the clause type is unsupported, e.g. (a = 1) AND (b = 1 OR c = 1 OR mod(d,10) = 0) which can't be solved by adding "d" to the statistics, at least for now. Author: Tomas Vondra Reviewed-by: Dean Rasheed, Thomas Munro Discussion: Discussion: https://www.postgresql.org/message-id/flat/20200113230008.g67iyk4cs3xbnjju@development --- src/backend/optimizer/path/clausesel.c | 141 +++++++++++++++--- src/backend/statistics/extended_stats.c | 36 +++-- src/backend/statistics/mcv.c | 5 +- src/include/optimizer/optimizer.h | 7 + .../statistics/extended_stats_internal.h | 3 +- src/include/statistics/statistics.h | 3 +- src/test/regress/expected/stats_ext.out | 61 +++++++- src/test/regress/sql/stats_ext.sql | 13 +- 8 files changed, 227 insertions(+), 42 deletions(-) diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c index a3ebe10592..ce14d47409 100644 --- a/src/backend/optimizer/path/clausesel.c +++ b/src/backend/optimizer/path/clausesel.c @@ -92,7 +92,7 @@ clauselist_selectivity(PlannerInfo *root, */ s1 *= statext_clauselist_selectivity(root, clauses, varRelid, jointype, sjinfo, rel, - &estimatedclauses); + &estimatedclauses, false); } /* @@ -104,6 +104,62 @@ clauselist_selectivity(PlannerInfo *root, estimatedclauses); } +/* + * clauselist_selectivity_or - + * Compute the selectivity of an implicitly-ORed list of boolean + * expression clauses. The list can be empty, in which case 0.0 + * must be returned. List elements may be either RestrictInfos + * or bare expression clauses --- the former is preferred since + * it allows caching of results. + * + * See clause_selectivity() for the meaning of the additional parameters. + * + * The basic approach is to apply extended statistics first, on as many + * clauses as possible, in order to capture cross-column dependencies etc. + * The remaining clauses are then estimated using regular statistics tracked + * for individual columns. This is done by simply passing the clauses to + * clauselist_selectivity and then combining the selectivities using the + * regular formula (s1+s2 - s1*s2). + */ +static Selectivity +clauselist_selectivity_or(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo) +{ + Selectivity s = 0.0; + RelOptInfo *rel; + Bitmapset *estimatedclauses = NULL; + + /* + * Determine if these clauses reference a single relation. If so, and if + * it has extended statistics, try to apply those. + */ + rel = find_single_rel_for_clauses(root, clauses); + if (rel && rel->rtekind == RTE_RELATION && rel->statlist != NIL) + { + /* + * Estimate as many clauses as possible using extended statistics. + * + * 'estimatedclauses' tracks the 0-based list position index of + * clauses that we've estimated using extended statistics, and that + * should be ignored. + * + * XXX We can't multiply with current value, because for OR clauses + * we start with 0.0, so we simply assign to 's' directly. + */ + s = statext_clauselist_selectivity(root, clauses, varRelid, + jointype, sjinfo, rel, + &estimatedclauses, true); + } + + /* Estimate the rest of the clauses as if they were independent. */ + return clauselist_selectivity_simple_or(root, clauses, varRelid, + jointype, sjinfo, + estimatedclauses, s); +} + /* * clauselist_selectivity_simple - * Compute the selectivity of an implicitly-ANDed list of boolean @@ -351,6 +407,65 @@ clauselist_selectivity_simple(PlannerInfo *root, return s1; } +/* + * clauselist_selectivity_simple_or - + * Compute the selectivity of an implicitly-ORed list of boolean + * expression clauses. The list can be empty, in which case the + * value passed in the last parameter must be returned. + * The estimatedclauses bitmap tracks clauses that have already + * been estimated by other means. + * + * See clause_selectivity() for the meaning of the additional parameters. + * + * Our basic approach is to combine selectivities of the subclauses + * using the formula (s1 + s2 - s1 * s2). It's possible some of the + * clauses are already estimated by other means, in which case we treat + * them as a single logical clause, with the selecitivity passed in the + * last parameter. + */ +Selectivity +clauselist_selectivity_simple_or(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + Bitmapset *estimatedclauses, + Selectivity s) +{ + ListCell *lc; + int listidx; + + /* + * Selectivities of the remaining clauses for an OR clause are computed + * as s1+s2 - s1*s2 to account for the probable overlap of selected tuple + * sets. The clauses estimated using extended statistics are effectively + * treated as a single clause. + * + * XXX is this too conservative? + */ + listidx = -1; + foreach(lc, clauses) + { + Selectivity s2; + + listidx++; + + /* skip already estimated clauses */ + if (bms_is_member(listidx, estimatedclauses)) + continue; + + s2 = clause_selectivity(root, + (Node *) lfirst(lc), + varRelid, + jointype, + sjinfo); + + s = s + s2 - s * s2; + } + + return s; +} + /* * addRangeClause --- add a new range clause for clauselist_selectivity * @@ -735,24 +850,14 @@ clause_selectivity(PlannerInfo *root, else if (is_orclause(clause)) { /* - * Selectivities for an OR clause are computed as s1+s2 - s1*s2 to - * account for the probable overlap of selected tuple sets. - * - * XXX is this too conservative? + * Almost the same thing as clauselist_selectivity, but with the + * clauses connected by OR. */ - ListCell *arg; - - s1 = 0.0; - foreach(arg, ((BoolExpr *) clause)->args) - { - Selectivity s2 = clause_selectivity(root, - (Node *) lfirst(arg), - varRelid, - jointype, - sjinfo); - - s1 = s1 + s2 - s1 * s2; - } + s1 = clauselist_selectivity_or(root, + ((BoolExpr *) clause)->args, + varRelid, + jointype, + sjinfo); } else if (is_opclause(clause) || IsA(clause, DistinctExpr)) { diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 318fdb8f6d..93d67791c5 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -1282,16 +1282,17 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, static Selectivity statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, - RelOptInfo *rel, Bitmapset **estimatedclauses) + RelOptInfo *rel, Bitmapset **estimatedclauses, + bool is_or) { ListCell *l; Bitmapset **list_attnums; int listidx; - Selectivity sel = 1.0; + Selectivity sel = (is_or) ? 0.0 : 1.0; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV)) - return 1.0; + return sel; list_attnums = (Bitmapset **) palloc(sizeof(Bitmapset *) * list_length(clauses)); @@ -1374,8 +1375,12 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli * columns/clauses. We'll then use the various selectivities computed from * MCV list to improve it. */ - simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, - jointype, sjinfo, NULL); + if (is_or) + simple_sel = clauselist_selectivity_simple_or(root, stat_clauses, varRelid, + jointype, sjinfo, NULL, 0.0); + else + simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, + jointype, sjinfo, NULL); /* * Now compute the multi-column estimate from the MCV list, along with the @@ -1383,7 +1388,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli */ mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, jointype, sjinfo, rel, - &mcv_basesel, &mcv_totalsel); + &mcv_basesel, &mcv_totalsel, is_or); /* Estimated selectivity of values not covered by MCV matches */ other_sel = simple_sel - mcv_basesel; @@ -1397,8 +1402,11 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli stat_sel = mcv_sel + other_sel; CLAMP_PROBABILITY(stat_sel); - /* Factor the estimate from this MCV to the oveall estimate. */ - sel *= stat_sel; + /* Factor the estimate from this MCV to the overall estimate. */ + if (is_or) + sel = sel + stat_sel - sel * stat_sel; + else + sel *= stat_sel; } return sel; @@ -1411,13 +1419,21 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli Selectivity statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, - RelOptInfo *rel, Bitmapset **estimatedclauses) + RelOptInfo *rel, Bitmapset **estimatedclauses, + bool is_or) { Selectivity sel; /* First, try estimating clauses using a multivariate MCV list. */ sel = statext_mcv_clauselist_selectivity(root, clauses, varRelid, jointype, - sjinfo, rel, estimatedclauses); + sjinfo, rel, estimatedclauses, is_or); + + /* + * Functional dependencies only work for clauses connected by AND, so for + * OR clauses we're done. + */ + if (is_or) + return sel; /* * Then, apply functional dependencies on the remaining clauses by calling diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 3147d8fedc..343011178b 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -1902,7 +1902,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, List *clauses, int varRelid, JoinType jointype, SpecialJoinInfo *sjinfo, RelOptInfo *rel, - Selectivity *basesel, Selectivity *totalsel) + Selectivity *basesel, Selectivity *totalsel, + bool is_or) { int i; MCVList *mcv; @@ -1915,7 +1916,7 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat, mcv = statext_mcv_load(stat->statOid); /* build a match bitmap for the clauses */ - matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, false); + matches = mcv_get_match_bitmap(root, clauses, stat->keys, mcv, is_or); /* sum frequencies for all the matching MCV items */ *basesel = 0.0; diff --git a/src/include/optimizer/optimizer.h b/src/include/optimizer/optimizer.h index 5283995df8..67adcd4485 100644 --- a/src/include/optimizer/optimizer.h +++ b/src/include/optimizer/optimizer.h @@ -64,6 +64,13 @@ extern Selectivity clauselist_selectivity_simple(PlannerInfo *root, JoinType jointype, SpecialJoinInfo *sjinfo, Bitmapset *estimatedclauses); +extern Selectivity clauselist_selectivity_simple_or(PlannerInfo *root, + List *clauses, + int varRelid, + JoinType jointype, + SpecialJoinInfo *sjinfo, + Bitmapset *estimatedclauses, + Selectivity s); extern Selectivity clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h index 2b14ab238c..6c039a81c2 100644 --- a/src/include/statistics/extended_stats_internal.h +++ b/src/include/statistics/extended_stats_internal.h @@ -107,6 +107,7 @@ extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root, SpecialJoinInfo *sjinfo, RelOptInfo *rel, Selectivity *basesel, - Selectivity *totalsel); + Selectivity *totalsel, + bool is_or); #endif /* EXTENDED_STATS_INTERNAL_H */ diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h index f5d9b6c73a..e18c9a6539 100644 --- a/src/include/statistics/statistics.h +++ b/src/include/statistics/statistics.h @@ -116,7 +116,8 @@ extern Selectivity statext_clauselist_selectivity(PlannerInfo *root, JoinType jointype, SpecialJoinInfo *sjinfo, RelOptInfo *rel, - Bitmapset **estimatedclauses); + Bitmapset **estimatedclauses, + bool is_or); extern bool has_stats_of_kind(List *stats, char requiredkind); extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind, Bitmapset **clause_attnums, diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index bd6d8be434..3aba4a37c6 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -1075,6 +1075,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ' 200 | 200 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); + estimated | actual +-----------+-------- + 200 | 200 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); estimated | actual -----------+-------- @@ -1135,13 +1141,6 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY 100 | 100 (1 row) --- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute) -SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); - estimated | actual ------------+-------- - 343 | 200 -(1 row) - -- check change of unrelated column type does not reset the MCV statistics ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64); SELECT d.stxdmcv IS NOT NULL @@ -1464,12 +1463,36 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AN 102 | 714 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0'); + estimated | actual +-----------+-------- + 143 | 142 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0'); + estimated | actual +-----------+-------- + 1571 | 1572 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); estimated | actual -----------+-------- 4 | 142 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)'); + estimated | actual +-----------+-------- + 298 | 1572 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0'); + estimated | actual +-----------+-------- + 2649 | 1572 +(1 row) + -- create separate MCV statistics CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi; @@ -1486,12 +1509,36 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AN 714 | 714 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0'); + estimated | actual +-----------+-------- + 143 | 142 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0'); + estimated | actual +-----------+-------- + 1571 | 1572 +(1 row) + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); estimated | actual -----------+-------- 143 | 142 (1 row) +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)'); + estimated | actual +-----------+-------- + 1571 | 1572 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0'); + estimated | actual +-----------+-------- + 1571 | 1572 +(1 row) + DROP TABLE mcv_lists_multi; -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 897ed3051c..54ceb1f4ee 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -547,6 +547,8 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE 4 >= a AND ''0 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); + SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52) AND b IN ( ''1'', ''2'')'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a IN (1, 2, 51, 52, NULL) AND b IN ( ''1'', ''2'', NULL)'); @@ -567,9 +569,6 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a < ALL (ARRAY[4, 5]) AND b IN (''1'', ''2'', NULL, ''3'') AND c > ANY (ARRAY[1, 2, NULL, 3])'); --- we can't use the statistic for OR clauses that are not fully covered (missing 'd' attribute) -SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists WHERE a = 1 OR b = ''1'' OR c = 1 OR d IS NOT NULL'); - -- check change of unrelated column type does not reset the MCV statistics ALTER TABLE mcv_lists ALTER COLUMN d TYPE VARCHAR(64); @@ -781,7 +780,11 @@ ANALYZE mcv_lists_multi; -- estimates without any mcv statistics SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0'); -- create separate MCV statistics CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; @@ -791,7 +794,11 @@ ANALYZE mcv_lists_multi; SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 AND c = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE b = 0 OR c = 0'); SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE (a = 0 AND b = 0) OR (c = 0 AND d = 0)'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 OR b = 0 OR c = 0 OR d = 0'); DROP TABLE mcv_lists_multi; -- 2.21.1