From 0e09c4749f3712da1983374a2838e4b7e14b7c62 Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Tue, 12 Nov 2019 22:57:06 +0100 Subject: [PATCH 01/10] Apply multiple multivariate MCV lists when possible Until now we've only used a single multivariate MCV list per relation, covering the largest number of clauses. So for example given a query SELECT * FROM t WHERE a = 1 AND b =1 AND c = 1 AND d = 1 and extended statistics on (a,b) and (c,d), we'd only pick and use only one of them. This commit relaxes this by repeating the process, using the best statistics (matching the largest number of remaining clauses) in each step. This greedy algorithm is very simple, but may not be optimal. There may be a different choice of stats leaving fewer clauses unestimated and/or giving better estimates for some other reason. This can however happen only when there are overlapping statistics, and selecting one makes it impossible to use the other. E.g. with statistics on (a,b), (c,d), (b,c,d), we may pick either (a,b) and (c,d) or (b,c,d). But it's not clear which option is better, though, as each one ignores information about possible correlation between different columns. We however assume cases like this are rare, and the easiest solution is to define statistics covering the whole group of correlated columns for a given query. In the future we might support overlapping stats, using some of the clauses as conditions (in conditional probability sense). Author: Tomas Vondra Reviewed-by: Mark Dilger Discussion: https://postgr.es/m/20191028152048.jc6pqv5hb7j77ocp@development --- src/backend/statistics/extended_stats.c | 166 ++++++++++++++---------- src/test/regress/expected/stats_ext.out | 58 +++++++++ src/test/regress/sql/stats_ext.sql | 36 +++++ 3 files changed, 195 insertions(+), 65 deletions(-) diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c index 207ee3160e..b55799b8b1 100644 --- a/src/backend/statistics/extended_stats.c +++ b/src/backend/statistics/extended_stats.c @@ -1123,6 +1123,33 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, return true; } +/* + * statext_mcv_clause_attnums + * Recalculate attnums from compatible but not-yet-estimated clauses. + */ +static Bitmapset * +statext_mcv_remaining_attnums(int nclauses, Bitmapset **estimatedclauses, + Bitmapset **list_attnums) +{ + int listidx; + Bitmapset *attnums = NULL; + + for (listidx = 0; listidx < nclauses; listidx++) + { + /* + * Skip clauses that have no precalculated attnums, which means it is + * either incompatible or was already used by some other statistic. + */ + if (!list_attnums[listidx]) + continue; + + if (!bms_is_member(listidx, *estimatedclauses)) + attnums = bms_add_members(attnums, list_attnums[listidx]); + } + + return attnums; +} + /* * statext_mcv_clauselist_selectivity * Estimate clauses using the best multi-column statistics. @@ -1173,11 +1200,6 @@ statext_is_compatible_clause(PlannerInfo *root, Node *clause, Index relid, * 'estimatedclauses' is an input/output parameter. We set bits for the * 0-based 'clauses' indexes we estimate for and also skip clause items that * already have a bit set. - * - * XXX If we were to use multiple statistics, this is where it would happen. - * We would simply repeat this on a loop on the "remaining" clauses, possibly - * using the already estimated clauses as conditions (and combining the values - * using conditional probability formula). */ static Selectivity statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid, @@ -1188,14 +1210,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli Bitmapset *clauses_attnums = NULL; Bitmapset **list_attnums; int listidx; - StatisticExtInfo *stat; - List *stat_clauses; - Selectivity simple_sel, - mcv_sel, - mcv_basesel, - mcv_totalsel, - other_sel, - sel; + Selectivity sel = 1.0; /* check if there's any stats that might be useful for us. */ if (!has_stats_of_kind(rel->statlist, STATS_EXT_MCV)) @@ -1223,78 +1238,99 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli if (!bms_is_member(listidx, *estimatedclauses) && statext_is_compatible_clause(root, clause, rel->relid, &attnums)) - { list_attnums[listidx] = attnums; - clauses_attnums = bms_add_members(clauses_attnums, attnums); - } else list_attnums[listidx] = NULL; listidx++; } - /* We need at least two attributes for multivariate statistics. */ - if (bms_membership(clauses_attnums) != BMS_MULTIPLE) - return 1.0; + /* apply as many extended statistics as possible */ + while (true) + { + StatisticExtInfo *stat; + List *stat_clauses; + Selectivity simple_sel, + mcv_sel, + mcv_basesel, + mcv_totalsel, + other_sel, + stat_sel; - /* find the best suited statistics object for these attnums */ - stat = choose_best_statistics(rel->statlist, clauses_attnums, STATS_EXT_MCV); + /* + * Recompute attnums in the remaining clauses (we simply use the bitmaps + * computed earlier, so that we don't have to inspect the clauses again). + */ + clauses_attnums = statext_mcv_remaining_attnums(list_length(clauses), + estimatedclauses, + list_attnums); - /* if no matching stats could be found then we've nothing to do */ - if (!stat) - return 1.0; + /* We need at least two attributes for multivariate statistics. */ + if (bms_membership(clauses_attnums) != BMS_MULTIPLE) + break; - /* Ensure choose_best_statistics produced an expected stats type. */ - Assert(stat->kind == STATS_EXT_MCV); + /* find the best suited statistics object for these attnums */ + stat = choose_best_statistics(rel->statlist, clauses_attnums, STATS_EXT_MCV); - /* now filter the clauses to be estimated using the selected MCV */ - stat_clauses = NIL; + /* if no (additional) matching stats could be found then we've nothing to do */ + if (!stat) + break; - listidx = 0; - foreach(l, clauses) - { - /* - * If the clause is compatible with the selected statistics, mark it - * as estimated and add it to the list to estimate. - */ - if (list_attnums[listidx] != NULL && - bms_is_subset(list_attnums[listidx], stat->keys)) + /* Ensure choose_best_statistics produced an expected stats type. */ + Assert(stat->kind == STATS_EXT_MCV); + + /* now filter the clauses to be estimated using the selected MCV */ + stat_clauses = NIL; + + listidx = 0; + foreach(l, clauses) { - stat_clauses = lappend(stat_clauses, (Node *) lfirst(l)); - *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + /* + * If the clause is compatible with the selected statistics, mark it + * as estimated and add it to the list to estimate. + */ + if (list_attnums[listidx] != NULL && + bms_is_subset(list_attnums[listidx], stat->keys)) + { + stat_clauses = lappend(stat_clauses, (Node *) lfirst(l)); + *estimatedclauses = bms_add_member(*estimatedclauses, listidx); + } + + listidx++; } - listidx++; - } + /* + * First compute "simple" selectivity, i.e. without the extended + * statistics, and essentially assuming independence of the + * columns/clauses. We'll then use the various selectivities computed from + * MCV list to improve it. + */ + simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, + jointype, sjinfo, NULL); - /* - * First compute "simple" selectivity, i.e. without the extended - * statistics, and essentially assuming independence of the - * columns/clauses. We'll then use the various selectivities computed from - * MCV list to improve it. - */ - simple_sel = clauselist_selectivity_simple(root, stat_clauses, varRelid, - jointype, sjinfo, NULL); + /* + * Now compute the multi-column estimate from the MCV list, along with the + * other selectivities (base & total selectivity). + */ + mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, + jointype, sjinfo, rel, + &mcv_basesel, &mcv_totalsel); - /* - * Now compute the multi-column estimate from the MCV list, along with the - * other selectivities (base & total selectivity). - */ - mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses, varRelid, - jointype, sjinfo, rel, - &mcv_basesel, &mcv_totalsel); + /* Estimated selectivity of values not covered by MCV matches */ + other_sel = simple_sel - mcv_basesel; + CLAMP_PROBABILITY(other_sel); - /* Estimated selectivity of values not covered by MCV matches */ - other_sel = simple_sel - mcv_basesel; - CLAMP_PROBABILITY(other_sel); + /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */ + if (other_sel > 1.0 - mcv_totalsel) + other_sel = 1.0 - mcv_totalsel; - /* The non-MCV selectivity can't exceed the 1 - mcv_totalsel. */ - if (other_sel > 1.0 - mcv_totalsel) - other_sel = 1.0 - mcv_totalsel; + /* Overall selectivity is the combination of MCV and non-MCV estimates. */ + stat_sel = mcv_sel + other_sel; + CLAMP_PROBABILITY(stat_sel); - /* Overall selectivity is the combination of MCV and non-MCV estimates. */ - sel = mcv_sel + other_sel; - CLAMP_PROBABILITY(sel); + /* Factor the estimate from this MCV to the oveall estimate. */ + sel *= stat_sel; + } return sel; } diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index b65228fa07..c5ae282122 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -747,6 +747,64 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND 1 | 0 (1 row) +-- check the ability to use multiple MCV lists +CREATE TABLE mcv_lists_multi ( + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER +); +-- +INSERT INTO mcv_lists_multi (a, b, c, d) + SELECT + mod(i,5), + mod(i,5), + mod(i,7), + mod(i,7) + FROM generate_series(1,5000) s(i); +ANALYZE mcv_lists_multi; +-- estimates without any mcv statistics +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); + estimated | actual +-----------+-------- + 200 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); + estimated | actual +-----------+-------- + 102 | 714 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); + estimated | actual +-----------+-------- + 4 | 142 +(1 row) + +-- create separate MCV statistics +CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; +CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi; +ANALYZE mcv_lists_multi; +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); + estimated | actual +-----------+-------- + 1000 | 1000 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); + estimated | actual +-----------+-------- + 714 | 714 +(1 row) + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); + estimated | actual +-----------+-------- + 143 | 142 +(1 row) + +DROP TABLE mcv_lists_multi; -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table. diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql index 040ee97a1e..ca3d20ee42 100644 --- a/src/test/regress/sql/stats_ext.sql +++ b/src/test/regress/sql/stats_ext.sql @@ -488,6 +488,42 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_bool WHERE NOT a AND b AND NOT c'); +-- check the ability to use multiple MCV lists +CREATE TABLE mcv_lists_multi ( + a INTEGER, + b INTEGER, + c INTEGER, + d INTEGER +); + +-- +INSERT INTO mcv_lists_multi (a, b, c, d) + SELECT + mod(i,5), + mod(i,5), + mod(i,7), + mod(i,7) + FROM generate_series(1,5000) s(i); + +ANALYZE mcv_lists_multi; + +-- estimates without any mcv statistics +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); + +-- create separate MCV statistics +CREATE STATISTICS mcv_lists_multi_1 (mcv) ON a, b FROM mcv_lists_multi; +CREATE STATISTICS mcv_lists_multi_2 (mcv) ON c, d FROM mcv_lists_multi; + +ANALYZE mcv_lists_multi; + +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE c = 0 AND d = 0'); +SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_multi WHERE a = 0 AND b = 0 AND c = 0 AND d = 0'); + +DROP TABLE mcv_lists_multi; + -- Permission tests. Users should not be able to see specific data values in -- the extended statistics, if they lack permission to see those values in -- the underlying table. -- 2.21.0