From 41ce6007cd552afd1a73983f0b9c9cac0e125d58 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov"
Date: Mon, 30 Aug 2021 11:21:57 +0500
Subject: [PATCH] Estimating the number of fetched rows in a btree index

While estimating the number of fetched rows in a btree index, save the
selectivity estimate in the costs structure. It will be used by the
genericcostestimate routine as an upper bound on the estimated number of
total tuples visited in the main table.

This code fixes a problem with unique indexes: there we know for sure
that no more than one tuple can be fetched, but clauselist_selectivity
may give us a much less accurate estimate for many possible reasons.

A regression test is added as a demonstration of the problem.
---
 src/backend/utils/adt/selfuncs.c        | 18 ++++++++++--
 src/test/regress/expected/stats_ext.out | 38 +++++++++++++++++++++++++
 src/test/regress/sql/stats_ext.sql      | 34 ++++++++++++++++++++++
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index c2aeb4b947..dd1cadad61 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6074,6 +6074,14 @@ genericcostestimate(PlannerInfo *root,
 		 */
 		numIndexTuples = rint(numIndexTuples / num_sa_scans);
 	}
+	else if (costs->indexSelectivity > 0. &&
+			 indexSelectivity > costs->indexSelectivity)
+		/*
+		 * If the caller gave us an estimate of fetched index tuples, it may
+		 * also have supplied a selectivity estimate.  In that case the number
+		 * of returned tuples can't exceed the number of fetched tuples.
+		 */
+		indexSelectivity = costs->indexSelectivity;
 
 	/*
 	 * We can bound the number of tuples by the index size in any case.
@@ -6258,6 +6266,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	bool		found_is_null_op;
 	double		num_sa_scans;
 	ListCell   *lc;
+	Selectivity btreeSelectivity;
 
 	/*
 	 * For a btree scan, only leading '=' quals plus inequality quals for the
@@ -6362,19 +6371,23 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	/*
 	 * If index is unique and we found an '=' clause for each column, we can
 	 * just assume numIndexTuples = 1 and skip the expensive
-	 * clauselist_selectivity calculations.  However, a ScalarArrayOp or
+	 * clauselist_selectivity calculations. However, a ScalarArrayOp or
 	 * NullTest invalidates that theory, even though it sets eqQualHere.
+	 * The value of btreeSelectivity is used as an upper bound for the
+	 * selectivity of returned tuples in the genericcostestimate routine.
 	 */
 	if (index->unique &&
 		indexcol == index->nkeycolumns - 1 &&
 		eqQualHere &&
 		!found_saop &&
 		!found_is_null_op)
+	{
 		numIndexTuples = 1.0;
+		btreeSelectivity = 1. / index->rel->tuples;
+	}
 	else
 	{
 		List	   *selectivityQuals;
-		Selectivity btreeSelectivity;
 
 		/*
 		 * If the index is partial, AND the index predicate with the
@@ -6402,6 +6415,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	 */
 	MemSet(&costs, 0, sizeof(costs));
 	costs.numIndexTuples = numIndexTuples;
+	costs.indexSelectivity = btreeSelectivity;
 
 	genericcostestimate(root, path, loop_count, &costs);
 
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 7524e65142..b90463821f 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -1602,3 +1602,41 @@ NOTICE:  drop cascades to 2 other objects
 DETAIL:  drop cascades to table tststats.priv_test_tbl
 drop cascades to view tststats.priv_test_view
 DROP USER regress_stats_user1;
+-- Reproduction of the problem with picking of suboptimal index.
+SET enable_bitmapscan = 'off';
+-- Table with specific distribution of values
+CREATE TABLE tbl42 AS (
+	SELECT
+		gs % 10 AS x,
+		(gs % 10 + (gs/10::int4) % 10) % 10 AS y,
+		(gs / 100)::int4 AS z
+	FROM generate_series(1,1000) AS gs
+);
+INSERT INTO tbl42 (
+	SELECT gs,gs,gs FROM generate_series(1000,2000) AS gs
+);
+CREATE UNIQUE INDEX good ON tbl42 (x,y,z);
+CREATE INDEX bad ON tbl42(x);
+ANALYZE tbl42;
+-- Optimizer picks the optimal unique index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Only Scan using good on tbl42
+   Index Cond: ((x = 1) AND (y = 1) AND (z = 1))
+(2 rows)
+
+CREATE STATISTICS aestat(dependencies,ndistinct) ON x,y,z FROM tbl42;
+ANALYZE tbl42;
+-- With extended statistics, optimizer still picks the optimal index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Only Scan using good on tbl42
+   Index Cond: ((x = 1) AND (y = 1) AND (z = 1))
+(2 rows)
+
+-- Clean up
+DROP TABLE tbl42 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 906503bd0b..f274f53996 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -877,3 +877,37 @@ DROP FUNCTION op_leak(int, int);
 RESET SESSION AUTHORIZATION;
 DROP SCHEMA tststats CASCADE;
 DROP USER regress_stats_user1;
+
+
+-- Reproduction of the problem with picking of suboptimal index.
+SET enable_bitmapscan = 'off';
+
+-- Table with specific distribution of values
+CREATE TABLE tbl42 AS (
+	SELECT
+		gs % 10 AS x,
+		(gs % 10 + (gs/10::int4) % 10) % 10 AS y,
+		(gs / 100)::int4 AS z
+	FROM generate_series(1,1000) AS gs
+);
+INSERT INTO tbl42 (
+	SELECT gs,gs,gs FROM generate_series(1000,2000) AS gs
+);
+
+CREATE UNIQUE INDEX good ON tbl42 (x,y,z);
+CREATE INDEX bad ON tbl42(x);
+ANALYZE tbl42;
+
+-- Optimizer picks the optimal unique index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+
+CREATE STATISTICS aestat(dependencies,ndistinct) ON x,y,z FROM tbl42;
+ANALYZE tbl42;
+
+-- With extended statistics, optimizer still picks the optimal index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+
+-- Clean up
+DROP TABLE tbl42 CASCADE;
-- 
2.33.0