From 41ce6007cd552afd1a73983f0b9c9cac0e125d58 Mon Sep 17 00:00:00 2001
From: "Andrey V. Lepikhov"
Date: Mon, 30 Aug 2021 11:21:57 +0500
Subject: [PATCH] Estimating the number of fetched rows in a btree index

While estimating the number of fetched rows in a btree index, save the
selectivity estimate in the costs structure. It will be used by the
genericcostestimate routine as an upper bound on the estimated number of
total tuples visited in the main table.

This code fixes a problem with unique indexes: there we know for sure
that no more than one tuple can be fetched, but clauselist_selectivity
may give us a much less accurate estimate for many possible reasons.

A regression test is added as a demonstration of the problem.
---
 src/backend/utils/adt/selfuncs.c        | 18 ++++++++++--
 src/test/regress/expected/stats_ext.out | 38 +++++++++++++++++++++++++
 src/test/regress/sql/stats_ext.sql      | 34 ++++++++++++++++++++++
 3 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index c2aeb4b947..dd1cadad61 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -6074,6 +6074,14 @@ genericcostestimate(PlannerInfo *root,
 		 */
 		numIndexTuples = rint(numIndexTuples / num_sa_scans);
 	}
+	else if (costs->indexSelectivity > 0. &&
+			 indexSelectivity > costs->indexSelectivity)
+		/*
+		 * If the caller gave us an estimate of fetched index tuples, it may
+		 * also have supplied a selectivity estimate.  In that case the number
+		 * of returned tuples can't exceed the number of fetched tuples.
+		 */
+		indexSelectivity = costs->indexSelectivity;
 
 	/*
 	 * We can bound the number of tuples by the index size in any case.
@@ -6258,6 +6266,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	bool		found_is_null_op;
 	double		num_sa_scans;
 	ListCell   *lc;
+	Selectivity btreeSelectivity;
 
 	/*
 	 * For a btree scan, only leading '=' quals plus inequality quals for the
@@ -6362,19 +6371,23 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	/*
 	 * If index is unique and we found an '=' clause for each column, we can
 	 * just assume numIndexTuples = 1 and skip the expensive
-	 * clauselist_selectivity calculations.  However, a ScalarArrayOp or
+	 * clauselist_selectivity calculations. However, a ScalarArrayOp or
 	 * NullTest invalidates that theory, even though it sets eqQualHere.
+	 * The value of btreeSelectivity is used as an upper bound for the
+	 * selectivity of returned tuples in the genericcostestimate routine.
 	 */
 	if (index->unique &&
 		indexcol == index->nkeycolumns - 1 &&
 		eqQualHere &&
 		!found_saop &&
 		!found_is_null_op)
+	{
 		numIndexTuples = 1.0;
+		btreeSelectivity = 1. / index->rel->tuples;
+	}
 	else
 	{
 		List	   *selectivityQuals;
-		Selectivity btreeSelectivity;
 
 		/*
 		 * If the index is partial, AND the index predicate with the
@@ -6402,6 +6415,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	 */
 	MemSet(&costs, 0, sizeof(costs));
 	costs.numIndexTuples = numIndexTuples;
+	costs.indexSelectivity = btreeSelectivity;
 
 	genericcostestimate(root, path, loop_count, &costs);
 
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 7524e65142..b90463821f 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -1602,3 +1602,41 @@ NOTICE:  drop cascades to 2 other objects
 DETAIL:  drop cascades to table tststats.priv_test_tbl
 drop cascades to view tststats.priv_test_view
 DROP USER regress_stats_user1;
+-- Reproduction of the problem with picking of suboptimal index.
+SET enable_bitmapscan = 'off';
+-- Table with specific distribution of values
+CREATE TABLE tbl42 AS (
+	SELECT
+		gs % 10 AS x,
+		(gs % 10 + (gs/10::int4) % 10) % 10 AS y,
+		(gs / 100)::int4 AS z
+	FROM generate_series(1,1000) AS gs
+);
+INSERT INTO tbl42 (
+	SELECT gs,gs,gs FROM generate_series(1000,2000) AS gs
+);
+CREATE UNIQUE INDEX good ON tbl42 (x,y,z);
+CREATE INDEX bad ON tbl42(x);
+ANALYZE tbl42;
+-- Optimizer picks the optimal unique index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Only Scan using good on tbl42
+   Index Cond: ((x = 1) AND (y = 1) AND (z = 1))
+(2 rows)
+
+CREATE STATISTICS aestat(dependencies,ndistinct) ON x,y,z FROM tbl42;
+ANALYZE tbl42;
+-- With extended statistics, optimizer still picks the optimal index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+                   QUERY PLAN                    
+-------------------------------------------------
+ Index Only Scan using good on tbl42
+   Index Cond: ((x = 1) AND (y = 1) AND (z = 1))
+(2 rows)
+
+-- Clean up
+DROP TABLE tbl42 CASCADE;
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 906503bd0b..f274f53996 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -877,3 +877,37 @@ DROP FUNCTION op_leak(int, int);
 RESET SESSION AUTHORIZATION;
 DROP SCHEMA tststats CASCADE;
 DROP USER regress_stats_user1;
+
+
+-- Reproduction of the problem with picking of suboptimal index.
+SET enable_bitmapscan = 'off';
+
+-- Table with specific distribution of values
+CREATE TABLE tbl42 AS (
+	SELECT
+		gs % 10 AS x,
+		(gs % 10 + (gs/10::int4) % 10) % 10 AS y,
+		(gs / 100)::int4 AS z
+	FROM generate_series(1,1000) AS gs
+);
+INSERT INTO tbl42 (
+	SELECT gs,gs,gs FROM generate_series(1000,2000) AS gs
+);
+
+CREATE UNIQUE INDEX good ON tbl42 (x,y,z);
+CREATE INDEX bad ON tbl42(x);
+ANALYZE tbl42;
+
+-- Optimizer picks the optimal unique index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+
+CREATE STATISTICS aestat(dependencies,ndistinct) ON x,y,z FROM tbl42;
+ANALYZE tbl42;
+
+-- With extended statistics, optimizer still picks the optimal index
+EXPLAIN (COSTS OFF)
+SELECT * FROM tbl42 WHERE x=1 AND y=1 AND z=1;
+
+-- Clean up
+DROP TABLE tbl42 CASCADE;
-- 
2.33.0