From 97d02920b87d662c94cd5367ba38fa752789d5e4 Mon Sep 17 00:00:00 2001 From: Chengpeng Yan Date: Sat, 27 Jun 2026 12:48:23 +0800 Subject: [PATCH v2] Improve estimates for multicolumn unique indexes Clamp base and parameterized base relation row estimates when full unique-key equality clauses prove a one-row upper bound. Use the existing unique-index proof machinery to recognize that bound for base restrictions and parameterized clauses. Keep B-tree costing's separate tuple-fetch estimate consistent with the same proof by clamping index selectivity for full unique equality lookups. Require immediate unique enforcement for that shortcut, since deferrable unique constraints cannot provide a hard one-row proof during planning. Add planner_est coverage for the relation row-estimate and B-tree costing changes, including parameterized lookups and cases where uniqueness must not imply a one-row bound. --- src/backend/optimizer/path/costsize.c | 7 ++ src/backend/optimizer/path/indxpath.c | 48 ++++++++++ src/backend/utils/adt/selfuncs.c | 19 +++- src/include/optimizer/paths.h | 3 + src/test/regress/expected/planner_est.out | 112 ++++++++++++++++++++++ src/test/regress/sql/planner_est.sql | 81 ++++++++++++++++ 6 files changed, 268 insertions(+), 2 deletions(-) diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 1c575e56ff6..b8aca969f2d 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -5504,6 +5504,10 @@ set_baserel_size_estimates(PlannerInfo *root, RelOptInfo *rel) JOIN_INNER, NULL); + if (nrows > 1.0 && + relation_has_unique_index_for_clauses(root, rel, NIL)) + nrows = 1.0; + rel->rows = clamp_row_est(nrows); cost_qual_eval(&rel->baserestrictcost, rel->baserestrictinfo, root); @@ -5539,6 +5543,9 @@ get_parameterized_baserel_size(PlannerInfo *root, RelOptInfo *rel, rel->relid, /* do not use 0! 
*/ JOIN_INNER, NULL); + if (nrows > 1.0 && + relation_has_unique_index_for_clauses(root, rel, param_clauses)) + nrows = 1.0; nrows = clamp_row_est(nrows); /* For safety, make sure result is not more than the base estimate */ if (nrows > rel->rows) diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 3f5d4fa3182..ee3ed219070 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -4285,6 +4285,54 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, return false; } +/* + * relation_has_unique_index_for_clauses + * Determine whether a relation has at most one row satisfying the given + * clauses because they constrain all columns of some unique index. + * + * The given clauses are additional clauses to use for the proof; only + * mergejoinable clauses between this rel and other rels are used. The rel's + * baserestrictinfo is considered by relation_has_unique_index_for() itself. + */ +bool +relation_has_unique_index_for_clauses(PlannerInfo *root, RelOptInfo *rel, + List *clauses) +{ + List *restrictlist = NIL; + ListCell *lc; + + /* Quick checks before doing any clause processing. */ + if (rel->reloptkind != RELOPT_BASEREL || + rel->rtekind != RTE_RELATION || + rel->indexlist == NIL) + return false; + + foreach(lc, clauses) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, lc); + Relids outerrelids; + + if (rinfo->mergeopfamilies == NIL) + continue; + + /* Restriction clauses are picked up below from baserestrictinfo. 
*/ + if (bms_is_subset(rinfo->clause_relids, rel->relids)) + continue; + + if (!is_opclause(rinfo->clause) || + list_length(((OpExpr *) rinfo->clause)->args) != 2) + continue; + + outerrelids = bms_difference(rinfo->clause_relids, rel->relids); + if (!bms_is_empty(outerrelids) && + clause_sides_match_join(rinfo, outerrelids, rel->relids)) + restrictlist = lappend(restrictlist, rinfo); + bms_free(outerrelids); + } + + return relation_has_unique_index_for(root, rel, restrictlist, NULL); +} + /* * indexcol_is_bool_constant_for_query * diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index d6efd07073a..3c69119636b 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -7719,6 +7719,7 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, bool found_row_compare; bool found_array; bool found_is_null_op; + bool unique_full_eq = false; bool have_correlation = false; double num_sa_scans; double correlation = 0.0; @@ -8026,17 +8027,21 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, } /* - * If index is unique and we found an '=' clause for each column, we can - * just assume numIndexTuples = 1 and skip the expensive + * If index is immediately unique and we found an '=' clause for each + * column, we can just assume numIndexTuples = 1 and skip the expensive * clauselist_selectivity calculations. However, an array or NullTest * always invalidates that theory (even when eqQualHere has been set). */ if (index->unique && + index->immediate && indexcol == index->nkeycolumns - 1 && eqQualHere && !found_array && !found_is_null_op) + { numIndexTuples = 1.0; + unique_full_eq = true; + } else { List *selectivityQuals; @@ -8120,6 +8125,16 @@ btcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, genericcostestimate(root, path, loop_count, &costs); + /* + * genericcostestimate() derives indexSelectivity from the qual list. 
For + * a full equality lookup on an immediate unique index, the scan cannot + * fetch more than one heap tuple, so keep heap-fetch costing consistent + * with the numIndexTuples shortcut above. + */ + if (unique_full_eq && index->rel->tuples >= 1.0) + costs.indexSelectivity = + Min(costs.indexSelectivity, 1.0 / index->rel->tuples); + /* * Add a CPU-cost component to represent the costs of initial btree * descent. We don't charge any I/O cost for touching upper btree levels, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 17f2099ec3b..6dfdf4d5624 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -83,6 +83,9 @@ extern void create_index_paths(PlannerInfo *root, RelOptInfo *rel); extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, List *restrictlist, List **extra_clauses); +extern bool relation_has_unique_index_for_clauses(PlannerInfo *root, + RelOptInfo *rel, + List *clauses); extern bool indexcol_is_bool_constant_for_query(PlannerInfo *root, IndexOptInfo *index, int indexcol); diff --git a/src/test/regress/expected/planner_est.out b/src/test/regress/expected/planner_est.out index b62a47552fa..489e66a88c4 100644 --- a/src/test/regress/expected/planner_est.out +++ b/src/test/regress/expected/planner_est.out @@ -210,4 +210,116 @@ false, true, false, true); -> Result (cost=N..N rows=1 width=N) (4 rows) +-- Check that row estimation for multi-column unique indexes yields 1. +CREATE TABLE multi_column_unique_test_table (a int, b int) WITH (autovacuum_enabled=false); +CREATE UNIQUE INDEX multi_column_unique ON multi_column_unique_test_table (a, b); +INSERT INTO multi_column_unique_test_table(a, b) SELECT 1, i FROM generate_series(1,10) as g(i); +INSERT INTO multi_column_unique_test_table(a, b) SELECT i, 1 FROM generate_series(2,10) as g(i); +ANALYZE multi_column_unique_test_table; +-- The relation-level estimate should be clamped even for a plain seq scan. 
+set enable_indexscan to false; +set enable_indexonlyscan to false; +set enable_bitmapscan to false; +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_test_table WHERE a=1 AND b=1;$$, +false, true, false, true); + explain_mask_costs +------------------------------------------------------------------------ + Seq Scan on multi_column_unique_test_table (cost=N..N rows=1 width=N) + Filter: ((a = 1) AND (b = 1)) +(2 rows) + +reset enable_indexscan; +reset enable_indexonlyscan; +reset enable_bitmapscan; +-- Covering only part of a multi-column unique key should not be clamped. +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_test_table WHERE a=1;$$, +false, true, false, true); + explain_mask_costs +------------------------------------------------------------------------- + Seq Scan on multi_column_unique_test_table (cost=N..N rows=10 width=N) + Filter: (a = 1) +(2 rows) + +-- NULL values in a regular unique index are not equal, so IS NULL does not +-- prove a one-row upper bound. +CREATE TABLE multi_column_unique_null_test (a int, b int) WITH (autovacuum_enabled=false); +CREATE UNIQUE INDEX multi_column_unique_null ON multi_column_unique_null_test (a, b); +INSERT INTO multi_column_unique_null_test(a, b) SELECT 1, NULL FROM generate_series(1,20); +INSERT INTO multi_column_unique_null_test(a, b) SELECT 2, i FROM generate_series(1,10) as g(i); +ANALYZE multi_column_unique_null_test; +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_null_test WHERE a=1 AND b IS NULL;$$, +false, true, false, true); + explain_mask_costs +------------------------------------------------------------------------ + Seq Scan on multi_column_unique_null_test (cost=N..N rows=13 width=N) + Filter: ((b IS NULL) AND (a = 1)) +(2 rows) + +-- A deferrable unique constraint is not a planner proof that only one row can +-- match, because uniqueness may be transiently violated inside a transaction. 
+CREATE TABLE multi_column_unique_deferred (a int, b int) WITH (autovacuum_enabled=false); +ALTER TABLE multi_column_unique_deferred + ADD CONSTRAINT multi_column_unique_deferred_key UNIQUE (a, b) + DEFERRABLE INITIALLY IMMEDIATE; +INSERT INTO multi_column_unique_deferred(a, b) SELECT 1, i FROM generate_series(1,10) as g(i); +INSERT INTO multi_column_unique_deferred(a, b) SELECT i, 1 FROM generate_series(2,10) as g(i); +ANALYZE multi_column_unique_deferred; +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_deferred WHERE a=1 AND b=1;$$, +false, true, false, true); + explain_mask_costs +---------------------------------------------------------------------- + Seq Scan on multi_column_unique_deferred (cost=N..N rows=5 width=N) + Filter: ((a = 1) AND (b = 1)) +(2 rows) + +-- The B-tree index path should also expose a one-row output estimate. +set enable_seqscan to false; +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_test_table WHERE a=1 AND b=1;$$, +false, true, false, true); + explain_mask_costs +--------------------------------------------------------------------------------------------------------- + Index Only Scan using multi_column_unique on multi_column_unique_test_table (cost=N..N rows=1 width=N) + Index Cond: ((a = 1) AND (b = 1)) +(2 rows) + +reset enable_seqscan; +CREATE TABLE multi_column_unique_outer (b int) WITH (autovacuum_enabled=false); +INSERT INTO multi_column_unique_outer(b) SELECT i FROM generate_series(1,3) as g(i); +ANALYZE multi_column_unique_outer; +-- Base restriction plus a parameterized join clause can cover a multi-column +-- unique key for each outer row. 
+set enable_hashjoin to false; +set enable_mergejoin to false; +set enable_seqscan to false; +set enable_bitmapscan to false; +SELECT explain_mask_costs($$ +SELECT * +FROM multi_column_unique_outer o +CROSS JOIN LATERAL ( + SELECT * + FROM multi_column_unique_test_table t + WHERE t.a=1 AND t.b=o.b + OFFSET 0 +) t;$$, +false, true, false, true); + explain_mask_costs +----------------------------------------------------------------------------------------------------------------- + Nested Loop (cost=N..N rows=3 width=N) + -> Seq Scan on multi_column_unique_outer o (cost=N..N rows=3 width=N) + Disabled: true + -> Index Only Scan using multi_column_unique on multi_column_unique_test_table t (cost=N..N rows=1 width=N) + Index Cond: ((a = 1) AND (b = o.b)) +(5 rows) + +reset enable_hashjoin; +reset enable_mergejoin; +reset enable_seqscan; +reset enable_bitmapscan; +DROP TABLE multi_column_unique_outer, multi_column_unique_deferred, + multi_column_unique_null_test, multi_column_unique_test_table; DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool); diff --git a/src/test/regress/sql/planner_est.sql b/src/test/regress/sql/planner_est.sql index 53210d5baad..56aa134da38 100644 --- a/src/test/regress/sql/planner_est.sql +++ b/src/test/regress/sql/planner_est.sql @@ -147,4 +147,85 @@ SELECT explain_mask_costs($$ SELECT * FROM tenk1 WHERE unique1 <> ALL (ARRAY[1, 2, 98, (SELECT 99), NULL]);$$, false, true, false, true); +-- Check that row estimation for multi-column unique indexes yields 1. 
+CREATE TABLE multi_column_unique_test_table (a int, b int) WITH (autovacuum_enabled=false); +CREATE UNIQUE INDEX multi_column_unique ON multi_column_unique_test_table (a, b); +INSERT INTO multi_column_unique_test_table(a, b) SELECT 1, i FROM generate_series(1,10) as g(i); +INSERT INTO multi_column_unique_test_table(a, b) SELECT i, 1 FROM generate_series(2,10) as g(i); +ANALYZE multi_column_unique_test_table; + +-- The relation-level estimate should be clamped even for a plain seq scan. +set enable_indexscan to false; +set enable_indexonlyscan to false; +set enable_bitmapscan to false; +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_test_table WHERE a=1 AND b=1;$$, +false, true, false, true); +reset enable_indexscan; +reset enable_indexonlyscan; +reset enable_bitmapscan; + +-- Covering only part of a multi-column unique key should not be clamped. +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_test_table WHERE a=1;$$, +false, true, false, true); + +-- NULL values in a regular unique index are not equal, so IS NULL does not +-- prove a one-row upper bound. +CREATE TABLE multi_column_unique_null_test (a int, b int) WITH (autovacuum_enabled=false); +CREATE UNIQUE INDEX multi_column_unique_null ON multi_column_unique_null_test (a, b); +INSERT INTO multi_column_unique_null_test(a, b) SELECT 1, NULL FROM generate_series(1,20); +INSERT INTO multi_column_unique_null_test(a, b) SELECT 2, i FROM generate_series(1,10) as g(i); +ANALYZE multi_column_unique_null_test; +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_null_test WHERE a=1 AND b IS NULL;$$, +false, true, false, true); + +-- A deferrable unique constraint is not a planner proof that only one row can +-- match, because uniqueness may be transiently violated inside a transaction. 
+CREATE TABLE multi_column_unique_deferred (a int, b int) WITH (autovacuum_enabled=false); +ALTER TABLE multi_column_unique_deferred + ADD CONSTRAINT multi_column_unique_deferred_key UNIQUE (a, b) + DEFERRABLE INITIALLY IMMEDIATE; +INSERT INTO multi_column_unique_deferred(a, b) SELECT 1, i FROM generate_series(1,10) as g(i); +INSERT INTO multi_column_unique_deferred(a, b) SELECT i, 1 FROM generate_series(2,10) as g(i); +ANALYZE multi_column_unique_deferred; +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_deferred WHERE a=1 AND b=1;$$, +false, true, false, true); + +-- The B-tree index path should also expose a one-row output estimate. +set enable_seqscan to false; +SELECT explain_mask_costs($$ +SELECT * FROM multi_column_unique_test_table WHERE a=1 AND b=1;$$, +false, true, false, true); +reset enable_seqscan; + +CREATE TABLE multi_column_unique_outer (b int) WITH (autovacuum_enabled=false); +INSERT INTO multi_column_unique_outer(b) SELECT i FROM generate_series(1,3) as g(i); +ANALYZE multi_column_unique_outer; + +-- Base restriction plus a parameterized join clause can cover a multi-column +-- unique key for each outer row. +set enable_hashjoin to false; +set enable_mergejoin to false; +set enable_seqscan to false; +set enable_bitmapscan to false; +SELECT explain_mask_costs($$ +SELECT * +FROM multi_column_unique_outer o +CROSS JOIN LATERAL ( + SELECT * + FROM multi_column_unique_test_table t + WHERE t.a=1 AND t.b=o.b + OFFSET 0 +) t;$$, +false, true, false, true); +reset enable_hashjoin; +reset enable_mergejoin; +reset enable_seqscan; +reset enable_bitmapscan; + +DROP TABLE multi_column_unique_outer, multi_column_unique_deferred, + multi_column_unique_null_test, multi_column_unique_test_table; DROP FUNCTION explain_mask_costs(text, bool, bool, bool, bool); -- 2.50.1 (Apple Git-155)