From 42a36e6d98d7642e87500de8f138e1f54160fe55 Mon Sep 17 00:00:00 2001
From: Enrique Sanchez Cardoso <enriqueesanchz@gmail.com>
Date: Sun, 24 May 2026 01:03:14 +0200
Subject: [PATCH 1/4] Cap selectivity when values are not in multi-column mcv

When AND-ed equality clauses cover every dimension of a multi-column MCV
list but match none of its items, the estimate cannot exceed the frequency
of the last (least common) MCV item, so cap the selectivity at that value.
---
 src/backend/statistics/extended_stats.c       | 11 +++-
 src/backend/statistics/mcv.c                  | 43 ++++++++++++++-
 .../statistics/extended_stats_internal.h      |  3 +-
 src/test/regress/expected/stats_ext.out       | 52 +++++++++++++++++++
 src/test/regress/sql/stats_ext.sql            | 34 ++++++++++++
 5 files changed, 140 insertions(+), 3 deletions(-)

diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 2b83355d26e..f8c38653bf9 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1989,6 +1989,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 						mcv_sel,
 						mcv_basesel,
 						mcv_totalsel,
+						mcv_cap,
 						stat_sel;
 
 			/*
@@ -2006,7 +2007,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 			mcv_sel = mcv_clauselist_selectivity(root, stat, stat_clauses,
 												 varRelid, jointype, sjinfo,
 												 rel, &mcv_basesel,
-												 &mcv_totalsel);
+												 &mcv_totalsel,
+												 &mcv_cap);
 
 			/* Combine the simple and multi-column estimates. */
 			stat_sel = mcv_combine_selectivities(simple_sel,
@@ -2014,6 +2016,13 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 												 mcv_basesel,
 												 mcv_totalsel);
 
+			/*
+			 * Apply the cap from mcv_clauselist_selectivity(): 1.0 (no-op) by
+			 * default, or the least common MCV frequency if nothing matched.
+			 */
+			if (stat_sel > mcv_cap)
+				stat_sel = mcv_cap;
+
 			/* Factor this into the overall result */
 			sel *= stat_sel;
 		}
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 0b7da605a4c..df70d00cc3d 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -24,6 +24,7 @@
 #include "statistics/statistics.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
+#include "utils/fmgroids.h"
 #include "utils/fmgrprotos.h"
 #include "utils/lsyscache.h"
 #include "utils/selfuncs.h"
@@ -1523,6 +1524,32 @@ pg_mcv_list_send(PG_FUNCTION_ARGS)
 	return byteasend(fcinfo);
 }
 
+/*
+ * mcv_is_all_equality_clauses
+ *		Report whether every clause in the list is an OpExpr whose restriction
+ *		estimator is eqsel, looking through any RestrictInfo wrapper.  This is
+ *		the operator test dependency_is_compatible_clause() applies as well.
+ */
+static bool
+mcv_is_all_equality_clauses(List *clauses)
+{
+	ListCell   *cell;
+
+	foreach(cell, clauses)
+	{
+		Node	   *expr = (Node *) lfirst(cell);
+
+		if (IsA(expr, RestrictInfo))
+			expr = (Node *) ((RestrictInfo *) expr)->clause;
+		if (!is_opclause(expr))
+			return false;		/* not a plain operator expression */
+		if (get_oprrest(((OpExpr *) expr)->opno) != F_EQSEL)
+			return false;		/* operator is not estimated by eqsel */
+	}
+
+	return true;
+}
+
 /*
  * match the attribute/expression to a dimension of the statistic
  *
@@ -2047,7 +2074,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 						   List *clauses, int varRelid,
 						   JoinType jointype, SpecialJoinInfo *sjinfo,
 						   RelOptInfo *rel,
-						   Selectivity *basesel, Selectivity *totalsel)
+						   Selectivity *basesel, Selectivity *totalsel,
+						   Selectivity *cap)
 {
 	int			i;
 	MCVList    *mcv;
@@ -2057,6 +2085,9 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 	/* match/mismatch bitmap for each MCV item */
 	bool	   *matches = NULL;
 
+	/* default: no cap on combined selectivity */
+	*cap = 1.0;
+
 	/* load the MCV list stored in the statistics object */
 	mcv = statext_mcv_load(stat->statOid, rte->inh);
 
@@ -2078,6 +2109,16 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 		}
 	}
 
+	/*
+	 * When no MCV item matched and there is one equality clause per MCV
+	 * dimension, cap the selectivity to the least common MCV frequency. The
+	 * combination is not among the most common, so it can't be more frequent
+	 * than the least common tracked combination.
+	 */
+	if (s == 0.0 && mcv->ndimensions == list_length(clauses) &&
+		mcv_is_all_equality_clauses(clauses))
+		*cap = mcv->items[mcv->nitems - 1].frequency;
+
 	return s;
 }
 
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index c775442f2ee..01b5f67b843 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -129,7 +129,8 @@ extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
 											  SpecialJoinInfo *sjinfo,
 											  RelOptInfo *rel,
 											  Selectivity *basesel,
-											  Selectivity *totalsel);
+											  Selectivity *totalsel,
+											  Selectivity *cap);
 
 extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root,
 											 StatisticExtInfo *stat,
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 37070c1a896..1ca26669bb1 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -2928,6 +2928,58 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0
 (1 row)
 
 DROP TABLE mcv_lists_partial;
+-- P(a=0)=0.5 and P(b=0)=0.5, so the independence estimate is 0.25 * N.
+-- After building MCV statistics the cap limits the combined estimate to the
+-- least-common MCV frequency, eliminating most of the over-estimation.
+CREATE TABLE mcv_cap (a INT, b INT) WITH (autovacuum_enabled = off);
+INSERT INTO mcv_cap
+    SELECT 0, b FROM generate_series(1, 99) b, generate_series(1, 100) r;
+INSERT INTO mcv_cap
+    SELECT a, 0 FROM generate_series(1, 99) a, generate_series(1, 100) r;
+ANALYZE mcv_cap;
+-- without MCV statistics: independence gives 0.5 * 0.5 * 19800 = 4950 rows
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b = 0');
+ estimated | actual 
+-----------+--------
+      4950 |      0
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 99)');
+ estimated | actual 
+-----------+--------
+      5000 |    100
+(1 row)
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 100)');
+ estimated | actual 
+-----------+--------
+      4950 |      0
+(1 row)
+
+CREATE STATISTICS mcv_cap_stats (mcv) ON a, b FROM mcv_cap;
+ANALYZE mcv_cap;
+-- with MCV statistics: bounded by least MCV frequency
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b = 0');
+ estimated | actual 
+-----------+--------
+       100 |      0
+(1 row)
+
+-- IN/ANY equality clauses are not supported, partial MCV match (a=0, b=99)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 99)');
+ estimated | actual 
+-----------+--------
+      5050 |    100
+(1 row)
+
+-- IN/ANY equality clauses are not supported, no MCV match
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 100)');
+ estimated | actual 
+-----------+--------
+      4950 |      0
+(1 row)
+
+DROP TABLE mcv_cap;
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
 	a INTEGER,
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 3cc6012b822..0f67363cd6d 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1465,6 +1465,40 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_lists_partial WHERE (a = 0
 
 DROP TABLE mcv_lists_partial;
 
+-- P(a=0)=0.5 and P(b=0)=0.5, so the independence estimate is 0.25 * N.
+-- After building MCV statistics the cap limits the combined estimate to the
+-- least-common MCV frequency, eliminating most of the over-estimation.
+CREATE TABLE mcv_cap (a INT, b INT) WITH (autovacuum_enabled = off);
+
+INSERT INTO mcv_cap
+    SELECT 0, b FROM generate_series(1, 99) b, generate_series(1, 100) r;
+
+INSERT INTO mcv_cap
+    SELECT a, 0 FROM generate_series(1, 99) a, generate_series(1, 100) r;
+
+ANALYZE mcv_cap;
+
+-- without MCV statistics: independence gives 0.5 * 0.5 * 19800 = 4950 rows
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b = 0');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 99)');
+
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 100)');
+
+CREATE STATISTICS mcv_cap_stats (mcv) ON a, b FROM mcv_cap;
+ANALYZE mcv_cap;
+
+-- with MCV statistics: bounded by least MCV frequency
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b = 0');
+
+-- IN/ANY equality clauses are not supported, partial MCV match (a=0, b=99)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 99)');
+
+-- IN/ANY equality clauses are not supported, no MCV match
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 100)');
+
+DROP TABLE mcv_cap;
+
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
 	a INTEGER,
-- 
2.43.0

