From 145c56ff65785eb559e720f3c04b908a57c61940 Mon Sep 17 00:00:00 2001
From: Enrique Sanchez Cardoso <enriqueesanchz@gmail.com>
Date: Sun, 24 May 2026 19:09:26 +0200
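Subject: [PATCH 4/4] Extend multi-column MCV cap to AND clauses inside OR
 expressions

Factor the cap computation out of mcv_clauselist_selectivity() into a new
helper, mcv_compute_cap(), and use it from mcv_clause_selectivity_or() as
well.  When an OR argument is an AND clause, its cap is returned through a
new clause_cap output parameter (1.0, meaning no cap, otherwise), and
statext_mcv_clauselist_selectivity() clamps the combined estimate of a
non-simple clause to that value.

Add regression tests for a partial and for no MCV match inside an OR clause.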
---
 src/backend/statistics/extended_stats.c       | 12 +++-
 src/backend/statistics/mcv.c                  | 68 +++++++++++++++----
 .../statistics/extended_stats_internal.h      |  3 +-
 src/test/regress/expected/stats_ext.out       | 16 ++++-
 src/test/regress/sql/stats_ext.sql            |  8 ++-
 5 files changed, 90 insertions(+), 17 deletions(-)

diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index da6f6315698..fc6f7905bb3 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1915,7 +1915,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 							overlap_basesel,
 							mcv_totalsel,
 							clause_sel,
-							overlap_sel;
+							overlap_sel,
+							clause_cap;
 
 				/*
 				 * "Simple" selectivity of the next clause and its overlap
@@ -1945,7 +1946,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 													&mcv_basesel,
 													&overlap_mcvsel,
 													&overlap_basesel,
-													&mcv_totalsel);
+													&mcv_totalsel,
+													&clause_cap);
 
 				/*
 				 * Combine the simple and multi-column estimates.
@@ -1959,11 +1961,17 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 				if (bms_is_member(listidx, simple_clauses))
 					clause_sel = simple_sel;
 				else
+				{
 					clause_sel = mcv_combine_selectivities(simple_sel,
 														   mcv_sel,
 														   mcv_basesel,
 														   mcv_totalsel);
 
+					/* Cap the contribution of values not found in the MCV. */
+					if (clause_sel > clause_cap)
+						clause_sel = clause_cap;
+				}
+
 				overlap_sel = mcv_combine_selectivities(overlap_simple_sel,
 														overlap_mcvsel,
 														overlap_basesel,
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 62761c58e33..df75087391c 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -1577,6 +1577,40 @@ mcv_cap_multiplier(List *clauses)
 	return multiplier;
 }
 
+/*
+ * mcv_compute_cap
+ *		Upper bound on selectivity derived from the least common MCV item.
+ *
+ * With one equality/IN clause per MCV dimension, any value combination
+ * missing from the MCV can't be more frequent than the last (least common)
+ * tracked item.  The cap is mcv_sel plus the number of non-MCV combinations
+ * times that item's frequency, passed through CLAMP_PROBABILITY.
+ *
+ * Returns 1.0 (no cap) when the clause count differs from the number of MCV
+ * dimensions, or when mcv_cap_multiplier() does not exceed matched_count.
+ */
+static Selectivity
+mcv_compute_cap(MCVList *mcv, List *clauses, Selectivity mcv_sel,
+					 int64 matched_count)
+{
+	int64		total;
+	int64		unmatched;
+	Selectivity result;
+
+	if (mcv->ndimensions != list_length(clauses))
+		return 1.0;
+
+	total = mcv_cap_multiplier(clauses);
+	unmatched = total - matched_count;
+	if (unmatched <= 0)
+		return 1.0;
+
+	/* each unmatched combination is bounded by the last MCV item */
+	result = mcv_sel + unmatched * mcv->items[mcv->nitems - 1].frequency;
+	CLAMP_PROBABILITY(result);
+	return result;
+}
+
 /*
  * match the attribute/expression to a dimension of the statistic
  *
@@ -2144,17 +2178,7 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 	 * combination is not among the most common, so it can't be more frequent
 	 * than the least common tracked combination.
 	 */
-	if (mcv->ndimensions == list_length(clauses))
-	{
-		int64		cap_mult = mcv_cap_multiplier(clauses);
-		int64		non_mcv_mult = cap_mult - matched_count;
-
-		if (non_mcv_mult > 0)
-		{
-			*cap = s + non_mcv_mult * mcv->items[mcv->nitems - 1].frequency;
-			CLAMP_PROBABILITY(*cap);
-		}
-	}
+	*cap = mcv_compute_cap(mcv, clauses, s, matched_count);
 
 	return s;
 }
@@ -2202,11 +2226,16 @@ Selectivity
 mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
 						  MCVList *mcv, Node *clause, bool **or_matches,
 						  Selectivity *basesel, Selectivity *overlap_mcvsel,
-						  Selectivity *overlap_basesel, Selectivity *totalsel)
+						  Selectivity *overlap_basesel, Selectivity *totalsel,
+						  Selectivity *clause_cap)
 {
 	Selectivity s = 0.0;
 	bool	   *new_matches;
 	int			i;
+	int64		matched_count = 0;
+
+	/* default: no cap on clause selectivity */
+	*clause_cap = 1.0;
 
 	/* build the OR-matches bitmap, if not built already */
 	if (*or_matches == NULL)
@@ -2233,6 +2262,7 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
 		{
 			s += mcv->items[i].frequency;
 			*basesel += mcv->items[i].base_frequency;
+			matched_count++;
 
 			if ((*or_matches)[i])
 			{
@@ -2247,6 +2277,20 @@ mcv_clause_selectivity_or(PlannerInfo *root, StatisticExtInfo *stat,
 
 	pfree(new_matches);
 
+	/*
+	 * When this OR argument is an AND clause with one equality/IN clause per
+	 * MCV dimension, cap the contribution of value combinations not found in
+	 * the MCV.  Each such combination is not among the most common, so it
+	 * can't be more frequent than the least common tracked combination.
+	 */
+	if (is_andclause(clause))
+	{
+		BoolExpr   *bexpr = (BoolExpr *) clause;
+
+		*clause_cap = mcv_compute_cap(mcv, bexpr->args, s,
+												matched_count);
+	}
+
 	return s;
 }
 
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 01b5f67b843..10f41f87564 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -140,6 +140,7 @@ extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root,
 											 Selectivity *basesel,
 											 Selectivity *overlap_mcvsel,
 											 Selectivity *overlap_basesel,
-											 Selectivity *totalsel);
+											 Selectivity *totalsel,
+											 Selectivity *clause_cap);
 
 #endif							/* EXTENDED_STATS_INTERNAL_H */
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index 7ea244f7851..c712679b573 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -2931,7 +2931,7 @@ DROP TABLE mcv_lists_partial;
 -- P(a=0)=0.5 and P(b=0)=0.5, so the independence estimate is 0.25 * N.
 -- After building MCV statistics the cap limits the combined estimate to the
 -- least-common MCV frequency, eliminating most of the over-estimation.
-CREATE TABLE mcv_cap (a INT, b INT) WITH (autovacuum_enabled = off);
+CREATE TABLE mcv_cap (a INT, b INT, c INT DEFAULT 0) WITH (autovacuum_enabled = off);
 INSERT INTO mcv_cap
     SELECT 0, b FROM generate_series(1, 99) b, generate_series(1, 100) r;
 INSERT INTO mcv_cap
@@ -2979,6 +2979,20 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (
        200 |      0
 (1 row)
 
+-- partial MCV match inside OR (a=0, b=99)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE c = 1 OR (a = 0 AND b IN (0, 99))');
+ estimated | actual 
+-----------+--------
+       200 |    100
+(1 row)
+
+-- no MCV match inside OR
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE c = 1 OR (a = 0 AND b = 0)');
+ estimated | actual 
+-----------+--------
+       100 |      0
+(1 row)
+
 DROP TABLE mcv_cap;
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index 8e0b8c0eb5c..926dfaa4e6d 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1468,7 +1468,7 @@ DROP TABLE mcv_lists_partial;
 -- P(a=0)=0.5 and P(b=0)=0.5, so the independence estimate is 0.25 * N.
 -- After building MCV statistics the cap limits the combined estimate to the
 -- least-common MCV frequency, eliminating most of the over-estimation.
-CREATE TABLE mcv_cap (a INT, b INT) WITH (autovacuum_enabled = off);
+CREATE TABLE mcv_cap (a INT, b INT, c INT DEFAULT 0) WITH (autovacuum_enabled = off);
 
 INSERT INTO mcv_cap
     SELECT 0, b FROM generate_series(1, 99) b, generate_series(1, 100) r;
@@ -1497,6 +1497,12 @@ SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (
 -- no MCV match
 SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE a = 0 AND b IN (0, 100)');
 
+-- partial MCV match inside OR (a=0, b=99)
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE c = 1 OR (a = 0 AND b IN (0, 99))');
+
+-- no MCV match inside OR
+SELECT * FROM check_estimated_rows('SELECT * FROM mcv_cap WHERE c = 1 OR (a = 0 AND b = 0)');
+
 DROP TABLE mcv_cap;
 
 -- check the ability to use multiple MCV lists
-- 
2.43.0

