From 517c40a71bd3e35b747fa811205944f6a430bd5a Mon Sep 17 00:00:00 2001
From: Enrique Sanchez Cardoso <enriqueesanchz@gmail.com>
Date: Sun, 7 Jun 2026 15:20:37 +0200
Subject: [PATCH v3 2/2] Use ndistinct to cap non-MCV values

When no MCV item matches the clauses and an ndistinct estimate for the
same columns is available, additionally cap the selectivity at the
uniform share of a single non-MCV combination:

        (1 - mcv_totalsel) / (ndistinct - mcv_nitems)
---
 src/backend/statistics/extended_stats.c       | 89 ++++++++++++++++++-
 src/backend/statistics/mcv.c                  |  4 +-
 .../statistics/extended_stats_internal.h      |  3 +-
 src/test/regress/expected/stats_ext.out       | 21 ++++-
 src/test/regress/sql/stats_ext.sql            | 14 ++-
 5 files changed, 123 insertions(+), 8 deletions(-)

diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index 0359857ad91..ba50bcbaca6 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -1768,6 +1768,64 @@ mcv_can_cap(StatisticExtInfo *stat, Bitmapset *covered_attnums, List *stat_claus
 	return true;
 }
 
+/*
+ * get_ndistinct_for_keys
+ *		Return the ndistinct estimate for exactly the set of columns in keys,
+ *		taken from a STATS_EXT_NDISTINCT object in statlist whose inherit flag
+ *		equals inh and whose keys are a superset of (or equal to) keys.
+ *
+ * Items that involve expressions never match.  Returns -1.0 if no matching
+ * ndistinct statistics object or item is found.
+ */
+static double
+get_ndistinct_for_keys(List *statlist, Bitmapset *keys, bool inh)
+{
+	int			nkeys = bms_num_members(keys);
+	ListCell   *lc;
+
+	foreach(lc, statlist)
+	{
+		StatisticExtInfo *info = (StatisticExtInfo *) lfirst(lc);
+		MVNDistinct *mvnd;
+		int			i;
+
+		if (info->kind != STATS_EXT_NDISTINCT || info->inherit != inh ||
+			!bms_is_subset(keys, info->keys))
+			continue;
+
+		mvnd = statext_ndistinct_load(info->statOid, inh);
+
+		for (i = 0; i < mvnd->nitems; i++)
+		{
+			MVNDistinctItem *item = &mvnd->items[i];
+			int			j;
+
+			if (item->nattributes != nkeys)
+				continue;
+
+			/* Skip expression attnums (< 0); bms_is_member() errors on them. */
+			for (j = 0; j < item->nattributes; j++)
+			{
+				if (item->attributes[j] < 0 ||
+					!bms_is_member(item->attributes[j], keys))
+					break;
+			}
+
+			if (j == item->nattributes)
+			{
+				double		ndistinct = item->ndistinct;
+
+				statext_ndistinct_free(mvnd);
+				return ndistinct;
+			}
+		}
+
+		statext_ndistinct_free(mvnd);
+	}
+
+	return -1.0;
+}
+
 /*
  * statext_mcv_clauselist_selectivity
  *		Estimate clauses using the best multi-column statistics.
@@ -2064,6 +2122,7 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 						mcv_totalsel,
 						mcv_cap,
 						stat_sel;
+			uint32		mcv_nitems;
 
 			/*
 			 * "Simple" selectivity, i.e. without any extended statistics,
@@ -2081,7 +2140,8 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 												 varRelid, jointype, sjinfo,
 												 rel, &mcv_basesel,
 												 &mcv_totalsel,
-												 &mcv_cap);
+												 &mcv_cap,
+												 &mcv_nitems);
 
 			/* Combine the simple and multi-column estimates. */
 			stat_sel = mcv_combine_selectivities(simple_sel,
@@ -2089,9 +2149,30 @@ statext_mcv_clauselist_selectivity(PlannerInfo *root, List *clauses, int varReli
 												 mcv_basesel,
 												 mcv_totalsel);
 
-			/* Cap to the least common MCV item when no MCV items matched. */
-			if (can_cap && stat_sel > mcv_cap)
-				stat_sel = mcv_cap;
+			/* Cap when no MCV items matched (mcv_sel = 0.0). */
+			if (can_cap && mcv_sel == 0.0)
+			{
+				double		ndistinct;
+
+				/* Cap to the least common MCV item. */
+				if (stat_sel > mcv_cap)
+					stat_sel = mcv_cap;
+
+				ndistinct = get_ndistinct_for_keys(rel->statlist, stat->keys, rte->inh);
+
+				if (ndistinct > (double) mcv_nitems)
+				{
+					double		non_mcv_sel = (1.0 - mcv_totalsel) / (ndistinct - (double) mcv_nitems);
+
+					/*
+					 * Cap to uniform distribution among the non-MCV
+					 * combinations. This is similar to what var_eq_const()
+					 * does for single-column MCV stats.
+					 */
+					if (stat_sel > non_mcv_sel)
+						stat_sel = non_mcv_sel;
+				}
+			}
 
 			/* Factor this into the overall result */
 			sel *= stat_sel;
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index 6617c297eab..04a2c430637 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -2048,7 +2048,7 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 						   JoinType jointype, SpecialJoinInfo *sjinfo,
 						   RelOptInfo *rel,
 						   Selectivity *basesel, Selectivity *totalsel,
-						   Selectivity *cap)
+						   Selectivity *cap, uint32 *nitems)
 {
 	int			i;
 	MCVList    *mcv;
@@ -2064,6 +2064,8 @@ mcv_clauselist_selectivity(PlannerInfo *root, StatisticExtInfo *stat,
 	/* load the MCV list stored in the statistics object */
 	mcv = statext_mcv_load(stat->statOid, rte->inh);
 
+	*nitems = mcv->nitems;
+
 	/* build a match bitmap for the clauses */
 	matches = mcv_get_match_bitmap(root, clauses, stat->keys, stat->exprs,
 								   mcv, false);
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 01b5f67b843..1114d2870b2 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -130,7 +130,8 @@ extern Selectivity mcv_clauselist_selectivity(PlannerInfo *root,
 											  RelOptInfo *rel,
 											  Selectivity *basesel,
 											  Selectivity *totalsel,
-											  Selectivity *cap);
+											  Selectivity *cap,
+											  uint32 *nitems);
 
 extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root,
 											 StatisticExtInfo *stat,
diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out
index c87b2d9f9f5..30760efe47c 100644
--- a/src/test/regress/expected/stats_ext.out
+++ b/src/test/regress/expected/stats_ext.out
@@ -2945,7 +2945,7 @@ SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0
       1219 |      0
 (1 row)
 
-CREATE STATISTICS mcv_cap_stats (mcv) ON a, b, c, d FROM mcv_cap;
+CREATE STATISTICS mcv_cap_stats_mcv (mcv) ON a, b, c, d FROM mcv_cap;
 ANALYZE mcv_cap;
 -- MCV
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
@@ -2954,6 +2954,15 @@ SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0
        100 |      0
 (1 row)
 
+CREATE STATISTICS mcv_cap_stats_nd (ndistinct) ON a, b, c, d FROM mcv_cap;
+ANALYZE mcv_cap;
+-- MCV + ndistinct
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+ estimated | actual 
+-----------+--------
+        50 |      0
+(1 row)
+
 -- When a value IS in the MCV list, no cap path runs
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 1 AND c = TRUE AND d = '{}'$$);
  estimated | actual 
@@ -2975,6 +2984,16 @@ SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0
       2450 |      0
 (1 row)
 
+-- MCV + superset ndistinct
+DROP STATISTICS mcv_cap_stats_nd;
+CREATE STATISTICS mcv_cap_stats_nd (ndistinct) ON a, b, c, d, e FROM mcv_cap;
+ANALYZE mcv_cap;
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+ estimated | actual 
+-----------+--------
+        50 |      0
+(1 row)
+
 DROP TABLE mcv_cap;
 -- check the ability to use multiple MCV lists
 CREATE TABLE mcv_lists_multi (
diff --git a/src/test/regress/sql/stats_ext.sql b/src/test/regress/sql/stats_ext.sql
index e6cf85aa6ab..ffbf64a98af 100644
--- a/src/test/regress/sql/stats_ext.sql
+++ b/src/test/regress/sql/stats_ext.sql
@@ -1483,12 +1483,18 @@ ANALYZE mcv_cap;
 -- no MCV
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
 
-CREATE STATISTICS mcv_cap_stats (mcv) ON a, b, c, d FROM mcv_cap;
+CREATE STATISTICS mcv_cap_stats_mcv (mcv) ON a, b, c, d FROM mcv_cap;
 ANALYZE mcv_cap;
 
 -- MCV
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
 
+CREATE STATISTICS mcv_cap_stats_nd (ndistinct) ON a, b, c, d FROM mcv_cap;
+ANALYZE mcv_cap;
+
+-- MCV + ndistinct
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+
 -- When a value IS in the MCV list, no cap path runs
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 1 AND c = TRUE AND d = '{}'$$);
 
@@ -1498,6 +1504,12 @@ SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a >= 0 AND b =
 -- Capping does not apply when the query does not cover all MCV columns
 SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE$$);
 
+-- MCV + superset ndistinct
+DROP STATISTICS mcv_cap_stats_nd;
+CREATE STATISTICS mcv_cap_stats_nd (ndistinct) ON a, b, c, d, e FROM mcv_cap;
+ANALYZE mcv_cap;
+SELECT * FROM check_estimated_rows($$SELECT * FROM mcv_cap WHERE a = 0 AND b = 0 AND c = TRUE AND d = '{1, 2}'$$);
+
 DROP TABLE mcv_cap;
 
 -- check the ability to use multiple MCV lists
-- 
2.43.0

