From c968f05c26ecfa9344a8a9c9209bd755fa4ddf7b Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@pgaddict.com>
Date: Tue, 26 Jan 2016 18:14:33 +0100
Subject: [PATCH 8/9] change how we apply selectivity to number of groups
 estimate

Instead of simply multiplying the ndistinct estimate by the selectivity,
we use the formula for the expected number of distinct values
observed in 'k' rows when there are 'd' distinct values in the bin

    d * (1 - ((d - 1) / d)^k)

This is sampling 'with replacement', which seems appropriate for this
use, and it mostly assumes a uniform distribution of the distinct values.
So if the distribution is not uniform (e.g. there are very frequent
groups), this may be less accurate than the current algorithm in some
cases, giving over-estimates. But that's probably better than an OOM.
---
 src/backend/utils/adt/selfuncs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index f8d39aa..76be0e3 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3464,9 +3464,9 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
 				reldistinct = clamp;
 
 			/*
-			 * Multiply by restriction selectivity.
+			 * Estimate the number of distinct values observed in rel->rows.
 			 */
-			reldistinct *= rel->rows / rel->tuples;
+			reldistinct *= (1 - powl(1 - rel->rows/rel->tuples, rel->tuples/reldistinct));
 
 			/*
 			 * Update estimate of total distinct groups.
-- 
2.5.0

