diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 732ab22..3975fb6 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -110,6 +110,8 @@ static void update_attstats(Oid relid, bool inh,
 static Datum std_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 static Datum ind_fetch_func(VacAttrStatsP stats, int rownum, bool *isNull);
 
+static double optimize_estimate(double total_rows, int sample_rows, int d,
+								int *f, int f_max);
 
 /*
  *	analyze_rel() -- analyze one relation
@@ -2369,6 +2371,11 @@ compute_scalar_stats(VacAttrStatsP stats,
 		int			slot_idx = 0;
 		CompareScalarsContext cxt;
 
+		/* Frequency counts for the adaptive estimator: f_count[i] is the
+		 * number of distinct values seen exactly i times in the sample.
+		 * Valid indexes run from 1 to values_cnt, hence values_cnt + 1. */
+		int			f_max = 0;	/* largest repetition count seen so far */
+		int		   *f_count = (int *) palloc0(sizeof(int) * (values_cnt + 1));
+
 		/* Sort the collected values */
 		cxt.ssup = &ssup;
 		cxt.tupnoLink = tupnoLink;
@@ -2410,6 +2417,7 @@ compute_scalar_stats(VacAttrStatsP stats,
 				ndistinct++;
 				if (dups_cnt > 1)
 				{
+
 					nmultiple++;
 					if (track_cnt < num_mcv ||
 						dups_cnt > track[track_cnt - 1].count)
@@ -2435,6 +2443,12 @@ compute_scalar_stats(VacAttrStatsP stats,
 						track[j].first = i + 1 - dups_cnt;
 					}
 				}
+
+				/* Update the count of values repeated exactly dups_cnt
+				 * times, and remember the largest repetition count. */
+				f_count[dups_cnt] += 1;
+				f_max = Max(f_max, dups_cnt);
+
 				dups_cnt = 0;
 			}
 		}
@@ -2481,6 +2495,7 @@ compute_scalar_stats(VacAttrStatsP stats,
 			double		numer,
 						denom,
 						stadistinct;
+			double		adaptdistinct;	/* adaptive estimate */
 
 			numer = (double) samplerows *(double) d;
 
@@ -2494,6 +2509,20 @@ compute_scalar_stats(VacAttrStatsP stats,
 			if (stadistinct > totalrows)
 				stadistinct = totalrows;
 			stats->stadistinct = floor(stadistinct + 0.5);
+
+			/* compute the adaptive estimate */
+			adaptdistinct = optimize_estimate(totalrows, samplerows, d, f_count, f_max);
+
+			elog(WARNING, "ndistinct estimate current=%.2f adaptive=%.2f",
+				 stadistinct, adaptdistinct);
+
+			/* If the sample covers (almost) the whole table, override the
+			 * adaptive estimate with a simple correction of the observed d. */
+			if (samplerows >= 0.95 * totalrows)
+			{
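+				/* average of d (assuming the rows we did not sample add no
+				 * new values) and d scaled up by the 95% sample fraction */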
+				adaptdistinct = (d + d / 0.95) / 2;
+				elog(WARNING, "corrected ndistinct estimate current=%.2f adaptive=%.2f",
+					 stadistinct, adaptdistinct);
+			}
+
 		}
 
 		/*
@@ -2819,3 +2848,84 @@ compare_mcvs(const void *a, const void *b)
 
 	return da - db;
 }
+
+
+/*
+ * We need to find the value of "m" satisfying this equality
+ *
+ *     m - f1 - f2 = f1 * (A + A(m)) / (B + B(m))
+ *
+ * where A and B are constants (they do not depend on m), while A(m)
+ * and B(m) are functions of m. This is equivalent to finding the root of
+ *
+ *     0 = f1 * (A + A(m)) / (B + B(m)) - (m - f1 - f2)
+ *
+ * Instead of looking for an exact solution (which may be fractional),
+ * we look for the natural number minimizing the absolute difference
+ * between the two sides. The number of distinct elements is a natural
+ * number anyway, and we don't mind if the result is slightly off - it
+ * is only an estimate, and the error from sampling is usually much
+ * larger.
+ *
+ * We know the acceptable values of 'm' are [d,N] where 'd' is the number
+ * of distinct elements in the sample, and N is the number of rows in
+ * the table (not just the sample). For large tables (billions of rows)
+ * that'd be quite time-consuming to compute, so we'll approximate the
+ * solution by gradually increasing the step to ~1% of the current value
+ * of 'm'. This will make it much faster and yet very accurate.
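+ *
+ * For example, the step stays at 1 only while m is below ~200; after
+ * that it grows by roughly 1% per iteration, so reaching an N in the
+ * billions takes a couple of thousand iterations rather than N of them.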
+ *
+ * All this of course assumes the function behaves reasonably (not
+ * oscillating etc.), but that's a safe assumption as the estimator
+ * would perform terribly otherwise.
+ */
+static double
+optimize_estimate(double total_rows, int sample_rows, int d, int *f, int f_max)
+{
+	/* use doubles throughout - total_rows (and thus m) may exceed INT_MAX */
+	int			i;
+	double		m;
+	double		A = 0.0,
+				B = 0.0;
+	double		opt_m = 0;
+	double		opt_diff = total_rows;
+	double		step = 1;
+	double		ndistinct;
+
+	/* compute the 'constant' parts of the equality (A, B) */
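+	/* (f[1] and f[2] enter through the m-dependent terms inside the search
+	 *  loop below, which is why these sums start at i = 3) */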
+	for (i = 3; i <= f_max; i++)
+	{
+		double		p = i / (double) sample_rows;
+
+		A += pow(1.0 - p, sample_rows) * f[i];
+		B += i * pow(1.0 - p, sample_rows - 1) * f[i];
+	}
+
+	/* find the 'm' value minimizing the difference */
+	for (m = 1; m <= total_rows; m += step)
+	{
+		int			k = f[1] + 2 * f[2];
+
+		/* q is evaluated in double arithmetic to avoid integer overflow */
+		double		q = k / (2.0 * sample_rows * m);
+
+		double		A_m = A + m * pow(1.0 - q, sample_rows);
+		double		B_m = B + k * pow(1.0 - q, sample_rows - 1);
+
+		/* how far is this 'm' from satisfying the equality? */
+		double		diff = fabs(f[1] * (A_m / B_m) - (m - f[1] - f[2]));
+
+		/* if this is a better solution, remember it */
+		if (diff < opt_diff)
+		{
+			opt_diff = diff;
+			opt_m = m;
+		}
+
+		/* grow the step to ~1% of 'm' to keep the scan cheap */
+		step = (floor(0.01 * m) > 1) ? floor(0.01 * m) : 1;
+	}
+
+	/* compute the final estimate */
+	ndistinct = d + opt_m - f[1] - f[2];
+
+	/* sanity checks that the estimate is within [d, total_rows] */
+	if (ndistinct < d)
+		ndistinct = d;
+	else if (ndistinct > total_rows)
+		ndistinct = total_rows;
+
+	return ndistinct;
+}
