diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 44a4b3f..a2c606b 100644
*** a/src/backend/commands/analyze.c
--- b/src/backend/commands/analyze.c
*************** compute_distinct_stats(VacAttrStatsP sta
*** 2120,2128 ****
* we are able to generate a complete MCV list (all the values in the
* sample will fit, and we think these are all the ones in the table),
* then do so. Otherwise, store only those values that are
! * significantly more common than the (estimated) average. We set the
! * threshold rather arbitrarily at 25% more than average, with at
! * least 2 instances in the sample.
*/
if (track_cnt < track_max && toowide_cnt == 0 &&
stats->stadistinct > 0 &&
--- 2120,2138 ----
* we are able to generate a complete MCV list (all the values in the
* sample will fit, and we think these are all the ones in the table),
* then do so. Otherwise, store only those values that are
! * significantly more common than the ones we omit. We determine that
! * by considering the values in frequency order, and accepting each
! * one if it is at least 50% more common than the average among the
! * values after it. The 50% threshold is somewhat arbitrary.
! *
! * Note that the 50% rule never accepts a value with count 1: every
! * value has count at least 1, so the threshold is at least 1.5. We
! * want this, since there's no very good reason to assume that a
! * single-occurrence value is an MCV and not just a random non-MCV.
! *
! * The very last value has nothing after it to average, so it needs
! * a special rule: if we get to it, we accept it if it's at least 1%
! * of the non-null samples and has count more than 1.
*/
if (track_cnt < track_max && toowide_cnt == 0 &&
stats->stadistinct > 0 &&
*************** compute_distinct_stats(VacAttrStatsP sta
*** 2133,2153 ****
}
else
{
! /* d here is the same as d in the Haas-Stokes formula */
int d = nonnull_cnt - summultiple + nmultiple;
! double avgcount,
! mincount;
! /* estimate # occurrences in sample of a typical nonnull value */
! avgcount = (double) nonnull_cnt / (double) d;
! /* set minimum threshold count to store a value */
! mincount = avgcount * 1.25;
! if (mincount < 2)
! mincount = 2;
if (num_mcv > track_cnt)
num_mcv = track_cnt;
for (i = 0; i < num_mcv; i++)
{
if (track[i].count < mincount)
{
num_mcv = i;
--- 2143,2181 ----
}
else
{
! /* d here is initially the same as d in the Haas-Stokes formula */
int d = nonnull_cnt - summultiple + nmultiple;
! int remaining_samples = nonnull_cnt;
! /* can't store more MCVs than we tracked ... */
if (num_mcv > track_cnt)
num_mcv = track_cnt;
+ /* locate first value we're not going to store as an MCV */
for (i = 0; i < num_mcv; i++)
{
+ double avgcount,
+ mincount;
+
+ /* remove current value from remaining_samples and d */
+ remaining_samples -= track[i].count;
+ d--;
+ if (d > 0)
+ {
+ /* get avg # occurrences of distinct values after this */
+ avgcount = (double) remaining_samples / (double) d;
+ /* set minimum count to accept a value (is surely > 1) */
+ mincount = avgcount * 1.50;
+ }
+ else
+ {
+ /* last value, use 1% rule */
+ mincount = nonnull_cnt * 0.01;
+
+ /* here, we need a clamp to avoid accepting count 1 */
+ if (mincount < 2)
+ mincount = 2;
+ }
+ /* if this value falls below threshold, we're done */
if (track[i].count < mincount)
{
num_mcv = i;
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2375,2381 ****
/*
* Found a new item for the mcv list; find its
* position, bubbling down old items if needed. Loop
! * invariant is that j points at an empty/ replaceable
* slot.
*/
int j;
--- 2403,2409 ----
/*
* Found a new item for the mcv list; find its
* position, bubbling down old items if needed. Loop
! * invariant is that j points at an empty/replaceable
* slot.
*/
int j;
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2475,2488 ****
* we are able to generate a complete MCV list (all the values in the
* sample will fit, and we think these are all the ones in the table),
* then do so. Otherwise, store only those values that are
! * significantly more common than the (estimated) average. We set the
! * threshold rather arbitrarily at 25% more than average, with at
! * least 2 instances in the sample. Also, we won't suppress values
! * that have a frequency of at least 1/K where K is the intended
! * number of histogram bins; such values might otherwise cause us to
! * emit duplicate histogram bin boundaries. (We might end up with
! * duplicate histogram entries anyway, if the distribution is skewed;
! * but we prefer to treat such values as MCVs if at all possible.)
*/
if (track_cnt == ndistinct && toowide_cnt == 0 &&
stats->stadistinct > 0 &&
--- 2503,2532 ----
* we are able to generate a complete MCV list (all the values in the
* sample will fit, and we think these are all the ones in the table),
* then do so. Otherwise, store only those values that are
! * significantly more common than the ones we omit. We determine that
! * by considering the values in frequency order, and accepting each
! * one if it is at least 50% more common than the average among the
! * values after it. The 50% threshold is somewhat arbitrary.
! *
! * The very last value has nothing after it to average, so it needs
! * a special rule: if we get to it, we accept it if it's at least 1%
! * of the non-null samples and has count more than 1.
! *
! * Also, we will treat values as MCVs if they have a frequency of at
! * least 1/K where K is the intended number of histogram entries.
! * Relegating such values to the histogram might cause us to emit
! * duplicate histogram entries. (We might get duplicate histogram
! * entries anyway, if the distribution is skewed; but we prefer to
! * treat such values as MCVs if at all possible.) For this purpose,
! * measure the frequency with respect to the population represented by
! * the histogram; so in the loop below, cur_remaining_samples/num_hist
! * is the correct calculation.
! *
! * In any case, unless we believe we have a complete MCV list, we will
! * not accept an MCV value with count 1, since there's no very good
! * reason to assume that a single-occurrence value is an MCV and not
! * just a random non-MCV. This is automatic with the 50% rule but
! * needs enforcement with the other ones.
*/
if (track_cnt == ndistinct && toowide_cnt == 0 &&
stats->stadistinct > 0 &&
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2490,2523 ****
{
/* Track list includes all values seen, and all will fit */
num_mcv = track_cnt;
}
else
{
! /* d here is the same as d in the Haas-Stokes formula */
int d = ndistinct + toowide_cnt;
! double avgcount,
! mincount,
! maxmincount;
! /* estimate # occurrences in sample of a typical nonnull value */
! avgcount = (double) values_cnt / (double) d;
! /* set minimum threshold count to store a value */
! mincount = avgcount * 1.25;
! if (mincount < 2)
! mincount = 2;
! /* don't let threshold exceed 1/K, however */
! maxmincount = (double) values_cnt / (double) num_bins;
! if (mincount > maxmincount)
! mincount = maxmincount;
if (num_mcv > track_cnt)
num_mcv = track_cnt;
for (i = 0; i < num_mcv; i++)
{
if (track[i].count < mincount)
{
num_mcv = i;
break;
}
}
}
--- 2534,2605 ----
{
/* Track list includes all values seen, and all will fit */
num_mcv = track_cnt;
+ /* Nothing left for the histogram */
+ num_hist = 0;
}
else
{
! /* d here is initially the same as d in the Haas-Stokes formula */
int d = ndistinct + toowide_cnt;
! int remaining_samples = values_cnt;
! /*
! * num_hist is the planned histogram size; it's limited by the
! * number of distinct sample values not absorbed into the MCV list.
! * Start with the assumption that nothing is absorbed into it.
! */
! num_hist = Min(ndistinct, num_bins + 1);
!
! /* can't store more MCVs than we tracked ... */
if (num_mcv > track_cnt)
num_mcv = track_cnt;
+ /* locate first value we're not going to store as an MCV */
for (i = 0; i < num_mcv; i++)
{
+ int cur_remaining_samples = remaining_samples;
+ double avgcount,
+ mincount;
+
+ /* remove current value from remaining_samples and d */
+ remaining_samples -= track[i].count;
+ d--;
+ if (d > 0)
+ {
+ /* get avg # occurrences of distinct values after this */
+ avgcount = (double) remaining_samples / (double) d;
+ /* set minimum count to accept a value (is surely > 1) */
+ mincount = avgcount * 1.50;
+ }
+ else
+ {
+ /* last value, use 1% rule */
+ mincount = values_cnt * 0.01;
+
+ /* here, we need a clamp to avoid accepting count 1 */
+ if (mincount < 2)
+ mincount = 2;
+ }
+
+ /* don't let threshold exceed 1/K, however */
+ if (num_hist >= 2) /* else we won't make a histogram */
+ {
+ double hfreq;
+
+ hfreq = (double) cur_remaining_samples / (double) num_hist;
+ if (mincount > hfreq)
+ mincount = hfreq;
+ /* hfreq could be 1, so clamp to avoid accepting count 1 */
+ if (mincount < 2)
+ mincount = 2;
+ }
+ /* if this value falls below threshold, we're done */
if (track[i].count < mincount)
{
num_mcv = i;
break;
}
+ /* update planned histogram size, removing this value */
+ num_hist = Min(ndistinct - (i + 1), num_bins + 1);
}
}
*************** compute_scalar_stats(VacAttrStatsP stats
*** 2560,2568 ****
* values not accounted for in the MCV list. (This ensures the
* histogram won't collapse to empty or a singleton.)
*/
- num_hist = ndistinct - num_mcv;
- if (num_hist > num_bins)
- num_hist = num_bins + 1;
if (num_hist >= 2)
{
MemoryContext old_context;
--- 2642,2647 ----