From b12cab38c3e8ccc2be057d5ef3f3da3d368f258f Mon Sep 17 00:00:00 2001 From: Tomas Vondra Date: Sat, 22 Jun 2019 15:37:40 +0200 Subject: [PATCH 2/2] pick MCV items by relative error --- src/backend/statistics/mcv.c | 149 +++++++++++++++++++++++++---------- 1 file changed, 109 insertions(+), 40 deletions(-) diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c index 04a4f17b01..3056b659e7 100644 --- a/src/backend/statistics/mcv.c +++ b/src/backend/statistics/mcv.c @@ -78,6 +78,8 @@ static MultiSortSupport build_mss(VacAttrStats **stats, int numattrs); static SortItem *build_distinct_groups(int numrows, SortItem *items, MultiSortSupport mss, int *ndistinct); +static int sort_item_compare(const void *a, const void *b, void *arg); + static SortItem **build_column_frequencies(SortItem *groups, int ngroups, MultiSortSupport mss, int *ncounts); @@ -147,6 +149,63 @@ get_mincount_for_mcv_list(int samplerows, double totalrows) return numer / denom; } +/* + * Represents a group of values, with both the observed and base frequency + * (as expected from product of individual columns). + */ +typedef struct SortGroup { + Datum *values; /* values of the group, numattrs entries (points into the source group, not a copy) */ + bool *isnull; /* NULL flags, one per entry in values */ + double frequency; /* observed frequency: group count divided by numrows */ + double base_frequency; /* product of the per-column frequencies of the values */ +} SortGroup; + +/* + * compare_relative_error + * compare MCV groups by relative estimation error + * + * We simply compute relative estimation error + * + * Max(estimate/actual, actual/estimate) + * + * and then use that to pick the most mis-estimated groups.
+ */ +static int +compare_relative_error(const void *a, const void *b) +{ + SortGroup *sa = (SortGroup *) a; + SortGroup *sb = (SortGroup *) b; + + double ea = Max(sa->frequency / sa->base_frequency, + sa->base_frequency / sa->frequency), + eb = Max(sb->frequency / sb->base_frequency, + sb->base_frequency / sb->frequency); +/* NOTE(review): disabled alternative metric; as written it subtracts base_frequency from itself, so both values would always be zero; presumably the difference between frequency and base_frequency was intended, TODO confirm or drop. + double ea = sa->frequency * abs(sa->base_frequency - sa->base_frequency), + eb = sb->frequency * abs(sb->base_frequency - sb->base_frequency); +*/ + if (ea > eb) + return -1; /* larger relative error sorts first */ + else if (ea < eb) + return 1; + + return 0; +} + +static int +compare_frequency(const void *a, const void *b) +{ + SortGroup *sa = (SortGroup *) a; + SortGroup *sb = (SortGroup *) b; + + if (sa->frequency > sb->frequency) + return -1; /* higher observed frequency sorts first (descending order) */ + else if (sa->frequency < sb->frequency) + return 1; + + return 0; +} + /* * Builds MCV list from the set of sampled rows. * @@ -177,6 +236,13 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, MultiSortSupport mss; SortItem **freqs; int *nfreqs; + SortGroup *sort_groups; + int nsort_groups; + SortItem key; + + /* space for search key */ + key.values = palloc(sizeof(Datum)); + key.isnull = palloc(sizeof(bool)); attnums = build_attnums_array(attrs, &numattrs); @@ -229,20 +295,55 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, */ mincount = get_mincount_for_mcv_list(numrows, totalrows); + nsort_groups = 0; + sort_groups = (SortGroup *) palloc(sizeof(SortGroup) * ngroups); + /* * Walk the groups until we find the first group with a count below the * mincount threshold (the index of that group is the number of groups we - * want to keep). + * will consider to keep).
*/ - for (i = 0; i < nitems; i++) + for (i = 0; i < ngroups; i++) { + int j; + if (groups[i].count < mincount) - { - nitems = i; break; + + sort_groups[i].values = groups[i].values; + sort_groups[i].isnull = groups[i].isnull; + sort_groups[i].frequency = (double) groups[i].count / numrows; + + /* base frequency, if the attributes were independent */ + sort_groups[i].base_frequency = 1.0; + for (j = 0; j < numattrs; j++) + { + SortItem *freq; + + /* fill search key */ + key.values[0] = groups[i].values[j]; + key.isnull[0] = groups[i].isnull[j]; + + freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j], + sizeof(SortItem), + sort_item_compare, &mss->ssup[j]); + + sort_groups[i].base_frequency *= ((double) freq->count) / numrows; } + + nsort_groups = i + 1; /* number of groups collected so far (a count, not the last index) */ } + /* sort the groups by relative error */ + pg_qsort(sort_groups, nsort_groups, sizeof(SortGroup), compare_relative_error); + + /* make sure we only consider groups that are frequent enough */ + if (nitems > nsort_groups) + nitems = nsort_groups; + + /* sort the first groups by frequency (descending) */ + pg_qsort(sort_groups, nitems, sizeof(SortGroup), compare_frequency); + /* * At this point we know the number of items for the MCV list. There might * be none (for uniform distribution with many groups), and in that case @@ -250,18 +351,6 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, */ if (nitems > 0) { - int j; - SortItem key; - MultiSortSupport tmp; - - /* used to search values */ - tmp = (MultiSortSupport) palloc(offsetof(MultiSortSupportData, ssup) - + sizeof(SortSupportData)); - - /* space for search key */ - key.values = palloc(sizeof(Datum)); - key.isnull = palloc(sizeof(bool)); - /* * Allocate the MCV list structure, set the global parameters.
*/ @@ -287,35 +376,15 @@ statext_mcv_build(int numrows, HeapTuple *rows, Bitmapset *attrs, item->isnull = (bool *) palloc(sizeof(bool) * numattrs); /* copy values for the group */ - memcpy(item->values, groups[i].values, sizeof(Datum) * numattrs); - memcpy(item->isnull, groups[i].isnull, sizeof(bool) * numattrs); + memcpy(item->values, sort_groups[i].values, sizeof(Datum) * numattrs); + memcpy(item->isnull, sort_groups[i].isnull, sizeof(bool) * numattrs); /* groups should be sorted by frequency in descending order */ - Assert((i == 0) || (groups[i - 1].count >= groups[i].count)); + Assert((i == 0) || (sort_groups[i - 1].frequency >= sort_groups[i].frequency)); /* group frequency */ - item->frequency = (double) groups[i].count / numrows; - - /* base frequency, if the attributes were independent */ - item->base_frequency = 1.0; - for (j = 0; j < numattrs; j++) - { - SortItem *freq; - - /* single dimension */ - tmp->ndims = 1; - tmp->ssup[0] = mss->ssup[j]; - - /* fill search key */ - key.values[0] = groups[i].values[j]; - key.isnull[0] = groups[i].isnull[j]; - - freq = (SortItem *) bsearch_arg(&key, freqs[j], nfreqs[j], - sizeof(SortItem), - multi_sort_compare, tmp); - - item->base_frequency *= ((double) freq->count) / numrows; - } + item->frequency = sort_groups[i].frequency; /* precomputed while collecting the candidate groups */ + item->base_frequency = sort_groups[i].base_frequency; } } -- 2.20.1