diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 6c29bc9d67..225b13376d 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -21288,7 +21288,7 @@ CREATE EVENT TRIGGER test_table_rewrite_oid
PostgreSQL provides a function to inspect complex
- statistics defined using CREATE STATISTICS command.
+ statistics defined using the CREATE STATISTICS command.
diff --git a/doc/src/sgml/planstats.sgml b/doc/src/sgml/planstats.sgml
index 05a0eaf476..4b1d3f4952 100644
--- a/doc/src/sgml/planstats.sgml
+++ b/doc/src/sgml/planstats.sgml
@@ -599,9 +599,9 @@ EXPLAIN (ANALYZE, TIMING OFF) SELECT COUNT(*) FROM t GROUP BY a, b;
This section introduces multivariate variant of MCV
(most-common values) lists, a straightforward extension of the per-column
- statistics described in . This
- statistics addresses the limitation by storing individual values, but it
- is naturally more expensive, both in terms of building the statistics in
+ statistics described in . These
+ statistics address the limitation by storing individual values, but they
+ are naturally more expensive, both in terms of building the statistics in
ANALYZE, storage and planning time.
@@ -651,11 +651,11 @@ SELECT m.* FROM pg_statistic_ext,
(100 rows)
- Which confirms there are 100 distinct combinations in the two columns,
- and all of them are about equally likely (1% frequency for each one).
- The base frequency is the frequency computed from per-column statistics,
- as if there were no multi-column statistics. Had there been any null
- values in either of the columns, this would be identified in the
+ This confirms there are 100 distinct combinations in the two columns, and
+ all of them are about equally likely (1% frequency for each one). The
+ base frequency is the frequency computed from per-column statistics, as if
+ there were no multi-column statistics. Had there been any null values in
+ either of the columns, this would be identified in the
nulls column.
diff --git a/doc/src/sgml/ref/create_statistics.sgml b/doc/src/sgml/ref/create_statistics.sgml
index f86e542237..ae1d8024a4 100644
--- a/doc/src/sgml/ref/create_statistics.sgml
+++ b/doc/src/sgml/ref/create_statistics.sgml
@@ -81,7 +81,7 @@ CREATE STATISTICS [ IF NOT EXISTS ] statistics_na
A statistics kind to be computed in this statistics object.
Currently supported kinds are
- ndistinct, which enables n-distinct statistics, and
+ ndistinct, which enables n-distinct statistics,
dependencies, which enables functional
dependency statistics, and mcv which enables
most-common values lists.
diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index e9c08c7c4a..b895f06a37 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -99,13 +99,14 @@ clauselist_selectivity(PlannerInfo *root,
* the more complex stats can track more complex correlations between
* the attributes, and may be considered more reliable.
*
- * For example MCV list can give us an exact selectivity for values in
+ * For example, MCV list can give us an exact selectivity for values in
* two columns, while functional dependencies can only provide
- * information about overall strength of the dependency.
+ * information about the overall strength of the dependency.
*
- * 'estimatedclauses' is a bitmap of 0-based list positions of clauses
- * used that way, so that we can ignore them later (not to estimate
- * them twice).
+ * 'estimatedclauses' tracks the 0-based list positions of
+ * clauses that we've already estimated. Each selectivity
+ * function will set the appropriate bit in the bitmapset to mark that
+ * no further estimation is required for that list item.
*/
s1 *= statext_clauselist_selectivity(root, clauses, varRelid,
jointype, sjinfo, rel,
@@ -113,9 +114,8 @@ clauselist_selectivity(PlannerInfo *root,
/*
* Perform selectivity estimations on any clauses found applicable by
- * dependencies_clauselist_selectivity. 'estimatedclauses' will be
- * filled with the 0-based list positions of clauses used that way, so
- * that we can ignore them lager (not to estimate them twice).
+ * dependencies_clauselist_selectivity. Pass 'estimatedclauses' so
+ * the function can properly skip clauses already estimated above.
*/
s1 *= dependencies_clauselist_selectivity(root, clauses, varRelid,
jointype, sjinfo, rel,
@@ -123,8 +123,9 @@ clauselist_selectivity(PlannerInfo *root,
}
/*
- * Apply normal selectivity estimates for remaining clauses. We'll be
- * careful to skip any clauses which were already estimated above.
+ * Apply normal selectivity estimates for the remaining clauses, again
+ * passing 'estimatedclauses' so that the function can skip already
+ * estimated clauses.
*/
return s1 * clauselist_selectivity_simple(root, clauses, varRelid,
jointype, sjinfo,
diff --git a/src/backend/statistics/dependencies.c b/src/backend/statistics/dependencies.c
index ea10d2a718..1c15523d03 100644
--- a/src/backend/statistics/dependencies.c
+++ b/src/backend/statistics/dependencies.c
@@ -221,13 +221,13 @@ dependency_degree(int numrows, HeapTuple *rows, int k, AttrNumber *dependency,
mss = multi_sort_init(k);
/*
- * Transform the attrs from bitmap to an array, to make accessing i-th
+ * Transform the attrs from bitmap to an array to make accessing the i-th
* member easier, and then construct a filtered version with only attnums
* referenced by the dependency we validate.
*/
attnums = build_attnums_array(attrs);
- attnums_dep = (int *)palloc(k * sizeof(int));
+ attnums_dep = (int *) palloc(k * sizeof(int));
for (i = 0; i < k; i++)
attnums_dep[i] = attnums[dependency[i]];
@@ -958,8 +958,8 @@ dependencies_clauselist_selectivity(PlannerInfo *root,
Node *clause = (Node *) lfirst(l);
AttrNumber attnum;
- if ((!bms_is_member(listidx, *estimatedclauses)) &&
- (dependency_is_compatible_clause(clause, rel->relid, &attnum)))
+ if (!bms_is_member(listidx, *estimatedclauses) &&
+ dependency_is_compatible_clause(clause, rel->relid, &attnum))
{
list_attnums[listidx] = attnum;
clauses_attnums = bms_add_member(clauses_attnums, attnum);
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index f5b5562e5c..8d35f45661 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -926,7 +926,7 @@ statext_is_compatible_clause(Node *clause, Index relid, Bitmapset **attnums)
*
* Selects the best extended (multi-column) statistic on a table (measured by
* the number of attributes extracted from the clauses and covered by it), and
- * computes the selectivity for supplied clauses.
+ * computes the selectivity for the supplied clauses.
*
* One of the main challenges with using MCV lists is how to extrapolate the
* estimate to the data not covered by the MCV list. To do that, we compute
@@ -965,6 +965,10 @@ statext_is_compatible_clause(Node *clause, Index relid, Bitmapset **attnums)
* are computed, the inequality may not always hold. Which is why we clamp
* the selectivities to prevent strange estimate (negative etc.).
*
+ * 'estimatedclauses' is an input/output parameter. We set bits for the
+ * 0-based 'clauses' indexes we estimate for and also skip clause items that
+ * already have a bit set.
+ *
* XXX If we were to use multiple statistics, this is where it would happen.
* We would simply repeat this on a loop on the "remaining" clauses, possibly
* using the already estimated clauses as conditions (and combining the values
@@ -1005,10 +1009,6 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
*
* We also skip clauses that we already estimated using different types of
* statistics (we treat them as incompatible).
- *
- * XXX Currently, the estimated clauses are always empty because the extra
- * statistics are applied before functional dependencies. Once we decide
- * to apply multiple statistics, this may change.
*/
listidx = 0;
foreach(l, clauses)
@@ -1016,8 +1016,8 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
Node *clause = (Node *) lfirst(l);
Bitmapset *attnums = NULL;
- if ((!bms_is_member(listidx, *estimatedclauses)) &&
- (statext_is_compatible_clause(clause, rel->relid, &attnums)))
+ if (!bms_is_member(listidx, *estimatedclauses) &&
+ statext_is_compatible_clause(clause, rel->relid, &attnums))
{
list_attnums[listidx] = attnums;
clauses_attnums = bms_add_members(clauses_attnums, attnums);
@@ -1052,8 +1052,8 @@ statext_clauselist_selectivity(PlannerInfo *root, List *clauses, int varRelid,
* If the clause is compatible with the selected statistics, mark it
* as estimated and add it to the list to estimate.
*/
- if ((list_attnums[listidx] != NULL) &&
- (bms_is_subset(list_attnums[listidx], stat->keys)))
+ if (list_attnums[listidx] != NULL &&
+ bms_is_subset(list_attnums[listidx], stat->keys))
{
stat_clauses = lappend(stat_clauses, (Node *) lfirst(l));
*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 88622202e1..c3d6a83f21 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -3287,7 +3287,7 @@ estimate_num_groups(PlannerInfo *root, List *groupExprs, double input_rows,
*
* A simplified version of estimate_num_groups, assuming all expressions
* are only plain Vars from a single relation, and that no filtering is
- * happenning.
+ * happening.
*/
double
estimate_num_groups_simple(PlannerInfo *root, List *vars)
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 64cc8c9ecd..eed7f86036 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -38,7 +38,7 @@ typedef struct DimensionInfo
int nbytes; /* number of bytes (serialized) */
int typlen; /* pg_type.typlen */
bool typbyval; /* pg_type.typbyval */
-} DimensionInfo;
+} DimensionInfo;
/* multi-sort */
typedef struct MultiSortSupportData
@@ -67,9 +67,9 @@ extern MVDependencies *statext_dependencies_build(int numrows, HeapTuple *rows,
extern bytea *statext_dependencies_serialize(MVDependencies *dependencies);
extern MVDependencies *statext_dependencies_deserialize(bytea *data);
-extern MCVList * statext_mcv_build(int numrows, HeapTuple *rows,
- Bitmapset *attrs, VacAttrStats **stats,
- double totalrows);
+extern MCVList *statext_mcv_build(int numrows, HeapTuple *rows,
+ Bitmapset *attrs, VacAttrStats **stats,
+ double totalrows);
extern bytea *statext_mcv_serialize(MCVList * mcv, VacAttrStats **stats);
extern MCVList * statext_mcv_deserialize(bytea *data);
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index fed875cd52..a52d5800d5 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -87,9 +87,8 @@ typedef struct MVDependencies
#define SizeOfDependencies (offsetof(MVDependencies, ndeps) + sizeof(uint32))
/* used to flag stats serialized to bytea */
-#define STATS_MCV_MAGIC 0xE1A651C2 /* marks serialized
- * bytea */
-#define STATS_MCV_TYPE_BASIC 1 /* basic MCV list type */
+#define STATS_MCV_MAGIC 0xE1A651C2 /* marks serialized bytea */
+#define STATS_MCV_TYPE_BASIC 1 /* basic MCV list type */
/* max items in MCV list (mostly arbitrary number) */
#define STATS_MCVLIST_MAX_ITEMS 8192
@@ -106,7 +105,7 @@ typedef struct MCVItem
double base_frequency; /* frequency if independent */
bool *isnull; /* NULL flags */
Datum *values; /* item values */
-} MCVItem;
+} MCVItem;
/* multivariate MCV list - essentally an array of MCV items */
typedef struct MCVList
@@ -117,11 +116,11 @@ typedef struct MCVList
AttrNumber ndimensions; /* number of dimensions */
Oid types[STATS_MAX_DIMENSIONS]; /* OIDs of data types */
MCVItem **items; /* array of MCV items */
-} MCVList;
+} MCVList;
extern MVNDistinct *statext_ndistinct_load(Oid mvoid);
extern MVDependencies *statext_dependencies_load(Oid mvoid);
-extern MCVList * statext_mcv_load(Oid mvoid);
+extern MCVList *statext_mcv_load(Oid mvoid);
extern void BuildRelationExtStatistics(Relation onerel, double totalrows,
int numrows, HeapTuple *rows,