diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index d263ecf082..dca1e7d34e 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -157,6 +157,23 @@ clauselist_selectivity_ext(PlannerInfo *root,
&estimatedclauses, false);
}
+ /*
+ * Try applying extended statistics to joins. There's not much we can
+ * do to detect when this makes sense, but we can check that there are
+ * join clauses, and that at least some of the rels have stats.
+ *
+	 * XXX Isn't this mutually exclusive with the preceding block which
+ * calculates estimates for a single relation?
+ */
+ if (use_extended_stats &&
+ statext_try_join_estimates(root, clauses, varRelid, jointype, sjinfo,
+ estimatedclauses))
+ {
+ s1 *= statext_clauselist_join_selectivity(root, clauses, varRelid,
+ jointype, sjinfo,
+ &estimatedclauses);
+ }
+
/*
* Apply normal selectivity estimates for remaining clauses. We'll be
* careful to skip any clauses which were already estimated above.
diff --git a/src/backend/statistics/extended_stats.c b/src/backend/statistics/extended_stats.c
index b05e818ba9..d4cbbee785 100644
--- a/src/backend/statistics/extended_stats.c
+++ b/src/backend/statistics/extended_stats.c
@@ -30,6 +30,7 @@
#include "nodes/nodeFuncs.h"
#include "optimizer/clauses.h"
#include "optimizer/optimizer.h"
+#include "optimizer/pathnode.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "statistics/extended_stats_internal.h"
@@ -101,6 +102,16 @@ static StatsBuildData *make_build_data(Relation onerel, StatExtEntry *stat,
int numrows, HeapTuple *rows,
VacAttrStats **stats, int stattarget);
+static bool stat_covers_expressions(StatisticExtInfo *stat, List *exprs,
+ Bitmapset **expr_idxs);
+
+static List *statext_mcv_get_conditions(PlannerInfo *root,
+ RelOptInfo *rel,
+ StatisticExtInfo *info);
+
+static bool *statext_mcv_eval_conditions(PlannerInfo *root, RelOptInfo *rel,
+ StatisticExtInfo *stat, MCVList *mcv,
+ Selectivity *sel);
/*
* Compute requested extended stats, using the rows sampled for the plain
@@ -1154,6 +1165,89 @@ has_stats_of_kind(List *stats, char requiredkind)
return false;
}
+/*
+ * find_matching_mcv
+ *		Pick the MCV statistics object on "rel" to use for join estimation.
+ *
+ * Considers only STATS_EXT_MCV objects in rel->statlist that cover all of
+ * "attnums" and "exprs"; returns NULL if there are none. With several
+ * candidates, one with fewer keys/expressions replaces the current pick;
+ * otherwise it wins only if more baserestrictinfo clauses apply to it.
+ *
+ * XXX Requiring full coverage may be too strict; we could instead look for
+ * the MCVs (on both sides of the join) with the largest overlap.
+ */
+StatisticExtInfo *
+find_matching_mcv(PlannerInfo *root, RelOptInfo *rel, Bitmapset *attnums, List *exprs)
+{
+	ListCell *l;
+	StatisticExtInfo *mcv = NULL;
+	List *stats = rel->statlist;
+
+	foreach(l, stats)
+	{
+		StatisticExtInfo *stat = (StatisticExtInfo *) lfirst(l);
+		List *conditions1 = NIL,
+			*conditions2 = NIL;
+
+		/* We only care about MCV statistics here. */
+		if (stat->kind != STATS_EXT_MCV)
+			continue;
+
+		/*
+		 * Ignore MCVs not covering all the attributes/expressions.
+		 *
+		 * XXX Maybe we shouldn't be so strict and consider only partial
+		 * matches for join clauses too?
+		 */
+		if (!bms_is_subset(attnums, stat->keys) ||
+			!stat_covers_expressions(stat, exprs, NULL))
+			continue;
+
+		/* If there's no matching MCV yet, keep it. */
+		if (!mcv)
+		{
+			mcv = stat;
+			continue;
+		}
+
+		/*
+		 * OK, we have two candidate statistics and we need to pick. We'll
+		 * use two simple heuristics: We prefer smaller statistics (fewer
+		 * columns), on the assumption that a smaller statistics probably
+		 * represents a larger fraction of the data (fewer combinations
+		 * with higher counts). But we also like if the statistics covers
+		 * some additional conditions at the baserel level, because this
+		 * may affect the data distribution. Of course, those two metrics
+		 * are contradictory - smaller stats are less likely to cover as
+		 * many conditions as a larger one.
+		 *
+		 * XXX For now we simply prefer smaller statistics, but maybe it
+		 * should be the other way around.
+		 */
+		if (bms_num_members(mcv->keys) + list_length(mcv->exprs) >
+			bms_num_members(stat->keys) + list_length(stat->exprs))
+		{
+			mcv = stat;
+			continue;
+		}
+
+		/*
+		 * Now inspect the base restrictinfo conditions too. conditions1
+		 * belongs to the candidate ("stat"), conditions2 to the current
+		 * pick ("mcv"); statext_mcv_get_conditions does the filtering.
+		 */
+		conditions1 = statext_mcv_get_conditions(root, rel, stat);
+		conditions2 = statext_mcv_get_conditions(root, rel, mcv);
+
+		/* if the new statistics covers more conditions, use it */
+		if (list_length(conditions1) > list_length(conditions2))
+			mcv = stat;
+	}
+
+	return mcv;
+}
+
/*
* stat_find_expression
* Search for an expression in statistics object's list of expressions.
@@ -2603,3 +2697,846 @@ make_build_data(Relation rel, StatExtEntry *stat, int numrows, HeapTuple *rows,
return result;
}
+
+/*
+ * statext_mcv_get_conditions
+ *		Get conditions on base relations, to be used as conditions for joins.
+ *
+ * When estimating joins using extended statistics, we can apply restrictions
+ * on the base relations as conditions. This walks rel->baserestrictinfo and
+ * returns a list of the bare clauses (not the RestrictInfos) that are both
+ * compatible with extended statistics and covered by "info". NIL if none.
+ */
+static List *
+statext_mcv_get_conditions(PlannerInfo *root, RelOptInfo *rel,
+						   StatisticExtInfo *info)
+{
+	ListCell *lc;
+	List *conditions = NIL;
+
+	/* extract conditions that may be applied to the MCV list */
+	foreach (lc, rel->baserestrictinfo)
+	{
+		RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc);
+		Bitmapset *indexes = NULL;	/* output of the coverage check, not read */
+		Bitmapset *attnums = NULL;
+		List *exprs = NIL;
+
+		/* clause has to be supported by MCV in general */
+		if (!statext_is_compatible_clause(root, (Node *) rinfo, rel->relid,
+										  &attnums, &exprs))
+			continue;
+
+		/*
+		 * clause is compatible in general, but is it actually covered
+		 * by this particular statistics object?
+		 */
+		if (!bms_is_subset(attnums, info->keys) ||
+			!stat_covers_expressions(info, exprs, &indexes))
+			continue;
+
+		conditions = lappend(conditions, rinfo->clause);
+	}
+
+	return conditions;
+}
+
+/*
+ * statext_mcv_eval_conditions
+ *		Evaluate a list of conditions on the MCV lists.
+ *
+ * Returns the match bitmap for the baserel conditions applicable to "stat",
+ * to restrict the MCV list to its "interesting" part, or NULL if there are
+ * no conditions. *sel is set to the conditions' selectivity (1.0 if none).
+ */
+static bool *
+statext_mcv_eval_conditions(PlannerInfo *root, RelOptInfo *rel,
+							StatisticExtInfo *stat, MCVList *mcv,
+							Selectivity *sel)
+{
+	List *conditions;
+
+	/* everything matches by default */
+	*sel = 1.0;
+
+	/*
+	 * XXX We've already evaluated this before, when picking the statistics
+	 * object. Maybe we should stash it somewhere, so that we don't have to
+	 * evaluate it again.
+	 */
+	conditions = statext_mcv_get_conditions(root, rel, stat);
+
+	/* If no conditions, we're done (callers treat NULL as "all match"). */
+	if (!conditions)
+		return NULL;
+
+	/* what's the selectivity of the conditions alone? */
+	*sel = clauselist_selectivity(root, conditions, rel->relid, 0, NULL);
+
+	return mcv_get_match_bitmap(root, conditions, stat->keys, stat->exprs,
+								mcv, false);
+}
+
+/*
+ * statext_ndistinct_estimate
+ *		Estimate number of distinct values in a list of clauses.
+ *
+ * This is used to extract expressions for a given relation from join clauses,
+ * so that we can estimate the number of distinct values in those expressions.
+ * That is needed for join cardinality estimation, similarly to what eqjoinsel
+ * does for regular columns.
+ */
+static double
+statext_ndistinct_estimate(PlannerInfo *root, RelOptInfo *rel, List *clauses)
+{
+	ListCell *lc;
+
+	List *exprs = NIL;
+
+	foreach (lc, clauses)
+	{
+		ListCell *lc2;
+		Node *clause = (Node *) lfirst(lc);
+		OpExpr *opexpr;
+
+		if (!is_opclause(clause))
+			continue;
+
+		opexpr = (OpExpr *) clause;
+
+		if (list_length(opexpr->args) != 2)
+			continue;
+
+		/* collect the operator arguments that belong to this relation */
+		foreach (lc2, opexpr->args)
+		{
+			Node *expr = (Node *) lfirst(lc2);
+			Bitmapset *varnos = pull_varnos(root, expr);
+
+			/* NOTE(review): assumes each argument references exactly one rel */
+			if (bms_singleton_member(varnos) == rel->relid)
+				exprs = lappend(exprs, expr);
+		}
+	}
+
+	return estimate_num_groups(root, exprs, rel->rows, NULL, NULL);
+}
+
+/*
+ * statext_compare_mcvs
+ *		Calculate join selectivity using extended statistics, similarly to
+ *		eqjoinsel_inner.
+ *
+ * Takes bare operator clauses joining rel1 and rel2 and considers the base
+ * restrictions too, essentially computing a conditional probability
+ *
+ *		P(join clauses | baserestrictinfos on either side)
+ *
+ * Compared to eqjoinsel_inner there's a couple problems. With per-column
+ * MCV lists it's obvious that the number of distinct values not covered
+ * by the MCV is (ndistinct - size(MCV)). With multi-column MCVs it's not
+ * that simple, particularly when the conditions are on a subset of the
+ * MCV and NULLs are involved. E.g. with MCV (a,b,c) and conditions on
+ * (a,b), it's not clear if the number of (a,b) combinations not covered
+ * by the MCV is
+ *
+ *		(ndistinct(a,b) - ndistinct_mcv(a,b))
+ *
+ * where ndistinct_mcv(a,b) is the number of distinct (a,b) combinations
+ * included in the MCV list. These combinations may be present in the rest
+ * of the data (outside MCV), just with some extra values in "c". So in
+ * principle there may be between
+ *
+ *		(ndistinct(a,b) - ndistinct_mcv(a,b)) and ndistinct(a,b)
+ *
+ * distinct values in the rest of the data. So we need to pick something
+ * in between, there's no way to calculate this accurately.
+ */
+static Selectivity
+statext_compare_mcvs(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2,
+					 StatisticExtInfo *stat1, StatisticExtInfo *stat2,
+					 List *clauses)
+{
+	MCVList *mcv1;
+	MCVList *mcv2;
+	int i, j;
+	Selectivity s = 0;
+
+	/* items eliminated by conditions (if any) */
+	bool *conditions1 = NULL,
+		*conditions2 = NULL;
+
+	double conditions1_sel = 1.0,
+		conditions2_sel = 1.0;
+
+	bool *matches1 = NULL,
+		*matches2 = NULL;
+
+	double matchfreq1,
+		unmatchfreq1,
+		matchfreq2,
+		unmatchfreq2,
+		otherfreq1,
+		mcvfreq1,
+		otherfreq2,
+		mcvfreq2;
+
+	double nd1,
+		nd2;
+
+	double totalsel1,
+		totalsel2;
+
+	mcv1 = statext_mcv_load(stat1->statOid);
+	mcv2 = statext_mcv_load(stat2->statOid);
+
+	/* should only get here with MCV on both sides */
+	Assert(mcv1 && mcv2);
+
+	matches1 = (bool *) palloc0(sizeof(bool) * mcv1->nitems);
+	matches2 = (bool *) palloc0(sizeof(bool) * mcv2->nitems);
+
+	/* apply baserestrictinfo conditions on the MCV lists */
+
+	conditions1 = statext_mcv_eval_conditions(root, rel1, stat1, mcv1,
+											  &conditions1_sel);
+
+	conditions2 = statext_mcv_eval_conditions(root, rel2, stat2, mcv2,
+											  &conditions2_sel);
+
+	/*
+	 * Match items from the two MCV lists.
+	 *
+	 * We don't know if the matches are 1:1 - we may have overlap on only
+	 * a subset of attributes, e.g. (a,b,c) vs. (b,c,d), so there may be
+	 * multiple matches.
+	 */
+	for (i = 0; i < mcv1->nitems; i++)
+	{
+		/* skip items eliminated by restrictions on rel1 */
+		if (conditions1 && !conditions1[i])
+			continue;
+
+		/* find matches in the second MCV list */
+		for (j = 0; j < mcv2->nitems; j++)
+		{
+			ListCell *lc;
+			bool items_match = true;
+
+			/* skip items eliminated by restrictions on rel2 */
+			if (conditions2 && !conditions2[j])
+				continue;
+
+			foreach (lc, clauses)
+			{
+				Node *clause = (Node *) lfirst(lc);
+				Bitmapset *atts1 = NULL;
+				Bitmapset *atts2 = NULL;
+				Datum value1, value2;
+				int index1, index2;
+				AttrNumber attnum1;
+				AttrNumber attnum2;
+				bool match;
+
+				FmgrInfo opproc;
+				OpExpr *expr = (OpExpr *) clause;
+
+				Assert(is_opclause(clause));
+
+				fmgr_info(get_opcode(expr->opno), &opproc);
+
+				/* determine the columns in each statistics object */
+
+				pull_varattnos(clause, rel1->relid, &atts1);
+				attnum1 = bms_singleton_member(atts1) + FirstLowInvalidHeapAttributeNumber;
+				index1 = bms_member_index(stat1->keys, attnum1);
+
+				pull_varattnos(clause, rel2->relid, &atts2);
+				attnum2 = bms_singleton_member(atts2) + FirstLowInvalidHeapAttributeNumber;
+				index2 = bms_member_index(stat2->keys, attnum2);
+
+				/* if either value is null, we're done */
+				if (mcv1->items[i].isnull[index1] || mcv2->items[j].isnull[index2])
+					match = false;
+				else
+				{
+					value1 = mcv1->items[i].values[index1];
+					value2 = mcv2->items[j].values[index2];
+
+					/*
+					 * Use the clause's input collation (collation-aware
+					 * operators can't work with InvalidOid). FIXME argument
+					 * order may be swapped, harmless for same-type equality.
+					 */
+					match = DatumGetBool(FunctionCall2Coll(&opproc,
+														   expr->inputcollid,
+														   value1, value2));
+				}
+
+				items_match &= match;
+
+				if (!items_match)
+					break;
+			}
+
+			if (items_match)
+			{
+				matches1[i] = matches2[j] = true;
+				s += mcv1->items[i].frequency * mcv2->items[j].frequency;
+
+				/* XXX Do we need to do something about base frequency? */
+			}
+		}
+	}
+
+	matchfreq1 = unmatchfreq1 = mcvfreq1 = 0.0;
+	for (i = 0; i < mcv1->nitems; i++)
+	{
+		mcvfreq1 += mcv1->items[i].frequency;
+
+		if (conditions1 && !conditions1[i])
+			continue;
+
+		if (matches1[i])
+			matchfreq1 += mcv1->items[i].frequency;
+		else
+			unmatchfreq1 += mcv1->items[i].frequency;
+	}
+
+	/* not represented by the MCV */
+	otherfreq1 = 1 - mcvfreq1;
+
+	matchfreq2 = unmatchfreq2 = mcvfreq2 = 0.0;
+	for (i = 0; i < mcv2->nitems; i++)
+	{
+		mcvfreq2 += mcv2->items[i].frequency;
+
+		if (conditions2 && !conditions2[i])
+			continue;
+
+		if (matches2[i])
+			matchfreq2 += mcv2->items[i].frequency;
+		else
+			unmatchfreq2 += mcv2->items[i].frequency;
+	}
+
+	/* not represented by the MCV */
+	otherfreq2 = 1 - mcvfreq2;
+
+	/*
+	 * Correction for MCV parts eliminated by the conditions. If the
+	 * conditions eliminated every item on either side there were no matches
+	 * (s is 0), so skip the rescaling rather than divide by zero.
+	 */
+	if ((matchfreq1 + unmatchfreq1) > 0 && (matchfreq2 + unmatchfreq2) > 0)
+		s = s * mcvfreq1 * mcvfreq2 /
+			(matchfreq1 + unmatchfreq1) / (matchfreq2 + unmatchfreq2);
+
+	nd1 = statext_ndistinct_estimate(root, rel1, clauses);
+	nd2 = statext_ndistinct_estimate(root, rel2, clauses);
+
+	/*
+	 * XXX this is a bit bogus, because we don't know what fraction of
+	 * distinct combinations is covered by the MCV list (we're only
+	 * dealing with some of the columns), so we can't use the same formula
+	 * as eqjoinsel_inner exactly. Instead we simply assume the conditions
+	 * reduce the number of distinct groups proportionally, clamped to at
+	 * least one group so that the divisions below are well-defined.
+	 */
+	nd1 = Max(nd1 * conditions1_sel, 1.0);
+	nd2 = Max(nd2 * conditions2_sel, 1.0);
+
+	/*
+	 * Selectivity as seen from each side: the MCV matches, plus rows without
+	 * an MCV match spread uniformly over the other side's distinct values.
+	 * XXX Unlike eqjoinsel_inner we don't subtract the MCV items from
+	 * ndistinct, see the header comment for why that's not straightforward.
+	 */
+	totalsel1 = s;
+	totalsel1 += unmatchfreq1 * otherfreq2 / nd2;
+	totalsel1 += otherfreq1 * (otherfreq2 + unmatchfreq2) / nd2;
+
+	totalsel2 = s;
+	totalsel2 += unmatchfreq2 * otherfreq1 / nd1;
+	totalsel2 += otherfreq2 * (otherfreq1 + unmatchfreq1) / nd1;
+
+	s = Min(totalsel1, totalsel2);
+
+	return s;
+}
+
+/*
+ * statext_is_supported_join_clause
+ *		Check if a join clause may be estimated using extended stats.
+ *
+ * Determines if this is a join clause of the form (Expr op Expr) which
+ * may be estimated using extended statistics. Each side must reference
+ * just one relation for now.
+ */
+static bool
+statext_is_supported_join_clause(PlannerInfo *root, Node *clause,
+								 int varRelid, SpecialJoinInfo *sjinfo)
+{
+	Oid oprsel;
+	RestrictInfo *rinfo;
+	OpExpr *opclause;
+	ListCell *lc;
+
+	/*
+	 * The clause is evaluated as a restriction clause (either at a scan
+	 * node, or forced), so it's not a join clause for our purposes.
+	 * XXX See treat_as_join_clause.
+	 */
+	if ((varRelid != 0) || (sjinfo == NULL))
+		return false;
+
+	/* XXX Can we rely on always getting RestrictInfo here? */
+	if (!IsA(clause, RestrictInfo))
+		return false;
+
+	/* strip the RestrictInfo */
+	rinfo = (RestrictInfo *) clause;
+	clause = (Node *) rinfo->clause;
+
+	/* is it referencing multiple relations? */
+	if (bms_membership(rinfo->clause_relids) != BMS_MULTIPLE)
+		return false;
+
+	/* we only support simple operator clauses for now */
+	if (!is_opclause(clause))
+		return false;
+
+	opclause = (OpExpr *) clause;
+
+	/* for now we only support estimating equijoins */
+	oprsel = get_oprjoin(opclause->opno);
+
+	if (oprsel != F_EQJOINSEL)
+		return false;
+
+	/*
+	 * Make sure we're not mixing vars from multiple relations on the same
+	 * side, like
+	 *
+	 *	 (t1.a + t2.a) = (t1.b + t2.b)
+	 *
+	 * which is still technically an opclause, but we can't match it to
+	 * extended statistics in a simple way.
+	 *
+	 * XXX This also means we require rinfo->clause_relids to have 2 rels.
+	 *
+	 * XXX Also check it's not expression on system attributes, which we
+	 * don't allow in extended statistics.
+	 *
+	 * XXX Although maybe we could allow cases that combine expressions
+	 * from both relations on either side? Like (t1.a + t2.b = t1.c - t2.d)
+	 * or something like that. We could do "cartesian product" of the MCV
+	 * stats and restrict it using this condition.
+	 */
+	foreach (lc, opclause->args)
+	{
+		Bitmapset *varnos = NULL;
+		Node *expr = (Node *) lfirst(lc);
+
+		varnos = pull_varnos(root, expr);
+
+		/*
+		 * No argument should reference more than just one relation.
+		 *
+		 * This effectively means the clause references exactly two
+		 * relations, one on each side. If there's no relation on one side,
+		 * it's a Const, and the other side has to be either Const or Expr
+		 * with a single rel. In which case it can't be a join clause.
+		 */
+		if (bms_num_members(varnos) > 1)
+			return false;
+
+		/*
+		 * XXX Maybe check that both relations have extended statistics
+		 * (no point in considering the clause as useful without it). But
+		 * we'll do that check later anyway, so keep this cheap.
+		 */
+	}
+
+	return true;
+}
+
+/*
+ * statext_try_join_estimates
+ *		Checks if it's worth considering extended stats on join estimates.
+ *
+ * A cheap pre-check: returns true if some not-yet-estimated clause is a
+ * supported join clause and a rel it references has extended statistics.
+ */
+bool
+statext_try_join_estimates(PlannerInfo *root, List *clauses, int varRelid,
+						   JoinType jointype, SpecialJoinInfo *sjinfo,
+						   Bitmapset *estimatedclauses)
+{
+	int listidx;
+	int k;
+	ListCell *lc;
+	Bitmapset *relids = NULL;
+
+	/*
+	 * XXX Not having these values means treat_as_join_clause returns false,
+	 * so we're not supposed to handle join clauses here. So just bail out.
+	 */
+	if ((varRelid != 0) || (sjinfo == NULL))
+		return false;
+
+	listidx = -1;
+	foreach (lc, clauses)
+	{
+		Node *clause = (Node *) lfirst(lc);
+		RestrictInfo *rinfo;
+		listidx++;
+
+		/* skip estimated clauses */
+		if (bms_is_member(listidx, estimatedclauses))
+			continue;
+
+		/*
+		 * Skip clauses that are not join clauses or that we don't know
+		 * how to estimate using extended statistics.
+		 */
+		if (!statext_is_supported_join_clause(root, clause, varRelid, sjinfo))
+			continue;
+
+		/*
+		 * Collect relids from all usable clauses.
+		 *
+		 * XXX We're guaranteed to have RestrictInfo thanks to the checks
+		 * in statext_is_supported_join_clause.
+		 */
+		rinfo = (RestrictInfo *) clause;
+		relids = bms_union(relids, rinfo->clause_relids);
+	}
+
+	/* no join clauses found, don't try applying extended stats */
+	if (bms_num_members(relids) == 0)
+		return false;
+
+	/*
+	 * We expect either 0 or >= 2 relids, a case with 1 relid in join clauses
+	 * should be impossible. And we just ruled out 0, so there are at least 2.
+	 */
+	Assert(bms_num_members(relids) >= 2);
+
+	/*
+	 * Check that at least some of the rels referenced by the clauses have
+	 * extended stats.
+	 *
+	 * XXX Maybe we should check how many rels have stats, and cross-check
+	 * how compatible they are (e.g. that both have MCVs, etc.). Also,
+	 * maybe this should cross-check the exact pairs of rels with a join
+	 * clause between them? OTOH this is supposed to be a cheap check, so
+	 * maybe better leave that for later.
+	 *
+	 * XXX We could also check if there are enough parameters in each rel
+	 * to consider extended stats. If there's just a single attribute, it's
+	 * probably better to use just regular statistics. OTOH we can also
+	 * consider restriction clauses from baserestrictinfo and use them
+	 * to calculate conditional probabilities.
+	 */
+	k = -1;
+	while ((k = bms_next_member(relids, k)) >= 0)
+	{
+		RelOptInfo *rel = find_base_rel(root, k);
+		if (rel->statlist)
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Information about a join between two relations. It tracks relations being
+ * joined and the join clauses.
+ */
+typedef struct JoinPairInfo
+{
+	Bitmapset *rels;		/* relids of the joined relations */
+	List *clauses;			/* join clauses, with the RestrictInfo stripped */
+} JoinPairInfo;
+
+/*
+ * statext_build_join_pairs
+ *		Extract pairs of joined rels with join clauses for each pair.
+ *
+ * Walks the remaining (not yet estimated) clauses, and splits them into
+ * lists for each pair of joined relations. Returns NULL if there are no
+ * suitable join pairs that might be estimated using extended stats;
+ * otherwise returns the array and stores its length in *npairs.
+ *
+ * XXX It's possible there are join clauses, but none of them is supported
+ * by the extended stats machinery (only opclauses with F_EQJOINSEL).
+ */
+static JoinPairInfo *
+statext_build_join_pairs(PlannerInfo *root, List *clauses, int varRelid,
+						 JoinType jointype, SpecialJoinInfo *sjinfo,
+						 Bitmapset *estimatedclauses, int *npairs)
+{
+	int cnt;
+	int listidx;
+	JoinPairInfo *info;
+	ListCell *lc;
+
+	/*
+	 * Assume each clause is for a different pair of relations (some of them
+	 * might be already estimated, but meh - there shouldn't be too many of
+	 * them and it's cheaper than repalloc).
+	 */
+	info = (JoinPairInfo *) palloc0(sizeof(JoinPairInfo) * list_length(clauses));
+	cnt = 0;
+
+	listidx = -1;
+	foreach(lc, clauses)
+	{
+		int i;
+		bool found;
+		Node *clause = (Node *) lfirst(lc);
+		RestrictInfo *rinfo;
+
+		listidx++;
+
+		/* skip already estimated clauses */
+		if (bms_is_member(listidx, estimatedclauses))
+			continue;
+
+		/*
+		 * Make sure the clause is a join clause of a supported shape (at
+		 * the moment we support just (Expr op Expr) clauses with each
+		 * side referencing just a single relation).
+		 */
+		if (!statext_is_supported_join_clause(root, clause, varRelid, sjinfo))
+			continue;
+
+		/* statext_is_supported_join_clause guarantees RestrictInfo */
+		rinfo = (RestrictInfo *) clause;
+		clause = (Node *) rinfo->clause;
+
+		/* search for a matching join pair */
+		found = false;
+		for (i = 0; i < cnt; i++)
+		{
+			if (bms_is_subset(rinfo->clause_relids, info[i].rels))
+			{
+				info[i].clauses = lappend(info[i].clauses, clause);
+				found = true;
+				break;
+			}
+		}
+
+		if (!found)
+		{
+			info[cnt].rels = rinfo->clause_relids;
+			info[cnt].clauses = lappend(info[cnt].clauses, clause);
+			cnt++;
+		}
+	}
+
+	if (cnt == 0)
+		return NULL;
+
+	*npairs = cnt;
+	return info;
+}
+
+/*
+ * extract_relation_info
+ *		Extract information about a relation in a join pair.
+ *
+ * The relation is identified by index (generally 0 or 1) into info->rels.
+ * Returns its RelOptInfo and stores in *stat the MCV statistics picked by
+ * find_matching_mcv for the join clauses (NULL if there is none).
+ *
+ * XXX Indexes above 1 would need clauses on 3+ rels, which we reject.
+ */
+static RelOptInfo *
+extract_relation_info(PlannerInfo *root, JoinPairInfo *info, int index,
+					  StatisticExtInfo **stat)
+{
+	int k;
+	int relid;
+	RelOptInfo *rel;
+	ListCell *lc;
+
+	Bitmapset *attnums = NULL;
+	List *exprs = NIL;
+
+	k = -1;
+	while (index >= 0)
+	{
+		k = bms_next_member(info->rels, k);
+		if (k < 0)
+			elog(ERROR, "failed to extract relid");
+
+		relid = k;
+		index--;
+	}
+
+	rel = find_base_rel(root, relid);
+
+	/*
+	 * Walk the clauses for this join pair, and extract expressions about
+	 * the relation identified by index / relid. For simple Vars we extract
+	 * the attnum. Otherwise we keep the whole expression.
+	 */
+	foreach (lc, info->clauses)
+	{
+		ListCell *lc2;
+		Node *clause = (Node *) lfirst(lc);
+		OpExpr *opclause = (OpExpr *) clause;
+
+		/* only opclauses supported for now */
+		Assert(is_opclause(clause));
+
+		foreach (lc2, opclause->args)
+		{
+			Node *arg = (Node *) lfirst(lc2);
+			Bitmapset *varnos = NULL;
+
+			/* plain Var reference - remember just the attnum */
+			if (IsA(arg, Var))
+			{
+				Var *var = (Var *) arg;
+
+				/* Ignore vars from other relations. */
+				if (var->varno != relid)
+					continue;
+
+				/* we also better ensure the Var is from the current level */
+				if (var->varlevelsup > 0)
+					continue;
+
+				/* System attributes are not expected here, so error out. */
+				if (!AttrNumberIsForUserDefinedAttr(var->varattno))
+					elog(ERROR, "unexpected system attribute");
+
+				attnums = bms_add_member(attnums, var->varattno);
+
+				/* Done, process the next argument. */
+				continue;
+			}
+
+			/*
+			 * OK, it's a more complex expression, so check if it matches
+			 * the relid and maybe keep it as a whole. It should be
+			 * compatible because we already checked it when building the
+			 * join pairs.
+			 */
+			varnos = pull_varnos(root, arg);
+
+			if (relid == bms_singleton_member(varnos))
+				exprs = lappend(exprs, arg);
+		}
+	}
+
+	*stat = find_matching_mcv(root, rel, attnums, exprs);
+
+	return rel;
+}
+
+/*
+ * statext_clauselist_join_selectivity
+ *		Use extended stats to estimate join clauses.
+ *
+ * Returns the combined selectivity of the join clauses that could be
+ * estimated (1.0 if none), and adds their indexes to *estimatedclauses.
+ *
+ * XXX In principle, we should not restrict this to cases with multiple
+ * join clauses, but calculate P(join clause | base restrictions). That does
+ * not happen as clauselist_selectivity_ext special-cases a single clause.
+ */
+Selectivity
+statext_clauselist_join_selectivity(PlannerInfo *root, List *clauses, int varRelid,
+									JoinType jointype, SpecialJoinInfo *sjinfo,
+									Bitmapset **estimatedclauses)
+{
+	int i;
+	int listidx;
+	Selectivity s = 1.0;
+
+	JoinPairInfo *info;
+	int ninfo;
+
+	if (!clauses)
+		return 1.0;
+
+	/* extract pairs of joined relations from the list of clauses */
+	info = statext_build_join_pairs(root, clauses, varRelid, jointype, sjinfo,
+									*estimatedclauses, &ninfo);
+
+	/* no useful join pairs */
+	if (!info)
+		return 1.0;
+
+	/*
+	 * Process the join pairs, try to find a matching MCV on each side.
+	 *
+	 * XXX The basic principle is quite similar to eqjoinsel_inner, i.e.
+	 * we try to find a MCV on both sides of the join, and use it to get
+	 * better join estimate. It's a bit more complicated, because there
+	 * might be multiple MCV lists, we also need ndistinct estimate, and
+	 * there may be interesting baserestrictions too.
+	 *
+	 * XXX At the moment we only handle the case with matching MCVs on
+	 * both sides, but it'd be good to also handle case with just ndistinct
+	 * statistics improving ndistinct estimates.
+	 *
+	 * XXX Perhaps it'd be good to also handle case with one side only
+	 * having "regular" statistics (e.g. MCV), especially in cases with
+	 * no conditions on that side of the join (where we can't use the
+	 * extended MCV to calculate conditional probability).
+	 */
+	for (i = 0; i < ninfo; i++)
+	{
+		RelOptInfo *rel1;
+		RelOptInfo *rel2;
+		StatisticExtInfo *stat1;
+		StatisticExtInfo *stat2;
+
+		ListCell *lc;
+
+		/* extract info about the first relation */
+		rel1 = extract_relation_info(root, &info[i], 0, &stat1);
+
+		/* extract info about the second relation */
+		rel2 = extract_relation_info(root, &info[i], 1, &stat2);
+
+		/* XXX only handling case with MCV on both sides for now */
+		if (!stat1 || !stat2)
+			continue;
+
+		s *= statext_compare_mcvs(root, rel1, rel2, stat1, stat2, info[i].clauses);
+
+		/*
+		 * Now mark all the clauses for this join pair (not just the first
+		 * pair's) as estimated.
+		 * XXX Maybe track the indexes in JoinPairInfo, so that we can
+		 * simply union the two bitmaps, without the extra matching.
+		 */
+		foreach (lc, info[i].clauses)
+		{
+			Node *clause = (Node *) lfirst(lc);
+			ListCell *lc2;
+
+			listidx = -1;
+			foreach (lc2, clauses)
+			{
+				Node *clause2 = (Node *) lfirst(lc2);
+				listidx++;
+
+				if (!IsA(clause2, RestrictInfo))
+					continue;	/* never part of a join pair */
+				clause2 = (Node *) ((RestrictInfo *) clause2)->clause;
+
+				if (equal(clause, clause2))
+				{
+					*estimatedclauses = bms_add_member(*estimatedclauses, listidx);
+					break;
+				}
+			}
+		}
+	}
+
+	return s;
+}
diff --git a/src/backend/statistics/mcv.c b/src/backend/statistics/mcv.c
index ef118952c7..7a7d2c8834 100644
--- a/src/backend/statistics/mcv.c
+++ b/src/backend/statistics/mcv.c
@@ -1602,7 +1602,7 @@ mcv_match_expression(Node *expr, Bitmapset *keys, List *exprs, Oid *collid)
* & and |, which should be faster than min/max. The bitmaps are fairly
* small, though (thanks to the cap on the MCV list size).
*/
-static bool *
+bool *
mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
Bitmapset *keys, List *exprs,
MCVList *mcvlist, bool is_or)
diff --git a/src/include/statistics/extended_stats_internal.h b/src/include/statistics/extended_stats_internal.h
index 55cd9252a5..072085365c 100644
--- a/src/include/statistics/extended_stats_internal.h
+++ b/src/include/statistics/extended_stats_internal.h
@@ -127,4 +127,8 @@ extern Selectivity mcv_clause_selectivity_or(PlannerInfo *root,
Selectivity *overlap_basesel,
Selectivity *totalsel);
+extern bool *mcv_get_match_bitmap(PlannerInfo *root, List *clauses,
+ Bitmapset *keys, List *exprs,
+ MCVList *mcvlist, bool is_or);
+
#endif /* EXTENDED_STATS_INTERNAL_H */
diff --git a/src/include/statistics/statistics.h b/src/include/statistics/statistics.h
index 326cf26fea..8d890e6ce7 100644
--- a/src/include/statistics/statistics.h
+++ b/src/include/statistics/statistics.h
@@ -120,10 +120,21 @@ extern Selectivity statext_clauselist_selectivity(PlannerInfo *root,
Bitmapset **estimatedclauses,
bool is_or);
extern bool has_stats_of_kind(List *stats, char requiredkind);
+extern StatisticExtInfo *find_matching_mcv(PlannerInfo *root, RelOptInfo *rel,
+ Bitmapset *attnums, List *exprs);
extern StatisticExtInfo *choose_best_statistics(List *stats, char requiredkind,
Bitmapset **clause_attnums,
List **clause_exprs,
int nclauses);
extern HeapTuple statext_expressions_load(Oid stxoid, int idx);
+extern bool statext_try_join_estimates(PlannerInfo *root, List *clauses, int varRelid,
+ JoinType jointype, SpecialJoinInfo *sjinfo,
+ Bitmapset *estimatedclauses);
+
+extern Selectivity statext_clauselist_join_selectivity(PlannerInfo *root, List *clauses,
+ int varRelid,
+ JoinType jointype, SpecialJoinInfo *sjinfo,
+ Bitmapset **estimatedclauses);
+
#endif /* STATISTICS_H */