From eee0c5f0de35d8cb83e6c4ca7749020acb18a4d1 Mon Sep 17 00:00:00 2001 From: "Andrey V. Lepikhov" Date: Wed, 28 Apr 2021 18:27:53 +0500 Subject: [PATCH] Remove self-joins. Remove inner joins of a relation to itself if can be proven that such join can be replaced with a scan. We can build the required proofs of uniqueness using the existing innerrel_is_unique machinery. We can remove a self-join when for each outer row, if: 1. At most one inner row matches the join clauses. 2. If the join target list contains any inner vars then the inner row is (physically) same row as the outer one. In this patch we use Rowley's [1] approach to identify a self-join: 1. Collect all mergejoinable join quals looks like a.x = b.x 2. Collect all another join quals. 3. Check innerrel_is_unique() for the qual list from (1). If it returns true, then outer row matches only the same row from the inner relation. Proved, that this join is self-join and can be replaced by a scan. Some regression tests changed due to self-join removal logic. [1] https://www.postgresql.org/message-id/raw/CAApHDvpggnFMC4yP-jUO7PKN%3DfXeErW5bOxisvJ0HvkHQEY%3DWw%40mail.gmail.com --- src/backend/optimizer/plan/analyzejoins.c | 942 +++++++++++++++++++++- src/backend/optimizer/plan/planmain.c | 5 + src/backend/optimizer/util/joininfo.c | 3 + src/backend/optimizer/util/relnode.c | 26 +- src/backend/utils/misc/guc.c | 10 + src/include/optimizer/pathnode.h | 4 + src/include/optimizer/planmain.h | 2 + src/test/regress/expected/equivclass.out | 32 + src/test/regress/expected/join.out | 331 ++++++++ src/test/regress/expected/sysviews.out | 3 +- src/test/regress/sql/equivclass.sql | 16 + src/test/regress/sql/join.sql | 166 ++++ 12 files changed, 1511 insertions(+), 29 deletions(-) diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index 37eb64bcef..dd5c4d2bd3 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -22,6 +22,7 @@ */ #include "postgres.h" +#include "catalog/pg_class.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/joininfo.h" @@ -32,10 +33,12 @@ #include "optimizer/tlist.h" #include "utils/lsyscache.h" +bool enable_self_join_removal; + /* local functions */ static bool join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo); static void remove_rel_from_query(PlannerInfo *root, int relid, - Relids joinrelids); + Relids joinrelids, int subst_relid); static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel); static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, @@ -47,6 +50,9 @@ static bool is_innerrel_unique_for(PlannerInfo *root, RelOptInfo *innerrel, JoinType jointype, List *restrictlist); +static void change_rinfo(RestrictInfo* rinfo, Index from, Index to); +static Bitmapset* change_relid(Relids relids, Index oldId, Index newId); +static void change_varno(Expr *expr, Index oldRelid, Index newRelid); /* @@ -86,7 +92,7 @@ restart: remove_rel_from_query(root, innerrelid, bms_union(sjinfo->min_lefthand, - sjinfo->min_righthand)); + sjinfo->min_righthand), 0); /* We verify that exactly one reference gets removed from joinlist */ nremoved = 0; @@ -300,7 +306,10 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) /* * Remove the target relid from the planner's data structures, having - * determined that there is no need to include it in the query. + * determined that there is no need to include it in the query. Or replace + * with another relid. + * To reusability, this routine can work in two modes: delete relid from a plan + * or replace it. It is used in replace mode in a self-join removing process. * * We are not terribly thorough here. We must make sure that the rel is * no longer treated as a baserel, and that attributes of other baserels @@ -309,13 +318,16 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) * lists, but only if they belong to the outer join identified by joinrelids. */ static void -remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids) +remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids, + int subst_relid) { RelOptInfo *rel = find_base_rel(root, relid); List *joininfos; Index rti; ListCell *l; + Assert(subst_relid == 0 || relid != subst_relid); + /* * Mark the rel as "dead" to show it is no longer part of the join tree. * (Removing it from the baserel array altogether seems too risky.) @@ -345,8 +357,11 @@ remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids) attroff--) { otherrel->attr_needed[attroff] = - bms_del_member(otherrel->attr_needed[attroff], relid); + change_relid(otherrel->attr_needed[attroff], relid, subst_relid); } + + /* Update lateral references. */ + change_varno((Expr*)otherrel->lateral_vars, relid, subst_relid); } /* @@ -361,10 +376,12 @@ remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids) { SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(l); - sjinfo->min_lefthand = bms_del_member(sjinfo->min_lefthand, relid); - sjinfo->min_righthand = bms_del_member(sjinfo->min_righthand, relid); - sjinfo->syn_lefthand = bms_del_member(sjinfo->syn_lefthand, relid); - sjinfo->syn_righthand = bms_del_member(sjinfo->syn_righthand, relid); + sjinfo->min_lefthand = change_relid(sjinfo->min_lefthand, relid, subst_relid); + sjinfo->min_righthand = change_relid(sjinfo->min_righthand, relid, subst_relid); + sjinfo->syn_lefthand = change_relid(sjinfo->syn_lefthand, relid, subst_relid); + sjinfo->syn_righthand = change_relid(sjinfo->syn_righthand, relid, subst_relid); + + change_varno((Expr*)sjinfo->semi_rhs_exprs, relid, subst_relid); } /* @@ -385,16 +402,29 @@ remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids) { PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(l); - Assert(!bms_is_member(relid, phinfo->ph_lateral)); - if (bms_is_subset(phinfo->ph_needed, joinrelids) && + if (subst_relid == 0 && bms_is_subset(phinfo->ph_needed, joinrelids) && bms_is_member(relid, phinfo->ph_eval_at)) root->placeholder_list = foreach_delete_current(root->placeholder_list, l); else { - phinfo->ph_eval_at = bms_del_member(phinfo->ph_eval_at, relid); + phinfo->ph_eval_at = change_relid(phinfo->ph_eval_at, relid, subst_relid); Assert(!bms_is_empty(phinfo->ph_eval_at)); - phinfo->ph_needed = bms_del_member(phinfo->ph_needed, relid); + phinfo->ph_needed = change_relid(phinfo->ph_needed, relid, subst_relid); + Assert(subst_relid != 0 || !bms_is_member(relid, phinfo->ph_lateral)); + phinfo->ph_lateral = change_relid(phinfo->ph_lateral, relid, subst_relid); + phinfo->ph_var->phrels = change_relid(phinfo->ph_var->phrels, relid, subst_relid); + } + } + + if (subst_relid != 0) + { + foreach(l, rel->baserestrictinfo) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); + + change_rinfo(rinfo, relid, subst_relid); + distribute_restrictinfo_to_rels(root, rinfo); } } @@ -418,6 +448,7 @@ remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids) RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); remove_join_clause_from_rels(root, rinfo, rinfo->required_relids); + change_rinfo(rinfo, relid, subst_relid); if (RINFO_IS_PUSHED_DOWN(rinfo, joinrelids)) { @@ -433,6 +464,8 @@ remove_rel_from_query(PlannerInfo *root, int relid, Relids joinrelids) relid); distribute_restrictinfo_to_rels(root, rinfo); } + else if (subst_relid != 0) + distribute_restrictinfo_to_rels(root, rinfo); } /* @@ -1118,3 +1151,886 @@ is_innerrel_unique_for(PlannerInfo *root, /* Let rel_is_distinct_for() do the hard work */ return rel_is_distinct_for(root, innerrel, clause_list); } + +typedef struct ChangeVarnoContext +{ + Index oldRelid; + Index newRelid; +} ChangeVarnoContext; + + +static bool +change_varno_walker(Node *node, ChangeVarnoContext *context) +{ + if (node == NULL) + return false; + + if (IsA(node, Var)) + { + Var* var = (Var*)node; + if (var->varno == context->oldRelid) + { + var->varno = context->newRelid; + var->varnosyn = context->newRelid; + var->location = -1; + } + else if (var->varno == context->newRelid) + var->location = -1; + + return false; + } + if (IsA(node, RestrictInfo)) + { + change_rinfo((RestrictInfo*)node, context->oldRelid, context->newRelid); + return false; + } + return expression_tree_walker(node, change_varno_walker, context); +} + +/* + * For all Vars in the expression that have varno = oldRelid, set + * varno = newRelid. + */ +static void +change_varno(Expr *expr, Index oldRelid, Index newRelid) +{ + ChangeVarnoContext context; + + if (newRelid == 0) + return; + + context.oldRelid = oldRelid; + context.newRelid = newRelid; + change_varno_walker((Node *) expr, &context); +} + +/* + * Substitute newId for oldId in relids. + */ +static Bitmapset* +change_relid(Relids relids, Index oldId, Index newId) +{ + if (newId == 0) + /* Delete relid without substitution. */ + return bms_del_member(relids, oldId); + + if (bms_is_member(oldId, relids)) + return bms_add_member(bms_del_member(bms_copy(relids), oldId), newId); + + return relids; +} + +static void +change_rinfo(RestrictInfo* rinfo, Index from, Index to) +{ + if (to == 0) + return; + + bool is_req_equal = + (rinfo->required_relids == rinfo->clause_relids) ? true : false; + + change_varno(rinfo->clause, from, to); + change_varno(rinfo->orclause, from, to); + rinfo->clause_relids = change_relid(rinfo->clause_relids, from, to); + if (is_req_equal) + rinfo->required_relids = rinfo->clause_relids; + else + rinfo->required_relids = change_relid(rinfo->required_relids, from, to); + rinfo->left_relids = change_relid(rinfo->left_relids, from, to); + rinfo->right_relids = change_relid(rinfo->right_relids, from, to); + rinfo->outer_relids = change_relid(rinfo->outer_relids, from, to); + rinfo->nullable_relids = change_relid(rinfo->nullable_relids, from, to); +} + +/* + * Update EC members to point to the remaining relation instead of the removed + * one, removing duplicates. + */ +static void +update_ec_members(EquivalenceClass *ec, Index toRemove, Index toKeep) +{ + int counter; + + for (counter = 0; counter < list_length(ec->ec_members); ) + { + ListCell *cell = list_nth_cell(ec->ec_members, counter); + EquivalenceMember *em = lfirst(cell); + int counter1; + + if (!bms_is_member(toRemove, em->em_relids)) + { + counter++; + continue; + } + + em->em_relids = change_relid(em->em_relids, toRemove, toKeep); + /* We only process inner joins */ + change_varno(em->em_expr, toRemove, toKeep); + + /* + * After we switched the equivalence member to the remaining relation, + * check that it is not the same as the existing member, and if it + * is, delete it. + */ + for (counter1 = 0; counter1 < list_length(ec->ec_members); counter1++) + { + EquivalenceMember *other; + + if (counter1 == counter) + continue; + + other = castNode(EquivalenceMember, list_nth(ec->ec_members, counter1)); + + if (equal(other->em_expr, em->em_expr)) + break; + } + + if (counter1 < list_length(ec->ec_members)) + ec->ec_members = list_delete_cell(ec->ec_members, cell); + else + counter++; + } +} + +/* + * Update EC sources to point to the remaining relation instead of the + * removed one. + */ +static void +update_ec_sources(List **sources, Index toRemove, Index toKeep) +{ + int counter; + int cc=0; + + for (counter = 0; counter < list_length(*sources); ) + { + ListCell *cell = list_nth_cell(*sources, counter); + RestrictInfo *rinfo = castNode(RestrictInfo, lfirst(cell)); + int counter1; + + if (!bms_is_member(toRemove, rinfo->required_relids)) + { + counter++; + continue; + } + + change_varno(rinfo->clause, toRemove, toKeep); + + /* + * After switching the clause to the remaining relation, check it for + * redundancy with existing ones. We don't have to check for + * redundancy with derived clauses, because we've just deleted them. + */ + for (counter1 = 0; counter1 < list_length(*sources); counter1++) + { + RestrictInfo *other; + + if (counter1 == counter) + continue; + + other = castNode(RestrictInfo, list_nth(*sources, counter1)); + if (equal(rinfo->clause, other->clause)) + break; + } + + if (counter1 < list_length(*sources)) + { + *sources = list_delete_cell(*sources, cell); + cc++; + } + else + { + counter++; + + /* We will keep this RestrictInfo, correct its relids. */ + change_rinfo(rinfo, toRemove, toKeep); + } + } +} + +/* + * Remove a relation after we have proven that it participates only in an + * unneeded unique self join. + * + * The joinclauses list is destructively changed. + */ +static void +remove_self_join_rel(PlannerInfo *root, PlanRowMark *kmark, PlanRowMark *rmark, + RelOptInfo *toKeep, RelOptInfo *toRemove) +{ + ListCell *cell; + int i; + List *target = NIL; + + /* + * Include all eclass mentions of removed relation into the eclass mentions + * of kept relation. + */ + toKeep->eclass_indexes = bms_add_members(toRemove->eclass_indexes, + toKeep->eclass_indexes); + + /* + * Now, baserestrictinfo replenished with restrictions from removing + * relation. It is needed to remove duplicates and replace degenerated + * clauses with a NullTest. + */ + foreach(cell, toKeep->baserestrictinfo) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, cell); + ListCell *otherCell; + + Assert(!bms_is_member(toRemove->relid, rinfo->clause_relids)); + + /* + * If this clause is a mergejoinable equality clause that compares a + * variable to itself, i.e., has the form of "X=X", replace it with + * null test. + */ + if (rinfo->mergeopfamilies && IsA(rinfo->clause, OpExpr)) + { + Expr *leftOp; + Expr *rightOp; + + leftOp = (Expr *) get_leftop(rinfo->clause); + rightOp = (Expr *) get_rightop(rinfo->clause); + + if (leftOp != NULL && equal(leftOp, rightOp)) + { + NullTest *nullTest = makeNode(NullTest); + nullTest->arg = leftOp; + nullTest->nulltesttype = IS_NOT_NULL; + nullTest->argisrow = false; + nullTest->location = -1; + rinfo->clause = (Expr *) nullTest; + } + } + + /* Search for duplicates. */ + foreach(otherCell, target) + { + RestrictInfo *other = lfirst_node(RestrictInfo, otherCell); + + if (other == rinfo || + (rinfo->parent_ec != NULL + && other->parent_ec == rinfo->parent_ec) + || equal(rinfo->clause, other->clause)) + { + break; + } + } + + if (otherCell != NULL) + /* Duplicate found */ + continue; + + target = lappend(target, rinfo); + } + + list_free(toKeep->baserestrictinfo); + toKeep->baserestrictinfo = target; + + /* + * Update the equivalence classes that reference the removed relations. + */ + foreach(cell, root->eq_classes) + { + EquivalenceClass *ec = lfirst(cell); + + if (!bms_is_member(toRemove->relid, ec->ec_relids)) + { + /* + * This EC doesn't reference the removed relation, nothing to be + * done for it. + */ + continue; + } + + /* + * Update the EC members to reference the remaining relation instead + * of the removed one. + */ + update_ec_members(ec, toRemove->relid, toKeep->relid); + ec->ec_relids = change_relid(ec->ec_relids, toRemove->relid, toKeep->relid); + + /* + * We will now update source and derived clauses of the EC. + * + * Restriction clauses for base relations are already distributed to + * the respective baserestrictinfo lists (see + * generate_implied_equalities). The above code has already processed + * this list, and updated these clauses to reference the remaining + * relation, so we can skip them here based on their relids. + * + * Likewise, we have already processed the join clauses that join the + * removed relation to the remaining one. + * + * Finally, there are join clauses that join the removed relation to + * some third relation. We can't just delete the source clauses and + * regenerate them from the EC, because the corresponding equality + * operators might be missing (see the handling of ec_broken). + * Therefore, we will update the references in the source clauses. + * + * Derived clauses can be generated again, so it is simpler to just + * delete them. + */ + list_free(ec->ec_derives); + ec->ec_derives = NULL; + update_ec_sources(&ec->ec_sources, toRemove->relid, toKeep->relid); + } + + /* + * Transfer the targetlist and attr_needed flags. + */ + Assert(toRemove->reltarget->sortgrouprefs == 0); + + foreach (cell, toRemove->reltarget->exprs) + { + Expr *node = lfirst(cell); + change_varno(node, toRemove->relid, toKeep->relid); + if (!list_member(toKeep->reltarget->exprs, node)) + toKeep->reltarget->exprs = lappend(toKeep->reltarget->exprs, node); + } + + for (i = toKeep->min_attr; i <= toKeep->max_attr; i++) + { + int attno = i - toKeep->min_attr; + toKeep->attr_needed[attno] = bms_add_members(toKeep->attr_needed[attno], + toRemove->attr_needed[attno]); + } + + /* + * If the removed relation has a row mark, transfer it to the remaining + * one. + * + * If both rels have row marks, just keep the one corresponding to the + * remaining relation, because we verified earlier that they have the same + * strength. + * + * Also make sure that the scratch->row_marks cache is up to date, because + * we are going to use it for further join removals. + */ + if (rmark) + { + if (kmark) + { + Assert(kmark->markType == rmark->markType); + + root->rowMarks = list_delete_ptr(root->rowMarks, kmark); + } + else + { + /* Shouldn't have inheritance children here. */ + Assert(kmark->rti == kmark->prti); + + rmark->rti = toKeep->relid; + rmark->prti = toKeep->relid; + } + } + + /* + * Change varno in some special cases with non-trivial RangeTblEntry + */ + foreach(cell, root->parse->rtable) + { + RangeTblEntry *rte = lfirst(cell); + + switch(rte->rtekind) + { + case RTE_FUNCTION: + change_varno((Expr*)rte->functions, toRemove->relid, toKeep->relid); + break; + case RTE_TABLEFUNC: + change_varno((Expr*)rte->tablefunc, toRemove->relid, toKeep->relid); + break; + case RTE_VALUES: + change_varno((Expr*)rte->values_lists, toRemove->relid, toKeep->relid); + break; + default: + /* no op */ + break; + } + } + + /* + * Replace varno in root targetlist and HAVING clause. + */ + change_varno((Expr *) root->processed_tlist, toRemove->relid, toKeep->relid); + change_varno((Expr *) root->parse->havingQual, toRemove->relid, toKeep->relid); + + /* + * Transfer join and restriction clauses from the removed relation to the + * remaining one. We change the Vars of the clause to point to the + * remaining relation instead of the removed one. The clauses that require + * a subset of joinrelids become restriction clauses of the remaining + * relation, and others remain join clauses. We append them to + * baserestrictinfo and joininfo respectively, trying not to introduce + * duplicates. + * + * We also have to process the 'joinclauses' list here, because it + * contains EC-derived join clauses which must become filter clauses. It + * is not enough to just correct the ECs, because the EC-derived + * restrictions are generated before join removal (see + * generate_base_implied_equalities). + */ +} + +/* + * split_selfjoin_quals + * Processes 'joinquals' building two lists, one with a list of quals + * where the columns/exprs on either side of the join match and another + * list containing the remaining quals. + * + * 'joinquals' must only contain quals for a RTE_RELATION being joined to + * itself. + */ +static void +split_selfjoin_quals(PlannerInfo *root, List *joinquals, List **selfjoinquals, + List **otherjoinquals) +{ + ListCell *lc; + List *sjoinquals = NIL; + List *rjoinquals = NIL; + + foreach(lc, joinquals) + { + RestrictInfo *rinfo = (RestrictInfo *) lfirst(lc); + OpExpr *expr; + Expr *leftexpr; + Expr *rightexpr; + + if (bms_num_members(rinfo->clause_relids) != 2 || + bms_num_members(rinfo->left_relids) != 1 || + bms_num_members(rinfo->right_relids) != 1) + { + rjoinquals = lappend(rjoinquals, rinfo); + continue; + } + + expr = (OpExpr *) rinfo->clause; + + if (!IsA(expr, OpExpr) || list_length(expr->args) != 2) + { + rjoinquals = lappend(rjoinquals, rinfo); + continue; + } + + leftexpr = (Expr *) copyObject(get_leftop(rinfo->clause)); + rightexpr = (Expr *) copyObject(get_rightop(rinfo->clause)); + + /* Can't match of the exprs are not of the same type */ + if (leftexpr->type != rightexpr->type) + { + rjoinquals = lappend(rjoinquals, rinfo); + continue; + } + + change_varno(rightexpr, + bms_next_member(rinfo->right_relids, -1), + bms_next_member(rinfo->left_relids, -1)); + + if (equal(leftexpr, rightexpr)) + sjoinquals = lappend(sjoinquals, rinfo); + else + rjoinquals = lappend(rjoinquals, rinfo); + } + + *selfjoinquals = sjoinquals; + *otherjoinquals = rjoinquals; +} + +static bool +tlist_contains_rel_exprs(PlannerInfo *root, Relids relids, RelOptInfo *rel) +{ + ListCell *vars; + + foreach(vars, rel->reltarget->exprs) + { + Var *var = (Var *) lfirst(vars); + RelOptInfo *baserel; + int ndx; + + /* + * Ignore PlaceHolderVars in the input tlists; we'll make our own + * decisions about whether to copy them. + */ + if (IsA(var, PlaceHolderVar)) + return true; + + /* + * Otherwise, anything in a baserel or joinrel targetlist ought to be + * a Var. (More general cases can only appear in appendrel child + * rels, which will never be seen here.) + */ + if (!IsA(var, Var)) + elog(ERROR, "unexpected node type in rel targetlist: %d", + (int) nodeTag(var)); + + /* Get the Var's original base rel */ + baserel = find_base_rel(root, var->varno); + + /* Is it still needed above this joinrel? */ + ndx = var->varattno - baserel->min_attr; + if (bms_nonempty_difference(baserel->attr_needed[ndx], relids)) + return true; + } + return false; +} + +/* + * Find and remove unique self joins in a group of base relations that have + * the same Oid. + * + * Returns a set of relids that were removed. + */ +static Relids +remove_self_joins_one_group(PlannerInfo *root, Relids relids) +{ + Relids joinrelids = NULL; + Relids result = NULL; + int k; /* Index of kept relation */ + int r = -1; /* Index of removed relation */ + + if (bms_num_members(relids) < 2) + return NULL; + + while ((r = bms_next_member(relids, r)) > 0) + { + RelOptInfo *outer = root->simple_rel_array[r]; + k = r; + + while ((k = bms_next_member(relids, k)) > 0) + { + RelOptInfo *inner = root->simple_rel_array[k]; + List *restrictlist; + List *selfjoinquals; + List *otherjoinquals; + ListCell *lc; + bool jinfo_check = true; + PlanRowMark *omark = NULL; + PlanRowMark *imark = NULL; + + /* A sanity check: the relations have the same Oid. */ + Assert(root->simple_rte_array[k]->relid == + root->simple_rte_array[r]->relid); + + /* + * It is impossible to optimize two relations if they belong to + * different rules of order restriction. Otherwise planner can't + * be able to find any variants of correct query plan. + */ + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *info = (SpecialJoinInfo *) lfirst(lc); + + if (bms_is_member(k, info->syn_lefthand) && + !bms_is_member(r, info->syn_lefthand)) + jinfo_check = false; + + if (bms_is_member(k, info->syn_righthand) && + !bms_is_member(r, info->syn_righthand)) + jinfo_check = false; + + if (bms_is_member(r, info->syn_lefthand) && + !bms_is_member(k, info->syn_lefthand)) + jinfo_check = false; + + if (bms_is_member(r, info->syn_righthand) && + !bms_is_member(k, info->syn_righthand)) + jinfo_check = false; + + if (!jinfo_check) + break; + } + + if (!jinfo_check) + continue; + + /* Reuse joinrelids bitset to avoid reallocation. */ + joinrelids = bms_del_members(joinrelids, joinrelids); + + /* + * We only deal with base rels here, so their relids bitset + * contains only one member -- their relid. + */ + joinrelids = bms_add_member(joinrelids, r); + joinrelids = bms_add_member(joinrelids, k); + + /* Is it a unique self join? */ + restrictlist = build_joinrel_restrictlist(root, joinrelids, outer, + inner); + + /* + * Process restrictlist to seperate out the self join quals from + * the other quals. e.g x = x goes to selfjoinquals and a = b to + * otherjoinquals. + */ + split_selfjoin_quals(root, restrictlist, &selfjoinquals, + &otherjoinquals); + + if (list_length(selfjoinquals) == 0) + { + /* + * Have a chance to remove join if target list contains vars from + * the only one relation. + */ + if (list_length(otherjoinquals) == 0) + { + /* Can't determine uniqueness without any quals. */ + continue; + + } + else if (!tlist_contains_rel_exprs(root, joinrelids, inner)) + { + /* + * TODO: + * In this case, we only have a chance in the case of a + * foreign key reference. + */ + continue; + } + else + /* + * The target list contains vars from both inner and outer + * relations. + */ + continue; + } + /* + * Determine if the inner table can duplicate outer rows. We must + * bypass the unique rel cache here since we're possibly using a + * subset of join quals. We can use 'force_cache' = true when all + * join quals are selfjoin quals. Otherwise we could end up + * putting false negatives in the cache. + */ + else if (!innerrel_is_unique(root, joinrelids, outer->relids, + inner, JOIN_INNER, selfjoinquals, + list_length(otherjoinquals) == 0)) + continue; + + /* See for row marks. */ + foreach (lc, root->rowMarks) + { + PlanRowMark *rowMark = (PlanRowMark *) lfirst(lc); + + if (rowMark->rti == k) + { + Assert(imark == NULL); + imark = rowMark; + } + else if (rowMark->rti == r) + { + Assert(omark == NULL); + omark = rowMark; + } + } + + /* + * We can't remove the join if the relations have row marks of + * different strength (e.g. one is locked FOR UPDATE and another + * just has ROW_MARK_REFERENCE for EvalPlanQual rechecking). + */ + if (omark && imark && omark->markType != imark->markType) + continue; + + /* + * Be safe to do not remove table participated in complicated PH + */ + foreach(lc, root->placeholder_list) + { + PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(lc); + + /* there isn't any other place to eval PHV */ + if (bms_is_subset(phinfo->ph_eval_at, joinrelids) || + bms_is_subset(phinfo->ph_needed, joinrelids)) + break; + } + + if (lc) + continue; + + /* + * We can remove either relation, so remove the outer one, to + * simplify this loop. + */ + + /* + * Add join restrictions to joininfo of removing relation to simplify + * the relids replacing procedure. + */ + outer->joininfo = list_concat(outer->joininfo, restrictlist); + + /* Firstly, replace index of excluding relation with keeping. */ + remove_rel_from_query(root, outer->relid, joinrelids, inner->relid); + + /* Secondly, fix restrictions of keeping relation */ + remove_self_join_rel(root, imark, omark, inner, outer); + result = bms_add_member(result, r); + + /* We removed the outer relation, try the next one. */ + break; + } + } + + return result; +} + +/* + * Iteratively form a group of relation indexes with the same oid and launch + * the routine that detects self-joins in this group and removes excessive + * range table entries. + * + * At the end of iteration, exclude the group from the overall relids list. + * So each next iteration of the cycle will involve less and less value of + * relids. + */ +static Relids +remove_self_joins_one_level(PlannerInfo *root, Relids relids, Relids ToRemove) +{ + while (!bms_is_empty(relids)) + { + Relids group = NULL; + Oid groupOid; + int i; + + i = bms_first_member(relids); + groupOid = root->simple_rte_array[i]->relid; + Assert(OidIsValid(groupOid)); + group = bms_add_member(group, i); + + /* Create group of relation indexes with the same oid. */ + while ((i = bms_next_member(relids, i)) > 0) + { + RangeTblEntry *rte = root->simple_rte_array[i]; + + Assert(OidIsValid(rte->relid)); + + if (rte->relid == groupOid) + group = bms_add_member(group, i); + } + + relids = bms_del_members(relids, group); + ToRemove = bms_add_members(ToRemove, + remove_self_joins_one_group(root, group)); + bms_free(group); + } + return ToRemove; +} + +/* + * For each level of joinlist form a set of base relations and launch the + * routine of the self-join removal optimization. Recurse into sub-joinlists to + * handle deeper levels. + */ +static Relids +remove_self_joins_recurse(PlannerInfo *root, List *joinlist, Relids ToRemove) +{ + ListCell *lc; + Relids relids = NULL; + + /* Collect the ids of base relations at one level of the join tree. */ + foreach (lc, joinlist) + { + switch (((Node *) lfirst(lc))->type) + { + case T_List: + /* Recursively go inside the sub-joinlist */ + ToRemove = remove_self_joins_recurse(root, + (List *) lfirst(lc), + ToRemove); + break; + case T_RangeTblRef: + { + RangeTblRef *ref = (RangeTblRef *) lfirst(lc); + RangeTblEntry *rte = root->simple_rte_array[ref->rtindex]; + + /* + * We only care about base relations from which we select + * something. + */ + if (rte->rtekind != RTE_RELATION || + rte->relkind != RELKIND_RELATION || + root->simple_rel_array[ref->rtindex] == NULL) + break; + + Assert(!bms_is_member(ref->rtindex, relids)); + relids = bms_add_member(relids, ref->rtindex); + + /* + * Limit the number of joins we process to control the quadratic + * behavior. + */ + if (bms_num_members(relids) > join_collapse_limit) + break; + } + break; + default: + Assert(false); + } + } + + if (bms_num_members(relids) >= 2) + ToRemove = remove_self_joins_one_level(root, relids, ToRemove); + + bms_free(relids); + return ToRemove; +} + +/* + * Find and remove useless self joins. + * + * We search for joins where the same relation is joined to itself on all + * columns of some unique index. If this condition holds, then, for + * each outer row, only one inner row matches, and it is the same row + * of the same relation. This allows us to remove the join and replace + * it with a scan that combines WHERE clauses from both sides. The join + * clauses themselves assume the form of X = X and can be replaced with + * NOT NULL clauses. + * + * For the sake of simplicity, we don't apply this optimization to special + * joins. Here is a list of what we could do in some particular cases: + * 'a a1 semi join a a2': is reduced to inner by reduce_unique_semijoins, + * and then removed normally. + * 'a a1 anti join a a2': could simplify to a scan with 'outer quals AND + * (IS NULL on join columns OR NOT inner quals)'. + * 'a a1 left join a a2': could simplify to a scan like inner, but without + * NOT NULL conditions on join columns. + * 'a a1 left join (a a2 join b)': can't simplify this, because join to b + * can both remove rows and introduce duplicates. + * + * To search for removable joins, we order all the relations on their Oid, + * go over each set with the same Oid, and consider each pair of relations + * in this set. We check that both relation are made unique by the same + * unique index with the same clauses. + * + * To remove the join, we mark one of the participating relations as + * dead, and rewrite all references to it to point to the remaining + * relation. This includes modifying RestrictInfos, EquivalenceClasses and + * EquivalenceMembers. We also have to modify the row marks. The join clauses + * of the removed relation become either restriction or join clauses, based on + * whether they reference any relations not participating in the removed join. + * + * 'targetlist' is the top-level targetlist of query. If it has any references + * to the removed relations, we update them to point to the remaining ones. + */ +List * +remove_useless_self_joins(PlannerInfo *root, List *joinlist) +{ + Relids ToRemove = NULL; + int relid = -1; + + if (!enable_self_join_removal) + return joinlist; + + /* + * Merge pairs of relations participated in self-join. Remove + * unnecessary range table entries. + */ + ToRemove = remove_self_joins_recurse(root, joinlist, ToRemove); + while ((relid = bms_next_member(ToRemove, relid)) >= 0) + { + int nremoved = 0; + joinlist = remove_rel_from_joinlist(joinlist, relid, &nremoved); + } + + return joinlist; +} diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index 273ac0acf7..28a55b0f42 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -226,6 +226,11 @@ query_planner(PlannerInfo *root, */ reduce_unique_semijoins(root); + /* + * Remove self joins on a unique column. + */ + joinlist = remove_useless_self_joins(root, joinlist); + /* * Now distribute "placeholders" to base rels as needed. This has to be * done after join removal because removal could change whether a diff --git a/src/backend/optimizer/util/joininfo.c b/src/backend/optimizer/util/joininfo.c index 717808b037..3859b4843a 100644 --- a/src/backend/optimizer/util/joininfo.c +++ b/src/backend/optimizer/util/joininfo.c @@ -130,6 +130,9 @@ remove_join_clause_from_rels(PlannerInfo *root, { RelOptInfo *rel = find_base_rel(root, cur_relid); + if (!list_member_ptr(rel->joininfo, restrictinfo)) + continue; + /* * Remove the restrictinfo from the list. Pointer comparison is * sufficient. diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index e105a4d5f1..8aa8993749 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -40,14 +40,10 @@ typedef struct JoinHashEntry static void build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *input_rel); -static List *build_joinrel_restrictlist(PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outer_rel, - RelOptInfo *inner_rel); static void build_joinrel_joinlist(RelOptInfo *joinrel, RelOptInfo *outer_rel, RelOptInfo *inner_rel); -static List *subbuild_joinrel_restrictlist(RelOptInfo *joinrel, +static List *subbuild_joinrel_restrictlist(Relids joinrelids, List *joininfo_list, List *new_restrictlist); static List *subbuild_joinrel_joinlist(RelOptInfo *joinrel, @@ -599,7 +595,7 @@ build_join_rel(PlannerInfo *root, */ if (restrictlist_ptr) *restrictlist_ptr = build_joinrel_restrictlist(root, - joinrel, + joinrel->relids, outer_rel, inner_rel); return joinrel; @@ -705,7 +701,7 @@ build_join_rel(PlannerInfo *root, * caller might or might not need the restrictlist, but I need it anyway * for set_joinrel_size_estimates().) */ - restrictlist = build_joinrel_restrictlist(root, joinrel, + restrictlist = build_joinrel_restrictlist(root, joinrel->relids, outer_rel, inner_rel); if (restrictlist_ptr) *restrictlist_ptr = restrictlist; @@ -1056,7 +1052,7 @@ build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, * the various joinlist entries ultimately refer to RestrictInfos * pushed into them by distribute_restrictinfo_to_rels(). * - * 'joinrel' is a join relation node + * 'joinrelids' is a join relation id set * 'outer_rel' and 'inner_rel' are a pair of relations that can be joined * to form joinrel. * @@ -1069,9 +1065,9 @@ build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, * RestrictInfo nodes are no longer context-dependent. Instead, just include * the original nodes in the lists made for the join relation. */ -static List * +List * build_joinrel_restrictlist(PlannerInfo *root, - RelOptInfo *joinrel, + Relids joinrelids, RelOptInfo *outer_rel, RelOptInfo *inner_rel) { @@ -1082,8 +1078,8 @@ build_joinrel_restrictlist(PlannerInfo *root, * eliminating any duplicates (important since we will see many of the * same clauses arriving from both input relations). */ - result = subbuild_joinrel_restrictlist(joinrel, outer_rel->joininfo, NIL); - result = subbuild_joinrel_restrictlist(joinrel, inner_rel->joininfo, result); + result = subbuild_joinrel_restrictlist(joinrelids, outer_rel->joininfo, NIL); + result = subbuild_joinrel_restrictlist(joinrelids, inner_rel->joininfo, result); /* * Add on any clauses derived from EquivalenceClasses. These cannot be @@ -1092,7 +1088,7 @@ build_joinrel_restrictlist(PlannerInfo *root, */ result = list_concat(result, generate_join_implied_equalities(root, - joinrel->relids, + joinrelids, outer_rel->relids, inner_rel)); @@ -1118,7 +1114,7 @@ build_joinrel_joinlist(RelOptInfo *joinrel, } static List * -subbuild_joinrel_restrictlist(RelOptInfo *joinrel, +subbuild_joinrel_restrictlist(Relids joinrelids, List *joininfo_list, List *new_restrictlist) { @@ -1128,7 +1124,7 @@ subbuild_joinrel_restrictlist(RelOptInfo *joinrel, { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); - if (bms_is_subset(rinfo->required_relids, joinrel->relids)) + if (bms_is_subset(rinfo->required_relids, joinrelids)) { /* * This clause becomes a restriction clause for the joinrel, since diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f2c7c2486b..b4d0a4ed9f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1156,6 +1156,16 @@ static struct config_bool ConfigureNamesBool[] = true, NULL, NULL, NULL }, + { + {"enable_self_join_removal", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enable removal of unique self-joins."), + NULL, + GUC_EXPLAIN + }, + &enable_self_join_removal, + true, + NULL, NULL, NULL + }, { {"geqo", PGC_USERSET, QUERY_TUNING_GEQO, gettext_noop("Enables genetic query optimization."), diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 53261ee91f..54d1a45fc5 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -306,6 +306,10 @@ extern RelOptInfo *build_join_rel(PlannerInfo *root, RelOptInfo *inner_rel, SpecialJoinInfo *sjinfo, List **restrictlist_ptr); +extern List *build_joinrel_restrictlist(PlannerInfo *root, + Relids joinrelids, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel); extern Relids min_join_parameterization(PlannerInfo *root, Relids joinrelids, RelOptInfo *outer_rel, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index bf1adfc52a..629bda4b0b 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -20,6 +20,7 @@ /* GUC parameters */ #define DEFAULT_CURSOR_TUPLE_FRACTION 0.1 extern double cursor_tuple_fraction; +extern bool enable_self_join_removal; /* query_planner callback to compute query_pathkeys */ typedef void (*query_pathkeys_callback) (PlannerInfo *root, void *extra); @@ -107,6 +108,7 @@ extern bool query_is_distinct_for(Query *query, List *colnos, List *opids); extern bool innerrel_is_unique(PlannerInfo *root, Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel, JoinType jointype, List *restrictlist, bool force_cache); +extern List * remove_useless_self_joins(PlannerInfo *root, List *jointree); /* * prototypes for plan/setrefs.c diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out index 126f7047fe..de71441052 100644 --- a/src/test/regress/expected/equivclass.out +++ b/src/test/regress/expected/equivclass.out @@ -430,6 +430,38 @@ explain (costs off) Filter: ((unique1 IS NOT NULL) AND (unique2 IS NOT NULL)) (2 rows) +-- Test that broken ECs are processed correctly during self join removal. +-- Disable merge joins so that we don't get an error about missing commutator. +-- Test both orientations of the join clause, because only one of them breaks +-- the EC. +set enable_mergejoin to off; +explain (costs off) + select * from ec0 m join ec0 n on m.ff = n.ff + join ec1 p on m.ff + n.ff = p.f1; + QUERY PLAN +---------------------------------------- + Nested Loop + Join Filter: ((n.ff + n.ff) = p.f1) + -> Seq Scan on ec1 p + -> Materialize + -> Seq Scan on ec0 n + Filter: (ff IS NOT NULL) +(6 rows) + +explain (costs off) + select * from ec0 m join ec0 n on m.ff = n.ff + join ec1 p on p.f1::int8 = (m.ff + n.ff)::int8alias1; + QUERY PLAN +--------------------------------------------------------------- + Nested Loop + Join Filter: ((p.f1)::bigint = ((n.ff + n.ff))::int8alias1) + -> Seq Scan on ec1 p + -> Materialize + -> Seq Scan on ec0 n + Filter: (ff IS NOT NULL) +(6 rows) + +reset enable_mergejoin; -- this could be converted, but isn't at present explain (costs off) select * from tenk1 where unique1 = unique1 or unique2 = unique2; diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 86fd3907c5..7cd13c2b33 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4880,6 +4880,337 @@ select * from ----+----+----+---- (0 rows) +-- +-- test that semi- or inner self-joins on a unique column are removed +-- +-- enable only nestloop to get more predictable plans +set enable_hashjoin to off; +set enable_mergejoin to off; +create table sj (a int unique, b int); +insert into sj values (1, null), (null, 2), (2, 1); +analyze sj; +select p.* from sj p, sj q where q.a = p.a and q.b = q.a - 1; + a | b +---+--- + 2 | 1 +(1 row) + +explain (costs off) +select p.* from sj p, sj q where q.a = p.a and q.b = q.a - 1; + QUERY PLAN +----------------------------------------------- + Seq Scan on sj q + Filter: ((a IS NOT NULL) AND (b = (a - 1))) +(2 rows) + +explain (costs off) +select * from sj p +where exists (select * from sj q + where q.a = p.a and q.b < 10); + QUERY PLAN +------------------------------------------ + Seq Scan on sj q + Filter: ((a IS NOT NULL) AND (b < 10)) +(2 rows) + +-- Double self-join removal. +-- Use a condition on "b + 1", not on "b", for the second join, so that +-- the equivalence class is different from the first one, and we can +-- test the non-ec code path. +explain (costs off) +select * from sj t1 join sj t2 on t1.a = t2.a and t1.b = t2.b + join sj t3 on t2.a = t3.a and t2.b + 1 = t3.b + 1; + QUERY PLAN +--------------------------------------------------------------------------- + Seq Scan on sj t3 + Filter: ((a IS NOT NULL) AND (b IS NOT NULL) AND ((b + 1) IS NOT NULL)) +(2 rows) + +-- subselect that references the removed relation +explain (costs off) +select t1.a, (select a from sj where a = t2.a and a = t1.a) +from sj t1, sj t2 +where t1.a = t2.a; + QUERY PLAN +------------------------------------------ + Seq Scan on sj t2 + Filter: (a IS NOT NULL) + SubPlan 1 + -> Result + One-Time Filter: (t2.a = t2.a) + -> Seq Scan on sj + Filter: (a = t2.a) +(7 rows) + +-- self-join under outer join +explain (costs off) +select * from sj x join sj y on x.a = y.a +left join int8_tbl z on x.a = z.q1; + QUERY PLAN +------------------------------------ + Nested Loop Left Join + Join Filter: (y.a = z.q1) + -> Seq Scan on sj y + Filter: (a IS NOT NULL) + -> Materialize + -> Seq Scan on int8_tbl z +(6 rows) + +explain (costs off) +select * from sj x join sj y on x.a = y.a +left join int8_tbl z on y.a = z.q1; + QUERY PLAN +------------------------------------ + Nested Loop Left Join + Join Filter: (y.a = z.q1) + -> Seq Scan on sj y + Filter: (a IS NOT NULL) + -> Materialize + -> Seq Scan on int8_tbl z +(6 rows) + +-- Test that placeholders are updated correctly after join removal +explain (costs off) +select * from (values (1)) x +left join (select coalesce(y.q1, 1) from int8_tbl y + right join sj j1 inner join sj j2 on j1.a = j2.a + on true) z +on true; + QUERY PLAN +------------------------------------------ + Nested Loop Left Join + -> Result + -> Nested Loop Left Join + -> Seq Scan on sj j2 + Filter: (a IS NOT NULL) + -> Materialize + -> Seq Scan on int8_tbl y +(7 rows) + +-- Test that OR predicated are updated correctly after join removal +CREATE TABLE tab_with_flag ( id INT PRIMARY KEY, is_flag SMALLINT); +CREATE INDEX idx_test_is_flag ON tab_with_flag (is_flag); +explain (costs off) +SELECT COUNT(*) FROM tab_with_flag WHERE (is_flag IS NULL OR is_flag = 0) AND id IN (SELECT id FROM tab_with_flag WHERE id IN (2, 3)); + QUERY PLAN +---------------------------------------------------------------------------------- + Aggregate + -> Bitmap Heap Scan on tab_with_flag + Recheck Cond: ((id = ANY ('{2,3}'::integer[])) AND (id IS NOT NULL)) + Filter: ((is_flag IS NULL) OR (is_flag = 0)) + -> Bitmap Index Scan on tab_with_flag_pkey + Index Cond: ((id = ANY ('{2,3}'::integer[])) AND (id IS NOT NULL)) +(6 rows) + +DROP TABLE tab_with_flag; +-- HAVING clause +explain (costs off) +select p.b from sj p join sj q on p.a = q.a group by p.b having sum(p.a) = 1; + QUERY PLAN +--------------------------------- + HashAggregate + Group Key: q.b + Filter: (sum(q.a) = 1) + -> Seq Scan on sj q + Filter: (a IS NOT NULL) +(5 rows) + +-- update lateral references and range table entry reference +explain (verbose, costs off) +select 1 from (select x.* from sj x, sj y where x.a = y.a) q, + lateral generate_series(1, q.a) gs(i); + QUERY PLAN +------------------------------------------------------ + Nested Loop + Output: 1 + -> Seq Scan on public.sj y + Output: y.a, y.b + Filter: (y.a IS NOT NULL) + -> Function Scan on pg_catalog.generate_series gs + Output: gs.i + Function Call: generate_series(1, y.a) +(8 rows) + +explain (verbose, costs off) +select 1 from (select y.* from sj x, sj y where x.a = y.a) q, + lateral generate_series(1, q.a) gs(i); + QUERY PLAN +------------------------------------------------------ + Nested Loop + Output: 1 + -> Seq Scan on public.sj y + Output: y.a, y.b + Filter: (y.a IS NOT NULL) + -> Function Scan on pg_catalog.generate_series gs + Output: gs.i + Function Call: generate_series(1, y.a) +(8 rows) + +-- Test that a non-EC-derived join clause is processed correctly. Use an +-- outer join so that we can't form an EC. +explain (costs off) select * from sj p join sj q on p.a = q.a + left join sj r on p.a + q.a = r.a; + QUERY PLAN +------------------------------------ + Nested Loop Left Join + Join Filter: ((q.a + q.a) = r.a) + -> Seq Scan on sj q + Filter: (a IS NOT NULL) + -> Materialize + -> Seq Scan on sj r +(6 rows) + +-- FIXME this constant false filter doesn't look good. Should we merge +-- equivalence classes? +explain (costs off) +select * from sj p, sj q where p.a = q.a and p.b = 1 and q.b = 2; + QUERY PLAN +----------------------------------------------------- + Seq Scan on sj q + Filter: ((a IS NOT NULL) AND (b = 2) AND (b = 1)) +(2 rows) + +-- Check that attr_needed is updated correctly after self-join removal. In this +-- test, the join of j1 with j2 is removed. k1.b is required at either j1 or j2. +-- If this info is lost, join targetlist for (k1, k2) will not contain k1.b. +-- Use index scan for k1 so that we don't get 'b' from physical tlist used for +-- seqscan. Also disable reordering of joins because this test depends on a +-- particular join tree. +create table sk (a int, b int); +create index on sk(a); +set join_collapse_limit to 1; +set enable_seqscan to off; +explain (costs off) select 1 from + (sk k1 join sk k2 on k1.a = k2.a) + join (sj j1 join sj j2 on j1.a = j2.a) on j1.b = k1.b; + QUERY PLAN +----------------------------------------------------- + Nested Loop + Join Filter: (k1.b = j2.b) + -> Nested Loop + -> Index Scan using sk_a_idx on sk k1 + -> Index Only Scan using sk_a_idx on sk k2 + Index Cond: (a = k1.a) + -> Materialize + -> Index Scan using sj_a_key on sj j2 + Index Cond: (a IS NOT NULL) +(9 rows) + +explain (costs off) select 1 from + (sk k1 join sk k2 on k1.a = k2.a) + join (sj j1 join sj j2 on j1.a = j2.a) on j2.b = k1.b; + QUERY PLAN +----------------------------------------------------- + Nested Loop + Join Filter: (k1.b = j2.b) + -> Nested Loop + -> Index Scan using sk_a_idx on sk k1 + -> Index Only Scan using sk_a_idx on sk k2 + Index Cond: (a = k1.a) + -> Materialize + -> Index Scan using sj_a_key on sj j2 + Index Cond: (a IS NOT NULL) +(9 rows) + +reset join_collapse_limit; +reset enable_seqscan; +-- Check that clauses from the join filter list is not lost on the self-join removal +CREATE TABLE emp1 ( id SERIAL PRIMARY KEY NOT NULL, code int); +explain (verbose, costs off) +SELECT * FROM emp1 e1, emp1 e2 WHERE e1.id = e2.id AND e2.code <> e1.code; + QUERY PLAN +---------------------------------------------------------- + Seq Scan on public.emp1 e2 + Output: e2.id, e2.code, e2.id, e2.code + Filter: ((e2.id IS NOT NULL) AND (e2.code <> e2.code)) +(3 rows) + +-- We can remove the join even if we find the join can't duplicate rows and +-- the base quals of each side are different. In the following case we end up +-- moving quals over to s1 to make it so it can't match any rows. +create table sl(a int, b int); +create unique index on sl(a, b); +vacuum analyze sl; +-- Both sides are unique, but base quals are different +explain (costs off) +select * from sl t1, sl t2 where t1.a = t2.a and t1.b = 1 and t2.b = 2; + QUERY PLAN +----------------------------------------------------- + Seq Scan on sl t2 + Filter: ((a IS NOT NULL) AND (b = 2) AND (b = 1)) +(2 rows) + +-- +---- Only one side is unqiue +--select * from sl t1, sl t2 where t1.a = t2.a and t1.b = 1; +--select * from sl t1, sl t2 where t1.a = t2.a and t2.b = 1; +-- +---- Several uniques indexes match, and we select a different one +---- for each side, so the join is not removed +--create table sm(a int unique, b int unique, c int unique); +--explain (costs off) +--select * from sm m, sm n where m.a = n.b and m.c = n.c; +--explain (costs off) +--select * from sm m, sm n where m.a = n.c and m.b = n.b; +--explain (costs off) +--select * from sm m, sm n where m.c = n.b and m.a = n.a; +-- Check optimization disabling if it will violate special join conditions. +-- Two identical joined relations satisfies self join removal conditions but +-- stay in different special join infos. +CREATE TABLE sj_t1 (id serial, a int); +CREATE TABLE sj_t2 (id serial, a int); +CREATE TABLE sj_t3 (id serial, a int); +CREATE TABLE sj_t4 (id serial, a int); +CREATE UNIQUE INDEX ON sj_t3 USING btree (a,id); +CREATE UNIQUE INDEX ON sj_t2 USING btree (id); +EXPLAIN (COSTS OFF) +SELECT * FROM sj_t1 +JOIN ( + SELECT sj_t2.id AS id FROM sj_t2 + WHERE EXISTS + ( + SELECT TRUE FROM sj_t3,sj_t4 WHERE sj_t3.a = 1 AND sj_t3.id = sj_t2.id + ) + ) t2t3t4 +ON sj_t1.id = t2t3t4.id +JOIN ( + SELECT sj_t2.id AS id FROM sj_t2 + WHERE EXISTS + ( + SELECT TRUE FROM sj_t3,sj_t4 WHERE sj_t3.a = 1 AND sj_t3.id = sj_t2.id + ) + ) _t2t3t4 +ON sj_t1.id = _t2t3t4.id; + QUERY PLAN +------------------------------------------------------------------------------------- + Nested Loop + Join Filter: (sj_t3.id = sj_t1.id) + -> Nested Loop + Join Filter: (sj_t3.id = sj_t2.id) + -> Nested Loop Semi Join + -> Nested Loop + -> HashAggregate + Group Key: sj_t3.id + -> Nested Loop + -> Seq Scan on sj_t4 + -> Materialize + -> Bitmap Heap Scan on sj_t3 + Recheck Cond: (a = 1) + -> Bitmap Index Scan on sj_t3_a_id_idx + Index Cond: (a = 1) + -> Index Only Scan using sj_t2_id_idx on sj_t2 sj_t2_1 + Index Cond: (id = sj_t3.id) + -> Nested Loop + -> Index Only Scan using sj_t3_a_id_idx on sj_t3 sj_t3_1 + Index Cond: ((a = 1) AND (id = sj_t3.id)) + -> Seq Scan on sj_t4 sj_t4_1 + -> Index Only Scan using sj_t2_id_idx on sj_t2 + Index Cond: (id = sj_t2_1.id) + -> Seq Scan on sj_t1 +(24 rows) + +reset enable_hashjoin; +reset enable_mergejoin; -- -- Test hints given on incorrect column references are useful -- diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out index 0bb558d93c..dd005da156 100644 --- a/src/test/regress/expected/sysviews.out +++ b/src/test/regress/expected/sysviews.out @@ -112,10 +112,11 @@ select name, setting from pg_settings where name like 'enable%'; enable_partitionwise_aggregate | off enable_partitionwise_join | off enable_resultcache | on + enable_self_join_removal | on enable_seqscan | on enable_sort | on enable_tidscan | on -(20 rows) +(21 rows) -- Test that the pg_timezone_names and pg_timezone_abbrevs views are -- more-or-less working. We can't test their contents in any great detail diff --git a/src/test/regress/sql/equivclass.sql b/src/test/regress/sql/equivclass.sql index 247b0a3105..77dd964ebf 100644 --- a/src/test/regress/sql/equivclass.sql +++ b/src/test/regress/sql/equivclass.sql @@ -259,6 +259,22 @@ drop user regress_user_ectest; explain (costs off) select * from tenk1 where unique1 = unique1 and unique2 = unique2; +-- Test that broken ECs are processed correctly during self join removal. +-- Disable merge joins so that we don't get an error about missing commutator. +-- Test both orientations of the join clause, because only one of them breaks +-- the EC. +set enable_mergejoin to off; + +explain (costs off) + select * from ec0 m join ec0 n on m.ff = n.ff + join ec1 p on m.ff + n.ff = p.f1; + +explain (costs off) + select * from ec0 m join ec0 n on m.ff = n.ff + join ec1 p on p.f1::int8 = (m.ff + n.ff)::int8alias1; + +reset enable_mergejoin; + -- this could be converted, but isn't at present explain (costs off) select * from tenk1 where unique1 = unique1 or unique2 = unique2; diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 7f866c603b..3d58a1b054 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1734,6 +1734,172 @@ select * from select * from int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1; -- ok +-- +-- test that semi- or inner self-joins on a unique column are removed +-- + +-- enable only nestloop to get more predictable plans +set enable_hashjoin to off; +set enable_mergejoin to off; + +create table sj (a int unique, b int); +insert into sj values (1, null), (null, 2), (2, 1); +analyze sj; + +select p.* from sj p, sj q where q.a = p.a and q.b = q.a - 1; + +explain (costs off) +select p.* from sj p, sj q where q.a = p.a and q.b = q.a - 1; + +explain (costs off) +select * from sj p +where exists (select * from sj q + where q.a = p.a and q.b < 10); + +-- Double self-join removal. +-- Use a condition on "b + 1", not on "b", for the second join, so that +-- the equivalence class is different from the first one, and we can +-- test the non-ec code path. +explain (costs off) +select * from sj t1 join sj t2 on t1.a = t2.a and t1.b = t2.b + join sj t3 on t2.a = t3.a and t2.b + 1 = t3.b + 1; + +-- subselect that references the removed relation +explain (costs off) +select t1.a, (select a from sj where a = t2.a and a = t1.a) +from sj t1, sj t2 +where t1.a = t2.a; + +-- self-join under outer join +explain (costs off) +select * from sj x join sj y on x.a = y.a +left join int8_tbl z on x.a = z.q1; + +explain (costs off) +select * from sj x join sj y on x.a = y.a +left join int8_tbl z on y.a = z.q1; + +-- Test that placeholders are updated correctly after join removal +explain (costs off) +select * from (values (1)) x +left join (select coalesce(y.q1, 1) from int8_tbl y + right join sj j1 inner join sj j2 on j1.a = j2.a + on true) z +on true; + +-- Test that OR predicated are updated correctly after join removal +CREATE TABLE tab_with_flag ( id INT PRIMARY KEY, is_flag SMALLINT); +CREATE INDEX idx_test_is_flag ON tab_with_flag (is_flag); +explain (costs off) +SELECT COUNT(*) FROM tab_with_flag WHERE (is_flag IS NULL OR is_flag = 0) AND id IN (SELECT id FROM tab_with_flag WHERE id IN (2, 3)); +DROP TABLE tab_with_flag; + +-- HAVING clause +explain (costs off) +select p.b from sj p join sj q on p.a = q.a group by p.b having sum(p.a) = 1; + +-- update lateral references and range table entry reference +explain (verbose, costs off) +select 1 from (select x.* from sj x, sj y where x.a = y.a) q, + lateral generate_series(1, q.a) gs(i); + +explain (verbose, costs off) +select 1 from (select y.* from sj x, sj y where x.a = y.a) q, + lateral generate_series(1, q.a) gs(i); + +-- Test that a non-EC-derived join clause is processed correctly. Use an +-- outer join so that we can't form an EC. +explain (costs off) select * from sj p join sj q on p.a = q.a + left join sj r on p.a + q.a = r.a; + +-- FIXME this constant false filter doesn't look good. Should we merge +-- equivalence classes? +explain (costs off) +select * from sj p, sj q where p.a = q.a and p.b = 1 and q.b = 2; + +-- Check that attr_needed is updated correctly after self-join removal. In this +-- test, the join of j1 with j2 is removed. k1.b is required at either j1 or j2. +-- If this info is lost, join targetlist for (k1, k2) will not contain k1.b. +-- Use index scan for k1 so that we don't get 'b' from physical tlist used for +-- seqscan. Also disable reordering of joins because this test depends on a +-- particular join tree. +create table sk (a int, b int); +create index on sk(a); +set join_collapse_limit to 1; +set enable_seqscan to off; +explain (costs off) select 1 from + (sk k1 join sk k2 on k1.a = k2.a) + join (sj j1 join sj j2 on j1.a = j2.a) on j1.b = k1.b; +explain (costs off) select 1 from + (sk k1 join sk k2 on k1.a = k2.a) + join (sj j1 join sj j2 on j1.a = j2.a) on j2.b = k1.b; +reset join_collapse_limit; +reset enable_seqscan; + +-- Check that clauses from the join filter list is not lost on the self-join removal +CREATE TABLE emp1 ( id SERIAL PRIMARY KEY NOT NULL, code int); +explain (verbose, costs off) +SELECT * FROM emp1 e1, emp1 e2 WHERE e1.id = e2.id AND e2.code <> e1.code; + +-- We can remove the join even if we find the join can't duplicate rows and +-- the base quals of each side are different. In the following case we end up +-- moving quals over to s1 to make it so it can't match any rows. +create table sl(a int, b int); +create unique index on sl(a, b); +vacuum analyze sl; + +-- Both sides are unique, but base quals are different +explain (costs off) +select * from sl t1, sl t2 where t1.a = t2.a and t1.b = 1 and t2.b = 2; + +-- +---- Only one side is unqiue +--select * from sl t1, sl t2 where t1.a = t2.a and t1.b = 1; +--select * from sl t1, sl t2 where t1.a = t2.a and t2.b = 1; +-- +---- Several uniques indexes match, and we select a different one +---- for each side, so the join is not removed +--create table sm(a int unique, b int unique, c int unique); +--explain (costs off) +--select * from sm m, sm n where m.a = n.b and m.c = n.c; +--explain (costs off) +--select * from sm m, sm n where m.a = n.c and m.b = n.b; +--explain (costs off) +--select * from sm m, sm n where m.c = n.b and m.a = n.a; + +-- Check optimization disabling if it will violate special join conditions. +-- Two identical joined relations satisfies self join removal conditions but +-- stay in different special join infos. +CREATE TABLE sj_t1 (id serial, a int); +CREATE TABLE sj_t2 (id serial, a int); +CREATE TABLE sj_t3 (id serial, a int); +CREATE TABLE sj_t4 (id serial, a int); + +CREATE UNIQUE INDEX ON sj_t3 USING btree (a,id); +CREATE UNIQUE INDEX ON sj_t2 USING btree (id); + +EXPLAIN (COSTS OFF) +SELECT * FROM sj_t1 +JOIN ( + SELECT sj_t2.id AS id FROM sj_t2 + WHERE EXISTS + ( + SELECT TRUE FROM sj_t3,sj_t4 WHERE sj_t3.a = 1 AND sj_t3.id = sj_t2.id + ) + ) t2t3t4 +ON sj_t1.id = t2t3t4.id +JOIN ( + SELECT sj_t2.id AS id FROM sj_t2 + WHERE EXISTS + ( + SELECT TRUE FROM sj_t3,sj_t4 WHERE sj_t3.a = 1 AND sj_t3.id = sj_t2.id + ) + ) _t2t3t4 +ON sj_t1.id = _t2t3t4.id; + +reset enable_hashjoin; +reset enable_mergejoin; + -- -- Test hints given on incorrect column references are useful -- -- 2.31.1