From ee627d07282629a85785a63341cf875bfb0decb2 Mon Sep 17 00:00:00 2001 From: Andrey Lepikhov Date: Fri, 2 Apr 2021 11:02:20 +0500 Subject: [PATCH] Asymmetric partitionwise join. Teach optimizer to consider partitionwise join of non-partitioned table with each partition of partitioned table. Disallow asymmetric machinery for joining of two partitioned (or appended) relations because it could cause huge consumption of CPU and memory during reparameterization of NestLoop path. Change logic of the multilevel child relids adjustment, because this feature allows the optimizer to plan in new way. --- src/backend/optimizer/path/joinpath.c | 9 + src/backend/optimizer/path/joinrels.c | 187 ++++++++ src/backend/optimizer/plan/setrefs.c | 17 +- src/backend/optimizer/util/appendinfo.c | 44 +- src/backend/optimizer/util/pathnode.c | 9 +- src/backend/optimizer/util/relnode.c | 19 +- src/include/optimizer/paths.h | 7 +- src/test/regress/expected/partition_join.out | 425 +++++++++++++++++++ src/test/regress/sql/partition_join.sql | 180 ++++++++ 9 files changed, 867 insertions(+), 30 deletions(-) diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index 6407ede12a..32618ebbd5 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -335,6 +335,15 @@ add_paths_to_joinrel(PlannerInfo *root, if (set_join_pathlist_hook) set_join_pathlist_hook(root, joinrel, outerrel, innerrel, jointype, &extra); + + /* + * 7. If outer relation is delivered from partition-tables, consider + * distributing inner relation to every partition-leaf prior to + * append these leafs. + */ + try_asymmetric_partitionwise_join(root, joinrel, + outerrel, innerrel, + jointype, &extra); } /* diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 8b69870cf4..9453258f83 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -16,6 +16,7 @@ #include "miscadmin.h" #include "optimizer/appendinfo.h" +#include "optimizer/cost.h" #include "optimizer/joininfo.h" #include "optimizer/pathnode.h" #include "optimizer/paths.h" @@ -1552,6 +1553,192 @@ try_partitionwise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, } } +/* + * Build RelOptInfo on JOIN of each partition of the outer relation and the inner + * relation. Return List of such RelOptInfo's. Return NIL, if at least one of + * these JOINs is impossible to build. + */ +static List * +extract_asymmetric_partitionwise_subjoin(PlannerInfo *root, + RelOptInfo *joinrel, + AppendPath *append_path, + RelOptInfo *inner_rel, + JoinType jointype, + JoinPathExtraData *extra) +{ + List *result = NIL; + ListCell *lc; + + foreach (lc, append_path->subpaths) + { + Path *child_path = lfirst(lc); + RelOptInfo *child_rel = child_path->parent; + Relids child_joinrelids; + Relids parent_relids; + RelOptInfo *child_joinrel; + SpecialJoinInfo *child_sjinfo; + List *child_restrictlist; + + child_joinrelids = bms_union(child_rel->relids, inner_rel->relids); + parent_relids = bms_union(append_path->path.parent->relids, + inner_rel->relids); + + child_sjinfo = build_child_join_sjinfo(root, extra->sjinfo, + child_rel->relids, + inner_rel->relids); + child_restrictlist = (List *) + adjust_appendrel_attrs_multilevel(root, (Node *)extra->restrictlist, + child_joinrelids, parent_relids); + + child_joinrel = find_join_rel(root, child_joinrelids); + if (!child_joinrel) + child_joinrel = build_child_join_rel(root, + child_rel, + inner_rel, + joinrel, + child_restrictlist, + child_sjinfo, + jointype); + else + { + /* + * The join relation already exists. For example, it could happen if + * we join two plane tables with partitioned table(s). + * Populating this join with additional paths could push out some + * previously added paths which could be pointed in a subplans list + * of an higher level append. + * Of course, we could save such paths before generating new. But it + * can increase too much the number of paths in complex queries. It + * can be a task for future work. + */ + return NIL; + } + + populate_joinrel_with_paths(root, + child_rel, + inner_rel, + child_joinrel, + child_sjinfo, + child_restrictlist); + + /* Give up if asymmetric partition-wise join is not available */ + if (child_joinrel->pathlist == NIL) + return NIL; + + set_cheapest(child_joinrel); + result = lappend(result, child_joinrel); + } + return result; +} + +static bool +is_asymmetric_join_feasible(PlannerInfo *root, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + JoinType jointype) +{ + ListCell *lc; + + if (jointype != JOIN_INNER && jointype != JOIN_LEFT) + return false; + + if (IS_OTHER_REL(outer_rel) || IS_OTHER_REL(inner_rel)) + return false; + + /* Disallow recursive usage of asymmertic join machinery */ + if (root->join_rel_level == NULL) + return false; + + /* + * Don't allow asymmetric JOIN of two append subplans. + * In the case of a parameterized NL join, a reparameterization procedure + * will lead to large memory allocations and a CPU consumption: + * each reparameterization will induce subpath duplication, creating new + * ParamPathInfo instance and increasing of ppilist up to number of + * partitions in the inner. Also, if we have many partitions, each bitmapset + * variable will be large and many leaks of such variable (caused by relid + * replacement) will highly increase memory consumption. + * So, we deny such paths for now. + */ + foreach(lc, inner_rel->pathlist) + { + if (IsA(lfirst(lc), AppendPath)) + return false; + } + + return true; +} + +void +try_asymmetric_partitionwise_join(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + JoinType jointype, + JoinPathExtraData *extra) +{ + ListCell *lc; + + /* + * Try this kind of paths if we allow complex partitionwise joins and we know + * we can build this join safely. + */ + if (!enable_partitionwise_join || + !is_asymmetric_join_feasible(root, outer_rel, inner_rel, jointype)) + return; + + foreach (lc, outer_rel->pathlist) + { + AppendPath *append_path = lfirst(lc); + + /* + * We assume this pathlist keeps at least one AppendPath that + * represents partitioned table-scan, symmetric or asymmetric + * partition-wise join. Asymmetric join isn't needed if the append node + * has only one child. + */ + if (IsA(append_path, AppendPath) && + list_length(append_path->subpaths) > 1) + { + List **join_rel_level_saved; + List *live_childrels = NIL; + + join_rel_level_saved = root->join_rel_level; + PG_TRY(); + { + /* temporary disables "dynamic programming" algorithm */ + root->join_rel_level = NULL; + + live_childrels = + extract_asymmetric_partitionwise_subjoin(root, + joinrel, + append_path, + inner_rel, + jointype, + extra); + } + PG_FINALLY(); + { + root->join_rel_level = join_rel_level_saved; + } + PG_END_TRY(); + + if (live_childrels != NIL) + { + /* + * Add new append relation. We must choose cheapest paths after + * this operation because the pathlist possibly contains + * joinrels and appendrels that can be suboptimal. + */ + add_paths_to_append_rel(root, joinrel, live_childrels); + set_cheapest(joinrel); + } + + break; + } + } +} + /* * Construct the SpecialJoinInfo for a child-join by translating * SpecialJoinInfo for the join between parents. left_relids and right_relids diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index e50624c465..fccc0685d7 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -281,24 +281,29 @@ set_plan_references(PlannerInfo *root, Plan *plan) /* * Adjust RT indexes of AppendRelInfos and add to final appendrels list. - * We assume the AppendRelInfos were built during planning and don't need - * to be copied. + * The AppendRelInfos are copied, because as a part of a subplan they could + * be visited many times in the case of asymmetric join. */ foreach(lc, root->append_rel_list) { AppendRelInfo *appinfo = lfirst_node(AppendRelInfo, lc); + AppendRelInfo *newappinfo; + + /* flat copy is enough since all valuable fields are scalars */ + newappinfo = (AppendRelInfo *) palloc(sizeof(AppendRelInfo)); + memcpy(newappinfo, appinfo, sizeof(AppendRelInfo)); /* adjust RT indexes */ - appinfo->parent_relid += rtoffset; - appinfo->child_relid += rtoffset; + newappinfo->parent_relid += rtoffset; + newappinfo->child_relid += rtoffset; /* * Rather than adjust the translated_vars entries, just drop 'em. * Neither the executor nor EXPLAIN currently need that data. */ - appinfo->translated_vars = NIL; + newappinfo->translated_vars = NIL; - glob->appendRelations = lappend(glob->appendRelations, appinfo); + glob->appendRelations = lappend(glob->appendRelations, newappinfo); } /* Now fix the Plan tree */ diff --git a/src/backend/optimizer/util/appendinfo.c b/src/backend/optimizer/util/appendinfo.c index af46f581ac..f4d12f76e1 100644 --- a/src/backend/optimizer/util/appendinfo.c +++ b/src/backend/optimizer/util/appendinfo.c @@ -202,7 +202,9 @@ adjust_appendrel_attrs(PlannerInfo *root, Node *node, int nappinfos, context.appinfos = appinfos; /* If there's nothing to adjust, don't call this function. */ - Assert(nappinfos >= 1 && appinfos != NULL); + /* If there's nothing to adjust, just return a duplication */ + if (nappinfos == 0) + return copyObject(node); /* Should never be translating a Query tree. */ Assert(node == NULL || !IsA(node, Query)); @@ -490,12 +492,10 @@ adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, Relids top_parent_relids) { AppendRelInfo **appinfos; - Bitmapset *parent_relids = NULL; + Relids parent_relids = NULL; int nappinfos; int cnt; - Assert(bms_num_members(child_relids) == bms_num_members(top_parent_relids)); - appinfos = find_appinfos_by_relids(root, child_relids, &nappinfos); /* Construct relids set for the immediate parent of given child. */ @@ -506,8 +506,13 @@ adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, parent_relids = bms_add_member(parent_relids, appinfo->parent_relid); } - /* Recurse if immediate parent is not the top parent. */ - if (!bms_equal(parent_relids, top_parent_relids)) + /* + * Recurse if immediate parent is not the top parent. Keep in mind that in a + * case of asymmetric JOIN top_parent_relids can contain relids which aren't + * part of an append node. + */ + if (!bms_equal(parent_relids, top_parent_relids) && + !bms_is_subset(parent_relids, top_parent_relids)) node = adjust_appendrel_attrs_multilevel(root, node, parent_relids, top_parent_relids); @@ -515,12 +520,13 @@ adjust_appendrel_attrs_multilevel(PlannerInfo *root, Node *node, node = adjust_appendrel_attrs(root, node, nappinfos, appinfos); pfree(appinfos); + pfree(parent_relids); return node; } /* - * Substitute child relids for parent relids in a Relid set. The array of + * Substitute child relids for parent relids in a Relid set. The array of * appinfos specifies the substitutions to be performed. */ Relids @@ -565,6 +571,7 @@ adjust_child_relids_multilevel(PlannerInfo *root, Relids relids, AppendRelInfo **appinfos; int nappinfos; Relids parent_relids = NULL; + Relids normal_relids = NULL; Relids result; Relids tmp_result = NULL; int cnt; @@ -579,13 +586,24 @@ adjust_child_relids_multilevel(PlannerInfo *root, Relids relids, appinfos = find_appinfos_by_relids(root, child_relids, &nappinfos); /* Construct relids set for the immediate parent of the given child. */ + normal_relids = bms_copy(child_relids); for (cnt = 0; cnt < nappinfos; cnt++) { AppendRelInfo *appinfo = appinfos[cnt]; parent_relids = bms_add_member(parent_relids, appinfo->parent_relid); + normal_relids = bms_del_member(normal_relids, appinfo->child_relid); } + if (bms_is_subset(relids, normal_relids)) + { + /* Nothing to do. Parameters set points to plain relations only. */ + result = relids; + goto cleanup; + } + + parent_relids = bms_union(parent_relids, normal_relids); + /* Recurse if immediate parent is not the top parent. */ if (!bms_equal(parent_relids, top_parent_relids)) { @@ -597,10 +615,11 @@ adjust_child_relids_multilevel(PlannerInfo *root, Relids relids, result = adjust_child_relids(relids, nappinfos, appinfos); +cleanup: /* Free memory consumed by any intermediate result. */ - if (tmp_result) - bms_free(tmp_result); + bms_free(tmp_result); bms_free(parent_relids); + bms_free(normal_relids); pfree(appinfos); return result; @@ -715,11 +734,11 @@ AppendRelInfo ** find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) { AppendRelInfo **appinfos; + int nrooms = bms_num_members(relids); int cnt = 0; int i; - *nappinfos = bms_num_members(relids); - appinfos = (AppendRelInfo **) palloc(sizeof(AppendRelInfo *) * *nappinfos); + appinfos = (AppendRelInfo **) palloc(sizeof(AppendRelInfo *) * nrooms); i = -1; while ((i = bms_next_member(relids, i)) >= 0) @@ -727,10 +746,11 @@ find_appinfos_by_relids(PlannerInfo *root, Relids relids, int *nappinfos) AppendRelInfo *appinfo = root->append_rel_array[i]; if (!appinfo) - elog(ERROR, "child rel %d not found in append_rel_array", i); + continue; appinfos[cnt++] = appinfo; } + *nappinfos = cnt; return appinfos; } diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index cedb3848dd..17e215b72d 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -4225,7 +4225,14 @@ do { \ MemoryContextSwitchTo(oldcontext); } - bms_free(required_outer); + + /* + * If adjust_child_relids_multilevel don't do replacements it returns + * the original set, not a copy. It is possible in the case of asymmetric + * JOIN and child_rel->relids contains relids only of plane relations. + */ + if (required_outer != old_ppi->ppi_req_outer) + bms_free(required_outer); new_path->param_info = new_ppi; diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 47769cea45..ddf0f5a876 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -792,11 +792,8 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, AppendRelInfo **appinfos; int nappinfos; - /* Only joins between "other" relations land here. */ - Assert(IS_OTHER_REL(outer_rel) && IS_OTHER_REL(inner_rel)); - - /* The parent joinrel should have consider_partitionwise_join set. */ - Assert(parent_joinrel->consider_partitionwise_join); + /* Either of relations must be "other" relation at least. */ + Assert(IS_OTHER_REL(outer_rel) || IS_OTHER_REL(inner_rel)); joinrel->reloptkind = RELOPT_OTHER_JOINREL; joinrel->relids = bms_union(outer_rel->relids, inner_rel->relids); @@ -854,8 +851,11 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, joinrel->partexprs = NULL; joinrel->nullable_partexprs = NULL; - joinrel->top_parent_relids = bms_union(outer_rel->top_parent_relids, - inner_rel->top_parent_relids); + joinrel->top_parent_relids = + bms_union(IS_OTHER_REL(outer_rel) ? + outer_rel->top_parent_relids : outer_rel->relids, + IS_OTHER_REL(inner_rel) ? + inner_rel->top_parent_relids : inner_rel->relids); /* Compute information relevant to foreign relations. */ set_foreign_rel_properties(joinrel, outer_rel, inner_rel); @@ -2036,9 +2036,8 @@ build_child_join_reltarget(PlannerInfo *root, { /* Build the targetlist */ childrel->reltarget->exprs = (List *) - adjust_appendrel_attrs(root, - (Node *) parentrel->reltarget->exprs, - nappinfos, appinfos); + adjust_appendrel_attrs_multilevel(root, (Node *)parentrel->reltarget->exprs, + childrel->relids, parentrel->relids); /* Set the cost and width fields */ childrel->reltarget->cost.startup = parentrel->reltarget->cost.startup; diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index f1d111063c..a0106dc539 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -109,7 +109,12 @@ extern bool have_join_order_restriction(PlannerInfo *root, extern bool have_dangerous_phv(PlannerInfo *root, Relids outer_relids, Relids inner_params); extern void mark_dummy_rel(RelOptInfo *rel); - +extern void try_asymmetric_partitionwise_join(PlannerInfo *root, + RelOptInfo *joinrel, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel, + JoinType jointype, + JoinPathExtraData *extra); /* * equivclass.c * routines for managing EquivalenceClasses diff --git a/src/test/regress/expected/partition_join.out b/src/test/regress/expected/partition_join.out index 27f7525b3e..3eddd9bec3 100644 --- a/src/test/regress/expected/partition_join.out +++ b/src/test/regress/expected/partition_join.out @@ -2320,6 +2320,431 @@ SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_adv t1 INNER JOIN prt2_adv t2 ON (t1.a = 375 | 0375 | 375 | 0375 (8 rows) +-- +-- For asymmetric partition-wise join +-- +CREATE TABLE prt5 (hkey int, a int, b int) PARTITION BY HASH(hkey); +CREATE TABLE prt5_p0 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 0); +CREATE TABLE prt5_p1 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 1); +CREATE TABLE prt5_p2 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 2); +CREATE TABLE prt6 (aid int, alabel text) PARTITION BY HASH(aid); +CREATE TABLE prt6_p0 PARTITION OF prt6 + FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE prt6_p1 PARTITION OF prt6 + FOR VALUES WITH (modulus 2, remainder 1); +CREATE TABLE t5_1 (aid int, alabel text); +CREATE TABLE t5_2 (bid int, blabel text); +INSERT INTO prt5 (SELECT x, (x % 1000)::int, + ((x+1) % 1000)::int + FROM generate_series(1,1000000) x); +INSERT INTO t5_1 (SELECT x, md5(x::text) FROM generate_series(-200, 1500) x); +INSERT INTO t5_2 (SELECT x, md5(x::text) FROM generate_series(-200, 1500) x); +INSERT INTO prt6 (SELECT * FROM t5_1); +VACUUM ANALYZE prt5,prt6,t5_1,t5_2; +SET max_parallel_workers_per_gather = 0; +-- Trivial asymmetric JOIN of partitioned table with a relation +EXPLAIN (COSTS OFF) +SELECT * + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +------------------------------------------------------- + Append + -> Hash Join + Hash Cond: (prt5_1.a = t5_1.aid) + -> Seq Scan on prt5_p0 prt5_1 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) + -> Hash Join + Hash Cond: (prt5_2.a = t5_1.aid) + -> Seq Scan on prt5_p1 prt5_2 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) + -> Hash Join + Hash Cond: (prt5_3.a = t5_1.aid) + -> Seq Scan on prt5_p2 prt5_3 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) +(19 rows) + +-- The same, but appended with UNION ALL +EXPLAIN (COSTS OFF) +SELECT * FROM ( + (SELECT * FROM prt5_p0) + UNION ALL + (SELECT * FROM prt5_p1) + UNION ALL + (SELECT * FROM prt5_p2) + ) AS sq1 +JOIN t5_1 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +------------------------------------------------------- + Append + -> Hash Join + Hash Cond: (prt5_p0.a = t5_1.aid) + -> Seq Scan on prt5_p0 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) + -> Hash Join + Hash Cond: (prt5_p1.a = t5_1.aid) + -> Seq Scan on prt5_p1 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) + -> Hash Join + Hash Cond: (prt5_p2.a = t5_1.aid) + -> Seq Scan on prt5_p2 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) +(19 rows) + +-- Don't allow asymmetric JOIN of two partitioned tables. +EXPLAIN (COSTS OFF) +SELECT count(*) FROM prt5 JOIN prt6 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +------------------------------------------------------------- + Aggregate + -> Hash Join + Hash Cond: (prt5.a = prt6.aid) + -> Append + -> Seq Scan on prt5_p0 prt5_1 + -> Seq Scan on prt5_p1 prt5_2 + -> Seq Scan on prt5_p2 prt5_3 + -> Hash + -> Append + -> Seq Scan on prt6_p0 prt6_1 + Filter: (alabel ~~ '%abc%'::text) + -> Seq Scan on prt6_p1 prt6_2 + Filter: (alabel ~~ '%abc%'::text) +(13 rows) + +SELECT count(*) FROM prt5 JOIN prt6 ON a = aid AND alabel like '%abc%'; + count +------- + 4000 +(1 row) + +-- Check asymmetric JOIN with Subquery +EXPLAIN (COSTS OFF) +SELECT count(*) FROM prt5 JOIN ( + SELECT * FROM prt6 LIMIT 1000 +) AS sq1 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +---------------------------------------------------------------------- + Aggregate + -> Append + -> Hash Join + Hash Cond: (prt5_1.a = sq1.aid) + -> Seq Scan on prt5_p0 prt5_1 + -> Hash + -> Subquery Scan on sq1 + Filter: (sq1.alabel ~~ '%abc%'::text) + -> Limit + -> Append + -> Seq Scan on prt6_p0 prt6_1 + -> Seq Scan on prt6_p1 prt6_2 + -> Hash Join + Hash Cond: (prt5_2.a = sq1.aid) + -> Seq Scan on prt5_p1 prt5_2 + -> Hash + -> Subquery Scan on sq1 + Filter: (sq1.alabel ~~ '%abc%'::text) + -> Limit + -> Append + -> Seq Scan on prt6_p0 prt6_4 + -> Seq Scan on prt6_p1 prt6_5 + -> Hash Join + Hash Cond: (prt5_3.a = sq1.aid) + -> Seq Scan on prt5_p2 prt5_3 + -> Hash + -> Subquery Scan on sq1 + Filter: (sq1.alabel ~~ '%abc%'::text) + -> Limit + -> Append + -> Seq Scan on prt6_p0 prt6_7 + -> Seq Scan on prt6_p1 prt6_8 +(32 rows) + +SELECT count(*) FROM prt5 JOIN (SELECT * FROM prt6 LIMIT 1000) AS sq1 + ON a = aid AND alabel like '%abc%'; + count +------- + 2000 +(1 row) + +-- Asymmetric JOIN of two plane tables and one partitioned +EXPLAIN (COSTS OFF) +SELECT count(*) + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + QUERY PLAN +------------------------------------------------------------------ + Aggregate + -> Append + -> Hash Join + Hash Cond: (prt5_1.b = t5_2.bid) + -> Hash Join + Hash Cond: (prt5_1.a = t5_1.aid) + -> Seq Scan on prt5_p0 prt5_1 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%ab%'::text) + -> Hash + -> Seq Scan on t5_2 + Filter: (blabel ~~ '%cd%'::text) + -> Hash Join + Hash Cond: (prt5_2.b = t5_2.bid) + -> Hash Join + Hash Cond: (prt5_2.a = t5_1.aid) + -> Seq Scan on prt5_p1 prt5_2 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%ab%'::text) + -> Hash + -> Seq Scan on t5_2 + Filter: (blabel ~~ '%cd%'::text) + -> Hash Join + Hash Cond: (prt5_3.b = t5_2.bid) + -> Hash Join + Hash Cond: (prt5_3.a = t5_1.aid) + -> Seq Scan on prt5_p2 prt5_3 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%ab%'::text) + -> Hash + -> Seq Scan on t5_2 + Filter: (blabel ~~ '%cd%'::text) +(35 rows) + +SELECT count(*) + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + count +------- + 11000 +(1 row) + +-- unable to extract non-partitioned right relation +EXPLAIN (COSTS OFF) +SELECT * FROM prt5 RIGHT JOIN t5_1 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +----------------------------------------------- + Hash Right Join + Hash Cond: (prt5.a = t5_1.aid) + Join Filter: (t5_1.alabel ~~ '%abc%'::text) + -> Append + -> Seq Scan on prt5_p0 prt5_1 + -> Seq Scan on prt5_p1 prt5_2 + -> Seq Scan on prt5_p2 prt5_3 + -> Hash + -> Seq Scan on t5_1 +(9 rows) + +-- left side can be extracted, but no cost benefit +EXPLAIN (COSTS OFF) +SELECT * FROM prt5 LEFT JOIN t5_1 ON a = aid AND alabel like '%abc%'; + QUERY PLAN +------------------------------------------------- + Hash Left Join + Hash Cond: (prt5.a = t5_1.aid) + -> Append + -> Seq Scan on prt5_p0 prt5_1 + -> Seq Scan on prt5_p1 prt5_2 + -> Seq Scan on prt5_p2 prt5_3 + -> Hash + -> Seq Scan on t5_1 + Filter: (alabel ~~ '%abc%'::text) +(9 rows) + +-- validation of the results with/without asymmetric partition-wise join +SELECT * INTO pg_temp.result01a + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; +SELECT * INTO pg_temp.result02a + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; +SET enable_partitionwise_join = off; +SELECT * INTO pg_temp.result01b + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; +SELECT * INTO pg_temp.result02b + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; +SELECT * FROM pg_temp.result01a EXCEPT SELECT * FROM pg_temp.result01b; + hkey | a | b | aid | alabel +------+---+---+-----+-------- +(0 rows) + +SELECT * FROM pg_temp.result02a EXCEPT SELECT * FROM pg_temp.result02b; + hkey | a | b | aid | alabel | bid | blabel +------+---+---+-----+--------+-----+-------- +(0 rows) + +SET enable_partitionwise_join = on; +-- Check reparameterization code when an optimizer have to make two level relids +-- adjustment. +SET enable_hashjoin = 'off'; +SET enable_mergejoin = 'off'; +SET enable_material = 'off'; +CREATE TABLE big AS SELECT x AS x FROM generate_series(1,1E4) x; +CREATE INDEX ON big(x); +CREATE TABLE small AS SELECT x, -x AS y FROM generate_series(1,100) x; +CREATE TABLE part_l0 (x int, y int, z int) PARTITION BY HASH (y); +CREATE TABLE part0_l1 PARTITION OF part_l0 (y) + FOR VALUES WITH (modulus 2, remainder 0) PARTITION BY HASH (z); +CREATE TABLE part0_l2 PARTITION OF part0_l1 (z) + FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE part1_l2 PARTITION OF part0_l1 (z) + FOR VALUES WITH (modulus 2, remainder 1); +CREATE TABLE part1_l1 PARTITION OF part_l0 (y) + FOR VALUES WITH (modulus 2, remainder 1); +INSERT INTO part_l0 (x, y, z) (SELECT x,x,x FROM generate_series(1,1E4) x); +ANALYZE big,small,part_l0; +-- Parameter have to be reparameterized by a plane relation. +EXPLAIN (COSTS OFF) +SELECT small.* FROM small + JOIN part_l0 ON small.y = part_l0.y AND small.y + part_l0.y < 10 + LEFT JOIN big ON big.x = small.x; + QUERY PLAN +----------------------------------------------------------------------------------------- + Nested Loop Left Join + -> Append + -> Nested Loop + Join Filter: ((small.y = part_l0_1.y) AND ((small.y + part_l0_1.y) < 10)) + -> Seq Scan on small + -> Seq Scan on part0_l2 part_l0_1 + -> Nested Loop + Join Filter: ((small.y = part_l0_2.y) AND ((small.y + part_l0_2.y) < 10)) + -> Seq Scan on part1_l2 part_l0_2 + -> Seq Scan on small + -> Nested Loop + Join Filter: ((small.y = part_l0_3.y) AND ((small.y + part_l0_3.y) < 10)) + -> Seq Scan on small + -> Seq Scan on part1_l1 part_l0_3 + -> Index Only Scan using big_x_idx on big + Index Cond: (x = (small.x)::numeric) +(16 rows) + +SELECT small.* FROM small + JOIN part_l0 ON small.y = part_l0.y AND small.y + part_l0.y < 10 + LEFT JOIN big ON big.x = small.x; + x | y +---+--- +(0 rows) + +-- Parameters have to be reparameterized by plane and partitioned relations. +EXPLAIN (COSTS OFF) +SELECT count(*) FROM small + JOIN part_l0 ON small.y = part_l0.y AND small.y + part_l0.y < 10 + LEFT JOIN big ON big.x = small.x OR big.x = part_l0.x; + QUERY PLAN +----------------------------------------------------------------------------------------------- + Aggregate + -> Nested Loop Left Join + -> Append + -> Nested Loop + Join Filter: ((small.y = part_l0_1.y) AND ((small.y + part_l0_1.y) < 10)) + -> Seq Scan on small + -> Seq Scan on part0_l2 part_l0_1 + -> Nested Loop + Join Filter: ((small.y = part_l0_2.y) AND ((small.y + part_l0_2.y) < 10)) + -> Seq Scan on part1_l2 part_l0_2 + -> Seq Scan on small + -> Nested Loop + Join Filter: ((small.y = part_l0_3.y) AND ((small.y + part_l0_3.y) < 10)) + -> Seq Scan on small + -> Seq Scan on part1_l1 part_l0_3 + -> Bitmap Heap Scan on big + Recheck Cond: ((x = (small.x)::numeric) OR (x = (part_l0.x)::numeric)) + -> BitmapOr + -> Bitmap Index Scan on big_x_idx + Index Cond: (x = (small.x)::numeric) + -> Bitmap Index Scan on big_x_idx + Index Cond: (x = (part_l0.x)::numeric) +(22 rows) + +SELECT count(*) FROM small + JOIN part_l0 ON small.y = part_l0.y AND small.y + part_l0.y < 10 + LEFT JOIN big ON big.x = small.x OR big.x = part_l0.x; + count +------- + 0 +(1 row) + +DROP TABLE IF EXISTS big,small,part_l0 CASCADE; +RESET enable_hashjoin; +RESET enable_mergejoin; +RESET enable_material; +RESET max_parallel_workers_per_gather; +-- Parameterized path examples. +CREATE TABLE prta (id integer, payload text) PARTITION BY HASH (id); +CREATE TABLE prta1 PARTITION OF prta FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE prta2 PARTITION OF prta FOR VALUES WITH (modulus 2, remainder 1); +CREATE UNIQUE INDEX ON prta1 (id); +CREATE UNIQUE INDEX ON prta2 (id); +INSERT INTO prta (id, payload) + (SELECT *, ('abc' || id)::text AS payload + FROM generate_series(1,1) AS id); +CREATE TABLE prtb (id integer, payload text) PARTITION BY HASH (id); +CREATE TABLE prtb1 PARTITION OF prtb FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE prtb2 PARTITION OF prtb FOR VALUES WITH (modulus 2, remainder 1); +CREATE UNIQUE INDEX ON prtb1 (id); +CREATE UNIQUE INDEX ON prtb2 (id); +INSERT INTO prtb (id, payload) + (SELECT *, ('def' || id)::text AS payload + FROM generate_series(1,1000) AS id); +CREATE TABLE e (id integer, payload text) PARTITION BY HASH (id); +CREATE TABLE e1 PARTITION OF e FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE e2 PARTITION OF e FOR VALUES WITH (modulus 2, remainder 1); +INSERT INTO e (id, payload) + (SELECT *, ('ghi' || id)::text AS payload + FROM generate_series(1,1000) AS id); +CREATE UNIQUE INDEX ON e1 (id); +CREATE UNIQUE INDEX ON e2 (id); +ANALYZE prta,prtb,e; +EXPLAIN (COSTS OFF) +SELECT * FROM prta,prtb WHERE prta.id=prtb.id; + QUERY PLAN +----------------------------------------------------------- + Append + -> Nested Loop + -> Seq Scan on prta1 prta_1 + -> Index Scan using prtb1_id_idx on prtb1 prtb_1 + Index Cond: (id = prta_1.id) + -> Nested Loop + -> Seq Scan on prta2 prta_2 + -> Index Scan using prtb2_id_idx on prtb2 prtb_2 + Index Cond: (id = prta_2.id) +(9 rows) + +EXPLAIN (COSTS OFF) +SELECT * FROM prta,prtb,e WHERE prta.id=prtb.id AND prta.id=e.id; + QUERY PLAN +----------------------------------------------------------------- + Append + -> Nested Loop + Join Filter: (prta_1.id = e_1.id) + -> Nested Loop + -> Seq Scan on prta1 prta_1 + -> Index Scan using prtb1_id_idx on prtb1 prtb_1 + Index Cond: (id = prta_1.id) + -> Index Scan using e1_id_idx on e1 e_1 + Index Cond: (id = prtb_1.id) + -> Nested Loop + Join Filter: (prta_2.id = e_2.id) + -> Nested Loop + -> Seq Scan on prta2 prta_2 + -> Index Scan using prtb2_id_idx on prtb2 prtb_2 + Index Cond: (id = prta_2.id) + -> Index Scan using e2_id_idx on e2 e_2 + Index Cond: (id = prtb_2.id) +(17 rows) + -- semi join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1_adv t1 WHERE EXISTS (SELECT 1 FROM prt2_adv t2 WHERE t1.a = t2.b) AND t1.b = 0 ORDER BY t1.a; diff --git a/src/test/regress/sql/partition_join.sql b/src/test/regress/sql/partition_join.sql index d97b5b69ff..4d1dd5bec1 100644 --- a/src/test/regress/sql/partition_join.sql +++ b/src/test/regress/sql/partition_join.sql @@ -536,6 +536,186 @@ EXPLAIN (COSTS OFF) SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_adv t1 INNER JOIN prt2_adv t2 ON (t1.a = t2.b) WHERE t1.b = 0 ORDER BY t1.a, t2.b; SELECT t1.a, t1.c, t2.b, t2.c FROM prt1_adv t1 INNER JOIN prt2_adv t2 ON (t1.a = t2.b) WHERE t1.b = 0 ORDER BY t1.a, t2.b; +-- +-- For asymmetric partition-wise join +-- +CREATE TABLE prt5 (hkey int, a int, b int) PARTITION BY HASH(hkey); +CREATE TABLE prt5_p0 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 0); +CREATE TABLE prt5_p1 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 1); +CREATE TABLE prt5_p2 PARTITION OF prt5 + FOR VALUES WITH (modulus 3, remainder 2); +CREATE TABLE prt6 (aid int, alabel text) PARTITION BY HASH(aid); +CREATE TABLE prt6_p0 PARTITION OF prt6 + FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE prt6_p1 PARTITION OF prt6 + FOR VALUES WITH (modulus 2, remainder 1); +CREATE TABLE t5_1 (aid int, alabel text); +CREATE TABLE t5_2 (bid int, blabel text); + +INSERT INTO prt5 (SELECT x, (x % 1000)::int, + ((x+1) % 1000)::int + FROM generate_series(1,1000000) x); +INSERT INTO t5_1 (SELECT x, md5(x::text) FROM generate_series(-200, 1500) x); +INSERT INTO t5_2 (SELECT x, md5(x::text) FROM generate_series(-200, 1500) x); +INSERT INTO prt6 (SELECT * FROM t5_1); + +VACUUM ANALYZE prt5,prt6,t5_1,t5_2; + +SET max_parallel_workers_per_gather = 0; + +-- Trivial asymmetric JOIN of partitioned table with a relation +EXPLAIN (COSTS OFF) +SELECT * + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +-- The same, but appended with UNION ALL +EXPLAIN (COSTS OFF) +SELECT * FROM ( + (SELECT * FROM prt5_p0) + UNION ALL + (SELECT * FROM prt5_p1) + UNION ALL + (SELECT * FROM prt5_p2) + ) AS sq1 +JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +-- Don't allow asymmetric JOIN of two partitioned tables. +EXPLAIN (COSTS OFF) +SELECT count(*) FROM prt5 JOIN prt6 ON a = aid AND alabel like '%abc%'; +SELECT count(*) FROM prt5 JOIN prt6 ON a = aid AND alabel like '%abc%'; + +-- Check asymmetric JOIN with Subquery +EXPLAIN (COSTS OFF) +SELECT count(*) FROM prt5 JOIN ( + SELECT * FROM prt6 LIMIT 1000 +) AS sq1 ON a = aid AND alabel like '%abc%'; +SELECT count(*) FROM prt5 JOIN (SELECT * FROM prt6 LIMIT 1000) AS sq1 + ON a = aid AND alabel like '%abc%'; + +-- Asymmetric JOIN of two plane tables and one partitioned +EXPLAIN (COSTS OFF) +SELECT count(*) + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; +SELECT count(*) + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + +-- unable to extract non-partitioned right relation +EXPLAIN (COSTS OFF) +SELECT * FROM prt5 RIGHT JOIN t5_1 ON a = aid AND alabel like '%abc%'; +-- left side can be extracted, but no cost benefit +EXPLAIN (COSTS OFF) +SELECT * FROM prt5 LEFT JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +-- validation of the results with/without asymmetric partition-wise join +SELECT * INTO pg_temp.result01a + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +SELECT * INTO pg_temp.result02a + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + +SET enable_partitionwise_join = off; + +SELECT * INTO pg_temp.result01b + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%abc%'; + +SELECT * INTO pg_temp.result02b + FROM prt5 JOIN t5_1 ON a = aid AND alabel like '%ab%' + JOIN t5_2 ON b = bid AND blabel like '%cd%'; + +SELECT * FROM pg_temp.result01a EXCEPT SELECT * FROM pg_temp.result01b; +SELECT * FROM pg_temp.result02a EXCEPT SELECT * FROM pg_temp.result02b; + +SET enable_partitionwise_join = on; + +-- Check reparameterization code when an optimizer have to make two level relids +-- adjustment. + +SET enable_hashjoin = 'off'; +SET enable_mergejoin = 'off'; +SET enable_material = 'off'; + +CREATE TABLE big AS SELECT x AS x FROM generate_series(1,1E4) x; +CREATE INDEX ON big(x); +CREATE TABLE small AS SELECT x, -x AS y FROM generate_series(1,100) x; + +CREATE TABLE part_l0 (x int, y int, z int) PARTITION BY HASH (y); +CREATE TABLE part0_l1 PARTITION OF part_l0 (y) + FOR VALUES WITH (modulus 2, remainder 0) PARTITION BY HASH (z); +CREATE TABLE part0_l2 PARTITION OF part0_l1 (z) + FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE part1_l2 PARTITION OF part0_l1 (z) + FOR VALUES WITH (modulus 2, remainder 1); +CREATE TABLE part1_l1 PARTITION OF part_l0 (y) + FOR VALUES WITH (modulus 2, remainder 1); +INSERT INTO part_l0 (x, y, z) (SELECT x,x,x FROM generate_series(1,1E4) x); + +ANALYZE big,small,part_l0; + +-- Parameter have to be reparameterized by a plane relation. +EXPLAIN (COSTS OFF) +SELECT small.* FROM small + JOIN part_l0 ON small.y = part_l0.y AND small.y + part_l0.y < 10 + LEFT JOIN big ON big.x = small.x; +SELECT small.* FROM small + JOIN part_l0 ON small.y = part_l0.y AND small.y + part_l0.y < 10 + LEFT JOIN big ON big.x = small.x; + +-- Parameters have to be reparameterized by plane and partitioned relations. +EXPLAIN (COSTS OFF) +SELECT count(*) FROM small + JOIN part_l0 ON small.y = part_l0.y AND small.y + part_l0.y < 10 + LEFT JOIN big ON big.x = small.x OR big.x = part_l0.x; +SELECT count(*) FROM small + JOIN part_l0 ON small.y = part_l0.y AND small.y + part_l0.y < 10 + LEFT JOIN big ON big.x = small.x OR big.x = part_l0.x; + +DROP TABLE IF EXISTS big,small,part_l0 CASCADE; +RESET enable_hashjoin; +RESET enable_mergejoin; +RESET enable_material; +RESET max_parallel_workers_per_gather; + +-- Parameterized path examples. +CREATE TABLE prta (id integer, payload text) PARTITION BY HASH (id); +CREATE TABLE prta1 PARTITION OF prta FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE prta2 PARTITION OF prta FOR VALUES WITH (modulus 2, remainder 1); +CREATE UNIQUE INDEX ON prta1 (id); +CREATE UNIQUE INDEX ON prta2 (id); +INSERT INTO prta (id, payload) + (SELECT *, ('abc' || id)::text AS payload + FROM generate_series(1,1) AS id); + +CREATE TABLE prtb (id integer, payload text) PARTITION BY HASH (id); +CREATE TABLE prtb1 PARTITION OF prtb FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE prtb2 PARTITION OF prtb FOR VALUES WITH (modulus 2, remainder 1); +CREATE UNIQUE INDEX ON prtb1 (id); +CREATE UNIQUE INDEX ON prtb2 (id); +INSERT INTO prtb (id, payload) + (SELECT *, ('def' || id)::text AS payload + FROM generate_series(1,1000) AS id); + +CREATE TABLE e (id integer, payload text) PARTITION BY HASH (id); +CREATE TABLE e1 PARTITION OF e FOR VALUES WITH (modulus 2, remainder 0); +CREATE TABLE e2 PARTITION OF e FOR VALUES WITH (modulus 2, remainder 1); +INSERT INTO e (id, payload) + (SELECT *, ('ghi' || id)::text AS payload + FROM generate_series(1,1000) AS id); +CREATE UNIQUE INDEX ON e1 (id); +CREATE UNIQUE INDEX ON e2 (id); + +ANALYZE prta,prtb,e; + +EXPLAIN (COSTS OFF) +SELECT * FROM prta,prtb WHERE prta.id=prtb.id; + +EXPLAIN (COSTS OFF) +SELECT * FROM prta,prtb,e WHERE prta.id=prtb.id AND prta.id=e.id; + -- semi join EXPLAIN (COSTS OFF) SELECT t1.* FROM prt1_adv t1 WHERE EXISTS (SELECT 1 FROM prt2_adv t2 WHERE t1.a = t2.b) AND t1.b = 0 ORDER BY t1.a; -- 2.33.0