From b2c8085256aa6d6e34f6c9b8394738ee1bef3997 Mon Sep 17 00:00:00 2001 From: James Coleman Date: Sat, 3 Oct 2020 10:35:47 -0400 Subject: [PATCH 1/3] Fix get_useful_pathkeys_for_relation for volatile expressions When considering Incremental Sort below a Gather Merge, we need to be a bit more careful when matching pathkeys to EC members. It's not enough to find a member whose Vars are all in the current relation's target; volatile expressions in particular need to be contained in the target, otherwise it's too early to use the pathkey. Reported-by: Jaime Casanova Author: James Coleman Reviewed-by: Tomas Vondra Backpatch-through: 13 Discussion: https://postgr.es/m/CAJGNTeNaxpXgBVcRhJX%2B2vSbq%2BF2kJqGBcvompmpvXb7pq%2BoFA%40mail.gmail.com --- src/backend/optimizer/path/allpaths.c | 13 +-- src/backend/optimizer/path/equivclass.c | 70 +++++++++++++ src/include/optimizer/paths.h | 1 + .../regress/expected/incremental_sort.out | 98 +++++++++++++++++++ src/test/regress/sql/incremental_sort.sql | 31 ++++++ 5 files changed, 207 insertions(+), 6 deletions(-) diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 8ad6384c6a..84a69b064a 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -2816,7 +2816,8 @@ get_useful_pathkeys_for_relation(PlannerInfo *root, RelOptInfo *rel) /* * Considering query_pathkeys is always worth it, because it might allow * us to avoid a total sort when we have a partially presorted path - * available. + * available or to push the total sort into the parallel portion of the + * query. */ if (root->query_pathkeys) { @@ -2829,17 +2830,17 @@ get_useful_pathkeys_for_relation(PlannerInfo *root, RelOptInfo *rel) EquivalenceClass *pathkey_ec = pathkey->pk_eclass; /* - * We can only build an Incremental Sort for pathkeys which - * contain an EC member in the current relation, so ignore any - * suffix of the list as soon as we find a pathkey without an EC - * member the relation. + * We can only build a sort for pathkeys which contain an EC + * member in the current relation's target, so ignore any suffix + * of the list as soon as we find a pathkey without an EC member + * in the relation. * * By still returning the prefix of the pathkeys list that does * meet criteria of EC membership in the current relation, we * enable not just an incremental sort on the entirety of * query_pathkeys but also incremental sort below a JOIN. */ - if (!find_em_expr_for_rel(pathkey_ec, rel)) + if (!find_em_expr_usable_for_sorting_rel(pathkey_ec, rel)) break; npathkeys++; diff --git a/src/backend/optimizer/path/equivclass.c b/src/backend/optimizer/path/equivclass.c index a21b3b4756..f98fd7b3eb 100644 --- a/src/backend/optimizer/path/equivclass.c +++ b/src/backend/optimizer/path/equivclass.c @@ -797,6 +797,76 @@ find_em_expr_for_rel(EquivalenceClass *ec, RelOptInfo *rel) return NULL; } +/* + * Find an equivalence class member expression that can be safely used by a + * sort node on top of the provided relation. The rules here must match those + * applied in prepare_sort_from_pathkeys. + */ +Expr * +find_em_expr_usable_for_sorting_rel(EquivalenceClass *ec, RelOptInfo *rel) +{ + ListCell *lc_em; + + /* + * If there is more than one equivalence member matching these + * requirements we'll be content to choose any one of them. + */ + foreach(lc_em, ec->ec_members) + { + EquivalenceMember *em = lfirst(lc_em); + Expr *em_expr = em->em_expr; + PathTarget *target = rel->reltarget; + ListCell *lc_target_expr; + + /* + * We shouldn't be trying to sort by an equivalence class that + * contains a constant, so no need to consider such cases any further. + */ + if (em->em_is_const) + continue; + + /* + * Any Vars in the equivalence class member need to come from this + * relation. This is a superset of prepare_sort_from_pathkeys ignoring + * child members unless they belong to the rel being sorted. + */ + if (!bms_is_subset(em->em_relids, rel->relids)) + continue; + + /* + * As long as the expression isn't volatile then + * prepare_sort_from_pathkeys is able to generate a new target entry, + * so there's no need to verify that one already exists. + */ + if (!ec->ec_has_volatile) + return em->em_expr; + + /* + * If, however, it's volatile, we have to verify that the + * equivalence member's expr is already generated in the + * relation's target (we do strip relabels first from both + * expressions, which is cheap and might allow us to match + * more expressions). + */ + while (em_expr && IsA(em_expr, RelabelType)) + em_expr = ((RelabelType *) em_expr)->arg; + + foreach(lc_target_expr, target->exprs) + { + Expr *target_expr = lfirst(lc_target_expr); + + while (target_expr && IsA(target_expr, RelabelType)) + target_expr = ((RelabelType *) target_expr)->arg; + + if (equal(target_expr, em_expr)) + return em->em_expr; + } + } + + /* We didn't find any suitable equivalence class expression */ + return NULL; +} + /* * generate_base_implied_equalities * Generate any restriction clauses that we can deduce from equivalence diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 2134227ebc..8a4c6f8b59 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -135,6 +135,7 @@ extern EquivalenceClass *get_eclass_for_sort_expr(PlannerInfo *root, Relids rel, bool create_it); extern Expr *find_em_expr_for_rel(EquivalenceClass *ec, RelOptInfo *rel); +extern Expr *find_em_expr_usable_for_sorting_rel(EquivalenceClass *ec, RelOptInfo *rel); extern void generate_base_implied_equalities(PlannerInfo *root); extern List *generate_join_implied_equalities(PlannerInfo *root, Relids join_relids, diff --git a/src/test/regress/expected/incremental_sort.out b/src/test/regress/expected/incremental_sort.out index e376ea1276..7cf2eee7c1 100644 --- a/src/test/regress/expected/incremental_sort.out +++ b/src/test/regress/expected/incremental_sort.out @@ -1469,3 +1469,101 @@ explain (costs off) select * from t union select * from t order by 1,3; (13 rows) drop table t; +-- Sort pushdown can't go below where expressions are part of the rel target. +-- In particular this is interesting for volatile expressions which have to +-- go above joins since otherwise we'll incorrectly use expression evaluations +-- across multiple rows. +set enable_hashagg=off; +set enable_seqscan=off; +set enable_incremental_sort = off; +set parallel_tuple_cost=0; +set parallel_setup_cost=0; +set min_parallel_table_scan_size = 0; +set min_parallel_index_scan_size = 0; +-- Parallel sort below join. +explain (costs off) select distinct sub.unique1, stringu1 +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub; + QUERY PLAN +-------------------------------------------------------------------------- + Unique + -> Nested Loop + -> Gather Merge + Workers Planned: 2 + -> Sort + Sort Key: tenk1.unique1, tenk1.stringu1 + -> Parallel Index Scan using tenk1_unique1 on tenk1 + -> Function Scan on generate_series +(8 rows) + +explain (costs off) select sub.unique1, stringu1 +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub +order by 1, 2; + QUERY PLAN +-------------------------------------------------------------------- + Nested Loop + -> Gather Merge + Workers Planned: 2 + -> Sort + Sort Key: tenk1.unique1, tenk1.stringu1 + -> Parallel Index Scan using tenk1_unique1 on tenk1 + -> Function Scan on generate_series +(7 rows) + +-- Parallel sort but with expression that can be safely generated at the base rel. +explain (costs off) select distinct sub.unique1, md5(stringu1) +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub; + QUERY PLAN +---------------------------------------------------------------------------------------- + Unique + -> Nested Loop + -> Gather Merge + Workers Planned: 2 + -> Sort + Sort Key: tenk1.unique1, (md5((tenk1.stringu1)::text)) COLLATE "C" + -> Parallel Index Scan using tenk1_unique1 on tenk1 + -> Function Scan on generate_series +(8 rows) + +explain (costs off) select sub.unique1, md5(stringu1) +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub +order by 1, 2; + QUERY PLAN +---------------------------------------------------------------------------------- + Nested Loop + -> Gather Merge + Workers Planned: 2 + -> Sort + Sort Key: tenk1.unique1, (md5((tenk1.stringu1)::text)) COLLATE "C" + -> Parallel Index Scan using tenk1_unique1 on tenk1 + -> Function Scan on generate_series +(7 rows) + +-- Parallel sort but with expression not available until the upper rel. +explain (costs off) select distinct sub.unique1, stringu1 || random()::text +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub; + QUERY PLAN +--------------------------------------------------------------------------------------------- + Unique + -> Sort + Sort Key: tenk1.unique1, (((tenk1.stringu1)::text || (random())::text)) COLLATE "C" + -> Gather + Workers Planned: 2 + -> Nested Loop + -> Parallel Index Scan using tenk1_unique1 on tenk1 + -> Function Scan on generate_series +(8 rows) + +explain (costs off) select sub.unique1, stringu1 || random()::text +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub +order by 1, 2; + QUERY PLAN +--------------------------------------------------------------------------------------- + Sort + Sort Key: tenk1.unique1, (((tenk1.stringu1)::text || (random())::text)) COLLATE "C" + -> Gather + Workers Planned: 2 + -> Nested Loop + -> Parallel Index Scan using tenk1_unique1 on tenk1 + -> Function Scan on generate_series +(7 rows) + diff --git a/src/test/regress/sql/incremental_sort.sql b/src/test/regress/sql/incremental_sort.sql index 9c040c90e6..3ee11c394b 100644 --- a/src/test/regress/sql/incremental_sort.sql +++ b/src/test/regress/sql/incremental_sort.sql @@ -221,3 +221,34 @@ set enable_hashagg to off; explain (costs off) select * from t union select * from t order by 1,3; drop table t; + +-- Sort pushdown can't go below where expressions are part of the rel target. +-- In particular this is interesting for volatile expressions which have to +-- go above joins since otherwise we'll incorrectly use expression evaluations +-- across multiple rows. +set enable_hashagg=off; +set enable_seqscan=off; +set enable_incremental_sort = off; +set parallel_tuple_cost=0; +set parallel_setup_cost=0; +set min_parallel_table_scan_size = 0; +set min_parallel_index_scan_size = 0; + +-- Parallel sort below join. +explain (costs off) select distinct sub.unique1, stringu1 +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub; +explain (costs off) select sub.unique1, stringu1 +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub +order by 1, 2; +-- Parallel sort but with expression that can be safely generated at the base rel. +explain (costs off) select distinct sub.unique1, md5(stringu1) +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub; +explain (costs off) select sub.unique1, md5(stringu1) +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub +order by 1, 2; +-- Parallel sort but with expression not available until the upper rel. +explain (costs off) select distinct sub.unique1, stringu1 || random()::text +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub; +explain (costs off) select sub.unique1, stringu1 || random()::text +from tenk1, lateral (select tenk1.unique1 from generate_series(1, 1000)) as sub +order by 1, 2; -- 2.25.4