From 5ba062828dfc83dff2a3521fce7532b2afc2c7b9 Mon Sep 17 00:00:00 2001 From: amit Date: Wed, 13 Sep 2017 18:24:55 +0900 Subject: [PATCH 2/5] Planner-side changes for partition-pruning This adds all the necessary planner code and representations, viz.: 0. Code to teach set_append_rel_size/pathlist to look at only the *live* partitions of partitioned tables. 1. Add a field partcollation to PartitionScheme, which will be needed to verify that an operator clause's input collation indeed matches what is used for partitioning, to be able to use the clause for partition-pruning (using parttypcoll won't be correct, because that's not what's used by partitioning) 2. Code to match the clauses to the table's partition key and generate a list of such matching clauses. 3. Add a field to RelOptInfo to store an array of pointers to the AppendRelInfos of *all* partitions (stored in the same order as their RelOptInfos in part_rels) 4. Add a field to RelOptInfo to store a list of AppendRelInfos of *live* partitions that survived partition-pruning, although as of this commit this contains *all* appinfos. 5. Code to handle the possibility that a partition RelOptInfo may not have the basic information set (set_append_rel_size() does that normally, but for partitioned tables, it will only do it for the *live* partitions, but partitionwise-join code would look at *all* partitions) If the clauses identified in 2 above do not contain the values necessary to perform partition pruning, do not call get_partitions_from_clauses() right away. Instead, store the clauses (somewhere, such as in the Append plan node) until such time as all the "constant" values in them are available. As of this commit, we only pick up clauses from the baserestrictinfo list, so it's safe to assume that each of the matched clauses will provide the constant value needed for pruning.
In addition, a stub function get_partitions_from_clauses is added in partition.c, which currently simply returns all partitions from the partition descriptor. Authors: Amit Langote, Dilip Kumar --- src/backend/catalog/partition.c | 24 ++ src/backend/optimizer/path/allpaths.c | 694 ++++++++++++++++++++++++++++------ src/backend/optimizer/path/joinrels.c | 24 ++ src/backend/optimizer/plan/planner.c | 20 +- src/backend/optimizer/util/plancat.c | 4 + src/backend/optimizer/util/relnode.c | 90 +++++ src/include/catalog/partition.h | 6 + src/include/nodes/relation.h | 31 +- src/include/optimizer/pathnode.h | 4 + 9 files changed, 779 insertions(+), 118 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 07fdf66c38..f8da91d0fe 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1421,6 +1421,30 @@ get_partition_dispatch_recurse(Relation rel, Relation parent, } } +/* + * get_partitions_from_clauses + * Determine the set of partitions of relation that will satisfy all + * the clauses contained in partclauses + * + * Outputs: + * *min_part_idx and *max_part_idx constitute a range of contiguous + * indexes of partitions satisfying the query, while *other_parts + * contains indexes of partitions that satisfy the query but are + * not included in the aforementioned range + * + * NB: this is currently a stub that ignores rt_index and partclauses and + * selects every partition in the relation's partition descriptor. + */ +void +get_partitions_from_clauses(Relation relation, int rt_index, + List *partclauses, + int *min_part_idx, int *max_part_idx, + Bitmapset **other_parts) +{ + PartitionDesc partdesc = RelationGetPartitionDesc(relation); + + *min_part_idx = 0; + *max_part_idx = partdesc->nparts - 1; + *other_parts = NULL; +} + /* Module-local functions */ /* diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 5535b63803..536ef22c58 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -20,9 +20,11 @@ #include "access/sysattr.h" #include "access/tsmapi.h" 
+#include "catalog/partition.h" #include "catalog/pg_class.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" +#include "catalog/pg_type.h" #include "foreign/fdwapi.h" #include "miscadmin.h" #include "nodes/makefuncs.h" @@ -135,6 +137,15 @@ static void recurse_push_qual(Node *setOp, Query *topquery, static void remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel); static void add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, List *live_childrels); +static List *get_append_rel_partitions(PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte); +static List *match_clauses_to_partkey(RelOptInfo *rel, + List *clauses, + bool *constfalse); +static BoolExpr *process_partition_ne_op(RelOptInfo *rel, + Oid negator, Oid partopfamily, Oid partcoll, + Expr *leftop, Expr *rightop); /* @@ -834,6 +845,17 @@ set_foreign_size(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) rel->rows = clamp_row_est(rel->rows); } +static int +intcmp(const void *va, const void *vb) +{ + int a = *((const int *) va); + int b = *((const int *) vb); + + if (a == b) + return 0; + return (a > b) ? 1 : -1; +} + /* * set_foreign_pathlist * Build access paths for a foreign table RTE @@ -846,6 +868,488 @@ set_foreign_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) } /* + * get_append_rel_partitions + * Return the list of partitions of rel that pass the clauses mentioned + * in rel->baserestrictinfo + * + * Returned list contains the AppendRelInfos of chosen partitions. 
+ */ +static List * +get_append_rel_partitions(PlannerInfo *root, + RelOptInfo *rel, + RangeTblEntry *rte) +{ + Relation parent = heap_open(rte->relid, NoLock); + PartitionDesc partdesc = RelationGetPartitionDesc(parent); + List *partclauses; + List *result = NIL; + int i, + num_parts = 0, + min_part_idx = -1, + max_part_idx = -1, + *all_indexes = NULL; + Bitmapset *other_parts = NULL; + bool constfalse; + + /* + * Get the clauses that match the partition key, including any nullness + * tests against partition keys. A copy of the list is passed, because + * match_clauses_to_partkey() may append to the list it is given. + */ + partclauses = match_clauses_to_partkey(rel, + list_copy(rel->baserestrictinfo), + &constfalse); + + /* + * Since the clauses in rel->baserestrictinfo should all contain Const + * operands, it should be possible to prune partitions right away. + */ + if (partclauses != NIL && !constfalse) + { + get_partitions_from_clauses(parent, rel->relid, partclauses, + &min_part_idx, &max_part_idx, + &other_parts); + /* Get *all* indexes in one place and sort. */ + if (min_part_idx >= 0 && max_part_idx >= 0) + num_parts += (max_part_idx - min_part_idx + 1); + if (!bms_is_empty(other_parts)) + num_parts += bms_num_members(other_parts); + + if (num_parts > 0) + { + int j; + + all_indexes = (int *) palloc(num_parts * sizeof(int)); + j = 0; + if (min_part_idx >= 0 && max_part_idx >= 0) + { + for (i = min_part_idx; i <= max_part_idx; i++) + all_indexes[j++] = i; + } + if (!bms_is_empty(other_parts)) + while ((i = bms_first_member(other_parts)) >= 0) + all_indexes[j++] = i; + if (j > 1) + qsort((void *) all_indexes, j, sizeof(int), intcmp); + } + } + else if (!constfalse) + { + /* No clauses to prune partitions, so scan all partitions. */ + num_parts = partdesc->nparts; + all_indexes = (int *) palloc(num_parts * sizeof(int)); + for (i = 0; i < partdesc->nparts; i++) + all_indexes[i] = i; + } + + /* Fetch the partition appinfos. 
*/ + for (i = 0; i < num_parts; i++) + { + AppendRelInfo *appinfo = rel->part_appinfos[all_indexes[i]]; +#ifdef USE_ASSERT_CHECKING + RangeTblEntry *rte = planner_rt_fetch(appinfo->child_relid, root); + + /* + * Must be the intended child's RTE here, because appinfos are ordered + * the same way as partitions in the partition descriptor. + */ + Assert(partdesc->oids[all_indexes[i]] == rte->relid); +#endif + result = lappend(result, appinfo); + } + if (all_indexes) + pfree(all_indexes); + + /* Remember for future users such as set_append_rel_pathlist(). */ + rel->live_part_appinfos = result; + + heap_close(parent, NoLock); + + return result; +} + +#define PartCollMatchesExprColl(partcoll, exprcoll) \ + ((partcoll) == InvalidOid || (partcoll) == (exprcoll)) + +/* + * match_clauses_to_partkey + * Match clauses with rel's partition key + * + * For an individual clause to match with a partition key column, the clause: + * + * 1. must be in the form (partkey op const) or (const op partkey); + * 2. must contain an operator which is in the same operator family as the + * partitioning operator for the partition key column + * 3. its input collation must match the partitioning collation + * + * The "const" mentioned in 1 means any expression that doesn't involve a + * volatile function or a Var of this relation. We allow Vars belonging to + * other relations (for example, if the clause is a join clause), but they + * are treated as parameters whose values are not known now, so cannot be + * used for partition pruning right within the planner. It's the + * responsibility of higher code levels to manage restriction and join + * clauses appropriately. + * + * If a NullTest against a partition key is encountered, it's recorded in the + * PartClauseValSet as well. 
+ */ +static List * +match_clauses_to_partkey(RelOptInfo *rel, + List *clauses, + bool *constfalse) +{ + PartitionScheme partscheme = rel->part_scheme; + List *result = NIL; + ListCell *lc; + + *constfalse = false; + + Assert(partscheme != NULL); + + foreach(lc, clauses) + { + Expr *clause; + int i; + + if (IsA(lfirst(lc), RestrictInfo)) + { + RestrictInfo *rinfo = lfirst(lc); + + clause = rinfo->clause; + + /* + * A pseudoconstant qual is not necessarily a Const node, so check + * the node type before looking at constvalue. A constant-false + * qual is reported to the caller through *constfalse. + */ + if (rinfo->pseudoconstant && + IsA(clause, Const) && + !DatumGetBool(((Const *) clause)->constvalue)) + { + *constfalse = true; + continue; + } + } + else + clause = (Expr *) lfirst(lc); + + /* Get the BoolExpr's out of the way. */ + if (IsA(clause, BoolExpr)) + { + bool constfalse1; + + /* + * If the OR's args contain clauses that match, add the clause + * to the result. + */ + if (or_clause((Node *) clause) && + match_clauses_to_partkey(rel, + list_copy(((BoolExpr *) clause)->args), + &constfalse1) != NIL) + result = lappend(result, clause); + else if (and_clause((Node *) clause)) + /* + * These clauses are ANDed with the clauses in the + * original list, so queue them after the latter. Note + * that it also means that a queued clause will be added to + * the result if it happens to match. + */ + clauses = list_concat(clauses, + list_copy(((BoolExpr *) clause)->args)); + continue; + } + + for (i = 0; i < partscheme->partnatts; i++) + { + Node *partkey = linitial(rel->partexprs[i]); + Oid partopfamily = partscheme->partopfamily[i], + partcoll = partscheme->partcollation[i]; + + /* + * Check if the operator is compatible with partitioning and if + * so, add it to the list of opclauses matched with this partition + * key. 
+ */ + if (is_opclause(clause)) + { + Expr *constexpr, + *leftop, + *rightop; + Relids constrelids, + left_relids, + right_relids; + Oid expr_op, + expr_coll; + + leftop = (Expr *) get_leftop(clause); + rightop = (Expr *) get_rightop(clause); + expr_op = ((OpExpr *) clause)->opno; + expr_coll = ((OpExpr *) clause)->inputcollid; + left_relids = pull_varnos((Node *) leftop); + right_relids = pull_varnos((Node *) rightop); + + if (IsA(leftop, RelabelType)) + leftop = ((RelabelType *) leftop)->arg; + if (IsA(rightop, RelabelType)) + rightop = ((RelabelType *) rightop)->arg; + + if (equal(leftop, partkey)) + { + constexpr = rightop; + constrelids = right_relids; + } + else if (equal(rightop, partkey)) + { + constexpr = leftop; + constrelids = left_relids; + expr_op = get_commutator(expr_op); + /* + * If no commutator exists, cannot flip the qual's args, + * so give up. + */ + if (!OidIsValid(expr_op)) + continue; + } + else + /* Neither argument matches the partition key. */ + continue; + + /* + * Only allow strict operators, so that we can reason sanely + * about the behavior with null arguments. + */ + if (!op_strict(expr_op)) + continue; + + /* + * Check if the operator is in the partition operator family. + * If the operator happens to be '<>', which is never listed + * as part of the operator family, check if its negator + * exists and that the latter is compatible with + * partitioning. If it is, we turn this into an OR BoolExpr: + * (key < val OR key > val), if the partitioning method + * supports such a notion of inequality. 
+ */ + if (!op_in_opfamily(expr_op, partopfamily)) + { + Oid negator = get_negator(expr_op); + + if (!OidIsValid(negator) || + !op_in_opfamily(negator, partopfamily)) + continue; + + /* + * The (key < val OR key > val) rewrite is only correct if + * the negator is the opfamily's equality operator, and it + * is only usable for pruning under the same conditions as + * any other operator clause: "val" must not reference this + * relation or be volatile, and the clause's collation must + * match the partitioning collation. + */ + if ((partscheme->strategy == PARTITION_STRATEGY_RANGE || + partscheme->strategy == PARTITION_STRATEGY_LIST) && + get_op_opfamily_strategy(negator, partopfamily) == + BTEqualStrategyNumber && + !bms_is_member(rel->relid, constrelids) && + !contain_volatile_functions((Node *) constexpr) && + PartCollMatchesExprColl(partcoll, expr_coll)) + { + BoolExpr *ne_or; + + /* + * expr_op (and hence its negator) has already been + * commuted if the key was the right argument, so always + * pass the key as the left argument and the constant as + * the right one. + */ + ne_or = process_partition_ne_op(rel, negator, + partopfamily, + partcoll, + (constexpr == rightop) ? + leftop : rightop, + constexpr); + result = lappend(result, ne_or); + } + continue; + } + + /* + * Useless if what we're thinking of as a constant is actually + * a Var coming from this relation. + */ + if (bms_is_member(rel->relid, constrelids)) + continue; + + /* Useless if the "constant" can change its value. */ + if (contain_volatile_functions((Node *) constexpr)) + continue; + + /* + * Also, useless, if the clause's collation is different from + * the partitioning collation. + */ + if (!PartCollMatchesExprColl(partcoll, expr_coll)) + continue; + + /* + * Everything seems to be fine, so add it to the list of + * clauses we will use for pruning. Flip the left and right + * args if we have to, because the code that extracts the + * constant value to use for partition-pruning expects to find + * it as the rightop of the clause. 
+ */ + if (constexpr == rightop) + result = lappend(result, clause); + else + { + OpExpr *commuted; + + commuted = (OpExpr *) copyObject(clause); + commuted->opno = expr_op; + commuted->opfuncid = get_opcode(expr_op); + commuted->args = list_make2(rightop, leftop); + result = lappend(result, commuted); + } + } + else if (IsA(clause, ScalarArrayOpExpr)) + { + ScalarArrayOpExpr *saop = (ScalarArrayOpExpr *) clause; + Oid saop_op = saop->opno; + Oid saop_opfuncid = saop->opfuncid; + Oid saop_coll = saop->inputcollid; + Node *leftop = (Node *) linitial(saop->args); + Const *arrconst = (Const *) lsecond(saop->args); + ArrayType *arrval; + int16 elemlen; + bool elembyval; + char elemalign; + Datum *elem_values; + bool *elem_nulls; + int num_elems; + int elemno; + List *elem_exprs; + bool negated = false; + + /* + * Check if saop_op is compatible with partitioning. If so and + * if this saop is of type 'key op ANY (...)', convert this into + * an OR BoolExpr. + */ + if (IsA(leftop, RelabelType)) + leftop = (Node *) ((RelabelType *) leftop)->arg; + if (!equal(leftop, partkey)) + continue; + + /* + * The array can be expanded into per-element clauses only if + * it is a non-null constant; do not look at constvalue + * otherwise. + */ + if (!IsA(arrconst, Const) || arrconst->constisnull) + continue; + arrval = DatumGetArrayTypeP(arrconst->constvalue); + + if (!op_strict(saop_op)) + continue; + + /* + * In case of NOT IN (..), we get a '<>', which while not + * listed as part of any operator family, we are able to + * handle if its negator is indeed a part of the partitioning + * operator family. + */ + if (!op_in_opfamily(saop_op, partopfamily)) + { + Oid negator = get_negator(saop_op); + + if (!OidIsValid(negator) || + !op_in_opfamily(negator, partopfamily)) + continue; + negated = true; + } + + /* + * Also, useless, if the clause's collation is different from + * the partitioning collation. + */ + if (!PartCollMatchesExprColl(partcoll, saop_coll)) + continue; + + /* Build clauses for the individual values in the array. 
+ */ + get_typlenbyvalalign(ARR_ELEMTYPE(arrval), + &elemlen, &elembyval, &elemalign); + deconstruct_array(arrval, + ARR_ELEMTYPE(arrval), + elemlen, elembyval, elemalign, + &elem_values, &elem_nulls, + &num_elems); + elem_exprs = NIL; + + /* + * Use a separate counter here; "i" is the index of the + * enclosing loop over partition keys and must not be disturbed. + */ + for (elemno = 0; elemno < num_elems; elemno++) + { + Expr *elem_expr; + + if (!elem_nulls[elemno]) + { + Const *rightop; + OpExpr *opexpr = makeNode(OpExpr); + + rightop = makeConst(ARR_ELEMTYPE(arrval), + -1, arrconst->constcollid, + elemlen, elem_values[elemno], + false, elembyval); + + opexpr->opno = saop_op; + opexpr->opfuncid = saop_opfuncid; + opexpr->opresulttype = BOOLOID; + opexpr->opretset = false; + opexpr->opcollid = InvalidOid; + opexpr->inputcollid = saop_coll; + opexpr->args = list_make2(leftop, rightop); + opexpr->location = -1; + elem_expr = (Expr *) opexpr; + } + else + { + NullTest *nulltest = makeNode(NullTest); + + nulltest->arg = (Expr *) leftop; + nulltest->nulltesttype = !negated ? IS_NULL + : IS_NOT_NULL; + nulltest->argisrow = false; + nulltest->location = -1; + elem_expr = (Expr *) nulltest; + } + + elem_exprs = lappend(elem_exprs, elem_expr); + } + + /* Build the OR clause over the per-element clauses. */ + if (saop->useOr) + { + BoolExpr *orexpr; + + /* + * An empty array yields no arms to OR together; do not + * build an OR clause without arguments, just skip it. + */ + if (elem_exprs == NIL) + continue; + orexpr = (BoolExpr *) makeBoolExpr(OR_EXPR, elem_exprs, + -1); + result = lappend(result, orexpr); + } + else + /* + * To be ANDed with the clauses in the original list, just + * like what we do for the arguments of Boolean AND clause + * above. 
+ */ + clauses = list_concat(clauses, elem_exprs); + } + else if (IsA(clause, NullTest)) + { + NullTest *nulltest = (NullTest *) clause; + Node *arg = (Node *) nulltest->arg; + + if (equal(arg, partkey)) + result = lappend(result, nulltest); + } + } + } + + return result; +} + +static BoolExpr * +process_partition_ne_op(RelOptInfo *rel, + Oid negator, Oid partopfamily, Oid partcoll, + Expr *leftop, Expr *rightop) +{ + Expr *ltexpr, + *gtexpr; + Oid ltop, + gtop; + int strategy; + Oid lefttype, + righttype; + + get_op_opfamily_properties(negator, partopfamily, false, + &strategy, &lefttype, &righttype); + if (strategy != BTEqualStrategyNumber) + elog(LOG, "unexpected negator of '<>' operator"); + ltop = get_opfamily_member(partopfamily, lefttype, righttype, + BTLessStrategyNumber); + gtop = get_opfamily_member(partopfamily, lefttype, righttype, + BTGreaterStrategyNumber); + ltexpr = make_opclause(ltop, BOOLOID, false, + (Expr *) leftop, (Expr *) rightop, + InvalidOid, partcoll); + gtexpr = make_opclause(gtop, BOOLOID, false, + (Expr *) leftop, (Expr *) rightop, + InvalidOid, partcoll); + + return (BoolExpr *) makeBoolExpr(OR_EXPR, list_make2(ltexpr, gtexpr), -1); +} + +/* * set_append_rel_size * Set size estimates for a simple "append relation" * @@ -860,6 +1364,7 @@ static void set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte) { + List *rel_appinfos = NIL; int parentRTindex = rti; bool has_live_children; double parent_rows; @@ -873,6 +1378,23 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, Assert(IS_SIMPLE_REL(rel)); + if (rte->relkind != RELKIND_PARTITIONED_TABLE) + { + foreach (l, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid == parentRTindex) + rel_appinfos = lappend(rel_appinfos, appinfo); + } + } + else + { + rel_appinfos = get_append_rel_partitions(root, rel, rte); + rel->live_partitioned_rels = 
list_make1_int(rti); + } + /* * Initialize to compute size estimates for whole append relation. * @@ -893,7 +1415,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, nattrs = rel->max_attr - rel->min_attr + 1; parent_attrsizes = (double *) palloc0(nattrs * sizeof(double)); - foreach(l, root->append_rel_list) + foreach(l, rel_appinfos) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); int childRTindex; @@ -906,10 +1428,6 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, ListCell *childvars; ListCell *lc; - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) - continue; - childRTindex = appinfo->child_relid; childRTE = root->simple_rte_array[childRTindex]; @@ -920,73 +1438,11 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, childrel = find_base_rel(root, childRTindex); Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); - if (rel->part_scheme) - { - AttrNumber attno; - - /* - * We need attr_needed data for building targetlist of a join - * relation representing join between matching partitions for - * partition-wise join. A given attribute of a child will be - * needed in the same highest joinrel where the corresponding - * attribute of parent is needed. Hence it suffices to use the - * same Relids set for parent and child. - */ - for (attno = rel->min_attr; attno <= rel->max_attr; attno++) - { - int index = attno - rel->min_attr; - Relids attr_needed = rel->attr_needed[index]; - - /* System attributes do not need translation. */ - if (attno <= 0) - { - Assert(rel->min_attr == childrel->min_attr); - childrel->attr_needed[index] = attr_needed; - } - else - { - Var *var = list_nth_node(Var, - appinfo->translated_vars, - attno - 1); - int child_index; - - child_index = var->varattno - childrel->min_attr; - childrel->attr_needed[child_index] = attr_needed; - } - } - } - /* - * Copy/Modify targetlist. 
Even if this child is deemed empty, we need - * its targetlist in case it falls on nullable side in a child-join - * because of partition-wise join. - * - * NB: the resulting childrel->reltarget->exprs may contain arbitrary - * expressions, which otherwise would not occur in a rel's targetlist. - * Code that might be looking at an appendrel child must cope with - * such. (Normally, a rel's targetlist would only include Vars and - * PlaceHolderVars.) XXX we do not bother to update the cost or width - * fields of childrel->reltarget; not clear if that would be useful. + * Initialize some properties of child rel from the parent rel, such + * target list, equivalence class members, etc. */ - childrel->reltarget->exprs = (List *) - adjust_appendrel_attrs(root, - (Node *) rel->reltarget->exprs, - 1, &appinfo); - - /* - * We have to make child entries in the EquivalenceClass data - * structures as well. This is needed either if the parent - * participates in some eclass joins (because we will want to consider - * inner-indexscan joins on the individual children) or if the parent - * has useful pathkeys (because we should try to build MergeAppend - * paths that produce those sort orderings). Even if this child is - * deemed dummy, it may fall on nullable side in a child-join, which - * in turn may participate in a MergeAppend, where we will need the - * EquivalenceClass data structures. - */ - if (rel->has_eclass_joins || has_useful_pathkeys(root, rel)) - add_child_rel_equivalences(root, appinfo, rel, childrel); - childrel->has_eclass_joins = rel->has_eclass_joins; + set_basic_child_rel_properties(root, rel, childrel, appinfo); /* * We have to copy the parent's quals to the child, with appropriate @@ -1152,6 +1608,17 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, has_live_children = true; /* + * If childrel is itself partitioned, add it and its partitioned + * children to the list being propagated up to the root rel. 
+ */ + if (childrel->part_scheme && rel->part_scheme) + { + rel->live_partitioned_rels = + list_concat(rel->live_partitioned_rels, + list_copy(childrel->live_partitioned_rels)); + } + + /* * If any live child is not parallel-safe, treat the whole appendrel * as not parallel-safe. In future we might be able to generate plans * in which some children are farmed out to workers while others are @@ -1247,14 +1714,29 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, Index rti, RangeTblEntry *rte) { int parentRTindex = rti; - List *live_childrels = NIL; + List *rel_appinfos = NIL, + *live_childrels = NIL; ListCell *l; + if (rte->relkind != RELKIND_PARTITIONED_TABLE) + { + foreach (l, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid == parentRTindex) + rel_appinfos = lappend(rel_appinfos, appinfo); + } + } + else + rel_appinfos = rel->live_part_appinfos; + /* * Generate access paths for each member relation, and remember the * non-dummy children. */ - foreach(l, root->append_rel_list) + foreach(l, rel_appinfos) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); int childRTindex; @@ -1325,43 +1807,40 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, ListCell *l; List *partitioned_rels = NIL; RangeTblEntry *rte; - bool build_partitioned_rels = false; + /* + * AppendPath we are about to generate must record the RT indexes of + * partitioned tables that are direct or indirect children of this Append + * rel. For partitioned tables, we collect its live partitioned children + * from rel->painfo. However, it will contain only its immediate children, + * so collect live partitioned children from all children that are + * themselves partitioned and concatenate to our list before finally + * passing the list to create_append_path() and/or + * generate_mergeappend_paths(). 
+ * + * If this is a sub-query RTE, its RelOptInfo doesn't itself contain the + * list of live partitioned children, so we must assemble the same in the + * loop below from the children that are known to correspond to + * partitioned rels. (This assumes that we don't need to look through + * multiple levels of subquery RTEs; if we ever do, we could consider + * stuffing the list we generate here into sub-query RTE's RelOptInfo, just + * like we do for partitioned rels, which would be used when populating our + * parent rel with paths. For the present, that appears to be + * unnecessary.) + */ if (IS_SIMPLE_REL(rel)) { - /* - * A root partition will already have a PartitionedChildRelInfo, and a - * non-root partitioned table doesn't need one, because its Append - * paths will get flattened into the parent anyway. For a subquery - * RTE, no PartitionedChildRelInfo exists; we collect all - * partitioned_rels associated with any child. (This assumes that we - * don't need to look through multiple levels of subquery RTEs; if we - * ever do, we could create a PartitionedChildRelInfo with the - * accumulated list of partitioned_rels which would then be found when - * populated our parent rel with paths. For the present, that appears - * to be unnecessary.) - */ rte = planner_rt_fetch(rel->relid, root); - switch (rte->rtekind) - { - case RTE_RELATION: - if (rte->relkind == RELKIND_PARTITIONED_TABLE) - partitioned_rels = - get_partitioned_child_rels(root, rel->relid); - break; - case RTE_SUBQUERY: - build_partitioned_rels = true; - break; - default: - elog(ERROR, "unexpcted rtekind: %d", (int) rte->rtekind); - } + if (rte->rtekind == RTE_RELATION && + rte->relkind == RELKIND_PARTITIONED_TABLE) + partitioned_rels = rel->live_partitioned_rels; } else if (rel->reloptkind == RELOPT_JOINREL && rel->part_scheme) { /* - * Associate PartitionedChildRelInfo of the root partitioned tables - * being joined with the root partitioned join (indicated by - * RELOPT_JOINREL). 
+ * For joinrel consisting of root partitioned tables, get + * partitioned_rels list by combining live_partitioned_rels of the + * component partitioned tables. */ partitioned_rels = get_partitioned_child_rels_for_join(root, rel->relids); @@ -1378,17 +1857,12 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, ListCell *lcp; /* - * If we need to build partitioned_rels, accumulate the partitioned - * rels for this child. + * Accumulate the live partitioned children of this child, if it's + * itself partitioned rel. */ - if (build_partitioned_rels) - { - List *cprels; - - cprels = get_partitioned_child_rels(root, childrel->relid); + if (childrel->part_scheme) partitioned_rels = list_concat(partitioned_rels, - list_copy(cprels)); - } + childrel->live_partitioned_rels); /* * If child has an unparameterized cheapest-total path, add that to diff --git a/src/backend/optimizer/path/joinrels.c b/src/backend/optimizer/path/joinrels.c index 2b868c52de..3e943391b1 100644 --- a/src/backend/optimizer/path/joinrels.c +++ b/src/backend/optimizer/path/joinrels.c @@ -1396,6 +1396,30 @@ try_partition_wise_join(PlannerInfo *root, RelOptInfo *rel1, RelOptInfo *rel2, child_rel2->relids); /* + * If either child_rel1 or child_rel2 is not a live partition, they'd + * not have been touched by set_append_rel_size. So, its RelOptInfo + * would be missing some information that set_append_rel_size sets for + * live partitions, such as the target list, child EQ members, etc. + * We need to make the RelOptInfo of even the dead partitions look + * minimally valid and as having a valid dummy path attached to it. 
+ */ + if (IS_SIMPLE_REL(child_rel1) && child_rel1->pathlist == NIL) + { + AppendRelInfo *appinfo = rel1->part_appinfos[cnt_parts]; + + set_basic_child_rel_properties(root, rel1, child_rel1, appinfo); + mark_dummy_rel(child_rel1); + } + + if (IS_SIMPLE_REL(child_rel2) && child_rel2->pathlist == NIL) + { + AppendRelInfo *appinfo = rel2->part_appinfos[cnt_parts]; + + set_basic_child_rel_properties(root, rel2, child_rel2, appinfo); + mark_dummy_rel(child_rel2); + } + + /* * Construct restrictions applicable to the child join from those * applicable to the parent join. */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index ecdd7280eb..d9bbf20acb 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -6160,14 +6160,24 @@ List * get_partitioned_child_rels_for_join(PlannerInfo *root, Relids join_relids) { List *result = NIL; - ListCell *l; + int relid; - foreach(l, root->pcinfo_list) + relid = -1; + while ((relid = bms_next_member(join_relids, relid)) >= 0) { - PartitionedChildRelInfo *pc = lfirst(l); + RelOptInfo *rel; - if (bms_is_member(pc->parent_relid, join_relids)) - result = list_concat(result, list_copy(pc->child_rels)); + /* Paranoia: ignore bogus relid indexes */ + if (relid >= root->simple_rel_array_size) + continue; + rel = root->simple_rel_array[relid]; + if (rel == NULL) + continue; + Assert(rel->relid == relid); /* sanity check on array */ + Assert(rel->part_scheme != NULL); + Assert(rel->live_partitioned_rels != NIL && + list_length(rel->live_partitioned_rels) > 0); + result = list_concat(result, list_copy(rel->live_partitioned_rels)); } return result; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 9d35a41e22..e1ef936e68 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1918,6 +1918,10 @@ find_partition_scheme(PlannerInfo *root, Relation relation) memcpy(part_scheme->parttypbyval, 
partkey->parttypbyval, sizeof(bool) * partnatts); + part_scheme->partcollation = (Oid *) palloc(sizeof(Oid) * partnatts); + memcpy(part_scheme->partcollation, partkey->partcollation, + sizeof(Oid) * partnatts); + /* Add the partitioning scheme to PlannerInfo. */ root->part_schemes = lappend(root->part_schemes, part_scheme); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 3bd1063aa8..8e290e19b0 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -18,6 +18,7 @@ #include "miscadmin.h" #include "catalog/partition.h" +#include "catalog/pg_class.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" @@ -154,9 +155,12 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->part_scheme = NULL; rel->nparts = 0; rel->boundinfo = NULL; + rel->part_appinfos = NULL; rel->part_rels = NULL; rel->partexprs = NULL; rel->nullable_partexprs = NULL; + rel->live_part_appinfos = NIL; + rel->live_partitioned_rels = NIL; /* * Pass top parent's relids down the inheritance hierarchy. If the parent @@ -233,8 +237,12 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) int cnt_parts = 0; if (nparts > 0) + { + rel->part_appinfos = (AppendRelInfo **) + palloc(sizeof(AppendRelInfo *) * nparts); rel->part_rels = (RelOptInfo **) palloc(sizeof(RelOptInfo *) * nparts); + } foreach(l, root->append_rel_list) { @@ -258,6 +266,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) * also match the PartitionDesc. See expand_partitioned_rtentry. 
*/ Assert(cnt_parts < nparts); + rel->part_appinfos[cnt_parts] = appinfo; rel->part_rels[cnt_parts] = childrel; cnt_parts++; } @@ -567,6 +576,7 @@ build_join_rel(PlannerInfo *root, joinrel->part_scheme = NULL; joinrel->nparts = 0; joinrel->boundinfo = NULL; + joinrel->part_appinfos = NULL; joinrel->part_rels = NULL; joinrel->partexprs = NULL; joinrel->nullable_partexprs = NULL; @@ -735,6 +745,7 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, joinrel->has_eclass_joins = false; joinrel->top_parent_relids = NULL; joinrel->part_scheme = NULL; + joinrel->part_appinfos = NULL; joinrel->part_rels = NULL; joinrel->partexprs = NULL; joinrel->nullable_partexprs = NULL; @@ -1747,3 +1758,82 @@ build_joinrel_partition_info(RelOptInfo *joinrel, RelOptInfo *outer_rel, joinrel->nullable_partexprs[cnt] = nullable_partexpr; } } + +/* + * Initialize some basic properties of child rel from the parent rel, such as + * the target list, equivalence class members, etc. + */ +void +set_basic_child_rel_properties(PlannerInfo *root, + RelOptInfo *rel, + RelOptInfo *childrel, + AppendRelInfo *appinfo) +{ + AttrNumber attno; + + if (rel->part_scheme) + { + /* + * We need attr_needed data for building targetlist of a join relation + * representing join between matching partitions for partition-wise + * join. A given attribute of a child will be needed in the same + * highest joinrel where the corresponding attribute of parent is + * needed. Hence it suffices to use the same Relids set for parent and + * child. + */ + for (attno = rel->min_attr; attno <= rel->max_attr; attno++) + { + int index = attno - rel->min_attr; + Relids attr_needed = rel->attr_needed[index]; + + /* System attributes do not need translation. 
*/ + if (attno <= 0) + { + Assert(rel->min_attr == childrel->min_attr); + childrel->attr_needed[index] = attr_needed; + } + else + { + Var *var = list_nth_node(Var, + appinfo->translated_vars, + attno - 1); + int child_index; + + child_index = var->varattno - childrel->min_attr; + childrel->attr_needed[child_index] = attr_needed; + } + } + } + + /* + * Copy/Modify targetlist. Even if this child is deemed empty, we need + * its targetlist in case it falls on nullable side in a child-join + * because of partition-wise join. + * + * NB: the resulting childrel->reltarget->exprs may contain arbitrary + * expressions, which otherwise would not occur in a rel's targetlist. + * Code that might be looking at an appendrel child must cope with + * such. (Normally, a rel's targetlist would only include Vars and + * PlaceHolderVars.) XXX we do not bother to update the cost or width + * fields of childrel->reltarget; not clear if that would be useful. + */ + childrel->reltarget->exprs = (List *) + adjust_appendrel_attrs(root, + (Node *) rel->reltarget->exprs, + 1, &appinfo); + + /* + * We have to make child entries in the EquivalenceClass data + * structures as well. This is needed either if the parent + * participates in some eclass joins (because we will want to consider + * inner-indexscan joins on the individual children) or if the parent + * has useful pathkeys (because we should try to build MergeAppend + * paths that produce those sort orderings). Even if this child is + * deemed dummy, it may fall on nullable side in a child-join, which + * in turn may participate in a MergeAppend, where we will need the + * EquivalenceClass data structures. 
+ */ + if (rel->has_eclass_joins || has_useful_pathkeys(root, rel)) + add_child_rel_equivalences(root, appinfo, rel, childrel); + childrel->has_eclass_joins = rel->has_eclass_joins; +} diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 945ac0239d..e74a87035e 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -101,6 +101,7 @@ extern int get_partition_for_tuple(PartitionDispatch *pd, EState *estate, PartitionDispatchData **failed_at, TupleTableSlot **failed_slot); + extern Oid get_default_oid_from_partdesc(PartitionDesc partdesc); extern Oid get_default_partition_oid(Oid parentId); extern void update_default_partition_oid(Oid parentId, Oid defaultPartId); @@ -108,4 +109,9 @@ extern void check_default_allows_bound(Relation parent, Relation defaultRel, PartitionBoundSpec *new_spec); extern List *get_proposed_default_constraint(List *new_part_constaints); +/* For partition-pruning */ +void get_partitions_from_clauses(Relation relation, int rt_index, + List *partclauses, + int *min_part_idx, int *max_part_idx, + Bitmapset **other_parts); #endif /* PARTITION_H */ diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index e085cefb7b..e47f6e5cd3 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -342,6 +342,10 @@ typedef struct PlannerInfo * partition bounds. Since partition key data types and the opclass declared * input data types are expected to be binary compatible (per ResolveOpClass), * both of those should have same byval and length properties. + * + * Since partitioning might be using a collation for a given partition key + * column that is not same as the collation implied by column's type, store + * the same separately. 
*/ typedef struct PartitionSchemeData { @@ -349,7 +353,8 @@ typedef struct PartitionSchemeData int16 partnatts; /* number of partition attributes */ Oid *partopfamily; /* OIDs of operator families */ Oid *partopcintype; /* OIDs of opclass declared input data types */ - Oid *parttypcoll; /* OIDs of collations of partition keys. */ + Oid *parttypcoll; /* OIDs of partition key type collations. */ + Oid *partcollation; /* OIDs of partitioning collations */ /* Cached information about partition key data types. */ int16 *parttyplen; @@ -529,6 +534,7 @@ typedef struct PartitionSchemeData *PartitionScheme; * part_scheme - Partitioning scheme of the relation * boundinfo - Partition bounds * nparts - Number of partitions + * part_appinfos - AppendRelInfos for each partition * part_rels - RelOptInfos for each partition * partexprs, nullable_partexprs - Partition key expressions * @@ -575,6 +581,8 @@ typedef enum RelOptKind ((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL || \ (rel)->reloptkind == RELOPT_OTHER_JOINREL) +typedef struct AppendRelInfo AppendRelInfo; + typedef struct RelOptInfo { NodeTag type; @@ -657,10 +665,27 @@ typedef struct RelOptInfo PartitionScheme part_scheme; /* Partitioning scheme. */ int nparts; /* number of partitions */ struct PartitionBoundInfoData *boundinfo; /* Partition bounds */ - struct RelOptInfo **part_rels; /* Array of RelOptInfos of partitions, - * stored in the same order of bounds */ + struct AppendRelInfo **part_appinfos; /* Array of AppendRelInfos of + * partitions, stored in the + * same order as bounds */ + struct RelOptInfo **part_rels; /* Array of RelOptInfos of *all* + * partitions, stored in the same order as + * bounds */ List **partexprs; /* Non-nullable partition key expressions. */ List **nullable_partexprs; /* Nullable partition key expressions. */ + + + /* + * List of AppendRelInfo's of the table's partitions that survive a + * query's clauses.
+ */ + List *live_part_appinfos; + + /* + * RT indexes of live partitions that are partitioned tables themselves. + * This includes the RT index of the table itself. + */ + List *live_partitioned_rels; } RelOptInfo; /* diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index e9ed16ad32..c1f2fc93cd 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -296,5 +296,9 @@ extern RelOptInfo *build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, RelOptInfo *inner_rel, RelOptInfo *parent_joinrel, List *restrictlist, SpecialJoinInfo *sjinfo, JoinType jointype); +extern void set_basic_child_rel_properties(PlannerInfo *root, + RelOptInfo *rel, + RelOptInfo *childrel, + AppendRelInfo *appinfo); #endif /* PATHNODE_H */ -- 2.11.0