From 35d5316f88a295576fd2c43d84a8df33e3f48728 Mon Sep 17 00:00:00 2001 From: amit Date: Wed, 26 Jul 2017 14:42:47 +0900 Subject: [PATCH 3/6] WIP: Defer opening and locking partitions to set_append_rel_size This will still create RT entries for the child tables in expand_inherited_rtentry(), though not AppendRelInfos, because building those requires locking and opening the relation. Having all the RT entries created in advance means that setup_simple_rel_arrays knows the size of root->simple_rte_array and root->simple_rel_array to allocate. expand_inherited_rtentry also allocates LeafPartitionInfos and PartitionInfos for individual leaf partitions and partitioned child tables, respectively. All the LeafPartitionInfos and PartitionInfos thus created are stored in the PartitionRootInfo that is created for the parent. PartitionRootInfo was previously called PartitionedChildRelInfo. When set_append_rel_size is called for the root parent, the whole partition tree will be recursively processed, creating a PartitionAppendInfo in each recursive step, which maps a given parent table in the partition tree to its immediate partitions (only those that satisfy the query's WHERE condition). Once we have PartitionAppendInfos for all the parents in the tree, we resume set_append_rel_size() processing, creating RelOptInfos and AppendRelInfos for the root parent's children and recursively doing the same for its partitioned children and so on. 
--- src/backend/catalog/partition.c | 20 ++ src/backend/nodes/copyfuncs.c | 17 -- src/backend/nodes/equalfuncs.c | 12 - src/backend/nodes/outfuncs.c | 59 ++++- src/backend/optimizer/path/allpaths.c | 389 +++++++++++++++++++++++++++++++-- src/backend/optimizer/plan/planner.c | 115 +++++++++- src/backend/optimizer/plan/setrefs.c | 26 +++ src/backend/optimizer/prep/prepunion.c | 300 +++++++++++++++---------- src/backend/optimizer/util/plancat.c | 37 ++++ src/backend/optimizer/util/relnode.c | 91 +++++++- src/backend/utils/cache/lsyscache.c | 50 +++++ src/include/catalog/partition.h | 4 + src/include/nodes/nodes.h | 5 +- src/include/nodes/relation.h | 95 +++++++- src/include/optimizer/plancat.h | 1 + src/include/optimizer/prep.h | 3 + src/include/utils/lsyscache.h | 2 + src/test/regress/expected/insert.out | 4 +- 18 files changed, 1029 insertions(+), 201 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index 9645381fcb..a193f02551 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -1141,6 +1141,26 @@ RelationGetPartitionDispatchInfo(Relation rel, return pd; } +/* + * get_partitions_for_keys + * Returns the list of indexes (from pd->indexes) of the partitions that + * will need to be scanned for the given scan keys. + * + * TODO: add the interface to pass the query scan keys and the logic to look + * up partitions using those keys. 
+ */ +List * +get_partitions_for_keys(PartitionDispatch pd) +{ + int i; + List *result = NIL; + + for (i = 0; i < pd->partdesc->nparts; i++) + result = lappend_int(result, pd->indexes[i]); + + return result; +} + /* Module-local functions */ /* diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c index f9ddf4ed76..4c888ec3dc 100644 --- a/src/backend/nodes/copyfuncs.c +++ b/src/backend/nodes/copyfuncs.c @@ -2251,20 +2251,6 @@ _copyAppendRelInfo(const AppendRelInfo *from) } /* - * _copyPartitionedChildRelInfo - */ -static PartitionedChildRelInfo * -_copyPartitionedChildRelInfo(const PartitionedChildRelInfo *from) -{ - PartitionedChildRelInfo *newnode = makeNode(PartitionedChildRelInfo); - - COPY_SCALAR_FIELD(parent_relid); - COPY_NODE_FIELD(child_rels); - - return newnode; -} - -/* * _copyPlaceHolderInfo */ static PlaceHolderInfo * @@ -4996,9 +4982,6 @@ copyObjectImpl(const void *from) case T_AppendRelInfo: retval = _copyAppendRelInfo(from); break; - case T_PartitionedChildRelInfo: - retval = _copyPartitionedChildRelInfo(from); - break; case T_PlaceHolderInfo: retval = _copyPlaceHolderInfo(from); break; diff --git a/src/backend/nodes/equalfuncs.c b/src/backend/nodes/equalfuncs.c index 8d92c03633..fb248f31f3 100644 --- a/src/backend/nodes/equalfuncs.c +++ b/src/backend/nodes/equalfuncs.c @@ -905,15 +905,6 @@ _equalAppendRelInfo(const AppendRelInfo *a, const AppendRelInfo *b) } static bool -_equalPartitionedChildRelInfo(const PartitionedChildRelInfo *a, const PartitionedChildRelInfo *b) -{ - COMPARE_SCALAR_FIELD(parent_relid); - COMPARE_NODE_FIELD(child_rels); - - return true; -} - -static bool _equalPlaceHolderInfo(const PlaceHolderInfo *a, const PlaceHolderInfo *b) { COMPARE_SCALAR_FIELD(phid); @@ -3155,9 +3146,6 @@ equal(const void *a, const void *b) case T_AppendRelInfo: retval = _equalAppendRelInfo(a, b); break; - case T_PartitionedChildRelInfo: - retval = _equalPartitionedChildRelInfo(a, b); - break; case T_PlaceHolderInfo: retval = 
_equalPlaceHolderInfo(a, b); break; diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 9ee3e23761..2480fd6429 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2213,7 +2213,7 @@ _outPlannerInfo(StringInfo str, const PlannerInfo *node) WRITE_NODE_FIELD(full_join_clauses); WRITE_NODE_FIELD(join_info_list); WRITE_NODE_FIELD(append_rel_list); - WRITE_NODE_FIELD(pcinfo_list); + WRITE_NODE_FIELD(prinfo_list); WRITE_NODE_FIELD(rowMarks); WRITE_NODE_FIELD(placeholder_list); WRITE_NODE_FIELD(fkey_list); @@ -2287,6 +2287,12 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_NODE_FIELD(joininfo); WRITE_BOOL_FIELD(has_eclass_joins); WRITE_BITMAPSET_FIELD(top_parent_relids); + WRITE_INT_FIELD(num_parted); + /* don't bother printing partition_infos */ + WRITE_INT_FIELD(num_leaf_parts); + /* don't bother printing leaf_part_infos */ + WRITE_NODE_FIELD(live_partition_painfos); + WRITE_UINT_FIELD(root_parent_relid); } static void @@ -2512,12 +2518,44 @@ _outAppendRelInfo(StringInfo str, const AppendRelInfo *node) } static void -_outPartitionedChildRelInfo(StringInfo str, const PartitionedChildRelInfo *node) +_outPartitionInfo(StringInfo str, const PartitionInfo *node) { - WRITE_NODE_TYPE("PARTITIONEDCHILDRELINFO"); + WRITE_NODE_TYPE("PARTITIONINFO"); + + WRITE_BOOL_FIELD(is_other_temp); + WRITE_UINT_FIELD(relid); + /* Don't bother writing out the PartitionDispatch object */ +} + +static void +_outLeafPartitionInfo(StringInfo str, const LeafPartitionInfo *node) +{ + WRITE_NODE_TYPE("LEAFPARTITIONINFO"); + + WRITE_BOOL_FIELD(is_other_temp); + WRITE_OID_FIELD(reloid); + WRITE_UINT_FIELD(relid); +} + +static void +_outPartitionAppendInfo(StringInfo str, const PartitionAppendInfo *node) +{ + WRITE_NODE_TYPE("PARTITIONAPPENDINFO"); + + WRITE_UINT_FIELD(parent_relid); + WRITE_NODE_FIELD(live_partition_relids); +} + +static void +_outPartitionRootInfo(StringInfo str, const PartitionRootInfo *node) +{ + 
WRITE_NODE_TYPE("PARTITIONROOTINFO"); WRITE_UINT_FIELD(parent_relid); - WRITE_NODE_FIELD(child_rels); + WRITE_NODE_FIELD(partition_infos); + WRITE_NODE_FIELD(partitioned_relids); + WRITE_NODE_FIELD(leaf_part_infos); + WRITE_NODE_FIELD(orig_leaf_part_oids); } static void @@ -4045,8 +4083,17 @@ outNode(StringInfo str, const void *obj) case T_AppendRelInfo: _outAppendRelInfo(str, obj); break; - case T_PartitionedChildRelInfo: - _outPartitionedChildRelInfo(str, obj); + case T_PartitionInfo: + _outPartitionInfo(str, obj); + break; + case T_LeafPartitionInfo: + _outLeafPartitionInfo(str, obj); + break; + case T_PartitionAppendInfo: + _outPartitionAppendInfo(str, obj); + break; + case T_PartitionRootInfo: + _outPartitionRootInfo(str, obj); break; case T_PlaceHolderInfo: _outPlaceHolderInfo(str, obj); diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 2d7e1d84d0..c5c50e3b9d 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -20,6 +20,7 @@ #include "access/sysattr.h" #include "access/tsmapi.h" +#include "catalog/partition.h" #include "catalog/pg_class.h" #include "catalog/pg_operator.h" #include "catalog/pg_proc.h" @@ -43,6 +44,8 @@ #include "parser/parse_clause.h" #include "parser/parsetree.h" #include "rewrite/rewriteManip.h" +#include "storage/lmgr.h" +#include "utils/builtins.h" #include "utils/lsyscache.h" @@ -845,6 +848,172 @@ set_foreign_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) } /* + * get_rel_partitions_recurse + * Find partitions of the partitioned table described in partinfo, + * recursing for those partitions that are themselves partitioned tables + * + * rootrel is the root of the partition tree of which this table is a part. + * We create a PartitionAppendInfo for this partitioned table and append it to + * rootrel->live_partition_painfos. + * + * List of the leaf partitions of this table will be returned. 
+ */ +static List * +get_rel_partitions_recurse(RelOptInfo *rootrel, + PartitionInfo *partinfo, + PartitionInfo **all_partinfos, + LeafPartitionInfo **leaf_part_infos) +{ + PartitionAppendInfo *painfo; + List *indexes; + List *result = NIL, + *my_live_partitions = NIL; + ListCell *l; + + /* + * Create a PartitionAppendInfo to map this table to the child tables + * that will be its Append children. + */ + painfo = makeNode(PartitionAppendInfo); + painfo->parent_relid = partinfo->relid; + + /* They will all be under the root table's Append node. */ + rootrel->live_partition_painfos = lappend(rootrel->live_partition_painfos, + painfo); + + /* + * TODO: collect the keys by looking at the clauses in + * rootrel->baserestrictinfo considering this table's partition keys. + */ + + /* Ask partition.c which partitions it thinks match the keys. */ + indexes = get_partitions_for_keys(partinfo->pd); + + /* Collect leaf partitions in the result list and recurse for others. */ + foreach(l, indexes) + { + int index = lfirst_int(l); + + if (index >= 0) + { + LeafPartitionInfo *lpinfo = leaf_part_infos[index]; + + if (!lpinfo->is_other_temp) + { + result = lappend_oid(result, lpinfo->reloid); + my_live_partitions = lappend_int(my_live_partitions, + lpinfo->relid); + } + } + else + { + PartitionInfo *recurse_partinfo = all_partinfos[-index]; + List *my_leaf_partitions; + + if (!recurse_partinfo->is_other_temp) + { + my_live_partitions = lappend_int(my_live_partitions, + recurse_partinfo->relid); + my_leaf_partitions = get_rel_partitions_recurse(rootrel, + recurse_partinfo, + all_partinfos, + leaf_part_infos); + result = list_concat(result, my_leaf_partitions); + } + } + } + + painfo->live_partition_relids = my_live_partitions; + + return result; +} + +/* + * get_rel_partitions + * Recursively find partitions of rel + */ +static List * +get_rel_partitions(RelOptInfo *rel) +{ + return get_rel_partitions_recurse(rel, + rel->partition_infos[0], + rel->partition_infos, + 
rel->leaf_part_infos); +} + +/* + * find_partitions_for_query + * Find and lock partitions of rel relevant to this query + * + * Note that we only ever need to lock the leaf partitions, because the + * partitioned tables in the partition tree have already been locked. + */ +static void +find_partitions_for_query(PlannerInfo *root, RelOptInfo *rel) +{ + List *leaf_part_oids = NIL; + ListCell *l; + PlanRowMark *rc = NULL; + int lockmode; + int num_leaf_parts, + i; + Oid *leaf_part_oids_array; + PartitionRootInfo *prinfo = NULL; + + /* Find partitions. */ + Assert(rel->partition_infos != NULL); + leaf_part_oids = get_rel_partitions(rel); + + /* Convert the list to an array and sort for binary searching later. */ + num_leaf_parts = list_length(leaf_part_oids); + leaf_part_oids_array = (Oid *) palloc(num_leaf_parts * sizeof(Oid)); + i = 0; + foreach(l, leaf_part_oids) + { + leaf_part_oids_array[i++] = lfirst_oid(l); + } + qsort(leaf_part_oids_array, num_leaf_parts, sizeof(Oid), oid_cmp); + + /* + * Now lock partitions. Note that rel cannot be a result relation or we + * wouldn't be here (inheritance_planner is where result relations go). + */ + rc = get_plan_rowmark(root->rowMarks, rel->relid); + if (rc && RowMarkRequiresRowShareLock(rc->markType)) + lockmode = RowShareLock; + else + lockmode = AccessShareLock; + + /* + * We lock leaf partitions in the order in which find_all_inheritors + * found them in expand_inherited_rtentry(). Find that list by locating + * the PartitionRootInfo for this table. + */ + foreach(l, root->prinfo_list) + { + prinfo = lfirst(l); + + if (rel->relid == prinfo->parent_relid) + break; + } + Assert(prinfo != NULL && rel->relid == prinfo->parent_relid); + foreach(l, prinfo->orig_leaf_part_oids) + { + Oid relid = lfirst_oid(l); + Oid *test; + + /* Will this leaf partition be scanned? */ + test = (Oid *) bsearch(&relid, + leaf_part_oids_array, + num_leaf_parts, + sizeof(Oid), oid_cmp); + /* Yep, so lock. 
*/ + if (test != NULL) + LockRelationOid(relid, lockmode); + } +} + +/* * set_append_rel_size * Set size estimates for a simple "append relation" * @@ -866,6 +1035,158 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, double *parent_attrsizes; int nattrs; ListCell *l; + List *rel_appinfos = NIL; + + /* + * Collect a list child AppendRelInfo's, which in the non-partitioned + * case will be found in root->append_rel_list. In the partitioned + * table's case, we didn't build any AppendRelInfo's yet. We will + * do the same after figuring out which of the table's child tables + * (aka partitions) will need to be scanned for this query. + */ + if (rte->relkind != RELKIND_PARTITIONED_TABLE) + { + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid == parentRTindex) + rel_appinfos = lappend(rel_appinfos, appinfo); + } + } + else + { + Index root_parent_relid; + List *live_partitions, + *parent_vars; + Relation parent; + + /* + * If this is a partitioned table root, we will determine all the + * partitions in this partition tree that we need to scan for this + * query. Among those, partitions that have not yet been locked (viz. + * the leaf partitions), will be. + */ + if (rel->partition_infos != NULL) + { + PartitionAppendInfo *painfo; + + root_parent_relid = rti; + + find_partitions_for_query(root, rel); + painfo = linitial(rel->live_partition_painfos); + Assert(rti == painfo->parent_relid); + live_partitions = painfo->live_partition_relids; + + parent = rel->partition_infos[0]->pd->reldesc; + } + else + { + int i; + RelOptInfo *rootrel; + + root_parent_relid = rel->root_parent_relid; + rootrel = root->simple_rel_array[root_parent_relid]; + + /* + * Just need to get hold of the PartitionAppendInfo via the root + * parent's RelOptInfo. 
+ */ + i = 0; + foreach(l, rootrel->live_partition_painfos) + { + PartitionAppendInfo *painfo = lfirst(l); + + if (rti == painfo->parent_relid) + { + live_partitions = painfo->live_partition_relids; + break; + } + + /* Skip to the index of this table's PartitionInfo. */ + i++; + } + + /* + * For non-root parttioned tables, we already have a relcache + * pointer that RelationGetPartitionDispatchInfo() acquired for + * us. + */ + parent = rootrel->partition_infos[i]->pd->reldesc; + } + + /* + * Create an AppendRelInfo and a RelOptInfo for every candidate + * partition. + */ + parent_vars = build_rel_vars(parent, rti); + foreach(l, live_partitions) + { + Index childRTindex = lfirst_int(l); + RangeTblEntry *childrte = planner_rt_fetch(childRTindex, root); + Relation child; + AppendRelInfo *appinfo; + RelOptInfo *childrel; + + child = heap_open(childrte->relid, NoLock); /* already locked! */ + appinfo = makeNode(AppendRelInfo); + appinfo->parent_relid = rti; + appinfo->child_relid = childRTindex; + appinfo->parent_reltype = parent->rd_rel->reltype; + appinfo->child_reltype = child->rd_rel->reltype; + appinfo->translated_vars = map_partition_varattnos(parent_vars, + rti, + child, parent, + NULL); + ChangeVarNodes((Node *) appinfo->translated_vars, + rti, childRTindex, 0); + appinfo->parent_reloid = rte->relid; + + /* For the main loop below that does per-child table processing. */ + rel_appinfos = lappend(rel_appinfos, appinfo); + + /* + * While at it, also add the appinfo into root->append_rel_list, + * so that any place that obtains a parent's children by looking + * them up in that list are able to do so. + */ + root->append_rel_list = lappend(root->append_rel_list, appinfo); + + /* + * Translate the column permissions bitmaps to the child's attnums + * (we have to build the translated_vars list before we can do + * this). But if this is the parent table, leave copyObject's + * result alone. 
+ * + * Note: we need to do this even though the executor won't run any + * permissions checks on the child RTE. The + * insertedCols/updatedCols bitmaps may be examined for + * trigger-firing purposes. + */ + childrte->selectedCols = translate_col_privs(rte->selectedCols, + appinfo->translated_vars); + childrte->insertedCols = translate_col_privs(rte->insertedCols, + appinfo->translated_vars); + childrte->updatedCols = translate_col_privs(rte->updatedCols, + appinfo->translated_vars); + + childrel = build_simple_rel(root, childRTindex, rel); + childrel->root_parent_relid = root_parent_relid; + Assert(childrel->reloptkind == RELOPT_OTHER_MEMBER_REL); + + /* Copy the data that create_lateral_join_info() created */ + Assert(childrel->direct_lateral_relids == NULL); + childrel->direct_lateral_relids = rel->direct_lateral_relids; + Assert(childrel->lateral_relids == NULL); + childrel->lateral_relids = rel->lateral_relids; + Assert(childrel->lateral_referencers == NULL); + childrel->lateral_referencers = rel->lateral_referencers; + + root->total_table_pages += childrel->pages; + heap_close(child, NoLock); + } + } Assert(IS_SIMPLE_REL(rel)); @@ -889,7 +1210,7 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, nattrs = rel->max_attr - rel->min_attr + 1; parent_attrsizes = (double *) palloc0(nattrs * sizeof(double)); - foreach(l, root->append_rel_list) + foreach(l, rel_appinfos) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); int childRTindex; @@ -902,10 +1223,6 @@ set_append_rel_size(PlannerInfo *root, RelOptInfo *rel, ListCell *childvars; ListCell *lc; - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) - continue; - childRTindex = appinfo->child_relid; childRTE = root->simple_rte_array[childRTindex]; @@ -1211,24 +1528,61 @@ set_append_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, int parentRTindex = rti; List *live_childrels = NIL; ListCell *l; + List *append_rel_children = NIL; + + if 
(rte->relkind != RELKIND_PARTITIONED_TABLE) + { + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid == parentRTindex) + append_rel_children = lappend_int(append_rel_children, + appinfo->child_relid); + } + } + else + { + /* For a partitioned table, first find its PartitionAppendInfo */ + if (rel->live_partition_painfos != NIL) + { + PartitionAppendInfo *painfo; + + /* This is the root partitioned rel. */ + painfo = linitial(rel->live_partition_painfos); + append_rel_children = painfo->live_partition_relids; + } + else + { + RelOptInfo *rootrel; + + /* Non-root partitioned table. Get it from the root rel. */ + rootrel = root->simple_rel_array[rel->root_parent_relid]; + foreach(l, rootrel->live_partition_painfos) + { + PartitionAppendInfo *painfo = lfirst(l); + + if (rti == painfo->parent_relid) + { + append_rel_children = painfo->live_partition_relids; + break; + } + } + } + } /* * Generate access paths for each member relation, and remember the * non-dummy children. */ - foreach(l, root->append_rel_list) + foreach(l, append_rel_children) { - AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); - int childRTindex; + int childRTindex = lfirst_int(l); RangeTblEntry *childRTE; RelOptInfo *childrel; - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) - continue; - /* Re-locate the child RTE and RelOptInfo */ - childRTindex = appinfo->child_relid; childRTE = root->simple_rte_array[childRTindex]; childrel = root->simple_rel_array[childRTindex]; @@ -1289,7 +1643,14 @@ add_paths_to_append_rel(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte; rte = planner_rt_fetch(rel->relid, root); - if (rte->relkind == RELKIND_PARTITIONED_TABLE) + + /* + * Note that get_partitioned_child_rels must be called only for root + * partitioned tables and only those have rel->live_partition_painfos + * set. 
+ */ + if (rte->relkind == RELKIND_PARTITIONED_TABLE && + rel->live_partition_painfos != NIL) { partitioned_rels = get_partitioned_child_rels(root, rel->relid); /* The root partitioned table is included as a child rel */ diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index 966230256e..1a85c83c50 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -520,7 +520,7 @@ subquery_planner(PlannerGlobal *glob, Query *parse, root->multiexpr_params = NIL; root->eq_classes = NIL; root->append_rel_list = NIL; - root->pcinfo_list = NIL; + root->prinfo_list = NIL; root->rowMarks = NIL; memset(root->upper_rels, 0, sizeof(root->upper_rels)); memset(root->upper_targets, 0, sizeof(root->upper_targets)); @@ -1056,6 +1056,102 @@ inheritance_planner(PlannerInfo *root) Index rti; RangeTblEntry *parent_rte; List *partitioned_rels = NIL; + List *rel_appinfos = NIL; + ListCell *l; + + parent_rte = rt_fetch(parentRTindex, root->parse->rtable); + if (parent_rte->relkind != RELKIND_PARTITIONED_TABLE) + { + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid == parentRTindex) + rel_appinfos = lappend(rel_appinfos, appinfo); + } + } + else + { + PartitionRootInfo *prinfo = NULL; + Relation parent; + List *parent_vars; + + /* Find the PartitionRootInfo for this parent. 
*/ + foreach(l, root->prinfo_list) + { + prinfo = lfirst(l); + + if (prinfo->parent_relid == parentRTindex) + break; + } + Assert(prinfo != NULL && prinfo->parent_relid == parentRTindex); + + parent = heap_open(parent_rte->relid, NoLock); + parent_vars = build_rel_vars(parent, parentRTindex); + foreach(l, prinfo->leaf_part_infos) + { + LeafPartitionInfo *lpinfo = lfirst(l); + Index childRTindex = lpinfo->relid; + RangeTblEntry *childrte = planner_rt_fetch(childRTindex, root); + Relation child; + AppendRelInfo *appinfo; + + if (childrte->relkind == RELKIND_PARTITIONED_TABLE) + continue; + + /* + * We'll need RowExclusiveLock, because just like the parent, each + * child is a result relation. + */ + child = heap_open(childrte->relid, RowExclusiveLock); + appinfo = makeNode(AppendRelInfo); + appinfo->parent_relid = parentRTindex; + appinfo->child_relid = childRTindex; + appinfo->parent_reltype = parent->rd_rel->reltype; + appinfo->child_reltype = child->rd_rel->reltype; + appinfo->translated_vars = map_partition_varattnos(parent_vars, + parentRTindex, + child, parent, + NULL); + ChangeVarNodes((Node *) appinfo->translated_vars, + parentRTindex, childRTindex, 0); + appinfo->parent_reloid = RelationGetRelid(parent); + + /* For the main loop below that does per-child table planning. */ + rel_appinfos = lappend(rel_appinfos, appinfo); + + /* + * While at it, also add the appinfo into root->append_rel_list, + * so that any places that obtain a parent's children by looking + * them up in that list are able to do so. + */ + root->append_rel_list = lappend(root->append_rel_list, appinfo); + + /* + * Translate the column permissions bitmaps to the child's attnums + * (we have to build the translated_vars list before we can do + * this). But if this is the parent table, leave copyObject's + * result alone. + * + * Note: we need to do this even though the executor won't run any + * permissions checks on the child RTE. 
The + * insertedCols/updatedCols bitmaps may be examined for + * trigger-firing purposes. + */ + childrte->selectedCols = + translate_col_privs(parent_rte->selectedCols, + appinfo->translated_vars); + childrte->insertedCols = + translate_col_privs(parent_rte->insertedCols, + appinfo->translated_vars); + childrte->updatedCols = + translate_col_privs(parent_rte->updatedCols, + appinfo->translated_vars); + heap_close(child, NoLock); + } + heap_close(parent, NoLock); + } Assert(parse->commandType != CMD_INSERT); @@ -1121,14 +1217,13 @@ inheritance_planner(PlannerInfo *root) * opposite in the case of non-partitioned inheritance parent as described * below. */ - parent_rte = rt_fetch(parentRTindex, root->parse->rtable); if (parent_rte->relkind == RELKIND_PARTITIONED_TABLE) nominalRelation = parentRTindex; /* * And now we can get on with generating a plan for each child table. */ - foreach(lc, root->append_rel_list) + foreach(lc, rel_appinfos) { AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(lc); PlannerInfo *subroot; @@ -1136,10 +1231,6 @@ inheritance_planner(PlannerInfo *root) RelOptInfo *sub_final_rel; Path *subpath; - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != parentRTindex) - continue; - /* * We need a working copy of the PlannerInfo so that we can control * propagation of information back to the main copy. @@ -6076,7 +6167,7 @@ plan_cluster_use_sort(Oid tableOid, Oid indexOid) * Returns a list of the RT indexes of the partitioned child relations * with rti as the root parent RT index. * - * Note: Only call this function on RTEs known to be partitioned tables. + * Note: Only call this function on RTEs known to be a root partitioned table. 
*/ List * get_partitioned_child_rels(PlannerInfo *root, Index rti) @@ -6084,13 +6175,13 @@ get_partitioned_child_rels(PlannerInfo *root, Index rti) List *result = NIL; ListCell *l; - foreach(l, root->pcinfo_list) + foreach(l, root->prinfo_list) { - PartitionedChildRelInfo *pc = lfirst(l); + PartitionRootInfo *prinfo = lfirst(l); - if (pc->parent_relid == rti) + if (prinfo->parent_relid == rti) { - result = pc->child_rels; + result = prinfo->partitioned_relids; break; } } diff --git a/src/backend/optimizer/plan/setrefs.c b/src/backend/optimizer/plan/setrefs.c index b0c9e94459..4666e446d7 100644 --- a/src/backend/optimizer/plan/setrefs.c +++ b/src/backend/optimizer/plan/setrefs.c @@ -15,7 +15,9 @@ */ #include "postgres.h" +#include "access/heapam.h" #include "access/transam.h" +#include "catalog/partition.h" #include "catalog/pg_type.h" #include "nodes/makefuncs.h" #include "nodes/nodeFuncs.h" @@ -204,6 +206,10 @@ static bool extract_query_dependencies_walker(Node *node, * to process targetlist and qual expressions. We can assume that the Plan * nodes were just built by the planner and are not multiply referenced, but * it's not so safe to assume that for expression tree nodes. + * + * Finally, we close some relcache references lintering in root. They are + * those of the partitioned tables whose PartitionDispatch objects are + * referenced from within root->prinfo_list. */ Plan * set_plan_references(PlannerInfo *root, Plan *plan) @@ -238,6 +244,26 @@ set_plan_references(PlannerInfo *root, Plan *plan) glob->finalrowmarks = lappend(glob->finalrowmarks, newrc); } + /* + * Close relcache references in PartitionDispatch objects referenced in + * root. + */ + foreach(lc, root->prinfo_list) + { + PartitionRootInfo *prinfo = lfirst(lc); + ListCell *lc1; + + foreach(lc1, prinfo->partition_infos) + { + PartitionInfo *pinfo = lfirst(lc1); + + if (pinfo->pd->reldesc) + heap_close(pinfo->pd->reldesc, NoLock); + /* Shouldn't try to close again. XXX - hack? 
*/ + pinfo->pd->reldesc = NULL; + } + } + /* Now fix the Plan tree */ return set_plan_refs(root, plan, rtoffset); } diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c index 0d20ffa2f7..01de2d778d 100644 --- a/src/backend/optimizer/prep/prepunion.c +++ b/src/backend/optimizer/prep/prepunion.c @@ -105,8 +105,6 @@ static void make_inh_translation_list(Relation oldrelation, Relation newrelation, Index newvarno, List **translated_vars); -static Bitmapset *translate_col_privs(const Bitmapset *parent_privs, - List *translated_vars); static Node *adjust_appendrel_attrs_mutator(Node *node, adjust_appendrel_attrs_context *context); static Relids adjust_child_relids(Relids relids, int nappinfos, @@ -1352,11 +1350,19 @@ expand_inherited_tables(PlannerInfo *root) /* * expand_inherited_rtentry - * Check whether a rangetable entry represents an inheritance set. - * If so, add entries for all the child tables to the query's - * rangetable, and build AppendRelInfo nodes for all the child tables - * and add them to root->append_rel_list. If not, clear the entry's - * "inh" flag to prevent later code from looking for AppendRelInfos. + * Perform actions necessary for applying this query to an inheritance + * set if the rte represents one + * + * That includes adding entries for all the child tables to the query's + * rangetable. Also, if this query requires a PlanRowMark, generate the same + * for each child table and append them to the planner's global list + * (root->rowMarks). If the inheritance set is really a partitioned table, + * our work here is done. If not, we also create AppendRelInfo nodes for + * all the child tables and add them to root->append_rel_list. + * + * If it turns out that the rte is not (or no longer) an inheritance set, + * clear the entry's "inh" flag to prevent later code from looking for + * AppendRelInfos. * * Note that the original RTE is considered to represent the whole * inheritance set. 
The first of the generated RTEs is an RTE for the same @@ -1381,9 +1387,14 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) List *inhOIDs; List *appinfos; ListCell *l; - bool has_child; - PartitionedChildRelInfo *pcinfo; List *partitioned_child_rels = NIL; + List *partition_infos = NIL; + List *leaf_part_infos = NIL; + List *orig_leaf_part_oids; + int num_partitioned_children, + i; + PartitionDispatch *pds; + PartitionInfo *pinfo; /* Does RT entry allow inheritance? */ if (!rte->inh) @@ -1408,6 +1419,10 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) * relation named in the query. However, for each child relation we add * to the query, we must obtain an appropriate lock, because this will be * the first use of those relations in the parse/rewrite/plan pipeline. + * For a partitioned table, we defer locking non-partitioned child tables + * (aka leaf partitions) to when we actually know that they will be + * scanned for this query. We do that by passing 'true' for + * lock_only_partitioned_children. * * If the parent relation is the query's result relation, then we need * RowExclusiveLock. 
Otherwise, if it's accessed FOR UPDATE/SHARE, we @@ -1425,7 +1440,8 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) lockmode = AccessShareLock; /* Scan for all members of inheritance set, acquire needed locks */ - inhOIDs = find_all_inheritors(parentOID, lockmode, false, NULL, NULL); + inhOIDs = find_all_inheritors(parentOID, lockmode, true, NULL, + &num_partitioned_children); /* * Check that there's at least one descendant, else treat as no-child @@ -1460,28 +1476,43 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) if (rte->relkind == RELKIND_PARTITIONED_TABLE) { List *leaf_part_oids; - int num_parted, - i; - PartitionDispatch *pds; + int num_parted; + Relation rootrel; + + /* + * Keep leaf partition OIDs around so that we can lock them in this + * order when we eventually do it. + */ + orig_leaf_part_oids = list_copy_tail(inhOIDs, + num_partitioned_children + 1); - /* Discard the original list. */ - list_free(inhOIDs); + /* Discard the original inhOIDs list. */ inhOIDs = NIL; - /* Request partitioning information. */ - pds = RelationGetPartitionDispatchInfo(oldrelation, &num_parted, + /* + * Request partitioning information. We don't pass oldrelation, + * because we want to keep the relcache pointer in PartitionDispatch + * open until much later, but we'll be closing oldrelation before + * returning from this function. + */ + rootrel = heap_open(rte->relid, NoLock); + pds = RelationGetPartitionDispatchInfo(rootrel, &num_parted, &leaf_part_oids); /* - * First collect the partitioned child table OIDs, which includes the - * root parent at the head. + * We make a PartitionInfo object for every partitioned table in the + * tree, including the root table. Note that we create the root + * table's PartitionInfo outside the loop, because inhOIDs will not + * contain its OID. Also add the original rti to + * partitioned_child_rels. 
*/ - for (i = 0; i < num_parted; i++) - { + pinfo = makeNode(PartitionInfo); + pinfo->relid = rti; + pinfo->pd = pds[0]; + partition_infos = list_make1(pinfo); + partitioned_child_rels = list_make1_int(rti); + for (i = 1; i < num_parted; i++) inhOIDs = lappend_oid(inhOIDs, RelationGetRelid(pds[i]->reldesc)); - if (pds[i]->reldesc != oldrelation) - heap_close(pds[i]->reldesc, NoLock); - } /* Concatenate the leaf partition OIDs. */ inhOIDs = list_concat(inhOIDs, leaf_part_oids); @@ -1489,20 +1520,16 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) /* Scan the inheritance set and expand it */ appinfos = NIL; - has_child = false; + i = 1; foreach(l, inhOIDs) { Oid childOID = lfirst_oid(l); Relation newrelation; - RangeTblEntry *childrte; - Index childRTindex; + RangeTblEntry *childrte = NULL; + Index childRTindex = 0; AppendRelInfo *appinfo; - - /* Open rel if needed; we already have required locks */ - if (childOID != parentOID) - newrelation = heap_open(childOID, NoLock); - else - newrelation = oldrelation; + bool is_other_temp; + char child_relkind = get_rel_relkind(childOID); /* * It is possible that the parent table has children that are temp @@ -1510,11 +1537,7 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) * (because of buffering issues), and the best thing to do seems to be * to silently ignore them. */ - if (childOID != parentOID && RELATION_IS_OTHER_TEMP(newrelation)) - { - heap_close(newrelation, lockmode); - continue; - } + is_other_temp = rel_is_other_temp(childOID); /* * Build an RTE for the child, and attach to query's rangetable list. @@ -1528,64 +1551,22 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) * The parent securityQuals will be propagated to children along with * other base restriction clauses, so we don't need to do it here. 
*/ - childrte = copyObject(rte); - childrte->relid = childOID; - childrte->relkind = newrelation->rd_rel->relkind; - childrte->inh = false; - childrte->requiredPerms = 0; - childrte->securityQuals = NIL; - parse->rtable = lappend(parse->rtable, childrte); - childRTindex = list_length(parse->rtable); - - /* - * Build an AppendRelInfo for this parent and child, unless the child - * is a partitioned table. - */ - if (childrte->relkind != RELKIND_PARTITIONED_TABLE) + if (!is_other_temp) { - /* Remember if we saw a real child. */ - if (childOID != parentOID) - has_child = true; - - appinfo = makeNode(AppendRelInfo); - appinfo->parent_relid = rti; - appinfo->child_relid = childRTindex; - appinfo->parent_reltype = oldrelation->rd_rel->reltype; - appinfo->child_reltype = newrelation->rd_rel->reltype; - make_inh_translation_list(oldrelation, newrelation, childRTindex, - &appinfo->translated_vars); - appinfo->parent_reloid = parentOID; - appinfos = lappend(appinfos, appinfo); - - /* - * Translate the column permissions bitmaps to the child's attnums - * (we have to build the translated_vars list before we can do - * this). But if this is the parent table, leave copyObject's - * result alone. - * - * Note: we need to do this even though the executor won't run any - * permissions checks on the child RTE. The - * insertedCols/updatedCols bitmaps may be examined for - * trigger-firing purposes. 
- */ - if (childOID != parentOID) - { - childrte->selectedCols = translate_col_privs(rte->selectedCols, - appinfo->translated_vars); - childrte->insertedCols = translate_col_privs(rte->insertedCols, - appinfo->translated_vars); - childrte->updatedCols = translate_col_privs(rte->updatedCols, - appinfo->translated_vars); - } + childrte = copyObject(rte); + childrte->relid = childOID; + childrte->relkind = get_rel_relkind(childOID); + childrte->inh = (childrte->relkind == RELKIND_PARTITIONED_TABLE); + childrte->requiredPerms = 0; + childrte->securityQuals = NIL; + parse->rtable = lappend(parse->rtable, childrte); + childRTindex = list_length(parse->rtable); } - else - partitioned_child_rels = lappend_int(partitioned_child_rels, - childRTindex); /* * Build a PlanRowMark if parent is marked FOR UPDATE/SHARE. */ - if (oldrc) + if (!is_other_temp && oldrc) { PlanRowMark *newrc = makeNode(PlanRowMark); @@ -1606,12 +1587,89 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) */ newrc->isParent = (childrte->relkind == RELKIND_PARTITIONED_TABLE); - /* Include child's rowmark type in parent's allMarkTypes */ - oldrc->allMarkTypes |= newrc->allMarkTypes; root->rowMarks = lappend(root->rowMarks, newrc); } + /* + * No need to create AppendRelInfo for partitions at this point. We + * will create one if and when we know we'll need it. The fact that + * this is a child table of the parent table will be recorded in the + * PartitionRootInfo that will be created for the parent table. + */ + if (rel_is_partition(childOID) && + child_relkind != RELKIND_PARTITIONED_TABLE) + { + LeafPartitionInfo *lpinfo = makeNode(LeafPartitionInfo); + + lpinfo->is_other_temp = is_other_temp; + lpinfo->reloid = childOID; + lpinfo->relid = childRTindex; + leaf_part_infos = lappend(leaf_part_infos, lpinfo); + continue; + } + + /* Create the PartitionInfo of this child partitioned table. 
*/ + if (child_relkind == RELKIND_PARTITIONED_TABLE) + { + PartitionInfo *pinfo = makeNode(PartitionInfo); + + pinfo->is_other_temp = is_other_temp; + pinfo->relid = childRTindex; + pinfo->pd = pds[i++]; + partition_infos = lappend(partition_infos, pinfo); + + partitioned_child_rels = lappend_int(partitioned_child_rels, + childRTindex); + continue; + } + + if (is_other_temp) + continue; + + /* + * Getting here means this is a non-partitioned child table that is + * not a partition. Build an AppendRelInfo for the same to remember + * the parent-child relationship. + */ + + /* Open rel if needed, we already have required locks */ + if (childOID != parentOID) + newrelation = heap_open(childOID, NoLock); + else + newrelation = oldrelation; + + appinfo = makeNode(AppendRelInfo); + appinfo->parent_relid = rti; + appinfo->child_relid = childRTindex; + appinfo->parent_reltype = oldrelation->rd_rel->reltype; + appinfo->child_reltype = newrelation->rd_rel->reltype; + make_inh_translation_list(oldrelation, newrelation, childRTindex, + &appinfo->translated_vars); + appinfo->parent_reloid = parentOID; + appinfos = lappend(appinfos, appinfo); + + /* + * Translate the column permissions bitmaps to the child's attnums + * (we have to build the translated_vars list before we can do + * this). But if this is the parent table, leave copyObject's + * result alone. + * + * Note: we need to do this even though the executor won't run any + * permissions checks on the child RTE. The + * insertedCols/updatedCols bitmaps may be examined for + * trigger-firing purposes. 
+ */ + if (childOID != parentOID) + { + childrte->selectedCols = translate_col_privs(rte->selectedCols, + appinfo->translated_vars); + childrte->insertedCols = translate_col_privs(rte->insertedCols, + appinfo->translated_vars); + childrte->updatedCols = translate_col_privs(rte->updatedCols, + appinfo->translated_vars); + } + /* Close child relations, but keep locks */ if (childOID != parentOID) heap_close(newrelation, NoLock); @@ -1620,35 +1678,49 @@ expand_inherited_rtentry(PlannerInfo *root, RangeTblEntry *rte, Index rti) heap_close(oldrelation, NoLock); /* - * If all the children were temp tables or a partitioned parent did not - * have any leaf partitions, pretend it's a non-inheritance situation; we - * don't need Append node in that case. The duplicate RTE we added for - * the parent table is harmless, so we don't bother to get rid of it; - * ditto for the useless PlanRowMark node. + * We keep a list of objects in root, each of which maps a partitioned + * parent RT index to a bunch of information about the partition tree + * rooted at that parent. The information includes a list of RT indexes + * of partitioned tables appearing in the tree, a list of PartitionInfo + * objects for each such partitioned table, a list of LeafPartitionInfo + * objects for each leaf partition in tree, and finally a list containing + * leaf partition OIDs in an order in which find_all_inheritors() returned + * them. The first of these is used when creating an Append or a + * ModifyTable path for the parent to be copied verbatim into the path + * (and subsequently the plan) so that it could be carried over to the + * executor. That list is the only place where the executor could find + * partitioned child tables to lock them. 
*/ - if (!has_child) + if (rte->relkind == RELKIND_PARTITIONED_TABLE) { - /* Clear flag before returning */ - rte->inh = false; + PartitionRootInfo *prinfo = makeNode(PartitionRootInfo); + + Assert(list_length(partition_infos) >= 1); + prinfo->parent_relid = rti; + prinfo->partitioned_relids = partitioned_child_rels; + prinfo->partition_infos = partition_infos; + prinfo->leaf_part_infos = leaf_part_infos; + prinfo->orig_leaf_part_oids = orig_leaf_part_oids; + + root->prinfo_list = lappend(root->prinfo_list, prinfo); + + /* + * Our job here is done, because we didn't create any AppendRelInfos. + */ return; } /* - * We keep a list of objects in root, each of which maps a partitioned - * parent RT index to the list of RT indexes of its partitioned child - * tables. When creating an Append or a ModifyTable path for the parent, - * we copy the child RT index list verbatim to the path so that it could - * be carried over to the executor so that the latter could identify the - * partitioned child tables. + * If all the children were temp tables, pretend it's a non-inheritance + * situation; we don't need Append node in that case. The duplicate + * RTE we added for the parent table is harmless, so we don't bother to + * get rid of it; ditto for the useless PlanRowMark node. */ - if (partitioned_child_rels != NIL) + if (list_length(appinfos) < 2) { - pcinfo = makeNode(PartitionedChildRelInfo); - - Assert(rte->relkind == RELKIND_PARTITIONED_TABLE); - pcinfo->parent_relid = rti; - pcinfo->child_rels = partitioned_child_rels; - root->pcinfo_list = lappend(root->pcinfo_list, pcinfo); + /* Clear flag before returning */ + rte->inh = false; + return; } /* Otherwise, OK to add to root->append_rel_list */ @@ -1769,7 +1841,7 @@ make_inh_translation_list(Relation oldrelation, Relation newrelation, * query is really only going to reference the inherited columns. Instead * we set the per-column bits for all inherited columns. 
*/ -static Bitmapset * +Bitmapset * translate_col_privs(const Bitmapset *parent_privs, List *translated_vars) { diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index a1ebd4acc8..3781a91b76 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -1577,6 +1577,43 @@ build_physical_tlist(PlannerInfo *root, RelOptInfo *rel) } /* + * build_rel_vars + * + * Returns a list containing Var expressions corresponding to a relation's + * attributes. Since the caller may already have the RangeTblEntry, we + * pass the same instead of PlannerInfo to avoid finding it in the range + * table all over again. + */ +List * +build_rel_vars(Relation relation, Index relid) +{ + AttrNumber attrno; + int numattrs; + List *result = NIL; + + numattrs = RelationGetNumberOfAttributes(relation); + for (attrno = 1; attrno <= numattrs; attrno++) + { + Form_pg_attribute att_tup = TupleDescAttr(relation->rd_att, + attrno - 1); + + if (att_tup->attisdropped) + continue; + + result = lappend(result, + makeVar(relid, + attrno, + att_tup->atttypid, + att_tup->atttypmod, + att_tup->attcollation, + 0)); + + } + + return result; +} + +/* * build_index_tlist * * Build a targetlist representing the columns of the specified index. 
diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 8ad0b4a669..1bcda9254f 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -16,7 +16,9 @@ #include <limits.h> +#include "catalog/pg_class.h" #include "miscadmin.h" +#include "nodes/relation.h" #include "optimizer/clauses.h" #include "optimizer/cost.h" #include "optimizer/pathnode.h" @@ -146,6 +148,15 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->baserestrict_min_security = UINT_MAX; rel->joininfo = NIL; rel->has_eclass_joins = false; + /* Set in build_simple_rel if rel is root partitioned table */ + rel->num_parted = 0; + rel->partition_infos = NULL; + rel->num_leaf_parts = 0; + rel->leaf_part_infos = NULL; + /* Set in get_rel_partitions_recurse */ + rel->live_partition_painfos = NIL; + /* Set in set_append_rel_size if rel is a partition. */ + rel->root_parent_relid = 0; /* * Pass top parent's relids down the inheritance hierarchy. If the parent @@ -210,25 +221,83 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) list_length(rte->securityQuals)); /* - * If this rel is an appendrel parent, recurse to build "other rel" - * RelOptInfos for its children. They are "other rels" because they are - * not in the main join tree, but we will need RelOptInfos to plan access - * to them. + * If this rel is an appendrel parent, generate additional information + * based on whether the parent is a partitioned table or not. For + * regular parent tables, recurse to build "other rel" RelOptInfos for its + * children. They are "other rels" because they are not in the main join + * tree, but we will need RelOptInfos to plan access to them. For + * partitioned parent tables, we do not yet create "other rel" RelOptInfos + * for the children. Instead, we set up some information that will be + * used in set_append_rel_size() to look up its partitions. 
*/ if (rte->inh) { ListCell *l; - foreach(l, root->append_rel_list) + if (rte->relkind == RELKIND_PARTITIONED_TABLE) { - AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); + PartitionRootInfo *prinfo = NULL; + LeafPartitionInfo **lpinfos; + int i; + + foreach(l, root->prinfo_list) + { + if (((PartitionRootInfo *) lfirst(l))->parent_relid == relid) + { + prinfo = lfirst(l); + break; + } + } - /* append_rel_list contains all append rels; ignore others */ - if (appinfo->parent_relid != relid) - continue; + /* + * Only the root partitioned tables have an entry in + * root->prinfo_list. For other partitioned table rels, we don't + * need to set the following fields. + */ + if (prinfo == NULL) + return rel; + + Assert(prinfo->parent_relid == relid); + rel->num_parted = list_length(prinfo->partition_infos); + rel->num_leaf_parts = list_length(prinfo->leaf_part_infos); + rel->partition_infos = (PartitionInfo **) + palloc0(rel->num_parted * + sizeof(PartitionInfo *)); + lpinfos = (LeafPartitionInfo **) palloc0(rel->num_leaf_parts * + sizeof(LeafPartitionInfo *)); + i = 0; + foreach(l, prinfo->partition_infos) + { + rel->partition_infos[i++] = lfirst(l); + } + i = 0; + foreach(l, prinfo->leaf_part_infos) + { + lpinfos[i++] = lfirst(l); + } + rel->leaf_part_infos = lpinfos; + + /* + * Don't build RelOptInfo for partitions yet; we don't know which + * ones we'll need. We did create RangeTblEntry's though, so we + * have an empty slot in root->simple_rel_array that will be + * filled eventually if the respective partition is chosen to be + * scanned after all. 
+ */ + } + else + { + foreach(l, root->append_rel_list) + { + AppendRelInfo *appinfo = (AppendRelInfo *) lfirst(l); + + /* append_rel_list contains all append rels; ignore others */ + if (appinfo->parent_relid != relid) + continue; - (void) build_simple_rel(root, appinfo->child_relid, - rel); + (void) build_simple_rel(root, appinfo->child_relid, + rel); + } } } diff --git a/src/backend/utils/cache/lsyscache.c b/src/backend/utils/cache/lsyscache.c index 82763f8013..ebbc3da985 100644 --- a/src/backend/utils/cache/lsyscache.c +++ b/src/backend/utils/cache/lsyscache.c @@ -1817,6 +1817,28 @@ get_rel_relkind(Oid relid) } /* + * rel_is_partition + * + * Returns true if the given relation is a partition (pg_class.relispartition). + */ +bool +rel_is_partition(Oid relid) +{ + HeapTuple tp; + Form_pg_class reltup; + bool result; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for relation %u", relid); + reltup = (Form_pg_class) GETSTRUCT(tp); + result = reltup->relispartition; + ReleaseSysCache(tp); + + return result; +} + +/* * get_rel_tablespace * * Returns the pg_tablespace OID associated with a given relation. 
@@ -1865,6 +1887,34 @@ get_rel_persistence(Oid relid) return result; } +/* + * rel_is_other_temp + * + * Returns whether a relation is a temp table from another session + */ +bool +rel_is_other_temp(Oid relid) +{ + HeapTuple tp; + Form_pg_class reltup; + bool result = false; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + if (!HeapTupleIsValid(tp)) + elog(ERROR, "cache lookup failed for relation %u", relid); + reltup = (Form_pg_class) GETSTRUCT(tp); + + if (reltup->relpersistence == RELPERSISTENCE_TEMP && + !isTempOrTempToastNamespace(reltup->relnamespace)) + { + result = true; + } + + ReleaseSysCache(tp); + + return result; +} + /* ---------- TRANSFORM CACHE ---------- */ diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 1091dd572c..20fc3a89db 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -16,6 +16,7 @@ #include "fmgr.h" #include "executor/tuptable.h" #include "nodes/execnodes.h" +#include "nodes/relation.h" #include "parser/parse_node.h" #include "utils/rel.h" @@ -93,4 +94,7 @@ extern int get_partition_for_tuple(PartitionTupleRoutingInfo **ptrinfos, EState *estate, PartitionTupleRoutingInfo **failed_at, TupleTableSlot **failed_slot); + +/* Planner support stuff. 
*/ +extern List *get_partitions_for_keys(PartitionDispatch pd); #endif /* PARTITION_H */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 27bd4f3363..e957615ac6 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -260,7 +260,10 @@ typedef enum NodeTag T_PlaceHolderVar, T_SpecialJoinInfo, T_AppendRelInfo, - T_PartitionedChildRelInfo, + T_PartitionInfo, + T_LeafPartitionInfo, + T_PartitionAppendInfo, + T_PartitionRootInfo, T_PlaceHolderInfo, T_MinMaxAggInfo, T_PlannerParamItem, diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h index a39e59d8ac..a67a43b069 100644 --- a/src/include/nodes/relation.h +++ b/src/include/nodes/relation.h @@ -251,7 +251,7 @@ typedef struct PlannerInfo List *append_rel_list; /* list of AppendRelInfos */ - List *pcinfo_list; /* list of PartitionedChildRelInfos */ + List *prinfo_list; /* list of PartitionRootInfos */ List *rowMarks; /* list of PlanRowMarks */ @@ -515,6 +515,9 @@ typedef enum RelOptKind /* Is the given relation an "other" relation? */ #define IS_OTHER_REL(rel) ((rel)->reloptkind == RELOPT_OTHER_MEMBER_REL) +typedef struct PartitionInfo PartitionInfo; +typedef struct LeafPartitionInfo LeafPartitionInfo; + typedef struct RelOptInfo { NodeTag type; @@ -592,6 +595,23 @@ typedef struct RelOptInfo /* used by "other" relations */ Relids top_parent_relids; /* Relids of topmost parents */ + + /* Fields set for "root" partitioned relations */ + int num_parted; /* Number of entries in partition_infos */ + PartitionInfo **partition_infos; + int num_leaf_parts; /* Number of entries in leaf_part_infos */ + LeafPartitionInfo **leaf_part_infos; /* LeafPartitionInfos */ + + /* Fields set for partitioned relations (list of PartitionAppendInfo's) */ + List *live_partition_painfos; + + /* Fields set for partition otherrels */ + + /* + * RT index of the root partitioned table in the partition tree of + * which this rel is a member. 
+ */ + Index root_parent_relid; } RelOptInfo; /* @@ -2012,24 +2032,75 @@ typedef struct AppendRelInfo Oid parent_reloid; /* OID of parent relation */ } AppendRelInfo; +/* Forward declarations, to avoid including other headers */ +typedef struct PartitionDispatchData *PartitionDispatch; + +/* + * PartitionInfo - information about partitioning of one partitioned table in + * a given partition tree + */ +typedef struct PartitionInfo +{ + NodeTag type; + + bool is_other_temp; /* If true, ignore the following fields */ + Index relid; /* Ordinal position in the rangetable */ + PartitionDispatch pd; /* Information about partitions */ +} PartitionInfo; + +/* + * LeafPartitionInfo - (OID, RT index) pair for one leaf partition + * + * Created when a leaf partition's RT entry is created in + * expand_inherited_rtentry(). + */ +typedef struct LeafPartitionInfo +{ + NodeTag type; + + bool is_other_temp; /* If true, ignore the following fields. */ + Oid reloid; /* OID */ + Index relid; /* RT index */ +} LeafPartitionInfo; + /* - * For a partitioned table, this maps its RT index to the list of RT indexes - * of the partitioned child tables in the partition tree. We need to - * separately store this information, because we do not create AppendRelInfos - * for the partitioned child tables of a parent table, since AppendRelInfos - * contain information that is unnecessary for the partitioned child tables. - * The child_rels list must contain at least one element, because the parent - * partitioned table is itself counted as a child. + * PartitionAppendInfo - list of child RT indexes for one partitioned table + * in a given partition tree + */ +typedef struct PartitionAppendInfo +{ + NodeTag type; + + Index parent_relid; + List *live_partition_relids; /* List of RT indexes */ +} PartitionAppendInfo; + +/* + * For a partitioned table, this maps its RT index to the information about + * the partition tree collected in expand_inherited_rtentry(). 
+ * + * That information includes a list of PartitionInfo nodes, one for each + * partitioned table in the partition tree, including for the table itself. + * Also included is a list of RT indexes of the entries for leaf partitions + * that are created at the same time by expand_inherited_rtentry(). + * + * orig_leaf_part_oids contains the list of leaf partition OIDs as it was + * generated by find_all_inheritors(). We keep it around so that we can + * lock leaf partitions in that order when we actually do it. * - * These structs are kept in the PlannerInfo node's pcinfo_list. + * PartitionRootInfo's for different partitioned tables in a query are placed + * in root->prinfo_list. */ -typedef struct PartitionedChildRelInfo +typedef struct PartitionRootInfo { NodeTag type; Index parent_relid; - List *child_rels; -} PartitionedChildRelInfo; + List *partition_infos; + List *partitioned_relids; + List *leaf_part_infos; + List *orig_leaf_part_oids; +} PartitionRootInfo; /* * For each distinct placeholder expression generated during planning, we diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 71f0faf938..e8e30f8f52 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -39,6 +39,7 @@ extern bool relation_excluded_by_constraints(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte); extern List *build_physical_tlist(PlannerInfo *root, RelOptInfo *rel); +extern List *build_rel_vars(Relation relation, Index relid); extern bool has_unique_index(RelOptInfo *rel, AttrNumber attno); diff --git a/src/include/optimizer/prep.h b/src/include/optimizer/prep.h index 4be0afd566..d0af8dc7bc 100644 --- a/src/include/optimizer/prep.h +++ b/src/include/optimizer/prep.h @@ -16,6 +16,7 @@ #include "nodes/plannodes.h" #include "nodes/relation.h" +#include "utils/rel.h" /* @@ -51,6 +52,8 @@ extern PlanRowMark *get_plan_rowmark(List *rowmarks, Index rtindex); extern RelOptInfo *plan_set_operations(PlannerInfo *root); extern 
void expand_inherited_tables(PlannerInfo *root); +extern Bitmapset *translate_col_privs(const Bitmapset *parent_privs, + List *translated_vars); extern Node *adjust_appendrel_attrs(PlannerInfo *root, Node *node, int nappinfos, AppendRelInfo **appinfos); diff --git a/src/include/utils/lsyscache.h b/src/include/utils/lsyscache.h index 07208b56ce..b5b615a6fa 100644 --- a/src/include/utils/lsyscache.h +++ b/src/include/utils/lsyscache.h @@ -126,8 +126,10 @@ extern char *get_rel_name(Oid relid); extern Oid get_rel_namespace(Oid relid); extern Oid get_rel_type_id(Oid relid); extern char get_rel_relkind(Oid relid); +extern bool rel_is_partition(Oid relid); extern Oid get_rel_tablespace(Oid relid); extern char get_rel_persistence(Oid relid); +extern bool rel_is_other_temp(Oid relid); extern Oid get_transform_fromsql(Oid typid, Oid langid, List *trftypes); extern Oid get_transform_tosql(Oid typid, Oid langid, List *trftypes); extern bool get_typisdefined(Oid typid); diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out index a2d9469592..e159d62b66 100644 --- a/src/test/regress/expected/insert.out +++ b/src/test/regress/expected/insert.out @@ -278,12 +278,12 @@ select tableoid::regclass, * from list_parted; -------------+----+---- part_aa_bb | aA | part_cc_dd | cC | 1 - part_null | | 0 - part_null | | 1 part_ee_ff1 | ff | 1 part_ee_ff1 | EE | 1 part_ee_ff2 | ff | 11 part_ee_ff2 | EE | 10 + part_null | | 0 + part_null | | 1 (8 rows) -- some more tests to exercise tuple-routing with multi-level partitioning -- 2.11.0