From 7a22aedc7c1ae8e1568745c99cf1d11d42cf59d9 Mon Sep 17 00:00:00 2001 From: amit Date: Mon, 24 Jul 2017 18:59:57 +0900 Subject: [PATCH 1/3] Decouple RelationGetPartitionDispatchInfo() from executor Currently it and the structure it generates viz. PartitionDispatch objects are too coupled with the executor's tuple-routing code. That include locking considerations and responsibilities for releasing relcache references, etc. That makes it useless for usage in other places such as during planning. --- src/backend/catalog/partition.c | 326 +++++++++++++++++---------------- src/backend/commands/copy.c | 35 ++-- src/backend/executor/execMain.c | 156 ++++++++++++++-- src/backend/executor/nodeModifyTable.c | 29 ++- src/include/catalog/partition.h | 53 ++---- src/include/executor/executor.h | 4 +- src/include/nodes/execnodes.h | 53 +++++- 7 files changed, 409 insertions(+), 247 deletions(-) diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c index e20ddce2db..e07701d5e8 100644 --- a/src/backend/catalog/partition.c +++ b/src/backend/catalog/partition.c @@ -105,6 +105,24 @@ typedef struct PartitionRangeBound bool lower; /* this is the lower (vs upper) bound */ } PartitionRangeBound; +/*----------------------- + * PartitionDispatchData - information of partitions of one partitioned table + * in a partition tree + * + * partkey Partition key of the table + * partdesc Partition descriptor of the table + * indexes Array with partdesc->nparts members (for details on what the + * individual value represents, see the comments in + * RelationGetPartitionDispatchInfo()) + *----------------------- + */ +typedef struct PartitionDispatchData +{ + PartitionKey partkey; /* Points into the table's relcache entry */ + PartitionDesc partdesc; /* Ditto */ + int *indexes; +} PartitionDispatchData; + static int32 qsort_partition_list_value_cmp(const void *a, const void *b, void *arg); static int32 qsort_partition_rbound_cmp(const void *a, const void *b, @@ -972,178 +990,167 @@ get_partition_qual_relid(Oid relid) } /* - * Append OIDs of rel's partitions to the list 'partoids' and for each OID, - * append pointer rel to the list 'parents'. - */ -#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \ - do\ - {\ - int i;\ - for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\ - {\ - (partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\ - (parents) = lappend((parents), (rel));\ - }\ - } while(0) - -/* * RelationGetPartitionDispatchInfo - * Returns information necessary to route tuples down a partition tree + * Returns necessary information for each partition in the partition + * tree rooted at rel * - * All the partitions will be locked with lockmode, unless it is NoLock. - * A list of the OIDs of all the leaf partitions of rel is returned in - * *leaf_part_oids. + * Information returned includes the following: *ptinfos contains a list of + * PartitionedTableInfo objects, one for each partitioned table (with at least + * one member, that is, one for the root partitioned table), *leaf_part_oids + * contains a list of the OIDs of of all the leaf partitions. + * + * Note that we lock only those partitions that are partitioned tables, because + * we need to look at its relcache entry to get its PartitionKey and its + * PartitionDesc. It's the caller's responsibility to lock the leaf partitions + * that will actually be accessed during a given query. */ -PartitionDispatch * +void RelationGetPartitionDispatchInfo(Relation rel, int lockmode, - int *num_parted, List **leaf_part_oids) + List **ptinfos, List **leaf_part_oids) { - PartitionDispatchData **pd; - List *all_parts = NIL, - *all_parents = NIL, - *parted_rels, - *parted_rel_parents; + List *all_parts, + *all_parents; ListCell *lc1, *lc2; int i, - k, offset; /* - * Lock partitions and make a list of the partitioned ones to prepare - * their PartitionDispatch objects below. + * We rely on the relcache to traverse the partition tree, building + * both the leaf partition OIDs list and the PartitionedTableInfo list. + * Starting with the root partitioned table for which we already have the + * relcache entry, we look at its partition descriptor to get the + * partition OIDs. For partitions that are themselves partitioned tables, + * we get their relcache entries after locking them with lockmode and + * queue their partitions to be looked at later. Leaf partitions are + * added to the result list without locking. For each partitioned table, + * we build a PartitionedTableInfo object and add it to the other result + * list. * - * Cannot use find_all_inheritors() here, because then the order of OIDs - * in parted_rels list would be unknown, which does not help, because we - * assign indexes within individual PartitionDispatch in an order that is - * predetermined (determined by the order of OIDs in individual partition - * descriptors). + * Since RelationBuildPartitionDescriptor() puts partitions in a canonical + * order determined by comparing partition bounds, we can rely that + * concurrent backends see the partitions in the same order, ensuring that + * there are no deadlocks when locking the partitions. */ - *num_parted = 1; - parted_rels = list_make1(rel); - /* Root partitioned table has no parent, so NULL for parent */ - parted_rel_parents = list_make1(NULL); - APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents); + i = offset = 0; + *ptinfos = *leaf_part_oids = NIL; + + /* Start with the root table. */ + all_parts = list_make1_oid(RelationGetRelid(rel)); + all_parents = list_make1_oid(InvalidOid); forboth(lc1, all_parts, lc2, all_parents) { - Relation partrel = heap_open(lfirst_oid(lc1), lockmode); - Relation parent = lfirst(lc2); - PartitionDesc partdesc = RelationGetPartitionDesc(partrel); + Oid partrelid = lfirst_oid(lc1); + Oid parentrelid = lfirst_oid(lc2); - /* - * If this partition is a partitioned table, add its children to the - * end of the list, so that they are processed as well. - */ - if (partdesc) + if (get_rel_relkind(partrelid) == RELKIND_PARTITIONED_TABLE) { - (*num_parted)++; - parted_rels = lappend(parted_rels, partrel); - parted_rel_parents = lappend(parted_rel_parents, parent); - APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents); - } - else - heap_close(partrel, NoLock); + int j, + k; + Relation partrel; + PartitionKey partkey; + PartitionDesc partdesc; + PartitionedTableInfo *ptinfo; + PartitionDispatch pd; + + if (partrelid != RelationGetRelid(rel)) + partrel = heap_open(partrelid, lockmode); + else + partrel = rel; - /* - * We keep the partitioned ones open until we're done using the - * information being collected here (for example, see - * ExecEndModifyTable). - */ - } + partkey = RelationGetPartitionKey(partrel); + partdesc = RelationGetPartitionDesc(partrel); + + ptinfo = (PartitionedTableInfo *) + palloc0(sizeof(PartitionedTableInfo)); + ptinfo->relid = partrelid; + ptinfo->parentid = parentrelid; + + ptinfo->pd = pd = (PartitionDispatchData *) + palloc0(sizeof(PartitionDispatchData)); + pd->partkey = partkey; - /* - * We want to create two arrays - one for leaf partitions and another for - * partitioned tables (including the root table and internal partitions). - * While we only create the latter here, leaf partition array of suitable - * objects (such as, ResultRelInfo) is created by the caller using the - * list of OIDs we return. Indexes into these arrays get assigned in a - * breadth-first manner, whereby partitions of any given level are placed - * consecutively in the respective arrays. - */ - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - *leaf_part_oids = NIL; - i = k = offset = 0; - forboth(lc1, parted_rels, lc2, parted_rel_parents) - { - Relation partrel = lfirst(lc1); - Relation parent = lfirst(lc2); - PartitionKey partkey = RelationGetPartitionKey(partrel); - TupleDesc tupdesc = RelationGetDescr(partrel); - PartitionDesc partdesc = RelationGetPartitionDesc(partrel); - int j, - m; - - pd[i] = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - pd[i]->reldesc = partrel; - pd[i]->key = partkey; - pd[i]->keystate = NIL; - pd[i]->partdesc = partdesc; - if (parent != NULL) - { /* - * For every partitioned table other than root, we must store a - * tuple table slot initialized with its tuple descriptor and a - * tuple conversion map to convert a tuple from its parent's - * rowtype to its own. That is to make sure that we are looking at - * the correct row using the correct tuple descriptor when - * computing its partition key for tuple routing. - */ - pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc); - pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent), - tupdesc, - gettext_noop("could not convert row type")); - } - else - { - /* Not required for the root partitioned table */ - pd[i]->tupslot = NULL; - pd[i]->tupmap = NULL; - } - pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); + * Pin the partition descriptor before stashing the references to the + * information contained in it into this PartitionDispatch object. + * + PinPartitionDesc(partdesc);*/ + pd->partdesc = partdesc; - /* - * Indexes corresponding to the internal partitions are multiplied by - * -1 to distinguish them from those of leaf partitions. Encountering - * an index >= 0 means we found a leaf partition, which is immediately - * returned as the partition we are looking for. A negative index - * means we found a partitioned table, whose PartitionDispatch object - * is located at the above index multiplied back by -1. Using the - * PartitionDispatch object, search is continued further down the - * partition tree. - */ - m = 0; - for (j = 0; j < partdesc->nparts; j++) - { - Oid partrelid = partdesc->oids[j]; + /* + * The values contained in the following array correspond to + * indexes of this table's partitions in the global sequence of + * all the partitions contained in the partition tree rooted at + * rel, traversed in a breadh-first manner. The values should be + * such that we will be able to distinguish the leaf partitions + * from the non-leaf partitions, because they are returned to + * to the caller in separate structures from where they will be + * accessed. The way that's done is described below: + * + * Leaf partition OIDs are put into the global leaf_part_oids list, + * and for each one, the value stored is its ordinal position in + * the list minus 1. + * + * PartitionedTableInfo objects corresponding to partitions that + * are partitioned tables are put into the global ptinfos[] list, + * and for each one, the value stored is its ordinal position in + * the list multiplied by -1. + * + * So while looking at the values in the indexes array, if one + * gets zero or a positive value, then it's a leaf partition, + * Otherwise, it's a partitioned table. + */ + pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); - if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) - { - *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd[i]->indexes[j] = k++; - } - else + k = 0; + for (j = 0; j < partdesc->nparts; j++) { + Oid partrelid = partdesc->oids[j]; + /* - * offset denotes the number of partitioned tables of upper - * levels including those of the current level. Any partition - * of this table must belong to the next level and hence will - * be placed after the last partitioned table of this level. + * Queue this partition so that it will be processed later + * by the outer loop. */ - pd[i]->indexes[j] = -(1 + offset + m); - m++; + all_parts = lappend_oid(all_parts, partrelid); + all_parents = lappend_oid(all_parents, + RelationGetRelid(partrel)); + + if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) + { + *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); + pd->indexes[j] = i++; + } + else + { + /* + * offset denotes the number of partitioned tables that + * we have already processed. k counts the number of + * partitions of this table that were found to be + * partitioned tables. + */ + pd->indexes[j] = -(1 + offset + k); + k++; + } } - } - i++; - /* - * This counts the number of partitioned tables at upper levels - * including those of the current level. - */ - offset += m; + offset += k; + + /* + * Release the relation descriptor. Lock that we have on the + * table will keep the PartitionDesc that is pointing into + * RelationData intact, a pointer to which hope to keep + * through this transaction's commit. + * (XXX - how true is that?) + */ + if (partrel != rel) + heap_close(partrel, NoLock); + + *ptinfos = lappend(*ptinfos, ptinfo); + } } - return pd; + Assert(i == list_length(*leaf_part_oids)); + Assert((offset + 1) == list_length(*ptinfos)); } /* Module-local functions */ @@ -1855,7 +1862,7 @@ generate_partition_qual(Relation rel) * ---------------- */ void -FormPartitionKeyDatum(PartitionDispatch pd, +FormPartitionKeyDatum(PartitionKeyInfo *keyinfo, TupleTableSlot *slot, EState *estate, Datum *values, @@ -1864,20 +1871,21 @@ FormPartitionKeyDatum(PartitionDispatch pd, ListCell *partexpr_item; int i; - if (pd->key->partexprs != NIL && pd->keystate == NIL) + if (keyinfo->key->partexprs != NIL && keyinfo->keystate == NIL) { /* Check caller has set up context correctly */ Assert(estate != NULL && GetPerTupleExprContext(estate)->ecxt_scantuple == slot); /* First time through, set up expression evaluation state */ - pd->keystate = ExecPrepareExprList(pd->key->partexprs, estate); + keyinfo->keystate = ExecPrepareExprList(keyinfo->key->partexprs, + estate); } - partexpr_item = list_head(pd->keystate); - for (i = 0; i < pd->key->partnatts; i++) + partexpr_item = list_head(keyinfo->keystate); + for (i = 0; i < keyinfo->key->partnatts; i++) { - AttrNumber keycol = pd->key->partattrs[i]; + AttrNumber keycol = keyinfo->key->partattrs[i]; Datum datum; bool isNull; @@ -1914,13 +1922,13 @@ FormPartitionKeyDatum(PartitionDispatch pd, * the latter case. */ int -get_partition_for_tuple(PartitionDispatch *pd, +get_partition_for_tuple(PartitionTupleRoutingInfo **ptrinfos, TupleTableSlot *slot, EState *estate, - PartitionDispatchData **failed_at, + PartitionTupleRoutingInfo **failed_at, TupleTableSlot **failed_slot) { - PartitionDispatch parent; + PartitionTupleRoutingInfo *parent; Datum values[PARTITION_MAX_KEYS]; bool isnull[PARTITION_MAX_KEYS]; int cur_offset, @@ -1931,11 +1939,11 @@ get_partition_for_tuple(PartitionDispatch *pd, TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; /* start with the root partitioned table */ - parent = pd[0]; + parent = ptrinfos[0]; while (true) { - PartitionKey key = parent->key; - PartitionDesc partdesc = parent->partdesc; + PartitionKey key = parent->pd->partkey; + PartitionDesc partdesc = parent->pd->partdesc; TupleTableSlot *myslot = parent->tupslot; TupleConversionMap *map = parent->tupmap; @@ -1967,7 +1975,7 @@ get_partition_for_tuple(PartitionDispatch *pd, * So update ecxt_scantuple accordingly. */ ecxt->ecxt_scantuple = slot; - FormPartitionKeyDatum(parent, slot, estate, values, isnull); + FormPartitionKeyDatum(parent->keyinfo, slot, estate, values, isnull); if (key->strategy == PARTITION_STRATEGY_RANGE) { @@ -2038,13 +2046,13 @@ get_partition_for_tuple(PartitionDispatch *pd, *failed_slot = slot; break; } - else if (parent->indexes[cur_index] >= 0) + else if (parent->pd->indexes[cur_index] >= 0) { - result = parent->indexes[cur_index]; + result = parent->pd->indexes[cur_index]; break; } else - parent = pd[-parent->indexes[cur_index]]; + parent = ptrinfos[-parent->pd->indexes[cur_index]]; } error_exit: diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 53e296559a..b3de3de454 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -165,8 +165,8 @@ typedef struct CopyStateData bool volatile_defexprs; /* is any of defexprs volatile? */ List *range_table; - PartitionDispatch *partition_dispatch_info; - int num_dispatch; /* Number of entries in the above array */ + PartitionTupleRoutingInfo **ptrinfos; + int num_parted; /* Number of entries in the above array */ int num_partitions; /* Number of members in the following arrays */ ResultRelInfo *partitions; /* Per partition result relation */ TupleConversionMap **partition_tupconv_maps; @@ -1425,7 +1425,7 @@ BeginCopy(ParseState *pstate, /* Initialize state for CopyFrom tuple routing. */ if (is_from && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - PartitionDispatch *partition_dispatch_info; + PartitionTupleRoutingInfo **ptrinfos; ResultRelInfo *partitions; TupleConversionMap **partition_tupconv_maps; TupleTableSlot *partition_tuple_slot; @@ -1434,13 +1434,13 @@ BeginCopy(ParseState *pstate, ExecSetupPartitionTupleRouting(rel, 1, - &partition_dispatch_info, + &ptrinfos, &partitions, &partition_tupconv_maps, &partition_tuple_slot, &num_parted, &num_partitions); - cstate->partition_dispatch_info = partition_dispatch_info; - cstate->num_dispatch = num_parted; + cstate->ptrinfos = ptrinfos; + cstate->num_parted = num_parted; cstate->partitions = partitions; cstate->num_partitions = num_partitions; cstate->partition_tupconv_maps = partition_tupconv_maps; @@ -2495,7 +2495,7 @@ CopyFrom(CopyState cstate) if ((resultRelInfo->ri_TrigDesc != NULL && (resultRelInfo->ri_TrigDesc->trig_insert_before_row || resultRelInfo->ri_TrigDesc->trig_insert_instead_row)) || - cstate->partition_dispatch_info != NULL || + cstate->ptrinfos != NULL || cstate->volatile_defexprs) { useHeapMultiInsert = false; @@ -2573,7 +2573,7 @@ CopyFrom(CopyState cstate) ExecStoreTuple(tuple, slot, InvalidBuffer, false); /* Determine the partition to heap_insert the tuple into */ - if (cstate->partition_dispatch_info) + if (cstate->ptrinfos) { int leaf_part_index; TupleConversionMap *map; @@ -2587,7 +2587,7 @@ CopyFrom(CopyState cstate) * partition, respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - cstate->partition_dispatch_info, + cstate->ptrinfos, slot, estate); Assert(leaf_part_index >= 0 && @@ -2818,23 +2818,20 @@ CopyFrom(CopyState cstate) ExecCloseIndices(resultRelInfo); - /* Close all the partitioned tables, leaf partitions, and their indices */ - if (cstate->partition_dispatch_info) + /* Close all the leaf partitions and their indices */ + if (cstate->ptrinfos) { int i; /* - * Remember cstate->partition_dispatch_info[0] corresponds to the root - * partitioned table, which we must not try to close, because it is - * the main target table of COPY that will be closed eventually by - * DoCopy(). Also, tupslot is NULL for the root partitioned table. + * cstate->ptrinfo[0] corresponds to the root partitioned table, for + * which we didn't create tupslot. */ - for (i = 1; i < cstate->num_dispatch; i++) + for (i = 1; i < cstate->num_parted; i++) { - PartitionDispatch pd = cstate->partition_dispatch_info[i]; + PartitionTupleRoutingInfo *ptrinfo = cstate->ptrinfos[i]; - heap_close(pd->reldesc, NoLock); - ExecDropSingleTupleTableSlot(pd->tupslot); + ExecDropSingleTupleTableSlot(ptrinfo->tupslot); } for (i = 0; i < cstate->num_partitions; i++) { diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index 78cbcd1a32..428172ae8e 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -3214,8 +3214,8 @@ EvalPlanQualEnd(EPQState *epqstate) * tuple routing for partitioned tables * * Output arguments: - * 'pd' receives an array of PartitionDispatch objects with one entry for - * every partitioned table in the partition tree + * 'ptrinfos' receives an array of PartitionTupleRoutingInfo objects with one + * entry for each partitioned table in the partition tree * 'partitions' receives an array of ResultRelInfo objects with one entry for * every leaf partition in the partition tree * 'tup_conv_maps' receives an array of TupleConversionMap objects with one @@ -3237,7 +3237,7 @@ EvalPlanQualEnd(EPQState *epqstate) void ExecSetupPartitionTupleRouting(Relation rel, Index resultRTindex, - PartitionDispatch **pd, + PartitionTupleRoutingInfo ***ptrinfos, ResultRelInfo **partitions, TupleConversionMap ***tup_conv_maps, TupleTableSlot **partition_tuple_slot, @@ -3245,13 +3245,135 @@ ExecSetupPartitionTupleRouting(Relation rel, { TupleDesc tupDesc = RelationGetDescr(rel); List *leaf_parts; + List *ptinfos = NIL; ListCell *cell; int i; ResultRelInfo *leaf_part_rri; + Relation parent; /* Get the tuple-routing information and lock partitions */ - *pd = RelationGetPartitionDispatchInfo(rel, RowExclusiveLock, num_parted, - &leaf_parts); + RelationGetPartitionDispatchInfo(rel, RowExclusiveLock, + &ptinfos, &leaf_parts); + + /* + * The ptinfos list contains PartitionedTableInfo objects for all the + * partitioned tables in the partition tree. From the, we construct + * an array of PartitionTupleRoutingInfo objects to be used during + * tuple-routing. + */ + *num_parted = list_length(ptinfos); + *ptrinfos = (PartitionTupleRoutingInfo **) palloc0(*num_parted * + sizeof(PartitionTupleRoutingInfo *)); + + /* + * Free the List structure itself as we go through. + * (open-coded list_free) + */ + i = 0; + cell = list_head(ptinfos); + parent = NULL; + while (cell) + { + ListCell *tmp = cell; + PartitionedTableInfo *ptinfo = lfirst(tmp), + *next_ptinfo; + Relation partrel; + PartitionTupleRoutingInfo *ptrinfo; + + if (lnext(tmp)) + next_ptinfo = lfirst(lnext(tmp)); + + /* + * RelationGetPartitionDispatchInfo() already locked the Partitioned + * tables. + */ + if (ptinfo->relid != RelationGetRelid(rel)) + partrel = heap_open(ptinfo->relid, NoLock); + else + partrel = rel; + + ptrinfo = (PartitionTupleRoutingInfo *) + palloc0(sizeof(PartitionTupleRoutingInfo)); + ptrinfo->relid = ptinfo->relid; + + /* Stash a reference to this PartitionDispatch. */ + ptrinfo->pd = ptinfo->pd; + + /* State for extracting partition key from tuples will go here. */ + ptrinfo->keyinfo = (PartitionKeyInfo *) + palloc0(sizeof(PartitionKeyInfo)); + ptrinfo->keyinfo->key = RelationGetPartitionKey(partrel); + ptrinfo->keyinfo->keystate = NIL; + + /* + * For every partitioned table other than root, we must store a tuple + * table slot initialized with its tuple descriptor and a tuple + * conversion map to convert a tuple from its parent's rowtype to its + * own. That is to make sure that we are looking at the correct row + * using the correct tuple descriptor when computing its partition key + * for tuple routing. + */ + if (ptinfo->parentid != InvalidOid) + { + TupleDesc tupdesc = RelationGetDescr(partrel); + + /* Open the parent relation descriptor if not already done. */ + if (ptinfo->parentid == RelationGetRelid(rel)) + { + parent = rel; + } + else if (parent == NULL) + { + /* Locked by RelationGetPartitionDispatchInfo(). */ + parent = heap_open(ptinfo->parentid, NoLock); + } + + ptrinfo->tupslot = MakeSingleTupleTableSlot(tupdesc); + ptrinfo->tupmap = convert_tuples_by_name(RelationGetDescr(parent), + tupdesc, + gettext_noop("could not convert row type")); + + /* + * Close the parent descriptor, if the next partitioned table in + * the list is not a sibling, because it will have a different + * parent if so. + */ + if (parent && parent != rel && + next_ptinfo->parentid != ptinfo->parentid) + { + heap_close(parent, NoLock); + parent = NULL; + } + + /* + * Release the relation descriptor. Lock that we have on the + * table will keep the PartitionDesc that is pointing into + * RelationData intact, a pointer to which hope to keep + * through this transaction's commit. + * (XXX - how true is that?) + */ + if (partrel != rel) + heap_close(partrel, NoLock); + } + else + { + /* Not required for the root partitioned table */ + ptrinfo->tupslot = NULL; + ptrinfo->tupmap = NULL; + } + + (*ptrinfos)[i++] = ptrinfo; + + /* Free the ListCell. */ + cell = lnext(cell); + pfree(tmp); + } + + /* Free the List itself. */ + if (ptinfos) + pfree(ptinfos); + + /* For leaf partitions, we build ResultRelInfos and TupleConversionMaps. */ *num_partitions = list_length(leaf_parts); *partitions = (ResultRelInfo *) palloc(*num_partitions * sizeof(ResultRelInfo)); @@ -3274,11 +3396,11 @@ ExecSetupPartitionTupleRouting(Relation rel, TupleDesc part_tupdesc; /* - * We locked all the partitions above including the leaf partitions. - * Note that each of the relations in *partitions are eventually - * closed by the caller. + * RelationGetPartitionDispatchInfo didn't lock the leaf partitions, + * so lock here. Note that each of the relations in *partitions are + * eventually closed (when the plan is shut down, for instance). */ - partrel = heap_open(lfirst_oid(cell), NoLock); + partrel = heap_open(lfirst_oid(cell), RowExclusiveLock); part_tupdesc = RelationGetDescr(partrel); /* @@ -3291,7 +3413,7 @@ ExecSetupPartitionTupleRouting(Relation rel, * partition from the parent's type to the partition's. */ (*tup_conv_maps)[i] = convert_tuples_by_name(tupDesc, part_tupdesc, - gettext_noop("could not convert row type")); + gettext_noop("could not convert row type")); InitResultRelInfo(leaf_part_rri, partrel, @@ -3325,11 +3447,13 @@ ExecSetupPartitionTupleRouting(Relation rel, * by get_partition_for_tuple() unchanged. */ int -ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, - TupleTableSlot *slot, EState *estate) +ExecFindPartition(ResultRelInfo *resultRelInfo, + PartitionTupleRoutingInfo **ptrinfos, + TupleTableSlot *slot, + EState *estate) { int result; - PartitionDispatchData *failed_at; + PartitionTupleRoutingInfo *failed_at; TupleTableSlot *failed_slot; /* @@ -3339,7 +3463,7 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, if (resultRelInfo->ri_PartitionCheck) ExecPartitionCheck(resultRelInfo, slot, estate); - result = get_partition_for_tuple(pd, slot, estate, + result = get_partition_for_tuple(ptrinfos, slot, estate, &failed_at, &failed_slot); if (result < 0) { @@ -3349,9 +3473,9 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, char *val_desc; ExprContext *ecxt = GetPerTupleExprContext(estate); - failed_rel = failed_at->reldesc; + failed_rel = heap_open(failed_at->relid, NoLock); ecxt->ecxt_scantuple = failed_slot; - FormPartitionKeyDatum(failed_at, failed_slot, estate, + FormPartitionKeyDatum(failed_at->keyinfo, failed_slot, estate, key_values, key_isnull); val_desc = ExecBuildSlotPartitionKeyDescription(failed_rel, key_values, diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 77ba15dd90..61e6dfa884 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -277,7 +277,7 @@ ExecInsert(ModifyTableState *mtstate, resultRelInfo = estate->es_result_relation_info; /* Determine the partition to heap_insert the tuple into */ - if (mtstate->mt_partition_dispatch_info) + if (mtstate->mt_ptrinfos) { int leaf_part_index; TupleConversionMap *map; @@ -291,7 +291,7 @@ ExecInsert(ModifyTableState *mtstate, * respectively. */ leaf_part_index = ExecFindPartition(resultRelInfo, - mtstate->mt_partition_dispatch_info, + mtstate->mt_ptrinfos, slot, estate); Assert(leaf_part_index >= 0 && @@ -1486,7 +1486,7 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) int numResultRelInfos; /* Find the set of partitions so that we can find their TupleDescs. */ - if (mtstate->mt_partition_dispatch_info != NULL) + if (mtstate->mt_ptrinfos != NULL) { /* * For INSERT via partitioned table, so we need TupleDescs based @@ -1906,7 +1906,7 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) if (operation == CMD_INSERT && rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) { - PartitionDispatch *partition_dispatch_info; + PartitionTupleRoutingInfo **ptrinfos; ResultRelInfo *partitions; TupleConversionMap **partition_tupconv_maps; TupleTableSlot *partition_tuple_slot; @@ -1915,13 +1915,13 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags) ExecSetupPartitionTupleRouting(rel, node->nominalRelation, - &partition_dispatch_info, + &ptrinfos, &partitions, &partition_tupconv_maps, &partition_tuple_slot, &num_parted, &num_partitions); - mtstate->mt_partition_dispatch_info = partition_dispatch_info; - mtstate->mt_num_dispatch = num_parted; + mtstate->mt_ptrinfos = ptrinfos; + mtstate->mt_num_parted = num_parted; mtstate->mt_partitions = partitions; mtstate->mt_num_partitions = num_partitions; mtstate->mt_partition_tupconv_maps = partition_tupconv_maps; @@ -2328,19 +2328,16 @@ ExecEndModifyTable(ModifyTableState *node) } /* - * Close all the partitioned tables, leaf partitions, and their indices + * Close all the leaf partitions and their indices. * - * Remember node->mt_partition_dispatch_info[0] corresponds to the root - * partitioned table, which we must not try to close, because it is the - * main target table of the query that will be closed by ExecEndPlan(). - * Also, tupslot is NULL for the root partitioned table. + * node->mt_partition_dispatch_info[0] corresponds to the root partitioned + * table, for which we didn't create tupslot. */ - for (i = 1; i < node->mt_num_dispatch; i++) + for (i = 1; i < node->mt_num_parted; i++) { - PartitionDispatch pd = node->mt_partition_dispatch_info[i]; + PartitionTupleRoutingInfo *ptrinfo = node->mt_ptrinfos[i]; - heap_close(pd->reldesc, NoLock); - ExecDropSingleTupleTableSlot(pd->tupslot); + ExecDropSingleTupleTableSlot(ptrinfo->tupslot); } for (i = 0; i < node->mt_num_partitions; i++) { diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index f10879a162..50f5574831 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -39,36 +39,23 @@ typedef struct PartitionDescData typedef struct PartitionDescData *PartitionDesc; -/*----------------------- - * PartitionDispatch - information about one partitioned table in a partition - * hierarchy required to route a tuple to one of its partitions - * - * reldesc Relation descriptor of the table - * key Partition key information of the table - * keystate Execution state required for expressions in the partition key - * partdesc Partition descriptor of the table - * tupslot A standalone TupleTableSlot initialized with this table's tuple - * descriptor - * tupmap TupleConversionMap to convert from the parent's rowtype to - * this table's rowtype (when extracting the partition key of a - * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * RelationGetPartitionDispatchInfo()) - *----------------------- +typedef struct PartitionDispatchData *PartitionDispatch; + +/* + * Information about one partitioned table in a given partition tree */ -typedef struct PartitionDispatchData +typedef struct PartitionedTableInfo { - Relation reldesc; - PartitionKey key; - List *keystate; /* list of ExprState */ - PartitionDesc partdesc; - TupleTableSlot *tupslot; - TupleConversionMap *tupmap; - int *indexes; -} PartitionDispatchData; + Oid relid; + Oid parentid; -typedef struct PartitionDispatchData *PartitionDispatch; + /* + * This contains information about bounds of the partitions of this + * table and about where individual partitions are placed in the global + * partition tree. + */ + PartitionDispatch pd; +} PartitionedTableInfo; extern void RelationBuildPartitionDesc(Relation relation); extern bool partition_bounds_equal(PartitionKey key, @@ -84,18 +71,18 @@ extern List *map_partition_varattnos(List *expr, int target_varno, extern List *RelationGetPartitionQual(Relation rel); extern Expr *get_partition_qual_relid(Oid relid); +extern void RelationGetPartitionDispatchInfo(Relation rel, int lockmode, + List **ptinfos, List **leaf_part_oids); + /* For tuple routing */ -extern PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int lockmode, int *num_parted, - List **leaf_part_oids); -extern void FormPartitionKeyDatum(PartitionDispatch pd, +extern void FormPartitionKeyDatum(PartitionKeyInfo *keyinfo, TupleTableSlot *slot, EState *estate, Datum *values, bool *isnull); -extern int get_partition_for_tuple(PartitionDispatch *pd, +extern int get_partition_for_tuple(PartitionTupleRoutingInfo **pd, TupleTableSlot *slot, EState *estate, - PartitionDispatchData **failed_at, + PartitionTupleRoutingInfo **failed_at, TupleTableSlot **failed_slot); #endif /* PARTITION_H */ diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 59c28b709e..6e5f55c06d 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -208,13 +208,13 @@ extern void EvalPlanQualSetTuple(EPQState *epqstate, Index rti, extern HeapTuple EvalPlanQualGetTuple(EPQState *epqstate, Index rti); extern void ExecSetupPartitionTupleRouting(Relation rel, Index resultRTindex, - PartitionDispatch **pd, + PartitionTupleRoutingInfo ***ptrinfos, ResultRelInfo **partitions, TupleConversionMap ***tup_conv_maps, TupleTableSlot **partition_tuple_slot, int *num_parted, int *num_partitions); extern int ExecFindPartition(ResultRelInfo *resultRelInfo, - PartitionDispatch *pd, + PartitionTupleRoutingInfo **ptrinfos, TupleTableSlot *slot, EState *estate); diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 85fac8ab91..e7bd8617bd 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -414,6 +414,55 @@ typedef struct ResultRelInfo Relation ri_PartitionRoot; } ResultRelInfo; +/* Forward declarations, to avoid including other headers */ +typedef struct PartitionKeyData *PartitionKey; +typedef struct PartitionDispatchData *PartitionDispatch; + +/* + * PartitionKeyInfoData - execution state for the partition key of a + * partitioned table + * + * keystate is the execution state required for expressions contained in the + * partition key. It is NIL until initialized by FormPartitionKeyDatum() if + * and when it is called; for example, during tuple routing through a given + * partitioned table. + */ +typedef struct PartitionKeyInfo +{ + PartitionKey key; /* Points into the table's relcache entry */ + List *keystate; +} PartitionKeyInfo; + +/* + * PartitionTupleRoutingInfo - information required for tuple-routing + * through one partitioned table in a partition + * tree + */ +typedef struct PartitionTupleRoutingInfo +{ + /* OID of the table */ + Oid relid; + + /* Information about the table's partitions */ + PartitionDispatch pd; + + /* See comment above the definition of PartitionKeyInfo */ + PartitionKeyInfo *keyinfo; + + /* + * A standalone TupleTableSlot initialized with this table's tuple + * descriptor + */ + TupleTableSlot *tupslot; + + /* + * TupleConversionMap to convert from the parent's rowtype to this table's + * rowtype (when extracting the partition key of a tuple just before + * routing it through this table) + */ + TupleConversionMap *tupmap; +} PartitionTupleRoutingInfo; + /* ---------------- * EState information * @@ -954,9 +1003,9 @@ typedef struct ModifyTableState TupleTableSlot *mt_existing; /* slot to store existing target tuple in */ List *mt_excludedtlist; /* the excluded pseudo relation's tlist */ TupleTableSlot *mt_conflproj; /* CONFLICT ... SET ... projection target */ - struct PartitionDispatchData **mt_partition_dispatch_info; /* Tuple-routing support info */ - int mt_num_dispatch; /* Number of entries in the above array */ + struct PartitionTupleRoutingInfo **mt_ptrinfos; + int mt_num_parted; /* Number of entries in the above array */ int mt_num_partitions; /* Number of members in the following * arrays */ ResultRelInfo *mt_partitions; /* Per partition result relation */ -- 2.11.0