From 79c906997d80dc426530dea0b75363ef20286001 Mon Sep 17 00:00:00 2001
From: "dgrowley@gmail.com"
Date: Thu, 26 Jul 2018 19:54:55 +1200
Subject: [PATCH v7] Speed up INSERT and UPDATE on partitioned tables

This is more or less a complete redesign of PartitionTupleRouting.  The
aim here is to get rid of all the possibly large arrays that were being
allocated during ExecSetupPartitionTupleRouting().  We now allocate
small arrays to store the partitions' ResultRelInfos and only enlarge
these when we run out of space.  The partitions array is now ordered by
the order in which the partitions' ResultRelInfos are initialized,
rather than in the same order as partdesc.

The find_all_inheritors call remains by far the slowest part of
ExecSetupPartitionTupleRouting; this patch removes the other slow parts
of the function.

The parent-to-child and child-to-parent translation map arrays are now
allocated only when the first translation map needs to be stored.  If
the column order of the parent matches that of each of its children,
then no map ever needs to be stored, and these (possibly large) arrays
served no purpose.

Because we now always initialize the child-to-parent map whenever
transition capture is required, we no longer need the
child_parent_map_not_required array.  Previously that array was needed
to distinguish the case where no map was required from the case where
the map had not yet been initialized.

For simple INSERTs hitting a single partition of a partitioned table
with many partitions, shutting down the executor was also slow compared
to the actual execution.  This was down to the cleanup loop over the
ResultRelInfos having to iterate over an array that contained mostly
NULLs, all of which had to be skipped.  This is now faster, as the
array we loop over no longer contains NULL values.

David Rowley and Amit Langote
---
 src/backend/commands/copy.c                   |  48 +-
 src/backend/executor/execPartition.c          | 798 ++++++++++++++------------
 src/backend/executor/nodeModifyTable.c        | 109 +---
 src/backend/utils/cache/partcache.c           |  11 +-
 src/include/catalog/partition.h               |   6 +-
 src/include/executor/execPartition.h          | 171 ++++--
 src/test/regress/expected/insert_conflict.out |  22 +
 src/test/regress/sql/insert_conflict.sql      |  26 +
 8 files changed, 626 insertions(+), 565 deletions(-)

diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index 9bc67ce60f..0dfb9e2e95 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2510,8 +2510,12 @@ CopyFrom(CopyState cstate)
 	/*
 	 * If there are any triggers with transition tables on the named relation,
 	 * we need to be prepared to capture transition tuples.
+	 *
+	 * Because partition tuple routing would like to know whether transition
+	 * capture is active, we also set it in mtstate, which is passed to
+	 * ExecFindPartition below.
 	 */
-	cstate->transition_capture =
+	cstate->transition_capture = mtstate->mt_transition_capture =
 		MakeTransitionCaptureState(cstate->rel->trigdesc,
 								   RelationGetRelid(cstate->rel),
 								   CMD_INSERT);
@@ -2521,19 +2525,8 @@ CopyFrom(CopyState cstate)
 	 * CopyFrom tuple routing.
 	 */
 	if (cstate->rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE)
-	{
 		proute = ExecSetupPartitionTupleRouting(NULL, cstate->rel);
 
-		/*
-		 * If we are capturing transition tuples, they may need to be
-		 * converted from partition format back to partitioned table format
-		 * (this is only ever necessary if a BEFORE trigger modifies the
-		 * tuple).
- */ - if (cstate->transition_capture != NULL) - ExecSetupChildParentMapForLeaf(proute); - } - /* * It's more efficient to prepare a bunch of tuples for insertion, and * insert them in one heap_multi_insert() call, than call heap_insert() @@ -2699,10 +2692,8 @@ CopyFrom(CopyState cstate) * will get us the ResultRelInfo and TupleConversionMap for the * partition, respectively. */ - leaf_part_index = ExecFindPartition(target_resultRelInfo, - proute->partition_dispatch_info, - slot, - estate); + leaf_part_index = ExecFindPartition(mtstate, target_resultRelInfo, + proute, slot, estate); Assert(leaf_part_index >= 0 && leaf_part_index < proute->num_partitions); @@ -2800,15 +2791,7 @@ CopyFrom(CopyState cstate) * one. */ resultRelInfo = proute->partitions[leaf_part_index]; - if (unlikely(resultRelInfo == NULL)) - { - resultRelInfo = ExecInitPartitionInfo(mtstate, - target_resultRelInfo, - proute, estate, - leaf_part_index); - proute->partitions[leaf_part_index] = resultRelInfo; - Assert(resultRelInfo != NULL); - } + Assert(resultRelInfo != NULL); /* Determine which triggers exist on this partition */ has_before_insert_row_trig = (resultRelInfo->ri_TrigDesc && @@ -2845,8 +2828,7 @@ CopyFrom(CopyState cstate) */ cstate->transition_capture->tcs_original_insert_tuple = NULL; cstate->transition_capture->tcs_map = - TupConvMapForLeaf(proute, target_resultRelInfo, - leaf_part_index); + PartitionTupRoutingGetToParentMap(proute, leaf_part_index); } else { @@ -2864,11 +2846,13 @@ CopyFrom(CopyState cstate) * partition rowtype. Don't free the already stored tuple as it * may still be required for a multi-insert batch. */ - tuple = ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[leaf_part_index], - tuple, - proute->partition_tuple_slot, - &slot, - false); + tuple = + ConvertPartitionTupleSlot(PartitionTupRoutingGetToChildMap(proute, + leaf_part_index), + tuple, + proute->partition_tuple_slot, + &slot, + false); tuple->t_tableOid = RelationGetRelid(resultRelInfo->ri_RelationDesc); } diff --git a/src/backend/executor/execPartition.c b/src/backend/executor/execPartition.c index 1a9943c3aa..9ba3664441 100644 --- a/src/backend/executor/execPartition.c +++ b/src/backend/executor/execPartition.c @@ -31,11 +31,18 @@ #include "utils/rls.h" #include "utils/ruleutils.h" +#define PARTITION_ROUTING_INITSIZE 8 -static PartitionDispatch *RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids); -static void get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids); +static void ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate, + PartitionTupleRouting *proute); +static void ExecExpandRoutingArrays(PartitionTupleRouting *proute); +static int ExecInitPartitionInfo(ModifyTableState *mtstate, + ResultRelInfo *rootResultRelInfo, + PartitionTupleRouting *proute, + EState *estate, + PartitionDispatch parent, int partidx); +static PartitionDispatch ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, + Oid partoid, PartitionDispatch parent_pd, int partidx); static void FormPartitionKeyDatum(PartitionDispatch pd, TupleTableSlot *slot, EState *estate, @@ -62,143 +69,119 @@ static void find_matching_subplans_recurse(PartitionPruningData *prunedata, * Note that all the relations in the partition tree are locked using the * RowExclusiveLock mode upon return from this function. 
* - * While we allocate the arrays of pointers of ResultRelInfo and - * TupleConversionMap for all partitions here, actual objects themselves are - * lazily allocated for a given partition if a tuple is actually routed to it; - * see ExecInitPartitionInfo. However, if the function is invoked for update - * tuple routing, caller would already have initialized ResultRelInfo's for - * some of the partitions, which are reused and assigned to their respective - * slot in the aforementioned array. For such partitions, we delay setting - * up objects such as TupleConversionMap until those are actually chosen as - * the partitions to route tuples to. See ExecPrepareTupleRouting. + * Callers must use the returned PartitionTupleRouting during calls to + * ExecFindPartition. The actual ResultRelInfos are allocated lazily by that + * function. */ PartitionTupleRouting * ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel) { - List *leaf_parts; - ListCell *cell; - int i; - ResultRelInfo *update_rri = NULL; - int num_update_rri = 0, - update_rri_index = 0; PartitionTupleRouting *proute; - int nparts; ModifyTable *node = mtstate ? (ModifyTable *) mtstate->ps.plan : NULL; - /* - * Get the information about the partition tree after locking all the - * partitions. - */ + /* Lock all the partitions. */ (void) find_all_inheritors(RelationGetRelid(rel), RowExclusiveLock, NULL); - proute = (PartitionTupleRouting *) palloc0(sizeof(PartitionTupleRouting)); - proute->partition_dispatch_info = - RelationGetPartitionDispatchInfo(rel, &proute->num_dispatch, - &leaf_parts); - proute->num_partitions = nparts = list_length(leaf_parts); - proute->partitions = - (ResultRelInfo **) palloc(nparts * sizeof(ResultRelInfo *)); - proute->parent_child_tupconv_maps = - (TupleConversionMap **) palloc0(nparts * sizeof(TupleConversionMap *)); - proute->partition_oids = (Oid *) palloc(nparts * sizeof(Oid)); - /* Set up details specific to the type of tuple routing we are doing. */ + /* + * Here we attempt to expend as little effort as possible in setting up + * the PartitionTupleRouting. Each partition's ResultRelInfo is built + * lazily, only when we actually need to route a tuple to that partition. + * The reason for this is that a common case is for INSERT to insert a + * single tuple into a partitioned table and this must be fast. + * + * We initially allocate enough memory to hold PARTITION_ROUTING_INITSIZE + * PartitionDispatch and ResultRelInfo pointers in their respective + * arrays. More space can be allocated later, if required via + * ExecExpandRoutingArrays. + * + * The PartitionDispatch for the target partitioned table of the command + * must be set up, but any sub-partitioned tables can be set up lazily as + * and when the tuples get routed to (through) them. + */ + proute = (PartitionTupleRouting *) palloc(sizeof(PartitionTupleRouting)); + proute->partition_root = rel; + proute->partition_dispatch_info = (PartitionDispatchData **) + palloc(sizeof(PartitionDispatchData) * PARTITION_ROUTING_INITSIZE); + proute->num_dispatch = 0; + proute->dispatch_allocsize = PARTITION_ROUTING_INITSIZE; + + proute->partitions = (ResultRelInfo **) + palloc(sizeof(ResultRelInfo *) * PARTITION_ROUTING_INITSIZE); + proute->num_partitions = 0; + proute->partitions_allocsize = PARTITION_ROUTING_INITSIZE; + + /* We only allocate these arrays when we need to store the first map */ + proute->parent_child_tupconv_maps = NULL; + proute->child_parent_tupconv_maps = NULL; + + /* + * Initialize this table's PartitionDispatch object. 
Here we pass in the + * parent as NULL as we don't need to care about any parent of the target + * partitioned table. + */ + (void) ExecInitPartitionDispatchInfo(proute, RelationGetRelid(rel), NULL, + 0); + + /* + * If performing an UPDATE with tuple routing, we can reuse partition + * sub-plan result rels. We build a hash table to map the OIDs of + * partitions present in mtstate->resultRelInfo to their ResultRelInfos. + * Every time a tuple is routed to a partition that we've yet to set the + * ResultRelInfo for, before we go make one, we check for a pre-made one + * in the hash table. + * + * Also, we'll need a slot that will transiently store the tuple being + * routed using the root parent's rowtype. + */ if (node && node->operation == CMD_UPDATE) { - update_rri = mtstate->resultRelInfo; - num_update_rri = list_length(node->plans); - proute->subplan_partition_offsets = - palloc(num_update_rri * sizeof(int)); - proute->num_subplan_partition_offsets = num_update_rri; - - /* - * We need an additional tuple slot for storing transient tuples that - * are converted to the root table descriptor. - */ + ExecHashSubPlanResultRelsByOid(mtstate, proute); proute->root_tuple_slot = MakeTupleTableSlot(NULL); } + else + { + proute->subplan_resultrel_hash = NULL; + proute->root_tuple_slot = NULL; + } /* * Initialize an empty slot that will be used to manipulate tuples of any - * given partition's rowtype. It is attached to the caller-specified node - * (such as ModifyTableState) and released when the node finishes - * processing. + * given partition's rowtype. */ proute->partition_tuple_slot = MakeTupleTableSlot(NULL); - i = 0; - foreach(cell, leaf_parts) - { - ResultRelInfo *leaf_part_rri = NULL; - Oid leaf_oid = lfirst_oid(cell); - - proute->partition_oids[i] = leaf_oid; - - /* - * If the leaf partition is already present in the per-subplan result - * rels, we re-use that rather than initialize a new result rel. The - * per-subplan resultrels and the resultrels of the leaf partitions - * are both in the same canonical order. So while going through the - * leaf partition oids, we need to keep track of the next per-subplan - * result rel to be looked for in the leaf partition resultrels. - */ - if (update_rri_index < num_update_rri && - RelationGetRelid(update_rri[update_rri_index].ri_RelationDesc) == leaf_oid) - { - leaf_part_rri = &update_rri[update_rri_index]; - - /* - * This is required in order to convert the partition's tuple to - * be compatible with the root partitioned table's tuple - * descriptor. When generating the per-subplan result rels, this - * was not set. - */ - leaf_part_rri->ri_PartitionRoot = rel; - - /* Remember the subplan offset for this ResultRelInfo */ - proute->subplan_partition_offsets[update_rri_index] = i; - - update_rri_index++; - } - - proute->partitions[i] = leaf_part_rri; - i++; - } - - /* - * For UPDATE, we should have found all the per-subplan resultrels in the - * leaf partitions. (If this is an INSERT, both values will be zero.) - */ - Assert(update_rri_index == num_update_rri); - return proute; } /* - * ExecFindPartition -- Find a leaf partition in the partition tree rooted - * at parent, for the heap tuple contained in *slot + * ExecFindPartition -- Find a leaf partition for the tuple contained in *slot. + * If the partition's ResultRelInfo does not yet exist in 'proute' then we set + * one up or reuse one from mtstate's resultRelInfo array. 
* * estate must be non-NULL; we'll need it to compute any expressions in the * partition key(s) * * If no leaf partition is found, this routine errors out with the appropriate - * error message, else it returns the leaf partition sequence number - * as an index into the array of (ResultRelInfos of) all leaf partitions in - * the partition tree. + * error message, else it returns the index of the leaf partition's + * ResultRelInfo in the proute->partitions array. */ int -ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, +ExecFindPartition(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + PartitionTupleRouting *proute, TupleTableSlot *slot, EState *estate) { - int result; + PartitionDispatch *pd = proute->partition_dispatch_info; Datum values[PARTITION_MAX_KEYS]; bool isnull[PARTITION_MAX_KEYS]; Relation rel; PartitionDispatch dispatch; + PartitionDesc partdesc; ExprContext *ecxt = GetPerTupleExprContext(estate); TupleTableSlot *ecxt_scantuple_old = ecxt->ecxt_scantuple; TupleTableSlot *myslot = NULL; - MemoryContext oldcxt; - HeapTuple tuple; + MemoryContext oldcxt; + HeapTuple tuple; /* use per-tuple context here to avoid leaking memory */ oldcxt = MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); @@ -216,9 +199,10 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, while (true) { TupleConversionMap *map = dispatch->tupmap; - int cur_index = -1; + int partidx = -1; rel = dispatch->reldesc; + partdesc = dispatch->partdesc; /* * Convert the tuple to this parent's layout, if different from the @@ -244,37 +228,114 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, FormPartitionKeyDatum(dispatch, slot, estate, values, isnull); /* - * Nothing for get_partition_for_tuple() to do if there are no - * partitions to begin with. + * If this partitioned table has no partitions or no partition for + * these values, then error out. */ - if (dispatch->partdesc->nparts == 0) + if (partdesc->nparts == 0 || + (partidx = get_partition_for_tuple(dispatch, values, isnull)) < 0) { - result = -1; - break; + char *val_desc; + + val_desc = ExecBuildSlotPartitionKeyDescription(rel, + values, isnull, 64); + Assert(OidIsValid(RelationGetRelid(rel))); + ereport(ERROR, + (errcode(ERRCODE_CHECK_VIOLATION), + errmsg("no partition of relation \"%s\" found for row", + RelationGetRelationName(rel)), + val_desc ? errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); } - cur_index = get_partition_for_tuple(dispatch, values, isnull); + if (partdesc->is_leaf[partidx]) + { + int result = -1; - /* - * cur_index < 0 means we failed to find a partition of this parent. - * cur_index >= 0 means we either found the leaf partition, or the - * next parent to find a partition of. - */ - if (cur_index < 0) - { - result = -1; - break; - } - else if (dispatch->indexes[cur_index] >= 0) - { - result = dispatch->indexes[cur_index]; - /* success! */ - break; + /* + * Get this leaf partition's index in the + * PartitionTupleRouting->partitions array. We may require + * building a new ResultRelInfo. + */ + if (likely(dispatch->indexes[partidx] >= 0)) + { + /* ResultRelInfo already built */ + Assert(dispatch->indexes[partidx] < proute->num_partitions); + result = dispatch->indexes[partidx]; + } + else + { + /* + * A ResultRelInfo has not been set up for this partition yet, + * so either use one of the sub-plan result rels or create a + * fresh one. 
+ */ + if (proute->subplan_resultrel_hash) + { + ResultRelInfo *rri; + Oid partoid = partdesc->oids[partidx]; + + rri = hash_search(proute->subplan_resultrel_hash, + &partoid, HASH_FIND, NULL); + + if (rri) + { + result = proute->num_partitions++; + dispatch->indexes[partidx] = result; + + + /* Allocate more space in the arrays, if required */ + if (result >= proute->partitions_allocsize) + ExecExpandRoutingArrays(proute); + + /* Save here for later use. */ + proute->partitions[result] = rri; + } + } + + /* We need to create one afresh. */ + if (result < 0) + { + MemoryContextSwitchTo(oldcxt); + result = ExecInitPartitionInfo(mtstate, resultRelInfo, + proute, estate, + dispatch, partidx); + MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + Assert(result >= 0 && result < proute->num_partitions); + } + } + + /* Release the tuple in the lowest parent's dedicated slot. */ + if (slot == myslot) + ExecClearTuple(myslot); + + MemoryContextSwitchTo(oldcxt); + ecxt->ecxt_scantuple = ecxt_scantuple_old; + return result; } else { - /* move down one level */ - dispatch = pd[-dispatch->indexes[cur_index]]; + /* + * Partition is a sub-partitioned table; get the PartitionDispatch + */ + if (likely(dispatch->indexes[partidx] >= 0)) + { + /* Already built. */ + Assert(dispatch->indexes[partidx] < proute->num_dispatch); + dispatch = pd[dispatch->indexes[partidx]]; + } + else + { + /* Not yet built. Do that now. */ + PartitionDispatch subdispatch; + + MemoryContextSwitchTo(oldcxt); + subdispatch = ExecInitPartitionDispatchInfo(proute, + partdesc->oids[partidx], + dispatch, partidx); + MemoryContextSwitchTo(GetPerTupleMemoryContext(estate)); + Assert(dispatch->indexes[partidx] >= 0 && + dispatch->indexes[partidx] < proute->num_dispatch); + dispatch = subdispatch; + } /* * Release the dedicated slot, if it was used. Create a copy of @@ -287,58 +348,122 @@ ExecFindPartition(ResultRelInfo *resultRelInfo, PartitionDispatch *pd, } } } +} - /* Release the tuple in the lowest parent's dedicated slot. */ - if (slot == myslot) - ExecClearTuple(myslot); +/* + * ExecHashSubPlanResultRelsByOid + * Build a hash table to allow fast lookups of subplan ResultRelInfos by + * partition Oid. We also populate the subplan ResultRelInfo with an + * ri_PartitionRoot. + */ +static void +ExecHashSubPlanResultRelsByOid(ModifyTableState *mtstate, + PartitionTupleRouting *proute) +{ + ModifyTable *node = (ModifyTable *) mtstate->ps.plan; + ResultRelInfo *subplan_result_rels; + HASHCTL ctl; + HTAB *htab; + int nsubplans; + int i; - /* A partition was not found. */ - if (result < 0) + subplan_result_rels = mtstate->resultRelInfo; + nsubplans = list_length(node->plans); + + memset(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(ResultRelInfo **); + ctl.hcxt = CurrentMemoryContext; + + htab = hash_create("PartitionTupleRouting table", nsubplans, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + proute->subplan_resultrel_hash = htab; + + /* Hash all subplans by their Oid */ + for (i = 0; i < nsubplans; i++) { - char *val_desc; + ResultRelInfo *rri = &subplan_result_rels[i]; + bool found; + Oid partoid = RelationGetRelid(rri->ri_RelationDesc); + ResultRelInfo **subplanrri; - val_desc = ExecBuildSlotPartitionKeyDescription(rel, - values, isnull, 64); - Assert(OidIsValid(RelationGetRelid(rel))); - ereport(ERROR, - (errcode(ERRCODE_CHECK_VIOLATION), - errmsg("no partition of relation \"%s\" found for row", - RelationGetRelationName(rel)), - val_desc ? 
errdetail("Partition key of the failing row contains %s.", val_desc) : 0)); + subplanrri = (ResultRelInfo **) hash_search(htab, &partoid, HASH_ENTER, + &found); + + if (!found) + *subplanrri = rri; + + /* + * This is required in order to convert the partition's tuple to be + * compatible with the root partitioned table's tuple descriptor. When + * generating the per-subplan result rels, this was not set. + */ + rri->ri_PartitionRoot = proute->partition_root; + } +} + +/* + * ExecExpandRoutingArrays + * Double the size of the allocated arrays in 'proute' + */ +static void +ExecExpandRoutingArrays(PartitionTupleRouting *proute) +{ + int new_size = proute->partitions_allocsize * 2; + int old_size = proute->partitions_allocsize; + + proute->partitions_allocsize = new_size; + + proute->partitions = (ResultRelInfo **) + repalloc(proute->partitions, sizeof(ResultRelInfo *) * new_size); + + if (proute->parent_child_tupconv_maps != NULL) + { + proute->parent_child_tupconv_maps = (TupleConversionMap **) + repalloc(proute->parent_child_tupconv_maps, + sizeof(TupleConversionMap *) * new_size); + memset(&proute->parent_child_tupconv_maps[old_size], 0, + sizeof(TupleConversionMap *) * (new_size - old_size)); } - MemoryContextSwitchTo(oldcxt); - ecxt->ecxt_scantuple = ecxt_scantuple_old; - - return result; + if (proute->child_parent_tupconv_maps != NULL) + { + proute->child_parent_tupconv_maps = (TupleConversionMap **) + repalloc(proute->child_parent_tupconv_maps, + sizeof(TupleConversionMap *) * new_size); + memset(&proute->child_parent_tupconv_maps[old_size], 0, + sizeof(TupleConversionMap *) * (new_size - old_size)); + } } /* * ExecInitPartitionInfo * Initialize ResultRelInfo and other information for a partition - * - * Returns the ResultRelInfo + * and store it in the next empty slot in 'proute's partitions array and + * return the index of that element. */ -ResultRelInfo * +static int ExecInitPartitionInfo(ModifyTableState *mtstate, - ResultRelInfo *resultRelInfo, + ResultRelInfo *rootResultRelInfo, PartitionTupleRouting *proute, - EState *estate, int partidx) + EState *estate, + PartitionDispatch dispatch, int partidx) { ModifyTable *node = (ModifyTable *) mtstate->ps.plan; - Relation rootrel = resultRelInfo->ri_RelationDesc, + Relation rootrel = rootResultRelInfo->ri_RelationDesc, partrel; Relation firstResultRel = mtstate->resultRelInfo[0].ri_RelationDesc; ResultRelInfo *leaf_part_rri; MemoryContext oldContext; AttrNumber *part_attnos = NULL; bool found_whole_row; + int part_result_rel_index; /* * We locked all the partitions in ExecSetupPartitionTupleRouting * including the leaf partitions. */ - partrel = heap_open(proute->partition_oids[partidx], NoLock); + partrel = heap_open(dispatch->partdesc->oids[partidx], NoLock); /* * Keep ResultRelInfo and other information for this partition in the @@ -514,15 +639,25 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, &mtstate->ps, RelationGetDescr(partrel)); } + part_result_rel_index = proute->num_partitions++; + dispatch->indexes[partidx] = part_result_rel_index; + + /* Allocate more space in the arrays, if required */ + if (part_result_rel_index >= proute->partitions_allocsize) + ExecExpandRoutingArrays(proute); + + /* Save here for later use. */ + proute->partitions[part_result_rel_index] = leaf_part_rri; + /* Set up information needed for routing tuples to the partition. 
*/ - ExecInitRoutingInfo(mtstate, estate, proute, leaf_part_rri, partidx); + ExecInitRoutingInfo(mtstate, estate, proute, leaf_part_rri, + part_result_rel_index); /* * If there is an ON CONFLICT clause, initialize state for it. */ if (node && node->onConflictAction != ONCONFLICT_NONE) { - TupleConversionMap *map = proute->parent_child_tupconv_maps[partidx]; int firstVarno = mtstate->resultRelInfo[0].ri_RangeTableIndex; TupleDesc partrelDesc = RelationGetDescr(partrel); ExprContext *econtext = mtstate->ps.ps_ExprContext; @@ -535,7 +670,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, * list and searching for ancestry relationships to each index in the * ancestor table. */ - if (list_length(resultRelInfo->ri_onConflictArbiterIndexes) > 0) + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) > 0) { List *childIdxs; @@ -548,7 +683,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, ListCell *lc2; ancestors = get_partition_ancestors(childIdx); - foreach(lc2, resultRelInfo->ri_onConflictArbiterIndexes) + foreach(lc2, rootResultRelInfo->ri_onConflictArbiterIndexes) { if (list_member_oid(ancestors, lfirst_oid(lc2))) arbiterIndexes = lappend_oid(arbiterIndexes, childIdx); @@ -562,7 +697,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, * (This shouldn't happen, since arbiter index selection should not * pick up an invalid index.) */ - if (list_length(resultRelInfo->ri_onConflictArbiterIndexes) != + if (list_length(rootResultRelInfo->ri_onConflictArbiterIndexes) != list_length(arbiterIndexes)) elog(ERROR, "invalid arbiter index list"); leaf_part_rri->ri_onConflictArbiterIndexes = arbiterIndexes; @@ -572,8 +707,12 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, */ if (node->onConflictAction == ONCONFLICT_UPDATE) { + TupleConversionMap *map; + + map = PartitionTupRoutingGetToChildMap(proute, part_result_rel_index); + Assert(node->onConflictSet != NIL); - Assert(resultRelInfo->ri_onConflict != NULL); + Assert(rootResultRelInfo->ri_onConflict != NULL); /* * If the partition's tuple descriptor matches exactly the root @@ -582,7 +721,7 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, * need to create state specific to this partition. */ if (map == NULL) - leaf_part_rri->ri_onConflict = resultRelInfo->ri_onConflict; + leaf_part_rri->ri_onConflict = rootResultRelInfo->ri_onConflict; else { List *onconflset; @@ -673,12 +812,9 @@ ExecInitPartitionInfo(ModifyTableState *mtstate, } } - Assert(proute->partitions[partidx] == NULL); - proute->partitions[partidx] = leaf_part_rri; - MemoryContextSwitchTo(oldContext); - return leaf_part_rri; + return part_result_rel_index; } /* @@ -693,6 +829,7 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, int partidx) { MemoryContext oldContext; + TupleConversionMap *map; /* * Switch into per-query memory context. @@ -703,10 +840,52 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, * Set up a tuple conversion map to convert a tuple routed to the * partition from the parent's type to the partition's. 
*/ - proute->parent_child_tupconv_maps[partidx] = - convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_PartitionRoot), - RelationGetDescr(partRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); + map = convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_PartitionRoot), + RelationGetDescr(partRelInfo->ri_RelationDesc), + gettext_noop("could not convert row type")); + + if (map) + { + /* Allocate parent child map array only if we need to store a map */ + if (proute->parent_child_tupconv_maps == NULL) + { + int size; + + size = proute->partitions_allocsize; + proute->parent_child_tupconv_maps = (TupleConversionMap **) + palloc0(sizeof(TupleConversionMap *) * size); + } + + proute->parent_child_tupconv_maps[partidx] = map; + } + + /* + * Also, if transition capture is required, store a map to convert tuples + * from partition's rowtype to the parent's. + */ + if (mtstate && + (mtstate->mt_transition_capture || mtstate->mt_oc_transition_capture)) + { + map = + convert_tuples_by_name(RelationGetDescr(partRelInfo->ri_RelationDesc), + RelationGetDescr(partRelInfo->ri_PartitionRoot), + gettext_noop("could not convert row type")); + + /* Allocate child parent map array only if we need to store a map */ + if (map) + { + if (proute->child_parent_tupconv_maps == NULL) + { + int size; + + size = proute->partitions_allocsize; + proute->child_parent_tupconv_maps = (TupleConversionMap **) + palloc0(sizeof(TupleConversionMap *) * size); + } + + proute->child_parent_tupconv_maps[partidx] = map; + } + } /* * If the partition is a foreign table, let the FDW init itself for @@ -722,67 +901,82 @@ ExecInitRoutingInfo(ModifyTableState *mtstate, } /* - * ExecSetupChildParentMapForLeaf -- Initialize the per-leaf-partition - * child-to-root tuple conversion map array. + * ExecInitPartitionDispatchInfo + * Initialize PartitionDispatch for a partitioned table * - * This map is required for capturing transition tuples when the target table - * is a partitioned table. For a tuple that is routed by an INSERT or UPDATE, - * we need to convert it from the leaf partition to the target table - * descriptor. + * This also stores it in the proute->partition_dispatch_info array at the + * specified index ('partidx'), possibly expanding the array if there isn't + * enough space left in it. */ -void -ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute) +static PartitionDispatch +ExecInitPartitionDispatchInfo(PartitionTupleRouting *proute, Oid partoid, + PartitionDispatch parent_pd, int partidx) { - Assert(proute != NULL); + Relation rel; + PartitionDesc partdesc; + PartitionDispatch pd; + int dispatchidx; + + if (partoid != RelationGetRelid(proute->partition_root)) + rel = heap_open(partoid, NoLock); + else + rel = proute->partition_root; + partdesc = RelationGetPartitionDesc(rel); + + pd = (PartitionDispatch) palloc(offsetof(PartitionDispatchData, indexes) + + (partdesc->nparts * sizeof(int))); + pd->reldesc = rel; + pd->key = RelationGetPartitionKey(rel); + pd->keystate = NIL; + pd->partdesc = partdesc; + if (parent_pd != NULL) + { + TupleDesc tupdesc = RelationGetDescr(rel); + + /* + * For every partitioned table other than the root, we must store a + * tuple table slot initialized with its tuple descriptor and a tuple + * conversion map to convert a tuple from its parent's rowtype to its + * own. That is to make sure that we are looking at the correct row + * using the correct tuple descriptor when computing its partition key + * for tuple routing. 
+ */ + pd->tupslot = MakeSingleTupleTableSlot(tupdesc); + pd->tupmap = + convert_tuples_by_name(RelationGetDescr(parent_pd->reldesc), + tupdesc, + gettext_noop("could not convert row type")); + } + else + { + /* Not required for the root partitioned table */ + pd->tupslot = NULL; + pd->tupmap = NULL; + } /* - * These array elements get filled up with maps on an on-demand basis. - * Initially just set all of them to NULL. + * Initialize with -1 to signify that the corresponding partition's + * ResultRelInfo or PartitionDispatch has not been created yet. */ - proute->child_parent_tupconv_maps = - (TupleConversionMap **) palloc0(sizeof(TupleConversionMap *) * - proute->num_partitions); + memset(pd->indexes, -1, sizeof(int) * partdesc->nparts); - /* Same is the case for this array. All the values are set to false */ - proute->child_parent_map_not_required = - (bool *) palloc0(sizeof(bool) * proute->num_partitions); -} + dispatchidx = proute->num_dispatch++; + if (parent_pd) + parent_pd->indexes[partidx] = dispatchidx; + if (dispatchidx >= proute->dispatch_allocsize) + { + /* Expand allocated space. */ + proute->dispatch_allocsize *= 2; + proute->partition_dispatch_info = (PartitionDispatchData **) + repalloc(proute->partition_dispatch_info, + sizeof(PartitionDispatchData *) * + proute->dispatch_allocsize); + } -/* - * TupConvMapForLeaf -- Get the tuple conversion map for a given leaf partition - * index. - */ -TupleConversionMap * -TupConvMapForLeaf(PartitionTupleRouting *proute, - ResultRelInfo *rootRelInfo, int leaf_index) -{ - ResultRelInfo **resultRelInfos = proute->partitions; - TupleConversionMap **map; - TupleDesc tupdesc; + /* Save here for later use. */ + proute->partition_dispatch_info[dispatchidx] = pd; - /* Don't call this if we're not supposed to be using this type of map. */ - Assert(proute->child_parent_tupconv_maps != NULL); - - /* If it's already known that we don't need a map, return NULL. */ - if (proute->child_parent_map_not_required[leaf_index]) - return NULL; - - /* If we've already got a map, return it. */ - map = &proute->child_parent_tupconv_maps[leaf_index]; - if (*map != NULL) - return *map; - - /* No map yet; try to create one. */ - tupdesc = RelationGetDescr(resultRelInfos[leaf_index]->ri_RelationDesc); - *map = - convert_tuples_by_name(tupdesc, - RelationGetDescr(rootRelInfo->ri_RelationDesc), - gettext_noop("could not convert row type")); - - /* If it turns out no map is needed, remember for next time. */ - proute->child_parent_map_not_required[leaf_index] = (*map == NULL); - - return *map; + return pd; } /* @@ -827,8 +1021,8 @@ void ExecCleanupTupleRouting(ModifyTableState *mtstate, PartitionTupleRouting *proute) { + HTAB *resultrel_hash = proute->subplan_resultrel_hash; int i; - int subplan_index = 0; /* * Remember, proute->partition_dispatch_info[0] corresponds to the root @@ -849,10 +1043,6 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate, { ResultRelInfo *resultRelInfo = proute->partitions[i]; - /* skip further processsing for uninitialized partitions */ - if (resultRelInfo == NULL) - continue; - /* Allow any FDWs to shut down if they've been exercised */ if (resultRelInfo->ri_PartitionReadyForRouting && resultRelInfo->ri_FdwRoutine != NULL && @@ -861,21 +1051,19 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate, resultRelInfo); /* - * If this result rel is one of the UPDATE subplan result rels, let - * ExecEndPlan() close it. For INSERT or COPY, - * proute->subplan_partition_offsets will always be NULL. 
Note that - * the subplan_partition_offsets array and the partitions array have - * the partitions in the same order. So, while we iterate over - * partitions array, we also iterate over the - * subplan_partition_offsets array in order to figure out which of the - * result rels are present in the UPDATE subplans. + * Check if this result rel is one belonging to the node's subplans, + * if so, let ExecEndPlan() clean it up. */ - if (proute->subplan_partition_offsets && - subplan_index < proute->num_subplan_partition_offsets && - proute->subplan_partition_offsets[subplan_index] == i) + if (resultrel_hash) { - subplan_index++; - continue; + Oid partoid; + bool found; + + partoid = RelationGetRelid(resultRelInfo->ri_RelationDesc); + + (void) hash_search(resultrel_hash, &partoid, HASH_FIND, &found); + if (found) + continue; } ExecCloseIndices(resultRelInfo); @@ -889,144 +1077,6 @@ ExecCleanupTupleRouting(ModifyTableState *mtstate, ExecDropSingleTupleTableSlot(proute->partition_tuple_slot); } -/* - * RelationGetPartitionDispatchInfo - * Returns information necessary to route tuples down a partition tree - * - * The number of elements in the returned array (that is, the number of - * PartitionDispatch objects for the partitioned tables in the partition tree) - * is returned in *num_parted and a list of the OIDs of all the leaf - * partitions of rel is returned in *leaf_part_oids. - * - * All the relations in the partition tree (including 'rel') must have been - * locked (using at least the AccessShareLock) by the caller. - */ -static PartitionDispatch * -RelationGetPartitionDispatchInfo(Relation rel, - int *num_parted, List **leaf_part_oids) -{ - List *pdlist = NIL; - PartitionDispatchData **pd; - ListCell *lc; - int i; - - Assert(rel->rd_rel->relkind == RELKIND_PARTITIONED_TABLE); - - *num_parted = 0; - *leaf_part_oids = NIL; - - get_partition_dispatch_recurse(rel, NULL, &pdlist, leaf_part_oids); - *num_parted = list_length(pdlist); - pd = (PartitionDispatchData **) palloc(*num_parted * - sizeof(PartitionDispatchData *)); - i = 0; - foreach(lc, pdlist) - { - pd[i++] = lfirst(lc); - } - - return pd; -} - -/* - * get_partition_dispatch_recurse - * Recursively expand partition tree rooted at rel - * - * As the partition tree is expanded in a depth-first manner, we maintain two - * global lists: of PartitionDispatch objects corresponding to partitioned - * tables in *pds and of the leaf partition OIDs in *leaf_part_oids. - * - * Note that the order of OIDs of leaf partitions in leaf_part_oids matches - * the order in which the planner's expand_partitioned_rtentry() processes - * them. It's not necessarily the case that the offsets match up exactly, - * because constraint exclusion might prune away some partitions on the - * planner side, whereas we'll always have the complete list; but unpruned - * partitions will appear in the same order in the plan as they are returned - * here. - */ -static void -get_partition_dispatch_recurse(Relation rel, Relation parent, - List **pds, List **leaf_part_oids) -{ - TupleDesc tupdesc = RelationGetDescr(rel); - PartitionDesc partdesc = RelationGetPartitionDesc(rel); - PartitionKey partkey = RelationGetPartitionKey(rel); - PartitionDispatch pd; - int i; - - check_stack_depth(); - - /* Build a PartitionDispatch for this table and add it to *pds. 
*/ - pd = (PartitionDispatch) palloc(sizeof(PartitionDispatchData)); - *pds = lappend(*pds, pd); - pd->reldesc = rel; - pd->key = partkey; - pd->keystate = NIL; - pd->partdesc = partdesc; - if (parent != NULL) - { - /* - * For every partitioned table other than the root, we must store a - * tuple table slot initialized with its tuple descriptor and a tuple - * conversion map to convert a tuple from its parent's rowtype to its - * own. That is to make sure that we are looking at the correct row - * using the correct tuple descriptor when computing its partition key - * for tuple routing. - */ - pd->tupslot = MakeSingleTupleTableSlot(tupdesc); - pd->tupmap = convert_tuples_by_name(RelationGetDescr(parent), - tupdesc, - gettext_noop("could not convert row type")); - } - else - { - /* Not required for the root partitioned table */ - pd->tupslot = NULL; - pd->tupmap = NULL; - } - - /* - * Go look at each partition of this table. If it's a leaf partition, - * simply add its OID to *leaf_part_oids. If it's a partitioned table, - * recursively call get_partition_dispatch_recurse(), so that its - * partitions are processed as well and a corresponding PartitionDispatch - * object gets added to *pds. - * - * The 'indexes' array is used when searching for a partition matching a - * given tuple. The actual value we store here depends on whether the - * array element belongs to a leaf partition or a subpartitioned table. - * For leaf partitions we store the index into *leaf_part_oids, and for - * sub-partitioned tables we store a negative version of the index into - * the *pds list. Both indexes are 0-based, but the first element of the - * *pds list is the root partition, so 0 always means the first leaf. When - * searching, if we see a negative value, the search must continue in the - * corresponding sub-partition; otherwise, we've identified the correct - * partition. - */ - pd->indexes = (int *) palloc(partdesc->nparts * sizeof(int)); - for (i = 0; i < partdesc->nparts; i++) - { - Oid partrelid = partdesc->oids[i]; - - if (get_rel_relkind(partrelid) != RELKIND_PARTITIONED_TABLE) - { - *leaf_part_oids = lappend_oid(*leaf_part_oids, partrelid); - pd->indexes[i] = list_length(*leaf_part_oids) - 1; - } - else - { - /* - * We assume all tables in the partition tree were already locked - * by the caller. 
- */ - Relation partrel = heap_open(partrelid, NoLock); - - pd->indexes[i] = -list_length(*pds); - get_partition_dispatch_recurse(partrel, rel, pds, leaf_part_oids); - } - } -} - /* ---------------- * FormPartitionKeyDatum * Construct values[] and isnull[] arrays for the partition key diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index d8d89c7983..365b4fd6f9 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -68,7 +68,6 @@ static TupleTableSlot *ExecPrepareTupleRouting(ModifyTableState *mtstate, ResultRelInfo *targetRelInfo, TupleTableSlot *slot); static ResultRelInfo *getTargetResultRelInfo(ModifyTableState *node); -static void ExecSetupChildParentMapForTcs(ModifyTableState *mtstate); static void ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate); static TupleConversionMap *tupconv_map_for_subplan(ModifyTableState *node, int whichplan); @@ -1667,7 +1666,7 @@ ExecSetupTransitionCaptureState(ModifyTableState *mtstate, EState *estate) if (mtstate->mt_transition_capture != NULL || mtstate->mt_oc_transition_capture != NULL) { - ExecSetupChildParentMapForTcs(mtstate); + ExecSetupChildParentMapForSubplan(mtstate); /* * Install the conversion map for the first plan for UPDATE and DELETE @@ -1710,21 +1709,13 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, * value is to be used as an index into the arrays for the ResultRelInfo * and TupleConversionMap for the partition. */ - partidx = ExecFindPartition(targetRelInfo, - proute->partition_dispatch_info, - slot, - estate); + partidx = ExecFindPartition(mtstate, targetRelInfo, proute, slot, estate); Assert(partidx >= 0 && partidx < proute->num_partitions); - /* - * Get the ResultRelInfo corresponding to the selected partition; if not - * yet there, initialize it. - */ + Assert(proute->partitions[partidx] != NULL); + /* Get the ResultRelInfo corresponding to the selected partition. */ partrel = proute->partitions[partidx]; - if (partrel == NULL) - partrel = ExecInitPartitionInfo(mtstate, targetRelInfo, - proute, estate, - partidx); + Assert(partrel != NULL); /* * Check whether the partition is routable if we didn't yet @@ -1769,7 +1760,7 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, */ mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL; mtstate->mt_transition_capture->tcs_map = - TupConvMapForLeaf(proute, targetRelInfo, partidx); + PartitionTupRoutingGetToParentMap(proute, partidx); } else { @@ -1784,16 +1775,14 @@ ExecPrepareTupleRouting(ModifyTableState *mtstate, if (mtstate->mt_oc_transition_capture != NULL) { mtstate->mt_oc_transition_capture->tcs_map = - TupConvMapForLeaf(proute, targetRelInfo, partidx); + PartitionTupRoutingGetToParentMap(proute, partidx); } /* * Convert the tuple, if necessary. */ - ConvertPartitionTupleSlot(proute->parent_child_tupconv_maps[partidx], - tuple, - proute->partition_tuple_slot, - &slot, + ConvertPartitionTupleSlot(PartitionTupRoutingGetToChildMap(proute, partidx), + tuple, proute->partition_tuple_slot, &slot, true); /* Initialize information needed to handle ON CONFLICT DO UPDATE. */ @@ -1831,17 +1820,6 @@ ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate) int i; /* - * First check if there is already a per-subplan array allocated. Even if - * there is already a per-leaf map array, we won't require a per-subplan - * one, since we will use the subplan offset array to convert the subplan - * index to per-leaf index. 
- */ - if (mtstate->mt_per_subplan_tupconv_maps || - (mtstate->mt_partition_tuple_routing && - mtstate->mt_partition_tuple_routing->child_parent_tupconv_maps)) - return; - - /* * Build array of conversion maps from each child's TupleDesc to the one * used in the target relation. The map pointers may be NULL when no * conversion is necessary, which is hopefully a common case. @@ -1863,78 +1841,17 @@ ExecSetupChildParentMapForSubplan(ModifyTableState *mtstate) } /* - * Initialize the child-to-root tuple conversion map array required for - * capturing transition tuples. - * - * The map array can be indexed either by subplan index or by leaf-partition - * index. For transition tables, we need a subplan-indexed access to the map, - * and where tuple-routing is present, we also require a leaf-indexed access. - */ -static void -ExecSetupChildParentMapForTcs(ModifyTableState *mtstate) -{ - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; - - /* - * If partition tuple routing is set up, we will require partition-indexed - * access. In that case, create the map array indexed by partition; we - * will still be able to access the maps using a subplan index by - * converting the subplan index to a partition index using - * subplan_partition_offsets. If tuple routing is not set up, it means we - * don't require partition-indexed access. In that case, create just a - * subplan-indexed map. - */ - if (proute) - { - /* - * If a partition-indexed map array is to be created, the subplan map - * array has to be NULL. If the subplan map array is already created, - * we won't be able to access the map using a partition index. - */ - Assert(mtstate->mt_per_subplan_tupconv_maps == NULL); - - ExecSetupChildParentMapForLeaf(proute); - } - else - ExecSetupChildParentMapForSubplan(mtstate); -} - -/* * For a given subplan index, get the tuple conversion map. */ static TupleConversionMap * tupconv_map_for_subplan(ModifyTableState *mtstate, int whichplan) { - /* - * If a partition-index tuple conversion map array is allocated, we need - * to first get the index into the partition array. Exactly *one* of the - * two arrays is allocated. This is because if there is a partition array - * required, we don't require subplan-indexed array since we can translate - * subplan index into partition index. And, we create a subplan-indexed - * array *only* if partition-indexed array is not required. - */ + /* If nobody else set the per-subplan array of maps, do so ourselves. */ if (mtstate->mt_per_subplan_tupconv_maps == NULL) - { - int leaf_index; - PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing; + ExecSetupChildParentMapForSubplan(mtstate); - /* - * If subplan-indexed array is NULL, things should have been arranged - * to convert the subplan index to partition index. 
- */ - Assert(proute && proute->subplan_partition_offsets != NULL && - whichplan < proute->num_subplan_partition_offsets); - - leaf_index = proute->subplan_partition_offsets[whichplan]; - - return TupConvMapForLeaf(proute, getTargetResultRelInfo(mtstate), - leaf_index); - } - else - { - Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); - return mtstate->mt_per_subplan_tupconv_maps[whichplan]; - } + Assert(whichplan >= 0 && whichplan < mtstate->mt_nplans); + return mtstate->mt_per_subplan_tupconv_maps[whichplan]; } /* ---------------------------------------------------------------- diff --git a/src/backend/utils/cache/partcache.c b/src/backend/utils/cache/partcache.c index 115a9fe78f..82acfeb460 100644 --- a/src/backend/utils/cache/partcache.c +++ b/src/backend/utils/cache/partcache.c @@ -594,6 +594,7 @@ RelationBuildPartitionDesc(Relation rel) int next_index = 0; result->oids = (Oid *) palloc0(nparts * sizeof(Oid)); + result->is_leaf = (bool *) palloc(nparts * sizeof(bool)); boundinfo = (PartitionBoundInfoData *) palloc0(sizeof(PartitionBoundInfoData)); @@ -782,7 +783,15 @@ RelationBuildPartitionDesc(Relation rel) * defined by canonicalized representation of the partition bounds. */ for (i = 0; i < nparts; i++) - result->oids[mapping[i]] = oids[i]; + { + int index = mapping[i]; + + result->oids[index] = oids[i]; + /* Record if the partition is a leaf partition */ + result->is_leaf[index] = + (get_rel_relkind(oids[i]) != RELKIND_PARTITIONED_TABLE); + } + pfree(mapping); } diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h index 1f49e5d3a9..8a639b8b7d 100644 --- a/src/include/catalog/partition.h +++ b/src/include/catalog/partition.h @@ -26,7 +26,11 @@ typedef struct PartitionDescData { int nparts; /* Number of partitions */ - Oid *oids; /* OIDs of partitions */ + Oid *oids; /* Array of 'nparts' elements containing + * partition OIDs in order of the their bounds */ + bool *is_leaf; /* Array of 'nparts' elements storing whether + * the corresponding 'oids' element belongs to + * a leaf partition or not */ PartitionBoundInfo boundinfo; /* collection of partition bounds */ } PartitionDescData; diff --git a/src/include/executor/execPartition.h b/src/include/executor/execPartition.h index f6cd842cc9..7370e24b1c 100644 --- a/src/include/executor/execPartition.h +++ b/src/include/executor/execPartition.h @@ -31,9 +31,13 @@ * tupmap TupleConversionMap to convert from the parent's rowtype to * this table's rowtype (when extracting the partition key of a * tuple just before routing it through this table) - * indexes Array with partdesc->nparts members (for details on what - * individual members represent, see how they are set in - * get_partition_dispatch_recurse()) + * indexes Array with partdesc->nparts elements. For leaf partitions the + * index into the PartitionTupleRouting->partitions array is + * stored. When the partition is itself a partitioned table then + * we store the index into + * PartitionTupleRouting->partition_dispatch_info. -1 means + * we've not yet allocated anything in PartitionTupleRouting for + * the partition. 
*----------------------- */ typedef struct PartitionDispatchData @@ -44,75 +48,122 @@ typedef struct PartitionDispatchData PartitionDesc partdesc; TupleTableSlot *tupslot; TupleConversionMap *tupmap; - int *indexes; + int indexes[FLEXIBLE_ARRAY_MEMBER]; } PartitionDispatchData; typedef struct PartitionDispatchData *PartitionDispatch; /*----------------------- - * PartitionTupleRouting - Encapsulates all information required to execute - * tuple-routing between partitions. + * PartitionTupleRouting - Encapsulates all information required to + * route a tuple inserted into a partitioned table to one of its leaf + * partitions * - * partition_dispatch_info Array of PartitionDispatch objects with one - * entry for every partitioned table in the - * partition tree. - * num_dispatch number of partitioned tables in the partition - * tree (= length of partition_dispatch_info[]) - * partition_oids Array of leaf partitions OIDs with one entry - * for every leaf partition in the partition tree, - * initialized in full by - * ExecSetupPartitionTupleRouting. - * partitions Array of ResultRelInfo* objects with one entry - * for every leaf partition in the partition tree, - * initialized lazily by ExecInitPartitionInfo. - * num_partitions Number of leaf partitions in the partition tree - * (= 'partitions_oid'/'partitions' array length) - * parent_child_tupconv_maps Array of TupleConversionMap objects with one - * entry for every leaf partition (required to - * convert tuple from the root table's rowtype to - * a leaf partition's rowtype after tuple routing - * is done) - * child_parent_tupconv_maps Array of TupleConversionMap objects with one - * entry for every leaf partition (required to - * convert an updated tuple from the leaf - * partition's rowtype to the root table's rowtype - * so that tuple routing can be done) - * child_parent_map_not_required Array of bool. True value means that a map is - * determined to be not required for the given - * partition. False means either we haven't yet - * checked if a map is required, or it was - * determined to be required. - * subplan_partition_offsets Integer array ordered by UPDATE subplans. Each - * element of this array has the index into the - * corresponding partition in partitions array. - * num_subplan_partition_offsets Length of 'subplan_partition_offsets' array - * partition_tuple_slot TupleTableSlot to be used to manipulate any - * given leaf partition's rowtype after that - * partition is chosen for insertion by - * tuple-routing. - * root_tuple_slot TupleTableSlot to be used to transiently hold - * copy of a tuple that's being moved across - * partitions in the root partitioned table's - * rowtype + * partition_root The partitioned table that's the target of the + * command. + * + * partition_dispatch_info Array of 'dispatch_allocsize' elements containing + * a pointer to a PartitionDispatch objects for every + * partitioned table touched by tuple routing. The + * entry for the target partitioned table is *always* + * present as the first entry of this array. See + * comment for PartitionDispatchData->indexes for + * details on how this array is indexed. + * + * num_dispatch The current number of items stored in the + * 'partition_dispatch_info' array. Also serves as + * the index of the next free array element for new + * PartitionDispatch which need to be stored. + * + * dispatch_allocsize The current allocated size of the + * 'partition_dispatch_info' array. 
+ * + * partitions Array of 'partitions_allocsize' elements + * containing pointers to a ResultRelInfos of all + * leaf partitions touched by tuple routing. Some of + * these are pointers to ResultRelInfos which are + * borrowed out of 'subplan_resultrel_hash'. The + * remainder have been built especially for tuple + * routing. See comment for + * PartitionDispatchData->indexes for details on how + * this array is indexed. + * + * num_partitions The current number of items stored in the + * 'partitions' array. Also serves as the index of + * the next free array element for new ResultRelInfos + * which need to be stored. + * + * partitions_allocsize The current allocated size of the 'partitions' + * array. Also, if they're non-NULL, marks the size + * of the 'parent_child_tupconv_maps', + * 'child_parent_tupconv_maps' and + * 'child_parent_map_not_required' arrays. + * + * parent_child_tupconv_maps Array of partitions_allocsize elements + * containing information on how to convert tuples of + * partition_root's rowtype to the rowtype of the + * corresponding partition as stored in 'partitions', + * or NULL if no conversion is required. The entire + * array is only allocated when the first conversion + * map needs to stored. When not allocated it's set + * to NULL. + * + * partition_tuple_slot This is a tuple slot used to store a tuple using + * rowtype of the partition chosen by tuple + * routing. Maintained separately because partitions + * may have different rowtype. + * + * child_parent_tupconv_maps As 'parent_child_tupconv_maps' but stores + * conversion maps to translate partition tuples into + * partition_root's rowtype, needed if transition + * capture is active + * + * Note: The following fields are used only when UPDATE ends up needing to + * do tuple routing. + * + * subplan_resultrel_hash Hash table to store subplan ResultRelInfos by Oid. + * This is used to cache ResultRelInfos from subplans + * of a ModifyTable node. Some of these may be + * useful for tuple routing to save having to build + * duplicates. + * + * root_tuple_slot During UPDATE tuple routing, this tuple slot is + * used to transiently store a tuple using the root + * table's rowtype after converting it from the + * tuple's source leaf partition's rowtype. That is, + * if leaf partition's rowtype is different. *----------------------- */ typedef struct PartitionTupleRouting { + Relation partition_root; PartitionDispatch *partition_dispatch_info; int num_dispatch; - Oid *partition_oids; + int dispatch_allocsize; ResultRelInfo **partitions; int num_partitions; + int partitions_allocsize; TupleConversionMap **parent_child_tupconv_maps; TupleConversionMap **child_parent_tupconv_maps; - bool *child_parent_map_not_required; - int *subplan_partition_offsets; - int num_subplan_partition_offsets; - TupleTableSlot *partition_tuple_slot; + HTAB *subplan_resultrel_hash; TupleTableSlot *root_tuple_slot; + TupleTableSlot *partition_tuple_slot; } PartitionTupleRouting; /* + * Accessor macros for tuple conversion maps contained in + * PartitionTupleRouting. Beware of multiple evaluations of p! + */ +#define PartitionTupRoutingGetToParentMap(p, i) \ + ((p)->child_parent_tupconv_maps != NULL ? \ + (p)->child_parent_tupconv_maps[(i)] : \ + NULL) + +#define PartitionTupRoutingGetToChildMap(p, i) \ + ((p)->parent_child_tupconv_maps != NULL ? \ + (p)->parent_child_tupconv_maps[(i)] : \ + NULL) + +/* * PartitionedRelPruningData - Per-partitioned-table data for run-time pruning * of partitions. 
For a multilevel partitioned table, we have one of these * for the topmost partition plus one for each non-leaf child partition. @@ -200,22 +251,20 @@ typedef struct PartitionPruneState extern PartitionTupleRouting *ExecSetupPartitionTupleRouting(ModifyTableState *mtstate, Relation rel); -extern int ExecFindPartition(ResultRelInfo *resultRelInfo, - PartitionDispatch *pd, +extern int ExecFindPartition(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + PartitionTupleRouting *proute, TupleTableSlot *slot, EState *estate); -extern ResultRelInfo *ExecInitPartitionInfo(ModifyTableState *mtstate, - ResultRelInfo *resultRelInfo, - PartitionTupleRouting *proute, - EState *estate, int partidx); +extern ResultRelInfo *ExecGetPartitionInfo(ModifyTableState *mtstate, + ResultRelInfo *resultRelInfo, + PartitionTupleRouting *proute, + EState *estate, int partidx); extern void ExecInitRoutingInfo(ModifyTableState *mtstate, EState *estate, PartitionTupleRouting *proute, ResultRelInfo *partRelInfo, int partidx); -extern void ExecSetupChildParentMapForLeaf(PartitionTupleRouting *proute); -extern TupleConversionMap *TupConvMapForLeaf(PartitionTupleRouting *proute, - ResultRelInfo *rootRelInfo, int leaf_index); extern HeapTuple ConvertPartitionTupleSlot(TupleConversionMap *map, HeapTuple tuple, TupleTableSlot *new_slot, diff --git a/src/test/regress/expected/insert_conflict.out b/src/test/regress/expected/insert_conflict.out index 27cf5a01b3..6b841c7850 100644 --- a/src/test/regress/expected/insert_conflict.out +++ b/src/test/regress/expected/insert_conflict.out @@ -904,4 +904,26 @@ select * from parted_conflict order by a; 50 | cincuenta | 2 (1 row) +-- test with statement level triggers +create or replace function parted_conflict_update_func() returns trigger as $$ +declare + r record; +begin + for r in select * from inserted loop + raise notice 'a = %, b = %, c = %', r.a, r.b, r.c; + end loop; + return new; +end; +$$ language plpgsql; +create trigger parted_conflict_update + after update on parted_conflict + referencing new table as inserted + for each statement + execute procedure parted_conflict_update_func(); +truncate parted_conflict; +insert into parted_conflict values (0, 'cero', 1); +insert into parted_conflict values(0, 'cero', 1) + on conflict (a,b) do update set c = parted_conflict.c + 1; +NOTICE: a = 0, b = cero, c = 2 drop table parted_conflict; +drop function parted_conflict_update_func(); diff --git a/src/test/regress/sql/insert_conflict.sql b/src/test/regress/sql/insert_conflict.sql index c677d70fb7..fe6dcfaa06 100644 --- a/src/test/regress/sql/insert_conflict.sql +++ b/src/test/regress/sql/insert_conflict.sql @@ -576,4 +576,30 @@ insert into parted_conflict values (50, 'cincuenta', 2) -- should see (50, 'cincuenta', 2) select * from parted_conflict order by a; +-- test with statement level triggers +create or replace function parted_conflict_update_func() returns trigger as $$ +declare + r record; +begin + for r in select * from inserted loop + raise notice 'a = %, b = %, c = %', r.a, r.b, r.c; + end loop; + return new; +end; +$$ language plpgsql; + +create trigger parted_conflict_update + after update on parted_conflict + referencing new table as inserted + for each statement + execute procedure parted_conflict_update_func(); + +truncate parted_conflict; + +insert into parted_conflict values (0, 'cero', 1); + +insert into parted_conflict values(0, 'cero', 1) + on conflict (a,b) do update set c = parted_conflict.c + 1; + drop table parted_conflict; +drop function 
parted_conflict_update_func();
-- 
2.11.0
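
To see the routing fast path in isolation: the patch combines a per-parent
indexes[] array initialized to -1 (meaning "nothing built yet") with result
arrays that start at PARTITION_ROUTING_INITSIZE entries and double on demand,
which is what lets ExecFindPartition build only the ResultRelInfos it actually
touches.  Below is a minimal, self-contained C sketch of that pattern; all
names ('Routing', 'routing_find', and so on) are illustrative stand-ins, not
PostgreSQL APIs, an int stands in for a ResultRelInfo, and malloc/realloc
replace palloc/repalloc.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ROUTING_INITSIZE 8		/* mirrors PARTITION_ROUTING_INITSIZE */

typedef struct Routing
{
	int		   *partitions;		/* stand-in for the ResultRelInfo pointers */
	int			num_partitions;	/* next free slot in 'partitions' */
	int			allocsize;		/* allocated length of 'partitions' */
	int		   *indexes;		/* per-partition slot number, -1 = not built */
	int			nparts;			/* number of partitions of the parent */
} Routing;

static void
routing_init(Routing *r, int nparts)
{
	r->partitions = malloc(sizeof(int) * ROUTING_INITSIZE);
	r->num_partitions = 0;
	r->allocsize = ROUTING_INITSIZE;
	r->nparts = nparts;
	r->indexes = malloc(sizeof(int) * nparts);
	/* -1 signals that nothing has been built for this partition yet */
	memset(r->indexes, -1, sizeof(int) * nparts);
}

/* Return the slot for partition 'partidx', building it on first use */
static int
routing_find(Routing *r, int partidx)
{
	int			result;

	if (r->indexes[partidx] >= 0)
		return r->indexes[partidx];	/* fast path: already built */

	/* claim the next free slot, doubling the array if it has run out */
	result = r->num_partitions++;
	if (result >= r->allocsize)
	{
		r->allocsize *= 2;
		r->partitions = realloc(r->partitions, sizeof(int) * r->allocsize);
	}
	r->partitions[result] = partidx;	/* pretend to build a ResultRelInfo */
	r->indexes[partidx] = result;
	return result;
}

int
main(void)
{
	Routing		r;
	int			a, b, c;

	routing_init(&r, 1000);		/* a table with 1000 partitions */
	a = routing_find(&r, 7);
	b = routing_find(&r, 42);
	c = routing_find(&r, 7);	/* fast path, returns the same slot as 'a' */
	printf("slots: %d %d %d; built %d of %d, allocated %d\n",
		   a, b, c, r.num_partitions, r.nparts, r.allocsize);
	free(r.partitions);
	free(r.indexes);
	return 0;
}

Routing tuples to only two of the 1000 partitions builds exactly two entries,
which is also why the cleanup loop in ExecCleanupTupleRouting no longer has
any NULL array elements to skip.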
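Similarly, the conversion-map arrays are now allocated only once a first
non-NULL map must be stored (see ExecInitRoutingInfo), and the
PartitionTupRoutingGetToParentMap/PartitionTupRoutingGetToChildMap macros
guard against the never-allocated case.  Here is a minimal sketch of that
pattern, again with illustrative stand-ins only: 'Map' replaces
TupleConversionMap and calloc replaces palloc0.

#include <stdio.h>
#include <stdlib.h>

typedef struct Map { int dummy; } Map;

typedef struct Routing
{
	Map		  **maps;			/* NULL until a first map must be stored */
	int			allocsize;		/* length of 'maps' once allocated */
} Routing;

/* Mirrors the accessor macros: tolerate the never-allocated array */
#define RoutingGetMap(r, i) \
	((r)->maps != NULL ? (r)->maps[(i)] : NULL)

static void
routing_store_map(Routing *r, int idx, Map *map)
{
	if (map == NULL)
		return;					/* nothing to store; keep array unallocated */
	if (r->maps == NULL)
		r->maps = calloc(r->allocsize, sizeof(Map *));	/* zeroed = all NULL */
	r->maps[idx] = map;
}

int
main(void)
{
	Routing		r = {NULL, 8};
	Map			m;

	printf("%p\n", (void *) RoutingGetMap(&r, 3));	/* NULL: no array yet */
	routing_store_map(&r, 3, &m);
	printf("%p\n", (void *) RoutingGetMap(&r, 3));	/* now &m */
	free(r.maps);
	return 0;
}

If every partition's column order matches the parent's, no map is ever
stored and the array stays NULL for the life of the query, which is the
common case the patch is optimizing for.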