From 1bf84886550b8d31e8f1f8f4a46c43b14db3dd6c Mon Sep 17 00:00:00 2001
From: amit <amitlangote09@gmail.com>
Date: Fri, 19 Jul 2019 16:24:38 +0900
Subject: [PATCH v9 3/4] Rearrange partition update row movement code a bit

The block of code that does the actual moving (DELETE+INSERT) has
been moved to a function named ExecCrossPartitionUpdate() which must
be retried until it says the movement has been done or can't be done.

This also rearranges the code in ExecDelete() and ExecInsert() around
executing AFTER ROW DELETE and AFTER ROW INSERT triggers, resp.  In
the case of an update row movement, such triggers should not see the
affected tuple in their OLD/NEW transition table.
---
 src/backend/executor/nodeModifyTable.c | 347 +++++++++++++++++++--------------
 1 file changed, 199 insertions(+), 148 deletions(-)

diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index b01601578a..1362b2f2d1 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -356,7 +356,6 @@ ExecInsert(ModifyTableState *mtstate,
 	Relation	resultRelationDesc;
 	List	   *recheckIndexes = NIL;
 	TupleTableSlot *result = NULL;
-	TransitionCaptureState *ar_insert_trig_tcs;
 	ModifyTable *node = (ModifyTable *) mtstate->ps.plan;
 	OnConflictAction onconflict = node->onConflictAction;
 	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
@@ -620,31 +619,30 @@ ExecInsert(ModifyTableState *mtstate,
 	}
 
 	/*
-	 * If this insert is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition NEW TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
+	 * If the insert is a part of update row movement, put this row into the
+	 * UPDATE trigger's NEW TABLE (transition table) instead of that of an
+	 * INSERT trigger.
 	 */
-	ar_insert_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_new_table)
+	if (mtstate->operation == CMD_UPDATE &&
+		mtstate->mt_transition_capture &&
+		mtstate->mt_transition_capture->tcs_update_new_table)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo, NULL,
-							 NULL,
-							 slot,
-							 NULL,
-							 mtstate->mt_transition_capture);
+		ExecARUpdateTriggers(estate, resultRelInfo, NULL, NULL, slot,
+							 NIL, mtstate->mt_transition_capture);
 
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * INSERT trigger fired below doesn't capture it again.
+		 * Execute AFTER ROW INSERT triggers, passing NULL for the transition
+		 * capture state so that the row is not captured a second time.
 		 */
-		ar_insert_trig_tcs = NULL;
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 NULL);
+	}
+	else
+	{
+		/* AFTER ROW INSERT Triggers */
+		ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
+							 mtstate->mt_transition_capture);
 	}
-
-	/* AFTER ROW INSERT Triggers */
-	ExecARInsertTriggers(estate, resultRelInfo, slot, recheckIndexes,
-						 ar_insert_trig_tcs);
 
 	list_free(recheckIndexes);
 
@@ -710,7 +708,6 @@ ExecDelete(ModifyTableState *mtstate,
 	TM_Result	result;
 	TM_FailureData tmfd;
 	TupleTableSlot *slot = NULL;
-	TransitionCaptureState *ar_delete_trig_tcs;
 
 	if (tupleDeleted)
 		*tupleDeleted = false;
@@ -954,32 +951,30 @@ ldelete:;
 		*tupleDeleted = true;
 
 	/*
-	 * If this delete is the result of a partition key update that moved the
-	 * tuple to a new partition, put this row into the transition OLD TABLE,
-	 * if there is one. We need to do this separately for DELETE and INSERT
-	 * because they happen on different tables.
+	 * If the delete is a part of update row movement, put this row into the
+	 * UPDATE trigger's OLD TABLE (transition table) instead of that of a
+	 * DELETE trigger.
 	 */
-	ar_delete_trig_tcs = mtstate->mt_transition_capture;
-	if (mtstate->operation == CMD_UPDATE && mtstate->mt_transition_capture
-		&& mtstate->mt_transition_capture->tcs_update_old_table)
+	if (mtstate->operation == CMD_UPDATE &&
+		mtstate->mt_transition_capture &&
+		mtstate->mt_transition_capture->tcs_update_old_table)
 	{
-		ExecARUpdateTriggers(estate, resultRelInfo,
-							 tupleid,
-							 oldtuple,
-							 NULL,
-							 NULL,
-							 mtstate->mt_transition_capture);
+		ExecARUpdateTriggers(estate, resultRelInfo, tupleid, oldtuple,
+							 NULL, NIL, mtstate->mt_transition_capture);
 
 		/*
-		 * We've already captured the NEW TABLE row, so make sure any AR
-		 * DELETE trigger fired below doesn't capture it again.
+		 * Execute AFTER ROW DELETE triggers, passing NULL for the transition
+		 * capture state so that the row is not captured a second time.
 		 */
-		ar_delete_trig_tcs = NULL;
+		ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple,
+							 NULL);
+	}
+	else
+	{
+		/* AFTER ROW DELETE Triggers */
+		ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple,
+							 mtstate->mt_transition_capture);
 	}
-
-	/* AFTER ROW DELETE Triggers */
-	ExecARDeleteTriggers(estate, resultRelInfo, tupleid, oldtuple,
-						 ar_delete_trig_tcs);
 
 	/* Process RETURNING if present and if requested */
 	if (processReturning && resultRelInfo->ri_projectReturning)
@@ -1026,6 +1021,153 @@ ldelete:;
 	return NULL;
 }
 
+/*
+ *	ExecCrossPartitionUpdate
+ *		Move an updated tuple from a given partition to the correct partition
+ *		of its root parent table
+ *
+ *	This works by first deleting the tuple from the current partition,
+ *	followed by inserting it into the root parent table, that is,
+ *	mtstate->rootResultRelInfo, from where it's re-routed to the correct
+ *	partition.
+ *
+ *	Returns true if the tuple has been successfully moved, in which case
+ *	*inserted_tuple is set to ExecInsert()'s result, or if the delete did not
+ *	happen (e.g., the tuple was concurrently deleted) so there's nothing more
+ *	to do for the caller; *inserted_tuple is NULL in the latter case.
+ *
+ *	False is returned if the tuple was found to have been concurrently
+ *	updated.  Caller should check if the updated tuple returned in
+ *	*retry_slot still needs to be re-routed and call this function again if so.
+ */
+static bool
+ExecCrossPartitionUpdate(ModifyTableState *mtstate,
+						 ResultRelInfo *resultRelInfo,
+						 ItemPointer tupleid, HeapTuple oldtuple,
+						 TupleTableSlot *slot, TupleTableSlot *planSlot,
+						 EPQState *epqstate, bool canSetTag,
+						 TupleTableSlot **retry_slot,
+						 TupleTableSlot **inserted_tuple)
+{
+	EState	   *estate = mtstate->ps.state;
+	PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
+	int			map_index;
+	TupleConversionMap *tupconv_map;
+	TupleConversionMap *saved_tcs_map = NULL;
+	bool		tuple_deleted;
+	TupleTableSlot *epqslot = NULL;
+
+	*inserted_tuple = NULL;
+	*retry_slot = NULL;
+
+	/*
+	 * Disallow an INSERT ON CONFLICT DO UPDATE that causes the
+	 * original row to migrate to a different partition.  Maybe this
+	 * can be implemented some day, but it seems a fringe feature with
+	 * little redeeming value.
+	 */
+	if (((ModifyTable *) mtstate->ps.plan)->onConflictAction == ONCONFLICT_UPDATE)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("invalid ON UPDATE specification"),
+				 errdetail("The result tuple would appear in a different partition than the original tuple.")));
+
+	/*
+	 * When an UPDATE is run on a leaf partition, we will not have
+	 * partition tuple routing set up. In that case, fail with
+	 * partition constraint violation error.
+	 */
+	if (proute == NULL)
+		ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
+
+	/*
+	 * Row movement, part 1.  Delete the tuple, but skip RETURNING
+	 * processing. We want to return rows from INSERT.
+	 */
+	ExecDelete(mtstate, resultRelInfo, tupleid, oldtuple, planSlot,
+			   epqstate, estate,
+			   false,	/* processReturning */
+			   false,	/* canSetTag */
+			   true,	/* changingPart */
+			   &tuple_deleted, &epqslot);
+
+	/*
+	 * If for some reason DELETE didn't happen (e.g. trigger prevented
+	 * it, or it was already deleted by self, or it was concurrently
+	 * deleted by another transaction), then we should skip the insert
+	 * as well; otherwise, an UPDATE could cause an increase in the
+	 * total number of rows across all partitions, which is clearly
+	 * wrong.
+	 *
+	 * For a normal UPDATE, the case where the tuple has been the
+	 * subject of a concurrent UPDATE or DELETE would be handled by
+	 * the EvalPlanQual machinery, but for an UPDATE that we've
+	 * translated into a DELETE from this partition and an INSERT into
+	 * some other partition, that's not available, because CTID chains
+	 * can't span relation boundaries.  We mimic the semantics to a
+	 * limited extent by skipping the INSERT if the DELETE fails to
+	 * find a tuple. This ensures that two concurrent attempts to
+	 * UPDATE the same tuple at the same time can't turn one tuple
+	 * into two, and that an UPDATE of a just-deleted tuple can't
+	 * resurrect it.
+	 */
+	if (!tuple_deleted)
+	{
+		/*
+		 * epqslot will be typically NULL.  But when ExecDelete()
+		 * finds that another transaction has concurrently updated the
+		 * same row, it re-fetches the row, skips the delete, and
+		 * epqslot is set to the re-fetched tuple slot. In that case,
+		 * we need to do all the checks again.
+		 */
+		if (TupIsNull(epqslot))
+			return true;
+		else
+		{
+			*retry_slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
+			return false;
+		}
+	}
+
+	/*
+	 * resultRelInfo is one of the per-subplan resultRelInfos.  So we
+	 * should convert the tuple into root's tuple descriptor, since
+	 * ExecInsert() starts the search from root.  The tuple conversion
+	 * map list is in the order of mtstate->resultRelInfo[], so to
+	 * retrieve the one for this resultRel, we need to know the
+	 * position of the resultRel in mtstate->resultRelInfo[].
+	 */
+	map_index = resultRelInfo - mtstate->resultRelInfo;
+	Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
+	tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
+	if (tupconv_map != NULL)
+		slot = execute_attr_map_slot(tupconv_map->attrMap,
+									 slot,
+									 mtstate->mt_root_tuple_slot);
+
+	/*
+	 * ExecInsert() may scribble on mtstate->mt_transition_capture,
+	 * so save the currently active map.
+	 */
+	if (mtstate->mt_transition_capture)
+		saved_tcs_map = mtstate->mt_transition_capture->tcs_map;
+
+	/* Tuple routing starts from the root table. */
+	Assert(mtstate->rootResultRelInfo != NULL);
+	*inserted_tuple = ExecInsert(mtstate, mtstate->rootResultRelInfo, slot,
+								 planSlot, estate, canSetTag);
+
+	/* Clear the INSERT's tuple and restore the saved map. */
+	if (mtstate->mt_transition_capture)
+	{
+		mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
+		mtstate->mt_transition_capture->tcs_map = saved_tcs_map;
+	}
+
+	/* We're done moving. */
+	return true;
+}
+
 /* ----------------------------------------------------------------
  *		ExecUpdate
  *
@@ -1179,119 +1321,28 @@ lreplace:;
 		 */
 		if (partition_constraint_failed)
 		{
-			bool		tuple_deleted;
-			TupleTableSlot *ret_slot;
-			TupleTableSlot *epqslot = NULL;
-			PartitionTupleRouting *proute = mtstate->mt_partition_tuple_routing;
-			int			map_index;
-			TupleConversionMap *tupconv_map;
-			TupleConversionMap *saved_tcs_map = NULL;
-
-			/*
-			 * Disallow an INSERT ON CONFLICT DO UPDATE that causes the
-			 * original row to migrate to a different partition.  Maybe this
-			 * can be implemented some day, but it seems a fringe feature with
-			 * little redeeming value.
-			 */
-			if (((ModifyTable *) mtstate->ps.plan)->onConflictAction == ONCONFLICT_UPDATE)
-				ereport(ERROR,
-						(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-						 errmsg("invalid ON UPDATE specification"),
-						 errdetail("The result tuple would appear in a different partition than the original tuple.")));
-
-			/*
-			 * When an UPDATE is run on a leaf partition, we will not have
-			 * partition tuple routing set up. In that case, fail with
-			 * partition constraint violation error.
-			 */
-			if (proute == NULL)
-				ExecPartitionCheckEmitError(resultRelInfo, slot, estate);
-
-			/*
-			 * Row movement, part 1.  Delete the tuple, but skip RETURNING
-			 * processing. We want to return rows from INSERT.
-			 */
-			ExecDelete(mtstate, resultRelInfo, tupleid, oldtuple, planSlot,
-					   epqstate, estate,
-					   false,	/* processReturning */
-					   false,	/* canSetTag */
-					   true,	/* changingPart */
-					   &tuple_deleted, &epqslot);
+			TupleTableSlot *inserted_tuple,
+						   *retry_slot;
+			bool			retry;
 
 			/*
-			 * For some reason if DELETE didn't happen (e.g. trigger prevented
-			 * it, or it was already deleted by self, or it was concurrently
-			 * deleted by another transaction), then we should skip the insert
-			 * as well; otherwise, an UPDATE could cause an increase in the
-			 * total number of rows across all partitions, which is clearly
-			 * wrong.
-			 *
-			 * For a normal UPDATE, the case where the tuple has been the
-			 * subject of a concurrent UPDATE or DELETE would be handled by
-			 * the EvalPlanQual machinery, but for an UPDATE that we've
-			 * translated into a DELETE from this partition and an INSERT into
-			 * some other partition, that's not available, because CTID chains
-			 * can't span relation boundaries.  We mimic the semantics to a
-			 * limited extent by skipping the INSERT if the DELETE fails to
-			 * find a tuple. This ensures that two concurrent attempts to
-			 * UPDATE the same tuple at the same time can't turn one tuple
-			 * into two, and that an UPDATE of a just-deleted tuple can't
-			 * resurrect it.
+			 * ExecCrossPartitionUpdate will first DELETE the row from the
+			 * partition it's currently in and then insert it back into the
+			 * root table, which will re-route it to the correct partition.
+			 * The first part may have to be repeated if it is detected that
+			 * the tuple we're trying to move has been concurrently updated.
 			 */
-			if (!tuple_deleted)
-			{
-				/*
-				 * epqslot will be typically NULL.  But when ExecDelete()
-				 * finds that another transaction has concurrently updated the
-				 * same row, it re-fetches the row, skips the delete, and
-				 * epqslot is set to the re-fetched tuple slot. In that case,
-				 * we need to do all the checks again.
-				 */
-				if (TupIsNull(epqslot))
-					return NULL;
-				else
-				{
-					slot = ExecFilterJunk(resultRelInfo->ri_junkFilter, epqslot);
-					goto lreplace;
-				}
-			}
-
-			/*
-			 * resultRelInfo is one of the per-subplan resultRelInfos.  So we
-			 * should convert the tuple into root's tuple descriptor, since
-			 * ExecInsert() starts the search from root.  The tuple conversion
-			 * map list is in the order of mtstate->resultRelInfo[], so to
-			 * retrieve the one for this resultRel, we need to know the
-			 * position of the resultRel in mtstate->resultRelInfo[].
-			 */
-			map_index = resultRelInfo - mtstate->resultRelInfo;
-			Assert(map_index >= 0 && map_index < mtstate->mt_nplans);
-			tupconv_map = tupconv_map_for_subplan(mtstate, map_index);
-			if (tupconv_map != NULL)
-				slot = execute_attr_map_slot(tupconv_map->attrMap,
-											 slot,
-											 mtstate->mt_root_tuple_slot);
-
-			/*
-			 * ExecInsert() may scribble on mtstate->mt_transition_capture,
-			 * so save the currently active map.
-			 */
-			if (mtstate->mt_transition_capture)
-				saved_tcs_map = mtstate->mt_transition_capture->tcs_map;
-
-			/* Tuple routing starts from the root table. */
-			Assert(mtstate->rootResultRelInfo != NULL);
-			ret_slot = ExecInsert(mtstate, mtstate->rootResultRelInfo, slot,
-								  planSlot, estate, canSetTag);
-
-			/* Clear the INSERT's tuple and restore the saved map. */
-			if (mtstate->mt_transition_capture)
+			retry = !ExecCrossPartitionUpdate(mtstate, resultRelInfo, tupleid,
+											  oldtuple, slot, planSlot,
+											  epqstate, canSetTag,
+											  &retry_slot, &inserted_tuple);
+			if (retry)
 			{
-				mtstate->mt_transition_capture->tcs_original_insert_tuple = NULL;
-				mtstate->mt_transition_capture->tcs_map = saved_tcs_map;
+				slot = retry_slot;
+				goto lreplace;
 			}
 
-			return ret_slot;
+			return inserted_tuple;
 		}
 
 		/*
-- 
2.16.5