From cac625b22990c12a537eaa4c77434017a2303b92 Mon Sep 17 00:00:00 2001
From: amit <amitlangote09@gmail.com>
Date: Thu, 22 Dec 2016 15:33:41 +0900
Subject: [PATCH 1/5] Use correct tuple descriptor before and after routing a
 tuple

When we have a multi-level partitioned hierarchy where tables at
successive levels have different attnums for the partition key
attribute(s), we must use the tuple descriptor of the partitioned table
at the given level to inspect the appropriately converted representation
of the input tuple before computing the partition key to perform tuple
routing.

So, store in each PartitionDispatch object a standalone TupleTableSlot
initialized with the TupleDesc of the corresponding partitioned table,
along with a TupleConversionMap to map tuples from its parent's rowtype
to its own rowtype.

After the routing is done and a leaf partition is returned, we must use
the leaf partition's tuple descriptor, not the root table's, for roughly
the same reasons as above.  For the next row and so on, we must then
switch back to the root table's tuple descriptor.  To that end, a
dedicated TupleTableSlot called es_partition_tuple_slot is allocated in
EState, whose descriptor is set to that of the given leaf partition for
each input tuple after it is routed.

Reported by: n/a
Patch by: Amit Langote
Reports: n/a
---
 src/backend/catalog/partition.c        | 74 ++++++++++++++++++++++++++++------
 src/backend/commands/copy.c            | 31 +++++++++++++-
 src/backend/executor/nodeModifyTable.c | 27 +++++++++++++
 src/include/catalog/partition.h        |  7 ++++
 src/include/nodes/execnodes.h          |  3 ++
 src/test/regress/expected/insert.out   | 37 +++++++++++++++++
 src/test/regress/sql/insert.sql        | 26 ++++++++++++
 7 files changed, 190 insertions(+), 15 deletions(-)

diff --git a/src/backend/catalog/partition.c b/src/backend/catalog/partition.c
index 9980582b77..fca874752f 100644
--- a/src/backend/catalog/partition.c
+++ b/src/backend/catalog/partition.c
@@ -923,13 +923,19 @@ RelationGetPartitionQual(Relation rel, bool recurse)
 	return generate_partition_qual(rel, recurse);
 }
 
-/* Turn an array of OIDs with N elements into a list */
-#define OID_ARRAY_TO_LIST(arr, N, list) \
+/*
+ * Append OIDs of rel's partitions to the list 'partoids' and for each OID,
+ * append pointer rel to the list 'parents'.
+ */
+#define APPEND_REL_PARTITION_OIDS(rel, partoids, parents) \
 	do\
 	{\
 		int		i;\
-		for (i = 0; i < (N); i++)\
-			(list) = lappend_oid((list), (arr)[i]);\
+		for (i = 0; i < (rel)->rd_partdesc->nparts; i++)\
+		{\
+			(partoids) = lappend_oid((partoids), (rel)->rd_partdesc->oids[i]);\
+			(parents) = lappend((parents), (rel));\
+		}\
 	} while(0)
 
 /*
@@ -944,11 +950,13 @@ PartitionDispatch *
 RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 								 int *num_parted, List **leaf_part_oids)
 {
-	PartitionDesc rootpartdesc = RelationGetPartitionDesc(rel);
 	PartitionDispatchData **pd;
 	List	   *all_parts = NIL,
-			   *parted_rels;
-	ListCell   *lc;
+			   *all_parents = NIL,
+			   *parted_rels,
+			   *parted_rel_parents;
+	ListCell   *lc1,
+			   *lc2;
 	int			i,
 				k,
 				offset;
@@ -965,10 +973,13 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 	 */
 	*num_parted = 1;
 	parted_rels = list_make1(rel);
-	OID_ARRAY_TO_LIST(rootpartdesc->oids, rootpartdesc->nparts, all_parts);
-	foreach(lc, all_parts)
+	/* Root partitioned table has no parent, so NULL for parent */
+	parted_rel_parents = list_make1(NULL);
+	APPEND_REL_PARTITION_OIDS(rel, all_parts, all_parents);
+	forboth(lc1, all_parts, lc2, all_parents)
 	{
-		Relation	partrel = heap_open(lfirst_oid(lc), lockmode);
+		Relation	partrel = heap_open(lfirst_oid(lc1), lockmode);
+		Relation	parent = lfirst(lc2);
 		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
 
 		/*
@@ -979,7 +990,8 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 		{
 			(*num_parted)++;
 			parted_rels = lappend(parted_rels, partrel);
-			OID_ARRAY_TO_LIST(partdesc->oids, partdesc->nparts, all_parts);
+			parted_rel_parents = lappend(parted_rel_parents, parent);
+			APPEND_REL_PARTITION_OIDS(partrel, all_parts, all_parents);
 		}
 		else
 			heap_close(partrel, NoLock);
@@ -1004,10 +1016,12 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 										   sizeof(PartitionDispatchData *));
 	*leaf_part_oids = NIL;
 	i = k = offset = 0;
-	foreach(lc, parted_rels)
+	forboth(lc1, parted_rels, lc2, parted_rel_parents)
 	{
-		Relation	partrel = lfirst(lc);
+		Relation	partrel = lfirst(lc1);
+		Relation	parent = lfirst(lc2);
 		PartitionKey partkey = RelationGetPartitionKey(partrel);
+		TupleDesc	 tupdesc = RelationGetDescr(partrel);
 		PartitionDesc partdesc = RelationGetPartitionDesc(partrel);
 		int			j,
 					m;
@@ -1017,6 +1031,27 @@ RelationGetPartitionDispatchInfo(Relation rel, int lockmode,
 		pd[i]->key = partkey;
 		pd[i]->keystate = NIL;
 		pd[i]->partdesc = partdesc;
+		if (parent != NULL)
+		{
+			/*
+			 * For every partitioned table other than root, we must store
+			 * a tuple table slot initialized with its tuple descriptor and
+			 * a tuple conversion map to convert a tuple from its parent's
+			 * rowtype to its own. That is to make sure that we are looking
+			 * at the correct row using the correct tuple descriptor when
+			 * computing its partition key for tuple routing.
+			 */
+			pd[i]->tupslot = MakeSingleTupleTableSlot(tupdesc);
+			pd[i]->tupmap = convert_tuples_by_name(RelationGetDescr(parent),
+												   tupdesc,
+								gettext_noop("could not convert row type"));
+		}
+		else
+		{
+			/* Not required for the root partitioned table */
+			pd[i]->tupslot = NULL;
+			pd[i]->tupmap = NULL;
+		}
 		pd[i]->indexes = (int *) palloc(partdesc->nparts * sizeof(int));
 
 		/*
@@ -1610,6 +1645,8 @@ get_partition_for_tuple(PartitionDispatch *pd,
 	{
 		PartitionKey key = parent->key;
 		PartitionDesc partdesc = parent->partdesc;
+		TupleTableSlot *myslot = parent->tupslot;
+		TupleConversionMap *map = parent->tupmap;
 
 		/* Quick exit */
 		if (partdesc->nparts == 0)
@@ -1618,6 +1655,17 @@ get_partition_for_tuple(PartitionDispatch *pd,
 			return -1;
 		}
 
+		if (myslot != NULL)
+		{
+			HeapTuple	tuple = ExecFetchSlotTuple(slot);
+
+			ExecClearTuple(myslot);
+			Assert(map != NULL);
+			tuple = do_convert_tuple(tuple, map);
+			ExecStoreTuple(tuple, myslot, InvalidBuffer, true);
+			slot = myslot;
+		}
+
 		/* Extract partition key from tuple */
 		FormPartitionKeyDatum(parent, slot, estate, values, isnull);
 
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c
index d5901651db..aa25a23336 100644
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -2436,6 +2436,15 @@ CopyFrom(CopyState cstate)
 	estate->es_trig_tuple_slot = ExecInitExtraTupleSlot(estate);
 
 	/*
+	 * Initialize a dedicated slot to manipulate tuples of any given
+	 * partition's rowtype.
+	 */
+	if (cstate->partition_dispatch_info)
+		estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
+	else
+		estate->es_partition_tuple_slot = NULL;
+
+	/*
 	 * It's more efficient to prepare a bunch of tuples for insertion, and
 	 * insert them in one heap_multi_insert() call, than call heap_insert()
 	 * separately for every tuple. However, we can't do that if there are
@@ -2484,7 +2493,8 @@ CopyFrom(CopyState cstate)
 
 	for (;;)
 	{
-		TupleTableSlot *slot;
+		TupleTableSlot *slot,
+					   *oldslot = NULL;
 		bool		skip_tuple;
 		Oid			loaded_oid = InvalidOid;
 
@@ -2571,7 +2581,19 @@ CopyFrom(CopyState cstate)
 			map = cstate->partition_tupconv_maps[leaf_part_index];
 			if (map)
 			{
+				Relation	partrel = resultRelInfo->ri_RelationDesc;
+
 				tuple = do_convert_tuple(tuple, map);
+
+				/*
+				 * We must use the partition's tuple descriptor from this
+				 * point on.  Use the dedicated slot for that until we're
+				 * finished dealing with the partition.
+				 */
+				oldslot = slot;
+				slot = estate->es_partition_tuple_slot;
+				Assert(slot != NULL);
+				ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
 				ExecStoreTuple(tuple, slot, InvalidBuffer, true);
 			}
 
@@ -2667,6 +2689,10 @@ CopyFrom(CopyState cstate)
 			{
 				resultRelInfo = saved_resultRelInfo;
 				estate->es_result_relation_info = resultRelInfo;
+
+				/* Switch back to the slot corresponding to the root table */
+				Assert(oldslot != NULL);
+				slot = oldslot;
 			}
 		}
 	}
@@ -2714,13 +2740,14 @@ CopyFrom(CopyState cstate)
 		 * Remember cstate->partition_dispatch_info[0] corresponds to the root
 		 * partitioned table, which we must not try to close, because it is
 		 * the main target table of COPY that will be closed eventually by
-		 * DoCopy().
+		 * DoCopy().  Also, tupslot is NULL for the root partitioned table.
 		 */
 		for (i = 1; i < cstate->num_dispatch; i++)
 		{
 			PartitionDispatch pd = cstate->partition_dispatch_info[i];
 
 			heap_close(pd->reldesc, NoLock);
+			ExecDropSingleTupleTableSlot(pd->tupslot);
 		}
 		for (i = 0; i < cstate->num_partitions; i++)
 		{
diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c
index a9546106ce..0d85b151c2 100644
--- a/src/backend/executor/nodeModifyTable.c
+++ b/src/backend/executor/nodeModifyTable.c
@@ -262,6 +262,7 @@ ExecInsert(ModifyTableState *mtstate,
 	Relation	resultRelationDesc;
 	Oid			newId;
 	List	   *recheckIndexes = NIL;
+	TupleTableSlot *oldslot = NULL;
 
 	/*
 	 * get the heap tuple out of the tuple table slot, making sure we have a
@@ -318,7 +319,19 @@ ExecInsert(ModifyTableState *mtstate,
 		map = mtstate->mt_partition_tupconv_maps[leaf_part_index];
 		if (map)
 		{
+			Relation partrel = resultRelInfo->ri_RelationDesc;
+
 			tuple = do_convert_tuple(tuple, map);
+
+			/*
+			 * We must use the partition's tuple descriptor from this
+			 * point on, until we're finished dealing with the partition.
+			 * Use the dedicated slot for that.
+			 */
+			oldslot = slot;
+			slot = estate->es_partition_tuple_slot;
+			Assert(slot != NULL);
+			ExecSetSlotDescriptor(slot, RelationGetDescr(partrel));
 			ExecStoreTuple(tuple, slot, InvalidBuffer, true);
 		}
 	}
@@ -566,6 +579,10 @@ ExecInsert(ModifyTableState *mtstate,
 	{
 		resultRelInfo = saved_resultRelInfo;
 		estate->es_result_relation_info = resultRelInfo;
+
+		/* Switch back to the slot corresponding to the root table */
+		Assert(oldslot != NULL);
+		slot = oldslot;
 	}
 
 	/*
@@ -1734,7 +1751,15 @@ ExecInitModifyTable(ModifyTable *node, EState *estate, int eflags)
 		mtstate->mt_partitions = partitions;
 		mtstate->mt_num_partitions = num_partitions;
 		mtstate->mt_partition_tupconv_maps = partition_tupconv_maps;
+
+		/*
+		 * Initialize a dedicated slot to manipulate tuples of any given
+		 * partition's rowtype.
+		 */
+		estate->es_partition_tuple_slot = ExecInitExtraTupleSlot(estate);
 	}
+	else
+		estate->es_partition_tuple_slot = NULL;
 
 	/*
 	 * Initialize any WITH CHECK OPTION constraints if needed.
@@ -2058,12 +2083,14 @@ ExecEndModifyTable(ModifyTableState *node)
 	 * Remember node->mt_partition_dispatch_info[0] corresponds to the root
 	 * partitioned table, which we must not try to close, because it is the
 	 * main target table of the query that will be closed by ExecEndPlan().
+	 * Also, tupslot is NULL for the root partitioned table.
 	 */
 	for (i = 1; i < node->mt_num_dispatch; i++)
 	{
 		PartitionDispatch pd = node->mt_partition_dispatch_info[i];
 
 		heap_close(pd->reldesc, NoLock);
+		ExecDropSingleTupleTableSlot(pd->tupslot);
 	}
 	for (i = 0; i < node->mt_num_partitions; i++)
 	{
diff --git a/src/include/catalog/partition.h b/src/include/catalog/partition.h
index 21effbf87b..bf38df5d29 100644
--- a/src/include/catalog/partition.h
+++ b/src/include/catalog/partition.h
@@ -47,6 +47,11 @@ typedef struct PartitionDescData *PartitionDesc;
  *	key			Partition key information of the table
  *	keystate	Execution state required for expressions in the partition key
  *	partdesc	Partition descriptor of the table
+ *	tupslot		A standalone TupleTableSlot initialized with this table's tuple
+ *				descriptor
+ *	tupmap		TupleConversionMap to convert from the parent's rowtype to
+ *				this table's rowtype (when extracting the partition key of a
+ *				tuple just before routing it through this table)
  *	indexes		Array with partdesc->nparts members (for details on what
  *				individual members represent, see how they are set in
  *				RelationGetPartitionDispatchInfo())
@@ -58,6 +63,8 @@ typedef struct PartitionDispatchData
 	PartitionKey			key;
 	List				   *keystate;	/* list of ExprState */
 	PartitionDesc			partdesc;
+	TupleTableSlot		   *tupslot;
+	TupleConversionMap	   *tupmap;
 	int					   *indexes;
 } PartitionDispatchData;
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 5c3b8683f5..9073a9a1bc 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -384,6 +384,9 @@ typedef struct EState
 	TupleTableSlot *es_trig_oldtup_slot;		/* for TriggerEnabled */
 	TupleTableSlot *es_trig_newtup_slot;		/* for TriggerEnabled */
 
+	/* Slot used to manipulate a tuple after it is routed to a partition */
+	TupleTableSlot *es_partition_tuple_slot;
+
 	/* Parameter info: */
 	ParamListInfo es_param_list_info;	/* values of external params */
 	ParamExecData *es_param_exec_vals;	/* values of internal params */
diff --git a/src/test/regress/expected/insert.out b/src/test/regress/expected/insert.out
index 561cefa3c4..49f667b119 100644
--- a/src/test/regress/expected/insert.out
+++ b/src/test/regress/expected/insert.out
@@ -300,3 +300,40 @@ drop cascades to table part_null
 drop cascades to table part_ee_ff
 drop cascades to table part_ee_ff1
 drop cascades to table part_ee_ff2
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+ and (attrelid = 'p'::regclass
+   or attrelid = 'p1'::regclass
+   or attrelid = 'p11'::regclass);
+ attrelid | attname | attnum 
+----------+---------+--------
+ p        | a       |      1
+ p1       | a       |      2
+ p11      | a       |      4
+(3 rows)
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+ tableoid | a | b 
+----------+---+---
+ p11      | 1 | 2
+(1 row)
+
+-- cleanup
+drop table p cascade;
+NOTICE:  drop cascades to 2 other objects
+DETAIL:  drop cascades to table p1
+drop cascades to table p11
diff --git a/src/test/regress/sql/insert.sql b/src/test/regress/sql/insert.sql
index 846bb5897a..08dc068de8 100644
--- a/src/test/regress/sql/insert.sql
+++ b/src/test/regress/sql/insert.sql
@@ -170,3 +170,29 @@ select tableoid::regclass, * from list_parted;
 -- cleanup
 drop table range_parted cascade;
 drop table list_parted cascade;
+
+-- more tests for certain multi-level partitioning scenarios
+create table p (a int, b int) partition by range (a, b);
+create table p1 (b int, a int not null) partition by range (b);
+create table p11 (like p1);
+alter table p11 drop a;
+alter table p11 add a int;
+alter table p11 drop a;
+alter table p11 add a int not null;
+-- attnum for key attribute 'a' is different in p, p1, and p11
+select attrelid::regclass, attname, attnum
+from pg_attribute
+where attname = 'a'
+ and (attrelid = 'p'::regclass
+   or attrelid = 'p1'::regclass
+   or attrelid = 'p11'::regclass);
+
+alter table p1 attach partition p11 for values from (2) to (5);
+alter table p attach partition p1 for values from (1, 2) to (1, 10);
+
+-- check that "(1, 2)" is correctly routed to p11.
+insert into p values (1, 2);
+select tableoid::regclass, * from p;
+
+-- cleanup
+drop table p cascade;
-- 
2.11.0

