From dae436ec2b72c01c0ee0f4d029281c2413768381 Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Mon, 12 Aug 2024 13:37:29 -0400
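Subject: [PATCH v2] Fix creation of partition descriptor during concurrent
 detach+drop

When a partition is detached concurrently and then also dropped, the
direct scan of pg_class for that partition's tuple returns no tuple.
The code passed the scan result to heap_getattr() without checking
that it was valid.  Check HeapTupleIsValid() first, and treat a missing
tuple the same way as a null relpartbound, so that the existing retry
logic applies and the dropped partition is no longer seen on the next
pass.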

Author: Kuntal Ghosh <kuntalghosh.2007@gmail.com>
Reviewed-by: Junwang Zhao <zhjwpku@gmail.com>
Reviewed-by: Tender Wang <tndrwang@gmail.com>
Discussion: https://postgr.es/m/18559-b48286d2eacd9a4e@postgresql.org
---
 src/backend/partitioning/partdesc.c | 30 +++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/backend/partitioning/partdesc.c b/src/backend/partitioning/partdesc.c
index c661a303bf..b4e0ed0e71 100644
--- a/src/backend/partitioning/partdesc.c
+++ b/src/backend/partitioning/partdesc.c
@@ -209,6 +209,10 @@ retry:
 		 * shared queue.  We solve this problem by reading pg_class directly
 		 * for the desired tuple.
 		 *
+		 * If the recently detached partition has also been dropped, the scan
+		 * returns no tuple.  In that case we retry as well, and the next time
+		 * through here we no longer see that partition.
+		 *
 		 * The other problem is that DETACH CONCURRENTLY is in the process of
 		 * removing a partition, which happens in two steps: first it marks it
 		 * as "detach pending", commits, then unsets relpartbound.  If
@@ -223,8 +227,6 @@ retry:
 			Relation	pg_class;
 			SysScanDesc scan;
 			ScanKeyData key[1];
-			Datum		datum;
-			bool		isnull;
 
 			pg_class = table_open(RelationRelationId, AccessShareLock);
 			ScanKeyInit(&key[0],
@@ -233,17 +235,29 @@ retry:
 						ObjectIdGetDatum(inhrelid));
 			scan = systable_beginscan(pg_class, ClassOidIndexId, true,
 									  NULL, 1, key);
+
+			/*
+			 * We could get one tuple from the scan (the normal case), or zero
+			 * tuples if the table has been dropped meanwhile.
+			 */
 			tuple = systable_getnext(scan);
-			datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
-								 RelationGetDescr(pg_class), &isnull);
-			if (!isnull)
-				boundspec = stringToNode(TextDatumGetCString(datum));
+			if (HeapTupleIsValid(tuple))
+			{
+				Datum		datum;
+				bool		isnull;
+
+				datum = heap_getattr(tuple, Anum_pg_class_relpartbound,
+									 RelationGetDescr(pg_class), &isnull);
+				if (!isnull)
+					boundspec = stringToNode(TextDatumGetCString(datum));
+			}
 			systable_endscan(scan);
 			table_close(pg_class, AccessShareLock);
 
 			/*
-			 * If we still don't get a relpartbound value, then it must be
-			 * because of DETACH CONCURRENTLY.  Restart from the top, as
+			 * If we still don't get a relpartbound value (either because
+			 * boundspec is null or because there was no tuple), then it must
+			 * be because of DETACH CONCURRENTLY.  Restart from the top, as
 			 * explained above.  We only do this once, for two reasons: first,
 			 * only one DETACH CONCURRENTLY session could affect us at a time,
 			 * since each of them would have to wait for the snapshot under
-- 
2.39.2

