From 9d27383ea62e14f914024ce3121e3c78f53e1539 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Herrera?= <alvherre@kurilemu.de>
Date: Wed, 25 Mar 2026 20:35:46 +0100
Subject: [PATCH v45 7/8] Error out any process that would block at REPACK

Any process that blocks waiting for a lock held by REPACK (CONCURRENTLY)
would later deadlock with it when REPACK tries to upgrade its lock to
AccessExclusiveLock, losing all the work done to that point.  We avoid
this by teaching the deadlock detector to raise an error in the waiting
process when this condition is detected.
---
 src/backend/commands/repack.c                 | 53 ++++++++----
 src/backend/storage/lmgr/deadlock.c           | 15 ++++
 src/include/storage/proc.h                    |  6 +-
 src/test/modules/injection_points/Makefile    |  1 +
 .../expected/repack_deadlock.out              | 63 ++++++++++++++
 src/test/modules/injection_points/meson.build |  1 +
 .../specs/repack_deadlock.spec                | 83 +++++++++++++++++++
 7 files changed, 202 insertions(+), 20 deletions(-)
 create mode 100644 src/test/modules/injection_points/expected/repack_deadlock.out
 create mode 100644 src/test/modules/injection_points/specs/repack_deadlock.spec

diff --git a/src/backend/commands/repack.c b/src/backend/commands/repack.c
index d25ab43da7b..7291b4180ab 100644
--- a/src/backend/commands/repack.c
+++ b/src/backend/commands/repack.c
@@ -276,6 +276,16 @@ ExecRepack(ParseState *pstate, RepackStmt *stmt, bool isTopLevel)
 	/* Determine the lock mode to use. */
 	lockmode = RepackLockLevel((params.options & CLUOPT_CONCURRENT) != 0);
 
+	/*
+	 * If in concurrent mode, set the PROC_IN_CONCURRENT_REPACK flag.  This
+	 * makes the deadlock checker cause anyone that would conflict with us
+	 * to error out.  It's important to set this flag ahead of actually locking
+	 * the relation; it won't of course affect anyone until we do have a lock
+	 * that others can conflict with.
+	 */
+	if ((params.options & CLUOPT_CONCURRENT) != 0)
+		MyProc->statusFlags |= PROC_IN_CONCURRENT_REPACK;
+
 	/*
 	 * If a single relation is specified, process it and we're done ... unless
 	 * the relation is a partitioned table, in which case we fall through.
@@ -476,11 +486,8 @@ RepackLockLevel(bool concurrent)
  * If indexOid is InvalidOid, the table will be rewritten in physical order
  * instead of index order.
  *
- * Note that, in the concurrent case, the function releases the lock at some
- * point, in order to get AccessExclusiveLock for the final steps (i.e. to
- * swap the relation files). To make things simpler, the caller should expect
- * OldHeap to be closed on return, regardless CLUOPT_CONCURRENT. (The
- * AccessExclusiveLock is kept till the end of the transaction.)
+ * On return, OldHeap is closed but locked with AccessExclusiveLock - the lock
+ * will be released at end of the transaction.
  *
  * 'cmd' indicates which command is being executed, to be used for error
  * messages.
@@ -512,10 +519,12 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid,
 		/*
 		 * Make sure we're not in a transaction block.
 		 *
-		 * The reason is that repack_setup_logical_decoding() could deadlock
-		 * if there's an XID already assigned.  It would be possible to run in
-		 * a transaction block if we had no XID, but this restriction is
-		 * simpler for users to understand and we don't lose anything.
+		 * The reason is that repack_setup_logical_decoding() could wait
+		 * indefinitely for our XID to complete. (The deadlock detector would
+		 * not recognize it because we'd be waiting for ourselves, i.e. no
+		 * real lock conflict.) It would be possible to run in a transaction
+		 * block if we had no XID, but this restriction is simpler for users
+		 * to understand and we don't lose anything.
 		 */
 		PreventInTransactionBlock(isTopLevel, "REPACK (CONCURRENTLY)");
 
@@ -998,10 +1007,8 @@ rebuild_relation(Relation OldHeap, Relation index, bool verbose,
 		 * Note that the worker has to wait for all transactions with XID
 		 * already assigned to finish. If some of those transactions is
 		 * waiting for a lock conflicting with ShareUpdateExclusiveLock on our
-		 * table (e.g.  it runs CREATE INDEX), we can end up in a deadlock.
-		 * Not sure this risk is worth unlocking/locking the table (and its
-		 * clustering index) and checking again if it's still eligible for
-		 * REPACK CONCURRENTLY.
+		 * table (e.g. it runs CREATE INDEX), it should encounter ERROR in the
+		 * deadlock checking code.
 		 */
 		start_repack_decoding_worker(tableOid);
 
@@ -3119,10 +3126,18 @@ rebuild_relation_finish_concurrent(Relation NewHeap, Relation OldHeap,
 		LockRelationOid(OldHeap->rd_rel->reltoastrelid, AccessExclusiveLock);
 
 	/*
-	 * Tuples and pages of the old heap will be gone, but the heap will stay.
+	 * Now that we have all access-exclusive locks on all relations, we no
+	 * longer want other processes to error out when trying to acquire a
+	 * conflicting lock.  Therefore, unset our flag.
+	 */
+	MyProc->statusFlags &= ~PROC_IN_CONCURRENT_REPACK;
+
+	/*
+	 * Tuples and pages of the old heap will be gone, but the heap itself will
+	 * stay.  In order for predicate locks to continue to work, convert them
+	 * to relation-level locks.  We do this both for table and indexes.
 	 */
 	TransferPredicateLocksToHeapRelation(OldHeap);
-	/* The same for indexes. */
 	for (int i = 0; i < nind; i++)
 	{
 		Relation	index = ind_refs[i];
@@ -3342,9 +3357,11 @@ start_repack_decoding_worker(Oid relid)
 
 	/*
 	 * The decoding setup must be done before the caller can have XID assigned
-	 * for any reason, otherwise the worker might end up in a deadlock,
-	 * waiting for the caller's transaction to end. Therefore wait here until
-	 * the worker indicates that it has the logical decoding initialized.
+	 * for any reason, otherwise the worker might end up waiting for the
+	 * caller's transaction to end. (Deadlock detector does not consider this
+	 * a conflict because the worker is in the same locking group as the
+	 * backend that launched it.) Therefore wait here until the worker
+	 * indicates that it has the logical decoding initialized.
 	 */
 	ConditionVariablePrepareToSleep(&shared->cv);
 	for (;;)
diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c
index b8962d875b6..3d3ec0da111 100644
--- a/src/backend/storage/lmgr/deadlock.c
+++ b/src/backend/storage/lmgr/deadlock.c
@@ -620,6 +620,21 @@ FindLockCycleRecurseMember(PGPROC *checkProc,
 						proc->statusFlags & PROC_IS_AUTOVACUUM)
 						blocking_autovacuum_proc = proc;
 
+					/*
+					 * Similarly, if we note that we're blocked by some process
+					 * running REPACK (CONCURRENTLY), just fail.  That process
+					 * is going to upgrade its lock at some point, and it would
+					 * be inappropriate for any other process to cause that
+					 * to fail.
+					 */
+					if (checkProc == MyProc &&
+						proc->statusFlags & PROC_IN_CONCURRENT_REPACK)
+						ereport(ERROR,
+								errcode(ERRCODE_OBJECT_IN_USE),
+								errmsg("could not wait for concurrent REPACK"),
+								errdetail("Process %d waits for REPACK running on process %d.",
+									   MyProc->pid, proc->pid));
+
 					/* We're done looking at this proclock */
 					break;
 				}
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 1dad125706e..8ad9718f3d6 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -69,10 +69,12 @@ struct XidCache
 #define		PROC_AFFECTS_ALL_HORIZONS	0x20	/* this proc's xmin must be
 												 * included in vacuum horizons
 												 * in all databases */
+#define		PROC_IN_CONCURRENT_REPACK	0x40	/* REPACK (CONCURRENTLY) */
 
-/* flags reset at EOXact */
+/* flags reset at EOXact (despite the mask's name, not all are vacuum-related) */
 #define		PROC_VACUUM_STATE_MASK \
-	(PROC_IN_VACUUM | PROC_IN_SAFE_IC | PROC_VACUUM_FOR_WRAPAROUND)
+	(PROC_IN_VACUUM | PROC_IN_SAFE_IC | PROC_VACUUM_FOR_WRAPAROUND | \
+	 PROC_IN_CONCURRENT_REPACK)
 
 /*
  * Xmin-related flags. Make sure any flags that affect how the process' Xmin
diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile
index 2cd7d87c533..f7663859fe2 100644
--- a/src/test/modules/injection_points/Makefile
+++ b/src/test/modules/injection_points/Makefile
@@ -15,6 +15,7 @@ REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 ISOLATION = basic \
 	    inplace \
 	    repack \
+	    repack_deadlock \
 	    repack_toast \
 	    syscache-update-pruned \
 	    heap_lock_update
diff --git a/src/test/modules/injection_points/expected/repack_deadlock.out b/src/test/modules/injection_points/expected/repack_deadlock.out
new file mode 100644
index 00000000000..a86e4767536
--- /dev/null
+++ b/src/test/modules/injection_points/expected/repack_deadlock.out
@@ -0,0 +1,63 @@
+Parsed test spec with 2 sessions
+
+starting permutation: wait_before_lock add_column wakeup_before_lock check1
+injection_points_attach
+-----------------------
+                       
+(1 row)
+
+step wait_before_lock: 
+	REPACK (CONCURRENTLY) repack_deadlock USING INDEX repack_deadlock_pkey;
+ <waiting ...>
+step add_column: 
+	alter table repack_deadlock add column noise text;
+ <waiting ...>
+step add_column: <... completed>
+ERROR:  could not wait for concurrent REPACK
+step wakeup_before_lock: 
+	SELECT injection_points_wakeup('repack-concurrently-before-lock');
+
+injection_points_wakeup
+-----------------------
+                       
+(1 row)
+
+step wait_before_lock: <... completed>
+step check1: 
+	INSERT INTO relfilenodes(node)
+	SELECT relfilenode FROM pg_class WHERE relname='repack_deadlock';
+
+	SELECT count(DISTINCT node) FROM relfilenodes;
+
+	SELECT i, j FROM repack_deadlock ORDER BY i, j;
+
+	INSERT INTO data_s1(i, j)
+	SELECT i, j FROM repack_deadlock;
+
+	SELECT count(*)
+	FROM data_s1 d1 FULL JOIN data_s2 d2 USING (i, j)
+	WHERE d1.i ISNULL OR d2.i ISNULL;
+
+count
+-----
+    1
+(1 row)
+
+i|j
+-+-
+1|1
+2|2
+3|3
+4|4
+(4 rows)
+
+count
+-----
+    4
+(1 row)
+
+injection_points_detach
+-----------------------
+                       
+(1 row)
+
diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build
index a414abb924b..1cd88d6db65 100644
--- a/src/test/modules/injection_points/meson.build
+++ b/src/test/modules/injection_points/meson.build
@@ -46,6 +46,7 @@ tests += {
       'basic',
       'inplace',
       'repack',
+      'repack_deadlock',
       'repack_toast',
       'syscache-update-pruned',
       'heap_lock_update',
diff --git a/src/test/modules/injection_points/specs/repack_deadlock.spec b/src/test/modules/injection_points/specs/repack_deadlock.spec
new file mode 100644
index 00000000000..9d23a6588c2
--- /dev/null
+++ b/src/test/modules/injection_points/specs/repack_deadlock.spec
@@ -0,0 +1,83 @@
+# Test REPACK with a concurrent transaction that would cause a deadlock
+setup
+{
+	CREATE EXTENSION injection_points;
+
+	CREATE TABLE repack_deadlock(i int PRIMARY KEY, j int);
+	INSERT INTO repack_deadlock(i, j) VALUES (1, 1), (2, 2), (3, 3), (4, 4);
+
+	CREATE TABLE relfilenodes(node oid);
+
+	CREATE TABLE data_s1(i int, j int);
+	CREATE TABLE data_s2(i int, j int);
+}
+
+teardown
+{
+	DROP TABLE repack_deadlock;
+	DROP EXTENSION injection_points;
+
+	DROP TABLE relfilenodes;
+	DROP TABLE data_s1;
+	DROP TABLE data_s2;
+}
+
+session s1
+setup
+{
+	SELECT injection_points_set_local();
+	SELECT injection_points_attach('repack-concurrently-before-lock', 'wait');
+}
+# Start REPACK (CONCURRENTLY) and pause at the injection point before locking.
+step wait_before_lock
+{
+	REPACK (CONCURRENTLY) repack_deadlock USING INDEX repack_deadlock_pkey;
+}
+# Check the table from the perspective of s1.
+#
+# Besides the contents, we also check that relfilenode has changed.
+
+# Write the table contents into data_s1 and compare against data_s2 with a
+# FULL JOIN.  (data_s2 stays empty in this spec, so the count covers data_s1.)
+step check1
+{
+	INSERT INTO relfilenodes(node)
+	SELECT relfilenode FROM pg_class WHERE relname='repack_deadlock';
+
+	SELECT count(DISTINCT node) FROM relfilenodes;
+
+	SELECT i, j FROM repack_deadlock ORDER BY i, j;
+
+	INSERT INTO data_s1(i, j)
+	SELECT i, j FROM repack_deadlock;
+
+	SELECT count(*)
+	FROM data_s1 d1 FULL JOIN data_s2 d2 USING (i, j)
+	WHERE d1.i ISNULL OR d2.i ISNULL;
+}
+teardown
+{
+	SELECT injection_points_detach('repack-concurrently-before-lock');
+}
+
+session s2
+# Run a DDL command whose lock request conflicts with the lock held by the
+# REPACK (CONCURRENTLY) running in s1.  The deadlock detector should make it
+# fail with an ERROR instead of letting it wait and deadlock later.
+step add_column
+{
+	alter table repack_deadlock add column noise text;
+}
+
+step wakeup_before_lock
+{
+	SELECT injection_points_wakeup('repack-concurrently-before-lock');
+}
+
+# Test that a transaction blocked by REPACK (CONCURRENTLY) errors out rather
+# than deadlocking when REPACK later upgrades its lock.
+permutation
+	wait_before_lock
+	add_column
+	wakeup_before_lock
+	check1
-- 
2.47.3

