From afc1e031b1d0599e657a2a27a7afa5f4a8bc9209 Mon Sep 17 00:00:00 2001
From: Antonin Houska <ah@cybertec.at>
Date: Fri, 17 Apr 2026 15:13:17 +0200
Subject: [PATCH] Lock upgrade without deadlocks.

For REPACK (CONCURRENTLY), it's essential to upgrade ShareUpdateExclusive lock
to AccessExclusiveLock for the final stage of table processing. However, we
cannot release the earlier before requesting the latter, because if another
process ran DDL on the table in between, REPACK would have to cancel its
transaction, and waste possibly a lot of work. Without releasing
ShareUpdateExclusive lock, the upgrade can cause deadlock, and it's not really
deterministic whether REPACK or the other process will get canceled.

This patch adds a new field to the LOCK structure which tells which process is
trying to upgrade the lock - this field is set right before the upgrade. All
processes waiting in the lock queue are then woken up to check the flag.

When another process tries to get the lock after that, it checks this field,
and if it's set, it checks for deadlocks even if deadlock timeout hasn't
expired yet. If the process is already in the lock's queue and sleeping, the
lock upgrading process wakes it up so it checks the flag immediately.

At the time the upgrading process starts to sleep, all the other process
should be aware that they need to check for deadlock during their lock
acquisitions, so the upgrading process does not have to check for deadlocks
when it gets woken up. Thus it should never receive deadlock error report.
---
 src/backend/commands/repack.c   |   4 +-
 src/backend/storage/lmgr/lmgr.c |  35 ++++++++---
 src/backend/storage/lmgr/lock.c | 107 +++++++++++++++++++++++++++++++-
 src/backend/storage/lmgr/proc.c |  32 +++++++---
 src/include/storage/lmgr.h      |   1 +
 src/include/storage/lock.h      |   5 +-
 src/include/storage/proc.h      |   4 +-
 7 files changed, 166 insertions(+), 22 deletions(-)

diff --git a/src/backend/commands/repack.c b/src/backend/commands/repack.c
index 58e3867246f..310b2a65099 100644
--- a/src/backend/commands/repack.c
+++ b/src/backend/commands/repack.c
@@ -3055,8 +3055,10 @@ rebuild_relation_finish_concurrent(Relation NewHeap, Relation OldHeap,
 	/*
 	 * Acquire AccessExclusiveLock on the table, its TOAST relation (if there
 	 * is one), all its indexes, so that we can swap the files.
+	 *
+	 * TODO The same for indexes and TOAST?
 	 */
-	LockRelationOid(old_table_oid, AccessExclusiveLock);
+	LockRelationOidUpgrade(old_table_oid, AccessExclusiveLock);
 
 	/*
 	 * Lock all indexes now, not only the clustering one: all indexes need to
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 2ccf7237fee..0b103485d54 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -58,6 +58,8 @@ typedef struct XactLockTableWaitInfo
 	const ItemPointerData *ctid;
 } XactLockTableWaitInfo;
 
+static void LockRelationOidCommon(Oid relid, LOCKMODE lockmode,
+								  bool isUpgrade);
 static void XactLockTableWaitErrorCb(void *arg);
 
 /*
@@ -105,6 +107,21 @@ SetLocktagRelationOid(LOCKTAG *tag, Oid relid)
  */
 void
 LockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+	LockRelationOidCommon(relid, lockmode, false);
+}
+
+/*
+ * Like above, but upgrade an existing lock.
+ */
+void
+LockRelationOidUpgrade(Oid relid, LOCKMODE lockmode)
+{
+	LockRelationOidCommon(relid, lockmode, true);
+}
+
+static void
+LockRelationOidCommon(Oid relid, LOCKMODE lockmode, bool isUpgrade)
 {
 	LOCKTAG		tag;
 	LOCALLOCK  *locallock;
@@ -113,7 +130,7 @@ LockRelationOid(Oid relid, LOCKMODE lockmode)
 	SetLocktagRelationOid(&tag, relid);
 
 	res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock,
-							  false);
+							  false, isUpgrade);
 
 	/*
 	 * Now that we have the lock, check for invalidation messages, so that we
@@ -157,7 +174,7 @@ ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
 	SetLocktagRelationOid(&tag, relid);
 
 	res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock,
-							  false);
+							  false, false);
 
 	if (res == LOCKACQUIRE_NOT_AVAIL)
 		return false;
@@ -191,7 +208,7 @@ LockRelationId(LockRelId *relid, LOCKMODE lockmode)
 	SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
 
 	res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock,
-							  false);
+							  false, false);
 
 	/*
 	 * Now that we have the lock, check for invalidation messages; see notes
@@ -254,7 +271,7 @@ LockRelation(Relation relation, LOCKMODE lockmode)
 						 relation->rd_lockInfo.lockRelId.relId);
 
 	res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock,
-							  false);
+							  false, false);
 
 	/*
 	 * Now that we have the lock, check for invalidation messages; see notes
@@ -286,7 +303,7 @@ ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
 						 relation->rd_lockInfo.lockRelId.relId);
 
 	res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock,
-							  false);
+							  false, false);
 
 	if (res == LOCKACQUIRE_NOT_AVAIL)
 		return false;
@@ -591,7 +608,7 @@ ConditionalLockTuple(Relation relation, const ItemPointerData *tid, LOCKMODE loc
 					  ItemPointerGetOffsetNumber(tid));
 
 	return (LockAcquireExtended(&tag, lockmode, false, true, true, NULL,
-								logLockFailure) != LOCKACQUIRE_NOT_AVAIL);
+								logLockFailure, false) != LOCKACQUIRE_NOT_AVAIL);
 }
 
 /*
@@ -749,7 +766,7 @@ ConditionalXactLockTableWait(TransactionId xid, bool logLockFailure)
 		SET_LOCKTAG_TRANSACTION(tag, xid);
 
 		if (LockAcquireExtended(&tag, ShareLock, false, true, true, NULL,
-								logLockFailure)
+								logLockFailure, false)
 			== LOCKACQUIRE_NOT_AVAIL)
 			return false;
 
@@ -1043,7 +1060,7 @@ ConditionalLockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
 					   objsubid);
 
 	res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock,
-							  false);
+							  false, false);
 
 	if (res == LOCKACQUIRE_NOT_AVAIL)
 		return false;
@@ -1123,7 +1140,7 @@ ConditionalLockSharedObject(Oid classid, Oid objid, uint16 objsubid,
 					   objsubid);
 
 	res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock,
-							  false);
+							  false, false);
 
 	if (res == LOCKACQUIRE_NOT_AVAIL)
 		return false;
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index c221fe96889..bcbbf36c091 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -810,7 +810,7 @@ LockAcquire(const LOCKTAG *locktag,
 			bool dontWait)
 {
 	return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait,
-							   true, NULL, false);
+							   true, NULL, false, false);
 }
 
 /*
@@ -829,6 +829,9 @@ LockAcquire(const LOCKTAG *locktag,
  *
  * logLockFailure indicates whether to log details when a lock acquisition
  * fails with dontWait = true.
+ *
+ * isUpgrade should be true if the backend already holds this lock in a lower
+ * mode.
  */
 LockAcquireResult
 LockAcquireExtended(const LOCKTAG *locktag,
@@ -837,7 +840,8 @@ LockAcquireExtended(const LOCKTAG *locktag,
 					bool dontWait,
 					bool reportMemoryError,
 					LOCALLOCK **locallockp,
-					bool logLockFailure)
+					bool logLockFailure,
+					bool isUpgrade)
 {
 	LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
 	LockMethod	lockMethodTable;
@@ -1119,7 +1123,8 @@ LockAcquireExtended(const LOCKTAG *locktag,
 		 * case, because JoinWaitQueue() may discover that we can acquire the
 		 * lock immediately after all.
 		 */
-		waitResult = JoinWaitQueue(locallock, lockMethodTable, dontWait);
+		waitResult = JoinWaitQueue(locallock, lockMethodTable, dontWait,
+								   isUpgrade);
 	}
 
 	if (waitResult == PROC_WAIT_STATUS_ERROR)
@@ -1225,8 +1230,85 @@ LockAcquireExtended(const LOCKTAG *locktag,
 		Assert(!dontWait);
 		PROCLOCK_PRINT("LockAcquire: sleeping on lock", proclock);
 		LOCK_PRINT("LockAcquire: sleeping on lock", lock, lockmode);
+
+		/*
+		 * Lock upgrade can introduce deadlock, therefore enforce special
+		 * behavior of other processes that deal with this lock. In
+		 * particular, any waiter that sees upgradedBy set is expected to
+		 * perform deadlock check as soon as it's woken up. (Since we are
+		 * already in the queue, the deadlock detector has all the information
+		 * it needs.)
+		 */
+		if (isUpgrade)
+		{
+			dlist_iter	iter;
+
+			/*
+			 * There should not be multiple upgrades at the same time. XXX
+			 * ERROR ?
+			 */
+			Assert(lock->upgradedBy == NULL);
+
+			lock->upgradedBy = MyProc;
+
+			/*
+			 * Now, before I sleep myself, wake up all the existing waiters
+			 * (except for me), so they check for deadlock.
+			 */
+			dclist_foreach(iter, &lock->waitProcs)
+			{
+				PGPROC	   *proc = dlist_container(PGPROC, waitLink,
+												   iter.cur);
+
+				if (proc != MyProc)
+					SetLatch(&proc->procLatch);
+			}
+		}
 		LWLockRelease(partitionLock);
 
+		if (!isUpgrade)
+		{
+			HASH_SEQ_STATUS status;
+			LOCALLOCK	*llock;
+			bool	check_deadlock = false;
+
+			/*
+			 * Likewise, new waiters should check the isUpgrade flag and
+			 * engage deadlock detector if needed. The lock being acquired
+			 * should already be in the hash.
+			 *
+			 * Note that, to make deadlock detection happen as soon as
+			 * possible (i.e. regardless deadlock timeout), we check the other
+			 * locks too. XXX There might be ways to get into the deadlock
+			 * indirectly, but that needs more analysis. As long as the
+			 * upgrading process skips deadlock detection altogether, the
+			 * worst consequence of missing some lock wait cycle here is that
+			 * the other process will be kicked-off no sooner than after its
+			 * deadlock timeout has elapsed. (Which in turn means that the
+			 * lock upgrade will take longer than expected.)
+			 */
+			hash_seq_init(&status, LockMethodLocalHash);
+			while ((llock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+			{
+				LOCK	*mylock = llock->lock;
+
+				if (mylock && mylock->upgradedBy)
+				{
+					check_deadlock = true;
+					break;
+				}
+			}
+			/* Check for deadlock if needed. */
+			if (check_deadlock)
+			{
+				DeadLockState	deadlock_state;
+
+				deadlock_state = CheckDeadLock();
+				if (deadlock_state == DS_HARD_DEADLOCK)
+					DeadLockReport();
+			}
+		}
+
 		waitResult = WaitOnLock(locallock, owner);
 
 		/*
@@ -1245,6 +1327,18 @@ LockAcquireExtended(const LOCKTAG *locktag,
 			DeadLockReport();
 			/* DeadLockReport() will not return */
 		}
+
+		/*
+		 * If finishing the lock upgrade, we should not get into a deadlock
+		 * anymore, so let others know that they do not have to care either.
+		 */
+		if (isUpgrade)
+		{
+			LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+			Assert(lock->upgradedBy != NULL);
+			lock->upgradedBy = NULL;
+			LWLockRelease(partitionLock);
+		}
 	}
 	else
 		LWLockRelease(partitionLock);
@@ -1322,6 +1416,7 @@ SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
 		lock->nGranted = 0;
 		MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
 		MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+		lock->upgradedBy = NULL;
 		LOCK_PRINT("LockAcquire: new", lock, lockmode);
 	}
 	else
@@ -1768,6 +1863,12 @@ CleanUpLock(LOCK *lock, PROCLOCK *proclock,
 			elog(PANIC, "proclock table corrupted");
 	}
 
+	/*
+	 * Was this backend upgrading the lock?
+	 */
+	if (lock->upgradedBy == MyProc)
+		lock->upgradedBy = NULL;
+
 	if (lock->nRequested == 0)
 	{
 		/*
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 1ac25068d62..7b31370db70 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -95,7 +95,6 @@ static volatile sig_atomic_t got_deadlock_timeout;
 static void RemoveProcFromArray(int code, Datum arg);
 static void ProcKill(int code, Datum arg);
 static void AuxiliaryProcKill(int code, Datum arg);
-static DeadLockState CheckDeadLock(void);
 
 
 /*
@@ -1143,7 +1142,8 @@ AuxiliaryPidGetProc(int pid)
  * NOTES: The process queue is now a priority queue for locking.
  */
 ProcWaitStatus
-JoinWaitQueue(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait)
+JoinWaitQueue(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait,
+			  bool isUpgrade)
 {
 	LOCKMODE	lockmode = locallock->tag.mode;
 	LOCK	   *lock = locallock->lock;
@@ -1230,8 +1230,12 @@ JoinWaitQueue(LOCALLOCK *locallock, LockMethod lockMethodTable, bool dontWait)
 			/* Must he wait for me? */
 			if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
 			{
-				/* Must I wait for him ? */
-				if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
+				/*
+				 * Must I wait for him? I don't want a deadlock during lock
+				 * upgrade - other processes should fail on it.
+				 */
+				if ((lockMethodTable->conflictTab[lockmode] & proc->heldLocks) &&
+					!isUpgrade)
 				{
 					/*
 					 * Yes, so we have a deadlock.  Easiest way to clean up
@@ -1461,8 +1465,22 @@ ProcSleep(LOCALLOCK *locallock)
 			(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
 							 PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
 			ResetLatch(MyLatch);
-			/* check for deadlocks first, as that's probably log-worthy */
-			if (got_deadlock_timeout)
+			/*
+			 * Check for deadlocks first, as that's probably log-worthy.
+			 *
+			 * Do not wait for the timeout if the lock is being upgraded since
+			 * the risk of deadlock is higher now. However, do not check for
+			 * deadlock if the lock is being upgraded by this process - other
+			 * processes should take care.
+			 *
+			 * TODO Possible optimization: if this is the only lock of the
+			 * backend and if it did not have any weaker lock on the table so
+			 * far, it should be safe to skip the deadlock check. However, to
+			 * evaluate the situation, we need to take fast-path locks into
+			 * account.
+			 */
+			if ((got_deadlock_timeout || lock->upgradedBy) &&
+				lock->upgradedBy != MyProc)
 			{
 				deadlock_state = CheckDeadLock();
 				got_deadlock_timeout = false;
@@ -1819,7 +1837,7 @@ ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
  * not, just return.  If we have a real deadlock, remove ourselves from the
  * lock's wait queue.
  */
-static DeadLockState
+DeadLockState
 CheckDeadLock(void)
 {
 	int			i;
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
index 2a985ce5e15..eddbe1e8622 100644
--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -38,6 +38,7 @@ extern void RelationInitLockInfo(Relation relation);
 
 /* Lock a relation */
 extern void LockRelationOid(Oid relid, LOCKMODE lockmode);
+extern void LockRelationOidUpgrade(Oid relid, LOCKMODE lockmode);
 extern void LockRelationId(LockRelId *relid, LOCKMODE lockmode);
 extern bool ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode);
 extern void UnlockRelationId(LockRelId *relid, LOCKMODE lockmode);
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index ee3cb1dc203..080b69eddfd 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -131,6 +131,7 @@ typedef const LockMethodData *LockMethod;
  * nRequested -- total requested locks of all types.
  * granted -- count of each lock type currently granted on the lock.
  * nGranted -- total granted locks of all types.
+ * upgrader -- process that intends to upgrade the lock
  *
  * Note: these counts count 1 for each backend.  Internally to a backend,
  * there may be multiple grabs on a particular lock, but this is not reflected
@@ -150,6 +151,7 @@ typedef struct LOCK
 	int			nRequested;		/* total of requested[] array */
 	int			granted[MAX_LOCKMODES]; /* counts of granted locks */
 	int			nGranted;		/* total of granted[] array */
+	PGPROC		*upgradedBy;	/* is lock being upgraded by this process? */
 } LOCK;
 
 #define LOCK_LOCKMETHOD(lock) ((LOCKMETHODID) (lock).tag.locktag_lockmethodid)
@@ -390,7 +392,8 @@ extern LockAcquireResult LockAcquireExtended(const LOCKTAG *locktag,
 											 bool dontWait,
 											 bool reportMemoryError,
 											 LOCALLOCK **locallockp,
-											 bool logLockFailure);
+											 bool logLockFailure,
+											 bool isUpgrade);
 extern void AbortStrongLockAcquire(void);
 extern void MarkLockClear(LOCALLOCK *locallock);
 extern bool LockRelease(const LOCKTAG *locktag,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 3e1d1fad5f9..c4bc3a24044 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -563,10 +563,12 @@ extern bool HaveNFreeProcs(int n, int *nfree);
 extern void ProcReleaseLocks(bool isCommit);
 
 extern ProcWaitStatus JoinWaitQueue(LOCALLOCK *locallock,
-									LockMethod lockMethodTable, bool dontWait);
+									LockMethod lockMethodTable, bool dontWait,
+									bool isUpgrade);
 extern ProcWaitStatus ProcSleep(LOCALLOCK *locallock);
 extern void ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus);
 extern void ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock);
+extern DeadLockState CheckDeadLock(void);
 extern void CheckDeadLockAlert(void);
 extern void LockErrorCleanup(void);
 extern void GetLockHoldersAndWaiters(LOCALLOCK *locallock,
-- 
2.47.3