From fc0b3b6df48d7c5d69edf891cd84f8c4848a0e65 Mon Sep 17 00:00:00 2001
From: Vitaly Davydov <v.davydov@postgrespro.ru>
Date: Wed, 20 May 2026 11:48:34 +0300
Subject: [PATCH] Fix deadlock detector activation in a recovery conflict

When the startup process in a deadlock with a backend, it sends the
signal to the backend to trigger the deadlock detector when
the deadlock timeout is elapsed (deadlock_timeout guc). Due to some
optimization in timeout.c, when spontaneous SIGALRM signals are
possible, which doesn't relate to any enabled timeout, the function
ResolveRecoveryConflictWithBufferPin can never send the signal to the
conflicting backend, becase the deadlock timeout will never be
triggered.

The patch fixes ResolveRecoveryConflictWithBufferPin by ignoring
spontaneous SIGALRM signals, that are possible in the current
implementation of timeout.c functionality.
---
 src/backend/storage/buffer/bufmgr.c           |  25 +++-
 src/backend/storage/ipc/standby.c             |  67 +++++----
 src/include/storage/bufmgr.h                  |   1 +
 src/include/storage/standby.h                 |   2 +-
 .../t/053_startup_backend_deadlock.pl         | 127 ++++++++++++++++++
 5 files changed, 194 insertions(+), 28 deletions(-)
 create mode 100644 src/test/recovery/t/053_startup_backend_deadlock.pl

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cc398db124d..ce5871fbd9b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -4749,6 +4749,29 @@ BufferGetLSNAtomic(Buffer buffer)
 #endif
 }
 
+/*
+ * BufferGetRefCount
+ *		Return the current reference counter.
+ */
+uint32
+BufferGetRefCount(Buffer buffer)
+{
+	BufferDesc *bufHdr;
+	uint64		buf_state;
+	uint32		buf_refcount;
+
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	bufHdr = GetBufferDescriptor(buffer - 1);
+
+	buf_state = LockBufHdr(bufHdr);
+	buf_refcount = BUF_STATE_GET_REFCOUNT(buf_state);
+	UnlockBufHdr(bufHdr);
+
+	return buf_refcount;
+}
+
 /* ---------------------------------------------------------------------
  *		DropRelationBuffers
  *
@@ -6789,7 +6812,7 @@ LockBufferForCleanup(Buffer buffer)
 			/* Publish the bufid that Startup process waits on */
 			SetStartupBufferPinWaitBufId(buffer - 1);
 			/* Set alarm and then wait to be signaled by UnpinBuffer() */
-			ResolveRecoveryConflictWithBufferPin();
+			ResolveRecoveryConflictWithBufferPin(buffer);
 			/* Reset the published bufid */
 			SetStartupBufferPinWaitBufId(-1);
 		}
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index de9092fdf5b..00399eafee3 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -790,13 +790,17 @@ cleanup:
  * Deadlocks are extremely rare, and relatively expensive to check for,
  * so we don't do a deadlock check right away ... only if we have had to wait
  * at least deadlock_timeout.
+ *
+ * The precondition: the current process should be the waiter process.
  */
 void
-ResolveRecoveryConflictWithBufferPin(void)
+ResolveRecoveryConflictWithBufferPin(Buffer buffer)
 {
 	TimestampTz ltime;
 
 	Assert(InHotStandby);
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
 
 	ltime = GetStandbyLimitTime();
 
@@ -833,35 +837,46 @@ ResolveRecoveryConflictWithBufferPin(void)
 		enable_timeouts(timeouts, cnt);
 	}
 
-	/*
-	 * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
-	 * by one of the timeouts established above.
-	 *
-	 * We assume that only UnpinBuffer() and the timeout requests established
-	 * above can wake us up here. WakeupRecovery() called by walreceiver or
-	 * SIGHUP signal handler, etc cannot do that because it uses the different
-	 * latch from that ProcWaitForSignal() waits on.
-	 */
-	ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
-
-	if (got_standby_delay_timeout)
-		SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
-	else if (got_standby_deadlock_timeout)
+	for (;;)
 	{
 		/*
-		 * Send out a request for hot-standby backends to check themselves for
-		 * deadlocks.
+		 * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
+		 * by one of the timeouts established above.
 		 *
-		 * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
-		 * to be signaled by UnpinBuffer() again and send a request for
-		 * deadlocks check if deadlock_timeout happens. This causes the
-		 * request to continue to be sent every deadlock_timeout until the
-		 * buffer is unpinned or ltime is reached. This would increase the
-		 * workload in the startup process and backends. In practice it may
-		 * not be so harmful because the period that the buffer is kept pinned
-		 * is basically no so long. But we should fix this?
+		 * ProcWaitForSignal() can also wake up for unrelated reasons, so recheck
+		 * that the buffer is pinned by the current waiter process only (reference
+		 * counter should be 1). Continue waiting, if no registered timeout is
+		 * fired or the buffer is still pinned by other processes as well.
 		 */
-		SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
+		ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP);
+
+		/*
+		 * Once the reference count is less or equal to 1 and we are the waiter
+		 * process, no one uses the buffer at the moment. There is a chance to
+		 * lock the buffer exclusively.
+		 */
+		if (BufferGetRefCount(buffer) <= 1)
+			break;
+
+		if (got_standby_delay_timeout)
+			SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN);
+		else if (got_standby_deadlock_timeout)
+		{
+			/*
+			* Send out a request for hot-standby backends to check themselves for
+			* deadlocks.
+			*
+			* XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
+			* to be signaled by UnpinBuffer() again and send a request for
+			* deadlocks check if deadlock_timeout happens. This causes the
+			* request to continue to be sent every deadlock_timeout until the
+			* buffer is unpinned or ltime is reached. This would increase the
+			* workload in the startup process and backends. In practice it may
+			* not be so harmful because the period that the buffer is kept pinned
+			* is basically no so long. But we should fix this?
+			*/
+			SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK);
+		}
 	}
 
 	/*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 6837b35fc6d..b7dcff85a8d 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -313,6 +313,7 @@ extern bool BufferIsPermanent(Buffer buffer);
 extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer);
 extern void BufferGetTag(Buffer buffer, RelFileLocator *rlocator,
 						 ForkNumber *forknum, BlockNumber *blknum);
+uint32		BufferGetRefCount(Buffer buffer);
 
 extern void MarkBufferDirtyHint(Buffer buffer, bool buffer_std);
 
diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h
index 6a314c693cd..c75d87b7ddc 100644
--- a/src/include/storage/standby.h
+++ b/src/include/storage/standby.h
@@ -79,7 +79,7 @@ extern void ResolveRecoveryConflictWithTablespace(Oid tsid);
 extern void ResolveRecoveryConflictWithDatabase(Oid dbid);
 
 extern void ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict);
-extern void ResolveRecoveryConflictWithBufferPin(void);
+extern void ResolveRecoveryConflictWithBufferPin(Buffer buffer);
 extern void CheckRecoveryConflictDeadlock(void);
 extern void StandbyDeadLockHandler(void);
 extern void StandbyTimeoutHandler(void);
diff --git a/src/test/recovery/t/053_startup_backend_deadlock.pl b/src/test/recovery/t/053_startup_backend_deadlock.pl
new file mode 100644
index 00000000000..5bd24f315e6
--- /dev/null
+++ b/src/test/recovery/t/053_startup_backend_deadlock.pl
@@ -0,0 +1,127 @@
+#!/usr/bin/perl
+#
+# Test a deadlock between a backend and the startup processes when processing
+# XLOG_PRUNE_PAGE wal record. The test is based on 031_recovery_conflict.pl
+# vanilla test.
+#
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Test settings
+my $deadlock_timeout = 3000; # ms
+my $log_startup_progress_interval = 2000; # ms
+my $testdb = "testdb";
+my $backup_name = 'my_backup';
+my $table1 = "table1";
+my $table2 = "table2";
+
+# Set up nodes
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1);
+
+$node_primary->append_conf('postgresql.conf', qq[
+
+	# for deadlock test
+	max_prepared_transactions = 10
+
+	# wait some to test the wait paths as well, but not long for obvious reasons
+	max_standby_streaming_delay = -1
+
+	# Some of the recovery conflict logging code only gets exercised after
+	# deadlock_timeout. The test doesn't rely on that additional output, but it's
+	# nice to get some minimal coverage of that code.
+	log_recovery_conflict_waits = on
+
+	log_startup_progress_interval = ${log_startup_progress_interval}
+	deadlock_timeout = ${deadlock_timeout}
+	autovacuum = off
+]);
+
+$node_primary->start;
+$node_primary->backup($backup_name);
+
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name, has_streaming => 1);
+$node_standby->start();
+
+my $log_location = -s $node_standby->logfile;
+
+# use a new database, to trigger database recovery conflict
+$node_primary->safe_psql('postgres', "CREATE DATABASE $testdb");
+#$node_primary->safe_psql('postgres', "CREATE TABLE $table2(a int, b int)");
+
+$node_primary->safe_psql($testdb, qq[
+	CREATE TABLE $table1(a int, b int);
+	CREATE TABLE $table2(a int, b int);
+	INSERT INTO $table1 VALUES (1);
+]);
+
+# Generate a few dead rows, to later be cleaned up by vacuum. Then acquire a
+# lock on another relation in a prepared xact, so it's held continuously by
+# the startup process. The standby psql will block acquiring that lock while
+# holding a pin that vacuum needs, triggering the deadlock.
+$node_primary->safe_psql($testdb, qq[
+	BEGIN;
+	INSERT INTO $table1(a) SELECT generate_series(1, 100) i;
+	ROLLBACK;
+]);
+
+$node_primary->safe_psql($testdb, qq[
+	BEGIN;
+	LOCK TABLE $table2;
+	PREPARE TRANSACTION 'lock';
+	INSERT INTO $table1(a) VALUES (170);
+	SELECT txid_current();
+]);
+
+$node_primary->wait_for_catchup($node_standby, 'replay', $node_primary->lsn('write'));
+
+my $psql_standby = $node_standby->background_psql($testdb, on_error_stop => 0);
+
+$psql_standby->query_until(qr/^1$/m, qq[
+	BEGIN;
+	-- hold pin
+	DECLARE test_recovery_conflict_cursor CURSOR FOR SELECT a FROM $table1;
+	FETCH FORWARD FROM test_recovery_conflict_cursor;
+	-- wait for lock held by prepared transaction
+	SELECT * FROM $table2;
+]);
+
+ok(1, "cursor holding conflicting pin, also waiting for lock, established");
+
+# VACUUM will prune away rows, causing a buffer pin conflict, while standby
+# psql is waiting on lock
+$node_primary->safe_psql($testdb, qq[VACUUM $table1;]);
+
+# Wait and check that the deadlock detector was triggered and found a deadlock
+# in the backend process (not in startup process).
+note("Waiting for deadlock detector to launch...");
+check_conflict_log("User transaction caused buffer deadlock with recovery.");
+
+# clean up for next tests
+$node_primary->safe_psql($testdb, qq[ROLLBACK PREPARED 'lock';]);
+
+# Check that expected number of conflicts show in pg_stat_database. Needs to
+# be tested before database is dropped, for obvious reasons.
+my $nconflicts = $node_standby->safe_psql($testdb, "SELECT conflicts FROM pg_stat_database WHERE datname = '$testdb'");
+note("number of recovery conflicts: $nconflicts");
+
+$psql_standby->quit();
+
+sub check_conflict_log
+{
+	my $message = shift;
+	my $old_log_location = $log_location;
+
+	$log_location = $node_standby->wait_for_log(qr/$message/, $log_location);
+
+	cmp_ok($log_location, '>', $old_log_location,
+		"logfile contains terminated connection due to recovery conflict"
+	);
+}
+
+done_testing();
-- 
2.43.0

