From 20995d3594f9870cea7182cf86ced183fe2cd199 Mon Sep 17 00:00:00 2001 From: JoongHyuk Shin Date: Sun, 31 May 2026 22:36:34 +0900 Subject: [PATCH] Prevent repeated deadlock-check signals in standby buffer pin waits When a startup process in hot standby waits on a buffer pin held by backends and deadlock_timeout fires, ResolveRecoveryConflictWithBufferPin() sent a deadlock-check signal to all backends and then returned to its caller (LockBufferForCleanup). The caller looped back and re-armed the deadlock_timeout, so the signal was re-broadcast every deadlock_timeout interval until the buffer was unpinned or max_standby_streaming_delay was reached. Commit 8900b5a9d59a (2021) fixed the same pattern in the parallel lock path but left an XXX comment in the buffer pin path noting the same issue had not been addressed. After sending the deadlock-check signal, wait inside this function for UnpinBuffer() to wake us up. This makes the signal fire once per buffer-pin wait instead of once per deadlock_timeout interval. Because control no longer returns to the caller between the signal and the unpin, the caller's loop-top "recovery still waiting" log emission (bufmgr.c) would otherwise be skipped. Emit that log message inside the function instead, using the caller's waitStart and logged_recovery_conflict state passed by reference. This keeps the user-visible log timing at deadlock_timeout (typically 100ms) rather than letting it slip to max_standby_streaming_delay. The lock path's logging_conflict early-return pattern was considered but not adopted here. The deadlock-check signal handler matches against the bufid published by SetStartupBufferPinWaitBufId(), and LockBufferForCleanup unpublishes between caller iterations. An early-return fix would reopen a race in that unpublish/re-publish gap where a backend with a deferred signal handler sees bufid < 0 and no-ops. The lock path has no such publish/unpublish step because conflicting backends are identified directly from the lock table, so the early-return pattern is safe there. The log-timing fix is covered by src/test/recovery/t/054_bufferpin_conflict_log_timing.pl, which asserts the "still waiting" log appears well before max_standby_streaming_delay. The repeated-signal behavior is verified manually via strace on the startup process. --- src/backend/storage/buffer/bufmgr.c | 3 +- src/backend/storage/ipc/standby.c | 73 ++++++++-- src/include/storage/standby.h | 3 +- src/test/recovery/meson.build | 1 + .../t/054_bufferpin_conflict_log_timing.pl | 130 ++++++++++++++++++ 5 files changed, 198 insertions(+), 12 deletions(-) create mode 100644 src/test/recovery/t/054_bufferpin_conflict_log_timing.pl diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index cc398db124d..5b184f99103 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -6789,7 +6789,8 @@ LockBufferForCleanup(Buffer buffer) /* Publish the bufid that Startup process waits on */ SetStartupBufferPinWaitBufId(buffer - 1); /* Set alarm and then wait to be signaled by UnpinBuffer() */ - ResolveRecoveryConflictWithBufferPin(); + ResolveRecoveryConflictWithBufferPin(waitStart, + &logged_recovery_conflict); /* Reset the published bufid */ SetStartupBufferPinWaitBufId(-1); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index de9092fdf5b..d3dc26efd7f 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -790,13 +790,26 @@ cleanup: * Deadlocks are extremely rare, and relatively expensive to check for, * so we don't do a deadlock check right away ... only if we have had to wait * at least deadlock_timeout. + * + * Parameters waitStart and logged_recovery_conflict are forwarded from the + * caller (LockBufferForCleanup) so that, when this function emits the + * "recovery still waiting" log after the deadlock-check signal, it uses the + * caller's wait-start timestamp and gate. When log_recovery_conflict_waits + * is disabled, waitStart is 0 and the in-function emission is skipped. + * logged_recovery_conflict must be a valid (non-NULL) pointer; the function + * sets *logged_recovery_conflict to true after emission so that neither the + * caller's loop-top branch nor a later entry re-emits the same conflict. + * The caller's "waiting" ps-display suffix remains active across the second + * ProcWaitForSignal until UnpinBuffer() wakes us up. */ void -ResolveRecoveryConflictWithBufferPin(void) +ResolveRecoveryConflictWithBufferPin(TimestampTz waitStart, + bool *logged_recovery_conflict) { TimestampTz ltime; Assert(InHotStandby); + Assert(logged_recovery_conflict != NULL); ltime = GetStandbyLimitTime(); @@ -818,6 +831,7 @@ ResolveRecoveryConflictWithBufferPin(void) if (ltime != 0) { + got_standby_delay_timeout = false; timeouts[cnt].id = STANDBY_TIMEOUT; timeouts[cnt].type = TMPARAM_AT; timeouts[cnt].fin_time = ltime; @@ -851,17 +865,56 @@ ResolveRecoveryConflictWithBufferPin(void) /* * Send out a request for hot-standby backends to check themselves for * deadlocks. - * - * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait - * to be signaled by UnpinBuffer() again and send a request for - * deadlocks check if deadlock_timeout happens. This causes the - * request to continue to be sent every deadlock_timeout until the - * buffer is unpinned or ltime is reached. This would increase the - * workload in the startup process and backends. In practice it may - * not be so harmful because the period that the buffer is kept pinned - * is basically no so long. But we should fix this? */ SendRecoveryConflictWithBufferPin(RECOVERY_CONFLICT_BUFFERPIN_DEADLOCK); + + /* + * Emit the "still waiting" log here, because the second + * ProcWaitForSignal() below keeps control inside this function until + * the buffer is unpinned, so the caller's loop-top log emission would + * otherwise be skipped. The caller passes its own waitStart and + * logged_recovery_conflict so the emission semantics match what the + * caller would have produced on its next iteration. + * + * The caller's loop-top branch additionally guards on + * TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout) before + * logging. That extra guard is redundant here: we just woke from + * STANDBY_DEADLOCK_TIMEOUT, so DeadlockTimeout has elapsed since + * waitStart by construction. The caller-side waitStart != 0 check + * is a shorthand for log_recovery_conflict_waits=on (the caller + * sets waitStart only when that GUC is true), so this branch + * stays a no-op when conflict-wait logging is disabled. + */ + if (waitStart != 0 && !*logged_recovery_conflict) + { + LogRecoveryConflict(RECOVERY_CONFLICT_BUFFERPIN, + waitStart, GetCurrentTimestamp(), + NULL, true); + *logged_recovery_conflict = true; + } + + /* + * Wait here to be signaled by UnpinBuffer(), to prevent the + * subsequent ResolveRecoveryConflictWithBufferPin() call (from the + * caller's loop) from firing another deadlock_timeout and re-sending + * the deadlock-check signal. Without this, the signal would be sent + * every deadlock_timeout interval until the buffer is unpinned or + * ltime is reached. + * + * The same wakeup assumption as the first wait above applies: only + * UnpinBuffer() and STANDBY_TIMEOUT (if armed) can wake us here + * (STANDBY_DEADLOCK_TIMEOUT was one-shot and has already fired). + * + * If STANDBY_TIMEOUT fires during this wait, control returns to the + * caller (LockBufferForCleanup), which re-enters this function on + * its next iteration; the GetCurrentTimestamp() >= ltime fast-path + * at the top then sends the cancel signal. We intentionally do not + * check got_standby_delay_timeout here because the cancel is one + * caller-loop iteration away, and adding the check would duplicate + * logic with the fast-path branch. + */ + got_standby_deadlock_timeout = false; + ProcWaitForSignal(WAIT_EVENT_BUFFER_CLEANUP); } /* diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index 6a314c693cd..6aa0c9ac2bb 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -79,7 +79,8 @@ extern void ResolveRecoveryConflictWithTablespace(Oid tsid); extern void ResolveRecoveryConflictWithDatabase(Oid dbid); extern void ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict); -extern void ResolveRecoveryConflictWithBufferPin(void); +extern void ResolveRecoveryConflictWithBufferPin(TimestampTz waitStart, + bool *logged_recovery_conflict); extern void CheckRecoveryConflictDeadlock(void); extern void StandbyDeadLockHandler(void); extern void StandbyTimeoutHandler(void); diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build index 9eb8ed11425..64d05d5041c 100644 --- a/src/test/recovery/meson.build +++ b/src/test/recovery/meson.build @@ -62,6 +62,7 @@ tests += { 't/051_effective_wal_level.pl', 't/052_checkpoint_segment_missing.pl', 't/053_standby_login_event_trigger.pl', + 't/054_bufferpin_conflict_log_timing.pl', ], }, } diff --git a/src/test/recovery/t/054_bufferpin_conflict_log_timing.pl b/src/test/recovery/t/054_bufferpin_conflict_log_timing.pl new file mode 100644 index 00000000000..570e5108b36 --- /dev/null +++ b/src/test/recovery/t/054_bufferpin_conflict_log_timing.pl @@ -0,0 +1,130 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Verify that "recovery still waiting" is logged after deadlock_timeout +# during a buffer pin recovery conflict, rather than only at +# max_standby_streaming_delay. +# +# This guards against a regression where the startup process waits the +# full max_standby_streaming_delay before returning to the caller, which +# delays the conflict log emitted from LockBufferForCleanup(). +# +# Reported by Ilmar Yunusov on the v2 thread of "Standby deadlock check +# repeats signal every deadlock_timeout". + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use Time::HiRes qw(usleep); + +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); + +# Ilmar's reported environment: short deadlock_timeout, long streaming +# delay, conflict-wait logging enabled. +$node_primary->append_conf( + 'postgresql.conf', qq[ +log_recovery_conflict_waits = on +deadlock_timeout = 100ms +max_standby_streaming_delay = 5s +]); +$node_primary->start; + +my $backup_name = 'bp_backup'; +$node_primary->backup($backup_name); + +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, $backup_name, + has_streaming => 1); +$node_standby->start; + +my $test_db = 'bp_db'; +$node_primary->safe_psql('postgres', "CREATE DATABASE $test_db"); + +my $table = 'bp_table'; +$node_primary->safe_psql( + $test_db, qq[ +CREATE TABLE $table (a int, b int); +INSERT INTO $table SELECT 0, 0 FROM generate_series(1, 20); +]); + +# Create dead tuples that VACUUM FREEZE will prune; this is what +# triggers the buffer pin recovery conflict on the standby. +$node_primary->safe_psql( + $test_db, qq[ +BEGIN; +INSERT INTO $table VALUES (1, 0); +ROLLBACK; +BEGIN; LOCK $table; COMMIT; +]); +$node_primary->wait_for_replay_catchup($node_standby); + +my $psql_standby = + $node_standby->background_psql($test_db, on_error_stop => 0); + +my $cursor = 'bp_cursor'; +my $res = $psql_standby->query_safe( + qq[ +BEGIN; +DECLARE $cursor CURSOR FOR SELECT b FROM $table; +FETCH FORWARD FROM $cursor; +]); +like($res, qr/^0$/m, 'standby cursor pins buffer'); + +my $log_location = -s $node_standby->logfile; + +# Trigger the conflict. +$node_primary->safe_psql($test_db, "VACUUM FREEZE $table;"); + +# Wait for the "still waiting" log. Bound the wait below +# max_standby_streaming_delay (5s) so a regression where the log only +# appears at the streaming-delay boundary fails fast. +my $deadline = time() + 7; +my $found = 0; +my $elapsed_ms; +while (time() < $deadline) +{ + my $log = PostgreSQL::Test::Utils::slurp_file($node_standby->logfile, + $log_location); + if ($log =~ /recovery still waiting after (\d+)\.(\d+) ms: .*buffer pin/) + { + $elapsed_ms = $1 + 0; + $found = 1; + last; + } + usleep(50_000); # 50ms poll +} + +ok($found, + "buffer pin conflict 'still waiting' log appears before " + . "max_standby_streaming_delay"); + +if ($found) +{ + # The log should appear close to deadlock_timeout (100ms), not + # anywhere near max_standby_streaming_delay (5000ms). The v2 + # regression manifests as the log appearing at approximately 5001ms. + # Use a 4000ms upper bound: tight enough to catch the regression + # (which fires at 5001ms) but loose enough to absorb scheduling + # noise on slow buildfarm animals (valgrind, CLOBBER_CACHE_ALWAYS, + # ASan, etc.) where the typical-case latency of ~100ms can grow + # 10x. + cmp_ok($elapsed_ms, '<', 4000, + "'still waiting' elapsed ($elapsed_ms ms) is near " + . "deadlock_timeout, not max_standby_streaming_delay"); +} + +$psql_standby->reconnect_and_clear(); + +# The standby session was terminated by the conflict cancel. Wait for +# replay to catch up before stopping the cluster, mirroring the pattern +# in t/031_recovery_conflict.pl; without this, shutdown can race the +# FATAL message and leave the test output flaky on slow buildfarm +# animals. +$node_primary->wait_for_replay_catchup($node_standby); + +$node_standby->stop; +$node_primary->stop; + +done_testing(); -- 2.52.0