From 11f20b33db27013cfd39ac878c0e453a302eca2c Mon Sep 17 00:00:00 2001 From: "masao.fujii" Date: Wed, 10 Jun 2026 12:12:55 +0900 Subject: [PATCH v4 2/2] Add new deadlock conflict test in 031_recovery_conflict.pl It checks that the deadlock detector is triggered on the hot standby's client backend process when a deadlock with the startup process occurs in the scenario of an interference of log_startup_progress_interval and deadlock_timeout timeouts. An infinite deadlock occurs between the standby's startup process and a backend process when handling an XLOG_PRUNE_PAGE record. The issue arises because the startup process fails to trigger the deadlock detector in the conflicting backend. --- src/test/recovery/t/031_recovery_conflict.pl | 166 ++++++++++++------- 1 file changed, 109 insertions(+), 57 deletions(-) diff --git a/src/test/recovery/t/031_recovery_conflict.pl b/src/test/recovery/t/031_recovery_conflict.pl index 7a740f69806..06de8b51668 100644 --- a/src/test/recovery/t/031_recovery_conflict.pl +++ b/src/test/recovery/t/031_recovery_conflict.pl @@ -223,63 +223,7 @@ $node_standby->adjust_conf( $node_standby->restart(); $psql_standby->reconnect_and_clear(); -# Generate a few dead rows, to later be cleaned up by vacuum. Then acquire a -# lock on another relation in a prepared xact, so it's held continuously by -# the startup process. The standby psql will block acquiring that lock while -# holding a pin that vacuum needs, triggering the deadlock. 
-$node_primary->safe_psql( - $test_db, - qq[ -CREATE TABLE $table1(a int, b int); -INSERT INTO $table1 VALUES (1); -BEGIN; -INSERT INTO $table1(a) SELECT generate_series(1, 100) i; -ROLLBACK; -BEGIN; -LOCK TABLE $table2; -PREPARE TRANSACTION 'lock'; -INSERT INTO $table1(a) VALUES (170); -SELECT txid_current(); -]); - -$node_primary->wait_for_replay_catchup($node_standby); - -$res = $psql_standby->query_until( - qr/^1$/m, qq[ - BEGIN; - -- hold pin - DECLARE $cursor1 CURSOR FOR SELECT a FROM $table1; - FETCH FORWARD FROM $cursor1; - -- wait for lock held by prepared transaction - SELECT * FROM $table2; - ]); -ok(1, - "$sect: cursor holding conflicting pin, also waiting for lock, established" -); - -# just to make sure we're waiting for lock already -ok( $node_standby->poll_query_until( - 'postgres', qq[ -SELECT 'waiting' FROM pg_locks WHERE locktype = 'relation' AND NOT granted; -], 'waiting'), - "$sect: lock acquisition is waiting"); - -# VACUUM FREEZE will prune away rows, causing a buffer pin conflict, while -# standby psql is waiting on lock -$node_primary->safe_psql($test_db, qq[VACUUM FREEZE $table1;]); -$node_primary->wait_for_replay_catchup($node_standby); - -check_conflict_log("User transaction caused buffer deadlock with recovery."); -$psql_standby->reconnect_and_clear(); -check_conflict_stat("deadlock"); - -# clean up for next tests -$node_primary->safe_psql($test_db, qq[ROLLBACK PREPARED 'lock';]); -$node_standby->adjust_conf('postgresql.conf', 'max_standby_streaming_delay', - '50ms'); -$node_standby->restart(); -$psql_standby->reconnect_and_clear(); - +test_deadlock_conflict_scenario(); # Check that expected number of conflicts show in pg_stat_database. Needs to # be tested before database is dropped, for obvious reasons. 
@@ -289,6 +233,11 @@ is( $node_standby->safe_psql( $expected_conflicts, qq[$expected_conflicts recovery conflicts shown in pg_stat_database]); +$node_standby->adjust_conf('postgresql.conf', 'max_standby_streaming_delay', + '50ms'); +$node_standby->restart(); +$psql_standby->reconnect_and_clear(); + ## RECOVERY CONFLICT 6: Database conflict $sect = "database conflict"; @@ -299,6 +248,47 @@ $node_primary->wait_for_replay_catchup($node_standby); check_conflict_log("User was connected to a database that must be dropped"); +## FOLLOW-UP SCENARIO: startup deadlock from coalesced timeouts +# +# This is an additional deadlock-coverage scenario, not a new numbered +# recovery-conflict type. It stays here, after the numbered conflict cases, +# because the main test database has already been dropped above and this +# follow-up can run independently in the always-present "postgres" database. +# +# The scenario exercises an infinite deadlock between the standby startup +# process and a standby backend while replaying XLOG_PRUNE_PAGE. The backend +# holds a buffer pin and then waits for a relation lock; replay of VACUUM +# needs that pin to be released, but the lock is held on behalf of replay. +# +# The bug was caused by interference between deadlock_timeout and +# log_startup_progress_interval: timeout.c can coalesce timer events, so each +# timeout handler must verify that its own timeout really fired. In this case +# the startup process could miss the deadlock_timeout event and fail to ask +# the conflicting backend to run deadlock detection. + +$sect = "startup deadlock from coalesced timeouts"; + +# Use the always-present postgres database so this follow-up remains +# independent of the earlier database-conflict case that dropped $test_db. 
+$test_db = 'postgres'; + +$node_standby->adjust_conf('postgresql.conf', 'max_standby_streaming_delay', + '-1'); +$node_standby->adjust_conf('postgresql.conf', + 'log_startup_progress_interval', '2s'); +$node_standby->adjust_conf('postgresql.conf', 'deadlock_timeout', '3s'); +$node_standby->restart(); +$psql_standby = $node_standby->background_psql($test_db, on_error_stop => 0); + +test_deadlock_conflict_scenario(); + +# Check that the additional startup/backend deadlock scenario is also counted +# in pg_stat_database for the postgres database. +is( $node_standby->safe_psql( + $test_db, + qq[SELECT conflicts FROM pg_stat_database WHERE datname='$test_db';]), + 1, + qq[1 recovery conflict shown in pg_stat_database for $test_db]); # explicitly shut down psql instances gracefully - to avoid hangs or worse on # windows @@ -331,3 +321,65 @@ sub check_conflict_stat is($count, 1, "$sect: stats show conflict on standby"); } + +sub test_deadlock_conflict_scenario +{ + my $res; + + # Generate a few dead rows, to later be cleaned up by vacuum. Then + # acquire a lock on another relation in a prepared xact, so it's held + # continuously by the startup process. The standby psql will block + # acquiring that lock while holding a pin that vacuum needs, triggering + # the deadlock. 
+ $node_primary->safe_psql( + $test_db, + qq[ +CREATE TABLE IF NOT EXISTS $table1(a int, b int); +CREATE TABLE IF NOT EXISTS $table2(a int, b int); +INSERT INTO $table1 VALUES (1); +BEGIN; +INSERT INTO $table1(a) SELECT generate_series(1, 100) i; +ROLLBACK; +BEGIN; +LOCK TABLE $table2; +PREPARE TRANSACTION 'lock'; +INSERT INTO $table1(a) VALUES (170); +SELECT txid_current(); +]); + + $node_primary->wait_for_replay_catchup($node_standby); + + $res = $psql_standby->query_until( + qr/^1$/m, qq[ + BEGIN; + -- hold pin + DECLARE $cursor1 CURSOR FOR SELECT a FROM $table1; + FETCH FORWARD FROM $cursor1; + -- wait for lock held by prepared transaction + SELECT * FROM $table2; +]); + + like($res, qr/^1$/m, + "$sect: cursor holding conflicting pin, also waiting for lock, established" + ); + + # Just to make sure we're waiting for the relation lock already. + ok( $node_standby->poll_query_until( + 'postgres', qq[ +SELECT 'waiting' FROM pg_locks WHERE locktype = 'relation' AND NOT granted; +], 'waiting'), + "$sect: lock acquisition is waiting"); + + # VACUUM FREEZE will prune away rows, causing a buffer pin conflict, while + # standby psql is waiting on lock. + $node_primary->safe_psql($test_db, qq[VACUUM FREEZE $table1;]); + $node_primary->wait_for_replay_catchup($node_standby); + + check_conflict_log( + "User transaction caused buffer deadlock with recovery."); + $psql_standby->reconnect_and_clear(); + check_conflict_stat("deadlock"); + + # clean up for next tests + $node_primary->safe_psql($test_db, qq[ROLLBACK PREPARED 'lock';]); +} -- 2.53.0