From 16d26eccc862d67aebe14a74ef13f684a5397d84 Mon Sep 17 00:00:00 2001
From: Jan Nidzwetzki
Date: Mon, 15 Jun 2026 13:45:05 +0200
Subject: [PATCH] Shut down instead of promoting when recovery cannot pause

When recovery_target_action is set to 'pause' and recovery reaches the
target, recoveryPausesHere() only actually pauses if hot standby is
active. If hot standby never became active, recovery silently fell
through and promoted, ignoring the configured action.

Hot standby stays inactive while standbyState is
STANDBY_SNAPSHOT_PENDING. We enter that state from an overflowed standby
snapshot and leave it only once a non-overflowed snapshot arrives, or
the oldest running xid advances past the pending snapshot's xmin.
Neither happens when a long-lived transaction keeps the snapshots
overflowed. Enabling hot standby from such a snapshot is not a safe
alternative: the visibility of the missing subtransactions cannot be
determined.

Fix this by shutting down when the target is reached but hot standby is
not active, unless a promotion has already been triggered, in which case
we honor it. This mirrors how 'pause' is downgraded to 'shutdown' when
hot_standby is turned off. The user can then choose a different recovery
target or action.

Document the new behavior and add a TAP test that drives an
archive-recovery standby into this case and checks that it shuts down
rather than pausing or promoting.
---
 doc/src/sgml/config.sgml                      |   4 +
 src/backend/access/transam/xlogrecovery.c     |  28 +++++
 src/test/recovery/meson.build                 |   1 +
 .../recovery/t/054_recovery_pause_subxid.pl   | 115 ++++++++++++++++++
 4 files changed, 148 insertions(+)
 create mode 100644 src/test/recovery/t/054_recovery_pause_subxid.pl

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index fa566c9e553..2c79d200fe9 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -4647,6 +4647,10 @@ restore_command = 'copy "C:\\server\\archivedir\\%f" "%p"' # Windows
         This setting has no effect if no recovery target is set.
         If <xref linkend="guc-hot-standby"/> is not enabled, a setting of
         <literal>pause</literal> will act the same as <literal>shutdown</literal>.
+        Likewise, if hot standby could not be started before the recovery
+        target was reached, for example because the replayed standby
+        snapshots remained overflowed, a setting of <literal>pause</literal>
+        will act the same as <literal>shutdown</literal>.
         If the recovery target is reached while a promotion is ongoing,
         a setting of <literal>pause</literal> will act the same as
         <literal>promote</literal>.
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 73b78a83fa7..cd9f132992d 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -1832,6 +1832,34 @@ PerformWalRecovery(void)
 					proc_exit(3);
 
 				case RECOVERY_TARGET_ACTION_PAUSE:
+
+					/*
+					 * If hot standby is inactive, recoveryPausesHere() does
+					 * nothing, so we would fall through and promote, silently
+					 * ignoring the requested action.
+					 *
+					 * Hot standby stays inactive while standbyState is
+					 * STANDBY_SNAPSHOT_PENDING. We enter that state from an
+					 * overflowed standby snapshot and leave it only once a
+					 * non-overflowed snapshot arrives, or the oldest running
+					 * xid advances past the pending snapshot's xmin (see
+					 * ProcArrayApplyRecoveryInfo()). So, instead of falling
+					 * through and silently promoting, we shut down and let
+					 * the user choose a different recovery target or action.
+					 *
+					 * If a promotion has already been triggered, let it
+					 * proceed (the fall-through below promotes) rather than
+					 * shutting down.
+					 */
+					if (!LocalHotStandbyActive && !LocalPromoteIsTriggered)
+					{
+						ereport(LOG,
+								(errmsg("recovery cannot pause at the recovery target because hot standby is not active"),
+								 errdetail("The standby snapshots stayed overflowed and the transactions responsible did not finish, so hot standby never started."),
+								 errhint("Set \"recovery_target_action\" to \"promote\", or choose a recovery target at which hot standby can be enabled.")));
+						proc_exit(3);
+					}
+
 					SetRecoveryPause(true);
 					recoveryPausesHere(true);
 
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 9eb8ed11425..7dcf1ddbe13 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -62,6 +62,7 @@ tests += {
       't/051_effective_wal_level.pl',
       't/052_checkpoint_segment_missing.pl',
       't/053_standby_login_event_trigger.pl',
+      't/054_recovery_pause_subxid.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/054_recovery_pause_subxid.pl b/src/test/recovery/t/054_recovery_pause_subxid.pl
new file mode 100644
index 00000000000..a80b402ea78
--- /dev/null
+++ b/src/test/recovery/t/054_recovery_pause_subxid.pl
@@ -0,0 +1,115 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+#
+# Verify that a standby reaching a recovery target with
+# 'recovery_target_action = pause' shuts down, rather than silently
+# promoting, when hot standby never became active because the standby
+# snapshot was still incomplete (STANDBY_SNAPSHOT_PENDING).
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Initialize primary node.
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1, has_archiving => 1);
+$node_primary->start;
+
+# Create a table for the subtransactions to insert into.
+$node_primary->safe_psql('postgres', "CREATE TABLE subxid_test (id int);");
+
+# Open a transaction with more than PGPROC_MAX_CACHED_SUBXIDS (64)
+# subtransactions and keep it open to force a snapshot overflow. The INSERT
+# (not the SAVEPOINT) is what assigns each subtransaction its XID.
+my $bg = $node_primary->background_psql('postgres');
+$bg->query_safe('BEGIN');
+for my $i (1 .. 70)
+{
+	$bg->query_safe("SAVEPOINT s$i");
+	$bg->query_safe("INSERT INTO subxid_test VALUES ($i)");
+}
+
+# Take a base backup.
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Perform a checkpoint and record the LSN as recovery target.
+$node_primary->safe_psql('postgres', 'CHECKPOINT');
+my $until_lsn =
+  $node_primary->safe_psql('postgres', 'SELECT pg_current_wal_lsn()');
+
+# Force a segment switch so the WAL segment containing the recovery target is
+# archived and reachable through the standby's restore_command.
+$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Create an archive-recovery standby that is configured to pause once it
+# reaches the target.
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_restoring => 1);
+$node_standby->append_conf('postgresql.conf',
+	"recovery_target_lsn = '$until_lsn'");
+$node_standby->append_conf('postgresql.conf', 'recovery_target_action = pause');
+# Raise the log level so we can observe that the standby kept waiting for a
+# non-overflowed snapshot.
+$node_standby->append_conf('postgresql.conf', 'log_min_messages = debug1');
+
+# The standby reaches the recovery target but, because every snapshot it saw
+# was overflowed, it could never enable hot standby. It must therefore refuse
+# to pause and shut down instead of promoting. The server stops on its own, so
+# we drive it with pg_ctl directly (a regular ->start() would error out because
+# the server never becomes ready to accept connections).
+run_log(
+	[
+		'pg_ctl',
+		'--pgdata' => $node_standby->data_dir,
+		'--log' => $node_standby->logfile,
+		'start',
+	]);
+
+# Wait for the standby to resolve the recovery target. It must shut down; a
+# regressed fix would instead promote and open the server for connections.
+$node_standby->wait_for_log(
+	qr/database system is shut down|database system is ready to accept/);
+
+my $logfile = slurp_file($node_standby->logfile());
+
+# The standby actually exercised the overflowed-snapshot path...
+like(
+	$logfile,
+	qr/recovery snapshot waiting for non-overflowed snapshot/,
+	'standby kept waiting for a non-overflowed snapshot');
+
+# ... and never reached STANDBY_SNAPSHOT_READY.
+unlike(
+	$logfile,
+	qr/recovery snapshots are now enabled/,
+	'standby never reached a ready (non-overflowed) snapshot');
+
+# It must have shut down for the documented reason ...
+like(
+	$logfile,
+	qr/recovery cannot pause at the recovery target because hot standby is not active/,
+	'standby refused to pause without hot standby');
+
+# ... by taking the clean shutdown-at-recovery-target path (not a crash) ...
+like(
+	$logfile,
+	qr/shutdown at recovery target/,
+	'standby shut down cleanly at the recovery target');
+
+# ... and never opened up for connections (neither as a hot standby nor by
+# promoting), which is what would have allowed unsafe queries.
+unlike(
+	$logfile,
+	qr/database system is ready to accept/,
+	'standby never accepted connections');
+
+# Close the held-open transaction on the primary.
+$bg->quit;
+
+$node_standby->teardown_node;
+$node_primary->teardown_node;
+
+done_testing();
-- 
2.47.3