From 2d78ccc3ae05a1930428d47bb7aaf0221d9ae0ad Mon Sep 17 00:00:00 2001
From: Jan Nidzwetzki <jan@planetscale.com>
Date: Thu, 11 Jun 2026 11:38:43 +0200
Subject: [PATCH 2/2] Honor recovery_target_action='pause' on overflowed
 snapshots
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Ensure that recovery_target_action='pause' takes effect even when every
standby snapshot replayed was overflowed and standbyState therefore
remains at STANDBY_SNAPSHOT_PENDING.

In that situation hot standby connections were never enabled, so when
recovery reached its target recoveryPausesHere() bailed out early (it
refuses to pause unless connections are possible) and the instance
promoted instead of pausing, silently ignoring the configured action.

Fix this by enabling hot standby connections from the pending snapshot
just before pausing.  This is safe because replay is frozen at this
point: the only ways out of the pause are promotion and shutdown, so no
transaction's commit status can change afterwards, and any transaction a
query finds committed in CLOG necessarily committed before that query's
snapshot.

Add a TAP test that drives a standby into the overflowed-snapshot state
with a long-running transaction holding more than
PGPROC_MAX_CACHED_SUBXIDS subtransactions, and verifies that the standby
pauses (rather than promotes) at the target and that visibility is
correct both before and after a restart.

Co-authored-by: Fabrízio de Royes Mello <fabrizio@planetscale.com>
---
 src/backend/access/transam/xlogrecovery.c     |  28 ++++
 src/test/recovery/meson.build                 |   1 +
 .../recovery/t/054_recovery_pause_subxid.pl   | 138 ++++++++++++++++++
 3 files changed, 167 insertions(+)
 create mode 100644 src/test/recovery/t/054_recovery_pause_subxid.pl

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 254603d7ef6..27989fc7332 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -1833,6 +1833,34 @@ PerformWalRecovery(void)
 					proc_exit(3);
 
 				case RECOVERY_TARGET_ACTION_PAUSE:
+
+					/*
+					 * If we have not been able to enable hot standby yet (for
+					 * example because every standby snapshot we have seen was
+					 * overflowed, leaving standbyState at
+					 * STANDBY_SNAPSHOT_PENDING), do so now. Otherwise
+					 * recoveryPausesHere() would refuse to pause and we would
+					 * fall through and promote instead.
+					 *
+					 * Enabling connections from a pending snapshot is safe at
+					 * this point because we will replay no more WAL: the only
+					 * ways out of this pause are promotion and shutdown. With
+					 * replay frozen, no transaction's commit status can
+					 * change after this point, so any transaction that a
+					 * query finds committed in CLOG necessarily committed
+					 * before that query's snapshot.
+					 *
+					 * hot_standby is known to be enabled, otherwise the
+					 * action would have been downgraded to "shutdown" when
+					 * recovery parameters were validated. standbyState is
+					 * therefore at least STANDBY_SNAPSHOT_PENDING: we only
+					 * get here after reaching consistency, and consistency
+					 * cannot be reached without first replaying the
+					 * running-xacts record, and replaying it advances
+					 * standbyState past STANDBY_INITIALIZED.
+					 */
+					EnableHotStandbyConnections();
+
 					SetRecoveryPause(true);
 					recoveryPausesHere(true);
 
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 9eb8ed11425..7dcf1ddbe13 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -62,6 +62,7 @@ tests += {
       't/051_effective_wal_level.pl',
       't/052_checkpoint_segment_missing.pl',
       't/053_standby_login_event_trigger.pl',
+      't/054_recovery_pause_subxid.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/054_recovery_pause_subxid.pl b/src/test/recovery/t/054_recovery_pause_subxid.pl
new file mode 100644
index 00000000000..f53d9b18147
--- /dev/null
+++ b/src/test/recovery/t/054_recovery_pause_subxid.pl
@@ -0,0 +1,138 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+#
+# Verify that a hot standby reaching a recovery target with
+# 'recovery_target_action = pause' actually pauses, even when the
+# standby snapshot it replayed was marked as overflowed.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Initialize primary node.
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+$node_primary->init(allows_streaming => 1, has_archiving => 1);
+$node_primary->start;
+
+# Create a table for the subtransactions to insert into.
+$node_primary->safe_psql('postgres', "CREATE TABLE subxid_test (id int);");
+
+# Open a transaction with more than PGPROC_MAX_CACHED_SUBXIDS (64)
+# subtransactions and keep it open to force a snapshot overflow.
+my $bg = $node_primary->background_psql('postgres');
+$bg->query_safe('BEGIN');
+for my $i (1 .. 70)
+{
+	$bg->query_safe("SAVEPOINT s$i");
+	$bg->query_safe("INSERT INTO subxid_test VALUES ($i)");
+}
+
+# Take a base backup.
+my $backup_name = 'my_backup';
+$node_primary->backup($backup_name);
+
+# Insert committed data after the base backup but before the recovery target.
+# It is not part of the backup, so it can only reach the standby through WAL
+# replay, and it commits before until_lsn -- hence it must be visible on the
+# paused standby.
+$node_primary->safe_psql('postgres',
+	'CREATE TABLE committed_test (id int); '
+	  . 'INSERT INTO committed_test VALUES (1), (2), (3);');
+
+# Determine the recovery target LSN.
+my $until_lsn =
+  $node_primary->safe_psql('postgres', 'SELECT pg_current_wal_lsn()');
+
+# Force a segment switch so the WAL segment containing the recovery target is
+# archived and reachable through the standby's restore_command.
+$node_primary->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Create an archive-recovery standby that pauses once it reaches the target.
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+$node_standby->init_from_backup($node_primary, $backup_name,
+	has_restoring => 1);
+$node_standby->append_conf('postgresql.conf',
+	"recovery_target_lsn = '$until_lsn'");
+$node_standby->append_conf('postgresql.conf', 'recovery_target_action = pause');
+# Raise the log level so we can observe that the standby kept waiting for a
+# non-overflowed snapshot.
+$node_standby->append_conf('postgresql.conf', 'log_min_messages = debug1');
+$node_standby->start;
+
+# After reaching the recovery target the standby should pause and stay in
+# recovery.  Wait for a settled state: 'paused', or 'not in recovery' if the
+# standby wrongly promoted.  The transient 'pause requested' keeps us polling.
+my $state_query =
+    "SELECT CASE WHEN pg_is_in_recovery()"
+  . " THEN pg_get_wal_replay_pause_state()"
+  . " ELSE 'not in recovery' END";
+$node_standby->poll_query_until('postgres',
+	"$state_query IN ('paused', 'not in recovery')")
+  or die "Timed out while waiting for the standby to reach the recovery target";
+
+# ... and the expectation is that we see a 'paused' recovery state now.
+is($node_standby->safe_psql('postgres', $state_query),
+	'paused', 'standby pauses at recovery target instead of ending recovery');
+
+# Confirm it paused at the requested target.
+is( $node_standby->safe_psql(
+		'postgres', "SELECT '$until_lsn'::pg_lsn <= pg_last_wal_replay_lsn()"),
+	't',
+	'standby replayed up to the recovery target LSN');
+
+# Make sure the standby actually exercised the overflowed-snapshot path.
+ok( $node_standby->log_contains(
+		qr/recovery snapshot waiting for non-overflowed snapshot/),
+	'standby paused while still waiting for a non-overflowed snapshot');
+
+# Furthermore, it must never have reached STANDBY_SNAPSHOT_READY.
+ok( !$node_standby->log_contains(qr/recovery snapshots are now enabled/),
+	'standby never reached a ready (non-overflowed) snapshot');
+
+# The rows inserted by the still-open transaction are physically present in the
+# base backup, but they belong to subtransactions that never committed.  Even
+# though the standby served queries from an overflowed (pending) snapshot, those
+# rows must not be visible: their xmin is uncommitted in CLOG.
+is( $node_standby->safe_psql('postgres', 'SELECT count(*) FROM subxid_test'),
+	'0',
+	'uncommitted overflowed-subxid rows are not visible before restart');
+
+# Conversely, data committed after the backup but before the target must be
+# visible: it was replayed, and it committed before any query's snapshot.
+is( $node_standby->safe_psql('postgres', 'SELECT count(*) FROM committed_test'),
+	'3',
+	'data committed before the recovery target is visible before restart');
+
+# Close the held-open transaction on the primary.
+$bg->quit;
+
+# Restarting the standby must lead to the pause state again.  Wait for a
+# settled state, since 'pause requested' is only a transient step.
+$node_standby->restart;
+$node_standby->poll_query_until('postgres',
+	"$state_query IN ('paused', 'not in recovery')")
+  or die "Timed out while waiting for the standby to pause again after restart";
+
+is($node_standby->safe_psql('postgres', $state_query),
+	'paused', 'standby pauses again at recovery target after restart');
+
+is( $node_standby->safe_psql(
+		'postgres', "SELECT '$until_lsn'::pg_lsn <= pg_last_wal_replay_lsn()"),
+	't',
+	'standby replayed up to the recovery target LSN after restart');
+
+# The same must hold after the restart: the data from the uncommitted
+# subtransactions must not be visible.
+is( $node_standby->safe_psql('postgres', 'SELECT count(*) FROM subxid_test'),
+	'0',
+	'uncommitted overflowed-subxid rows are not visible after restart');
+
+is( $node_standby->safe_psql('postgres', 'SELECT count(*) FROM committed_test'),
+	'3',
+	'data committed before the recovery target is visible after restart');
+
+$node_standby->teardown_node;
+$node_primary->teardown_node;
+
+done_testing();
-- 
2.47.3