From 4fdf81fc15f89f6c53cb1780ccf69f96b72de623 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Mon, 8 Jun 2026 07:04:16 +0000
Subject: [PATCH v1 2/4] Add injection-point test for logical decoding timeline
 race during promotion

Add an injection point "promotion-after-wal-segment-cleanup" in StartupXLOG(),
right after CleanupAfterArchiveRecovery() removes old timeline WAL segments but
before SharedRecoveryState is set to RECOVERY_STATE_DONE.

Add a test in 035_standby_logical_decoding.pl that uses this injection point to
deterministically reproduce the race condition where a walsender doing logical
decoding would pick the old timeline after segment removal, resulting in:

ERROR: requested WAL segment ... has already been removed

The test pauses the startup process at the injection point, starts
pg_recvlogical (whose walsender must read WAL from a removed old-timeline
segment), then resumes promotion and verifies that decoding succeeds.

Author: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Reviewed-by:
Discussion: https://postgr.es/m/7daef094-abf3-4672-bc23-3df4763b16a3%40gmail.com
---
 src/backend/access/transam/xlog.c             |  2 +
 .../t/035_standby_logical_decoding.pl         | 66 +++++++++++++++++++
 2 files changed, 68 insertions(+)
  97.5% src/test/recovery/t/

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index d69d03b2ef3..6c2304fef33 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6571,6 +6571,8 @@ StartupXLOG(void)
 	if (ArchiveRecoveryRequested)
 		CleanupAfterArchiveRecovery(EndOfLogTLI, EndOfLog, newTLI);
 
+	INJECTION_POINT("promotion-after-wal-segment-cleanup", NULL);
+
 	/*
 	 * Local WAL inserts enabled, so it's time to finish initialization of
 	 * commit timestamp.
diff --git a/src/test/recovery/t/035_standby_logical_decoding.pl b/src/test/recovery/t/035_standby_logical_decoding.pl
index 4421059f100..ce80123844d 100644
--- a/src/test/recovery/t/035_standby_logical_decoding.pl
+++ b/src/test/recovery/t/035_standby_logical_decoding.pl
@@ -1060,4 +1060,70 @@ is($cascading_stdout, $expected,
 	'got same expected output from pg_recvlogical decoding session on cascading standby'
 );
 
+##################################################
+# Test that logical decoding on standby correctly handles the timeline
+# change during promotion. There is a window during promotion where
+# RecoveryInProgress() still returns true but old timeline WAL segments
+# have already been removed. Verify the walsender uses the correct
+# timeline in this window.
+##################################################
+
+# Create a logical slot on the cascading standby for this test.
+$node_cascading_standby->create_logical_slot_on_standby($node_standby,
+	'race_slot', 'testdb');
+
+# Insert rows on node_standby and wait for replay so the slot has WAL to decode.
+$node_standby->safe_psql('testdb',
+	qq[INSERT INTO decoding_test(x,y) SELECT s, s::text FROM generate_series(10,13) s;]
+);
+$node_standby->wait_for_replay_catchup($node_cascading_standby);
+
+# Create the injection_points extension on node_standby, then wait for replay.
+$node_standby->safe_psql('testdb', 'CREATE EXTENSION injection_points;');
+$node_standby->wait_for_replay_catchup($node_cascading_standby);
+
+# Attach injection point to pause startup after WAL segment cleanup
+# but before RecoveryInProgress() flips to false.
+$node_cascading_standby->safe_psql('testdb',
+	"SELECT injection_points_attach('promotion-after-wal-segment-cleanup', 'wait');"
+);
+
+# Promote with no-wait so we can synchronize with the injection point.
+$node_cascading_standby->safe_psql('testdb', "SELECT pg_promote(false)");
+
+# Wait for startup to pause after removing old timeline WAL segments.
+$node_cascading_standby->wait_for_event('startup',
+	'promotion-after-wal-segment-cleanup');
+
+# Start pg_recvlogical on race_slot while the startup process is still paused.
+my ($stdout2, $stderr2);
+my $handle2 = IPC::Run::start(
+	[
+		'pg_recvlogical',
+		'--dbname' => $node_cascading_standby->connstr('testdb'),
+		'--slot' => 'race_slot',
+		'--option' => 'include-xids=0',
+		'--option' => 'skip-empty-xacts=1',
+		'--file' => '-',
+		'--no-loop',
+		'--start',
+	],
+	'>' => \$stdout2,
+	'2>' => \$stderr2,
+	IPC::Run::timeout($default_timeout));
+
+# Wait for the walsender to acquire the slot.
+$node_cascading_standby->poll_query_until('testdb',
+	qq[SELECT EXISTS (SELECT 1 FROM pg_replication_slots WHERE slot_name = 'race_slot' AND active_pid IS NOT NULL)]
+) or die "slot race_slot never became active";
+
+# Resume promotion by waking up the injection point.
+$node_cascading_standby->safe_psql('testdb',
+	"SELECT injection_points_wakeup('promotion-after-wal-segment-cleanup');");
+
+# Verify pg_recvlogical successfully decodes the data.
+$pump_timeout = IPC::Run::timer($default_timeout);
+ok( pump_until($handle2, $pump_timeout, \$stdout2, qr/COMMIT/s),
+	'pg_recvlogical works during promotion timeline switch');
+
 done_testing();
-- 
2.34.1

