From ef4b59760a910fbfd3ea88011307c3af188c2b42 Mon Sep 17 00:00:00 2001
From: Bryan Green <dbryan.green@gmail.com>
Date: Tue, 4 Nov 2025 23:41:29 -0600
Subject: [PATCH] Fix timing-dependent failure in recovery/004_timeline_switch.

The test verified walreceiver survival by searching logs for termination
messages. However, it called restart() before the timeline switch, which
kills walreceiver and logs the exact message being checked. TAP tests
reuse the same log file across restarts, so log_contains() finds that
message and the test fails.

The failure was intermittent, though: sometimes the message had not yet
been flushed to the log when the test looked for it.  Recent changes to
file handle inheritance altered I/O timing enough to make the test fail
consistently while testing another patch.

Replace the log check with a PID comparison, which actually tests what
we care about.
---
 src/test/recovery/t/004_timeline_switch.pl | 28 ++++++++++++++++------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/src/test/recovery/t/004_timeline_switch.pl b/src/test/recovery/t/004_timeline_switch.pl
index 13874ff866..21e94a1de3 100644
--- a/src/test/recovery/t/004_timeline_switch.pl
+++ b/src/test/recovery/t/004_timeline_switch.pl
@@ -56,6 +56,16 @@ primary_conninfo='$connstr_1'
 ));
 $node_standby_2->restart;
 
+# Wait for walreceiver to reconnect after the restart.  The restart
+# will have killed the old walreceiver, which is expected.  Remember
+# the PID of the new one, so that we can verify later that the same
+# process survived the timeline switch.  Bail out on timeout.
+$node_standby_2->poll_query_until('postgres',
+	"SELECT EXISTS(SELECT 1 FROM pg_stat_wal_receiver)")
+  or die "Timed out while waiting for WAL receiver to start on standby 2";
+my $wr_pid_before_switch = $node_standby_2->safe_psql('postgres',
+	"SELECT pid FROM pg_stat_wal_receiver");
+
 # Insert some data in standby 1 and check its presence in standby 2
 # to ensure that the timeline switch has been done.
 $node_standby_1->safe_psql('postgres',
@@ -66,13 +76,17 @@ my $result =
   $node_standby_2->safe_psql('postgres', "SELECT count(*) FROM tab_int");
 is($result, qq(2000), 'check content of standby 2');
 
-# Check the logs, WAL receiver should not have been stopped while
-# transitioning to its new timeline.  There is no need to rely on an
-# offset in this check of the server logs: a new log file is used on
-# node restart when primary_conninfo is updated above.
-ok( !$node_standby_2->log_contains(
-		"FATAL: .* terminating walreceiver process due to administrator command"
-	),
+
+# Verify that the walreceiver process stayed alive across the timeline
+# switch, by comparing its current PID with the one recorded after
+# reconnection.  This is more reliable than checking log messages,
+# which may include expected shutdown messages from the restart.  The
+# recorded PID is the expected value, the current PID is what we got.
+my $wr_pid_after_switch = $node_standby_2->safe_psql('postgres',
+	"SELECT pid FROM pg_stat_wal_receiver");
+
+is( $wr_pid_after_switch,
+	$wr_pid_before_switch,
 	'WAL receiver should not be stopped across timeline jumps');
 
 # Ensure that a standby is able to follow a primary on a newer timeline
-- 
2.46.0.windows.1