From 04bda25a7d52526282f3f1a0437077c94794b5a2 Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Mon, 15 Dec 2025 17:48:07 +0900
Subject: [PATCH v4] Check presence of REDO record

---
 src/backend/access/transam/xlog.c             |   6 +
 src/backend/access/transam/xlogrecovery.c     |  10 ++
 src/test/recovery/meson.build                 |   1 +
 .../recovery/t/050_redo_segment_missing.pl    | 117 ++++++++++++++++++
 4 files changed, 134 insertions(+)
 create mode 100644 src/test/recovery/t/050_redo_segment_missing.pl

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6a5640df51af..430a38b1a216 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7001,6 +7001,10 @@ CreateCheckPoint(int flags)
 	 */
 	SyncPreCheckpoint();
 
+	/* Run these points outside the critical section. */
+	INJECTION_POINT("create-checkpoint-initial", NULL);
+	INJECTION_POINT_LOAD("create-checkpoint-run");
+
 	/*
 	 * Use a critical section to force system panic if we have trouble.
 	 */
@@ -7151,6 +7155,8 @@ CreateCheckPoint(int flags)
 	if (log_checkpoints)
 		LogCheckpointStart(flags, false);
 
+	INJECTION_POINT_CACHED("create-checkpoint-run", NULL);
+
 	/* Update the process title */
 	update_checkpoint_display(flags, false, false);
 
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index ae2398d6975b..38b594d21709 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -805,6 +805,16 @@ InitWalRecovery(ControlFileData *ControlFile, bool *wasShutdown_ptr,
 		}
 		memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
 		wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
+
+		/* Make sure that REDO location exists. */
+		if (checkPoint.redo < CheckPointLoc)
+		{
+			XLogPrefetcherBeginRead(xlogprefetcher, checkPoint.redo);
+			if (!ReadRecord(xlogprefetcher, LOG, false, checkPoint.ThisTimeLineID))
+				ereport(FATAL,
+						errmsg("could not find redo location %X/%08X referenced by checkpoint record at %X/%08X",
+							   LSN_FORMAT_ARGS(checkPoint.redo), LSN_FORMAT_ARGS(CheckPointLoc)));
+		}
 	}
 
 	if (ArchiveRecoveryRequested)
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 523a5cd5b527..e93248bd66e2 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -58,6 +58,7 @@ tests += {
       't/047_checkpoint_physical_slot.pl',
       't/048_vacuum_horizon_floor.pl',
       't/049_wait_for_lsn.pl',
+      't/050_redo_segment_missing.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/050_redo_segment_missing.pl b/src/test/recovery/t/050_redo_segment_missing.pl
new file mode 100644
index 000000000000..113477e01520
--- /dev/null
+++ b/src/test/recovery/t/050_redo_segment_missing.pl
@@ -0,0 +1,117 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# Evaluates PostgreSQL's recovery behavior when a WAL segment containing the
+# redo record is missing, with a checkpoint record located in a different
+# segment.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+if ($ENV{enable_injection_points} ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
+my $node = PostgreSQL::Test::Cluster->new('testnode');
+$node->init;
+$node->append_conf('postgresql.conf', 'log_checkpoints = on');
+$node->start;
+
+# Check if the extension injection_points is available, as it may be
+# possible that this script is run with installcheck, where the module
+# would not be installed by default.
+if (!$node->check_extension('injection_points'))
+{
+	plan skip_all => 'Extension injection_points not installed';
+}
+$node->safe_psql('postgres', q(CREATE EXTENSION injection_points));
+
+# Note that this uses two injection points based on waits, not one.
+# This may look strange, but this works as a workaround to enforce
+# all memory allocations to happen outside the critical section of a
+# checkpoint.
+# First, "create-checkpoint-initial" is run outside the critical
+# section, and is used as a way to initialize the shared memory required
+# for the wait machinery with its DSM registry.
+# Then, "create-checkpoint-run" is run inside the critical section of
+# a checkpoint, with its callback loaded outside its critical section to
+# allocate any memory required by the callback.
+$node->safe_psql('postgres',
+	q{select injection_points_attach('create-checkpoint-initial', 'wait')});
+$node->safe_psql('postgres',
+	q{select injection_points_attach('create-checkpoint-run', 'wait')});
+
+# Start a psql session to run the checkpoint in the background and make
+# the test wait on the injection point so the checkpoint stops just after
+# it starts.
+my $checkpoint = $node->background_psql('postgres');
+$checkpoint->query_until(
+	qr/starting_checkpoint/,
+	q(\echo starting_checkpoint
+checkpoint;
+));
+
+# Wait for initialization point to finish, the checkpointer is still
+# outside its critical section.  Then release to reach the second
+# point.
+$node->wait_for_event('checkpointer', 'create-checkpoint-initial');
+$node->safe_psql('postgres',
+	q{select injection_points_wakeup('create-checkpoint-initial')});
+
+# Wait until the checkpoint has reached the second injection point,
+# we are now in the middle of a checkpoint running after the redo
+# record has been logged.
+$node->wait_for_event('checkpointer', 'create-checkpoint-run');
+
+# Switch the WAL segment, ensuring that the redo record will be
+# included in a different segment than the checkpoint record.
+$node->safe_psql('postgres', 'SELECT pg_switch_wal()');
+
+# Continue the checkpoint and wait for its completion.
+my $log_offset = -s $node->logfile;
+$node->safe_psql('postgres',
+	q{select injection_points_wakeup('create-checkpoint-run')});
+$node->wait_for_log(qr/checkpoint complete/, $log_offset);
+
+$checkpoint->quit;
+
+# Retrieve the WAL file names for the redo record and checkpoint record.
+my $redo_lsn = $node->safe_psql('postgres',
+	"SELECT redo_lsn FROM pg_control_checkpoint()");
+my $redo_walfile_name =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$redo_lsn')");
+my $checkpoint_lsn = $node->safe_psql('postgres',
+	"SELECT checkpoint_lsn FROM pg_control_checkpoint()");
+my $checkpoint_walfile_name =
+  $node->safe_psql('postgres', "SELECT pg_walfile_name('$checkpoint_lsn')");
+
+# Redo record and checkpoint record should be on different segments.
+isnt($redo_walfile_name, $checkpoint_walfile_name,
+	'redo and checkpoint records on different segments');
+
+# Remove the WAL segment containing the redo record.
+unlink $node->data_dir . "/pg_wal/$redo_walfile_name"
+  or die "Could not remove WAL file: $!";
+
+$node->stop('immediate');
+
+# Use run_log instead of node->start because this test expects
+# that the server ends with an error during recovery.
+run_log(
+	[
+		'pg_ctl',
+		'--pgdata' => $node->data_dir,
+		'--log' => $node->logfile,
+		'start',
+	]);
+
+# Confirm that recovery has failed, as expected.
+my $logfile = slurp_file($node->logfile());
+ok( $logfile =~
+	  qr/FATAL:  could not find redo location .* referenced by checkpoint record at .*/,
+	"ends with FATAL because it could not find redo location");
+
+done_testing();
-- 
2.51.0

