From 93970d6e62812738778668fe5551198e8749dc7f Mon Sep 17 00:00:00 2001
From: John Hsu <johnyvr@gmail.com>
Date: Wed, 8 Oct 2025 21:20:59 +0000
Subject: [PATCH v12] Avoid copying WAL segments before divergence to speed up
 pg_rewind

Add a check to pg_rewind so that WAL segment files present on both
the source and the target, and located before the segment containing
the point of WAL divergence, are not copied from source to target.

Such segments were replicated from the original primary before the
servers diverged, so they are expected to be identical on both sides
and can safely be skipped.  A segment whose size differs between source
and target is still copied, as a sanity check.

Author: John Hsu <johnhyvr@gmail.com>
Co-Author: Justin Kwan <justinpkwan@outlook.com>
---
 src/bin/pg_rewind/filemap.c         |  53 ++++++++++-
 src/bin/pg_rewind/filemap.h         |   3 +-
 src/bin/pg_rewind/meson.build       |   1 +
 src/bin/pg_rewind/pg_rewind.c       |   9 +-
 src/bin/pg_rewind/t/011_wal_copy.pl | 141 ++++++++++++++++++++++++++++
 doc/src/sgml/ref/pg_rewind.sgml     |  32 +++++--
 6 files changed, 226 insertions(+), 13 deletions(-)
 create mode 100644 src/bin/pg_rewind/t/011_wal_copy.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index 00f5d60d6209..2d82edec0608 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -706,11 +706,45 @@ final_filemap_cmp(const void *a, const void *b)
 		return strcmp(fa->path, fb->path);
 }
 
+/*
+ * Decide whether a WAL segment file needs to be copied, based on its
+ * segment number relative to last_common_segno and on its size.
+ *
+ * Returns FILE_ACTION_NONE or FILE_ACTION_COPY.  Caller is responsible
+ * for ensuring that the file exists on both source and target servers.
+ */
+static file_action_t
+decide_wal_file_action(const char *fname, XLogSegNo last_common_segno,
+					   size_t source_size, size_t target_size)
+{
+	TimeLineID	file_tli;
+	XLogSegNo	file_segno;
+
+	/* Parse the segment number from the file name; file_tli is unused */
+	XLogFromFileName(fname, &file_tli, &file_segno, WalSegSz);
+
+	/*
+	 * Skip the copy for segments strictly before the last common segment.
+	 *
+	 * Such a file exists on both source and target, so it is expected to be
+	 * identical on both sides.  As a sanity check, it is still copied if the
+	 * two sizes differ.  The segment at last_common_segno and all later
+	 * segments are always copied.
+	 */
+	if (file_segno < last_common_segno && source_size == target_size)
+	{
+		pg_log_debug("WAL segment \"%s\" not copied to target", fname);
+		return FILE_ACTION_NONE;
+	}
+
+	return FILE_ACTION_COPY;
+}
+
 /*
  * Decide what action to perform to a file.
  */
 static file_action_t
-decide_file_action(file_entry_t *entry)
+decide_file_action(file_entry_t *entry, XLogSegNo last_common_segno)
 {
 	const char *path = entry->path;
 
@@ -814,8 +848,17 @@ decide_file_action(file_entry_t *entry)
 		case FILE_TYPE_REGULAR:
 			if (entry->content_type == FILE_CONTENT_TYPE_WAL)
 			{
-				/* It's a WAL file, copy it. */
-				return FILE_ACTION_COPY;
+				/* Handle WAL segment file */
+				const char *filename = last_dir_separator(entry->path);
+
+				if (filename == NULL)
+					filename = entry->path;
+				else
+					filename++; /* Skip the separator */
+
+				return decide_wal_file_action(filename, last_common_segno,
+											  entry->source_size,
+											  entry->target_size);
 			}
 			else if (entry->content_type != FILE_CONTENT_TYPE_RELATION)
 			{
@@ -876,7 +919,7 @@ decide_file_action(file_entry_t *entry)
  * should be executed.
  */
 filemap_t *
-decide_file_actions(void)
+decide_file_actions(XLogSegNo last_common_segno)
 {
 	int			i;
 	filehash_iterator it;
@@ -886,7 +929,7 @@ decide_file_actions(void)
 	filehash_start_iterate(filehash, &it);
 	while ((entry = filehash_iterate(filehash, &it)) != NULL)
 	{
-		entry->action = decide_file_action(entry);
+		entry->action = decide_file_action(entry, last_common_segno);
 	}
 
 	/*
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index fada420fc230..5145f0b4c46c 100644
--- a/src/bin/pg_rewind/filemap.h
+++ b/src/bin/pg_rewind/filemap.h
@@ -11,6 +11,7 @@
 #include "datapagemap.h"
 #include "storage/block.h"
 #include "storage/relfilelocator.h"
+#include "access/xlogdefs.h"
 
 /* these enum values are sorted in the order we want actions to be processed */
 typedef enum
@@ -113,7 +114,7 @@ extern void process_target_wal_block_change(ForkNumber forknum,
 											RelFileLocator rlocator,
 											BlockNumber blkno);
 
-extern filemap_t *decide_file_actions(void);
+extern filemap_t *decide_file_actions(XLogSegNo last_common_segno);
 extern void calculate_totals(filemap_t *filemap);
 extern void print_filemap(filemap_t *filemap);
 
diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build
index 36171600ccaf..97f001d94a50 100644
--- a/src/bin/pg_rewind/meson.build
+++ b/src/bin/pg_rewind/meson.build
@@ -44,6 +44,7 @@ tests += {
       't/008_min_recovery_point.pl',
       't/009_growing_files.pl',
       't/010_keep_recycled_wals.pl',
+      't/011_wal_copy.pl',
     ],
   },
 }
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 0c68dd4235e6..1b953692b176 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -147,6 +147,7 @@ main(int argc, char **argv)
 	TimeLineID	source_tli;
 	TimeLineID	target_tli;
 	XLogRecPtr	target_wal_endrec;
+	XLogSegNo	last_common_segno;
 	size_t		size;
 	char	   *buffer;
 	bool		no_ensure_shutdown = false;
@@ -397,6 +398,12 @@ main(int argc, char **argv)
 					LSN_FORMAT_ARGS(divergerec),
 					targetHistory[lastcommontliIndex].tli);
 
+		/*
+		 * Convert the divergence LSN to a segment number, that will be used
+		 * to decide how WAL segments should be processed.
+		 */
+		XLByteToSeg(divergerec, last_common_segno, ControlFile_target.xlog_seg_size);
+
 		/*
 		 * Don't need the source history anymore. The target history is still
 		 * needed by the routines in parsexlog.c, when we read the target WAL.
@@ -492,7 +499,7 @@ main(int argc, char **argv)
 	 * We have collected all information we need from both systems. Decide
 	 * what to do with each file.
 	 */
-	filemap = decide_file_actions();
+	filemap = decide_file_actions(last_common_segno);
 	if (showprogress)
 		calculate_totals(filemap);
 
diff --git a/src/bin/pg_rewind/t/011_wal_copy.pl b/src/bin/pg_rewind/t/011_wal_copy.pl
new file mode 100644
index 000000000000..b9e24844654d
--- /dev/null
+++ b/src/bin/pg_rewind/t/011_wal_copy.pl
@@ -0,0 +1,141 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# Check how the copy of WAL segments is handled from the source to
+# the target server.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Utils;
+use Test::More;
+use File::stat qw(stat);
+
+use FindBin;
+use lib $FindBin::RealBin;
+use RewindTest;
+use Time::HiRes qw(usleep);
+
+RewindTest::setup_cluster();
+RewindTest::start_primary();
+RewindTest::create_standby();
+
+# Advance WAL on primary
+RewindTest::primary_psql("CREATE TABLE t(a int)");
+RewindTest::primary_psql("INSERT INTO t VALUES(0)");
+
+# Segment that is not copied from the source to the target, being
+# generated before the servers have diverged.
+my $wal_seg_skipped = $node_primary->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+
+RewindTest::primary_psql("SELECT pg_switch_wal()");
+
+# Follow-up segment, that will include corrupted contents, and should be
+# copied from the source to the target even if generated before the point
+# of divergence.
+RewindTest::primary_psql("INSERT INTO t VALUES(0)");
+my $corrupt_wal_seg = $node_primary->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+RewindTest::primary_psql("SELECT pg_switch_wal()");
+
+RewindTest::primary_psql("CHECKPOINT");
+RewindTest::promote_standby;
+
+# New segment on a new timeline, expected to be copied.
+my $new_timeline_wal_seg = $node_standby->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+
+# Get some stats info for the WAL files whose copies should be skipped.
+my $wal_skipped_path =
+  $node_primary->data_dir . '/pg_wal/' . $wal_seg_skipped;
+my $wal_skipped_stat = stat($wal_skipped_path);
+defined($wal_skipped_stat) or die("unable to stat $wal_skipped_path");
+
+# Store modification time for later comparison
+my $wal_seg_skipped_mod_time = $wal_skipped_stat->mtime;
+
+# Corrupt a WAL segment on target that has been generated before the
+# divergence point.  We will check that it is copied over from source.
+my $corrupt_wal_seg_in_target_path =
+  $node_primary->data_dir . '/pg_wal/' . $corrupt_wal_seg;
+open my $fh, ">>", $corrupt_wal_seg_in_target_path
+  or die "could not open $corrupt_wal_seg_in_target_path";
+
+print $fh 'a';
+close $fh;
+
+my $corrupt_wal_seg_stat_before_rewind =
+  stat($corrupt_wal_seg_in_target_path);
+ok(defined($corrupt_wal_seg_stat_before_rewind),
+	"WAL segment $corrupt_wal_seg should exist in target before rewind");
+
+# Verify that the WAL segment on the new timeline does not exist in target
+# before the rewind.
+my $new_timeline_wal_seg_path =
+  $node_primary->data_dir . '/pg_wal/' . $new_timeline_wal_seg;
+my $new_timeline_wal_seg_stat = stat($new_timeline_wal_seg_path);
+ok(!defined($new_timeline_wal_seg_stat),
+	"WAL segment $new_timeline_wal_seg should not exist in target before rewind"
+);
+
+$node_standby->stop();
+$node_primary->stop();
+
+# Sleep to allow mtime to be different
+usleep(1000000);
+
+command_checks_all(
+	[
+		'pg_rewind', '--debug',
+		'--source-pgdata' => $node_standby->data_dir,
+		'--target-pgdata' => $node_primary->data_dir,
+		'--no-sync',
+	],
+	0,
+	[qr//],
+	[
+		qr/WAL segment \"$wal_seg_skipped\" not copied to target/,
+		qr/pg_wal\/$corrupt_wal_seg \(COPY\)/
+	],
+	'run pg_rewind');
+
+# Verify that the copied WAL segment now exists in target.
+$new_timeline_wal_seg_stat = stat($new_timeline_wal_seg_path);
+ok(defined($new_timeline_wal_seg_stat),
+	"WAL segment $new_timeline_wal_seg should exist in target after rewind");
+
+# Get current modification time of the skipped WAL segment.
+my $wal_skipped_stat_after_rewind = stat($wal_skipped_path);
+defined($wal_skipped_stat_after_rewind)
+  or die("unable to stat $wal_skipped_path after rewind");
+my $wal_seg_latest_skipped_mod_time = $wal_skipped_stat_after_rewind->mtime;
+
+# Validate that modification time hasn't changed.
+is($wal_seg_latest_skipped_mod_time, $wal_seg_skipped_mod_time,
+	"WAL segment $wal_seg_skipped modification time should be unchanged (not overwritten)"
+);
+
+# Validate that the WAL segment with the same file name as the
+# corrupted WAL segment in target has been copied from source
+# where we have it intact.
+my $corrupt_wal_seg_in_source_path =
+  $node_standby->data_dir . '/pg_wal/' . $corrupt_wal_seg;
+my $corrupt_wal_seg_source_stat = stat($corrupt_wal_seg_in_source_path);
+ok(defined($corrupt_wal_seg_source_stat),
+	"WAL segment $corrupt_wal_seg should exist in source after rewind");
+my $corrupt_wal_seg_stat_after_rewind = stat($corrupt_wal_seg_in_target_path);
+ok(defined($corrupt_wal_seg_stat_after_rewind),
+	"WAL segment $corrupt_wal_seg should exist in target after rewind");
+ok( $corrupt_wal_seg_stat_before_rewind->size !=
+	  $corrupt_wal_seg_source_stat->size,
+	"Expected WAL segment $corrupt_wal_seg to have different size in source vs target before rewind"
+);
+ok( $corrupt_wal_seg_stat_after_rewind->mtime >
+	  $corrupt_wal_seg_stat_before_rewind->mtime,
+	"Expected WAL segment $corrupt_wal_seg to have later mtime on target after rewind than before rewind as it was copied"
+);
+ok( $corrupt_wal_seg_stat_after_rewind->size ==
+	  $corrupt_wal_seg_source_stat->size,
+	"Expected WAL segment $corrupt_wal_seg file sizes to be same between target and source after rewind as it was copied"
+);
+
+done_testing();
diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml
index 5485033ed8c7..440b2c79f5f0 100644
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -52,12 +52,32 @@ PostgreSQL documentation
    analogous to a base backup of the source data directory. Unlike taking
    a new base backup or using a tool like <application>rsync</application>,
    <application>pg_rewind</application> does not require comparing or copying
-   unchanged relation blocks in the cluster. Only changed blocks from existing
-   relation files are copied; all other files, including new relation files,
-   configuration files, and WAL segments, are copied in full. As such the
-   rewind operation is significantly faster than other approaches when the
-   database is large and only a small fraction of blocks differ between the
-   clusters.
+   unchanged relation blocks in the cluster.
+  </para>
+  <orderedlist>
+   <listitem>
+    <para>
+     Only changed blocks from existing relation files are copied.
+    </para>
+   </listitem>
+   <listitem>
+    <para>
+     WAL segments prior to the point where the source and target servers
+     have diverged are not copied. WAL segments generated after the source
+     and target servers have diverged are copied in full.
+    </para>
+   </listitem>
+   <listitem>
+    <para>
+     All other files, including new relation files and configuration files,
+     are copied in full.
+    </para>
+   </listitem>
+  </orderedlist>
+  <para>
+   As such, the rewind operation is significantly faster than other
+   approaches when the database is large and only a small fraction of blocks
+   differ between the clusters.
   </para>
 
   <para>
-- 
2.51.0

