From 2697ce6f8f2267a844e80e5ce7c253489b1f450d Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Fri, 24 Oct 2025 17:35:31 +0900
Subject: [PATCH v13] Avoid copying WAL segments before divergence to speed up
 pg_rewind

Add a check to pg_rewind that avoids copying WAL segment files from the
source to the target when they are common to both servers and located
before the point of WAL divergence.

All WAL files that exist on both the source and the target, and that
fall before the segment containing the first diverged LSN, can safely
be skipped when copying to the target, as they have been replicated
from the original primary.  As a safety measure, a segment is still
copied if its size differs between the source and the target.

Author: John Hsu <johnhyvr@gmail.com>
Co-authored-by: Justin Kwan <justinpkwan@outlook.com>
---
 src/bin/pg_rewind/filemap.c         |  55 ++++++++++--
 src/bin/pg_rewind/filemap.h         |   3 +-
 src/bin/pg_rewind/meson.build       |   1 +
 src/bin/pg_rewind/pg_rewind.c       |   9 +-
 src/bin/pg_rewind/t/011_wal_copy.pl | 128 ++++++++++++++++++++++++++++
 doc/src/sgml/ref/pg_rewind.sgml     |  32 +++++--
 6 files changed, 215 insertions(+), 13 deletions(-)
 create mode 100644 src/bin/pg_rewind/t/011_wal_copy.pl

diff --git a/src/bin/pg_rewind/filemap.c b/src/bin/pg_rewind/filemap.c
index 00f5d60d6209..59672e669323 100644
--- a/src/bin/pg_rewind/filemap.c
+++ b/src/bin/pg_rewind/filemap.c
@@ -546,7 +546,9 @@ print_filemap(filemap_t *filemap)
 	for (i = 0; i < filemap->nentries; i++)
 	{
 		entry = filemap->entries[i];
+
 		if (entry->action != FILE_ACTION_NONE ||
+			entry->content_type == FILE_CONTENT_TYPE_WAL ||
 			entry->target_pages_to_overwrite.bitmapsize > 0)
 		{
 			pg_log_debug("%s (%s)", entry->path,
@@ -706,11 +708,45 @@ final_filemap_cmp(const void *a, const void *b)
 		return strcmp(fa->path, fb->path);
 }
 
+/*
+ * Decide what to do with a WAL segment file, based on the position of its
+ * segment relative to the one containing the point of divergence.
+ *
+ * Returns FILE_ACTION_NONE if the copy can be skipped, FILE_ACTION_COPY
+ * otherwise.  Caller must ensure the file exists on source and target.
+ */
+static file_action_t
+decide_wal_file_action(const char *fname, XLogSegNo last_common_segno,
+					   size_t source_size, size_t target_size)
+{
+	TimeLineID	file_tli;
+	XLogSegNo	file_segno;
+
+	/* Extract the segment number (and timeline, unused) from the file name */
+	XLogFromFileName(fname, &file_tli, &file_segno, WalSegSz);
+
+	/*
+	 * Avoid copying files before the last common segment.
+	 *
+	 * These files exist on the source and the target servers, so they should
+	 * be identical and located strictly before the segment that contains the
+	 * LSN where target and source servers have diverged.
+	 *
+	 * As a cheap sanity check, also compare the sizes of both files, and
+	 * copy the file from the source if they do not match.
+	 */
+	if (file_segno < last_common_segno &&
+		source_size == target_size)
+		return FILE_ACTION_NONE;
+
+	return FILE_ACTION_COPY;
+}
+
 /*
  * Decide what action to perform to a file.
  */
 static file_action_t
-decide_file_action(file_entry_t *entry)
+decide_file_action(file_entry_t *entry, XLogSegNo last_common_segno)
 {
 	const char *path = entry->path;
 
@@ -814,8 +850,17 @@ decide_file_action(file_entry_t *entry)
 		case FILE_TYPE_REGULAR:
 			if (entry->content_type == FILE_CONTENT_TYPE_WAL)
 			{
-				/* It's a WAL file, copy it. */
-				return FILE_ACTION_COPY;
+				/* Handle WAL segment file */
+				const char *filename = last_dir_separator(entry->path);
+
+				if (filename == NULL)
+					filename = entry->path;
+				else
+					filename++; /* Skip the separator */
+
+				return decide_wal_file_action(filename, last_common_segno,
+											  entry->source_size,
+											  entry->target_size);
 			}
 			else if (entry->content_type != FILE_CONTENT_TYPE_RELATION)
 			{
@@ -876,7 +921,7 @@ decide_file_action(file_entry_t *entry)
  * should be executed.
  */
 filemap_t *
-decide_file_actions(void)
+decide_file_actions(XLogSegNo last_common_segno)
 {
 	int			i;
 	filehash_iterator it;
@@ -886,7 +931,7 @@ decide_file_actions(void)
 	filehash_start_iterate(filehash, &it);
 	while ((entry = filehash_iterate(filehash, &it)) != NULL)
 	{
-		entry->action = decide_file_action(entry);
+		entry->action = decide_file_action(entry, last_common_segno);
 	}
 
 	/*
diff --git a/src/bin/pg_rewind/filemap.h b/src/bin/pg_rewind/filemap.h
index fada420fc230..5145f0b4c46c 100644
--- a/src/bin/pg_rewind/filemap.h
+++ b/src/bin/pg_rewind/filemap.h
@@ -11,6 +11,7 @@
 #include "datapagemap.h"
 #include "storage/block.h"
 #include "storage/relfilelocator.h"
+#include "access/xlogdefs.h"
 
 /* these enum values are sorted in the order we want actions to be processed */
 typedef enum
@@ -113,7 +114,7 @@ extern void process_target_wal_block_change(ForkNumber forknum,
 											RelFileLocator rlocator,
 											BlockNumber blkno);
 
-extern filemap_t *decide_file_actions(void);
+extern filemap_t *decide_file_actions(XLogSegNo last_common_segno);
 extern void calculate_totals(filemap_t *filemap);
 extern void print_filemap(filemap_t *filemap);
 
diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build
index 36171600ccaf..97f001d94a50 100644
--- a/src/bin/pg_rewind/meson.build
+++ b/src/bin/pg_rewind/meson.build
@@ -44,6 +44,7 @@ tests += {
       't/008_min_recovery_point.pl',
       't/009_growing_files.pl',
       't/010_keep_recycled_wals.pl',
+      't/011_wal_copy.pl',
     ],
   },
 }
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 0c68dd4235e6..1b953692b176 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -147,6 +147,7 @@ main(int argc, char **argv)
 	TimeLineID	source_tli;
 	TimeLineID	target_tli;
 	XLogRecPtr	target_wal_endrec;
+	XLogSegNo	last_common_segno;
 	size_t		size;
 	char	   *buffer;
 	bool		no_ensure_shutdown = false;
@@ -397,6 +398,12 @@ main(int argc, char **argv)
 					LSN_FORMAT_ARGS(divergerec),
 					targetHistory[lastcommontliIndex].tli);
 
+		/*
+		 * Convert the divergence LSN to a segment number, that will be used
+		 * to decide how WAL segments should be processed.
+		 */
+		XLByteToSeg(divergerec, last_common_segno, ControlFile_target.xlog_seg_size);
+
 		/*
 		 * Don't need the source history anymore. The target history is still
 		 * needed by the routines in parsexlog.c, when we read the target WAL.
@@ -492,7 +499,7 @@ main(int argc, char **argv)
 	 * We have collected all information we need from both systems. Decide
 	 * what to do with each file.
 	 */
-	filemap = decide_file_actions();
+	filemap = decide_file_actions(last_common_segno);
 	if (showprogress)
 		calculate_totals(filemap);
 
diff --git a/src/bin/pg_rewind/t/011_wal_copy.pl b/src/bin/pg_rewind/t/011_wal_copy.pl
new file mode 100644
index 000000000000..fb5b7104378e
--- /dev/null
+++ b/src/bin/pg_rewind/t/011_wal_copy.pl
@@ -0,0 +1,128 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+#
+# Check how the copy of WAL segments is handled from the source to
+# the target server.
+
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Utils;
+use Test::More;
+use File::stat qw(stat);
+
+use FindBin;
+use lib $FindBin::RealBin;
+use RewindTest;
+
+RewindTest::setup_cluster();
+RewindTest::start_primary();
+RewindTest::create_standby();
+
+# Advance WAL on primary
+RewindTest::primary_psql("CREATE TABLE t(a int)");
+RewindTest::primary_psql("INSERT INTO t VALUES(0)");
+
+# Segment that is not copied from the source to the target, being
+# generated before the servers have diverged.
+my $wal_seg_skipped = $node_primary->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+
+RewindTest::primary_psql("SELECT pg_switch_wal()");
+
+# Follow-up segment, whose copy in the target is corrupted later in this
+# test.  It is copied from the source to the target even if generated
+# before the point of divergence, as its size differs between both servers.
+RewindTest::primary_psql("INSERT INTO t VALUES(0)");
+my $corrupt_wal_seg = $node_primary->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+RewindTest::primary_psql("SELECT pg_switch_wal()");
+
+RewindTest::primary_psql("CHECKPOINT");
+RewindTest::promote_standby;
+
+# New segment on a new timeline, expected to be copied.
+my $new_timeline_wal_seg = $node_standby->safe_psql('postgres',
+	'SELECT pg_walfile_name(pg_current_wal_lsn())');
+
+# Check that the segment whose copy is skipped exists in target before rewind.
+my $wal_skipped_path =
+  $node_primary->data_dir . '/pg_wal/' . $wal_seg_skipped;
+my $wal_skipped_stat = stat($wal_skipped_path);
+defined($wal_skipped_stat) or die("unable to stat $wal_skipped_path");
+
+# Corrupt, by appending one byte, a WAL segment on target generated before
+# the divergence point.  We will check that it is copied from the source.
+my $corrupt_wal_seg_in_target_path =
+  $node_primary->data_dir . '/pg_wal/' . $corrupt_wal_seg;
+open my $fh, ">>", $corrupt_wal_seg_in_target_path
+  or die "could not open $corrupt_wal_seg_in_target_path";
+
+print $fh 'a';
+close $fh;
+
+my $corrupt_wal_seg_stat_before_rewind =
+  stat($corrupt_wal_seg_in_target_path);
+ok(defined($corrupt_wal_seg_stat_before_rewind),
+	"segment $corrupt_wal_seg exists in target before rewind");
+
+# Verify that the WAL segment on the new timeline does not exist in target
+# before the rewind.
+my $new_timeline_wal_seg_path =
+  $node_primary->data_dir . '/pg_wal/' . $new_timeline_wal_seg;
+my $new_timeline_wal_seg_stat = stat($new_timeline_wal_seg_path);
+ok(!defined($new_timeline_wal_seg_stat),
+	"segment $new_timeline_wal_seg does not exist in target before rewind"
+);
+
+$node_standby->stop();
+$node_primary->stop();
+
+# Cross-check how WAL segments are handled:
+# - The "corrupted" segment generated before the point of divergence is
+#   copied.
+# - The "clean" segment generated before the point of divergence is skipped.
+# - The segment of the new timeline is copied.
+command_checks_all(
+	[
+		'pg_rewind', '--debug',
+		'--source-pgdata' => $node_standby->data_dir,
+		'--target-pgdata' => $node_primary->data_dir,
+		'--no-sync',
+	],
+	0,
+	[qr//],
+	[
+		qr/pg_wal\/$wal_seg_skipped \(NONE\)/,
+		qr/pg_wal\/$corrupt_wal_seg \(COPY\)/,
+		qr/pg_wal\/$new_timeline_wal_seg \(COPY\)/,
+	],
+	'run pg_rewind');
+
+# Verify that the first WAL segment of the new timeline now exists in
+# target.
+$new_timeline_wal_seg_stat = stat($new_timeline_wal_seg_path);
+ok(defined($new_timeline_wal_seg_stat),
+	"new timeline segment $new_timeline_wal_seg exists in target after rewind"
+);
+
+# Validate that the WAL segment with the same file name as the
+# corrupted WAL segment in target has been copied from source
+# where it was still intact.
+my $corrupt_wal_seg_in_source_path =
+  $node_standby->data_dir . '/pg_wal/' . $corrupt_wal_seg;
+my $corrupt_wal_seg_source_stat = stat($corrupt_wal_seg_in_source_path);
+ok(defined($corrupt_wal_seg_source_stat),
+	"corrupted $corrupt_wal_seg exists in source after rewind");
+
+my $corrupt_wal_seg_stat_after_rewind = stat($corrupt_wal_seg_in_target_path);
+ok(defined($corrupt_wal_seg_stat_after_rewind),
+	"corrupted $corrupt_wal_seg exists in target after rewind");
+isnt( $corrupt_wal_seg_stat_before_rewind->size,
+	  $corrupt_wal_seg_source_stat->size,
+	"different size of corrupted $corrupt_wal_seg in source vs target before rewind"
+);
+is( $corrupt_wal_seg_stat_after_rewind->size,
+	  $corrupt_wal_seg_source_stat->size,
+	"same size of corrupted $corrupt_wal_seg in source and target after rewind"
+);
+
+done_testing();
diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml
index 5485033ed8c7..928e78cda33a 100644
--- a/doc/src/sgml/ref/pg_rewind.sgml
+++ b/doc/src/sgml/ref/pg_rewind.sgml
@@ -52,12 +52,32 @@ PostgreSQL documentation
    analogous to a base backup of the source data directory. Unlike taking
    a new base backup or using a tool like <application>rsync</application>,
    <application>pg_rewind</application> does not require comparing or copying
-   unchanged relation blocks in the cluster. Only changed blocks from existing
-   relation files are copied; all other files, including new relation files,
-   configuration files, and WAL segments, are copied in full. As such the
-   rewind operation is significantly faster than other approaches when the
-   database is large and only a small fraction of blocks differ between the
-   clusters.
+   unchanged relation blocks in the cluster:
+  </para>
+  <itemizedlist>
+   <listitem>
+    <para>
+     Only changed blocks from existing relation files are copied.
+    </para>
+   </listitem>
+   <listitem>
+    <para>
+     WAL segments that exist on both servers and precede the segment
+     containing the point where the source and target servers have
+     diverged are not copied. All other WAL segments are copied in full.
+    </para>
+   </listitem>
+   <listitem>
+    <para>
+     All other files, including new relation files and configuration files,
+     are copied in full.
+    </para>
+   </listitem>
+  </itemizedlist>
+  <para>
+   As such, the rewind operation is significantly faster than other
+   approaches when the database is large and only a small fraction of blocks
+   differ between the clusters.
   </para>
 
   <para>
-- 
2.51.0

