From b02301cf8d58413bcb9e719cd364963b9e98e2a8 Mon Sep 17 00:00:00 2001 From: Zsolt Parragi Date: Wed, 1 Apr 2026 08:03:47 +0000 Subject: [PATCH 2/2] pg_rewind: fix remote source WAL race condition When using a remote source, pg_rewind previously called pg_current_wal_insert_lsn() after copying all files to determine minRecoveryPoint. This had two problems: 1. The insert LSN includes WAL in shared memory that hasn't been written to disk, but pg_read_binary_file() only reads from disk. 2. The file list was captured via SQL earlier, so WAL segments created between the traversal and the LSN query were missing. Both issues could result in minRecoveryPoint being set to a WAL position beyond the available WAL files. PostgreSQL would then silently start with incomplete recovery, causing data loss. Fix by capturing pg_current_wal_flush_lsn() before the file traversal. Since all WAL up to the flush LSN is already on disk at capture time, the subsequent traversal is guaranteed to see all WAL segments up to that point, and the normal file copy transfers them. Any WAL generated after the capture point will be obtained via streaming replication or restore_command during recovery. 
--- src/bin/pg_rewind/libpq_source.c | 12 +- src/bin/pg_rewind/local_source.c | 2 +- src/bin/pg_rewind/meson.build | 1 + src/bin/pg_rewind/pg_rewind.c | 20 ++- src/bin/pg_rewind/rewind_source.h | 4 +- src/bin/pg_rewind/t/012_remote_wal_race.pl | 180 +++++++++++++++++++++ 6 files changed, 207 insertions(+), 12 deletions(-) create mode 100644 src/bin/pg_rewind/t/012_remote_wal_race.pl diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c index 6955bc575ea..97435427803 100644 --- a/src/bin/pg_rewind/libpq_source.c +++ b/src/bin/pg_rewind/libpq_source.c @@ -68,7 +68,7 @@ static void libpq_queue_fetch_range(rewind_source *source, const char *path, static void libpq_finish_fetch(rewind_source *source); static char *libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize); -static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source); +static XLogRecPtr libpq_get_current_wal_flush_lsn(rewind_source *source); static void libpq_destroy(rewind_source *source); /* @@ -91,7 +91,7 @@ init_libpq_source(PGconn *conn) src->common.queue_fetch_file = libpq_queue_fetch_file; src->common.queue_fetch_range = libpq_queue_fetch_range; src->common.finish_fetch = libpq_finish_fetch; - src->common.get_current_wal_insert_lsn = libpq_get_current_wal_insert_lsn; + src->common.get_current_wal_flush_lsn = libpq_get_current_wal_flush_lsn; src->common.destroy = libpq_destroy; src->conn = conn; @@ -202,10 +202,10 @@ run_simple_command(PGconn *conn, const char *sql) } /* - * Call the pg_current_wal_insert_lsn() function in the remote system. + * Call the pg_current_wal_flush_lsn() function in the remote system. 
*/ static XLogRecPtr -libpq_get_current_wal_insert_lsn(rewind_source *source) +libpq_get_current_wal_flush_lsn(rewind_source *source) { PGconn *conn = ((libpq_source *) source)->conn; XLogRecPtr result; @@ -213,10 +213,10 @@ libpq_get_current_wal_insert_lsn(rewind_source *source) uint32 lo; char *val; - val = run_simple_query(conn, "SELECT pg_current_wal_insert_lsn()"); + val = run_simple_query(conn, "SELECT pg_current_wal_flush_lsn()"); if (sscanf(val, "%X/%08X", &hi, &lo) != 2) - pg_fatal("unrecognized result \"%s\" for current WAL insert location", val); + pg_fatal("unrecognized result \"%s\" for current WAL flush location", val); result = ((uint64) hi) << 32 | lo; diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c index 4841cf01fb7..029f13a7396 100644 --- a/src/bin/pg_rewind/local_source.c +++ b/src/bin/pg_rewind/local_source.c @@ -46,7 +46,7 @@ init_local_source(const char *datadir) src->common.queue_fetch_file = local_queue_fetch_file; src->common.queue_fetch_range = local_queue_fetch_range; src->common.finish_fetch = local_finish_fetch; - src->common.get_current_wal_insert_lsn = NULL; + src->common.get_current_wal_flush_lsn = NULL; src->common.destroy = local_destroy; src->datadir = datadir; diff --git a/src/bin/pg_rewind/meson.build b/src/bin/pg_rewind/meson.build index 45c843c837e..7c612c4936b 100644 --- a/src/bin/pg_rewind/meson.build +++ b/src/bin/pg_rewind/meson.build @@ -46,6 +46,7 @@ tests += { 't/010_keep_recycled_wals.pl', 't/011_wal_copy.pl', 't/012_missing_source_wal.pl', + 't/012_remote_wal_race.pl', ], }, } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 9d745d4b25b..53643fbd9c0 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -87,6 +87,7 @@ uint64 fetch_done; static PGconn *conn; static rewind_source *source; +static XLogRecPtr source_wal_flush_lsn = InvalidXLogRecPtr; static void usage(const char *progname) @@ -476,6 +477,16 @@ main(int argc, char 
**argv) /* Initialize the hash table to track the status of each file */ filehash_init(); + /* + * When the source is a live server, capture the current WAL flush + * position before collecting the file list. This will be used as + * minRecoveryPoint. By capturing it first, the subsequent file + * traversal is guaranteed to see all WAL segments up to this point, + * so the normal file copy will include them. + */ + if (connstr_source && source->get_current_wal_flush_lsn) + source_wal_flush_lsn = source->get_current_wal_flush_lsn(source); + /* * Collect information about all files in the both data directories. */ @@ -711,13 +722,16 @@ perform_rewind(filemap_t *filemap, rewind_source *source, else { /* - * Source is a production, non-standby, server. We must replay to - * the last WAL insert location. + * Source is a production, non-standby, server. Use the WAL + * flush LSN captured before the file traversal. Because we + * captured it first, the traversal saw all WAL segments up to + * this point, and the normal file copy already transferred + * them. */ if (ControlFile_source_after.state != DB_IN_PRODUCTION) pg_fatal("source system was in unexpected state at end of rewind"); - endrec = source->get_current_wal_insert_lsn(source); + endrec = source_wal_flush_lsn; endtli = Max(ControlFile_source_after.checkPointCopy.ThisTimeLineID, ControlFile_source_after.minRecoveryPointTLI); } diff --git a/src/bin/pg_rewind/rewind_source.h b/src/bin/pg_rewind/rewind_source.h index f2e9b84a732..093c2ca90d0 100644 --- a/src/bin/pg_rewind/rewind_source.h +++ b/src/bin/pg_rewind/rewind_source.h @@ -66,9 +66,9 @@ typedef struct rewind_source void (*finish_fetch) (struct rewind_source *); /* - * Get the current WAL insert position in the source system. + * Get the current WAL flush position in the source system. */ - XLogRecPtr (*get_current_wal_insert_lsn) (struct rewind_source *); + XLogRecPtr (*get_current_wal_flush_lsn) (struct rewind_source *); /* * Free this rewind_source object. 
diff --git a/src/bin/pg_rewind/t/012_remote_wal_race.pl b/src/bin/pg_rewind/t/012_remote_wal_race.pl new file mode 100644 index 00000000000..90935924d57 --- /dev/null +++ b/src/bin/pg_rewind/t/012_remote_wal_race.pl @@ -0,0 +1,180 @@ +# Copyright (c) 2025-2026, PostgreSQL Global Development Group +# +# Test for a race condition in pg_rewind --source-server mode: +# If the source server generates WAL between the file list snapshot +# and the pg_current_wal_insert_lsn() call, the target can end up +# with a minRecoveryPoint beyond the copied WAL, causing silent +# incomplete recovery and data loss. +# +# Strategy: launch pg_rewind and heavy WAL generation concurrently, +# then verify that the target's WAL actually covers minRecoveryPoint +# by running pg_waldump up to that LSN. This is a deterministic +# check that does not depend on whether the server can start. + +use strict; +use warnings FATAL => 'all'; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; +use File::Copy qw(copy); + +my $tmp_folder = PostgreSQL::Test::Utils::tempdir; + +# Set up primary +my $node_primary = PostgreSQL::Test::Cluster->new('primary'); +$node_primary->init(allows_streaming => 1); +$node_primary->append_conf( + 'postgresql.conf', qq( +wal_keep_size = 320MB +wal_log_hints = on +)); +$node_primary->start; +$node_primary->safe_psql('postgres', 'CREATE DATABASE testdb'); + +# Create test data on primary +$node_primary->safe_psql('testdb', + "CREATE TABLE t1 (d text)"); +$node_primary->safe_psql('testdb', + "INSERT INTO t1 VALUES ('before promotion')"); +$node_primary->safe_psql('testdb', "CHECKPOINT"); + +# Set up standby +$node_primary->backup('my_backup'); +my $node_standby = PostgreSQL::Test::Cluster->new('standby'); +$node_standby->init_from_backup($node_primary, 'my_backup', + has_streaming => 1); +$node_standby->start; +$node_primary->wait_for_catchup($node_standby, 'write'); + +# Promote standby +$node_standby->promote; 
+$node_standby->poll_query_until('testdb', + "SELECT NOT pg_is_in_recovery()"); + +# Pre-generate a large dataset on the promoted standby. +# This makes pg_rewind's file copy take significant time, widening +# the window for the race. +$node_standby->safe_psql('testdb', qq{ + CREATE TABLE bulk (id serial, data text); + INSERT INTO bulk (data) SELECT repeat('x', 200) + FROM generate_series(1, 500000); + CREATE TABLE counter (id serial, batch int); + INSERT INTO counter (batch) SELECT 0 FROM generate_series(1, 1000); + CHECKPOINT; +}); + +# Generate divergent WAL on old primary, then stop it +$node_primary->safe_psql('testdb', + "INSERT INTO t1 VALUES ('after promotion on old primary')"); +$node_primary->stop('fast'); + +# Save primary's postgresql.conf +my $primary_pgdata = $node_primary->data_dir; +copy("$primary_pgdata/postgresql.conf", + "$tmp_folder/primary-postgresql.conf.tmp") + or die "copy postgresql.conf: $!"; + +# Launch pg_rewind in the background so we can start WAL generation +# concurrently. +my ($rewind_out, $rewind_err); +my $rewind_handle = IPC::Run::start( + [ + 'pg_rewind', '--debug', + '--source-server' => $node_standby->connstr('testdb'), + '--target-pgdata' => $primary_pgdata, + '--no-sync', + '--config-file' => "$tmp_folder/primary-postgresql.conf.tmp", + ], + '<', \undef, '>', \$rewind_out, '2>', \$rewind_err); + +# Immediately start background WAL generation +my @bg_handles; +for my $worker (1 .. 
8) +{ + my ($out, $err); + my $h = IPC::Run::start( + [ + 'psql', '-X', '-q', + '-d', $node_standby->connstr('testdb'), + '-c', qq{ +DO \$\$ +BEGIN + FOR i IN 1..200 LOOP + INSERT INTO counter (batch) + SELECT ${worker} * 10000 + i FROM generate_series(1, 2000); + COMMIT; + END LOOP; +END \$\$; +} + ], + '<', \undef, '>', \$out, '2>', \$err); + push @bg_handles, $h; +} + +# Wait for pg_rewind to complete +IPC::Run::finish($rewind_handle); +my $rewind_exit = $?; +ok($rewind_exit == 0, 'pg_rewind with concurrent WAL generation succeeds'); +if ($rewind_exit != 0) +{ + diag "pg_rewind stderr: $rewind_err"; +} + +# Wait for all background workers to finish +for my $h (@bg_handles) +{ + IPC::Run::finish($h); +} + +# Get minRecoveryPoint and divergence LSN for verification +my $controldata = `pg_controldata '$primary_pgdata'`; +my ($min_recovery_point) = + ($controldata =~ /Minimum recovery ending location:\s+(\S+)/); +note "target minRecoveryPoint: $min_recovery_point"; + +my $history_content = + slurp_file("$primary_pgdata/pg_wal/00000002.history"); +my ($diverge_lsn) = ($history_content =~ /^1\t([0-9A-F\/]+)\t/m); +die "could not parse divergence LSN from timeline history" + unless $diverge_lsn; +note "divergence LSN: $diverge_lsn"; + +# Diagnostics +my @target_wal = sort grep { /^00000002/ && !/history/ } + map { s/.*\///r } glob("$primary_pgdata/pg_wal/00000002*"); +note "target TL2 WAL segments: @target_wal"; + +# verify that the WAL on the target actually reaches +# minRecoveryPoint. Without the fix, pg_rewind sets +# minRecoveryPoint to a position beyond the available WAL, so +# pg_waldump will fail trying to read up to that point. 
+command_ok( + [ + 'pg_waldump', '--quiet', + '--path' => "$primary_pgdata/pg_wal", + '--timeline' => '2', + '--start' => $diverge_lsn, + '--end' => $min_recovery_point, + ], + 'WAL on target covers minRecoveryPoint'); + +# Also verify the server can start and has consistent data +copy("$tmp_folder/primary-postgresql.conf.tmp", + "$primary_pgdata/postgresql.conf") + or die "restore postgresql.conf: $!"; + +$node_primary->start; + +my $t1_value = $node_primary->safe_psql('testdb', 'SELECT d FROM t1'); +is($t1_value, 'before promotion', + 't1 has correct data after rewind'); + +my $batch0 = $node_primary->safe_psql('testdb', + 'SELECT count(*) FROM counter WHERE batch = 0'); +is($batch0, '1000', + 'pre-rewind data is present after rewind'); + +$node_primary->teardown_node; +$node_standby->teardown_node; + +done_testing(); -- 2.43.0