From 2cf5edb2639104ccc42333e7546c89076309c40f Mon Sep 17 00:00:00 2001
From: Hayato Kuroda
Date: Mon, 8 Jun 2026 12:42:12 +0900
Subject: [PATCH] WIP: try reproducing the race condition for promotion

---
 src/backend/replication/walsender.c |   6 ++
 src/test/recovery/meson.build       |   1 +
 src/test/recovery/t/099_repro.pl    | 135 ++++++++++++++++++++++++++++
 3 files changed, 142 insertions(+)
 create mode 100644 src/test/recovery/t/099_repro.pl

diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index 04aa770d981..f764007052d 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -94,6 +94,7 @@
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
+#include "utils/injection_point.h"
 #include "utils/lsyscache.h"
 #include "utils/memutils.h"
 #include "utils/pg_lsn.h"
@@ -1103,11 +1104,16 @@ logical_read_xlog_page(XLogReaderState *state, XLogRecPtr targetPagePtr, int req
 	 */
 	am_cascading_walsender = RecoveryInProgress();
 
+	INJECTION_POINT("logical-read-xlog-page-before-tli", NULL);
+
 	if (am_cascading_walsender)
 		GetXLogReplayRecPtr(&currTLI);
 	else
 		currTLI = GetWALInsertionTimeLine();
 
+	elog(LOG, "XXX am_cascading_walsender: %d, currTLI: %u",
+		 am_cascading_walsender, currTLI);
+
 	XLogReadDetermineTimeline(state, targetPagePtr, reqLen, currTLI);
 	sendTimeLineIsHistoric = (state->currTLI != currTLI);
 	sendTimeLine = state->currTLI;
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 9eb8ed11425..bfd06a06124 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -62,6 +62,7 @@ tests += {
       't/051_effective_wal_level.pl',
       't/052_checkpoint_segment_missing.pl',
       't/053_standby_login_event_trigger.pl',
+      't/099_repro.pl',
     ],
   },
 }
diff --git a/src/test/recovery/t/099_repro.pl b/src/test/recovery/t/099_repro.pl
new file mode 100644
index 00000000000..909141c0773
--- /dev/null
+++ b/src/test/recovery/t/099_repro.pl
@@ -0,0 +1,135 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Reproducer for the wrong-timeline race in logical_read_xlog_page().
+#
+# A walsender doing logical decoding on a standby is paused, via the
+# 'logical-read-xlog-page-before-tli' injection point, after it has called
+# RecoveryInProgress() but before it looks up the current timeline.  The
+# standby is promoted while the walsender is paused; the walsender is then
+# released and pg_recvlogical must still receive the post-promotion commit.
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# The variable may be unset when this script is run by hand; default it to ''
+# so that the FATAL warnings above do not abort the run before the skip.
+if (($ENV{enable_injection_points} // '') ne 'yes')
+{
+	plan skip_all => 'Injection points not supported by this build';
+}
+
+my ($stdout, $stderr, $handle);
+
+my $node_primary = PostgreSQL::Test::Cluster->new('primary');
+my $node_standby = PostgreSQL::Test::Cluster->new('standby');
+my $default_timeout = $PostgreSQL::Test::Utils::timeout_default;
+
+# Name for the physical slot on primary
+my $primary_slotname = 'primary_physical';
+
+########################
+# Initialize primary node
+########################
+
+$node_primary->init(allows_streaming => 1, has_archiving => 1);
+$node_primary->append_conf(
+	'postgresql.conf', q{
+wal_level = 'logical'
+max_replication_slots = 4
+max_wal_senders = 4
+autovacuum = off
+});
+$node_primary->dump_info;
+$node_primary->start;
+
+# Check if the extension injection_points is available, as it may be
+# possible that this script is run with installcheck, where the module
+# would not be installed by default.
+if (!$node_primary->check_extension('injection_points'))
+{
+	plan skip_all => 'Extension injection_points not installed';
+}
+
+# Create the injection_points extension
+$node_primary->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
+
+$node_primary->safe_psql('postgres',
+	qq[SELECT * FROM pg_create_physical_replication_slot('$primary_slotname');]
+);
+
+$node_primary->safe_psql('postgres', "CREATE TABLE foo (id int)");
+
+my $backup_name = 'b1';
+$node_primary->backup($backup_name);
+
+#######################
+# Initialize standby node
+#######################
+
+$node_standby->init_from_backup(
+	$node_primary, $backup_name,
+	has_streaming => 1,
+	has_restoring => 1);
+$node_standby->append_conf(
+	'postgresql.conf',
+	qq[primary_slot_name = '$primary_slotname'
+       max_replication_slots = 5]);
+$node_standby->start;
+$node_primary->wait_for_replay_catchup($node_standby);
+
+# create logical slot
+$node_standby->create_logical_slot_on_standby($node_primary, 'testslot',
+	'postgres');
+
+# Start continuous logical decoding on the standby
+$handle = IPC::Run::start(
+	[
+		'pg_recvlogical',
+		'--dbname' => $node_standby->connstr('postgres'),
+		'--slot' => 'testslot',
+		'--option' => 'include-xids=0',
+		'--option' => 'skip-empty-xacts=1',
+		'--file' => '-',
+		'--no-loop',
+		'--start',
+	],
+	'>' => \$stdout,
+	'2>' => \$stderr,
+	IPC::Run::timeout($default_timeout));
+
+# Ensure that pg_recvlogical works before promotion
+$node_primary->safe_psql('postgres', "INSERT INTO foo SELECT generate_series(1, 10)");
+
+# Timer shared by both pump_until() calls below
+my $pump_timeout = IPC::Run::timer($default_timeout);
+
+ok(pump_until($handle, $pump_timeout, \$stdout, qr/^.*COMMIT$/s),
+	'got 1 COMMIT from pg_recvlogical output');
+
+# Set an injection_point to make the walsender wait before reading the timeline
+# of the standby.
+$node_standby->safe_psql('postgres', "SELECT injection_points_attach('logical-read-xlog-page-before-tli', 'wait');");
+
+# Insert some data to make the walsender read the timeline
+$node_primary->safe_psql('postgres', "INSERT INTO foo SELECT generate_series(11, 20)");
+$node_standby->wait_for_event('walsender', 'logical-read-xlog-page-before-tli');
+
+# Now the standby can accept INSERTs
+$node_standby->promote;
+
+# Insert some data on the promoted standby
+$node_standby->safe_psql('postgres', "INSERT INTO foo SELECT generate_series(21, 30)");
+
+# Walsender now resumes decoding
+$node_standby->safe_psql('postgres', qq{SELECT injection_points_detach('logical-read-xlog-page-before-tli');
+SELECT injection_points_wakeup('logical-read-xlog-page-before-tli');});
+
+# Three transactions have been issued in total (rows 1-10, 11-20, 21-30), so wait for the third COMMIT to cover the post-promotion rows.
+ok(pump_until($handle, $pump_timeout, \$stdout, qr/^.*COMMIT.*COMMIT.*COMMIT$/s),
+	'got 3 COMMIT from pg_recvlogical output');
+
+done_testing();
-- 
2.52.0