From ca765ee0a49ac1005c5eb7ebd1d29bea97a63552 Mon Sep 17 00:00:00 2001
From: Antonin Houska <ah@cybertec.at>
Date: Mon, 19 Jan 2026 12:54:55 +0100
Subject: [PATCH 1/2] Demonstrate possible race conditions in logical decoding.

The problem is that the snapshot builder can create a snapshot before CLOG has
been updated. That breaks visibility check that use such snapshot. For more
details, see startup_race.spec.

Success of the test means that the problem is present. Thus it would need to
be modified if it should be merged into the tree.

Another problem that currently prevents this test from being merged is that it
hard-wires the logical decoding setup into the SET TRANSACTION command. I
tried to modify the isolation tester so it can use the logical replication
protocol (in which case the test could use the "CREATE_REPLICATION_SNAPSHOT
... (SNAPSHOT 'use')" command), but the tester does things that are not
compatible with that protocol (e.g. it sets the application_name parameter).
---
 .../test_decoding/expected/startup_race.out   |  85 ++++++++++++
 contrib/test_decoding/specs/startup_race.spec | 126 ++++++++++++++++++
 src/backend/access/transam/xact.c             |   6 +
 src/backend/replication/logical/snapbuild.c   |   3 +
 src/backend/utils/time/snapmgr.c              |  66 +++++++++
 src/include/utils/snapmgr.h                   |   1 +
 6 files changed, 287 insertions(+)
 create mode 100644 contrib/test_decoding/expected/startup_race.out
 create mode 100644 contrib/test_decoding/specs/startup_race.spec

diff --git a/contrib/test_decoding/expected/startup_race.out b/contrib/test_decoding/expected/startup_race.out
new file mode 100644
index 00000000000..597fa617831
--- /dev/null
+++ b/contrib/test_decoding/expected/startup_race.out
@@ -0,0 +1,85 @@
+Parsed test spec with 6 sessions
+
+starting permutation: s1_assign_xid s2_set_snapshot s3_assign_xid s1_rollback s3_rollback s4_do_changes s5_wake_up_full_snapshot s2_scan s2_rollback s5_wake_up_before_clog s6_check
+injection_points_attach
+-----------------------
+                       
+(1 row)
+
+injection_points_attach
+-----------------------
+                       
+(1 row)
+
+step s1_assign_xid: 
+	BEGIN;
+	CREATE TABLE b(i int);
+
+step s2_set_snapshot: 
+	BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;
+	SET TRANSACTION SNAPSHOT 'from_slot';
+ <waiting ...>
+step s3_assign_xid: 
+	BEGIN;
+	CREATE TABLE c(i int);
+
+step s1_rollback: 
+	ROLLBACK;
+
+step s3_rollback: 
+	ROLLBACK;
+
+step s4_do_changes: 
+	INSERT INTO a(i, j) VALUES (3, 3);
+	UPDATE a SET j = j + 1 WHERE i = 1;
+	DELETE FROM a WHERE i = 2;
+ <waiting ...>
+step s5_wake_up_full_snapshot: 
+	SELECT injection_points_wakeup('snapbuild-full-snapshot');
+
+injection_points_wakeup
+-----------------------
+                       
+(1 row)
+
+step s2_set_snapshot: <... completed>
+step s2_scan: 
+	TABLE a;
+
+i|j
+-+-
+1|1
+2|2
+(2 rows)
+
+step s2_rollback: 
+	ROLLBACK;
+
+step s5_wake_up_before_clog: 
+	SELECT injection_points_wakeup('before-clog-update');
+
+injection_points_wakeup
+-----------------------
+                       
+(1 row)
+
+step s4_do_changes: <... completed>
+step s6_check: 
+	SELECT * FROM a ORDER BY i;
+
+i|j
+-+-
+1|1
+2|2
+(2 rows)
+
+injection_points_detach
+-----------------------
+                       
+(1 row)
+
+injection_points_detach
+-----------------------
+                       
+(1 row)
+
diff --git a/contrib/test_decoding/specs/startup_race.spec b/contrib/test_decoding/specs/startup_race.spec
new file mode 100644
index 00000000000..8f67e07fa7a
--- /dev/null
+++ b/contrib/test_decoding/specs/startup_race.spec
@@ -0,0 +1,126 @@
+setup
+{
+	CREATE TABLE a(i int primary key, j int) WITH (autovacuum_enabled = off);
+	INSERT INTO a(i, j) VALUES (1, 1), (2, 2);
+	CREATE EXTENSION injection_points;
+}
+
+session s1
+step s1_assign_xid
+{
+	BEGIN;
+	CREATE TABLE b(i int);
+}
+step s1_rollback
+{
+	ROLLBACK;
+}
+
+session s2
+setup
+{
+	SELECT injection_points_set_local();
+	SELECT injection_points_attach('snapbuild-full-snapshot', 'wait');
+}
+# Use special, hard-wired snapshot name to set the initial snapshot from
+# logical replication slot.
+step s2_set_snapshot
+{
+	BEGIN READ ONLY ISOLATION LEVEL REPEATABLE READ;
+	SET TRANSACTION SNAPSHOT 'from_slot';
+}
+# Perform the scan.
+step s2_scan
+{
+	TABLE a;
+}
+step s2_rollback
+{
+	ROLLBACK;
+}
+teardown
+{
+	SELECT injection_points_detach('snapbuild-full-snapshot');
+}
+
+session s3
+step s3_assign_xid
+{
+	BEGIN;
+	CREATE TABLE c(i int);
+}
+step s3_rollback
+{
+	ROLLBACK;
+}
+
+session s4
+setup
+{
+	SELECT injection_points_set_local();
+	SELECT injection_points_attach('before-clog-update', 'wait');
+}
+step s4_do_changes
+{
+	INSERT INTO a(i, j) VALUES (3, 3);
+	UPDATE a SET j = j + 1 WHERE i = 1;
+	DELETE FROM a WHERE i = 2;
+}
+teardown
+{
+	SELECT injection_points_detach('before-clog-update');
+}
+
+session s5
+step s5_wake_up_full_snapshot
+{
+	SELECT injection_points_wakeup('snapbuild-full-snapshot');
+}
+step s5_wake_up_before_clog
+{
+	SELECT injection_points_wakeup('before-clog-update');
+}
+
+session s6
+step s6_check
+{
+	SELECT * FROM a ORDER BY i;
+}
+
+permutation
+# Let the snapshot builder go through all the states. The problematic case
+# happens in the FULL_SNAPSHOT.
+s1_assign_xid
+# This should leave the builder in BUILDING_SNAPSHOT, waiting for the active
+# transaction to end.
+s2_set_snapshot
+# Make sure that s1_rollback does not allow going to CONSISTENT directly.
+s3_assign_xid
+# Let the builder proceed to FULL_SNAPSHOT. It should stop at the injection
+# point 'snapbuild-full-snapshot'.
+s1_rollback
+# The transaction of s3 is not needed anymore, CONSISTENT should be the next
+# stage.
+s3_rollback
+# Perform data changes before the snapshot builder triggers creation of the
+# RUNNING_XACTS record. This will stop before setting transaction status in
+# CLOG.
+s4_do_changes
+# Unblock the injection point so that the snapshot can finally be created.
+s5_wake_up_full_snapshot
+# Use the snapshot for a scan. The snapshot will consider s4 not running
+# anymore, however CLOG is not aware of the commit yet. Thus
+# HeapTupleSatisfiesMVCC will consider the transaction aborted. In particular,
+# for UPDATE, if both xmax of the old version and xmin of the new version are
+# considered aborted, so the effects of the UPDATE are lost
+# altogether. Similarly, INSERT and DELETE have no effect because the xmin
+# transaction of the inserted tuple and xmax of the deleted tuple are
+# considered aborted
+s2_scan
+s2_rollback
+# CLOG can be updated now.
+s5_wake_up_before_clog
+# Scan the table again using a new transaction, with a normal transaction
+# snapshot. The results are still wrong due to hint bits set incorrectly.
+s6_check
+
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index c857e23552f..2fea45b2fed 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -65,6 +65,7 @@
 #include "utils/builtins.h"
 #include "utils/combocid.h"
 #include "utils/guc.h"
+#include "utils/injection_point.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
 #include "utils/relmapper.h"
@@ -1348,6 +1349,9 @@ RecordTransactionCommit(void)
 													 &RelcacheInitFileInval);
 	wrote_xlog = (XactLastRecEnd != 0);
 
+	/* Load the injection point before entering the critical section */
+	INJECTION_POINT_LOAD("before-clog-update");
+
 	/*
 	 * If we haven't been assigned an XID yet, we neither can, nor do we want
 	 * to write a COMMIT record.
@@ -1514,6 +1518,8 @@ RecordTransactionCommit(void)
 	{
 		XLogFlush(XactLastRecEnd);
 
+		INJECTION_POINT_CACHED("before-clog-update", NULL);
+
 		/*
 		 * Now we may update the CLOG, if we wrote a COMMIT record above
 		 */
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 7f79621b57e..9b09dc8eac1 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -141,6 +141,7 @@
 #include "storage/procarray.h"
 #include "storage/standby.h"
 #include "utils/builtins.h"
+#include "utils/injection_point.h"
 #include "utils/memutils.h"
 #include "utils/snapmgr.h"
 #include "utils/snapshot.h"
@@ -1387,6 +1388,8 @@ SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *runn
 				errdetail("Waiting for transactions (approximately %d) older than %u to end.",
 						  running->xcnt, running->nextXid));
 
+		INJECTION_POINT("snapbuild-full-snapshot", NULL);
+
 		SnapBuildWaitSnapshot(running, running->nextXid);
 	}
 
diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c
index 2e6197f5f35..f327b779004 100644
--- a/src/backend/utils/time/snapmgr.c
+++ b/src/backend/utils/time/snapmgr.c
@@ -110,10 +110,13 @@
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
+#include "access/xlogutils.h"
 #include "datatype/timestamp.h"
 #include "lib/pairingheap.h"
 #include "miscadmin.h"
 #include "port/pg_lfind.h"
+#include "replication/logical.h"
+#include "replication/snapbuild.h"
 #include "storage/fd.h"
 #include "storage/predicate.h"
 #include "storage/proc.h"
@@ -1421,6 +1424,17 @@ ImportSnapshot(const char *idstr)
 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 				 errmsg("a snapshot-importing transaction must have isolation level SERIALIZABLE or REPEATABLE READ")));
 
+	if (strcmp(idstr, "from_slot") == 0)
+	{
+		Snapshot	snap;
+
+		snap = create_test_snapshot();
+		/* XXX sourcevxid shouldn't be needed in this special case */
+		SetTransactionSnapshot(snap, NULL, MyProcPid, MyProc);
+
+		return;
+	}
+
 	/*
 	 * Verify the identifier: only 0-9, A-F and hyphens are allowed.  We do
 	 * this mainly to prevent reading arbitrary files.
@@ -1969,3 +1983,55 @@ ResOwnerReleaseSnapshot(Datum res)
 {
 	UnregisterSnapshotNoOwner((Snapshot) DatumGetPointer(res));
 }
+
+/*
+ * CreateReplicationSlot() with the CRS_USE_SNAPSHOT option would be useful
+ * for testing, but regression tests cannot speak both replication and query
+ * protocol at the same time. This function can be used instead to create a
+ * snapshot for special tests of logical replication.
+ */
+Snapshot
+create_test_snapshot(void)
+{
+	const	char	*slotname = "test_slot";
+	const	char *plugin;
+	LogicalDecodingContext *ctx;
+	Snapshot	snap;
+
+	/*
+	 * XXX Hard-wired values are fine for the special test that needs this
+	 * function.
+	 */
+	plugin = "test_decoding";
+
+	Assert(!MyReplicationSlot);
+
+	CheckLogicalDecodingRequirements();
+
+	ReplicationSlotCreate(slotname, true, RS_TEMPORARY,
+						  false, false, false);
+
+	/*
+	 * Ensure the logical decoding is enabled before initializing the
+	 * logical decoding context.
+	 */
+	EnsureLogicalDecodingEnabled();
+	Assert(IsLogicalDecodingEnabled());
+
+	ctx = CreateInitDecodingContext(plugin, NIL, true,
+									InvalidXLogRecPtr,
+									XL_ROUTINE(.page_read = read_local_xlog_page,
+											   .segment_open = wal_segment_open,
+											   .segment_close = wal_segment_close),
+									NULL, NULL, NULL);
+
+	/* build initial snapshot, might take a while */
+	DecodingContextFindStartpoint(ctx);
+
+	/* Do what the function is called for. */
+	snap = SnapBuildInitialSnapshot(ctx->snapshot_builder);
+	snap = CopySnapshot(snap);
+	FreeDecodingContext(ctx);
+
+	return snap;
+}
diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h
index b8c01a291a1..9f0d60ebc1b 100644
--- a/src/include/utils/snapmgr.h
+++ b/src/include/utils/snapmgr.h
@@ -123,4 +123,5 @@ extern Snapshot RestoreSnapshot(char *start_address);
 struct PGPROC;
 extern void RestoreTransactionSnapshot(Snapshot snapshot, struct PGPROC *source_pgproc);
 
+extern Snapshot create_test_snapshot(void);
 #endif							/* SNAPMGR_H */
-- 
2.47.3

