From 6bcc6c35e480ffa02117c1e6591f0bccdc70ad12 Mon Sep 17 00:00:00 2001
From: Antonin Houska <ah@cybertec.at>
Date: Mon, 19 Jan 2026 16:07:45 +0100
Subject: [PATCH 1/2] Fix race conditions during the setup of logical decoding.

Although it's rather unlikely, it can happen that the snapshot builder
considers a transaction committed (according to WAL) before the commit could
be recorded in CLOG. In an extreme case, a snapshot can even be created and
used in between. Since both the snapshot and CLOG are needed for visibility
checks, this inconsistency can make them work incorrectly.

The typical symptom is that a transaction that the snapshot considers not
running anymore is (per CLOG) considered aborted instead of committed. Thus a
new tuple version can be evaluated as invisible (if xmin is incorrectly
considered aborted) or a deleted tuple version can be evaluated as visible (if
xmax is incorrectly considered aborted).

This patch fixes the problem by checking whether all the XIDs that the new
snapshot considers committed are really committed per CLOG. If an XID is not,
the check for it is repeated after a short delay until CLOG reports the
commit. However, a single check is sufficient in almost all cases, so the
performance impact should be minimal.
---
 src/backend/replication/logical/snapbuild.c   | 27 ++++++++++++++++++-
 .../utils/activity/wait_event_names.txt       |  1 +
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 37f0c6028bd..d7ea098cb37 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -377,7 +377,7 @@ SnapBuildBuildSnapshot(SnapBuild *builder)
 
 	/*
 	 * We misuse the original meaning of SnapshotData's xip and subxip fields
-	 * to make the more fitting for our needs.
+	 * to make them more fitting for our needs.
 	 *
 	 * In the 'xip' array we store transactions that have to be treated as
 	 * committed. Since we will only ever look at tuples from transactions
@@ -402,6 +402,31 @@ SnapBuildBuildSnapshot(SnapBuild *builder)
 	snapshot->xmin = builder->xmin;
 	snapshot->xmax = builder->xmax;
 
+	/*
+	 * Although it's very unlikely, it's possible that a commit WAL record was
+	 * decoded but CLOG is not aware of the commit yet. Should the CLOG update
+	 * be delayed even more, visibility checks that use this snapshot could
+	 * work incorrectly. Therefore we wait until CLOG confirms each commit.
+	 */
+	for (int i = 0; i < builder->committed.xcnt; i++)
+	{
+		for (;;)
+		{
+			if (TransactionIdDidCommit(builder->committed.xip[i]))
+				break;
+			else
+			{
+				(void) WaitLatch(MyLatch,
+								 WL_LATCH_SET | WL_TIMEOUT |
+								 WL_EXIT_ON_PM_DEATH,
+								 10L,
+								 WAIT_EVENT_SNAPBUILD_CLOG);
+				ResetLatch(MyLatch);
+			}
+			CHECK_FOR_INTERRUPTS();
+		}
+	}
+
 	/* store all transactions to be treated as committed by this snapshot */
 	snapshot->xip =
 		(TransactionId *) ((char *) snapshot + sizeof(SnapshotData));
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 4aa864fe3c3..987df777e47 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -181,6 +181,7 @@ PG_SLEEP	"Waiting due to a call to <function>pg_sleep</function> or a sibling fu
 RECOVERY_APPLY_DELAY	"Waiting to apply WAL during recovery because of a delay setting."
 RECOVERY_RETRIEVE_RETRY_INTERVAL	"Waiting during recovery when WAL data is not available from any source (<filename>pg_wal</filename>, archive or stream)."
 REGISTER_SYNC_REQUEST	"Waiting while sending synchronization requests to the checkpointer, because the request queue is full."
+SNAPBUILD_CLOG	"Waiting for CLOG update before building snapshot."
 SPIN_DELAY	"Waiting while acquiring a contended spinlock."
 VACUUM_DELAY	"Waiting in a cost-based vacuum delay point."
 VACUUM_TRUNCATE	"Waiting to acquire an exclusive lock to truncate off any empty pages at the end of a table vacuumed."
-- 
2.47.3

