From 1cc6f1c3f3b77fd4cc4ea965085f66eb1db487a7 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Sat, 11 Oct 2025 04:19:15 +0000
Subject: [PATCH v1] Don't use initial_restart_lsn in
 InvalidatePossiblyObsoleteSlot()

818fefd8fd4 fixed race leading to incorrect conflict cause in
InvalidatePossiblyObsoleteSlot() for all possible conflicts.

However it's possible that for the restart_lsn case the slot has been able
to advance enough to move from a conflicting case to a non conflicting case.

Indeed, it's safe to re-check the LSN position because the
checkpointer/startup process hasn't removed the WALs yet (because it's busy
in InvalidatePossiblyObsoleteSlot()).

In that case we should not invalidate the slot (it was not invalidated prior
to 818fefd8fd4).

This commit puts the pre 818fefd8fd4 behavior back for the restart_lsn case.

Reported-by: suyu.cmj <mengjuan.cmj@alibaba-inc.com>
---
 src/backend/replication/slot.c | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)
 100.0% src/backend/replication/

diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index ac188bb2f77..44a8dc3a001 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1716,15 +1716,15 @@ DetermineSlotInvalidationCause(uint32 possible_causes, ReplicationSlot *s,
 							   TransactionId snapshotConflictHorizon,
 							   TransactionId initial_effective_xmin,
 							   TransactionId initial_catalog_effective_xmin,
-							   XLogRecPtr initial_restart_lsn,
+							   XLogRecPtr restart_lsn,
 							   TimestampTz *inactive_since, TimestampTz now)
 {
 	Assert(possible_causes != RS_INVAL_NONE);
 
 	if (possible_causes & RS_INVAL_WAL_REMOVED)
 	{
-		if (initial_restart_lsn != InvalidXLogRecPtr &&
-			initial_restart_lsn < oldestLSN)
+		if (restart_lsn != InvalidXLogRecPtr &&
+			restart_lsn < oldestLSN)
 			return RS_INVAL_WAL_REMOVED;
 	}
 
@@ -1811,7 +1811,6 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
 	bool		terminated = false;
 	TransactionId initial_effective_xmin = InvalidTransactionId;
 	TransactionId initial_catalog_effective_xmin = InvalidTransactionId;
-	XLogRecPtr	initial_restart_lsn = InvalidXLogRecPtr;
 	ReplicationSlotInvalidationCause invalidation_cause_prev PG_USED_FOR_ASSERTS_ONLY = RS_INVAL_NONE;
 	TimestampTz inactive_since = 0;
 
@@ -1869,7 +1868,6 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
 			 */
 			if (!terminated)
 			{
-				initial_restart_lsn = s->data.restart_lsn;
 				initial_effective_xmin = s->effective_xmin;
 				initial_catalog_effective_xmin = s->effective_catalog_xmin;
 			}
@@ -1880,17 +1878,20 @@ InvalidatePossiblyObsoleteSlot(uint32 possible_causes,
 																snapshotConflictHorizon,
 																initial_effective_xmin,
 																initial_catalog_effective_xmin,
-																initial_restart_lsn,
+																s->data.restart_lsn,
 																&inactive_since,
 																now);
 		}
 
 		/*
-		 * The invalidation cause recorded previously should not change while
-		 * the process owning the slot (if any) has been terminated.
+		 * The invalidation cause should remain consistent after termination,
+		 * except when RS_INVAL_WAL_REMOVED becomes RS_INVAL_NONE (slot caught
+		 * up).
 		 */
 		Assert(!(invalidation_cause_prev != RS_INVAL_NONE && terminated &&
-				 invalidation_cause_prev != invalidation_cause));
+				 invalidation_cause_prev != invalidation_cause) ||
+			   (invalidation_cause_prev == RS_INVAL_WAL_REMOVED &&
+				invalidation_cause == RS_INVAL_NONE));
 
 		/* if there's no invalidation, we're done */
 		if (invalidation_cause == RS_INVAL_NONE)
-- 
2.34.1

