From 95bfea2b12418671b5f9513dee107a0b18c8a78e Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Fri, 14 Mar 2025 22:04:27 +0100
Subject: [PATCH v20250315 3/4] reworks

---
 src/backend/access/transam/xlog.c       | 45 +++++++++----------------
 src/backend/postmaster/auxprocess.c     | 19 +++++++++++
 src/backend/postmaster/launch_backend.c | 10 ------
 src/backend/utils/init/postinit.c       | 17 ++++++++--
 src/include/access/xlog.h               |  2 +-
 src/include/miscadmin.h                 |  1 -
 6 files changed, 51 insertions(+), 43 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index e4c72f985e4..064cc5555dc 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -661,12 +661,15 @@ static bool updateMinRecoveryPoint = true;
 static uint32 LocalDataChecksumVersion = 0;
 
 /*
- * Flag to remember if the procsignalbarrier being absorbed for enabling
- * checksums is the first one or not. The first procsignalbarrier can in rare
- * circumstances cause a transition from 'on' to 'on' when a new process is
+ * Flag to remember if the procsignalbarrier being absorbed for checksums
+ * is the first one. The first procsignalbarrier can in rare cases be for
+ * the state we've initialized, i.e. a duplicate. This may happen for any
+ * data_checksum_version value, but for PG_DATA_CHECKSUM_ON_VERSION this
+ * would trigger an assert failure (this is the only transition with an
+ * assert) when processing the barrier. This may happen if the process is
  * spawned between the update of XLogCtl->data_checksum_version and the
- * barrier being emitted.  This can only happen on the very first barrier so
- * mark that with this flag.
+ * barrier being emitted. This can only happen on the very first barrier
+ * so mark that with this flag.
  */
 static bool InitialDataChecksumTransition = true;
 
@@ -4938,6 +4941,7 @@ SetDataChecksumsOff(void)
 bool
 AbsorbChecksumsOnInProgressBarrier(void)
 {
+	/* XXX can't we check we're in OFF or INPROGRESSS_ON? */
 	SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION);
 	return true;
 }
@@ -4950,22 +4954,19 @@ AbsorbChecksumsOnBarrier(void)
 	 * barrier it will have seen the updated value, so for the first barrier
 	 * we accept both "on" and "inprogress-on".
 	 */
-	if (InitialDataChecksumTransition)
-	{
-		Assert((LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) ||
-			   (LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION));
-		InitialDataChecksumTransition = false;
-	}
-	else
-		Assert(LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION);
+	Assert((LocalDataChecksumVersion == PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION) ||
+		   (InitialDataChecksumTransition &&
+			(LocalDataChecksumVersion == PG_DATA_CHECKSUM_VERSION)));
 
 	SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_VERSION);
+	InitialDataChecksumTransition = false;
 	return true;
 }
 
 bool
 AbsorbChecksumsOffInProgressBarrier(void)
 {
+	/* XXX can't we check we're in ON or INPROGRESSS_OFF? */
 	SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION);
 	return true;
 }
@@ -4973,6 +4974,7 @@ AbsorbChecksumsOffInProgressBarrier(void)
 bool
 AbsorbChecksumsOffBarrier(void)
 {
+	/* XXX can't we check we're in INPROGRESSS_OFF? */
 	SetLocalDataChecksumVersion(0);
 	return true;
 }
@@ -4986,7 +4988,7 @@ AbsorbChecksumsOffBarrier(void)
  * purpose enough to handle future cases.
  */
 void
-InitLocalControldata(void)
+InitLocalDataChecksumVersion(void)
 {
 	SpinLockAcquire(&XLogCtl->info_lck);
 	SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
@@ -5024,21 +5026,6 @@ SetLocalDataChecksumVersion(uint32 data_checksum_version)
 	}
 }
 
-/*
- * Initialize the various data checksum values - GUC, local, ....
- */
-void
-InitLocalDataChecksumVersion(void)
-{
-	uint32	data_checksum_version;
-
-	SpinLockAcquire(&XLogCtl->info_lck);
-	data_checksum_version = XLogCtl->data_checksum_version;
-	SpinLockRelease(&XLogCtl->info_lck);
-
-	SetLocalDataChecksumVersion(data_checksum_version);
-}
-
 /*
  * Get the local data_checksum_version (cached XLogCtl value).
  */
diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c
index 4f6795f7265..50d5308816c 100644
--- a/src/backend/postmaster/auxprocess.c
+++ b/src/backend/postmaster/auxprocess.c
@@ -15,6 +15,7 @@
 #include <unistd.h>
 #include <signal.h>
 
+#include "access/xlog.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/auxprocess.h"
@@ -68,6 +69,24 @@ AuxiliaryProcessMainCommon(void)
 
 	ProcSignalInit(false, 0);
 
+	/*
+	 * Initialize a local cache of the data_checksum_version, to be updated
+	 * by the procsignal-based barriers.
+	 *
+	 * This intentionally happens after initializing the procsignal, otherwise
+	 * we might miss a state change. This means we can get a barrier for the
+	 * state we've just initialized - but it can happen only once.
+	 *
+	 * The postmaster (which is what gets forked into the new child process)
+	 * does not handle barriers, therefore it may not have the current value
+	 * of LocalDataChecksumVersion value (it'll have the value read from the
+	 * control file, which may be arbitrarily old).
+	 *
+	 * NB: Even if the postmaster handled barriers, the value might still be
+	 * stale, as it might have changed after this process forked.
+	 */
+	InitLocalDataChecksumVersion();
+
 	/*
 	 * Auxiliary processes don't run transactions, but they may need a
 	 * resource owner anyway to manage buffer pins acquired outside
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index 65bbf770d28..b06b5fb45dd 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -285,16 +285,6 @@ postmaster_child_launch(BackendType child_type, int child_slot,
 			memcpy(MyClientSocket, client_sock, sizeof(ClientSocket));
 		}
 
-		/*
-		 * update the LocalProcessControlFile to match XLogCtl->data_checksum_version
-		 *
-		 * XXX It seems the postmaster (which is what gets forked into the new
-		 * child process) does not absorb the checksum barriers, therefore it
-		 * does not update the value (except after a restart). Not sure if there
-		 * is some sort of race condition.
-		 */
-		InitLocalDataChecksumVersion();
-
 		/*
 		 * Run the appropriate Main function
 		 */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 692570eb0f1..d1394fc05f9 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -749,9 +749,22 @@ InitPostgres(const char *in_dbname, Oid dboid,
 	ProcSignalInit(MyCancelKeyValid, MyCancelKey);
 
 	/*
-	 * Set up backend local cache of Controldata values.
+	 * Initialize a local cache of the data_checksum_version, to be updated
+	 * by the procsignal-based barriers.
+	 *
+	 * This intentionally happens after initializing the procsignal, otherwise
+	 * we might miss a state change. This means we can get a barrier for the
+	 * state we've just initialized - but it can happen only once.
+	 *
+	 * The postmaster (which is what gets forked into the new child process)
+	 * does not handle barriers, therefore it may not have the current value
+	 * of LocalDataChecksumVersion value (it'll have the value read from the
+	 * control file, which may be arbitrarily old).
+	 *
+	 * NB: Even if the postmaster handled barriers, the value might still be
+	 * stale, as it might have changed after this process forked.
 	 */
-	InitLocalControldata();
+	InitLocalDataChecksumVersion();
 
 	/*
 	 * Also set up timeout handlers needed for backend operation.  We need
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index aec3ea0bc63..615b2cf4ec8 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -243,7 +243,7 @@ extern bool AbsorbChecksumsOffInProgressBarrier(void);
 extern bool AbsorbChecksumsOnBarrier(void);
 extern bool AbsorbChecksumsOffBarrier(void);
 extern const char *show_data_checksums(void);
-extern void InitLocalControldata(void);
+extern void InitLocalDataChecksumVersion(void);
 extern bool GetDefaultCharSignedness(void);
 extern XLogRecPtr GetFakeLSNForUnloggedRel(void);
 extern Size XLOGShmemSize(void);
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 133e7fde290..193ce1b4514 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -542,7 +542,6 @@ extern void RestoreClientConnectionInfo(char *conninfo);
 
 extern uint32 GetLocalDataChecksumVersion(void);
 extern uint32 GetCurrentDataChecksumVersion(void);
-extern void InitLocalDataChecksumVersion(void);
 
 /* in executor/nodeHash.c */
 extern size_t get_hash_memory_limit(void);
-- 
2.48.1

