From d9a9caf0425543ef21700239dcd8d62ccaadfd5b Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Sat, 15 Mar 2025 10:42:16 +0100
Subject: [PATCH v20250315 4/4] debug stuff

---
 src/backend/access/transam/xlog.c       | 86 ++++++++++++++++++++++++-
 src/backend/postmaster/launch_backend.c |  3 +
 src/backend/storage/ipc/procsignal.c    | 34 ++++++++++
 src/backend/storage/page/bufpage.c      | 35 ++++++++++
 src/backend/utils/init/miscinit.c       |  3 +
 src/bin/pg_controldata/pg_controldata.c |  2 +
 6 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 064cc5555dc..a01afc32af8 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -4284,6 +4284,8 @@ InitControlFile(uint64 sysidentifier, uint32 data_checksum_version)
 	 * processes get the current value from. (Maybe it should go just there?)
 	 */
 	XLogCtl->data_checksum_version = data_checksum_version;
+
+	elog(LOG, "InitControlFile %p data_checksum_version %u", XLogCtl, ControlFile->data_checksum_version);
 }
 
 static void
@@ -4623,6 +4625,8 @@ ReadControlFile(void)
 		(SizeOfXLogLongPHD - SizeOfXLogShortPHD);
 
 	CalculateCheckpointSegments();
+
+	elog(LOG, "ReadControlFile ControlFile->data_checksum_version = %u", ControlFile->data_checksum_version);
 }
 
 /*
@@ -4758,6 +4762,8 @@ SetDataChecksumsOnInProgress(void)
 	XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION;
 	SpinLockRelease(&XLogCtl->info_lck);
 
+	elog(LOG, "SetDataChecksumsOnInProgress XLogCtl->data_checksum_version = %u", XLogCtl->data_checksum_version);
+
 	barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
 
 	END_CRIT_SECTION();
@@ -4823,6 +4829,8 @@ SetDataChecksumsOn(void)
 	XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_VERSION;
 	SpinLockRelease(&XLogCtl->info_lck);
 
+	elog(LOG, "SetDataChecksumsOn XLogCtl->data_checksum_version = %u", XLogCtl->data_checksum_version);
+
 	barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
 
 	END_CRIT_SECTION();
@@ -4861,6 +4869,7 @@ SetDataChecksumsOff(void)
 	/* If data checksums are already disabled there is nothing to do */
 	if (XLogCtl->data_checksum_version == 0)
 	{
+		elog(LOG, "SetDataChecksumsOff XLogCtl->data_checksum_version = %u (SKIP)", XLogCtl->data_checksum_version);
 		SpinLockRelease(&XLogCtl->info_lck);
 		return;
 	}
@@ -4885,6 +4894,8 @@ SetDataChecksumsOff(void)
 		XLogCtl->data_checksum_version = PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION;
 		SpinLockRelease(&XLogCtl->info_lck);
 
+		elog(LOG, "SetDataChecksumsOff XLogCtl->data_checksum_version = %u", XLogCtl->data_checksum_version);
+
 		barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
 
 		END_CRIT_SECTION();
@@ -4905,6 +4916,8 @@ SetDataChecksumsOff(void)
 	}
 	else
 	{
+		elog(LOG, "SetDataChecksumsOff XLogCtl->data_checksum_version = %u (SKIP)", XLogCtl->data_checksum_version);
+
 		/*
 		 * Ending up here implies that the checksums state is "inprogress-on"
 		 * or "inprogress-off" and we can transition directly to "off" from
@@ -4942,6 +4955,7 @@ bool
 AbsorbChecksumsOnInProgressBarrier(void)
 {
 	/* XXX can't we check we're in OFF or INPROGRESSS_ON? */
+	elog(LOG, "AbsorbChecksumsOnInProgressBarrier");
 	SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION);
 	return true;
 }
@@ -4949,6 +4963,8 @@ AbsorbChecksumsOnInProgressBarrier(void)
 bool
 AbsorbChecksumsOnBarrier(void)
 {
+	elog(LOG, "AbsorbChecksumsOnBarrier");
+
 	/*
 	 * If the process was spawned between updating XLogCtl and emitting the
 	 * barrier it will have seen the updated value, so for the first barrier
@@ -4967,6 +4983,7 @@ bool
 AbsorbChecksumsOffInProgressBarrier(void)
 {
 	/* XXX can't we check we're in ON or INPROGRESSS_OFF? */
+	elog(LOG, "AbsorbChecksumsOffInProgressBarrier");
 	SetLocalDataChecksumVersion(PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION);
 	return true;
 }
@@ -4975,6 +4992,7 @@ bool
 AbsorbChecksumsOffBarrier(void)
 {
 	/* XXX can't we check we're in INPROGRESSS_OFF? */
+	elog(LOG, "AbsorbChecksumsOffBarrier");
 	SetLocalDataChecksumVersion(0);
 	return true;
 }
@@ -4990,6 +5008,7 @@ AbsorbChecksumsOffBarrier(void)
 void
 InitLocalDataChecksumVersion(void)
 {
+	elog(LOG, "InitLocalDataChecksumVersion XLogCtl->data_checksum_version = %u", XLogCtl->data_checksum_version);
 	SpinLockAcquire(&XLogCtl->info_lck);
 	SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
 	SpinLockRelease(&XLogCtl->info_lck);
@@ -5004,6 +5023,7 @@ InitLocalDataChecksumVersion(void)
 void
 SetLocalDataChecksumVersion(uint32 data_checksum_version)
 {
+	elog(LOG, "SetLocalDataChecksumVersion %u", data_checksum_version);
 	LocalDataChecksumVersion = data_checksum_version;
 
 	switch (LocalDataChecksumVersion)
@@ -5499,9 +5519,15 @@ XLOGShmemInit(void)
 	XLogCtl->InstallXLogFileSegmentActive = false;
 	XLogCtl->WalWriterSleeping = false;
 
+	elog(LOG, "XLogCtl->data_checksum_version %u ControlFile->data_checksum_version %u",
+		 XLogCtl->data_checksum_version, ControlFile->data_checksum_version);
+
 	/* use the checksum info from control file */
 	XLogCtl->data_checksum_version = ControlFile->data_checksum_version;
 
+	elog(LOG, "XLogCtl->data_checksum_version %u ControlFile->data_checksum_version %u (UPDATED)",
+		 XLogCtl->data_checksum_version, ControlFile->data_checksum_version);
+
 	SetLocalDataChecksumVersion(XLogCtl->data_checksum_version);
 
 	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
@@ -6635,6 +6661,8 @@ StartupXLOG(void)
 	 */
 	CompleteCommitTsInitialization();
 
+	elog(LOG, "StartupXLOG XLogCtl->data_checksum_version = %u", XLogCtl->data_checksum_version);
+
 	/*
 	 * If we reach this point with checksums being enabled ("inprogress-on"
 	 * state), we notify the user that they need to manually restart the
@@ -6652,6 +6680,9 @@ StartupXLOG(void)
 	 * we know that we have a state where all backends have stopped validating
 	 * checksums and we can move to off instead of prompting the user to
 	 * perform any action.
+	 *
+	 * XXX Is it safe to access the data_checksum_version without holding the
+	 * spinlock?
 	 */
 	if (XLogCtl->data_checksum_version == PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION)
 	{
@@ -6660,6 +6691,8 @@ StartupXLOG(void)
 		SpinLockAcquire(&XLogCtl->info_lck);
 		XLogCtl->data_checksum_version = 0;
 		SpinLockRelease(&XLogCtl->info_lck);
+
+		elog(LOG, "StartupXLOG XLogCtl->data_checksum_version = %u (UPDATED)", XLogCtl->data_checksum_version);
 	}
 
 	/*
@@ -7523,6 +7556,8 @@ CreateCheckPoint(int flags)
 	 */
 	checkPoint.data_checksum_version = XLogCtl->data_checksum_version;
 
+	elog(WARNING, "CREATECHECKPOINT XLogCtl->data_checksum_version %u", XLogCtl->data_checksum_version);
+
 	if (shutdown)
 	{
 		XLogRecPtr	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
@@ -7778,6 +7813,10 @@ CreateCheckPoint(int flags)
 	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
 	ControlFile->minRecoveryPointTLI = 0;
 
+	elog(LOG, "CreateCheckPoint data_checksum_version %u %u",
+		 ControlFile->data_checksum_version,
+		 checkPoint.data_checksum_version);
+
 	/* make sure we start with the checksum version as of the checkpoint */
 	ControlFile->data_checksum_version = checkPoint.data_checksum_version;
 
@@ -7928,6 +7967,10 @@ CreateEndOfRecoveryRecord(void)
 	ControlFile->minRecoveryPoint = recptr;
 	ControlFile->minRecoveryPointTLI = xlrec.ThisTimeLineID;
 
+	elog(LOG, "CreateEndOfRecoveryRecord data_checksum_version %u xlog %u",
+		 ControlFile->data_checksum_version,
+		 XLogCtl->data_checksum_version);
+
 	/* start with the latest checksum version (as of the end of recovery) */
 	ControlFile->data_checksum_version = XLogCtl->data_checksum_version;
 
@@ -8101,6 +8144,14 @@ RecoveryRestartPoint(const CheckPoint *checkPoint, XLogReaderState *record)
 	XLogCtl->lastCheckPointEndPtr = record->EndRecPtr;
 	XLogCtl->lastCheckPoint = *checkPoint;
 	SpinLockRelease(&XLogCtl->info_lck);
+
+	elog(LOG, "RecoveryRestartPoint lastCheckPointRecPtr %X/%X XLogCtl->lastCheckPointEndPtr %X/%X redo %X/%X checksums xlogctl %u record %d file %u",
+		 LSN_FORMAT_ARGS(XLogCtl->lastCheckPointRecPtr),
+		 LSN_FORMAT_ARGS(XLogCtl->lastCheckPointEndPtr),
+		 LSN_FORMAT_ARGS(XLogCtl->lastCheckPoint.redo),
+		 XLogCtl->lastCheckPoint.data_checksum_version,
+		 checkPoint->data_checksum_version,
+		 ControlFile->data_checksum_version);
 }
 
 /*
@@ -8138,6 +8189,9 @@ CreateRestartPoint(int flags)
 	lastCheckPoint = XLogCtl->lastCheckPoint;
 	SpinLockRelease(&XLogCtl->info_lck);
 
+	elog(LOG, "CreateRestartPoint lastCheckPointRecPtr %X/%X lastCheckPointEndPtr %X/%X",
+		 LSN_FORMAT_ARGS(lastCheckPointRecPtr), LSN_FORMAT_ARGS(lastCheckPointEndPtr));
+
 	/*
 	 * Check that we're still in recovery mode. It's ok if we exit recovery
 	 * mode after this check, the restart point is valid anyway.
@@ -8166,9 +8220,11 @@ CreateRestartPoint(int flags)
 	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
 		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
 	{
-		ereport(DEBUG2,
-				(errmsg_internal("skipping restartpoint, already performed at %X/%X",
-								 LSN_FORMAT_ARGS(lastCheckPoint.redo))));
+		ereport(LOG,
+				(errmsg_internal("CreateRestartPoint: skipping restartpoint, already performed at %X/%X <= %X/%X lastCheckPointRecPtr %X/%X",
+								 LSN_FORMAT_ARGS(lastCheckPoint.redo),
+								 LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo),
+								 LSN_FORMAT_ARGS(lastCheckPointRecPtr))));
 
 		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
 		if (flags & CHECKPOINT_IS_SHUTDOWN)
@@ -8236,6 +8292,8 @@ CreateRestartPoint(int flags)
 	 * end-of-recovery checkpoint.
 	 */
 	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+	elog(LOG, "CreateRestartPoint ControlFile->checkPointCopy.redo %X/%X lastCheckPoint.redo %X/%X",
+		 LSN_FORMAT_ARGS(ControlFile->checkPointCopy.redo), LSN_FORMAT_ARGS(lastCheckPoint.redo));
 	if (ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
 	{
 		/*
@@ -8274,11 +8332,17 @@ CreateRestartPoint(int flags)
 				ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
 		}
 
+		elog(LOG, "CreateRestartPoint data_checksum_version %u %u",
+			 ControlFile->data_checksum_version,
+			 lastCheckPoint.data_checksum_version);
+
 		/* we shall start with the latest checksum version */
 		ControlFile->data_checksum_version = lastCheckPoint.data_checksum_version;
 
 		UpdateControlFile();
 	}
+	else
+		elog(LOG, "CreateRestartPoint: skipped ControlFile update");
 	LWLockRelease(ControlFileLock);
 
 	/*
@@ -9136,13 +9200,25 @@ xlog_redo(XLogReaderState *record)
 	{
 		xl_checksum_state state;
 		uint64		barrier;
+		XLogRecPtr	checkpointLsn;
+		uint32		value,
+					value_last;
 
 		memcpy(&state, XLogRecGetData(record), sizeof(xl_checksum_state));
 
 		SpinLockAcquire(&XLogCtl->info_lck);
+		value_last = XLogCtl->data_checksum_version;
 		XLogCtl->data_checksum_version = state.new_checksumtype;
 		SpinLockRelease(&XLogCtl->info_lck);
 
+		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+		checkpointLsn = ControlFile->checkPoint;
+		value = ControlFile->data_checksum_version;
+		LWLockRelease(ControlFileLock);
+
+		elog(LOG, "XLOG_CHECKSUMS xlog_redo %X/%X control checkpoint %X/%X control %u last %u record %u",
+			 LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(checkpointLsn), value, value_last, state.new_checksumtype);
+
 		/*
 		 * Block on a procsignalbarrier to await all processes having seen the
 		 * change to checksum status. Once the barrier has been passed we can
@@ -9151,22 +9227,26 @@ xlog_redo(XLogReaderState *record)
 		switch (state.new_checksumtype)
 		{
 			case PG_DATA_CHECKSUM_INPROGRESS_ON_VERSION:
+				elog(LOG, "XLOG_CHECKSUMS emit PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON");
 				barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON);
 				WaitForProcSignalBarrier(barrier);
 				break;
 
 			case PG_DATA_CHECKSUM_INPROGRESS_OFF_VERSION:
+				elog(LOG, "XLOG_CHECKSUMS emit PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF");
 				barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF);
 				WaitForProcSignalBarrier(barrier);
 				break;
 
 			case PG_DATA_CHECKSUM_VERSION:
+				elog(LOG, "XLOG_CHECKSUMS emit PROCSIGNAL_BARRIER_CHECKSUM_ON");
 				barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_ON);
 				WaitForProcSignalBarrier(barrier);
 				break;
 
 			default:
 				Assert(state.new_checksumtype == 0);
+				elog(LOG, "XLOG_CHECKSUMS emit PROCSIGNAL_BARRIER_CHECKSUM_OFF");
 				barrier = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_CHECKSUM_OFF);
 				WaitForProcSignalBarrier(barrier);
 				break;
diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c
index b06b5fb45dd..2119c5d59b4 100644
--- a/src/backend/postmaster/launch_backend.c
+++ b/src/backend/postmaster/launch_backend.c
@@ -239,6 +239,9 @@ postmaster_child_launch(BackendType child_type, int child_slot,
 	if (IsExternalConnectionBackend(child_type))
 		((BackendStartupData *) startup_data)->fork_started = GetCurrentTimestamp();
 
+	elog(LOG, "postmaster_child_launch: LocalDataChecksumVersion %u xlog %u", GetLocalDataChecksumVersion(),
+	GetCurrentDataChecksumVersion());
+
 #ifdef EXEC_BACKEND
 	pid = internal_forkexec(child_process_kinds[child_type].name, child_slot,
 							startup_data, startup_data_len, client_sock);
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
index 2b13a8cd260..b88d1d07431 100644
--- a/src/backend/storage/ipc/procsignal.c
+++ b/src/backend/storage/ipc/procsignal.c
@@ -553,6 +553,40 @@ ProcessProcSignalBarrier(void)
 
 		PG_TRY();
 		{
+			/* print info about barriers */
+			{
+				uint32	tmp = flags;
+
+				elog(LOG, "ProcessProcSignalBarrier flags %u", tmp);
+
+				while (tmp != 0)
+				{
+					ProcSignalBarrierType type;
+
+					type = (ProcSignalBarrierType) pg_rightmost_one_pos32(tmp);
+					switch (type)
+					{
+						case PROCSIGNAL_BARRIER_SMGRRELEASE:
+							elog(LOG, "PROCSIGNAL_BARRIER_SMGRRELEASE");
+							break;
+						case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON:
+							elog(LOG, "PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_ON");
+							break;
+						case PROCSIGNAL_BARRIER_CHECKSUM_ON:
+							elog(LOG, "PROCSIGNAL_BARRIER_CHECKSUM_ON");
+							break;
+						case PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF:
+							elog(LOG, "PROCSIGNAL_BARRIER_CHECKSUM_INPROGRESS_OFF");
+							break;
+						case PROCSIGNAL_BARRIER_CHECKSUM_OFF:
+							elog(LOG, "PROCSIGNAL_BARRIER_CHECKSUM_OFF");
+							break;
+					}
+
+					BARRIER_CLEAR_BIT(tmp, type);
+				}
+			}
+
 			/*
 			 * Process each type of barrier. The barrier-processing functions
 			 * should normally return true, but may return false if the
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index 8bd5fed8c85..a8af9a7068b 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -22,6 +22,7 @@
 #include "utils/memdebug.h"
 #include "utils/memutils.h"
 
+#include <execinfo.h>
 
 /* GUC variable */
 bool		ignore_checksum_failure = false;
@@ -136,10 +137,44 @@ PageIsVerifiedExtended(PageData *page, BlockNumber blkno, int flags)
 	if (checksum_failure)
 	{
 		if ((flags & PIV_LOG_WARNING) != 0)
+		{
+			XLogRecPtr	lsn = PageGetLSN(page);
+
 			ereport(WARNING,
 					(errcode(ERRCODE_DATA_CORRUPTED),
 					 errmsg("page verification failed, calculated checksum %u but expected %u",
 							checksum, p->pd_checksum)));
+			ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("page verification failed, LSN %X/%X", LSN_FORMAT_ARGS(lsn))));
+			ereport(WARNING,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("page verification failed, header flags %u lower %u upper %u special %u pagesize_version %u prune_xid %u",
+							p->pd_flags,
+							p->pd_lower,
+							p->pd_upper,
+							p->pd_special,
+							p->pd_pagesize_version,
+							p->pd_prune_xid)));
+
+			{
+#define BT_BUF_SIZE 32
+				int nptrs;
+				void *buffer[BT_BUF_SIZE];
+				char **strings;
+
+				nptrs = backtrace(buffer, BT_BUF_SIZE);
+				strings = backtrace_symbols(buffer, nptrs);
+
+				if (strings != NULL)
+				{
+					for (int i = 0; i < nptrs; i++)
+					{
+						elog(WARNING, "backtrace %d : %s", i, strings[i]);
+					}
+				}
+			}
+		}
 
 		if ((flags & PIV_REPORT_STAT) != 0)
 			pgstat_report_checksum_failure();
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index a071ba6f455..df52ce8ad7f 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -164,6 +164,9 @@ InitPostmasterChild(void)
 				(errcode_for_socket_access(),
 				 errmsg_internal("could not set postmaster death monitoring pipe to FD_CLOEXEC mode: %m")));
 #endif
+
+	elog(LOG, "InitPostmasterChild: LocalDataChecksumVersion %u xlog %u", GetLocalDataChecksumVersion(),
+	GetCurrentDataChecksumVersion());
 }
 
 /*
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index bea779eef94..3d176fc0a9f 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -280,6 +280,8 @@ main(int argc, char *argv[])
 		   ControlFile->checkPointCopy.oldestCommitTsXid);
 	printf(_("Latest checkpoint's newestCommitTsXid:%u\n"),
 		   ControlFile->checkPointCopy.newestCommitTsXid);
+	printf(_("Latest checkpoint's data_checksum_version:%u\n"),
+		   ControlFile->checkPointCopy.data_checksum_version);
 	printf(_("Time of latest checkpoint:            %s\n"),
 		   ckpttime_str);
 	printf(_("Fake LSN counter for unlogged rels:   %X/%X\n"),
-- 
2.48.1

