From 9a200daba7a44a157865eb9f63078d484ee44b5f Mon Sep 17 00:00:00 2001
From: Alena Vinter <dlaaren8@gmail.com>
Date: Mon, 26 May 2025 13:18:35 +0700
Subject: [PATCH v1 1/2] Handle WAL timeline switches with incomplete records

When a timeline switch happens while the old timeline ends in an
incomplete WAL record, physical replicas could enter an infinite
recovery loop, repeatedly requesting the same WAL data. This occurs
because the incomplete record is not followed by an
XLOG_OVERWRITE_CONTRECORD record, so replicas keep retrying to fetch it.

To fix this, we preserve WAL's append-only nature by writing an
XLOG_OVERWRITE_CONTRECORD record that explicitly marks the incomplete
record before initializing the new timeline. This ensures replicas can
properly detect the transition to the new timeline without getting stuck.

---
 src/backend/access/transam/xlog.c         | 179 +++++++++++-----------
 src/backend/access/transam/xlogrecovery.c |   3 +-
 2 files changed, 93 insertions(+), 89 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 47ffc0a2307..7a3242c62bc 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6044,6 +6044,7 @@ StartupXLOG(void)
 	EndOfLogTLI = endOfRecoveryInfo->endOfLogTLI;
 	abortedRecPtr = endOfRecoveryInfo->abortedRecPtr;
 	missingContrecPtr = endOfRecoveryInfo->missingContrecPtr;
+	newTLI = endOfRecoveryInfo->lastRecTLI;
 
 	/*
 	 * Reset ps status display, so as no information related to recovery shows
@@ -6116,6 +6117,97 @@ StartupXLOG(void)
 	 */
 	SetInstallXLogFileSegmentActive();
 
+	/*
+	 * If WAL ended in an incomplete record, skip the parts that made it
+	 * through and start writing after the portion that persisted.  (It's
+	 * critical to first write an OVERWRITE_CONTRECORD record, which we do
+	 * below, right after enabling WAL inserts for this backend.)
+	 */
+	if (!XLogRecPtrIsInvalid(missingContrecPtr) &&
+		endOfRecoveryInfo->endOfLog == abortedRecPtr)
+	{
+		Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
+		EndOfLog = missingContrecPtr;
+	}
+
+	/*
+	 * Tricky point here: lastPage contains the *last* block that the LastRec
+	 * record spans, not the one it starts in.  The last block is indeed the
+	 * one we want to use.
+	 */
+	if (EndOfLog % XLOG_BLCKSZ != 0)
+	{
+		char	   *page;
+		int			len;
+		int			firstIdx;
+
+		firstIdx = XLogRecPtrToBufIdx(EndOfLog);
+		len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
+		Assert(len < XLOG_BLCKSZ);
+
+		/* Copy the valid part of the last block, and zero the rest */
+		page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
+		memcpy(page, endOfRecoveryInfo->lastPage, len);
+		memset(page + len, 0, XLOG_BLCKSZ - len);
+
+		pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
+		pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
+		XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr;
+	}
+	else
+	{
+		/*
+		 * There is no partial block to copy. Just set InitializedUpTo, and
+		 * let the first attempt to insert a log record to initialize the next
+		 * buffer.
+		 */
+		pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
+		XLogCtl->InitializedFrom = EndOfLog;
+	}
+	pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
+
+	/*
+	 * Prepare to write WAL starting at EndOfLog location, and init xlog
+	 * buffer cache using the block containing the last record from the
+	 * previous incarnation.
+	 */
+	Insert = &XLogCtl->Insert;
+	Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
+	Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
+
+	/*
+	 * Update local and shared status.  This is OK to do without any locks
+	 * because no other process can be reading or writing WAL yet.
+	 */
+	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
+	pg_atomic_write_u64(&XLogCtl->logInsertResult, EndOfLog);
+	pg_atomic_write_u64(&XLogCtl->logWriteResult, EndOfLog);
+	pg_atomic_write_u64(&XLogCtl->logFlushResult, EndOfLog);
+	XLogCtl->LogwrtRqst.Write = EndOfLog;
+	XLogCtl->LogwrtRqst.Flush = EndOfLog;
+
+	/* Enable WAL writes for this backend only. */
+	LocalSetXLogInsertAllowed();
+
+	/* If necessary, write overwrite-contrecord before doing anything else */
+	if (!XLogRecPtrIsInvalid(abortedRecPtr) &&
+		!XLogRecPtrIsInvalid(missingContrecPtr) &&
+		EndOfLog == missingContrecPtr)
+	{
+		SpinLockAcquire(&XLogCtl->info_lck);
+		XLogCtl->InsertTimeLineID = newTLI;
+		XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
+		SpinLockRelease(&XLogCtl->info_lck);
+
+		EndOfLog = CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
+		/*
+		 * Ensure next records are written to the next timeline segment by
+		 * closing the current segment.
+		 */
+		if (openLogFile >= 0)
+			XLogFileClose();
+	}
+
 	/*
 	 * Consider whether we need to assign a new timeline ID.
 	 *
@@ -6130,7 +6222,6 @@ StartupXLOG(void)
 	 *
 	 * In a normal crash recovery, we can just extend the timeline we were in.
 	 */
-	newTLI = endOfRecoveryInfo->lastRecTLI;
 	if (ArchiveRecoveryRequested)
 	{
 		newTLI = findNewestTimeLine(recoveryTargetTLI) + 1;
@@ -6177,82 +6268,6 @@ StartupXLOG(void)
 	XLogCtl->PrevTimeLineID = endOfRecoveryInfo->lastRecTLI;
 	SpinLockRelease(&XLogCtl->info_lck);
 
-	/*
-	 * Actually, if WAL ended in an incomplete record, skip the parts that
-	 * made it through and start writing after the portion that persisted.
-	 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
-	 * we'll do as soon as we're open for writing new WAL.)
-	 */
-	if (!XLogRecPtrIsInvalid(missingContrecPtr))
-	{
-		/*
-		 * We should only have a missingContrecPtr if we're not switching to a
-		 * new timeline. When a timeline switch occurs, WAL is copied from the
-		 * old timeline to the new only up to the end of the last complete
-		 * record, so there can't be an incomplete WAL record that we need to
-		 * disregard.
-		 */
-		Assert(newTLI == endOfRecoveryInfo->lastRecTLI);
-		Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
-		EndOfLog = missingContrecPtr;
-	}
-
-	/*
-	 * Prepare to write WAL starting at EndOfLog location, and init xlog
-	 * buffer cache using the block containing the last record from the
-	 * previous incarnation.
-	 */
-	Insert = &XLogCtl->Insert;
-	Insert->PrevBytePos = XLogRecPtrToBytePos(endOfRecoveryInfo->lastRec);
-	Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
-
-	/*
-	 * Tricky point here: lastPage contains the *last* block that the LastRec
-	 * record spans, not the one it starts in.  The last block is indeed the
-	 * one we want to use.
-	 */
-	if (EndOfLog % XLOG_BLCKSZ != 0)
-	{
-		char	   *page;
-		int			len;
-		int			firstIdx;
-
-		firstIdx = XLogRecPtrToBufIdx(EndOfLog);
-		len = EndOfLog - endOfRecoveryInfo->lastPageBeginPtr;
-		Assert(len < XLOG_BLCKSZ);
-
-		/* Copy the valid part of the last block, and zero the rest */
-		page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
-		memcpy(page, endOfRecoveryInfo->lastPage, len);
-		memset(page + len, 0, XLOG_BLCKSZ - len);
-
-		pg_atomic_write_u64(&XLogCtl->xlblocks[firstIdx], endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
-		pg_atomic_write_u64(&XLogCtl->InitializedUpTo, endOfRecoveryInfo->lastPageBeginPtr + XLOG_BLCKSZ);
-		XLogCtl->InitializedFrom = endOfRecoveryInfo->lastPageBeginPtr;
-	}
-	else
-	{
-		/*
-		 * There is no partial block to copy. Just set InitializedUpTo, and
-		 * let the first attempt to insert a log record to initialize the next
-		 * buffer.
-		 */
-		pg_atomic_write_u64(&XLogCtl->InitializedUpTo, EndOfLog);
-		XLogCtl->InitializedFrom = EndOfLog;
-	}
-	pg_atomic_write_u64(&XLogCtl->InitializeReserved, pg_atomic_read_u64(&XLogCtl->InitializedUpTo));
-
-	/*
-	 * Update local and shared status.  This is OK to do without any locks
-	 * because no other process can be reading or writing WAL yet.
-	 */
-	LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
-	pg_atomic_write_u64(&XLogCtl->logInsertResult, EndOfLog);
-	pg_atomic_write_u64(&XLogCtl->logWriteResult, EndOfLog);
-	pg_atomic_write_u64(&XLogCtl->logFlushResult, EndOfLog);
-	XLogCtl->LogwrtRqst.Write = EndOfLog;
-	XLogCtl->LogwrtRqst.Flush = EndOfLog;
-
 	/*
 	 * Preallocate additional log files, if wanted.
 	 */
@@ -6296,16 +6311,6 @@ StartupXLOG(void)
 	/* Shut down xlogreader */
 	ShutdownWalRecovery();
 
-	/* Enable WAL writes for this backend only. */
-	LocalSetXLogInsertAllowed();
-
-	/* If necessary, write overwrite-contrecord before doing anything else */
-	if (!XLogRecPtrIsInvalid(abortedRecPtr))
-	{
-		Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
-		CreateOverwriteContrecordRecord(abortedRecPtr, missingContrecPtr, newTLI);
-	}
-
 	/*
 	 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
 	 * record before resource manager writes cleanup WAL records or checkpoint
diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 6ce979f2d8b..9b024697026 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -3174,8 +3174,7 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
 			 * complete record, so if we did this, we would later create an
 			 * overwrite contrecord in the wrong place, breaking everything.
 			 */
-			if (!ArchiveRecoveryRequested &&
-				!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
+			if (!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
 			{
 				abortedRecPtr = xlogreader->abortedRecPtr;
 				missingContrecPtr = xlogreader->missingContrecPtr;
-- 
2.49.0

