From 619c8bc3c2bd3a2cf24a283dcfc9666b9769b50c Mon Sep 17 00:00:00 2001
From: Michael Paquier <michael@paquier.xyz>
Date: Tue, 26 Sep 2023 15:23:37 +0900
Subject: [PATCH v4 2/3] Make WAL replay more robust on OOM failures

This takes advantage of the new error facility for WAL readers, allowing
WAL replay to loop when an out-of-memory failure happens while reading a
record.  Such failures were the origin of potential data loss scenarios;
this change makes crash recovery more robust by acting the same way as a
standby here: each time a record cannot be read because of an OOM, loop
and try to read the record again.
---
 src/backend/access/transam/xlogrecovery.c | 75 ++++++++++++++++-------
 1 file changed, 52 insertions(+), 23 deletions(-)

diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c
index 68100bfa4a..ed5ac06938 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -3067,29 +3067,50 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
 		record = XLogPrefetcherReadRecord(xlogprefetcher, &errordata);
 		if (record == NULL)
 		{
-			/*
-			 * When we find that WAL ends in an incomplete record, keep track
-			 * of that record.  After recovery is done, we'll write a record
-			 * to indicate to downstream WAL readers that that portion is to
-			 * be ignored.
-			 *
-			 * However, when ArchiveRecoveryRequested = true, we're going to
-			 * switch to a new timeline at the end of recovery. We will only
-			 * copy WAL over to the new timeline up to the end of the last
-			 * complete record, so if we did this, we would later create an
-			 * overwrite contrecord in the wrong place, breaking everything.
-			 */
-			if (!ArchiveRecoveryRequested &&
-				!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
+			switch (errordata.code)
 			{
-				abortedRecPtr = xlogreader->abortedRecPtr;
-				missingContrecPtr = xlogreader->missingContrecPtr;
-			}
+				case XLOG_READER_NO_ERROR:
+					/* Possible when XLogPageRead() has failed */
+					Assert(!errordata.message);
+					/* FALLTHROUGH */
 
-			if (readFile >= 0)
-			{
-				close(readFile);
-				readFile = -1;
+				case XLOG_READER_INVALID_DATA:
+
+					/*
+					 * When we find that WAL ends in an incomplete record,
+					 * keep track of that record.  After recovery is done,
+					 * we'll write a record to indicate to downstream WAL
+					 * readers that that portion is to be ignored.
+					 *
+					 * However, when ArchiveRecoveryRequested = true, we're
+					 * going to switch to a new timeline at the end of
+					 * recovery. We will only copy WAL over to the new
+					 * timeline up to the end of the last complete record, so
+					 * if we did this, we would later create an overwrite
+					 * contrecord in the wrong place, breaking everything.
+					 */
+					if (!ArchiveRecoveryRequested &&
+						!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
+					{
+						abortedRecPtr = xlogreader->abortedRecPtr;
+						missingContrecPtr = xlogreader->missingContrecPtr;
+					}
+
+					if (readFile >= 0)
+					{
+						close(readFile);
+						readFile = -1;
+					}
+					break;
+				case XLOG_READER_OOM:
+
+					/*
+					 * If we failed because of an out-of-memory problem, just
+					 * give up and retry recovery later.  It may be possible
+					 * that the WAL record to decode required a larger memory
+					 * allocation than what the host can offer.
+					 */
+					break;
 			}
 
 			/*
@@ -3147,9 +3168,12 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
 			 * WAL from the archive, even if pg_wal is completely empty, but
 			 * we'd have no idea how far we'd have to replay to reach
 			 * consistency.  So err on the safe side and give up.
+			 *
+			 * It may be possible that the record was not decoded because of
+			 * an out-of-memory failure.  In this case, just loop.
 			 */
 			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
-				!fetching_ckpt)
+				!fetching_ckpt && errordata.code != XLOG_READER_OOM)
 			{
 				ereport(DEBUG1,
 						(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
@@ -3173,9 +3197,14 @@ ReadRecord(XLogPrefetcher *xlogprefetcher, int emode,
 				continue;
 			}
 
-			/* In standby mode, loop back to retry. Otherwise, give up. */
+			/*
+			 * In standby mode, or if reading the WAL record failed on an
+			 * out-of-memory error, loop back and retry.  Otherwise, give up.
+			 */
 			if (StandbyMode && !CheckForStandbyTrigger())
 				continue;
+			else if (errordata.code == XLOG_READER_OOM)
+				continue;
 			else
 				return NULL;
 		}
-- 
2.40.1

