From 6654b0cd3073c4bed6c0cbf7a3b0e9fc7a4b9e2d Mon Sep 17 00:00:00 2001
From: Jehan-Guillaume de Rorthais <jgdr@dalibo.com>
Date: Wed, 1 Apr 2020 16:36:06 +0200
Subject: [PATCH 1/2] Fix WAL retention during production crash recovery

During crash recovery of a production cluster with archive_mode=on,
XLogArchiveCheckDone() was considering the cluster as inRecovery
without archive_mode=always. Because of this non-arcived WAL and
related .ready files were recycled or removed.
---
 src/backend/access/transam/xlog.c        | 48 +++++++++++++++++++-----
 src/backend/access/transam/xlogarchive.c | 21 +++++++----
 src/include/access/xlog.h                | 21 +++++++++++
 3 files changed, 74 insertions(+), 16 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 11e32733c4..26920c7f3e 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -221,8 +221,8 @@ static TimeLineID receiveTLI = 0;
 static bool lastFullPageWrites;
 
 /*
- * Local copy of SharedRecoveryInProgress variable. True actually means "not
- * known, need to check the shared state".
+ * This is false when SharedRecoveryState is RECOVERT_STATE_NONE.
+ * True actually means "not known, need to check the shared state".
  */
 static bool LocalRecoveryInProgress = true;
 
@@ -653,10 +653,10 @@ typedef struct XLogCtlData
 	TimeLineID	PrevTimeLineID;
 
 	/*
-	 * SharedRecoveryInProgress indicates if we're still in crash or archive
+	 * SharedRecoveryState indicates if we're either in crash or archive
 	 * recovery.  Protected by info_lck.
 	 */
-	bool		SharedRecoveryInProgress;
+	RecoveryState	SharedRecoveryState;
 
 	/*
 	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
@@ -4434,6 +4434,16 @@ ReadRecord(XLogReaderState *xlogreader, int emode,
 				updateMinRecoveryPoint = true;
 
 				UpdateControlFile();
+
+				/*
+				 * We update SharedRecoveryState while holding the lock on
+				 * ControlFileLock so both states are consistent in shared
+				 * memory.
+				 */
+				SpinLockAcquire(&XLogCtl->info_lck);
+				XLogCtl->SharedRecoveryState = RECOVERT_STATE_ARCHIVE;
+				SpinLockRelease(&XLogCtl->info_lck);
+
 				LWLockRelease(ControlFileLock);
 
 				CheckRecoveryConsistency();
@@ -5166,10 +5176,10 @@ XLOGShmemInit(void)
 	 * in additional info.)
 	 */
 	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
-	XLogCtl->SharedRecoveryInProgress = true;
 	XLogCtl->SharedHotStandbyActive = false;
 	XLogCtl->SharedPromoteIsTriggered = false;
 	XLogCtl->WalWriterSleeping = false;
+	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
 
 	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
 	SpinLockInit(&XLogCtl->info_lck);
@@ -7911,7 +7921,7 @@ StartupXLOG(void)
 	ControlFile->time = (pg_time_t) time(NULL);
 
 	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->SharedRecoveryInProgress = false;
+	XLogCtl->SharedRecoveryState = RECOVERY_STATE_NONE;
 	SpinLockRelease(&XLogCtl->info_lck);
 
 	UpdateControlFile();
@@ -8057,7 +8067,7 @@ RecoveryInProgress(void)
 		 */
 		volatile XLogCtlData *xlogctl = XLogCtl;
 
-		LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_NONE);
 
 		/*
 		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
@@ -8069,8 +8079,8 @@ RecoveryInProgress(void)
 		{
 			/*
 			 * If we just exited recovery, make sure we read TimeLineID and
-			 * RedoRecPtr after SharedRecoveryInProgress (for machines with
-			 * weak memory ordering).
+			 * RedoRecPtr after SharedRecoveryState (for machines with weak
+			 * memory ordering).
 			 */
 			pg_memory_barrier();
 			InitXLOGAccess();
@@ -8086,6 +8096,26 @@ RecoveryInProgress(void)
 	}
 }
 
+/*
+ * Returns current recovery state from shared memory.
+ *
+ * This returned state is accurate after StartupXLOG()
+ * finished.
+ *
+ * See details about RecoveryState in xlog.h
+ */
+RecoveryState
+GetRecoveryState(void)
+{
+	RecoveryState	retval;
+
+	SpinLockAcquire(&XLogCtl->info_lck);
+	retval = XLogCtl->SharedRecoveryState;
+	SpinLockRelease(&XLogCtl->info_lck);
+
+	return retval;
+}
+
 /*
  * Is HotStandby active yet? This is only important in special backends
  * since normal backends won't ever be able to connect until this returns
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index d62c12310a..d2ffc7e1a2 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -572,18 +572,25 @@ XLogArchiveCheckDone(const char *xlog)
 {
 	char		archiveStatusPath[MAXPGPATH];
 	struct stat stat_buf;
-	bool		inRecovery = RecoveryInProgress();
+	RecoveryState	recoveryState = GetRecoveryState();
+
+	/* The file is always deletable if archive_mode is "off". */
+	if (!XLogArchivingActive())
+		return true;
 
 	/*
-	 * The file is always deletable if archive_mode is "off".  On standbys
-	 * archiving is disabled if archive_mode is "on", and enabled with
-	 * "always".  On a primary, archiving is enabled if archive_mode is "on"
-	 * or "always".
+	 * On standbys, the file is deletable if archive_mode is not
+	 * "always".
 	 */
-	if (!((XLogArchivingActive() && !inRecovery) ||
-		  (XLogArchivingAlways() && inRecovery)))
+	if (recoveryState == RECOVERT_STATE_ARCHIVE && !XLogArchivingAlways())
 		return true;
 
+	/*
+	 * We are either:
+	 * * a primary with archive_mode set to "on" or "always"
+	 * * a standby with archive_mode set to "always".
+	 */
+
 	/* First check for .done --- this means archiver is done with it */
 	StatusFilePath(archiveStatusPath, xlog, ".done");
 	if (stat(archiveStatusPath, &stat_buf) == 0)
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index f60ed2d36c..157a3fb104 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -166,6 +166,26 @@ typedef enum WalLevel
 	WAL_LEVEL_LOGICAL
 } WalLevel;
 
+/*
+ * Recovery state.
+ *
+ * There's two different reasons to recover WAL: when standby mode is requested
+ * or after a crash to catchup with consistency.
+ *
+ * RECOVERY_STATE_CRASH is set when recovering after a crash.
+ * RECOVERT_STATE_ARCHIVE is set when archive recovery or standby is requested.
+ * RECOVERY_STATE_NONE means the cluster is in production.
+ *
+ * Note that even if archive recovery or standby is requested, recovery state
+ * starts first with RECOVERY_STATE_CRASH before switching to ARCHIVE_RECOVERING
+ */
+typedef enum RecoveryState
+{
+	RECOVERY_STATE_NONE = 0, /* currently in production */
+	RECOVERY_STATE_CRASH,    /* recovering from a crash */
+	RECOVERT_STATE_ARCHIVE   /* recovering archives as requested */
+} RecoveryState;
+
 extern PGDLLIMPORT int wal_level;
 
 /* Is WAL archiving enabled (always or only while server is running normally)? */
@@ -291,6 +311,7 @@ extern const char *xlog_identify(uint8 info);
 extern void issue_xlog_fsync(int fd, XLogSegNo segno);
 
 extern bool RecoveryInProgress(void);
+extern RecoveryState GetRecoveryState(void);
 extern bool HotStandbyActive(void);
 extern bool HotStandbyActiveInReplay(void);
 extern bool XLogInsertAllowed(void);
-- 
2.20.1

