From 22b0e8fb9c778fbfdc945a647f82f6bbd8d6ec0a Mon Sep 17 00:00:00 2001
From: Dipesh Pandit <dipesh.pandit@enterprisedb.com>
Date: Wed, 30 Jun 2021 14:05:58 +0530
Subject: [PATCH] mitigate directory scan for WAL archiver

WAL archiver scans the status directory to identify the next WAL file
that needs to be archived. This directory scan can be minimised by
maintaining the log segment number of current file which is being
archived and incrementing it by '1' to get the next WAL file.
Archiver can check the availability of next file and in case if the
file is not available then it should fall-back to directory scan to
get the oldest WAL file.

If there is a timeline switch then backend sends a notification to
archiver. Archiver registers the timeline switch and performs a full
directory scan to make sure that archiving history files takes
precedence over archiving WAL files
---
 src/backend/access/transam/xlog.c        |   6 ++
 src/backend/access/transam/xlogarchive.c |  10 +++
 src/backend/postmaster/pgarch.c          | 124 ++++++++++++++++++++++++++++---
 src/include/access/xlogarchive.h         |   1 +
 src/include/postmaster/pgarch.h          |   1 +
 5 files changed, 133 insertions(+), 9 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index c7c928f..40969a9 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8130,6 +8130,12 @@ StartupXLOG(void)
 	WalSndWakeup();
 
 	/*
+	 * If archiver is active, send notification that timeline has switched.
+	 */
+	if (XLogArchivingActive() && ArchiveRecoveryRequested)
+		XLogArchiveNotifyTLISwitch();
+
+	/*
 	 * If this was a promotion, request an (online) checkpoint now. This isn't
 	 * required for consistency, but the last restartpoint might be far back,
 	 * and in case of a crash, recovering from it might take a longer than is
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index 26b023e..1968872 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -507,6 +507,16 @@ XLogArchiveNotifySeg(XLogSegNo segno)
 }
 
 /*
+ * Signal archiver to notify timeline switch
+ */
+void
+XLogArchiveNotifyTLISwitch(void)
+{
+	if (IsUnderPostmaster)
+		PgArchNotifyTLISwitch();
+}
+
+/*
  * XLogArchiveForceDone
  *
  * Emit notification forcibly that an XLOG segment file has been successfully
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 74a7d7c..e23b9bc 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -90,12 +90,22 @@ static PgArchData *PgArch = NULL;
  * Flags set by interrupt handlers for later service in the main loop.
  */
 static volatile sig_atomic_t ready_to_stop = false;
+static volatile sig_atomic_t timeline_switch = false;
+
+/*
+ * Log segment number to get next WAL file in a sequence.
+ */
+static XLogSegNo nextLogSegNo = 0;
+
+/* Flag to specify a full directory scan to find next log file */
+static bool dirScan = true;
 
 /* ----------
  * Local function forward declarations
  * ----------
  */
 static void pgarch_waken_stop(SIGNAL_ARGS);
+static void pgarch_timeline_switch(SIGNAL_ARGS);
 static void pgarch_MainLoop(void);
 static void pgarch_ArchiverCopyLoop(void);
 static bool pgarch_archiveXlog(char *xlog);
@@ -169,10 +179,11 @@ PgArchiverMain(void)
 {
 	/*
 	 * Ignore all signals usually bound to some action in the postmaster,
-	 * except for SIGHUP, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT.
+	 * except for SIGHUP, SIGINT, SIGTERM, SIGUSR1, SIGUSR2, and SIGQUIT.
 	 */
 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGINT, SIG_IGN);
+	/* Archiver is notified by backend if there is a timeline switch */
+	pqsignal(SIGINT, pgarch_timeline_switch);
 	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
 	/* SIGQUIT handler was already set up by InitPostmasterChild */
 	pqsignal(SIGALRM, SIG_IGN);
@@ -221,6 +232,23 @@ PgArchWakeup(void)
 		SetLatch(&ProcGlobal->allProcs[arch_pgprocno].procLatch);
 }
 
+/*
+ * Called by backend process to notify a timeline switch.
+ */
+void
+PgArchNotifyTLISwitch(void)
+{
+	int			arch_pgprocno = PgArch->pgprocno;
+
+	if (arch_pgprocno != INVALID_PGPROCNO)
+	{
+		int		archiver_pid = ProcGlobal->allProcs[arch_pgprocno].pid;
+
+		if (kill(archiver_pid, SIGINT) < 0)
+			elog(ERROR, "could not notify timeline change to archiver");
+	}
+}
+
 
 /* SIGUSR2 signal handler for archiver process */
 static void
@@ -236,6 +264,23 @@ pgarch_waken_stop(SIGNAL_ARGS)
 }
 
 /*
+ * Interrupt handler for archiver
+ *
+ * There is a timeline switch and we have been notified by backend.
+ */
+static void
+pgarch_timeline_switch(SIGNAL_ARGS)
+{
+	int			save_errno = errno;
+
+	/* Set the flag to register a timeline switch */
+	timeline_switch = true;
+	SetLatch(MyLatch);
+
+	errno = save_errno;
+}
+
+/*
  * pgarch_MainLoop
  *
  * Main loop for archiver
@@ -336,6 +381,13 @@ pgarch_ArchiverCopyLoop(void)
 		int			failures = 0;
 		int			failures_orphan = 0;
 
+		if (timeline_switch)
+		{
+			/* Perform a full directory scan in next cycle */
+			dirScan = true;
+			timeline_switch = false;
+		}
+
 		for (;;)
 		{
 			struct stat stat_buf;
@@ -411,6 +463,10 @@ pgarch_ArchiverCopyLoop(void)
 				/* successful */
 				pgarch_archiveDone(xlog);
 
+				/* Increment log segment number to point to the next WAL file */
+				if (!IsTLHistoryFileName(xlog))
+					nextLogSegNo++;
+
 				/*
 				 * Tell the collector about the WAL file that we successfully
 				 * archived
@@ -596,29 +652,63 @@ pgarch_archiveXlog(char *xlog)
  * larger ID; the net result being that past timelines are given higher
  * priority for archiving.  This seems okay, or at least not obviously worth
  * changing.
+ *
+ * WAL files are generated in a specific order of log segment number. The
+ * directory scan for each WAL file can be minimised by identifying the next
+ * WAL file in the sequence. This can be achieved by maintaining log segment
+ * number and timeline ID corresponding to WAL file currently being archived.
+ * The log segment number of current WAL file can be incremented by '1' upon
+ * successful archival to point to the next WAL file. The full directory scan
+ * can be avoided by checking the availability of next WAL file.
+ *
+ * However, a full directory scan is performed in case if there is a timeline
+ * switch to make sure that archiving history file takes precedence over
+ * archiving WAL files from older timeline.
  */
 static bool
 pgarch_readyXlog(char *xlog)
 {
-	/*
-	 * open xlog status directory and read through list of xlogs that have the
-	 * .ready suffix, looking for earliest file. It is possible to optimise
-	 * this code, though only a single file is expected on the vast majority
-	 * of calls, so....
-	 */
+	char		basename[MAX_XFN_CHARS + 1];
+	char		xlogready[MAXPGPATH];
 	char		XLogArchiveStatusDir[MAXPGPATH];
 	DIR		   *rldir;
 	struct dirent *rlde;
+	struct stat	st;
+	static TimeLineID curFileTLI = 0;
 	bool		found = false;
 	bool		historyFound = false;
 
+	if (!dirScan)
+	{
+		/*
+		 * We already have the next anticipated log segment and timeline, check
+		 * if this WAL is ready to be archived. If yes, skip the directory
+		 * scan.
+		 */
+		XLogFileName(basename, curFileTLI, nextLogSegNo, wal_segment_size);
+		StatusFilePath(xlogready, basename, ".ready");
+
+		if (stat(xlogready, &st) == 0)
+		{
+			strcpy(xlog, basename);
+			return true;
+		}
+	}
+
+	/*
+	 * Fall-back to directory scan
+	 *
+	 * open xlog status directory and read through list of xlogs that have the
+	 * .ready suffix, looking for earliest file. It is possible to optimise
+	 * this code, though only a single file is expected on the vast majority
+	 * of calls, so....
+	 */
 	snprintf(XLogArchiveStatusDir, MAXPGPATH, XLOGDIR "/archive_status");
 	rldir = AllocateDir(XLogArchiveStatusDir);
 
 	while ((rlde = ReadDir(rldir, XLogArchiveStatusDir)) != NULL)
 	{
 		int			basenamelen = (int) strlen(rlde->d_name) - 6;
-		char		basename[MAX_XFN_CHARS + 1];
 		bool		ishistory;
 
 		/* Ignore entries with unexpected number of characters */
@@ -661,6 +751,22 @@ pgarch_readyXlog(char *xlog)
 				strcpy(xlog, basename);
 		}
 	}
+
+	/*
+	 * Found the oldest WAL, reset timeline ID and log segment number to generate
+	 * the next WAL file in the sequence.
+	 */
+	if (found && !historyFound)
+	{
+		XLogFromFileName(xlog, &curFileTLI, &nextLogSegNo, wal_segment_size);
+		ereport(LOG,
+				(errmsg("directory scan to archive write-ahead log file \"%s\"",
+						xlog)));
+
+		/* Disable full directory scan until there is a timeline switch */
+		dirScan = false;
+	}
+
 	FreeDir(rldir);
 
 	return found;
diff --git a/src/include/access/xlogarchive.h b/src/include/access/xlogarchive.h
index 3edd1a9..51e2796 100644
--- a/src/include/access/xlogarchive.h
+++ b/src/include/access/xlogarchive.h
@@ -25,6 +25,7 @@ extern void ExecuteRecoveryCommand(const char *command, const char *commandName,
 extern void KeepFileRestoredFromArchive(const char *path, const char *xlogfname);
 extern void XLogArchiveNotify(const char *xlog);
 extern void XLogArchiveNotifySeg(XLogSegNo segno);
+extern void XLogArchiveNotifyTLISwitch(void);
 extern void XLogArchiveForceDone(const char *xlog);
 extern bool XLogArchiveCheckDone(const char *xlog);
 extern bool XLogArchiveIsBusy(const char *xlog);
diff --git a/src/include/postmaster/pgarch.h b/src/include/postmaster/pgarch.h
index 1e47a14..e6d2e18 100644
--- a/src/include/postmaster/pgarch.h
+++ b/src/include/postmaster/pgarch.h
@@ -31,5 +31,6 @@ extern void PgArchShmemInit(void);
 extern bool PgArchCanRestart(void);
 extern void PgArchiverMain(void) pg_attribute_noreturn();
 extern void PgArchWakeup(void);
+extern void PgArchNotifyTLISwitch(void);
 
 #endif							/* _PGARCH_H */
-- 
1.8.3.1

