From 02bbfc3b1cda477c41687972e20ac662e3483a5e Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Wed, 8 Jan 2025 14:09:28 -0500
Subject: [PATCH v2 7/7] Change shutdown sequence to terminate checkpointer
 last

The main motivation for this change is to have a process that can serialize
stats after all other processes have terminated. That already happens in
checkpointer, even though walsenders can be active longer.

The only reason the current state does not actively cause problems is that
walsender currently generate any stats. However, there is a patch to change
that.  Another need for this originates in the AIO patchset, where IO workers
can emit stats in some edge cases and need to run while the shutdown
checkpoint is being written.

This commit changes the shutdown sequence so checkpointer is signalled to
trigger writing the shutdown checkpoint without terminating it. Once
checkpointer wrote the checkpoint it will wait for a termination
signal.

Postmaster now triggers the shutdown checkpoint where we previously did so by
terminating checkpointer. Checkpointer is terminated after all other children
have been terminated, tracked using the new PM_WAIT_CHECKPOINTER PMState.

Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Discussion: https://postgr.es/m/kgng5nrvnlv335evmsuvpnh354rw7qyazl73kdysev2cr2v5zu@m3cfzxicm5kp
---
 src/include/storage/pmsignal.h                |   3 +-
 src/backend/postmaster/checkpointer.c         | 114 +++++++++++----
 src/backend/postmaster/postmaster.c           | 136 ++++++++++++------
 .../utils/activity/wait_event_names.txt       |   1 +
 4 files changed, 180 insertions(+), 74 deletions(-)

diff --git a/src/include/storage/pmsignal.h b/src/include/storage/pmsignal.h
index 3fbe5bf1136..d84a383047e 100644
--- a/src/include/storage/pmsignal.h
+++ b/src/include/storage/pmsignal.h
@@ -40,9 +40,10 @@ typedef enum
 	PMSIGNAL_BACKGROUND_WORKER_CHANGE,	/* background worker state change */
 	PMSIGNAL_START_WALRECEIVER, /* start a walreceiver */
 	PMSIGNAL_ADVANCE_STATE_MACHINE, /* advance postmaster's state machine */
+	PMSIGNAL_XLOG_IS_SHUTDOWN,	/* ShutdownXLOG() completed */
 } PMSignalReason;
 
-#define NUM_PMSIGNALS (PMSIGNAL_ADVANCE_STATE_MACHINE+1)
+#define NUM_PMSIGNALS (PMSIGNAL_XLOG_IS_SHUTDOWN+1)
 
 /*
  * Reasons why the postmaster would send SIGQUIT to its children.
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index cd76315bfa4..411aa66aa61 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -51,6 +51,7 @@
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
+#include "storage/pmsignal.h"
 #include "storage/proc.h"
 #include "storage/procsignal.h"
 #include "storage/shmem.h"
@@ -141,6 +142,7 @@ double		CheckPointCompletionTarget = 0.9;
  * Private state
  */
 static bool ckpt_active = false;
+static volatile sig_atomic_t ShutdownXLOGPending = false;
 
 /* these values are valid when ckpt_active is true: */
 static pg_time_t ckpt_start_time;
@@ -159,6 +161,9 @@ static bool ImmediateCheckpointRequested(void);
 static bool CompactCheckpointerRequestQueue(void);
 static void UpdateSharedMemoryConfig(void);
 
+/* Signal handlers */
+static void ReqShutdownXLOG(SIGNAL_ARGS);
+
 
 /*
  * Main entry point for checkpointer process
@@ -188,7 +193,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
 	 * tell us it's okay to shut down (via SIGUSR2).
 	 */
 	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGINT, SIG_IGN);
+	pqsignal(SIGINT, ReqShutdownXLOG);
 	pqsignal(SIGTERM, SIG_IGN); /* ignore SIGTERM */
 	/* SIGQUIT handler was already set up by InitPostmasterChild */
 	pqsignal(SIGALRM, SIG_IGN);
@@ -211,8 +216,11 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
 	 * process during a normal shutdown, and since checkpointer is shut down
 	 * very late...
 	 *
-	 * Walsenders are shut down after the checkpointer, but currently don't
-	 * report stats. If that changes, we need a more complicated solution.
+	 * While e.g. walsenders are active after the shutdown checkpoint has been
+	 * written (and thus could produce more stats), checkpointer stays around
+	 * after the shutdown checkpoint has been written. postmaster will only
+	 * signal checkpointer to exit after all processes that could emit stats
+	 * have been shut down.
 	 */
 	before_shmem_exit(pgstat_before_server_shutdown, 0);
 
@@ -327,7 +335,7 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
 	ProcGlobal->checkpointerProc = MyProcNumber;
 
 	/*
-	 * Loop forever
+	 * Loop until we've been asked to write shutdown checkpoint or terminate.
 	 */
 	for (;;)
 	{
@@ -346,7 +354,10 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
 		 * Process any requests or signals received recently.
 		 */
 		AbsorbSyncRequests();
+
 		HandleCheckpointerInterrupts();
+		if (ShutdownXLOGPending || ShutdownRequestPending)
+			break;
 
 		/*
 		 * Detect a pending checkpoint request by checking whether the flags
@@ -517,8 +528,13 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
 
 			ckpt_active = false;
 
-			/* We may have received an interrupt during the checkpoint. */
+			/*
+			 * We may have received an interrupt during the checkpoint and the
+			 * latch might have been reset (e.g. in CheckpointWriteDelay).
+			 */
 			HandleCheckpointerInterrupts();
+			if (ShutdownXLOGPending || ShutdownRequestPending)
+				break;
 		}
 
 		/* Check for archive_timeout and switch xlog files if necessary. */
@@ -557,6 +573,56 @@ CheckpointerMain(char *startup_data, size_t startup_data_len)
 						 cur_timeout * 1000L /* convert to ms */ ,
 						 WAIT_EVENT_CHECKPOINTER_MAIN);
 	}
+
+	/*
+	 * From here on, elog(ERROR) should end with exit(1), not send control
+	 * back to the sigsetjmp block above.
+	 */
+	ExitOnAnyError = true;
+
+	if (ShutdownXLOGPending)
+	{
+		/*
+		 * Close down the database.
+		 *
+		 * Since ShutdownXLOG() creates restartpoint or checkpoint, and
+		 * updates the statistics, increment the checkpoint request and flush
+		 * out pending statistic.
+		 */
+		PendingCheckpointerStats.num_requested++;
+		ShutdownXLOG(0, 0);
+		pgstat_report_checkpointer();
+		pgstat_report_wal(true);
+
+		/*
+		 * Tell postmaster that we're done.
+		 */
+		SendPostmasterSignal(PMSIGNAL_XLOG_IS_SHUTDOWN);
+	}
+
+	/*
+	 * Wait until we're asked to shut down. By seperating the writing of the
+	 * shutdown checkpoint from checkpointer exiting, checkpointer can perform
+	 * some should-be-as-late-as-possible work like writing out stats.
+	 */
+	for (;;)
+	{
+		/* Clear any already-pending wakeups */
+		ResetLatch(MyLatch);
+
+		HandleCheckpointerInterrupts();
+
+		if (ShutdownRequestPending)
+			break;
+
+		(void) WaitLatch(MyLatch,
+						 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+						 0,
+						 WAIT_EVENT_CHECKPOINTER_SHUTDOWN);
+	}
+
+	/* Normal exit from the checkpointer is here */
+	proc_exit(0);				/* done */
 }
 
 /*
@@ -586,29 +652,6 @@ HandleCheckpointerInterrupts(void)
 		 */
 		UpdateSharedMemoryConfig();
 	}
-	if (ShutdownRequestPending)
-	{
-		/*
-		 * From here on, elog(ERROR) should end with exit(1), not send control
-		 * back to the sigsetjmp block above
-		 */
-		ExitOnAnyError = true;
-
-		/*
-		 * Close down the database.
-		 *
-		 * Since ShutdownXLOG() creates restartpoint or checkpoint, and
-		 * updates the statistics, increment the checkpoint request and flush
-		 * out pending statistic.
-		 */
-		PendingCheckpointerStats.num_requested++;
-		ShutdownXLOG(0, 0);
-		pgstat_report_checkpointer();
-		pgstat_report_wal(true);
-
-		/* Normal exit from the checkpointer is here */
-		proc_exit(0);			/* done */
-	}
 
 	/* Perform logging of memory contexts of this process */
 	if (LogMemoryContextPending)
@@ -729,6 +772,7 @@ CheckpointWriteDelay(int flags, double progress)
 	 * in which case we just try to catch up as quickly as possible.
 	 */
 	if (!(flags & CHECKPOINT_IMMEDIATE) &&
+		!ShutdownXLOGPending &&
 		!ShutdownRequestPending &&
 		!ImmediateCheckpointRequested() &&
 		IsCheckpointOnSchedule(progress))
@@ -857,6 +901,20 @@ IsCheckpointOnSchedule(double progress)
 }
 
 
+/* --------------------------------
+ *		signal handler routines
+ * --------------------------------
+ */
+
+/* SIGINT: set flag to trigger writing of shutdown checkpoint */
+static void
+ReqShutdownXLOG(SIGNAL_ARGS)
+{
+	ShutdownXLOGPending = true;
+	SetLatch(MyLatch);
+}
+
+
 /* --------------------------------
  *		communication with backends
  * --------------------------------
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index 58f2619c45f..87ddc7a2da0 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -335,6 +335,7 @@ typedef enum
 	PM_WAIT_XLOG_ARCHIVAL,		/* waiting for archiver and walsenders to
 								 * finish */
 	PM_WAIT_DEAD_END,			/* waiting for dead-end children to exit */
+	PM_WAIT_CHECKPOINTER,		/* waiting for checkpointer to shut down */
 	PM_NO_CHILDREN,				/* all important children have exited */
 } PMState;
 
@@ -2355,35 +2356,16 @@ process_pm_child_exit(void)
 		{
 			ReleasePostmasterChildSlot(CheckpointerPMChild);
 			CheckpointerPMChild = NULL;
-			if (EXIT_STATUS_0(exitstatus) && pmState == PM_WAIT_XLOG_SHUTDOWN)
+			if (EXIT_STATUS_0(exitstatus) && pmState == PM_WAIT_CHECKPOINTER)
 			{
 				/*
 				 * OK, we saw normal exit of the checkpointer after it's been
-				 * told to shut down.  We expect that it wrote a shutdown
-				 * checkpoint.  (If for some reason it didn't, recovery will
-				 * occur on next postmaster start.)
+				 * told to shut down.  We know it wrote a shutdown checkpoint,
+				 * otherwise we'd still be in PM_SHUTDOWN.
 				 *
-				 * At this point we should have no normal backend children
-				 * left (else we'd not be in PM_WAIT_XLOG_SHUTDOWN state) but
-				 * we might have dead-end children to wait for.
-				 *
-				 * If we have an archiver subprocess, tell it to do a last
-				 * archive cycle and quit. Likewise, if we have walsender
-				 * processes, tell them to send any remaining WAL and quit.
-				 */
-				Assert(Shutdown > NoShutdown);
-
-				/* Waken archiver for the last time */
-				if (PgArchPMChild != NULL)
-					signal_child(PgArchPMChild, SIGUSR2);
-
-				/*
-				 * Waken walsenders for the last time. No regular backends
-				 * should be around anymore.
+				 * At this point we have no children left.
 				 */
-				SignalChildren(SIGUSR2, btmask(B_WAL_SENDER));
-
-				UpdatePMState(PM_WAIT_XLOG_ARCHIVAL);
+				UpdatePMState(PM_NO_CHILDREN);
 			}
 			else
 			{
@@ -2938,9 +2920,9 @@ PostmasterStateMachine(void)
 				SignalChildren(SIGQUIT, btmask(B_DEAD_END_BACKEND));
 
 				/*
-				 * We already SIGQUIT'd walsenders and the archiver, if any,
-				 * when we started immediate shutdown or entered FatalError
-				 * state.
+				 * We already SIGQUIT'd archiver, checkpointer and walsenders,
+				 * if any, when we started immediate shutdown or entered
+				 * FatalError state.
 				 */
 			}
 			else
@@ -2954,10 +2936,10 @@ PostmasterStateMachine(void)
 				/* Start the checkpointer if not running */
 				if (CheckpointerPMChild == NULL)
 					CheckpointerPMChild = StartChildProcess(B_CHECKPOINTER);
-				/* And tell it to shut down */
+				/* And tell it to write the shutdown checkpoint */
 				if (CheckpointerPMChild != NULL)
 				{
-					signal_child(CheckpointerPMChild, SIGUSR2);
+					signal_child(CheckpointerPMChild, SIGINT);
 					UpdatePMState(PM_WAIT_XLOG_SHUTDOWN);
 				}
 				else
@@ -2986,16 +2968,16 @@ PostmasterStateMachine(void)
 	if (pmState == PM_WAIT_XLOG_ARCHIVAL)
 	{
 		/*
-		 * PM_WAIT_XLOG_ARCHIVAL state ends when there's no other children
-		 * than dead-end children left. There shouldn't be any regular
-		 * backends left by now anyway; what we're really waiting for is
-		 * walsenders and archiver.
+		 * PM_WAIT_XLOG_ARCHIVAL state ends when there's no children other
+		 * than checkpointer and dead-end children left. There shouldn't be
+		 * any regular backends left by now anyway; what we're really waiting
+		 * for is for walsenders and archiver to exit.
 		 */
-		if (CountChildren(btmask_all_except(B_LOGGER, B_DEAD_END_BACKEND)) == 0)
+		if (CountChildren(btmask_all_except(B_LOGGER, B_DEAD_END_BACKEND, B_CHECKPOINTER)) == 0)
 		{
 			UpdatePMState(PM_WAIT_DEAD_END);
 			ConfigurePostmasterWaitSet(false);
-			SignalChildren(SIGTERM, btmask_all_except(B_LOGGER));
+			SignalChildren(SIGTERM, btmask_all_except(B_LOGGER, B_CHECKPOINTER));
 		}
 	}
 
@@ -3003,29 +2985,52 @@ PostmasterStateMachine(void)
 	{
 		/*
 		 * PM_WAIT_DEAD_END state ends when all other children are gone except
-		 * for the logger.  During normal shutdown, all that remains are
-		 * dead-end backends, but in FatalError processing we jump straight
-		 * here with more processes remaining.  Note that they have already
-		 * been sent appropriate shutdown signals, either during a normal
-		 * state transition leading up to PM_WAIT_DEAD_END, or during
-		 * FatalError processing.
+		 * for the logger and checkpointer.  During normal shutdown, all that
+		 * remains are dead-end backends and checkpointer, but in FatalError
+		 * processing we jump straight here with more processes remaining.
+		 * Note that they have already been sent appropriate shutdown signals,
+		 * either during a normal state transition leading up to
+		 * PM_WAIT_DEAD_END, or during FatalError processing.
 		 *
 		 * The reason we wait is to protect against a new postmaster starting
 		 * conflicting subprocesses; this isn't an ironclad protection, but it
 		 * at least helps in the shutdown-and-immediately-restart scenario.
 		 */
-		if (CountChildren(btmask_all_except(B_LOGGER)) == 0)
+		if (CountChildren(btmask_all_except(B_LOGGER, B_CHECKPOINTER)) == 0)
 		{
 			/* These other guys should be dead already */
 			Assert(StartupPMChild == NULL);
 			Assert(WalReceiverPMChild == NULL);
 			Assert(WalSummarizerPMChild == NULL);
 			Assert(BgWriterPMChild == NULL);
-			Assert(CheckpointerPMChild == NULL);
 			Assert(WalWriterPMChild == NULL);
 			Assert(AutoVacLauncherPMChild == NULL);
 			Assert(SlotSyncWorkerPMChild == NULL);
 			/* syslogger is not considered here */
+
+			UpdatePMState(PM_WAIT_CHECKPOINTER);
+
+			/*
+			 * Now that everyone else is gone, tell checkpointer to shut down
+			 * too. That allows checkpointer to perform some last bits of
+			 * cleanup without other processes interfering.
+			 */
+			if (CheckpointerPMChild != NULL)
+				signal_child(CheckpointerPMChild, SIGUSR2);
+		}
+	}
+
+	if (pmState == PM_WAIT_CHECKPOINTER)
+	{
+		/*
+		 * PM_WAIT_CHECKPOINTER ends when, drumroll, checkpointer has shut
+		 * down. Note that checkpointer already has completed the shutdown
+		 * checkpoint, we just wait for it to do some last bits of cleanup and
+		 * then exit.
+		 */
+		if (CountChildren(btmask_all_except(B_LOGGER)) == 0)
+		{
+			Assert(CheckpointerPMChild == NULL);
 			UpdatePMState(PM_NO_CHILDREN);
 		}
 	}
@@ -3135,6 +3140,7 @@ pmstate_name(PMState state)
 			PM_TOSTR_CASE(PM_WAIT_XLOG_SHUTDOWN);
 			PM_TOSTR_CASE(PM_WAIT_XLOG_ARCHIVAL);
 			PM_TOSTR_CASE(PM_WAIT_DEAD_END);
+			PM_TOSTR_CASE(PM_WAIT_CHECKPOINTER);
 			PM_TOSTR_CASE(PM_NO_CHILDREN);
 	}
 #undef PM_TOSTR_CASE
@@ -3563,6 +3569,8 @@ ExitPostmaster(int status)
 static void
 process_pm_pmsignal(void)
 {
+	bool		request_state_update = false;
+
 	pending_pm_pmsignal = false;
 
 	ereport(DEBUG2,
@@ -3674,9 +3682,46 @@ process_pm_pmsignal(void)
 		WalReceiverRequested = true;
 	}
 
+	if (pmState == PM_WAIT_XLOG_SHUTDOWN &&
+		CheckPostmasterSignal(PMSIGNAL_XLOG_IS_SHUTDOWN))
+	{
+		/*
+		 * Checkpointer completed the shutdown checkpoint.
+		 *
+		 * If we have an archiver subprocess, tell it to do a last archive
+		 * cycle and quit. Likewise, if we have walsender processes, tell them
+		 * to send any remaining WAL and quit.
+		 */
+		Assert(Shutdown > NoShutdown);
+
+		/* Waken archiver for the last time */
+		if (PgArchPMChild != NULL)
+			signal_child(PgArchPMChild, SIGUSR2);
+
+		/*
+		 * Waken walsenders for the last time. No regular backends should be
+		 * around anymore.
+		 */
+		SignalChildren(SIGUSR2, btmask(B_WAL_SENDER));
+
+		UpdatePMState(PM_WAIT_XLOG_ARCHIVAL);
+
+		/*
+		 * Need to run PostmasterStateMachine() to check if we already can go
+		 * to the next state.
+		 */
+		request_state_update = true;
+	}
+
 	/*
 	 * Try to advance postmaster's state machine, if a child requests it.
-	 *
+	 */
+	if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE))
+	{
+		request_state_update = true;
+	}
+
+	/*
 	 * Be careful about the order of this action relative to this function's
 	 * other actions.  Generally, this should be after other actions, in case
 	 * they have effects PostmasterStateMachine would need to know about.
@@ -3684,7 +3729,7 @@ process_pm_pmsignal(void)
 	 * cannot have any (immediate) effect on the state machine, but does
 	 * depend on what state we're in now.
 	 */
-	if (CheckPostmasterSignal(PMSIGNAL_ADVANCE_STATE_MACHINE))
+	if (request_state_update)
 	{
 		PostmasterStateMachine();
 	}
@@ -3995,6 +4040,7 @@ bgworker_should_start_now(BgWorkerStartTime start_time)
 	switch (pmState)
 	{
 		case PM_NO_CHILDREN:
+		case PM_WAIT_CHECKPOINTER:
 		case PM_WAIT_DEAD_END:
 		case PM_WAIT_XLOG_ARCHIVAL:
 		case PM_WAIT_XLOG_SHUTDOWN:
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 0b53cba807d..e199f071628 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -56,6 +56,7 @@ AUTOVACUUM_MAIN	"Waiting in main loop of autovacuum launcher process."
 BGWRITER_HIBERNATE	"Waiting in background writer process, hibernating."
 BGWRITER_MAIN	"Waiting in main loop of background writer process."
 CHECKPOINTER_MAIN	"Waiting in main loop of checkpointer process."
+CHECKPOINTER_SHUTDOWN	"Waiting for checkpointer process to be terminated."
 LOGICAL_APPLY_MAIN	"Waiting in main loop of logical replication apply process."
 LOGICAL_LAUNCHER_MAIN	"Waiting in main loop of logical replication launcher process."
 LOGICAL_PARALLEL_APPLY_MAIN	"Waiting in main loop of logical replication parallel apply process."
-- 
2.45.2.746.g06e570c0df.dirty

