From 627fcf50e7e03e12dc1c61ec76c32fb48532796a Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 14 Jan 2025 00:25:01 -0500
Subject: [PATCH v5 5/7] postmaster: Commonalize FatalError paths

This includes some behavioral changes:

- Previously PM_WAIT_XLOG_ARCHIVAL wasn't handled in HandleFatalError(), that
  doesn't seem quite right.

- Previously a fatal error in PM_WAIT_XLOG_SHUTDOWN lead to jumping back to
  PM_WAIT_BACKENDS, no we go to PM_WAIT_DEAD_END. Jumping backwards doesn't
  seem quite right and we didn't do so when checkpointer failed to fork during
  a shutdown.

- Previously a checkpointer fork failure didn't call SetQuitSignalReason(),
  which would lead to quickdie() reporting
  "terminating connection because of unexpected SIGQUIT signal"
  which seems even worse than the PMQUIT_FOR_CRASH message. If I saw that in
  the log I'd suspect somebody outside of postgres sent SIGQUITs

Reviewed-by: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Discussion: https://postgr.es/m/kgng5nrvnlv335evmsuvpnh354rw7qyazl73kdysev2cr2v5zu@m3cfzxicm5kp
---
 src/backend/postmaster/postmaster.c | 73 ++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 16 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index a92fe4240b2..95eb989355c 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -2706,13 +2706,49 @@ HandleFatalError(QuitSignalReason reason, bool consider_sigabrt)
 
 	FatalError = true;
 
-	/* We now transit into a state of waiting for children to die */
-	if (pmState == PM_RECOVERY ||
-		pmState == PM_HOT_STANDBY ||
-		pmState == PM_RUN ||
-		pmState == PM_STOP_BACKENDS ||
-		pmState == PM_WAIT_XLOG_SHUTDOWN)
-		UpdatePMState(PM_WAIT_BACKENDS);
+	/*
+	 * Choose the appropriate new state to react to the fatal error. Unless we
+	 * were already in the process of shutting down, we go through
+	 * PM_WAIT_BACKEND. For errors during the shutdown sequence, we directly
+	 * switch to PM_WAIT_DEAD_END.
+	 */
+	switch (pmState)
+	{
+		case PM_INIT:
+			/* shouldn't have any children */
+			Assert(false);
+			break;
+		case PM_STARTUP:
+			/* should have been handled in process_pm_child_exit */
+			Assert(false);
+			break;
+
+			/* wait for children to die */
+		case PM_RECOVERY:
+		case PM_HOT_STANDBY:
+		case PM_RUN:
+		case PM_STOP_BACKENDS:
+			UpdatePMState(PM_WAIT_BACKENDS);
+			break;
+
+		case PM_WAIT_BACKENDS:
+			/* there might be more backends to wait for */
+			break;
+
+		case PM_WAIT_XLOG_SHUTDOWN:
+		case PM_WAIT_XLOG_ARCHIVAL:
+			/*
+			 * NB: Similar code exists in PostmasterStateMachine()'s handling
+			 * of FatalError in PM_STOP_BACKENDS/PM_WAIT_BACKENDS states.
+			 */
+			ConfigurePostmasterWaitSet(false);
+			UpdatePMState(PM_WAIT_DEAD_END);
+			break;
+
+		case PM_WAIT_DEAD_END:
+		case PM_NO_CHILDREN:
+			break;
+	}
 
 	/*
 	 * .. and if this doesn't happen quickly enough, now the clock is ticking
@@ -2942,15 +2978,18 @@ PostmasterStateMachine(void)
 			{
 				/*
 				 * Stop any dead-end children and stop creating new ones.
+				 *
+				 * NB: Similar code exists in HandleFatalErrors(), when the
+				 * error happens in pmState > PM_WAIT_BACKENDS.
 				 */
 				UpdatePMState(PM_WAIT_DEAD_END);
 				ConfigurePostmasterWaitSet(false);
 				SignalChildren(SIGQUIT, btmask(B_DEAD_END_BACKEND));
 
 				/*
-				 * We already SIGQUIT'd walsenders and the archiver, if any,
-				 * when we started immediate shutdown or entered FatalError
-				 * state.
+				 * We already SIGQUIT'd auxiliary processes (other than
+				 * logger), if any, when we started immediate shutdown or
+				 * entered FatalError state.
 				 */
 			}
 			else
@@ -2981,13 +3020,15 @@ PostmasterStateMachine(void)
 					 * We don't consult send_abort_for_crash here, as it's
 					 * unlikely that dumping cores would illuminate the reason
 					 * for checkpointer fork failure.
+					 *
+					 * XXX: It may be worth to introduce a different PMQUIT
+					 * value that signals that the cluster is in a bad state,
+					 * without a process having crashed. But right now this
+					 * path is very unlikely to be reached, so it isn't
+					 * obviously worthwhile adding a distinct error message in
+					 * quickdie().
 					 */
-					FatalError = true;
-					UpdatePMState(PM_WAIT_DEAD_END);
-					ConfigurePostmasterWaitSet(false);
-
-					/* Kill the walsenders and archiver too */
-					SignalChildren(SIGQUIT, btmask_all_except(B_LOGGER));
+					HandleFatalError(PMQUIT_FOR_CRASH, false);
 				}
 			}
 		}
-- 
2.48.1.76.g4e746b1a31.dirty

