From a8ecdf34b6c39a07012059eddff503b8259b8423 Mon Sep 17 00:00:00 2001
From: Vlad Lesin <vladlesin@gmail.com>
Date: Thu, 14 May 2026 23:22:01 +0300
Subject: [PATCH 5/5] Fix ProcKill lock-group vs procLatch recycle race

After the preceding refactor, every freelist push happens in a single
ProcGlobal->freeProcsLock critical section near the end of ProcKill().
But SwitchBackToLocalLatch() and DisownLatch(&proc->procLatch) still
ran below that point, so two windows remained in which a PGPROC could
appear on a freelist while its procLatch still had owner_pid != 0:

  * follower-returns-leader: a follower's consolidated push of the
    leader's PGPROC races the leader's own DisownLatch in a different
    process; the lwlock-held decision and the freeProcsLock-held push
    do not order the leader's own DisownLatch.

  * leader-returns-self: a leader that outlives the last follower
    pushes its own PGPROC above the DisownLatch of that same slot.

A newly-forked backend that picks up the recycled slot then calls
OwnLatch() and PANICs with "latch already owned by PID ...".

Hoist SwitchBackToLocalLatch() and DisownLatch(&MyProc->procLatch)
to the very top of ProcKill(), while leaving
pgstat_reset_wait_event_storage() in its existing position after the
lock-group block so wait_event_info remains observable in PGPROC
while a backend may be inspected there (e.g. parked at an injection
point).  The freelist-push consolidation introduced in the preceding
commit gives a single named place where the slot becomes visible to
other backends; this commit ensures every such publication is below
the hoisted DisownLatch.

The invariant is now easy to state: by the time any PGPROC can be
observed on a freelist, its procLatch has already been disowned.
---
 src/backend/storage/lmgr/proc.c | 38 ++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 7201ad9caa8..da3bd3a6436 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -963,6 +963,22 @@ ProcKill(int code, Datum arg)
 	/* Cancel any pending condition variable sleep, too */
 	ConditionVariableCancelSleep();
 
+	/*
+	 * Reset MyLatch to the process local one and disown the shared latch.
+	 * DisownLatch must happen before our PGPROC can appear on a freelist: a
+	 * newly-forked backend that pops our slot and calls OwnLatch() would
+	 * PANIC on a still-owned latch.  The lock-group block below preserves
+	 * that order -- see its comment.
+	 *
+	 * pgstat_reset_wait_event_storage() is intentionally deferred until after
+	 * the lock-group block (and any injection points inside it) so that
+	 * wait_event_info remains visible in our PGPROC slot while we may be
+	 * observed there.  It is safe to defer because our slot is not yet on any
+	 * freelist at this point.
+	 */
+	SwitchBackToLocalLatch();
+	DisownLatch(&MyProc->procLatch);
+
 	proc = MyProc;
 	procgloballist = proc->procgloballist;
 
@@ -980,12 +996,10 @@ ProcKill(int code, Datum arg)
 	 * the source of a double push.)
 	 *
 	 * (b) A follower only sets push_leader when it observes an empty group
-	 * after removing itself, so the leader has already passed its own
-	 * lwlock-held dlist_delete by the time a pusher runs.
-	 *
-	 * NOTE: this commit does NOT establish the latch-ownership invariant that
-	 * the freelist pushes must follow DisownLatch -- see the next commit for
-	 * that.
+	 * after removing itself.  That can only happen if the leader has already
+	 * passed its own lwlock-held dlist_delete, which in turn happened after
+	 * the leader's DisownLatch above.  So by the time any pusher runs, the
+	 * PGPROC it is about to push has already disowned its latch.
 	 */
 	push_leader = false;
 	push_self = true;
@@ -1052,20 +1066,14 @@ ProcKill(int code, Datum arg)
 	INJECTION_POINT("prockill-pre-disown-latch", NULL);
 
 	/*
-	 * Reset MyLatch to the process local one.  This is so that signal
-	 * handlers et al can continue using the latch after the shared latch
-	 * isn't ours anymore.
-	 *
-	 * Similarly, stop reporting wait events to MyProc->wait_event_info.
-	 *
-	 * After that clear MyProc and disown the shared latch.
+	 * Stop reporting wait events to MyProc->wait_event_info now that we are
+	 * done with any injection-point waits.  Do this before zeroing MyProc so
+	 * pgstat internals do not try to dereference it.
 	 */
-	SwitchBackToLocalLatch();
 	pgstat_reset_wait_event_storage();
 
 	MyProc = NULL;
 	MyProcNumber = INVALID_PROC_NUMBER;
-	DisownLatch(&proc->procLatch);
 
 	/* Mark the proc no longer in use */
 	proc->pid = 0;
-- 
2.43.0

