From 74bbf209e18ce7ca9b135b94abe0077d0e0f8dfa Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 2 Dec 2025 18:44:54 -0500
Subject: [PATCH v8 01/10] bufmgr: Optimize & harmonize LockBufHdr(),
 LWLockWaitListLock()

The main optimization is for LockBufHdr() to delay initializing
SpinDelayStatus, similar to what LWLockWaitListLock() already did. The
initialization is sufficiently expensive, and buffer header lock acquisitions
sufficiently frequent, that it is worthwhile to instead have a fast path (via
a likely() branch) that does not initialize the SpinDelayStatus.

While LWLockWaitListLock() already had the aforementioned optimization, it did
not use likely(), and inspection of the generated assembly shows that this
indeed leads to worse code generation (also observed in a microbenchmark). Fix
that by adding the likely().
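
For illustration, the shape both functions now share is roughly the
following (a condensed sketch of the LockBufHdr() loop from the diff
below, not new code):

	while (true)
	{
		/* fast path: a single atomic fetch-or, no SpinDelayStatus set up */
		old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
		if (likely(!(old_buf_state & BM_LOCKED)))
			break;				/* got lock */

		/* slow path: only now pay for initializing the spin-delay state */
		{
			SpinDelayStatus delayStatus;

			init_local_spin_delay(&delayStatus);
			while (old_buf_state & BM_LOCKED)
			{
				perform_spin_delay(&delayStatus);
				old_buf_state = pg_atomic_read_u32(&desc->state);
			}
			finish_spin_delay(&delayStatus);
		}
		/* loop around and retry the atomic fetch-or */
	}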

While the LockBufHdr() improvement is a small gain on its own, it is mainly
aimed at preventing a regression from a future commit, which will require
additional locking to set hint bits.

While touching both, also make the comments more similar to each other.

Discussion: https://postgr.es/m/fvfmkr5kk4nyex56ejgxj3uzi63isfxovp2biecb4bspbjrze7@az2pljabhnff
---
 src/backend/storage/buffer/bufmgr.c | 36 +++++++++++++++++++++--------
 src/backend/storage/lmgr/lwlock.c   |  8 +++++--
 2 files changed, 33 insertions(+), 11 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index a768fb129ae..eb55102b0d7 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -6358,23 +6358,41 @@ rlocator_comparator(const void *p1, const void *p2)
 uint32
 LockBufHdr(BufferDesc *desc)
 {
-	SpinDelayStatus delayStatus;
 	uint32		old_buf_state;
 
 	Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
 
-	init_local_spin_delay(&delayStatus);
-
 	while (true)
 	{
-		/* set BM_LOCKED flag */
+		/*
+		 * Always try once to acquire the lock directly, without setting up
+		 * the spin-delay infrastructure. The work needed for that shows up
+		 * in profiles and is rarely necessary.
+		 */
 		old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
-		/* if it wasn't set before we're OK */
-		if (!(old_buf_state & BM_LOCKED))
-			break;
-		perform_spin_delay(&delayStatus);
+		if (likely(!(old_buf_state & BM_LOCKED)))
+			break;				/* got lock */
+
+		/* and then spin without atomic operations until lock is released */
+		{
+			SpinDelayStatus delayStatus;
+
+			init_local_spin_delay(&delayStatus);
+
+			while (old_buf_state & BM_LOCKED)
+			{
+				perform_spin_delay(&delayStatus);
+				old_buf_state = pg_atomic_read_u32(&desc->state);
+			}
+			finish_spin_delay(&delayStatus);
+		}
+
+		/*
+		 * Retry. The lock might of course already have been re-acquired by
+		 * the time we attempt to get it again.
+		 */
 	}
-	finish_spin_delay(&delayStatus);
+
 	return old_buf_state | BM_LOCKED;
 }
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index 255cfa8fa95..b839ace57cb 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -870,9 +870,13 @@ LWLockWaitListLock(LWLock *lock)
 
 	while (true)
 	{
-		/* always try once to acquire lock directly */
+		/*
+		 * Always try once to acquire the lock directly, without setting up
+		 * the spin-delay infrastructure. The work needed for that shows up
+		 * in profiles and is rarely necessary.
+		 */
 		old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED);
-		if (!(old_state & LW_FLAG_LOCKED))
+		if (likely(!(old_state & LW_FLAG_LOCKED)))
 			break;				/* got lock */
 
 		/* and then spin without atomic operations until lock is released */
-- 
2.48.1.76.g4e746b1a31.dirty

