From 482f56744a72c168492eefac2af9dcf5ee153524 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 8 Sep 2025 17:16:01 -0400
Subject: [PATCH v3 2/6] bufmgr: Don't lock buffer header in
 StrategyGetBuffer()

This is a small improvement on its own, due to not needing to hold the buffer
spinlock as often. But the main reason for this is to reduce the use of
PinBuffer_Locked() and LockBufHdr(), which a future commit will make a bit
more expensive (to make more common paths faster).

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/buf_internals.h   |   4 +
 src/backend/storage/buffer/bufmgr.c   |  44 +++++------
 src/backend/storage/buffer/freelist.c | 110 +++++++++++++++++++-------
 3 files changed, 105 insertions(+), 53 deletions(-)

diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index dfd614f7ca4..c1206a46aba 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -371,6 +371,8 @@ UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
 	pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
 }
 
+extern uint32 WaitBufHdrUnlocked(BufferDesc *buf);
+
 /* in bufmgr.c */
 
 /*
@@ -425,6 +427,8 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co
 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
 										  IOContext io_context, BufferTag *tag);
 
+extern void TrackNewBufferPin(Buffer buf);
+
 /* solely to make it easier to write tests */
 extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
 extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index b5eebfb6990..0b841eb7838 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -518,7 +518,6 @@ static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf);
 static void UnpinBufferNoOwner(BufferDesc *buf);
 static void BufferSync(int flags);
-static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
 static int	SyncOneBuffer(int buf_id, bool skip_recently_used,
 						  WritebackContext *wb_context);
 static void WaitIO(BufferDesc *buf);
@@ -2325,7 +2324,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 
 	/*
-	 * Ensure, while the spinlock's not yet held, that there's a free refcount
-	 * entry, and a resource owner slot for the pin.
+	 * Ensure that there's a free refcount entry and a resource owner slot for
+	 * the pin acquired inside StrategyGetBuffer().
 	 */
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
@@ -2334,17 +2333,11 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 again:
 
 	/*
-	 * Select a victim buffer.  The buffer is returned with its header
-	 * spinlock still held!
+	 * Select a victim buffer.  The buffer is returned pinned by this backend.
 	 */
 	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
 	buf = BufferDescriptorGetBuffer(buf_hdr);
 
-	Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
-
-	/* Pin the buffer and then release the buffer spinlock */
-	PinBuffer_Locked(buf_hdr);
-
 	/*
 	 * We shouldn't have any other pins for this buffer.
 	 */
@@ -3043,8 +3036,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
 		uint32		buf_state;
 		uint32		old_buf_state;
 
-		ref = NewPrivateRefCountEntry(b);
-
 		old_buf_state = pg_atomic_read_u32(&buf->state);
 		for (;;)
 		{
@@ -3091,6 +3082,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
 				break;
 			}
 		}
+
+		TrackNewBufferPin(b);
 	}
 	else
 	{
@@ -3110,11 +3103,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
 		 * cannot meddle with that.
 		 */
 		result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
+
+		Assert(ref->refcount > 0);
+		ref->refcount++;
+		ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
 	}
 
-	ref->refcount++;
-	Assert(ref->refcount > 0);
-	ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
 	return result;
 }
 
@@ -3143,8 +3137,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
 static void
 PinBuffer_Locked(BufferDesc *buf)
 {
-	Buffer		b;
-	PrivateRefCountEntry *ref;
 	uint32		buf_state;
 
 	/*
@@ -3169,12 +3161,7 @@ PinBuffer_Locked(BufferDesc *buf)
 	buf_state += BUF_REFCOUNT_ONE;
 	UnlockBufHdr(buf, buf_state);
 
-	b = BufferDescriptorGetBuffer(buf);
-
-	ref = NewPrivateRefCountEntry(b);
-	ref->refcount++;
-
-	ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
+	TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
 }
 
 /*
@@ -3289,6 +3276,17 @@ UnpinBufferNoOwner(BufferDesc *buf)
 	}
 }
 
+inline void
+TrackNewBufferPin(Buffer buf)
+{
+	PrivateRefCountEntry *ref;
+
+	ref = NewPrivateRefCountEntry(buf);
+	ref->refcount++;
+
+	ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
+}
+
 #define ST_SORT sort_checkpoint_bufferids
 #define ST_ELEMENT_TYPE CkptSortItem
 #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
@@ -6242,7 +6240,7 @@ LockBufHdr(BufferDesc *desc)
  * Obviously the buffer could be locked by the time the value is returned, so
  * this is primarily useful in CAS style loops.
  */
-static uint32
+pg_noinline uint32
 WaitBufHdrUnlocked(BufferDesc *buf)
 {
 	SpinDelayStatus delayStatus;
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7d59a92bd1a..9d9fb0471a0 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -164,8 +164,8 @@ ClockSweepTick(void)
  *
  *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
  *
- *	To ensure that no one else can pin the buffer before we do, we must
- *	return the buffer with the buffer header spinlock still held.
+ *	The buffer is returned pinned; the caller must previously have reserved
+ *	a private refcount entry and a resource owner slot for the pin.
  */
 BufferDesc *
 StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
@@ -173,7 +173,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	BufferDesc *buf;
 	int			bgwprocno;
 	int			trycounter;
-	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */
 
 	*from_ring = false;
 
@@ -228,44 +227,74 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	trycounter = NBuffers;
 	for (;;)
 	{
+		uint32		old_buf_state;
+		uint32		local_buf_state;
+
 		buf = GetBufferDescriptor(ClockSweepTick());
 
 		/*
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
 		 */
-		local_buf_state = LockBufHdr(buf);
 
-		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+		old_buf_state = pg_atomic_read_u32(&buf->state);
+
+		for (;;)
 		{
+			local_buf_state = old_buf_state;
+
+			if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
+			{
+				if (--trycounter == 0)
+				{
+					/*
+					 * We've scanned all the buffers without making any state
+					 * changes, so all the buffers are pinned (or were when we
+					 * looked at them). We could hope that someone will free
+					 * one eventually, but it's probably better to fail than
+					 * to risk getting stuck in an infinite loop.
+					 */
+					elog(ERROR, "no unpinned buffers available");
+				}
+				break;
+			}
+
+			if (unlikely(local_buf_state & BM_LOCKED))
+			{
+				old_buf_state = WaitBufHdrUnlocked(buf);
+				continue;
+			}
+
 			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
 			{
 				local_buf_state -= BUF_USAGECOUNT_ONE;
 
-				trycounter = NBuffers;
+				if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+												   local_buf_state))
+				{
+					trycounter = NBuffers;
+					break;
+				}
 			}
 			else
 			{
-				/* Found a usable buffer */
-				if (strategy != NULL)
-					AddBufferToRing(strategy, buf);
-				*buf_state = local_buf_state;
-				return buf;
+				local_buf_state += BUF_REFCOUNT_ONE;
+
+				if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+												   local_buf_state))
+				{
+					/* Found a usable buffer */
+					if (strategy != NULL)
+						AddBufferToRing(strategy, buf);
+					*buf_state = local_buf_state;
+
+					TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
+
+					return buf;
+				}
 			}
+
 		}
-		else if (--trycounter == 0)
-		{
-			/*
-			 * We've scanned all the buffers without making any state changes,
-			 * so all the buffers are pinned (or were when we looked at them).
-			 * We could hope that someone will free one eventually, but it's
-			 * probably better to fail than to risk getting stuck in an
-			 * infinite loop.
-			 */
-			UnlockBufHdr(buf, local_buf_state);
-			elog(ERROR, "no unpinned buffers available");
-		}
-		UnlockBufHdr(buf, local_buf_state);
 	}
 }
 
@@ -621,6 +650,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 {
 	BufferDesc *buf;
 	Buffer		bufnum;
+	uint32		old_buf_state;
 	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */
 
 
@@ -647,14 +677,34 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 	 * shouldn't re-use it.
 	 */
 	buf = GetBufferDescriptor(bufnum - 1);
-	local_buf_state = LockBufHdr(buf);
-	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
-		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
+
+	old_buf_state = pg_atomic_read_u32(&buf->state);
+
+	for (;;)
 	{
-		*buf_state = local_buf_state;
-		return buf;
+		local_buf_state = old_buf_state;
+
+		if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
+			|| BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
+			break;
+
+		if (unlikely(local_buf_state & BM_LOCKED))
+		{
+			old_buf_state = WaitBufHdrUnlocked(buf);
+			continue;
+		}
+
+		local_buf_state += BUF_REFCOUNT_ONE;
+
+		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+										   local_buf_state))
+		{
+			*buf_state = local_buf_state;
+
+			TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
+			return buf;
+		}
 	}
-	UnlockBufHdr(buf, local_buf_state);
 
 	/*
 	 * Tell caller to allocate a new buffer with the normal allocation
-- 
2.48.1.76.g4e746b1a31.dirty

