From f2f8ce7f495c6ac0e84847e532fcba2d6914b37d Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Mon, 8 Sep 2025 17:16:01 -0400
Subject: [PATCH v4 2/6] bufmgr: Don't lock buffer header in
 StrategyGetBuffer()

Previously StrategyGetBuffer() acquired the buffer header spinlock for every
buffer, whether it was reusable or not. If reusable, it'd be returned, with
the lock held, to GetVictimBuffer(), which then would pin the buffer with
PinBuffer_Locked(). That's somewhat violating the spirit of the guidelines for
holding spinlocks (i.e. that they are only held for a few lines of consecutive
code) and necessitates using PinBuffer_Locked(), which scales worse than
PinBuffer() due to holding the spinlock.  This alone makes it worth changing
the code.

However, the main reason to change this is that a future commit will make
PinBuffer_Locked() slower (due to making UnlockBufHdr() slower), to gain
scalability for the much more common case of pinning a pre-existing buffer. By
pinning the buffer with a single atomic operation, iff the buffer is reusable,
we avoid any potential regression for miss-heavy workloads. There strictly are
fewer atomic operations for each potential buffer after this change.

The price for this improvement is that freelist.c needs two CAS loops and
needs to be able to set up the resource accounting for pinned buffers. The
latter is achieved by exposing a new function for that purpose from bufmgr.c,
that seems better than exposing the entire private refcount infrastructure.
The improvement seems worth the complexity.

Reviewed-by: Robert Haas <robertmhaas@gmail.com>
Discussion: https://postgr.es/m/fvfmkr5kk4nyex56ejgxj3uzi63isfxovp2biecb4bspbjrze7@az2pljabhnff
---
 src/include/storage/buf_internals.h   |   4 +
 src/backend/storage/buffer/bufmgr.c   |  44 +++++------
 src/backend/storage/buffer/freelist.c | 110 +++++++++++++++++++-------
 3 files changed, 105 insertions(+), 53 deletions(-)

diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index dfd614f7ca4..c1206a46aba 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -371,6 +371,8 @@ UnlockBufHdr(BufferDesc *desc, uint32 buf_state)
 	pg_atomic_write_u32(&desc->state, buf_state & (~BM_LOCKED));
 }
 
+extern uint32 WaitBufHdrUnlocked(BufferDesc *buf);
+
 /* in bufmgr.c */
 
 /*
@@ -425,6 +427,8 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co
 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
 										  IOContext io_context, BufferTag *tag);
 
+extern void TrackNewBufferPin(Buffer buf);
+
 /* solely to make it easier to write tests */
 extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
 extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index b5eebfb6990..0b841eb7838 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -518,7 +518,6 @@ static void PinBuffer_Locked(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf);
 static void UnpinBufferNoOwner(BufferDesc *buf);
 static void BufferSync(int flags);
-static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
 static int	SyncOneBuffer(int buf_id, bool skip_recently_used,
 						  WritebackContext *wb_context);
 static void WaitIO(BufferDesc *buf);
@@ -2325,7 +2324,7 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 
 	/*
 	 * Ensure, while the spinlock's not yet held, that there's a free refcount
-	 * entry, and a resource owner slot for the pin.
+	 * entry, and a resource owner slot for the pin. FIXME: Need to be updated
 	 */
 	ReservePrivateRefCountEntry();
 	ResourceOwnerEnlarge(CurrentResourceOwner);
@@ -2334,17 +2333,11 @@ GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
 again:
 
 	/*
-	 * Select a victim buffer.  The buffer is returned with its header
-	 * spinlock still held!
+	 * Select a victim buffer.  The buffer is returned pinned by this backend.
 	 */
 	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
 	buf = BufferDescriptorGetBuffer(buf_hdr);
 
-	Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
-
-	/* Pin the buffer and then release the buffer spinlock */
-	PinBuffer_Locked(buf_hdr);
-
 	/*
 	 * We shouldn't have any other pins for this buffer.
 	 */
@@ -3043,8 +3036,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
 		uint32		buf_state;
 		uint32		old_buf_state;
 
-		ref = NewPrivateRefCountEntry(b);
-
 		old_buf_state = pg_atomic_read_u32(&buf->state);
 		for (;;)
 		{
@@ -3091,6 +3082,8 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
 				break;
 			}
 		}
+
+		TrackNewBufferPin(b);
 	}
 	else
 	{
@@ -3110,11 +3103,12 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
 		 * cannot meddle with that.
 		 */
 		result = (pg_atomic_read_u32(&buf->state) & BM_VALID) != 0;
+
+		Assert(ref->refcount > 0);
+		ref->refcount++;
+		ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
 	}
 
-	ref->refcount++;
-	Assert(ref->refcount > 0);
-	ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
 	return result;
 }
 
@@ -3143,8 +3137,6 @@ PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy,
 static void
 PinBuffer_Locked(BufferDesc *buf)
 {
-	Buffer		b;
-	PrivateRefCountEntry *ref;
 	uint32		buf_state;
 
 	/*
@@ -3169,12 +3161,7 @@ PinBuffer_Locked(BufferDesc *buf)
 	buf_state += BUF_REFCOUNT_ONE;
 	UnlockBufHdr(buf, buf_state);
 
-	b = BufferDescriptorGetBuffer(buf);
-
-	ref = NewPrivateRefCountEntry(b);
-	ref->refcount++;
-
-	ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
+	TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
 }
 
 /*
@@ -3289,6 +3276,17 @@ UnpinBufferNoOwner(BufferDesc *buf)
 	}
 }
 
+inline void
+TrackNewBufferPin(Buffer buf)
+{
+	PrivateRefCountEntry *ref;
+
+	ref = NewPrivateRefCountEntry(buf);
+	ref->refcount++;
+
+	ResourceOwnerRememberBuffer(CurrentResourceOwner, buf);
+}
+
 #define ST_SORT sort_checkpoint_bufferids
 #define ST_ELEMENT_TYPE CkptSortItem
 #define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
@@ -6242,7 +6240,7 @@ LockBufHdr(BufferDesc *desc)
  * Obviously the buffer could be locked by the time the value is returned, so
  * this is primarily useful in CAS style loops.
  */
-static uint32
+pg_noinline uint32
 WaitBufHdrUnlocked(BufferDesc *buf)
 {
 	SpinDelayStatus delayStatus;
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 7d59a92bd1a..9d9fb0471a0 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -164,8 +164,8 @@ ClockSweepTick(void)
  *
  *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
  *
- *	To ensure that no one else can pin the buffer before we do, we must
- *	return the buffer with the buffer header spinlock still held.
+ *	The buffer is pinned before returning, but resource ownership is the
+ *	responsibility of the caller.
  */
 BufferDesc *
 StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
@@ -173,7 +173,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	BufferDesc *buf;
 	int			bgwprocno;
 	int			trycounter;
-	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */
 
 	*from_ring = false;
 
@@ -228,44 +227,74 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_r
 	trycounter = NBuffers;
 	for (;;)
 	{
+		uint32		old_buf_state;
+		uint32		local_buf_state;
+
 		buf = GetBufferDescriptor(ClockSweepTick());
 
 		/*
 		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
 		 * it; decrement the usage_count (unless pinned) and keep scanning.
 		 */
-		local_buf_state = LockBufHdr(buf);
 
-		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+		old_buf_state = pg_atomic_read_u32(&buf->state);
+
+		for (;;)
 		{
+			local_buf_state = old_buf_state;
+
+			if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0)
+			{
+				if (--trycounter == 0)
+				{
+					/*
+					 * We've scanned all the buffers without making any state
+					 * changes, so all the buffers are pinned (or were when we
+					 * looked at them). We could hope that someone will free
+					 * one eventually, but it's probably better to fail than
+					 * to risk getting stuck in an infinite loop.
+					 */
+					elog(ERROR, "no unpinned buffers available");
+				}
+				break;
+			}
+
+			if (unlikely(local_buf_state & BM_LOCKED))
+			{
+				old_buf_state = WaitBufHdrUnlocked(buf);
+				continue;
+			}
+
 			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
 			{
 				local_buf_state -= BUF_USAGECOUNT_ONE;
 
-				trycounter = NBuffers;
+				if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+												   local_buf_state))
+				{
+					trycounter = NBuffers;
+					break;
+				}
 			}
 			else
 			{
-				/* Found a usable buffer */
-				if (strategy != NULL)
-					AddBufferToRing(strategy, buf);
-				*buf_state = local_buf_state;
-				return buf;
+				local_buf_state += BUF_REFCOUNT_ONE;
+
+				if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+												   local_buf_state))
+				{
+					/* Found a usable buffer */
+					if (strategy != NULL)
+						AddBufferToRing(strategy, buf);
+					*buf_state = local_buf_state;
+
+					TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
+
+					return buf;
+				}
 			}
+
 		}
-		else if (--trycounter == 0)
-		{
-			/*
-			 * We've scanned all the buffers without making any state changes,
-			 * so all the buffers are pinned (or were when we looked at them).
-			 * We could hope that someone will free one eventually, but it's
-			 * probably better to fail than to risk getting stuck in an
-			 * infinite loop.
-			 */
-			UnlockBufHdr(buf, local_buf_state);
-			elog(ERROR, "no unpinned buffers available");
-		}
-		UnlockBufHdr(buf, local_buf_state);
 	}
 }
 
@@ -621,6 +650,7 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 {
 	BufferDesc *buf;
 	Buffer		bufnum;
+	uint32		old_buf_state;
 	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */
 
 
@@ -647,14 +677,34 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 	 * shouldn't re-use it.
 	 */
 	buf = GetBufferDescriptor(bufnum - 1);
-	local_buf_state = LockBufHdr(buf);
-	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
-		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
+
+	old_buf_state = pg_atomic_read_u32(&buf->state);
+
+	for (;;)
 	{
-		*buf_state = local_buf_state;
-		return buf;
+		local_buf_state = old_buf_state;
+
+		if (BUF_STATE_GET_REFCOUNT(local_buf_state) != 0
+			|| BUF_STATE_GET_USAGECOUNT(local_buf_state) > 1)
+			break;
+
+		if (unlikely(local_buf_state & BM_LOCKED))
+		{
+			old_buf_state = WaitBufHdrUnlocked(buf);
+			continue;
+		}
+
+		local_buf_state += BUF_REFCOUNT_ONE;
+
+		if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+										   local_buf_state))
+		{
+			*buf_state = local_buf_state;
+
+			TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
+			return buf;
+		}
 	}
-	UnlockBufHdr(buf, local_buf_state);
 
 	/*
 	 * Tell caller to allocate a new buffer with the normal allocation
-- 
2.48.1.76.g4e746b1a31.dirty

