From bbafe39f8345311baa8d6e8d751ba7dfa6f43666 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Thu, 18 Jun 2026 12:15:22 -0400
Subject: [PATCH v15 04/19] Simplify victim buffer selection

Refactor victim buffer selection so that StrategyGetBuffer() returns a
fully cleaned buffer with its former contents invalidated, moving the
flush/reject/invalidate logic out of GetVictimBuffer() and into
StrategyGetBuffer().

This is step one of a larger refactor to clarify the conditions and
rules for which we continue to look in the strategy ring to reuse a
buffer and when we go to shared buffers and look for a buffer.

GetVictimBuffer()'s control flow was a bit confusing. When using a
strategy, the strategy buffer could be rejected for a variety of reasons
ranging from concurrent users to strategy-specific buffer conditions.
In some of these cases, we would stop looking in the strategy ring and
go look in shared buffers. In others we would advance to the next buffer
in the ring. And even after we start looking in shared buffers, we may
go back to looking in the ring. These rules were not clear nor
documented.

There introduces a new invariant: once we stop looking for a buffer in the
strategy ring and start looking in shared buffers, we will continue
looking in shared buffers until we find a victim. This seems more clear.

This changes one behavior: previously, if we selected a victim from the
clock sweep and then failed to invalidate it (e.g. a concurrent pinner
or dirtier), we would loop back and search the strategy ring again. Now,
once we move from the ring to the clock sweep, we continue searching the
clock sweep until we find a victim. This is clearer and avoids
re-checking the ring for a buffer we already decided to evict from
shared buffers.

It also prevents occupying a spot in the strategy ring with an abandoned
shared buffer we failed to invalidate. After this commit, we do not add
any shared buffers to the strategy ring until we have fully claimed
them.

There is one other difference with how we search for strategy buffers:
now we only loop through the strategy ring once. If we don't find any
buffer that meets our criteria, we go look in shared buffers.
Previously, we would keep looping around the strategy ring.

XXX: should we keep looking in the strategy ring instead of going to
look in SB when a buffer can't be reused due to usage count or pin count
too high? This would be a departure from master but seems like a good
idea.
---
 src/backend/storage/buffer/bufmgr.c   | 107 +++++++++++---------------
 src/backend/storage/buffer/freelist.c | 102 +++++++++++++++++++-----
 src/include/storage/buf_internals.h   |  11 ++-
 3 files changed, 138 insertions(+), 82 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f36d1887970..c060ee46163 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2541,36 +2541,21 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr)
 	return true;
 }
 
-static Buffer
-GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
+/*
+ * Helper to claim a victim buffer -- which is invalidating its existing
+ * contents (including flushing the old contents first if needed).
+ * Returns true if it successfully claimed the victim buffer and false if it
+ * failed to do so. Buffer must already be pinned.
+ */
+bool
+ClaimVictimBuffer(BufferAccessStrategy strategy,
+				  BufferDesc *buf_hdr, Buffer bufnum, uint64 buf_state,
+				  bool from_ring, IOContext io_context)
 {
-	BufferDesc *buf_hdr;
-	Buffer		buf;
-	uint64		buf_state;
-	bool		from_ring;
-	bool		buf_valid;
-
-	/*
-	 * Ensure, before we pin a victim buffer, that there's a free refcount
-	 * entry and resource owner slot for the pin.
-	 */
-	ReservePrivateRefCountEntry();
-	ResourceOwnerEnlarge(CurrentResourceOwner);
-
-	/* we return here if a prospective victim buffer gets used concurrently */
-again:
-
-	/*
-	 * Select a victim buffer.  The buffer is returned pinned and owned by
-	 * this backend.
-	 */
-	buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
-	buf = BufferDescriptorGetBuffer(buf_hdr);
-
 	/*
 	 * We shouldn't have any other pins for this buffer.
 	 */
-	CheckBufferIsPinnedOnce(buf);
+	CheckBufferIsPinnedOnce(bufnum);
 
 	/*
 	 * If the buffer was dirty, try to write it out.  There is a race
@@ -2598,14 +2583,14 @@ again:
 		 * index pages, and the second one just happens to be trying to split
 		 * the page the first one got from StrategyGetBuffer.)
 		 */
-		if (!BufferLockConditional(buf, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE))
+		if (!BufferLockConditional(bufnum, buf_hdr, BUFFER_LOCK_SHARE_EXCLUSIVE))
 		{
 			/*
 			 * Someone else has locked the buffer, so give it up and loop back
 			 * to get another one.
 			 */
 			UnpinBuffer(buf_hdr);
-			goto again;
+			return false;
 		}
 
 		/*
@@ -2616,27 +2601,23 @@ again:
 		 * XLogNeedsFlush() is not meaningful for them.
 		 *
 		 * We need to hold the content lock in at least share-exclusive mode
-		 * to safely inspect the page LSN, so this couldn't have been done
-		 * inside StrategyGetBuffer().
+		 * to safely inspect the page LSN.
 		 */
 		if (strategy && from_ring &&
 			StrategyRejectBuffer(strategy, buf_hdr, buf_state))
 		{
-			UnlockReleaseBuffer(buf);
-			goto again;
+			UnlockReleaseBuffer(bufnum);
+			return false;
 		}
 
 		/* OK, do the I/O */
 		FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
-		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+		LockBuffer(bufnum, BUFFER_LOCK_UNLOCK);
 
 		ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
 									  &buf_hdr->tag);
 	}
 
-	/* Don't use a stale buf_state value after InvalidateVictimBuffer */
-	buf_valid = buf_state & BM_VALID;
-
 	/*
 	 * If the buffer has an entry in the buffer mapping table, delete it. This
 	 * can fail because another backend could have pinned or dirtied the
@@ -2645,43 +2626,47 @@ again:
 	if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
 	{
 		UnpinBuffer(buf_hdr);
-		goto again;
+		return false;
 	}
 
-	if (buf_valid)
-	{
-		/*
-		 * When a BufferAccessStrategy is in use, blocks evicted from shared
-		 * buffers are counted as IOOP_EVICT in the corresponding context
-		 * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
-		 * strategy in two cases: 1) while initially claiming buffers for the
-		 * strategy ring 2) to replace an existing strategy ring buffer
-		 * because it is pinned or in use and cannot be reused.
-		 *
-		 * Blocks evicted from buffers already in the strategy ring are
-		 * counted as IOOP_REUSE in the corresponding strategy context.
-		 *
-		 * At this point, we can accurately count evictions and reuses,
-		 * because we have successfully claimed the valid buffer and
-		 * invalidated its previous tenant. Previously, we may have been
-		 * forced to release the buffer due to concurrent pinners or erroring
-		 * out.
-		 */
-		pgstat_count_io_op(IOOBJECT_RELATION, io_context,
-						   from_ring ? IOOP_REUSE : IOOP_EVICT, 1, 0);
-	}
+	return true;
+}
+
+static Buffer
+GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
+{
+	Buffer		bufnum = InvalidBuffer;
+#ifdef USE_ASSERT_CHECKING
+	BufferDesc *buf_hdr;
+	uint64		buf_state;
+#endif
+
+	/*
+	 * Ensure, before we pin a victim buffer, that there's a free refcount
+	 * entry and resource owner slot for the pin.
+	 */
+	ReservePrivateRefCountEntry();
+	ResourceOwnerEnlarge(CurrentResourceOwner);
+
+	/*
+	 * Select a victim buffer.  The buffer is returned pinned and owned by
+	 * this backend.
+	 */
+	bufnum = StrategyGetBuffer(strategy, io_context);
+
 
 	/* a final set of sanity checks */
 #ifdef USE_ASSERT_CHECKING
+	buf_hdr = GetBufferDescriptor(bufnum - 1);
 	buf_state = pg_atomic_read_u64(&buf_hdr->state);
 
 	Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
 	Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
 
-	CheckBufferIsPinnedOnce(buf);
+	CheckBufferIsPinnedOnce(bufnum);
 #endif
 
-	return buf;
+	return bufnum;
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 746740105a9..f0f3a30cd03 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -99,7 +99,7 @@ typedef struct BufferAccessStrategyData
 static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
 									 uint64 *buf_state);
 static void AddBufferToRing(BufferAccessStrategy strategy,
-							BufferDesc *buf);
+							Buffer bufnum);
 
 /*
  * ClockSweepTick - Helper routine for StrategyGetBuffer()
@@ -181,14 +181,15 @@ ClockSweepTick(void)
  *	The buffer is pinned and marked as owned, using TrackNewBufferPin(),
  *	before returning.
  */
-BufferDesc *
-StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_ring)
+Buffer
+StrategyGetBuffer(BufferAccessStrategy strategy, IOContext io_context)
 {
 	BufferDesc *buf;
+	Buffer		bufnum;
 	int			bgwprocno;
 	int			trycounter;
-
-	*from_ring = false;
+	uint64		buf_state;
+	bool		buf_valid;
 
 	/*
 	 * If given a strategy object, see whether it can select a buffer. We
@@ -196,11 +197,40 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r
 	 */
 	if (strategy != NULL)
 	{
-		buf = GetBufferFromRing(strategy, buf_state);
-		if (buf != NULL)
+		int			ring_tries;
+
+		/*
+		 * Don't loop around the ring forever. If we can't reuse any of the
+		 * ring buffers (e.g. because they are all concurrently locked), fall
+		 * back to the clock sweep after one full pass.
+		 */
+		for (ring_tries = 0; ring_tries < strategy->nbuffers; ring_tries++)
 		{
-			*from_ring = true;
-			return buf;
+			buf = GetBufferFromRing(strategy, &buf_state);
+			if (buf == NULL)
+				break;
+
+			bufnum = BufferDescriptorGetBuffer(buf);
+			/* Don't use a stale buf_state value after InvalidateVictimBuffer */
+			buf_valid = buf_state & BM_VALID;
+			if (ClaimVictimBuffer(strategy, buf, bufnum, buf_state,
+								  true, io_context))
+			{
+				/*
+				 * Blocks evicted from buffers already in the strategy ring
+				 * are counted as IOOP_REUSE in the corresponding strategy
+				 * context.
+				 *
+				 * We can only count this now that we've successfully claimed
+				 * the buffer and invalidated its previous tenant. Previously
+				 * we may have been forced to release the buffer due to
+				 * concurrent pinners or erroring out.
+				 */
+				if (buf_valid)
+					pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+									   IOOP_REUSE, 1, 0);
+				return bufnum;
+			}
 		}
 	}
 
@@ -304,13 +334,49 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r
 												   local_buf_state))
 				{
 					/* Found a usable buffer */
-					if (strategy != NULL)
-						AddBufferToRing(strategy, buf);
-					*buf_state = local_buf_state;
+					buf_state = local_buf_state;
 
 					TrackNewBufferPin(BufferDescriptorGetBuffer(buf));
+					bufnum = BufferDescriptorGetBuffer(buf);
+
+					/*
+					 * Don't use a stale buf_state value after
+					 * InvalidateVictimBuffer
+					 */
+					buf_valid = buf_state & BM_VALID;
 
-					return buf;
+					/*
+					 * If we can't claim the buffer (concurrent pin/dirty), go
+					 * back to the clock sweep to find another one. Don't add
+					 * a buffer we failed to claim to the strategy ring.
+					 */
+					if (ClaimVictimBuffer(strategy, buf, bufnum, buf_state,
+										  false, io_context))
+					{
+						/*
+						 * Blocks evicted from shared buffers are counted as
+						 * IOOP_EVICT in the corresponding context, even when
+						 * a strategy is in use. Shared buffers are evicted by
+						 * a strategy in two cases: 1) while initially
+						 * claiming buffers for the strategy ring 2) to
+						 * replace an existing strategy ring buffer because it
+						 * is pinned or in use and cannot be reused.
+						 *
+						 * We can only count this now that we've successfully
+						 * claimed the buffer and invalidated its previous
+						 * tenant. Previously we may have been forced to
+						 * release the buffer due to concurrent pinners or
+						 * erroring out.
+						 */
+						if (buf_valid)
+							pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+											   IOOP_EVICT, 1, 0);
+						if (strategy != NULL)
+							AddBufferToRing(strategy, bufnum);
+						return bufnum;
+					}
+
+					break;
 				}
 			}
 		}
@@ -694,15 +760,13 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint64 *buf_state)
 }
 
 /*
- * AddBufferToRing -- add a buffer to the buffer ring
- *
- * Caller must hold the buffer header spinlock on the buffer.  Since this
- * is called with the spinlock held, it had better be quite cheap.
+ * Records the buffer in the current ring slot. The caller must have already
+ * pinned the buffer and invalidated its previous contents.
  */
 static void
-AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
+AddBufferToRing(BufferAccessStrategy strategy, Buffer bufnum)
 {
-	strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
+	strategy->buffers[strategy->current] = bufnum;
 }
 
 /*
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index a527bcd454b..8477cec8d34 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -548,6 +548,13 @@ ResourceOwnerForgetBufferIO(ResourceOwner owner, Buffer buffer)
 	ResourceOwnerForget(owner, Int32GetDatum(buffer), &buffer_io_resowner_desc);
 }
 
+/* For use in freelist.c but defined in bufmgr.c */
+extern bool ClaimVictimBuffer(BufferAccessStrategy strategy,
+							  BufferDesc *buf_hdr, Buffer bufnum,
+							  uint64 buf_state,
+							  bool from_ring,
+							  IOContext io_context);
+
 /*
  * Internal buffer management routines
  */
@@ -582,8 +589,8 @@ extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint64 set_flag
 
 /* freelist.c */
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
-extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
-									 uint64 *buf_state, bool *from_ring);
+extern Buffer StrategyGetBuffer(BufferAccessStrategy strategy,
+								IOContext io_context);
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 								 BufferDesc *buf, uint64 buf_state);
 
-- 
2.43.0

