From 847d3ed8f906a20558aa3b458d86eaf6f99cbb5f Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 18 Nov 2025 09:39:59 -0500
Subject: [PATCH v7 06/15] bufmgr: Add one-entry cache for private refcount

The private refcount entry for a buffer is often looked up repeatedly for the
same buffer, e.g. to pin and then unpin a buffer. Benchmarking shows that it's
worthwhile to have a one-entry cache for that case. With that cache in place,
it's worth splitting GetPrivateRefCountEntry() into a small inline
portion (for the cache hit case) and an out-of-line helper for the rest.

This is helpful for some workloads today, but becomes more important in an
upcoming patch that will utilize the private refcount infrastructure to also
store whether the buffer is currently locked, as that increases the rate of
lookups substantially.
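
For illustration only, and not as part of the change itself, the shape of the
pattern is roughly the following; the names are invented and much simplified
compared to bufmgr.c:

    /* Minimal sketch of a one-entry cache in front of a slot-array lookup. */
    #include <stdio.h>

    #define NSLOTS 8

    typedef struct Entry
    {
        int     key;            /* 0 means "slot unused" */
        int     refcount;
    } Entry;

    static Entry slots[NSLOTS];
    static int last_slot = -1;  /* one-entry cache: slot of the last hit */

    /* Out-of-line slow path: scan all slots for the key. */
    static Entry *
    lookup_slow(int key)
    {
        for (int i = 0; i < NSLOTS; i++)
        {
            if (slots[i].key == key)
            {
                last_slot = i;  /* remember the hit for the next lookup */
                return &slots[i];
            }
        }
        return NULL;
    }

    /* Small inline fast path: check the cached slot before scanning. */
    static inline Entry *
    lookup(int key)
    {
        if (last_slot != -1 && slots[last_slot].key == key)
            return &slots[last_slot];
        return lookup_slow(key);
    }

    int
    main(void)
    {
        slots[3].key = 42;

        Entry *e = lookup(42);      /* scan; populates the one-entry cache */

        e->refcount++;              /* "pin" */
        lookup(42)->refcount--;     /* "unpin": served from the cached slot */
        printf("refcount=%d last_slot=%d\n", e->refcount, last_slot);
        return 0;
    }

The real slow path additionally has to consult the overflow hashtable and, with
do_move, migrate an entry back into the array, which is part of why it is kept
out of line.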

Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Discussion: https://postgr.es/m/6rgb2nvhyvnszz4ul3wfzlf5rheb2kkwrglthnna7qhe24onwr@vw27225tkyar
---
 src/backend/storage/buffer/bufmgr.c | 66 ++++++++++++++++++++++++-----
 1 file changed, 55 insertions(+), 11 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 4e147d477c7..be32bd596f6 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -237,6 +237,7 @@ static HTAB *PrivateRefCountHash = NULL;
 static int32 PrivateRefCountOverflowed = 0;
 static uint32 PrivateRefCountClock = 0;
 static int	ReservedRefCountSlot = -1;
+static int	PrivateRefCountEntryLast = -1;
 
 static uint32 MaxProportionalPins;
 
@@ -369,28 +370,27 @@ NewPrivateRefCountEntry(Buffer buffer)
 	res->buffer = buffer;
 	res->data.refcount = 0;
 
+	/* update cache for the next lookup */
+	PrivateRefCountEntryLast = ReservedRefCountSlot;
+
 	ReservedRefCountSlot = -1;
 
 	return res;
 }
 
 /*
- * Return the PrivateRefCount entry for the passed buffer.
- *
- * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
- * do_move is true, and the entry resides in the hashtable the entry is
- * optimized for frequent access by moving it to the array.
+ * Slow path for GetPrivateRefCountEntry(). This is big enough to not be worth
+ * inlining. This particularly seems to be true if the compiler is capable of
+ * auto-vectorizing the code, as that imposes additional stack-alignment
+ * requirements, etc.
  */
-static inline PrivateRefCountEntry *
-GetPrivateRefCountEntry(Buffer buffer, bool do_move)
+static pg_noinline PrivateRefCountEntry *
+GetPrivateRefCountEntrySlow(Buffer buffer, bool do_move)
 {
 	PrivateRefCountEntry *res;
 	int			match = -1;
 	int			i;
 
-	Assert(BufferIsValid(buffer));
-	Assert(!BufferIsLocal(buffer));
-
 	/*
 	 * First search for references in the array, that'll be sufficient in the
 	 * majority of cases.
@@ -404,8 +404,13 @@ GetPrivateRefCountEntry(Buffer buffer, bool do_move)
 		}
 	}
 
-	if (match != -1)
+	if (likely(match != -1))
+	{
+		/* update cache for the next lookup */
+		PrivateRefCountEntryLast = match;
+
 		return &PrivateRefCountArray[match];
+	}
 
 	/*
 	 * By here we know that the buffer, if already pinned, isn't residing in
@@ -445,6 +450,8 @@ GetPrivateRefCountEntry(Buffer buffer, bool do_move)
 		free->buffer = buffer;
 		free->data = res->data;
 		PrivateRefCountArrayKeys[ReservedRefCountSlot] = buffer;
+		/* update cache for the next lookup */
+		PrivateRefCountEntryLast = ReservedRefCountSlot;
 
 		ReservedRefCountSlot = -1;
 
@@ -459,6 +466,43 @@ GetPrivateRefCountEntry(Buffer buffer, bool do_move)
 	}
 }
 
+/*
+ * Return the PrivateRefCount entry for the passed buffer.
+ *
+ * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
+ * do_move is true, and the entry resides in the hashtable the entry is
+ * optimized for frequent access by moving it to the array.
+ */
+static inline PrivateRefCountEntry *
+GetPrivateRefCountEntry(Buffer buffer, bool do_move)
+{
+	Assert(BufferIsValid(buffer));
+	Assert(!BufferIsLocal(buffer));
+
+	/*
+	 * It's very common to look up the same buffer repeatedly. To make that
+	 * fast, we have a one-entry cache.
+	 *
+	 * In contrast to the loop in GetPrivateRefCountEntrySlow(), here it is
+	 * faster to check PrivateRefCountArray[].buffer, as in the case of a hit
+	 * fewer addresses are computed and fewer cachelines are accessed. Whereas
+	 * in the loop case, checking PrivateRefCountArrayKeys saves a lot of
+	 * memory accesses.
+	 */
+	if (likely(PrivateRefCountEntryLast != -1) &&
+		likely(PrivateRefCountArray[PrivateRefCountEntryLast].buffer == buffer))
+	{
+		return &PrivateRefCountArray[PrivateRefCountEntryLast];
+	}
+
+	/*
+	 * The code for the cached lookup is small enough to be worth inlining
+	 * into the caller. In the miss case, however, that empirically doesn't
+	 * seem worth it.
+	 */
+	return GetPrivateRefCountEntrySlow(buffer, do_move);
+}
+
 /*
  * Returns how many times the passed buffer is pinned by this backend.
  *
-- 
2.48.1.76.g4e746b1a31.dirty

