From 052ac2256afbba3f25f20f683449a0bbb0af4241 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Tue, 2 Jun 2026 22:09:33 +0200
Subject: [PATCH v20260605 2/6] Infrastructure for partitioning of shared
 buffers

The patch introduces a simple "registry" of buffer partitions, keeping
track of the first/last buffer, etc. This serves as a source of truth
for later patches (e.g. to partition clock-sweep or to make the
partitioning NUMA-aware).

The registry is a small array of BufferPartition entries in shared
memory, with partitions sized to be a fair share of shared buffers.

Notes:

* Maybe the number of partitions should be configurable? Right now it's
  hard-coded as 4, but testing shows that increasing it (e.g. to 16) can
  be beneficial.

* This partitioning is independent of the partitions defined in
  lwlock.h, which defines 128 partitions to reduce lock conflict on the
  buffer mapping hashtable. The number of partitions introduced by this
  patch is expected to be much lower (a dozen or so).

* The buffers are divided as evenly as possible, with the first couple
  partitions possibly getting an extra buffer.
---
 contrib/pg_buffercache/Makefile               |   3 +-
 .../pg_buffercache--1.7--1.8.sql              |  23 +++
 contrib/pg_buffercache/pg_buffercache.control |   2 +-
 contrib/pg_buffercache/pg_buffercache_pages.c |  86 +++++++++++
 src/backend/storage/buffer/buf_init.c         | 142 ++++++++++++++++++
 src/include/storage/buf_internals.h           |   5 +
 src/include/storage/bufmgr.h                  |  19 +++
 src/tools/pgindent/typedefs.list              |   2 +
 8 files changed, 280 insertions(+), 2 deletions(-)
 create mode 100644 contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql

diff --git a/contrib/pg_buffercache/Makefile b/contrib/pg_buffercache/Makefile
index 0e618f66aec..7fd5cdfc43d 100644
--- a/contrib/pg_buffercache/Makefile
+++ b/contrib/pg_buffercache/Makefile
@@ -9,7 +9,8 @@ EXTENSION = pg_buffercache
 DATA = pg_buffercache--1.2.sql pg_buffercache--1.2--1.3.sql \
 	pg_buffercache--1.1--1.2.sql pg_buffercache--1.0--1.1.sql \
 	pg_buffercache--1.3--1.4.sql pg_buffercache--1.4--1.5.sql \
-	pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql
+	pg_buffercache--1.5--1.6.sql pg_buffercache--1.6--1.7.sql \
+	pg_buffercache--1.7--1.8.sql
 PGFILEDESC = "pg_buffercache - monitoring of shared buffer cache in real-time"
 
 REGRESS = pg_buffercache pg_buffercache_numa
diff --git a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
new file mode 100644
index 00000000000..d62b8339bfc
--- /dev/null
+++ b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
@@ -0,0 +1,23 @@
+-- complain if script is sourced in psql, rather than via ALTER EXTENSION
+\echo Use "ALTER EXTENSION pg_buffercache UPDATE TO '1.8'" to load this file. \quit
+
+-- Register the function reporting the partitioning of shared buffers.
+CREATE OR REPLACE FUNCTION pg_buffercache_partitions()
+RETURNS SETOF RECORD
+AS 'MODULE_PATHNAME', 'pg_buffercache_partitions'
+LANGUAGE C PARALLEL SAFE;
+
+-- Create a view for convenient access.
+CREATE VIEW pg_buffercache_partitions AS
+	SELECT P.* FROM pg_buffercache_partitions() AS P
+	(partition integer,			-- partition index (0-based)
+	 num_buffers integer,		-- number of buffers in the partition
+	 first_buffer integer,		-- first buffer of partition (inclusive)
+	 last_buffer integer);		-- last buffer of partition (inclusive)
+
+-- Don't want these to be available to public, only to pg_monitor.
+REVOKE ALL ON FUNCTION pg_buffercache_partitions() FROM PUBLIC;
+REVOKE ALL ON pg_buffercache_partitions FROM PUBLIC;
+
+GRANT EXECUTE ON FUNCTION pg_buffercache_partitions() TO pg_monitor;
+GRANT SELECT ON pg_buffercache_partitions TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache.control b/contrib/pg_buffercache/pg_buffercache.control
index 11499550945..d2fa8ba53ba 100644
--- a/contrib/pg_buffercache/pg_buffercache.control
+++ b/contrib/pg_buffercache/pg_buffercache.control
@@ -1,5 +1,5 @@
 # pg_buffercache extension
 comment = 'examine the shared buffer cache'
-default_version = '1.7'
+default_version = '1.8'
 module_pathname = '$libdir/pg_buffercache'
 relocatable = true
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index bf2e6c97220..d678cb045ba 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -31,6 +31,7 @@
 #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
 
 #define NUM_BUFFERCACHE_OS_PAGES_ELEM	3
+#define NUM_BUFFERCACHE_PARTITIONS_ELEM	4
 
 PG_MODULE_MAGIC_EXT(
 					.name = "pg_buffercache",
@@ -75,6 +76,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_evict_all);
 PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
 PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
 PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
+PG_FUNCTION_INFO_V1(pg_buffercache_partitions);
 
 
 /* Only need to touch memory once per backend process lifetime */
@@ -871,3 +873,87 @@ pg_buffercache_mark_dirty_all(PG_FUNCTION_ARGS)
 
 	PG_RETURN_DATUM(result);
 }
+
+/*
+ * pg_buffercache_partitions
+ *		Report how shared buffers are divided into partitions.
+ *
+ * Set-returning function producing one row per partition, with the int4
+ * columns (partition, num_buffers, first_buffer, last_buffer).
+ */
+Datum
+pg_buffercache_partitions(PG_FUNCTION_ARGS)
+{
+	FuncCallContext *fctx;
+
+	if (SRF_IS_FIRSTCALL())
+	{
+		MemoryContext callerctx;
+		TupleDesc	declared;
+		TupleDesc	rowdesc;
+
+		fctx = SRF_FIRSTCALL_INIT();
+
+		/* The row descriptor built below has to outlive this call. */
+		callerctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx);
+
+		/* Check the result type the query declared for this function. */
+		if (get_call_result_type(fcinfo, NULL, &declared) != TYPEFUNC_COMPOSITE)
+			elog(ERROR, "return type must be a row type");
+
+		if (declared->natts != NUM_BUFFERCACHE_PARTITIONS_ELEM)
+			elog(ERROR, "incorrect number of output arguments");
+
+		/* Describe the output columns; all of them are plain int4. */
+		rowdesc = CreateTemplateTupleDesc(declared->natts);
+		TupleDescInitEntry(rowdesc, (AttrNumber) 1, "partition",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(rowdesc, (AttrNumber) 2, "num_buffers",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(rowdesc, (AttrNumber) 3, "first_buffer",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(rowdesc, (AttrNumber) 4, "last_buffer",
+						   INT4OID, -1, 0);
+
+		/* Keep the blessed descriptor around for the per-call phase. */
+		fctx->user_fctx = BlessTupleDesc(rowdesc);
+
+		/* Allocations made from here on are transient again. */
+		MemoryContextSwitchTo(callerctx);
+
+		/* The result has exactly one row per partition. */
+		fctx->max_calls = BufferPartitionCount();
+	}
+
+	fctx = SRF_PERCALL_SETUP();
+
+	/* Finish once every partition has been reported. */
+	if (fctx->call_cntr >= fctx->max_calls)
+		SRF_RETURN_DONE(fctx);
+	else
+	{
+		int			partno = (int) fctx->call_cntr;
+		int			nbuffers;
+		int			first;
+		int			last;
+		Datum		values[NUM_BUFFERCACHE_PARTITIONS_ELEM];
+		bool		nulls[NUM_BUFFERCACHE_PARTITIONS_ELEM] = {0};
+		HeapTuple	row;
+		Datum		result;
+
+		/* Fetch the registry entry matching the current call number. */
+		BufferPartitionGet(partno, &nbuffers, &first, &last);
+
+		/* No column is ever NULL, so "nulls" stays all-false. */
+		values[0] = Int32GetDatum(partno);
+		values[1] = Int32GetDatum(nbuffers);
+		values[2] = Int32GetDatum(first);
+		values[3] = Int32GetDatum(last);
+
+		/* Build and return the tuple. */
+		row = heap_form_tuple((TupleDesc) fctx->user_fctx, values, nulls);
+		result = HeapTupleGetDatum(row);
+
+		SRF_RETURN_NEXT(fctx, result);
+	}
+}
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 1407c930c56..e593b02e0ca 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -26,10 +26,12 @@ char	   *BufferBlocks;
 ConditionVariableMinimallyPadded *BufferIOCVArray;
 WritebackContext BackendWritebackContext;
 CkptSortItem *CkptBufferIds;
+BufferPartitions *BufferPartitionsRegistry;
 
 static void BufferManagerShmemRequest(void *arg);
 static void BufferManagerShmemInit(void *arg);
 static void BufferManagerShmemAttach(void *arg);
+static void BufferPartitionsInit(void);
 
 const ShmemCallbacks BufferManagerShmemCallbacks = {
 	.request_fn = BufferManagerShmemRequest,
@@ -69,6 +71,9 @@ const ShmemCallbacks BufferManagerShmemCallbacks = {
  *		multiple times. Check the PrivateRefCount infrastructure in bufmgr.c.
  */
 
+/* number of shared buffer partitions in the registry (hard-coded for now) */
+#define NUM_CLOCK_SWEEP_PARTITIONS	4
+
 
 /*
  * Register shared memory area for the buffer pool.
@@ -97,6 +102,13 @@ BufferManagerShmemRequest(void *arg)
 					   .ptr = (void **) &BufferIOCVArray,
 		);
 
+	ShmemRequestStruct(.name = "Buffer Partition Registry",
+	/* registry header (npartitions) plus the array of partition entries */
+					   .size = offsetof(BufferPartitions, partitions) +
+					   NUM_CLOCK_SWEEP_PARTITIONS * sizeof(BufferPartition),
+					   .alignment = PG_CACHE_LINE_SIZE,
+					   .ptr = (void **) &BufferPartitionsRegistry);
+
 	/*
 	 * The array used to sort to-be-checkpointed buffer ids is located in
 	 * shared memory, to avoid having to allocate significant amounts of
@@ -119,6 +131,12 @@ BufferManagerShmemRequest(void *arg)
 static void
 BufferManagerShmemInit(void *arg)
 {
+	/*
+	 * Initialize the buffer partition registry first, before other parts
+	 * have a chance to touch the memory.
+	 */
+	BufferPartitionsInit();
+
 	/*
 	 * Initialize all the buffer headers.
 	 */
@@ -151,3 +169,127 @@ BufferManagerShmemAttach(void *arg)
 	WritebackContextInit(&BackendWritebackContext,
 						 &backend_flush_after);
 }
+
+/*
+ * AssertCheckBufferPartitions
+ *		Cross-check the partition registry (assert-enabled builds only).
+ *
+ * Partitions must cover buffers 0 .. (NBuffers - 1) exactly, without gaps.
+ */
+static void
+AssertCheckBufferPartitions(void)
+{
+#ifdef USE_ASSERT_CHECKING
+	int			nparts = BufferPartitionsRegistry->npartitions;
+	int			total_buffers = 0;
+
+	Assert(nparts > 0);
+
+	for (int partno = 0; partno < nparts; partno++)
+	{
+		BufferPartition *cur = &BufferPartitionsRegistry->partitions[partno];
+
+		/*
+		 * A partition holds at least one buffer. Ending up with just a
+		 * single buffer is unlikely (and undesirable), but it is allowed.
+		 */
+		Assert(cur->first_buffer <= cur->last_buffer);
+		Assert((cur->last_buffer - cur->first_buffer + 1) == cur->num_buffers);
+
+		total_buffers += cur->num_buffers;
+
+		/* no gap at the start of the range, nor between neighbours */
+		if (partno == 0)
+		{
+			Assert(cur->first_buffer == 0);
+		}
+		else
+		{
+			BufferPartition *before = cur - 1;
+
+			Assert(before->last_buffer == (cur->first_buffer - 1));
+		}
+
+		/* the final partition has to end on buffer (NBuffers - 1) */
+		if (partno == (nparts - 1))
+		{
+			Assert(cur->last_buffer == (NBuffers - 1));
+		}
+	}
+
+	/* taken together, the partitions account for every shared buffer */
+	Assert(total_buffers == NBuffers);
+#endif
+}
+
+/*
+ * BufferPartitionsInit
+ *		Initialize registry of buffer partitions, dividing NBuffers between
+ *		them as evenly as possible.
+ */
+static void
+BufferPartitionsInit(void)
+{
+	int			buffer = 0;
+
+	/* base partition size, and how many partitions get one extra buffer */
+	int			part_buffers = NBuffers / NUM_CLOCK_SWEEP_PARTITIONS;
+	int			remaining_buffers = NBuffers % NUM_CLOCK_SWEEP_PARTITIONS;
+
+	BufferPartitionsRegistry->npartitions = NUM_CLOCK_SWEEP_PARTITIONS;
+
+	for (int n = 0; n < BufferPartitionsRegistry->npartitions; n++)
+	{
+		BufferPartition *part = &BufferPartitionsRegistry->partitions[n];
+		int			num_buffers = part_buffers;
+
+		/* the first remaining_buffers partitions absorb the remainder */
+		if (n < remaining_buffers)
+			num_buffers += 1;
+
+		Assert((num_buffers > 0) && (num_buffers <= part_buffers + 1));
+		Assert((buffer >= 0) && (buffer < NBuffers));
+
+		part->num_buffers = num_buffers;
+		part->first_buffer = buffer;
+		part->last_buffer = buffer + (num_buffers - 1);
+
+		buffer += num_buffers;
+	}
+
+	AssertCheckBufferPartitions();
+}
+
+/*
+ * BufferPartitionCount
+ *		Returns the number of partitions recorded in the shared registry.
+ */
+int
+BufferPartitionCount(void)
+{
+	return BufferPartitionsRegistry->npartitions;
+}
+
+/*
+ * BufferPartitionGet
+ *		Look up one partition and report its size and buffer range.
+ *
+ * "idx" has to be in [0, BufferPartitionCount() - 1], otherwise an ERROR is
+ * raised. The first/last buffers are inclusive.
+ */
+void
+BufferPartitionGet(int idx, int *num_buffers,
+				   int *first_buffer, int *last_buffer)
+{
+	const BufferPartition *entry;
+
+	/* reject indexes outside the registry */
+	if ((idx < 0) || (idx >= BufferPartitionsRegistry->npartitions))
+		elog(ERROR, "invalid partition index");
+
+	entry = &BufferPartitionsRegistry->partitions[idx];
+
+	*first_buffer = entry->first_buffer;
+	*last_buffer = entry->last_buffer;
+	*num_buffers = entry->num_buffers;
+}
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 89615a254a3..e5a887b9969 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -411,9 +411,14 @@ typedef struct WritebackContext
 
 /* in buf_init.c */
 extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
+extern PGDLLIMPORT BufferPartitions *BufferPartitionsRegistry;
 extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
 extern PGDLLIMPORT WritebackContext BackendWritebackContext;
 
+extern int	BufferPartitionCount(void);
+extern void BufferPartitionGet(int idx, int *num_buffers,
+							   int *first_buffer, int *last_buffer);
+
 /* in localbuf.c */
 extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 6837b35fc6d..79a3f44747a 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -155,6 +155,25 @@ struct ReadBuffersOperation
 
 typedef struct ReadBuffersOperation ReadBuffersOperation;
 
+/*
+ * Information about one partition of shared buffers.
+ *
+ * first_buffer/last_buffer are 0-based buffer indexes, both inclusive.
+ */
+typedef struct BufferPartition
+{
+	int			num_buffers;	/* last_buffer - first_buffer + 1 */
+	int			first_buffer;	/* first buffer of partition */
+	int			last_buffer;	/* last buffer of partition */
+} BufferPartition;
+
+/* registry of all partitions: a count followed by that many entries */
+typedef struct BufferPartitions
+{
+	int			npartitions;	/* number of entries in partitions[] */
+	BufferPartition partitions[FLEXIBLE_ARRAY_MEMBER];
+} BufferPartitions;
+
 /* to avoid having to expose buf_internals.h here */
 typedef struct WritebackContext WritebackContext;
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 8cf40c87043..ea756015249 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -365,6 +365,8 @@ BufferHeapTupleTableSlot
 BufferLockMode
 BufferLookupEnt
 BufferManagerRelation
+BufferPartition
+BufferPartitions
 BufferStrategyControl
 BufferTag
 BufferUsage
-- 
2.54.0

