From 5ab7aba5f8af84b748180eb6caa2b165aceb2590 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Tue, 2 Jun 2026 22:16:55 +0200
Subject: [PATCH v20260605 4/6] clock-sweep: basic partitioning

Partitions the "clock-sweep" algorithm to work on individual partitions,
one by one. Each backend process is mapped to one "home" partition, with
an independent clock hand. This reduces contention for workloads with
significant buffer pressure.

The patch extends the "pg_buffercache_partitions" view to include
information about the clock-sweep activity.

Note: This needs some sort of "balancing" when one of the partitions is
much busier than the rest (e.g. because there's a single backend consuming
a lot of buffers from it).

Note: There's a problem with some tests running out of unpinned buffers,
due to (intentionally) setting shared buffers very low. That happens
because StrategyGetBuffer() only searches a single partition, and it
has a couple more issues.
---
 .../pg_buffercache--1.7--1.8.sql              |   8 +-
 contrib/pg_buffercache/pg_buffercache_pages.c |  32 +-
 src/backend/storage/buffer/bufmgr.c           | 202 +++++++----
 src/backend/storage/buffer/freelist.c         | 333 ++++++++++++++++--
 src/include/storage/buf_internals.h           |   4 +-
 src/include/storage/bufmgr.h                  |   5 +
 src/test/recovery/t/027_stream_regress.pl     |   5 +
 src/tools/pgindent/typedefs.list              |   1 +
 8 files changed, 486 insertions(+), 104 deletions(-)

diff --git a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
index a6e49fd1652..92176fed7f8 100644
--- a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
+++ b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
@@ -14,7 +14,13 @@ CREATE VIEW pg_buffercache_partitions AS
 	 numa_node integer,			-- NUMA node of the partitioon
 	 num_buffers integer,		-- number of buffers in the partition
 	 first_buffer integer,		-- first buffer of partition
-	 last_buffer integer);		-- last buffer of partition
+	 last_buffer integer,		-- last buffer of partition
+
+	 -- clocksweep counters
+	 num_passes bigint,			-- clocksweep passes
+	 next_buffer integer,		-- next victim buffer for clocksweep
+	 total_allocs bigint,		-- handled allocs (running total)
+	 num_allocs bigint);		-- handled allocs (current cycle)
 
 -- Don't want these to be available to public.
 REVOKE ALL ON FUNCTION pg_buffercache_partitions() FROM PUBLIC;
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index e3efeeda675..739c63b0cfc 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -31,7 +31,7 @@
 #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
 
 #define NUM_BUFFERCACHE_OS_PAGES_ELEM	3
-#define NUM_BUFFERCACHE_PARTITIONS_ELEM	5
+#define NUM_BUFFERCACHE_PARTITIONS_ELEM	9
 
 PG_MODULE_MAGIC_EXT(
 					.name = "pg_buffercache",
@@ -912,6 +912,14 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS)
 						   INT4OID, -1, 0);
 		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "last_buffer",
 						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 6, "num_passes",
+						   INT8OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 7, "next_buffer",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 8, "total_allocs",
+						   INT8OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 9, "num_allocs",
+						   INT8OID, -1, 0);
 
 		funcctx->user_fctx = BlessTupleDesc(tupledesc);
 
@@ -933,12 +941,22 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS)
 					first_buffer,
 					last_buffer;
 
+		uint64		buffer_total_allocs;
+
+		uint32		complete_passes,
+					next_victim_buffer,
+					buffer_allocs;
+
 		Datum		values[NUM_BUFFERCACHE_PARTITIONS_ELEM];
 		bool		nulls[NUM_BUFFERCACHE_PARTITIONS_ELEM];
 
 		BufferPartitionGet(i, &numa_node, &num_buffers,
 						   &first_buffer, &last_buffer);
 
+		ClockSweepPartitionGetInfo(i,
+								   &complete_passes, &next_victim_buffer,
+								   &buffer_total_allocs, &buffer_allocs);
+
 		values[0] = Int32GetDatum(i);
 		nulls[0] = false;
 
@@ -954,6 +972,18 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS)
 		values[4] = Int32GetDatum(last_buffer);
 		nulls[4] = false;
 
+		values[5] = Int64GetDatum(complete_passes);
+		nulls[5] = false;
+
+		values[6] = Int32GetDatum(next_victim_buffer);
+		nulls[6] = false;
+
+		values[7] = Int64GetDatum(buffer_total_allocs);
+		nulls[7] = false;
+
+		values[8] = Int64GetDatum(buffer_allocs);
+		nulls[8] = false;
+
 		/* Build and return the tuple. */
 		tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls);
 		result = HeapTupleGetDatum(tuple);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cc398db124d..02c75f82e5b 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -3826,33 +3826,34 @@ BufferSync(int flags)
 }
 
 /*
- * BgBufferSync -- Write out some dirty buffers in the pool.
- *
- * This is called periodically by the background writer process.
+ * Information saved between calls so we can determine the strategy point's
+ * advance rate and avoid scanning already-cleaned buffers.
  *
- * Returns true if it's appropriate for the bgwriter process to go into
- * low-power hibernation mode.  (This happens if the strategy clock-sweep
- * has been "lapped" and no buffer allocations have occurred recently,
- * or if the bgwriter has been effectively disabled by setting
- * bgwriter_lru_maxpages to 0.)
+ * XXX Does it actually make sense to split all of this information per
+ * partition? For example, does per-partition advance rate mean anything?
+ * Maybe we should have a global advance rate? Although, if we want to
+ * keep enough clean buffers in each partition, maybe having per-partition
+ * rates makes sense.
  */
-bool
-BgBufferSync(WritebackContext *wb_context)
+typedef struct BufferSyncPartition
+{
+	int	prev_strategy_buf_id;
+	uint32 prev_strategy_passes;
+	int	next_to_clean;
+	uint32 next_passes;
+} BufferSyncPartition;
+
+static BufferSyncPartition *saved_info = NULL;
+static bool saved_info_valid = false;
+
+static bool
+BgBufferSyncPartition(WritebackContext *wb_context, int num_partitions,
+					  int partition, int recent_alloc_partition,
+					  BufferSyncPartition *saved)
 {
 	/* info obtained from freelist.c */
 	int			strategy_buf_id;
 	uint32		strategy_passes;
-	uint32		recent_alloc;
-
-	/*
-	 * Information saved between calls so we can determine the strategy
-	 * point's advance rate and avoid scanning already-cleaned buffers.
-	 */
-	static bool saved_info_valid = false;
-	static int	prev_strategy_buf_id;
-	static uint32 prev_strategy_passes;
-	static int	next_to_clean;
-	static uint32 next_passes;
 
 	/* Moving averages of allocation rate and clean-buffer density */
 	static float smoothed_alloc = 0;
@@ -3880,25 +3881,16 @@ BgBufferSync(WritebackContext *wb_context)
 	long		new_strategy_delta;
 	uint32		new_recent_alloc;
 
+	/* buffer range for the clocksweep partition */
+	int			first_buffer;
+	int			num_buffers;
+
 	/*
 	 * Find out where the clock-sweep currently is, and how many buffer
 	 * allocations have happened since our last call.
 	 */
-	strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
-
-	/* Report buffer alloc counts to pgstat */
-	PendingBgWriterStats.buf_alloc += recent_alloc;
-
-	/*
-	 * If we're not running the LRU scan, just stop after doing the stats
-	 * stuff.  We mark the saved state invalid so that we can recover sanely
-	 * if LRU scan is turned back on later.
-	 */
-	if (bgwriter_lru_maxpages <= 0)
-	{
-		saved_info_valid = false;
-		return true;
-	}
+	strategy_buf_id = StrategySyncStart(partition, &strategy_passes,
+										&first_buffer, &num_buffers);
 
 	/*
 	 * Compute strategy_delta = how many buffers have been scanned by the
@@ -3910,17 +3902,17 @@ BgBufferSync(WritebackContext *wb_context)
 	 */
 	if (saved_info_valid)
 	{
-		int32		passes_delta = strategy_passes - prev_strategy_passes;
+		int32		passes_delta = strategy_passes - saved->prev_strategy_passes;
 
-		strategy_delta = strategy_buf_id - prev_strategy_buf_id;
-		strategy_delta += (long) passes_delta * NBuffers;
+		strategy_delta = strategy_buf_id - saved->prev_strategy_buf_id;
+		strategy_delta += (long) passes_delta * num_buffers;
 
 		Assert(strategy_delta >= 0);
 
-		if ((int32) (next_passes - strategy_passes) > 0)
+		if ((int32) (saved->next_passes - strategy_passes) > 0)
 		{
 			/* we're one pass ahead of the strategy point */
-			bufs_to_lap = strategy_buf_id - next_to_clean;
+			bufs_to_lap = strategy_buf_id - saved->next_to_clean;
 #ifdef BGW_DEBUG
 			elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
 				 next_passes, next_to_clean,
@@ -3928,11 +3920,11 @@ BgBufferSync(WritebackContext *wb_context)
 				 strategy_delta, bufs_to_lap);
 #endif
 		}
-		else if (next_passes == strategy_passes &&
-				 next_to_clean >= strategy_buf_id)
+		else if (saved->next_passes == strategy_passes &&
+				 saved->next_to_clean >= strategy_buf_id)
 		{
 			/* on same pass, but ahead or at least not behind */
-			bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
+			bufs_to_lap = num_buffers - (saved->next_to_clean - strategy_buf_id);
 #ifdef BGW_DEBUG
 			elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
 				 next_passes, next_to_clean,
@@ -3952,9 +3944,9 @@ BgBufferSync(WritebackContext *wb_context)
 				 strategy_passes, strategy_buf_id,
 				 strategy_delta);
 #endif
-			next_to_clean = strategy_buf_id;
-			next_passes = strategy_passes;
-			bufs_to_lap = NBuffers;
+			saved->next_to_clean = strategy_buf_id;
+			saved->next_passes = strategy_passes;
+			bufs_to_lap = num_buffers;
 		}
 	}
 	else
@@ -3968,15 +3960,16 @@ BgBufferSync(WritebackContext *wb_context)
 			 strategy_passes, strategy_buf_id);
 #endif
 		strategy_delta = 0;
-		next_to_clean = strategy_buf_id;
-		next_passes = strategy_passes;
-		bufs_to_lap = NBuffers;
+		saved->next_to_clean = strategy_buf_id;
+		saved->next_passes = strategy_passes;
+		bufs_to_lap = num_buffers;
 	}
 
 	/* Update saved info for next time */
-	prev_strategy_buf_id = strategy_buf_id;
-	prev_strategy_passes = strategy_passes;
-	saved_info_valid = true;
+	saved->prev_strategy_buf_id = strategy_buf_id;
+	saved->prev_strategy_passes = strategy_passes;
+	/* XXX this needs to happen only after all partitions */
+	/* saved_info_valid = true; */
 
 	/*
 	 * Compute how many buffers had to be scanned for each new allocation, ie,
@@ -3984,9 +3977,9 @@ BgBufferSync(WritebackContext *wb_context)
 	 *
 	 * If the strategy point didn't move, we don't update the density estimate
 	 */
-	if (strategy_delta > 0 && recent_alloc > 0)
+	if (strategy_delta > 0 && recent_alloc_partition > 0)
 	{
-		scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+		scans_per_alloc = (float) strategy_delta / (float) recent_alloc_partition;
 		smoothed_density += (scans_per_alloc - smoothed_density) /
 			smoothing_samples;
 	}
@@ -3996,7 +3989,7 @@ BgBufferSync(WritebackContext *wb_context)
 	 * strategy point and where we've scanned ahead to, based on the smoothed
 	 * density estimate.
 	 */
-	bufs_ahead = NBuffers - bufs_to_lap;
+	bufs_ahead = num_buffers - bufs_to_lap;
 	reusable_buffers_est = (float) bufs_ahead / smoothed_density;
 
 	/*
@@ -4004,10 +3997,10 @@ BgBufferSync(WritebackContext *wb_context)
 	 * a true average we want a fast-attack, slow-decline behavior: we
 	 * immediately follow any increase.
 	 */
-	if (smoothed_alloc <= (float) recent_alloc)
-		smoothed_alloc = recent_alloc;
+	if (smoothed_alloc <= (float) recent_alloc_partition)
+		smoothed_alloc = recent_alloc_partition;
 	else
-		smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
+		smoothed_alloc += ((float) recent_alloc_partition - smoothed_alloc) /
 			smoothing_samples;
 
 	/* Scale the estimate by a GUC to allow more aggressive tuning. */
@@ -4034,7 +4027,7 @@ BgBufferSync(WritebackContext *wb_context)
 	 * the BGW will be called during the scan_whole_pool time; slice the
 	 * buffer pool into that many sections.
 	 */
-	min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
+	min_scan_buffers = (int) (num_buffers / (scan_whole_pool_milliseconds / BgWriterDelay));
 
 	if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
 	{
@@ -4059,20 +4052,20 @@ BgBufferSync(WritebackContext *wb_context)
 	/* Execute the LRU scan */
 	while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
 	{
-		int			sync_state = SyncOneBuffer(next_to_clean, true,
+		int			sync_state = SyncOneBuffer(saved->next_to_clean, true,
 											   wb_context);
 
-		if (++next_to_clean >= NBuffers)
+		if (++saved->next_to_clean >= (first_buffer + num_buffers))
 		{
-			next_to_clean = 0;
-			next_passes++;
+			saved->next_to_clean = first_buffer;
+			saved->next_passes++;
 		}
 		num_to_scan--;
 
 		if (sync_state & BUF_WRITTEN)
 		{
 			reusable_buffers++;
-			if (++num_written >= bgwriter_lru_maxpages)
+			if (++num_written >= (bgwriter_lru_maxpages / num_partitions))
 			{
 				PendingBgWriterStats.maxwritten_clean++;
 				break;
@@ -4086,7 +4079,7 @@ BgBufferSync(WritebackContext *wb_context)
 
 #ifdef BGW_DEBUG
 	elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
-		 recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
+		 recent_alloc_partition, smoothed_alloc, strategy_delta, bufs_ahead,
 		 smoothed_density, reusable_buffers_est, upcoming_alloc_est,
 		 bufs_to_lap - num_to_scan,
 		 num_written,
@@ -4116,8 +4109,83 @@ BgBufferSync(WritebackContext *wb_context)
 #endif
 	}
 
+	/* can this partition hibernate */
+	return (bufs_to_lap == 0 && recent_alloc_partition == 0);
+}
+
+/*
+ * BgBufferSync -- Write out some dirty buffers in the pool.
+ *
+ * This is called periodically by the background writer process.
+ *
+ * Returns true if it's appropriate for the bgwriter process to go into
+ * low-power hibernation mode.  (This happens if the strategy clock-sweep
+ * has been "lapped" and no buffer allocations have occurred recently,
+ * or if the bgwriter has been effectively disabled by setting
+ * bgwriter_lru_maxpages to 0.)
+ */
+bool
+BgBufferSync(WritebackContext *wb_context)
+{
+	/* info obtained from freelist.c */
+	uint32		recent_alloc;
+	uint32		recent_alloc_partition;
+	int			num_partitions;
+
+	/* assume we can hibernate, any partition can set to false */
+	bool		hibernate = true;
+
+	/* get the number of clocksweep partitions, and total alloc count */
+	StrategySyncPrepare(&num_partitions, &recent_alloc);
+
+	/* allocate space for per-partition information between calls */
+	if (saved_info == NULL)
+	{
+		/*
+		 * XXX Not great it's using malloc(), but how else to allocate a
+		 * variable-length array?
+		 */
+		saved_info = malloc(sizeof(BufferSyncPartition) * num_partitions);
+	}
+
+	/* Report buffer alloc counts to pgstat */
+	PendingBgWriterStats.buf_alloc += recent_alloc;
+
+	/* average alloc buffers per partition */
+	recent_alloc_partition = (recent_alloc / num_partitions);
+
+	/*
+	 * If we're not running the LRU scan, just stop after doing the stats
+	 * stuff.  We mark the saved state invalid so that we can recover sanely
+	 * if LRU scan is turned back on later.
+	 */
+	if (bgwriter_lru_maxpages <= 0)
+	{
+		saved_info_valid = false;
+		return true;
+	}
+
+	/*
+	 * now process the clocksweep partitions, one by one, using the same
+	 * cleanup that we used for all buffers
+	 *
+	 * XXX Maybe we should randomize the order of partitions a bit, so that we
+	 * don't start from partition 0 all the time? Perhaps not entirely, but at
+	 * least pick a random starting point?
+	 */
+	for (int partition = 0; partition < num_partitions; partition++)
+	{
+		/* hibernate if all partitions can hibernate */
+		hibernate &= BgBufferSyncPartition(wb_context, num_partitions,
+										   partition, recent_alloc_partition,
+										   &saved_info[partition]);
+	}
+
+	/* now that we've scanned all partitions, mark the cached info as valid */
+	saved_info_valid = true;
+
 	/* Return true if OK to hibernate */
-	return (bufs_to_lap == 0 && recent_alloc == 0);
+	return hibernate;
 }
 
 /*
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 53ef5239e8d..2d56579682e 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -36,17 +36,28 @@
 
 
 /*
- * The shared freelist control information.
+ * Information about one partition of the ClockSweep (on a subset of buffers).
+ *
+ * XXX Should be careful to align this to cachelines, etc.
  */
 typedef struct
 {
 	/* Spinlock: protects the values below */
-	slock_t		buffer_strategy_lock;
+	slock_t		clock_sweep_lock;
+
+	/* range for this clock sweep partition */
+	int32		node;
+	int32		firstBuffer;
+	int32		numBuffers;
 
 	/*
 	 * clock-sweep hand: index of next buffer to consider grabbing. Note that
 	 * this isn't a concrete buffer - we only ever increase the value. So, to
 	 * get an actual buffer, it needs to be used modulo NBuffers.
+	 *
+	 * XXX This is relative to firstBuffer, so needs to be offset properly.
+	 *
+	 * XXX firstBuffer + (nextVictimBuffer % numBuffers)
 	 */
 	pg_atomic_uint32 nextVictimBuffer;
 
@@ -57,11 +68,32 @@ typedef struct
 	uint32		completePasses; /* Complete cycles of the clock-sweep */
 	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */
 
+	/* running total of allocs */
+	pg_atomic_uint64 numTotalAllocs;
+
+} ClockSweep;
+
+/*
+ * The shared freelist control information.
+ */
+typedef struct
+{
+	/* Spinlock: protects the values below */
+	slock_t		buffer_strategy_lock;
+
 	/*
 	 * Bgworker process to be notified upon activity or -1 if none. See
 	 * StrategyNotifyBgWriter.
 	 */
 	int			bgwprocno;
+
+	/* cached info about freelist partitioning */
+	int			num_nodes;
+	int			num_partitions;
+	int			num_partitions_per_node;
+
+	/* clocksweep partitions */
+	ClockSweep	sweeps[FLEXIBLE_ARRAY_MEMBER];
 } BufferStrategyControl;
 
 /* Pointers to shared state */
@@ -108,6 +140,7 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
 									 uint64 *buf_state);
 static void AddBufferToRing(BufferAccessStrategy strategy,
 							BufferDesc *buf);
+static ClockSweep *ChooseClockSweep(void);
 
 /*
  * ClockSweepTick - Helper routine for StrategyGetBuffer()
@@ -119,6 +152,7 @@ static inline uint32
 ClockSweepTick(void)
 {
 	uint32		victim;
+	ClockSweep *sweep = ChooseClockSweep();
 
 	/*
 	 * Atomically move hand ahead one buffer - if there's several processes
@@ -126,14 +160,14 @@ ClockSweepTick(void)
 	 * apparent order.
 	 */
 	victim =
-		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
+		pg_atomic_fetch_add_u32(&sweep->nextVictimBuffer, 1);
 
-	if (victim >= NBuffers)
+	if (victim >= sweep->numBuffers)
 	{
 		uint32		originalVictim = victim;
 
 		/* always wrap what we look up in BufferDescriptors */
-		victim = victim % NBuffers;
+		victim = victim % sweep->numBuffers;
 
 		/*
 		 * If we're the one that just caused a wraparound, force
@@ -159,19 +193,118 @@ ClockSweepTick(void)
 				 * could lead to an overflow of nextVictimBuffers, but that's
 				 * highly unlikely and wouldn't be particularly harmful.
 				 */
-				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+				SpinLockAcquire(&sweep->clock_sweep_lock);
 
-				wrapped = expected % NBuffers;
+				wrapped = expected % sweep->numBuffers;
 
-				success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
+				success = pg_atomic_compare_exchange_u32(&sweep->nextVictimBuffer,
 														 &expected, wrapped);
 				if (success)
-					StrategyControl->completePasses++;
-				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+					sweep->completePasses++;
+				SpinLockRelease(&sweep->clock_sweep_lock);
 			}
 		}
 	}
-	return victim;
+
+	/*
+	 * Make sure we've calculated a buffer in the range of the partition. Buffer
+	 * IDs are 1-based, we're calculating 0-based indexes.
+	 */
+	Assert((victim >= 0) && (victim < sweep->numBuffers));
+	Assert(BufferIsValid(1 + sweep->firstBuffer + victim));
+
+	return sweep->firstBuffer + victim;
+}
+
+/*
+ * ClockSweepPartitionIndex
+ *		pick the clock-sweep partition to use based on PID and NUMA node
+ *
+ * With libnuma, use the NUMA node and PID to pick the partition. Otherwise
+ * use just PID (as if there's a single NUMA node).
+ *
+ * XXX This should also check if buffers are NUMA-partitioned, not just if
+ * compiled with libnuma.
+ */
+static int
+ClockSweepPartitionIndex(void)
+{
+	int		node = 0,
+			index;
+	pid_t	pid = MyProcPid;;
+
+	Assert(StrategyControl->num_partitions ==
+		   (StrategyControl->num_nodes * StrategyControl->num_partitions_per_node));
+
+	/*
+	 * If buffers are NUMA-partitioned, determine the partition using the NUMA
+	 * node and PID. Without NUMA assume everything is a single NUMA node 0, and
+	 * we pick the partition based on PID.
+	 */
+#ifdef USE_LIBNUMA
+	if (shared_buffers_numa)
+	{
+		int		cpu;
+
+		/* XXX do we need to check sched_getcpu is available, somehow? */
+		if ((cpu = sched_getcpu()) < 0)
+			elog(ERROR, "sched_getcpu failed: %m");
+
+		node = numa_node_of_cpu(cpu);
+	}
+#endif
+
+	/*
+	 * We should't get unexpected NUMA nodes, not considered when setting up the
+	 * buffer partitions. It could happen if the allowed NUMA nodes get adjusted
+	 * at runtime, but at this point we just create partitions for all existing
+	 * nodes. We could plan for allowed partitions, but then what if those get
+	 * disabled, and the user allows some other partitions?
+	 */
+	if ((node < 0) || (node > StrategyControl->num_nodes))
+		elog(ERROR, "node out of range: %d > %u", node, StrategyControl->num_nodes);
+
+	/*
+	 * Calculate the partition index. Nodes have the same number of partitions,
+	 * and we use the PID to pick one of those (for a given node). If there's
+	 * only a single partition per node, we can ignore PID and use node directly.
+	 */
+	if (StrategyControl->num_partitions_per_node == 1)
+	{
+		/* fast-path */
+		index = node;
+	}
+	else
+	{
+		/* use PID to pick one of node's partitions */
+		index = (node * StrategyControl->num_partitions_per_node)
+			+ (pid % StrategyControl->num_partitions_per_node);
+	}
+
+	/* should have a valid partition index */
+	Assert((index >= 0) && (index < StrategyControl->num_partitions));
+
+	return index;
+}
+
+/*
+ * ChooseClockSweep
+ *		pick a clocksweep partition based on NUMA node and PID
+ *
+ * Pick a partition mapped to the NUMA node the backend is currently running
+ * on, and use PID if there are multiple partitions per node. Without NUMA
+ * supported/enabled, use just PID.
+ *
+ * XXX Maybe we should do both the total and "per group" counts a power of
+ * two? That'd allow using shifts instead of divisions in the calculation,
+ * and that's cheaper. But how would that deal with odd number of nodes?
+ */
+static ClockSweep *
+ChooseClockSweep(void)
+{
+	int			index = ClockSweepPartitionIndex();
+
+	return &StrategyControl->sweeps[index];
 }
 
 /*
@@ -242,10 +375,37 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r
 	 * We count buffer allocation requests so that the bgwriter can estimate
 	 * the rate of buffer consumption.  Note that buffers recycled by a
 	 * strategy object are intentionally not counted here.
+	 *
+	 * XXX It's not quite right we call ChooseClockSweep twice - now, and then
+	 * a couple lines later (through ClockSweepTick). If the process moves
+	 * between CPUs / NUMA nodes in between, these call may pick different
+	 * partitions, confusing the logic a bit.
 	 */
-	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
+	pg_atomic_fetch_add_u32(&ChooseClockSweep()->numBufferAllocs, 1);
 
-	/* Use the "clock sweep" algorithm to find a free buffer */
+	/*
+	 * Use the "clock sweep" algorithm to find a free buffer
+	 *
+	 * XXX Note that ClockSweepTick() is NUMA-aware, i.e. it only looks at
+	 * buffers from a single partition, aligned with the NUMA node. That means
+	 * a process "sweeps" only a fraction of buffers, even if the other buffers
+	 * are better candidates for eviction. Maybe there should be some logic to
+	 * "steal" buffers from other partitions or other nodes?
+	 *
+	 * XXX This only searches a single partition, which can result in "no
+	 * unpinned buffers available" even if there are buffers in other
+	 * partitions. Needs to scan partitions if needed, as a fallback.
+	 *
+	 * XXX Would that also mean we should have multiple bgwriters, one for each
+	 * node, or would one bgwriter still handle all nodes?
+	 *
+	 * XXX Also, the trycounter should not be set to NBuffers, but to buffer
+	 * count for that one partition. In fact, this should not call ClockSweepTick
+	 * for every iteration. The call is likely quite expensive (does a lot
+	 * of stuff), and also may return a different partition on each call.
+	 * We should just do it once, and then do the for(;;) loop. And then
+	 * maybe advance to the next partition, until we scan through all of them.
+	 */
 	trycounter = NBuffers;
 	for (;;)
 	{
@@ -325,6 +485,48 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r
 	}
 }
 
+/*
+ * StrategySyncPrepare -- prepare for sync of all partitions
+ *
+ * Determine the number of clocksweep partitions, and calculate the recent
+ * buffers allocs (as a sum of all the partitions). This allows BgBufferSync
+ * to calculate average number of allocations per partition for the next
+ * sync cycle.
+ *
+ * In addition it returns the count of recent buffer allocs, which is a total
+ * summed from all partitions. The alloc counts are reset after being read,
+ * as the partitions are walked.
+ */
+void
+StrategySyncPrepare(int *num_parts, uint32 *num_buf_alloc)
+{
+	*num_buf_alloc = 0;
+	*num_parts = StrategyControl->num_partitions;
+
+	/*
+	 * We lock the partitions one by one, so not exacly in sync, but that
+	 * should be fine. We're only looking for heuristics anyway.
+	 */
+	for (int i = 0; i < StrategyControl->num_partitions; i++)
+	{
+		ClockSweep *sweep = &StrategyControl->sweeps[i];
+
+		/* XXX Do we need the lock, if we're only accessing atomics? Surely not. */
+		/* XXX Are we ever calling this without num_buf_alloc? */
+		SpinLockAcquire(&sweep->clock_sweep_lock);
+		if (num_buf_alloc)
+		{
+			uint32	allocs = pg_atomic_exchange_u32(&sweep->numBufferAllocs, 0);
+
+			/* include the count in the running total */
+			pg_atomic_fetch_add_u64(&sweep->numTotalAllocs, allocs);
+
+			*num_buf_alloc += allocs;
+		}
+		SpinLockRelease(&sweep->clock_sweep_lock);
+	}
+}
+
 /*
  * StrategySyncStart -- tell BgBufferSync where to start syncing
  *
@@ -332,37 +534,44 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r
  * BgBufferSync() will proceed circularly around the buffer array from there.
  *
  * In addition, we return the completed-pass count (which is effectively
- * the higher-order bits of nextVictimBuffer) and the count of recent buffer
- * allocs if non-NULL pointers are passed.  The alloc count is reset after
- * being read.
+ * the higher-order bits of nextVictimBuffer).
+ *
+ * This only considers a single clocksweep partition, as BgBufferSync looks
+ * at them one by one.
  */
 int
-StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
+StrategySyncStart(int partition, uint32 *complete_passes,
+				  int *first_buffer, int *num_buffers)
 {
 	uint32		nextVictimBuffer;
 	int			result;
+	ClockSweep *sweep = &StrategyControl->sweeps[partition];
 
-	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
-	nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
-	result = nextVictimBuffer % NBuffers;
+	Assert((partition >= 0) && (partition < StrategyControl->num_partitions));
+
+	SpinLockAcquire(&sweep->clock_sweep_lock);
+	nextVictimBuffer = pg_atomic_read_u32(&sweep->nextVictimBuffer);
+	result = nextVictimBuffer % sweep->numBuffers;
+
+	*first_buffer = sweep->firstBuffer;
+	*num_buffers = sweep->numBuffers;
 
 	if (complete_passes)
 	{
-		*complete_passes = StrategyControl->completePasses;
+		*complete_passes = sweep->completePasses;
 
 		/*
 		 * Additionally add the number of wraparounds that happened before
 		 * completePasses could be incremented. C.f. ClockSweepTick().
 		 */
-		*complete_passes += nextVictimBuffer / NBuffers;
+		*complete_passes += nextVictimBuffer / sweep->numBuffers;
 	}
+	SpinLockRelease(&sweep->clock_sweep_lock);
 
-	if (num_buf_alloc)
-	{
-		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
-	}
-	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
-	return result;
+	/* XXX buffer IDs start at 1, we're calculating 0-based indexes */
+	Assert(BufferIsValid(1 + sweep->firstBuffer + result));
+
+	return sweep->firstBuffer + result;
 }
 
 /*
@@ -394,8 +603,14 @@ StrategyNotifyBgWriter(int bgwprocno)
 static void
 StrategyCtlShmemRequest(void *arg)
 {
+	int			num_partitions;
+
+	/* get the number of buffer partitions */
+	BufferPartitionsCalculate(NULL, &num_partitions, NULL);
+
 	ShmemRequestStruct(.name = "Buffer Strategy Status",
-					   .size = sizeof(BufferStrategyControl),
+					   .size = offsetof(BufferStrategyControl, sweeps) +
+							mul_size(num_partitions, sizeof(ClockSweep)),
 					   .ptr = (void **) &StrategyControl
 		);
 }
@@ -408,12 +623,42 @@ StrategyCtlShmemInit(void *arg)
 {
 	SpinLockInit(&StrategyControl->buffer_strategy_lock);
 
-	/* Initialize the clock-sweep pointer */
-	pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
+	/* Remember the number of partitions */
+	BufferPartitionsParams(&StrategyControl->num_nodes,
+						   &StrategyControl->num_partitions,
+						   &StrategyControl->num_partitions_per_node);
+
+	/* Initialize the clock sweep pointers (for all partitions) */
+	for (int i = 0; i < StrategyControl->num_partitions; i++)
+	{
+		int			node,
+					num_buffers,
+					first_buffer,
+					last_buffer;
+
+		SpinLockInit(&StrategyControl->sweeps[i].clock_sweep_lock);
 
-	/* Clear statistics */
-	StrategyControl->completePasses = 0;
-	pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+		pg_atomic_init_u32(&StrategyControl->sweeps[i].nextVictimBuffer, 0);
+
+		/* get info about the buffer partition */
+		BufferPartitionGet(i, &node, &num_buffers,
+						   &first_buffer, &last_buffer);
+
+		/*
+		 * FIXME This may not quite right, because if NBuffers is not a
+		 * perfect multiple of numBuffers, the last partition will have
+		 * numBuffers set too high. buf_init handles this by tracking the
+		 * remaining number of buffers, and not overflowing.
+		 */
+		StrategyControl->sweeps[i].node = node;
+		StrategyControl->sweeps[i].numBuffers = num_buffers;
+		StrategyControl->sweeps[i].firstBuffer = first_buffer;
+
+		/* Clear statistics */
+		StrategyControl->sweeps[i].completePasses = 0;
+		pg_atomic_init_u32(&StrategyControl->sweeps[i].numBufferAllocs, 0);
+		pg_atomic_init_u64(&StrategyControl->sweeps[i].numTotalAllocs, 0);
+	}
 
 	/* No pending notification */
 	StrategyControl->bgwprocno = -1;
@@ -777,3 +1022,23 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_r
 
 	return true;
 }
+
+void
+ClockSweepPartitionGetInfo(int idx,
+						   uint32 *complete_passes, uint32 *next_victim_buffer,
+						   uint64 *buffer_total_allocs, uint32 *buffer_allocs)
+{
+	ClockSweep *sweep = &StrategyControl->sweeps[idx];
+
+	Assert((idx >= 0) && (idx < StrategyControl->num_partitions));
+
+	/* get the clocksweep stats */
+	*complete_passes = sweep->completePasses;
+	*next_victim_buffer = pg_atomic_read_u32(&sweep->nextVictimBuffer);
+
+	*buffer_allocs = pg_atomic_read_u32(&sweep->numBufferAllocs);
+	*buffer_total_allocs = pg_atomic_read_u64(&sweep->numTotalAllocs);
+
+	/* calculate the actual buffer ID */
+	*next_victim_buffer = sweep->firstBuffer + (*next_victim_buffer % sweep->numBuffers);
+}
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index e944cee2e91..5ab0cee4281 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -593,7 +593,9 @@ extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
 extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
 								 BufferDesc *buf, bool from_ring);
 
-extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
+extern void StrategySyncPrepare(int *num_parts, uint32 *num_buf_alloc);
+extern int	StrategySyncStart(int partition, uint32 *complete_passes,
+							  int *first_buffer, int *num_buffers);
 extern void StrategyNotifyBgWriter(int bgwprocno);
 
 /* buf_table.c */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 1cf09e8fb7c..e0bb4cc1df1 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -410,6 +410,11 @@ extern int	GetAccessStrategyBufferCount(BufferAccessStrategy strategy);
 extern int	GetAccessStrategyPinLimit(BufferAccessStrategy strategy);
 
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);
+extern void ClockSweepPartitionGetInfo(int idx,
+									   uint32 *complete_passes,
+									   uint32 *next_victim_buffer,
+									   uint64 *buffer_total_allocs,
+									   uint32 *buffer_allocs);
 
 
 /* inline functions */
diff --git a/src/test/recovery/t/027_stream_regress.pl b/src/test/recovery/t/027_stream_regress.pl
index ae977297849..f68e08e5697 100644
--- a/src/test/recovery/t/027_stream_regress.pl
+++ b/src/test/recovery/t/027_stream_regress.pl
@@ -18,6 +18,11 @@ $node_primary->adjust_conf('postgresql.conf', 'max_connections', '25');
 $node_primary->append_conf('postgresql.conf',
 	'max_prepared_transactions = 10');
 
+# The default is 1MB, which is not enough with clock-sweep partitioning.
+# Increase to 32MB, so that we don't get "no unpinned buffers".
+$node_primary->append_conf('postgresql.conf',
+	'shared_buffers = 32MB');
+
 # Enable pg_stat_statements to force tests to do query jumbling.
 # pg_stat_statements.max should be large enough to hold all the entries
 # of the regression database.
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index ea756015249..183570b4d43 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -451,6 +451,7 @@ ClientCertName
 ClientConnectionInfo
 ClientData
 ClientSocket
+ClockSweep
 ClonePtrType
 ClosePortalStmt
 ClosePtrType
-- 
2.54.0

