From f5b7b6733419186e56b06ed1d8b6da7e34984c2f Mon Sep 17 00:00:00 2001
From: Jakub Wartak <jakub.wartak@enterprisedb.com>
Date: Thu, 11 Jun 2026 12:38:43 +0200
Subject: [PATCH v20260624 7/7] Add partitioned clocksweep and NUMA goodies.

1. Add three debug_clocksweep_* GUCs to allow manipulation of the partitioned
   clocksweep at runtime.
2. Add pg_buffercache_set_partition(int, int[]) to override a partition's
   balance weights (requires debug_clocksweep_balance_recalc=off).
3. Add debug_numa_node GUC to pin a backend to a NUMA node.
---
 .../pg_buffercache--1.7--1.8.sql              |  7 ++
 contrib/pg_buffercache/pg_buffercache_pages.c | 37 ++++++++
 src/backend/storage/buffer/freelist.c         | 86 ++++++++++++++++++-
 src/backend/tcop/postgres.c                   | 57 ++++++++++++
 src/backend/utils/misc/guc_parameters.dat     | 31 +++++++
 src/include/miscadmin.h                       |  1 +
 src/include/storage/bufmgr.h                  |  6 ++
 7 files changed, 224 insertions(+), 1 deletion(-)

diff --git a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
index 43d2e84f9d2..9d8f4969555 100644
--- a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
+++ b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
@@ -25,9 +25,16 @@ CREATE VIEW pg_buffercache_partitions AS
 	 num_req_allocs bigint,		-- handled allocs (current cycle)
 	 weights int[]);			-- balancing weights
 
+-- Set clock-sweep balance weights; STRICT as the C code does not expect NULLs.
+CREATE FUNCTION pg_buffercache_set_partition(IN partition int, IN weights int[])
+RETURNS void
+AS 'MODULE_PATHNAME', 'pg_buffercache_set_partition'
+LANGUAGE C STRICT VOLATILE PARALLEL UNSAFE;
+
 -- Don't want these to be available to public.
 REVOKE ALL ON FUNCTION pg_buffercache_partitions() FROM PUBLIC;
 REVOKE ALL ON pg_buffercache_partitions FROM PUBLIC;
+REVOKE ALL ON FUNCTION pg_buffercache_set_partition(int, int[]) FROM PUBLIC;
 
 GRANT EXECUTE ON FUNCTION pg_buffercache_partitions() TO pg_monitor;
 GRANT SELECT ON pg_buffercache_partitions TO pg_monitor;
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index f26b2332c1d..2a1b7cb7549 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -81,6 +81,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty);
 PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation);
 PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all);
 PG_FUNCTION_INFO_V1(pg_buffercache_partitions);
+PG_FUNCTION_INFO_V1(pg_buffercache_set_partition);
 
 
 /* Only need to touch memory once per backend process lifetime */
@@ -1077,3 +1078,39 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS)
 	else
 		SRF_RETURN_DONE(funcctx);
 }
+
+/*
+ * Set the clock-sweep balance weights of one partition from a NULL-free int[].
+ */
+Datum
+pg_buffercache_set_partition(PG_FUNCTION_ARGS)
+{
+	int			partition = PG_GETARG_INT32(0);
+	ArrayType  *array = PG_GETARG_ARRAYTYPE_P(1);
+	Datum	   *elems;
+	bool	   *nulls;
+	int			nelems;
+	int		   *weights;
+
+	if (ARR_NDIM(array) > 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("weights must be a one-dimensional array")));
+
+	deconstruct_array_builtin(array, INT4OID, &elems, &nulls, &nelems);
+
+	weights = palloc_array(int, nelems);
+	for (int i = 0; i < nelems; i++)
+	{
+		if (nulls[i])
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("weights must not contain NULL values")));
+
+		weights[i] = DatumGetInt32(elems[i]);
+	}
+
+	ClockSweepSetWeights(partition, weights, nelems);
+
+	PG_RETURN_VOID();
+}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 1ac1e3e3490..e677c71e0b3 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -55,6 +55,26 @@
  */
 #define CLOCKSWEEP_HISTORY_COEFF	0.5
 
+/*
+ * Developer GUCs (named debug_<variable>) controlling clock-sweep partitions.
+ *
+ * clocksweep_balance - when enabled, allocations may get redirected between
+ * clock-sweep partitions to keep them balanced (see StrategySyncBalance).
+ *
+ * clocksweep_balance_recalc - when enabled, the balance weights are
+ * periodically recalculated (see StrategySyncBalance). Disabling this keeps
+ * the current weights, e.g. ones configured manually using
+ * pg_buffercache_set_partition(), while still using them to balance
+ * allocations (if clocksweep_balance is enabled).
+ *
+ * clocksweep_scan_all_partitions - when enabled, looking for a free buffer
+ * scans all clock-sweep partitions (in a round-robin way), not just the
+ * backend's "home" partition.
+ */
+bool		clocksweep_balance = true;
+bool		clocksweep_balance_recalc = true;
+bool		clocksweep_scan_all_partitions = true;
+
 /*
  * Information about one partition of the ClockSweep (on a subset of buffers).
  *
@@ -436,8 +456,11 @@ ChooseClockSweep(bool balance)
 	 * When balancing allocations, redirect the allocations to other partitions
 	 * according to the budgets. We move through partitions in a round-robin way,
 	 * after allocating the "budget" of allocations from the current one.
+	 *
+	 * Balancing can be disabled at runtime using the clocksweep_balance GUC, in
+	 * which case allocations always stay in the backend's "home" partition.
 	 */
-	if (balance)
+	if (balance && clocksweep_balance)
 	{
 		/*
 		 * Ran out of budget from the current partition? Move to the next one
@@ -551,6 +574,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r
 	 * Start with the "preferred" partition, and then proceed in a round-robin
 	 * manner. If we cycle back to the starting partition, it means none of the
 	 * partitions has unpinned buffers.
+	 *
+	 * Scanning of the other partitions can be disabled at runtime using the
+	 * clocksweep_scan_all_partitions GUC. In that case we only scan the
+	 * backend's "home" partition, and fail if it has no unpinned buffers.
 	 */
 	sweep = ChooseClockSweep(true);
 	sweep_start = sweep;
@@ -562,6 +589,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r
 		if (buf != NULL)
 			return buf;
 
+		/* don't look at other partitions unless allowed to */
+		if (!clocksweep_scan_all_partitions)
+			break;
+
 		/*
 		 * Try advancing to the next partition, round-robin (if last partition,
 		 * wrap around to the beginning).
@@ -739,6 +770,9 @@ StrategySyncBalance(void)
 			avg_allocs,			/* average allocations (per partition) */
 			delta_allocs = 0;	/* sum of allocs above average */
 
+	if (!clocksweep_balance || !clocksweep_balance_recalc)
+		return;
+
 	/*
 	 * Collect the number of allocations requested in the past interval.
 	 * While at it, reset the counter to start the new interval.
@@ -1481,3 +1515,53 @@ ClockSweepPartitionGetInfo(int idx,
 		(*weights)[i] = (int) sweep->balance[i];
 	}
 }
+
+/*
+ * ClockSweepSetWeights -- override the balance weights of one clock-sweep
+ * partition; errors out unless recalc is off and all inputs are in range.
+ */
+void
+ClockSweepSetWeights(int partition, int *weights, int nweights)
+{
+	ClockSweep *sweep;
+
+	/*
+	 * Disallow manual weights while the automatic recalculation is enabled, as
+	 * StrategySyncBalance would just recompute (and overwrite) them.
+	 */
+	if (clocksweep_balance_recalc)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("cannot set clock-sweep weights while debug_clocksweep_balance_recalc is enabled"),
+				 errhint("Set debug_clocksweep_balance_recalc to off before setting the weights manually.")));
+
+	if ((partition < 0) || (partition >= StrategyControl->num_partitions))
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("clock-sweep partition %d out of range", partition),
+				 errhint("There are %d clock-sweep partitions, numbered from 0.",
+						 StrategyControl->num_partitions)));
+
+	if (nweights != StrategyControl->num_partitions)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("number of weights (%d) does not match number of clock-sweep partitions (%d)",
+						nweights, StrategyControl->num_partitions)));
+
+	for (int i = 0; i < nweights; i++)
+	{
+		if ((weights[i] < 0) || (weights[i] > 100))
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("clock-sweep weight %d out of range",
+							weights[i]),
+					 errhint("Each weight must be between 0 and 100.")));
+	}
+
+	sweep = &StrategyControl->sweeps[partition];
+
+	SpinLockAcquire(&sweep->clock_sweep_lock);
+	for (int j = 0; j < nweights; j++)
+		sweep->balance[j] = (uint8) weights[j];
+	SpinLockRelease(&sweep->clock_sweep_lock);
+}
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index dbef734a93f..02787062c83 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -86,6 +86,10 @@
 #include "utils/timeout.h"
 #include "utils/timestamp.h"
 #include "utils/varlena.h"
+/* libnuma is optional, so only include its header when it is available */
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#endif
 
 /* ----------------
  *		global variables
@@ -110,6 +111,9 @@ int			client_connection_check_interval = 0;
 /* flags for non-system relation kinds to restrict use */
 int			restrict_nonsystem_relation_kind;
 
+/* NUMA node to pin the backend to at query start; -1 disables pinning */
+int			debug_numa_node = -1;
+
 /*
  * Include signal sender PID/UID in the server log when available
  * (SA_SIGINFO). The caller must supply the already-captured pid and uid
@@ -1020,6 +1024,53 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions,
 }
 
 
+/*
+ * process_debug_numa_node
+ *
+ * If the debug_numa_node GUC is set (>= 0), pin this backend to run on the
+ * CPUs of that NUMA node; -1 (default) unpins.  No-op without USE_LIBNUMA.
+ */
+static void
+process_debug_numa_node(void)
+{
+#ifdef USE_LIBNUMA
+	static int	applied_numa_node = -1;
+
+	if (debug_numa_node == applied_numa_node)
+		return;
+
+	/* Nothing we can do if the kernel/library has no NUMA support. */
+	if (numa_available() < 0)
+	{
+		applied_numa_node = debug_numa_node;
+		return;
+	}
+
+	if (debug_numa_node < 0)
+	{
+		/* Pinning disabled: allow running on all nodes again. */
+		numa_run_on_node_mask(numa_all_nodes_ptr);
+	}
+	else if (debug_numa_node > numa_max_node())
+	{
+		ereport(WARNING,
+				(errmsg("debug_numa_node %d exceeds the highest available NUMA node %d, ignoring",
+						debug_numa_node, numa_max_node())));
+	}
+	else if (numa_run_on_node(debug_numa_node) != 0)
+	{
+		ereport(WARNING,
+				(errmsg("could not pin backend to NUMA node %d: %m",
+						debug_numa_node)));
+	}
+	else
+		elog(DEBUG1, "pinned backend to NUMA node %d", debug_numa_node);
+
+	applied_numa_node = debug_numa_node;
+#endif
+}
+
+
 /*
  * exec_simple_query
  *
@@ -1044,6 +1095,9 @@ exec_simple_query(const char *query_string)
 
 	pgstat_report_activity(STATE_RUNNING, query_string);
 
+	/* Pin the backend to the configured NUMA node, if requested. */
+	process_debug_numa_node();
+
 	TRACE_POSTGRESQL_QUERY_START(query_string);
 
 	/*
@@ -2185,6 +2239,9 @@ exec_execute_message(const char *portal_name, long max_rows)
 
 	pgstat_report_activity(STATE_RUNNING, sourceText);
 
+	/* Pin the backend to the configured NUMA node, if requested. */
+	process_debug_numa_node();
+
 	foreach(lc, portal->stmts)
 	{
 		PlannedStmt *stmt = lfirst_node(PlannedStmt, lc);
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 2e71c04282c..b72fce4b92b 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -641,6 +641,27 @@
   boot_val => 'DEFAULT_ASSERT_ENABLED',
 },
 
+{ name => 'debug_clocksweep_balance', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS',
+  short_desc => 'Enables balancing of buffer allocations between clock-sweep partitions.',
+  flags => 'GUC_NOT_IN_SAMPLE',
+  variable => 'clocksweep_balance',
+  boot_val => 'true'
+},
+
+{ name => 'debug_clocksweep_balance_recalc', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS',
+  short_desc => 'Enables periodic recalculation of clock-sweep partition balance weights.',
+  flags => 'GUC_NOT_IN_SAMPLE',
+  variable => 'clocksweep_balance_recalc',
+  boot_val => 'true'
+},
+
+{ name => 'debug_clocksweep_scan_all_partitions', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS',
+  short_desc => 'Enables scanning all clock-sweep partitions when looking for a free buffer.',
+  flags => 'GUC_NOT_IN_SAMPLE',
+  variable => 'clocksweep_scan_all_partitions',
+  boot_val => 'true'
+},
+
 { name => 'debug_copy_parse_plan_trees', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS',
   short_desc => 'Set this to force all parse and plan trees to be passed through copyObject(), to facilitate catching errors and omissions in copyObject().',
   flags => 'GUC_NOT_IN_SAMPLE',
@@ -693,6 +714,16 @@
   options => 'debug_logical_replication_streaming_options',
 },
 
+{ name => 'debug_numa_node', type => 'int', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS',
+  short_desc => 'Pins the backend to the given NUMA node at query start.',
+  long_desc => '-1 (the default) disables pinning.',
+  flags => 'GUC_NOT_IN_SAMPLE',
+  variable => 'debug_numa_node',
+  boot_val => '-1',
+  min => '-1',
+  max => 'INT_MAX',
+},
+
 { name => 'debug_parallel_query', type => 'enum', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS',
   short_desc => 'Forces the planner\'s use parallel query nodes.',
   long_desc => 'This can be useful for testing the parallel query infrastructure by forcing the planner to generate plans that contain nodes that perform tuple communication between workers and the main process.',
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index bf38aa6baa2..9590f7ac467 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -215,6 +215,7 @@ extern PGDLLIMPORT bool MyDatabaseHasLoginEventTriggers;
 
 extern PGDLLIMPORT bool shmem_populate;
 extern PGDLLIMPORT bool shmem_interleave;
+extern PGDLLIMPORT int debug_numa_node;
 
 
 /*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 02833b19b0c..b5c3873acae 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -190,6 +190,11 @@ extern PGDLLIMPORT int bgwriter_lru_maxpages;
 extern PGDLLIMPORT double bgwriter_lru_multiplier;
 extern PGDLLIMPORT bool track_io_timing;
 
+/* in freelist.c */
+extern PGDLLIMPORT bool clocksweep_balance;
+extern PGDLLIMPORT bool clocksweep_balance_recalc;
+extern PGDLLIMPORT bool clocksweep_scan_all_partitions;
+
 #define DEFAULT_EFFECTIVE_IO_CONCURRENCY 16
 #define DEFAULT_MAINTENANCE_IO_CONCURRENCY 16
 extern PGDLLIMPORT int effective_io_concurrency;
@@ -418,6 +423,7 @@ extern void ClockSweepPartitionGetInfo(int idx,
 									 uint64 *buffer_total_req_allocs,
 									 uint32 *buffer_req_allocs,
 									 int **weights);
+extern void ClockSweepSetWeights(int partition, int *weights, int nweights);
 
 /* inline functions */
 
-- 
2.54.0

