From af009b4530b263f3465bdb539c6976015ef8a173 Mon Sep 17 00:00:00 2001 From: Jakub Wartak Date: Thu, 11 Jun 2026 12:38:43 +0200 Subject: [PATCH vXXX1] Add partitioned clocksweep and NUMA goodies. 1. Add three clocksweep GUCs to allow manipulation of the partitioned clocksweep at runtime. 2. Add pg_buffercache_set_partition(int, int[]) to alter partition allocation weights (requires debug_clocksweep_balance_recalc=off). 3. Add debug_numa_node GUC to pin the backend to a NUMA node. --- .../pg_buffercache--1.7--1.8.sql | 7 ++ contrib/pg_buffercache/pg_buffercache_pages.c | 37 ++++++++ src/backend/storage/buffer/freelist.c | 86 ++++++++++++++++++- src/backend/tcop/postgres.c | 57 ++++++++++++ src/backend/utils/misc/guc_parameters.dat | 31 +++++++ src/include/miscadmin.h | 1 + src/include/storage/bufmgr.h | 6 ++ 7 files changed, 224 insertions(+), 1 deletion(-) diff --git a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql index 43d2e84f9d2..9d8f4969555 100644 --- a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql +++ b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql @@ -25,9 +25,16 @@ CREATE VIEW pg_buffercache_partitions AS num_req_allocs bigint, -- handled allocs (current cycle) weights int[]); -- balancing weights +-- Register the function to set clock-sweep balance weights. +CREATE FUNCTION pg_buffercache_set_partition(IN partition int, IN weights int[]) +RETURNS void +AS 'MODULE_PATHNAME', 'pg_buffercache_set_partition' +LANGUAGE C VOLATILE PARALLEL UNSAFE; + -- Don't want these to be available to public. 
REVOKE ALL ON FUNCTION pg_buffercache_partitions() FROM PUBLIC; REVOKE ALL ON pg_buffercache_partitions FROM PUBLIC; +REVOKE ALL ON FUNCTION pg_buffercache_set_partition(int, int[]) FROM PUBLIC; GRANT EXECUTE ON FUNCTION pg_buffercache_partitions() TO pg_monitor; GRANT SELECT ON pg_buffercache_partitions TO pg_monitor; diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c index 8e494219abc..fdd15e60798 100644 --- a/contrib/pg_buffercache/pg_buffercache_pages.c +++ b/contrib/pg_buffercache/pg_buffercache_pages.c @@ -79,6 +79,7 @@ PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty); PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_relation); PG_FUNCTION_INFO_V1(pg_buffercache_mark_dirty_all); PG_FUNCTION_INFO_V1(pg_buffercache_partitions); +PG_FUNCTION_INFO_V1(pg_buffercache_set_partition); /* Only need to touch memory once per backend process lifetime */ @@ -1033,3 +1034,39 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS) else SRF_RETURN_DONE(funcctx); } + +/* + * Set the clock-sweep balance weights for a single partition. 
+ */ +Datum +pg_buffercache_set_partition(PG_FUNCTION_ARGS) +{ + int partition = PG_GETARG_INT32(0); + ArrayType *array = PG_GETARG_ARRAYTYPE_P(1); + Datum *elems; + bool *nulls; + int nelems; + int *weights; + + if (ARR_NDIM(array) > 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weights must be a one-dimensional array"))); + + deconstruct_array_builtin(array, INT4OID, &elems, &nulls, &nelems); + + weights = palloc_array(int, nelems); + for (int i = 0; i < nelems; i++) + { + if (nulls[i]) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("weights must not contain NULL values"))); + + weights[i] = DatumGetInt32(elems[i]); + } + + ClockSweepSetWeights(partition, weights, nelems); + + PG_RETURN_VOID(); +} diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 1ac1e3e3490..e677c71e0b3 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -55,6 +55,26 @@ */ #define CLOCKSWEEP_HISTORY_COEFF 0.5 +/* + * GUCs controlling the NUMA-aware clock-sweep behavior. + * + * clocksweep_balance - when enabled, allocations may get redirected between + * clock-sweep partitions to keep them balanced (see StrategySyncBalance). + * + * clocksweep_balance_recalc - when enabled, the balance weights are + * periodically recalculated (see StrategySyncBalance). Disabling this keeps + * the current weights, e.g. ones configured manually using + * pg_buffercache_set_partition(), while still using them to balance + * allocations (if clocksweep_balance is enabled). + * + * clocksweep_scan_all_partitions - when enabled, looking for a free buffer + * scans all clock-sweep partitions (in a round-robin way), not just the + * backend's "home" partition. + */ +bool clocksweep_balance = true; +bool clocksweep_balance_recalc = true; +bool clocksweep_scan_all_partitions = true; + /* * Information about one partition of the ClockSweep (on a subset of buffers). 
* @@ -436,8 +456,11 @@ ChooseClockSweep(bool balance) * When balancing allocations, redirect the allocations to other partitions * according to the budgets. We move through partitions in a round-robin way, * after allocating the "budget" of allocations from the current one. + * + * Balancing can be disabled at runtime using the clocksweep_balance GUC, in + * which case allocations always stay in the backend's "home" partition. */ - if (balance) + if (balance && clocksweep_balance) { /* * Ran out of budget from the current partition? Move to the next one @@ -551,6 +574,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r * Start with the "preferred" partition, and then proceed in a round-robin * manner. If we cycle back to the starting partition, it means none of the * partitions has unpinned buffers. + * + * Scanning of the other partitions can be disabled at runtime using the + * clocksweep_scan_all_partitions GUC. In that case we only scan the + * backend's "home" partition, and fail if it has no unpinned buffers. */ sweep = ChooseClockSweep(true); sweep_start = sweep; @@ -562,6 +589,10 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint64 *buf_state, bool *from_r if (buf != NULL) return buf; + /* don't look at other partitions unless allowed to */ + if (!clocksweep_scan_all_partitions) + break; + /* * Try advancing to the next partition, round-robin (if last partition, * wrap around to the beginning). @@ -739,6 +770,9 @@ StrategySyncBalance(void) avg_allocs, /* average allocations (per partition) */ delta_allocs = 0; /* sum of allocs above average */ + if (!clocksweep_balance || !clocksweep_balance_recalc) + return; + /* * Collect the number of allocations requested in the past interval. * While at it, reset the counter to start the new interval. 
@@ -1481,3 +1515,53 @@ ClockSweepPartitionGetInfo(int idx, (*weights)[i] = (int) sweep->balance[i]; } } + +/* + * ClockSweepSetWeights -- override the clock-sweep balance weights of a + * single partition, after validating the partition number and the weights. + */ +void +ClockSweepSetWeights(int partition, int *weights, int nweights) +{ + ClockSweep *sweep; + + /* + * Disallow manual weights while the automatic recalculation is enabled, as + * StrategySyncBalance would just recompute (and overwrite) them. + */ + if (clocksweep_balance_recalc) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot set clock-sweep weights while debug_clocksweep_balance_recalc is enabled"), + errhint("Set debug_clocksweep_balance_recalc to off before setting the weights manually."))); + + if ((partition < 0) || (partition >= StrategyControl->num_partitions)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("clock-sweep partition %d out of range", partition), + errhint("There are %d clock-sweep partitions, numbered from 0.", + StrategyControl->num_partitions))); + + if (nweights != StrategyControl->num_partitions) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("number of weights (%d) does not match number of clock-sweep partitions (%d)", + nweights, StrategyControl->num_partitions))); + + for (int i = 0; i < nweights; i++) + { + if ((weights[i] < 0) || (weights[i] > 100)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("clock-sweep weight %d out of range", + weights[i]), + errhint("Each weight must be between 0 and 100."))); + } + + sweep = &StrategyControl->sweeps[partition]; + + SpinLockAcquire(&sweep->clock_sweep_lock); + for (int j = 0; j < nweights; j++) + sweep->balance[j] = (uint8) weights[j]; + SpinLockRelease(&sweep->clock_sweep_lock); +} diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index b591092ad6e..a5d6a4bc007 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -91,6 +91,7 @@ 
#include "utils/timeout.h" #include "utils/timestamp.h" #include "utils/varlena.h" +#include <numa.h> /* ---------------- * global variables @@ -115,6 +116,9 @@ int client_connection_check_interval = 0; /* flags for non-system relation kinds to restrict use */ int restrict_nonsystem_relation_kind; +/* NUMA node to pin the backend to at query start; -1 disables pinning */ +int debug_numa_node = -1; + /* * Include signal sender PID/UID in the server log when available * (SA_SIGINFO). The caller must supply the already-captured pid and uid @@ -1025,6 +1029,53 @@ pg_plan_queries(List *querytrees, const char *query_string, int cursorOptions, } +/* + * process_debug_numa_node + * + * If the debug_numa_node GUC is set (>= 0), pin this backend to run on the + * CPUs of the requested NUMA node. -1 (the default) disables it. + */ +static void +process_debug_numa_node(void) +{ +#ifdef USE_LIBNUMA + static int applied_numa_node = -1; + + if (debug_numa_node == applied_numa_node) + return; + + /* Nothing we can do if the kernel/library has no NUMA support. */ + if (numa_available() < 0) + { + applied_numa_node = debug_numa_node; + return; + } + + if (debug_numa_node < 0) + { + /* Pinning disabled: allow running on all nodes again. */ + numa_run_on_node_mask(numa_all_nodes_ptr); + } + else if (debug_numa_node > numa_max_node()) + { + ereport(WARNING, + (errmsg("debug_numa_node %d exceeds the highest available NUMA node %d, ignoring", + debug_numa_node, numa_max_node()))); + } + else if (numa_run_on_node(debug_numa_node) != 0) + { + ereport(WARNING, + (errmsg("could not pin backend to NUMA node %d: %m", + debug_numa_node))); + } + else + elog(DEBUG1, "pinned backend to NUMA node %d", debug_numa_node); + + applied_numa_node = debug_numa_node; +#endif +} + + /* * exec_simple_query * @@ -1049,6 +1100,9 @@ exec_simple_query(const char *query_string) pgstat_report_activity(STATE_RUNNING, query_string); + /* Pin the backend to the configured NUMA node, if requested. 
*/ + process_debug_numa_node(); + TRACE_POSTGRESQL_QUERY_START(query_string); /* @@ -2190,6 +2244,9 @@ exec_execute_message(const char *portal_name, long max_rows) pgstat_report_activity(STATE_RUNNING, sourceText); + /* Pin the backend to the configured NUMA node, if requested. */ + process_debug_numa_node(); + foreach(lc, portal->stmts) { PlannedStmt *stmt = lfirst_node(PlannedStmt, lc); diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 2e71c04282c..b72fce4b92b 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -641,6 +641,27 @@ boot_val => 'DEFAULT_ASSERT_ENABLED', }, +{ name => 'debug_clocksweep_balance', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Enables balancing of buffer allocations between clock-sweep partitions.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'clocksweep_balance', + boot_val => 'true' +}, + +{ name => 'debug_clocksweep_balance_recalc', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Enables periodic recalculation of clock-sweep partition balance weights.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'clocksweep_balance_recalc', + boot_val => 'true' +}, + +{ name => 'debug_clocksweep_scan_all_partitions', type => 'bool', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Enables scanning all clock-sweep partitions when looking for a free buffer.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'clocksweep_scan_all_partitions', + boot_val => 'true' +}, + { name => 'debug_copy_parse_plan_trees', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS', short_desc => 'Set this to force all parse and plan trees to be passed through copyObject(), to facilitate catching errors and omissions in copyObject().', flags => 'GUC_NOT_IN_SAMPLE', @@ -693,6 +714,16 @@ options => 'debug_logical_replication_streaming_options', }, 
+{ name => 'debug_numa_node', type => 'int', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', + short_desc => 'Pins the backend to the given NUMA node at query start.', + long_desc => '-1 (the default) disables pinning.', + flags => 'GUC_NOT_IN_SAMPLE', + variable => 'debug_numa_node', + boot_val => '-1', + min => '-1', + max => 'INT_MAX', +}, + { name => 'debug_parallel_query', type => 'enum', context => 'PGC_USERSET', group => 'DEVELOPER_OPTIONS', short_desc => 'Forces the planner\'s use parallel query nodes.', long_desc => 'This can be useful for testing the parallel query infrastructure by forcing the planner to generate plans that contain nodes that perform tuple communication between workers and the main process.', diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 13bb29f8702..917a31b8140 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -215,6 +215,7 @@ extern PGDLLIMPORT bool MyDatabaseHasLoginEventTriggers; extern PGDLLIMPORT bool shmem_populate; extern PGDLLIMPORT bool shmem_interleave; +extern PGDLLIMPORT int debug_numa_node; /* diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index f3f1d046f95..8693fe57dd0 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -190,6 +190,11 @@ extern PGDLLIMPORT int bgwriter_lru_maxpages; extern PGDLLIMPORT double bgwriter_lru_multiplier; extern PGDLLIMPORT bool track_io_timing; +/* in freelist.c */ +extern PGDLLIMPORT bool clocksweep_balance; +extern PGDLLIMPORT bool clocksweep_balance_recalc; +extern PGDLLIMPORT bool clocksweep_scan_all_partitions; + #define DEFAULT_EFFECTIVE_IO_CONCURRENCY 16 #define DEFAULT_MAINTENANCE_IO_CONCURRENCY 16 extern PGDLLIMPORT int effective_io_concurrency; @@ -426,6 +431,7 @@ extern void ClockSweepPartitionGetInfo(int idx, uint64 *buffer_total_req_allocs, uint32 *buffer_req_allocs, int **weights); +extern void ClockSweepSetWeights(int partition, int *weights, int nweights); /* inline functions 
*/ -- 2.43.0