From 630e4f0c9ce995ff5a9f1aaf68e8c64836cadffb Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Tue, 27 May 2025 23:08:48 +0200
Subject: [PATCH v20250807 11/11] NUMA: pin backends to NUMA nodes

When initializing the backend, we pick a PGPROC entry from the right
NUMA node where the backend is running. But the process can move to a
different core / node, so to prevent that we pin it.
---
 src/backend/storage/lmgr/proc.c     | 21 +++++++++++++++++++++
 src/backend/utils/init/globals.c    |  1 +
 src/backend/utils/misc/guc_tables.c | 10 ++++++++++
 src/include/miscadmin.h             |  1 +
 4 files changed, 33 insertions(+)

diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 11259151a7d..dbb4cbb1bfa 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -766,6 +766,27 @@ InitProcess(void)
 	}
 	MyProcNumber = GetNumberFromPGProc(MyProc);
 
+	/*
+	 * Optionally, restrict the process to only run on CPUs from the same NUMA
+	 * as the PGPROC. We do this even if the PGPROC has a different NUMA node,
+	 * but not for PGPROC entries without a node (i.e. aux/2PC entries).
+	 *
+	 * This also means we only do this with numa_procs_interleave, because
+	 * without that we'll have numa_node=-1 for all PGPROC entries.
+	 *
+	 * FIXME add proper error-checking for libnuma functions
+	 */
+	if (numa_procs_pin && MyProc->numa_node != -1)
+	{
+		struct bitmask *cpumask = numa_allocate_cpumask();
+
+		numa_node_to_cpus(MyProc->numa_node, cpumask);
+
+		numa_sched_setaffinity(MyProcPid, cpumask);
+
+		numa_free_cpumask(cpumask);
+	}
+
 	/*
 	 * Cross-check that the PGPROC is of the type we expect; if this were not
 	 * the case, it would get returned to the wrong list.
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 6ee4684d1b8..3f88659b49f 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -150,6 +150,7 @@ bool		numa_buffers_interleave = false;
 bool		numa_localalloc = false;
 bool		numa_partition_freelist = false;
 bool		numa_procs_interleave = false;
+bool		numa_procs_pin = false;
 
 /* GUC parameters for vacuum */
 int			VacuumBufferUsageLimit = 2048;
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 7b718760248..862341e137e 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -2156,6 +2156,16 @@ struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 
+	{
+		{"numa_procs_pin", PGC_POSTMASTER, DEVELOPER_OPTIONS,
+			gettext_noop("Enables pinning backends to NUMA nodes (matching the PGPROC node)."),
+			gettext_noop("When enabled, sets affinity to CPUs from the same NUMA node."),
+		},
+		&numa_procs_pin,
+		false,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"sync_replication_slots", PGC_SIGHUP, REPLICATION_STANDBY,
 			gettext_noop("Enables a physical standby to synchronize logical failover replication slots from the primary server."),
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index cdeee8dccba..a97741c6707 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -182,6 +182,7 @@ extern PGDLLIMPORT bool numa_buffers_interleave;
 extern PGDLLIMPORT bool numa_localalloc;
 extern PGDLLIMPORT bool numa_partition_freelist;
 extern PGDLLIMPORT bool numa_procs_interleave;
+extern PGDLLIMPORT bool numa_procs_pin;
 
 extern PGDLLIMPORT int commit_timestamp_buffers;
 extern PGDLLIMPORT int multixact_member_buffers;
-- 
2.50.1

