From 238935f06c793dcd0271423af83e039d2ee00120 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Wed, 3 Jun 2026 16:30:28 +0200
Subject: [PATCH v20260605 1/6] Add debug_shmem_populate and debug_shmem_interleave GUCs

- debug_shmem_populate - Forces mmap() with MAP_POPULATE, which pre-faults
  all memory pages backing the shared memory segment.

- debug_shmem_interleave - Applies NUMA interleaving to the whole shared
  memory segment, to balance allocations between nodes.
---
 src/backend/port/sysv_shmem.c             | 47 +++++++++++++++++++++++
 src/backend/utils/misc/guc_parameters.dat | 14 +++++++
 src/include/miscadmin.h                   |  4 ++
 3 files changed, 65 insertions(+)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 2e3886cf9fe..9eaff838a04 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -27,6 +27,10 @@
 #include <sys/shm.h>
 #include <sys/stat.h>
 
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#endif
+
 #include "miscadmin.h"
 #include "port/pg_bitutils.h"
 #include "portability/mem.h"
@@ -98,6 +102,10 @@ void	   *UsedShmemSegAddr = NULL;
 static Size AnonymousShmemSize;
 static void *AnonymousShmem = NULL;
 
+/* GUC variables (debug_shmem_populate, debug_shmem_interleave) */
+bool shmem_populate = false;	/* mmap() the segment with MAP_POPULATE */
+bool shmem_interleave = false;	/* interleave the segment across NUMA nodes */
+
 static void *InternalIpcMemoryCreate(IpcMemoryKey memKey, Size size);
 static void IpcMemoryDetach(int status, Datum shmaddr);
 static void IpcMemoryDelete(int status, Datum shmId);
@@ -604,6 +612,21 @@ CreateAnonymousSegment(Size *size)
 	int			mmap_errno = 0;
 	int			mmap_flags = MAP_SHARED | MAP_ANONYMOUS | MAP_HASSEMAPHORE;
 
+	/* If requested, pre-fault the segment via MAP_POPULATE (Linux-only). */
+	if (shmem_populate)
+		mmap_flags |= MAP_POPULATE;
+
+#ifdef USE_LIBNUMA
+	/*
+	 * If requested, set the memory policy to interleaving before the mmap()
+	 * call. This really matters only with MAP_POPULATE, because without
+	 * page faults the memory does not actually get placed on any node. But
+	 * without MAP_POPULATE setting the policy is virtually free.
+	 */
+	if (shmem_interleave)
+		numa_set_interleave_mask(numa_all_nodes_ptr);
+#endif
+
 #ifndef MAP_HUGETLB
 	/* PGSharedMemoryCreate should have dealt with this case */
 	Assert(huge_pages != HUGE_PAGES_ON);
@@ -665,6 +688,30 @@ CreateAnonymousSegment(Size *size)
 						 allocsize) : 0));
 	}
 
+#ifdef USE_LIBNUMA
+	/*
+	 * If we set the policy to interleaving by numa_set_interleave_mask(), undo
+	 * it now by setting the policy to localalloc. With MAP_POPULATE, all the
+	 * pages were faulted and are now interleaved on the available nodes.
+	 *
+	 * To handle the case without MAP_POPULATE, apply the interleaving policy to
+	 * the shared memory segment allocated by mmap() before touching it in any
+	 * way, so that it gets placed on the correct node on first access.
+	 *
+	 * This matters especially with huge pages, where it's possible to run out
+	 * of huge pages on some nodes and then crash. By explicitly interleaving
+	 * the whole segment, that's much less likely.
+	 */
+	if (shmem_interleave)
+	{
+		/* undo the policy set by numa_set_interleave_mask() earlier */
+		numa_set_localalloc();
+
+		/* set interleaving policy for not yet faulted memory */
+		numa_interleave_memory(ptr, allocsize, numa_all_nodes_ptr);
+	}
+#endif
+
 	*size = allocsize;
 	return ptr;
 }
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index afaa058b046..f15e74198c5 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -740,6 +740,20 @@
   ifdef => 'DEBUG_NODE_TESTS_ENABLED',
 },
 
+{ name => 'debug_shmem_interleave', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS',
+  short_desc => 'Forces interleaving for the whole shared memory segment.',
+  flags => 'GUC_NOT_IN_SAMPLE',
+  variable => 'shmem_interleave',
+  boot_val => 'false'
+},
+
+{ name => 'debug_shmem_populate', type => 'bool', context => 'PGC_POSTMASTER', group => 'DEVELOPER_OPTIONS',
+  short_desc => 'Populates (faults) the whole shared memory segment using MAP_POPULATE.',
+  flags => 'GUC_NOT_IN_SAMPLE',
+  variable => 'shmem_populate',
+  boot_val => 'false'
+},
+
 { name => 'debug_write_read_parse_plan_trees', type => 'bool', context => 'PGC_SUSET', group => 'DEVELOPER_OPTIONS',
   short_desc => 'Set this to force all parse and plan trees to be passed through outfuncs.c/readfuncs.c, to facilitate catching errors and omissions in those modules.',
   flags => 'GUC_NOT_IN_SAMPLE',
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 7de0a115402..13bb29f8702 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -213,6 +213,10 @@ extern PGDLLIMPORT Oid MyDatabaseTableSpace;
 
 extern PGDLLIMPORT bool MyDatabaseHasLoginEventTriggers;
 
+/* GUCs for shared memory segment setup; defined in port/sysv_shmem.c */
+extern PGDLLIMPORT bool shmem_populate;
+extern PGDLLIMPORT bool shmem_interleave;
+
 /*
  * Date/Time Configuration
  *
-- 
2.54.0

