From 4672b0b53f68177ca76b9400e7f9ca4172a08ddb Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Tue, 2 Jun 2026 23:27:13 +0200
Subject: [PATCH v20260605 3/6] NUMA: shared buffers partitioning

Ensure shared buffers are allocated from all NUMA nodes, in a balanced
way, instead of just using the node where Postgres initially starts, or
where the kernel decides to migrate the page, etc.

In cases like pre-warming a database from a single worker (e.g. using
pg_prewarm), we may end up with severely unbalanced memory distribution
(with most memory located on a single NUMA node). Unbalanced allocation
may put a lot of pressure on the memory system on a small number of NUMA
nodes, limiting the bandwidth etc.

With zone_reclaim, the kernel would eventually move some of the memory
to other nodes, but that tends to take a long time and is unpredictable.

This change forces even distribution of shared buffers on all NUMA
nodes, improving predictability, reducing the time needed for warmup
during benchmarking, etc. It's also less dependent on what the CPU
scheduler decides to do (which cores get used for the warmup.)

The effect is similar to

     numactl --interleave=all

in that the buffers are distributed on the NUMA nodes evenly, but
there's also a number of important differences.

Firstly, it's applied only to shared buffers (and buffer descriptors),
not to the whole shared memory segment. It's possible to enable memory
interleaving using the shmem_interleave GUC, introduced in an earlier
patch in this series.

NUMA works at the granularity of a memory page, which is typically
either 4K or 2MB (hugepage), but other sizes are possible. For systems
where NUMA matters, we expect large amounts of memory (hundreds of
gigabytes) and hugepages enabled. But not necessarily.

The partitioning scheme is best-effort with respect to memory page size.
The shared buffers may not "align" with memory pages (i.e. a partition
may not end at a memory page boundary), in which case we simply locate
just the section of the partition with complete memory pages. This means
there may be ~one memory page between partitions that is not explicitly
placed on any node. Considering the expected amounts of memory, this is
negligible, and the alternative would be a significant amount of
complexity to align the pages and enforce "allowed" partition sizes.

Buffer descriptors are affected by this too, and the effect may be more
significant, simply because the descriptors are much smaller (~64B). So
the array is smaller, and a single 2MB memory page is worth ~32K buffer
descriptors. But with large systems it's still negligible.

The "buffer partitions" may not be 1:1 with NUMA nodes. We want to allow
clock-sweep partitioning even on non-NUMA systems, or when running only
on a small number of NUMA nodes. There's a minimal number of partitions
(default: 4), and a node may get multiple partitions. Nodes always get
the same number of partitions (e.g. with 3 NUMA nodes there will be 6
partitions in total, as each node gets 2 partitions).

The feature is enabled by the shared_buffers_numa GUC (default: off).
---
 .../pg_buffercache--1.7--1.8.sql              |   1 +
 contrib/pg_buffercache/pg_buffercache_pages.c |  24 +-
 src/backend/storage/buffer/buf_init.c         | 244 ++++++++++++++++--
 src/backend/storage/buffer/freelist.c         |   9 +
 src/backend/utils/misc/guc_parameters.dat     |   6 +
 src/backend/utils/misc/postgresql.conf.sample |   1 +
 src/include/port/pg_numa.h                    |   7 +
 src/include/storage/buf_internals.h           |  16 +-
 src/include/storage/bufmgr.h                  |   8 +
 src/port/pg_numa.c                            | 112 ++++++++
 10 files changed, 392 insertions(+), 36 deletions(-)

diff --git a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
index d62b8339bfc..a6e49fd1652 100644
--- a/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
+++ b/contrib/pg_buffercache/pg_buffercache--1.7--1.8.sql
@@ -11,6 +11,7 @@ LANGUAGE C PARALLEL SAFE;
 CREATE VIEW pg_buffercache_partitions AS
 	SELECT P.* FROM pg_buffercache_partitions() AS P
 	(partition integer,			-- partition index
+	 numa_node integer,			-- NUMA node of the partition
 	 num_buffers integer,		-- number of buffers in the partition
 	 first_buffer integer,		-- first buffer of partition
 	 last_buffer integer);		-- last buffer of partition
diff --git a/contrib/pg_buffercache/pg_buffercache_pages.c b/contrib/pg_buffercache/pg_buffercache_pages.c
index d678cb045ba..e3efeeda675 100644
--- a/contrib/pg_buffercache/pg_buffercache_pages.c
+++ b/contrib/pg_buffercache/pg_buffercache_pages.c
@@ -31,7 +31,7 @@
 #define NUM_BUFFERCACHE_MARK_DIRTY_ALL_ELEM 3
 
 #define NUM_BUFFERCACHE_OS_PAGES_ELEM	3
-#define NUM_BUFFERCACHE_PARTITIONS_ELEM	4
+#define NUM_BUFFERCACHE_PARTITIONS_ELEM	5
 
 PG_MODULE_MAGIC_EXT(
 					.name = "pg_buffercache",
@@ -904,11 +904,13 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS)
 		tupledesc = CreateTemplateTupleDesc(expected_tupledesc->natts);
 		TupleDescInitEntry(tupledesc, (AttrNumber) 1, "partition",
 						   INT4OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "num_buffers",
+		TupleDescInitEntry(tupledesc, (AttrNumber) 2, "numa_node",
 						   INT4OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "first_buffer",
+		TupleDescInitEntry(tupledesc, (AttrNumber) 3, "num_buffers",
 						   INT4OID, -1, 0);
-		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "last_buffer",
+		TupleDescInitEntry(tupledesc, (AttrNumber) 4, "first_buffer",
+						   INT4OID, -1, 0);
+		TupleDescInitEntry(tupledesc, (AttrNumber) 5, "last_buffer",
 						   INT4OID, -1, 0);
 
 		funcctx->user_fctx = BlessTupleDesc(tupledesc);
@@ -926,28 +928,32 @@ pg_buffercache_partitions(PG_FUNCTION_ARGS)
 	{
 		uint32		i = funcctx->call_cntr;
 
-		int			num_buffers,
+		int			numa_node,
+					num_buffers,
 					first_buffer,
 					last_buffer;
 
 		Datum		values[NUM_BUFFERCACHE_PARTITIONS_ELEM];
 		bool		nulls[NUM_BUFFERCACHE_PARTITIONS_ELEM];
 
-		BufferPartitionGet(i, &num_buffers,
+		BufferPartitionGet(i, &numa_node, &num_buffers,
 						   &first_buffer, &last_buffer);
 
 		values[0] = Int32GetDatum(i);
 		nulls[0] = false;
 
-		values[1] = Int32GetDatum(num_buffers);
+		values[1] = Int32GetDatum(numa_node);
 		nulls[1] = false;
 
-		values[2] = Int32GetDatum(first_buffer);
+		values[2] = Int32GetDatum(num_buffers);
 		nulls[2] = false;
 
-		values[3] = Int32GetDatum(last_buffer);
+		values[3] = Int32GetDatum(first_buffer);
 		nulls[3] = false;
 
+		values[4] = Int32GetDatum(last_buffer);
+		nulls[4] = false;
+
 		/* Build and return the tuple. */
 		tuple = heap_form_tuple((TupleDesc) funcctx->user_fctx, values, nulls);
 		result = HeapTupleGetDatum(tuple);
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index e593b02e0ca..1f93a31d451 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -14,12 +14,20 @@
  */
 #include "postgres.h"
 
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include "port/pg_numa.h"
 #include "storage/aio.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "storage/proclist.h"
 #include "storage/shmem.h"
 #include "storage/subsystems.h"
+#include "utils/guc_hooks.h"
+#include "utils/varlena.h"
 
 BufferDescPadded *BufferDescriptors;
 char	   *BufferBlocks;
@@ -71,9 +79,12 @@ const ShmemCallbacks BufferManagerShmemCallbacks = {
  *		multiple times. Check the PrivateRefCount infrastructure in bufmgr.c.
  */
 
-/* number of buffer partitions */
-#define NUM_CLOCK_SWEEP_PARTITIONS	4
+/*
+ * Minimum number of buffer partitions, no matter the number of NUMA nodes.
+ */
+#define MIN_BUFFER_PARTITIONS	4
 
+bool	shared_buffers_numa = false;
 
 /*
  * Register shared memory area for the buffer pool.
@@ -81,6 +92,10 @@ const ShmemCallbacks BufferManagerShmemCallbacks = {
 static void
 BufferManagerShmemRequest(void *arg)
 {
+	int		nparts;
+
+	BufferPartitionsCalculate(NULL, &nparts, NULL);
+
 	ShmemRequestStruct(.name = "Buffer Descriptors",
 					   .size = NBuffers * sizeof(BufferDescPadded),
 	/* Align descriptors to a cacheline boundary. */
@@ -103,7 +118,7 @@ BufferManagerShmemRequest(void *arg)
 		);
 
 	ShmemRequestStruct(.name = "Buffer Partition Registry",
-					   .size = NUM_CLOCK_SWEEP_PARTITIONS * sizeof(BufferPartition),
+					   .size = nparts * sizeof(BufferPartition),
 	/* Align descriptors to a cacheline boundary. */
 					   .alignment = PG_CACHE_LINE_SIZE,
 					   .ptr = (void **) &BufferPartitionsRegistry,
@@ -134,6 +149,10 @@ BufferManagerShmemInit(void *arg)
 	/*
 	 * Initialize the buffer partition registry first, before other parts
 	 * have a chance to touch the memory.
+	 *
+	 * Also moves memory to different NUMA nodes (if enabled by a GUC).
+	 * Do this before the loop that initializes buffer headers etc. which
+	 * may fault some of the memory pages etc.
 	 */
 	BufferPartitionsInit();
 
@@ -231,35 +250,203 @@ BufferPartitionsInit(void)
 {
 	int			buffer = 0;
 
-	/* number of buffers per partition (make sure to not overflow) */
-	int			part_buffers = NBuffers / NUM_CLOCK_SWEEP_PARTITIONS;
-	int			remaining_buffers = NBuffers % NUM_CLOCK_SWEEP_PARTITIONS;
+	int			nnodes,
+				npartitions,
+				npartitions_per_node;
 
-	BufferPartitionsRegistry->npartitions = NUM_CLOCK_SWEEP_PARTITIONS;
+	int			buffers_per_partition,
+				buffers_remaining;
 
-	for (int n = 0; n < BufferPartitionsRegistry->npartitions; n++)
-	{
-		BufferPartition *part = &BufferPartitionsRegistry->partitions[n];
+	/* calculate partitioning parameters */
+	BufferPartitionsCalculate(&nnodes, &npartitions, &npartitions_per_node);
+
+	/* paranoia */
+	Assert(nnodes > 0);
+	Assert(npartitions >= MIN_BUFFER_PARTITIONS);
+	Assert((npartitions % nnodes) == 0);
+	Assert((npartitions_per_node * nnodes) == npartitions);
 
-		int			num_buffers = part_buffers;
-		if (n < remaining_buffers)
-			num_buffers += 1;
+	BufferPartitionsRegistry->nnodes = nnodes;
+	BufferPartitionsRegistry->npartitions = npartitions;
+	BufferPartitionsRegistry->npartitions_per_node = npartitions_per_node;
 
-		remaining_buffers -= num_buffers;
+	/* regular partition size; the first buffers_remaining get one extra */
+	buffers_per_partition = (NBuffers / npartitions);
+	buffers_remaining = (NBuffers % npartitions);
 
-		Assert((num_buffers > 0) && (num_buffers <= part_buffers));
-		Assert((buffer >= 0) && (buffer < NBuffers));
+	/* should have all the buffers */
+	Assert((buffers_per_partition * npartitions + buffers_remaining) == NBuffers);
 
-		part->num_buffers = num_buffers;
-		part->first_buffer = buffer;
-		part->last_buffer = buffer + (num_buffers - 1);
+	/*
+	 * Now walk the partitions, and set the buffer range. Optionally, place
+	 * the partitions on a given node (for all partitions at once).
+	 */
+	for (int n = 0; n < nnodes; n++)
+	{
+		for (int p = 0; p < npartitions_per_node; p++)
+		{
+			int			idx = (n * npartitions_per_node) + p;
+			BufferPartition *part = &BufferPartitionsRegistry->partitions[idx];
+
+			/*
+			 * Assign to the NUMA node, but only with shared_buffers_numa=on.
+			 *
+			 * XXX we should get an actual node ID from the mask, in case the
+			 * task is restricted to only some nodes.
+			 */
+			part->numa_node = (shared_buffers_numa) ? n : -1;
+
+			/* The first couple partitions may get an extra buffer. */
+			part->num_buffers = buffers_per_partition;
+			if (idx < buffers_remaining)
+				part->num_buffers += 1;
+
+			/* remember the buffer range */
+			part->first_buffer = buffer;
+			part->last_buffer = buffer + (part->num_buffers - 1);
+
+			/* remember start of the next partition */
+			buffer += part->num_buffers;
+		}
 
-		buffer += num_buffers;
+#ifdef USE_LIBNUMA
+		/*
+		 * Now try to locate buffers and buffer descriptors to the node (all
+		 * partitions for the node at once).
+		 */
+		if (shared_buffers_numa)
+		{
+			Size	numa_page_size = pg_numa_page_size();
+
+			int		part_first,
+					part_last,
+					buff_first,
+					buff_last;
+
+			char   *startptr,
+				   *endptr;
+
+			/* first/last partition for this node */
+			part_first = (n * npartitions_per_node);
+			part_last = part_first + (npartitions_per_node - 1);
+
+			/* buffers (blocks) */
+
+			/* first/last buffer */
+			buff_first = BufferPartitionsRegistry->partitions[part_first].first_buffer;
+			buff_last = BufferPartitionsRegistry->partitions[part_last].last_buffer;
+
+			/* beginning of the first block, end of last block */
+			startptr = BufferBlocks + ((Size) buff_first * BLCKSZ);
+			endptr = BufferBlocks + ((Size) (buff_last + 1) * BLCKSZ);
+
+			/* print some warnings when the partitions are not aligned */
+			if ((startptr != (char *) TYPEALIGN(numa_page_size, startptr)) ||
+				(endptr != (char *) TYPEALIGN_DOWN(numa_page_size, endptr)))
+			{
+				elog(WARNING, "buffers for node %d not well aligned [%p,%p] aligned [%p,%p]",
+					 n, startptr, endptr,
+					 (char *) TYPEALIGN(numa_page_size, startptr),
+					 (char *) TYPEALIGN_DOWN(numa_page_size, endptr));
+			}
+
+			/* best effort: align the pointers, so that the mbind() works */
+			startptr = (char *) TYPEALIGN(numa_page_size, startptr);
+			endptr = (char *) TYPEALIGN_DOWN(numa_page_size, endptr);
+
+			/* XXX or should we use pg_numa_move_to_node? */
+			pg_numa_bind_to_node(startptr, endptr, n);
+
+			/* buffer descriptors */
+
+			/* beginning of the first descriptor, end of last descriptor */
+			startptr = (char *) &BufferDescriptors[buff_first];
+			endptr = (char *) &BufferDescriptors[buff_last + 1];
+
+			/* print some warnings when the partitions are not aligned */
+			if ((startptr != (char *) TYPEALIGN(numa_page_size, startptr)) ||
+				(endptr != (char *) TYPEALIGN_DOWN(numa_page_size, endptr)))
+			{
+				elog(WARNING, "buffers descriptors for node %d not well aligned [%p,%p] aligned [%p,%p]",
+					 n, startptr, endptr,
+					 (char *) TYPEALIGN(numa_page_size, startptr),
+					 (char *) TYPEALIGN_DOWN(numa_page_size, endptr));
+			}
+
+			/* best effort: align the pointers, so that the mbind() works */
+			startptr = (char *) TYPEALIGN(numa_page_size, startptr);
+			endptr = (char *) TYPEALIGN_DOWN(numa_page_size, endptr);
+
+			/* XXX or should we use pg_numa_move_to_node? */
+			pg_numa_bind_to_node(startptr, endptr, n);
+		}
+#endif
 	}
 
 	AssertCheckBufferPartitions();
 }
 
+/*
+ * BufferPartitionsCalculate
+ *		Pick number of buffer partitions for the number of nodes and
+ *		MIN_BUFFER_PARTITIONS.
+ *
+ * Picks the smallest number of partitions that is at least
+ * MIN_BUFFER_PARTITIONS and gives all nodes the same number of partitions.
+ *
+ * This is best-effort with respect to size of the partitions. It's possible
+ * the partitions are not a perfect multiple of page size, in which case
+ * we set location only for the part where that is possible. The buffers on
+ * the "boundary" may get located up on arbitrary nodes.
+ *
+ * The extra complexity of figuring out the right "partition size" is not
+ * worth it, and it can lead to some partitions being much smaller. This way
+ * we end up with partitions of almost exactly the same size (one BLCKSZ is
+ * the largest difference).
+ *
+ * We expect shared buffers to be much larger than page size (at least on
+ * system where NUMA is a relevant feature), so the number of "not located"
+ * buffers should be a negligible fraction. This only affects pages between
+ * partitions for different nodes, so (nodes-1) pages. This is certainly
+ * fine with 2MB huge pages, but even with 1GB pages it should be OK (as
+ * such systems should have humongous amounts of memory).
+ *
+ * It also means we don't need to worry about memory page size before knowing
+ * if huge pages got used (which we only learn during allocation).
+ */
+void
+BufferPartitionsCalculate(int *num_nodes, int *num_partitions,
+						  int *num_partitions_per_node)
+{
+	int		nnodes,
+			nparts,
+			nparts_per_node;
+
+#ifdef USE_LIBNUMA
+	nnodes = numa_num_configured_nodes();
+	nparts_per_node = 1;	/* at least one partition per node */
+
+	while ((nparts_per_node * nnodes) < MIN_BUFFER_PARTITIONS)
+		nparts_per_node++;
+
+	nparts = (nnodes * nparts_per_node);
+#else
+	/* without NUMA, assume there's just one node */
+	nnodes = 1;
+	nparts = MIN_BUFFER_PARTITIONS;
+	nparts_per_node = MIN_BUFFER_PARTITIONS;
+#endif
+
+	if (num_nodes)
+		*num_nodes = nnodes;
+
+	if (num_partitions)
+		*num_partitions = nparts;
+
+	if (num_partitions_per_node)
+		*num_partitions_per_node = nparts_per_node;
+}
+
 /*
  * BufferPartitionCount
  *		Returns the number of partitions created.
@@ -277,13 +464,14 @@ BufferPartitionCount(void)
  * The returned information is first/last buffer, number of buffers.
  */
 void
-BufferPartitionGet(int idx, int *num_buffers,
+BufferPartitionGet(int idx, int *node, int *num_buffers,
 				   int *first_buffer, int *last_buffer)
 {
 	if ((idx >= 0) && (idx < BufferPartitionsRegistry->npartitions))
 	{
 		BufferPartition *part = &BufferPartitionsRegistry->partitions[idx];
 
+		*node = part->numa_node;
 		*num_buffers = part->num_buffers;
 		*first_buffer = part->first_buffer;
 		*last_buffer = part->last_buffer;
@@ -293,3 +481,17 @@ BufferPartitionGet(int idx, int *num_buffers,
 
 	elog(ERROR, "invalid partition index");
 }
+
+void
+BufferPartitionsParams(int *num_nodes, int *num_partitions,
+					   int *num_partitions_per_node)
+{
+	if (num_nodes)
+		*num_nodes = BufferPartitionsRegistry->nnodes;
+
+	if (num_partitions)
+		*num_partitions = BufferPartitionsRegistry->npartitions;
+
+	if (num_partitions_per_node)
+		*num_partitions_per_node = BufferPartitionsRegistry->npartitions_per_node;
+}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index fdb5bad7910..53ef5239e8d 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -15,6 +15,15 @@
  */
 #include "postgres.h"
 
+#ifdef USE_LIBNUMA
+#include <sched.h>
+#endif
+
+#ifdef USE_LIBNUMA
+#include <numa.h>
+#include <numaif.h>
+#endif
+
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "storage/buf_internals.h"
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index f15e74198c5..2e71c04282c 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -2724,6 +2724,12 @@
   max => 'INT_MAX / 2',
 },
 
+{ name => 'shared_buffers_numa', type => 'bool', context => 'PGC_POSTMASTER', group => 'RESOURCES_MEM',
+  short_desc => 'Locate partitions of shared buffers (and descriptors) to NUMA nodes.',
+  variable => 'shared_buffers_numa',
+  boot_val => 'false',
+},
+
 { name => 'shared_memory_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS',
   short_desc => 'Shows the size of the server\'s main shared memory area (rounded up to the nearest MB).',
   flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE | GUC_UNIT_MB | GUC_RUNTIME_COMPUTED',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ac38cddaaf9..c0f79c779cc 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -142,6 +142,7 @@
 #temp_buffers = 8MB                     # min 800kB
 #max_prepared_transactions = 0          # zero disables the feature
                                         # (change requires restart)
+#shared_buffers_numa = off              # NUMA-aware partitioning
 # Caution: it is not advisable to set max_prepared_transactions nonzero unless
 # you actively intend to use prepared transactions.
 #work_mem = 4MB                         # min 64kB
diff --git a/src/include/port/pg_numa.h b/src/include/port/pg_numa.h
index 1b668fe1d91..8fe4d4ab7e3 100644
--- a/src/include/port/pg_numa.h
+++ b/src/include/port/pg_numa.h
@@ -17,6 +17,13 @@
 extern PGDLLIMPORT int pg_numa_init(void);
 extern PGDLLIMPORT int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status);
 extern PGDLLIMPORT int pg_numa_get_max_node(void);
+extern PGDLLIMPORT Size pg_numa_page_size(void);
+extern PGDLLIMPORT void pg_numa_move_to_node(char *startptr, char *endptr, int node);
+extern PGDLLIMPORT int pg_numa_bind_to_node(char *startptr, char *endptr, int node);
+
+extern PGDLLIMPORT int numa_flags;
+
+#define		NUMA_BUFFERS		0x01
 
 #ifdef USE_LIBNUMA
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index e5a887b9969..e944cee2e91 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -365,10 +365,10 @@ typedef struct BufferDesc
  * line sized.
  *
  * XXX: As this is primarily matters in highly concurrent workloads which
- * probably all are 64bit these days, and the space wastage would be a bit
- * more noticeable on 32bit systems, we don't force the stride to be cache
- * line sized on those. If somebody does actual performance testing, we can
- * reevaluate.
+ * probably all are 64bit these days. We force the stride to be cache line
+ * sized even on 32bit systems, where the space wastage is a bit more
+ * noticeable, to allow partitioning of shared buffers (which requires the
+ * memory page size to be a multiple of the buffer descriptor size).
  *
  * Note that local buffer descriptors aren't forced to be aligned - as there's
  * no concurrent access to those it's unlikely to be beneficial.
@@ -378,7 +378,7 @@ typedef struct BufferDesc
  * platform with either 32 or 128 byte line sizes, it's good to align to
  * boundaries and avoid false sharing.
  */
-#define BUFFERDESC_PAD_TO_SIZE	(SIZEOF_VOID_P == 8 ? 64 : 1)
+#define BUFFERDESC_PAD_TO_SIZE	64
 
 typedef union BufferDescPadded
 {
@@ -416,8 +416,12 @@ extern PGDLLIMPORT ConditionVariableMinimallyPadded *BufferIOCVArray;
 extern PGDLLIMPORT WritebackContext BackendWritebackContext;
 
 extern int	BufferPartitionCount(void);
-extern void BufferPartitionGet(int idx, int *num_buffers,
+extern void BufferPartitionGet(int idx, int *node, int *num_buffers,
 							   int *first_buffer, int *last_buffer);
+extern void BufferPartitionsCalculate(int *num_nodes, int *num_partitions,
+									  int *num_partitions_per_node);
+extern void BufferPartitionsParams(int *num_nodes, int *num_partitions,
+								   int *num_partitions_per_node);
 
 /* in localbuf.c */
 extern PGDLLIMPORT BufferDesc *LocalBufferDescriptors;
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 79a3f44747a..1cf09e8fb7c 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -158,10 +158,12 @@ typedef struct ReadBuffersOperation ReadBuffersOperation;
 /*
  * information about one partition of shared buffers
  *
+ * numa_node specifies node for this partition (-1 means allocated on any node)
  * first/last buffer - the values are inclusive
  */
 typedef struct BufferPartition
 {
+	int			numa_node;		/* NUMA node (-1 no node) */
 	int			num_buffers;	/* number of buffers */
 	int			first_buffer;	/* first buffer of partition */
 	int			last_buffer;	/* last buffer of partition */
@@ -170,7 +172,9 @@ typedef struct BufferPartition
 /* an array of information about all partitions */
 typedef struct BufferPartitions
 {
+	int			nnodes;			/* number of NUMA nodes */
 	int			npartitions;	/* number of partitions */
+	int			npartitions_per_node;	/* for convenience */
 	BufferPartition partitions[FLEXIBLE_ARRAY_MEMBER];
 } BufferPartitions;
 
@@ -206,6 +210,7 @@ extern PGDLLIMPORT const PgAioHandleCallbacks aio_local_buffer_readv_cb;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
+extern PGDLLIMPORT bool shared_buffers_numa;
 
 /* in localbuf.c */
 extern PGDLLIMPORT int NLocBuffer;
@@ -390,6 +395,9 @@ extern void MarkDirtyAllUnpinnedBuffers(int32 *buffers_dirtied,
 										int32 *buffers_already_dirty,
 										int32 *buffers_skipped);
 
+/* in buf_init.c */
+extern int	BufferGetNode(Buffer buffer);
+
 /* in localbuf.c */
 extern void AtProcExit_LocalBuffers(void);
 
diff --git a/src/port/pg_numa.c b/src/port/pg_numa.c
index 8954669273a..11c8a4503da 100644
--- a/src/port/pg_numa.c
+++ b/src/port/pg_numa.c
@@ -18,6 +18,9 @@
 
 #include "miscadmin.h"
 #include "port/pg_numa.h"
+#include "storage/pg_shmem.h"
+
+int	numa_flags;
 
 /*
  * At this point we provide support only for Linux thanks to libnuma, but in
@@ -118,6 +121,94 @@ pg_numa_get_max_node(void)
 	return numa_max_node();
 }
 
+/*
+ * pg_numa_move_to_node
+ *		move memory to different NUMA nodes in larger chunks
+ *
+ * startptr - start of the region (should be aligned to page size)
+ * endptr - end of the region (doesn't need to be aligned)
+ * node - node to move the memory to
+ *
+ * The "startptr" is expected to be a multiple of system memory page size, as
+ * determined by pg_numa_page_size.
+ *
+ * XXX We only expect to do this during startup, when the shared memory is
+ * still being setup.
+ */
+void
+pg_numa_move_to_node(char *startptr, char *endptr, int node)
+{
+	Size		sz = (endptr - startptr);
+
+	Assert((uintptr_t) startptr % pg_numa_page_size() == 0);
+
+	/*
+	 * numa_tonode_memory does not actually cause a page fault, and thus does
+	 * not locate the memory on the node. So it's fast, at least compared to
+	 * pg_numa_query_pages, and does not make startup longer. But it also
+	 * means the expensive part happen later, on the first access.
+	 */
+	numa_tonode_memory(startptr, sz, node);
+}
+
+int
+pg_numa_bind_to_node(char *startptr, char *endptr, int node)
+{
+	int				ret;
+	struct bitmask *nodemask;
+
+	if (node < 0)
+	{
+		errno = EINVAL;
+		return -1;
+	}
+
+	nodemask = numa_allocate_nodemask();
+	if (nodemask == NULL)
+	{
+		errno = ENOMEM;
+		return -1;
+	}
+
+	numa_bitmask_setbit(nodemask, node);
+
+	/*
+	 * MPOL_BIND places the pages strictly on the node, and MPOL_MF_MOVE migrates
+	 * pages already faulted in to that node. The maxnode argument is the size of
+	 * the mask in bits (not a node ID), so pass size + 1 the way libnuma does.
+	 */
+	ret = mbind(startptr, (endptr - startptr),
+				MPOL_BIND, nodemask->maskp, nodemask->size + 1, MPOL_MF_MOVE);
+
+	numa_free_nodemask(nodemask);
+
+	return ret;
+}
+
+Size
+pg_numa_page_size(void)
+{
+	Size		os_page_size;
+	Size		huge_page_size;
+
+#ifdef WIN32
+	SYSTEM_INFO sysinfo;
+
+	GetSystemInfo(&sysinfo);
+	os_page_size = sysinfo.dwPageSize;
+#else
+	os_page_size = sysconf(_SC_PAGESIZE);
+#endif
+
+	/* assume huge pages get used, unless HUGE_PAGES_OFF */
+	if (huge_pages_status != HUGE_PAGES_OFF)
+		GetHugePageSize(&huge_page_size, NULL);
+	else
+		huge_page_size = 0;
+
+	return Max(os_page_size, huge_page_size);
+}
+
 #else
 
 /* Empty wrappers */
@@ -140,4 +231,27 @@ pg_numa_get_max_node(void)
 	return 0;
 }
 
+void
+pg_numa_move_to_node(char *startptr, char *endptr, int node)
+{
+	/* we don't expect to ever get here in builds without libnuma */
+	Assert(false);
+}
+
+int
+pg_numa_bind_to_node(char *startptr, char *endptr, int node)
+{
+	/* we don't expect to ever get here in builds without libnuma */
+	Assert(false);
+	return -1;				/* keep non-assert builds well-defined */
+}
+
+Size
+pg_numa_page_size(void)
+{
+	/* we don't expect to ever get here in builds without libnuma */
+	Assert(false);
+	return 0;				/* keep non-assert builds well-defined */
+}
+
 #endif
-- 
2.54.0

