From 8debe072c3123ddbe065d677cf63c091414c03f9 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Sat, 28 Mar 2026 00:32:29 +0200
Subject: [PATCH v1 6/8] Allocate all parts of shmem hash table from a single
 contiguous area

Previously, the shared header (HASHHDR) and the directory were
allocated by the caller, and passed to hash_create(), while the actual
elements were allocated separately with ShmemAlloc(). After this
commit, all the memory needed by the header, the directory, and all
the elements is allocated using a single ShmemInitStruct() call, and
the different parts are carved out of that allocation. This way the
ShmemIndex entries (and thus pg_shmem_allocations) reflect the size of
the whole hash table.

Commit f5930f9a98 attempted this earlier, but it had to be reverted.
The new strategy is to let dynahash perform all the allocations with
the alloc function, but have the alloc function carve out the parts
from the one larger allocation. The shared header and the directory
are now also allocated with alloc calls, instead of passing the area
for those directly from the caller.
---
 src/backend/storage/ipc/shmem.c   | 71 +++++++++++++++++++++-------
 src/backend/utils/hash/dynahash.c | 78 +++++++++++++------------------
 src/include/utils/hsearch.h       |  5 +-
 src/tools/pgindent/typedefs.list  |  1 +
 4 files changed, 91 insertions(+), 64 deletions(-)

diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 47065bb3603..c8171125871 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -112,6 +112,16 @@ static bool firstNumaTouch = true;
 
 Datum		pg_numa_available(PG_FUNCTION_ARGS);
 
+/*
+ * A very simple allocator used to carve out different parts of a hash table,
+ * from a previously allocated contiguous shared memory area.
+ */
+typedef struct shmem_hash_allocator
+{
+	char	   *next;			/* start of free space in the area */
+	char	   *end;			/* end of the shmem area */
+} shmem_hash_allocator;
+
 /*
  *	InitShmemAllocator() --- set up basic pointers to shared memory.
  *
@@ -126,7 +136,7 @@ InitShmemAllocator(PGShmemHeader *seghdr)
 	Size		offset;
 	HASHCTL		info;
 	int			hash_flags;
-	size_t		size;
+	shmem_hash_allocator allocator;
 
 #ifndef EXEC_BACKEND
 	Assert(!IsUnderPostmaster);
@@ -182,15 +192,27 @@ InitShmemAllocator(PGShmemHeader *seghdr)
 	info.dsize = info.max_dsize = hash_select_dirsize(SHMEM_INDEX_SIZE);
 	info.alloc = ShmemHashAlloc;
 	info.alloc_arg = NULL;
-	hash_flags = HASH_ELEM | HASH_STRINGS | HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE | HASH_FIXED_SIZE;
+	hash_flags = HASH_ELEM | HASH_STRINGS
+		| HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE | HASH_FIXED_SIZE;
+
 	if (!IsUnderPostmaster)
 	{
-		size = hash_get_shared_size(&info, hash_flags);
-		ShmemAllocator->index = (HASHHDR *) ShmemAlloc(size);
+		size_t		size = hash_estimate_size(SHMEM_INDEX_SIZE, info.entrysize);
+		char	   *location = ShmemAlloc(size);
+
+		allocator.next = location;
+		allocator.end = location + size;
+		info.alloc_arg = &allocator;
+
+		info.hctl = NULL;
+		hash_flags |= HASH_ALLOC | HASH_FIXED_SIZE;
+		ShmemAllocator->index = (HASHHDR *) location;
 	}
 	else
+	{
+		info.hctl = ShmemAllocator->index;
 		hash_flags |= HASH_ATTACH;
-	info.hctl = ShmemAllocator->index;
+	}
 	ShmemIndex = hash_create("ShmemIndex", SHMEM_INDEX_SIZE, &info, hash_flags);
 	Assert(ShmemIndex != NULL);
 }
@@ -233,9 +255,17 @@ ShmemAllocNoError(Size size)
 static void *
 ShmemHashAlloc(Size size, void *alloc_arg)
 {
-	Size		allocated_size;
+	shmem_hash_allocator *allocator = (shmem_hash_allocator *) alloc_arg;
+	void	   *result;
 
-	return ShmemAllocRaw(size, &allocated_size);
+	size = MAXALIGN(size);
+
+	if (allocator->end - allocator->next < size)
+		return NULL;
+	result = allocator->next;
+	allocator->next += size;
+
+	return result;
 }
 
 /*
@@ -321,12 +351,14 @@ ShmemAddrIsValid(const void *addr)
  */
 HTAB *
 ShmemInitHash(const char *name,		/* table string name for shmem index */
-			  int64 nelems,	/* size of the table */
+			  int64 nelems,		/* size of the table */
 			  HASHCTL *infoP,	/* info about key and bucket size */
 			  int hash_flags)	/* info about infoP */
 {
 	bool		found;
+	size_t		size;
 	void	   *location;
+	shmem_hash_allocator allocator;
 
 	/*
 	 * Hash tables allocated in shared memory have a fixed directory; it can't
@@ -341,20 +373,27 @@ ShmemInitHash(const char *name,		/* table string name for shmem index */
 	infoP->alloc_arg = NULL;
 	hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE | HASH_FIXED_SIZE;
 
-	/* look it up in the shmem index */
-	location = ShmemInitStruct(name,
-							   hash_get_shared_size(infoP, hash_flags),
-							   &found);
+	size = hash_estimate_size(nelems, infoP->entrysize);
+
+	/* look it up in the shmem index or allocate */
+	location = ShmemInitStruct(name, size, &found);
 
 	/*
 	 * if it already exists, attach to it rather than allocate and initialize
 	 * new space
 	 */
-	if (found)
+	if (!found)
+	{
+		allocator.next = (char *) location;
+		allocator.end = (char *) location + size;
+		infoP->alloc_arg = &allocator;
+	}
+	else
+	{
+		/* Pass location of hashtable header to hash_create */
+		infoP->hctl = (HASHHDR *) location;
 		hash_flags |= HASH_ATTACH;
-
-	/* Pass location of hashtable header to hash_create */
-	infoP->hctl = (HASHHDR *) location;
+	}
 
 	return hash_create(name, nelems, infoP, hash_flags);
 }
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index d3dd16a4300..1173304ef0f 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -195,6 +195,9 @@ struct HASHHDR
 	int			nelem_alloc;	/* number of entries to allocate at once */
 	bool		isfixed;		/* if true, don't enlarge */
 
+	/* Current directory.  In shared tables, this doesn't change */
+	HASHSEGMENT *dir;
+
 #ifdef HASH_STATISTICS
 
 	/*
@@ -224,7 +227,7 @@ struct HTAB
 	HashCompareFunc match;		/* key comparison function */
 	HashCopyFunc keycopy;		/* key copying function */
 	HashAllocFunc alloc;		/* memory allocator */
-	void	   *alloc_arg;		/* opaque argument to pass to allocator function */
+	void	   *alloc_arg;		/* opaque argument to pass to alloc function */
 	MemoryContext hcxt;			/* memory context if default allocator used */
 	char	   *tabname;		/* table name (for error messages) */
 	bool		isshared;		/* true if table is in shared memory */
@@ -294,7 +297,7 @@ DynaHashAlloc(Size size, void *alloc_arg)
 	MemoryContext cxt = (MemoryContext) alloc_arg;
 
 	Assert(MemoryContextIsValid(cxt));
-	return MemoryContextAllocExtended(cxt, size,  MCXT_ALLOC_NO_OOM);
+	return MemoryContextAllocExtended(cxt, size, MCXT_ALLOC_NO_OOM);
 }
 
 
@@ -374,6 +377,8 @@ hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
 	 * hash_destroy very simple.  The memory context is made a child of either
 	 * a context specified by the caller, or TopMemoryContext if nothing is
 	 * specified.
+	 *
+	 * Note that HASH_DIRSIZE and HASH_ALLOC had better be set as well.
 	 */
 	if (flags & HASH_SHARED_MEM)
 	{
@@ -486,21 +491,18 @@ hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
 
 	if (flags & HASH_SHARED_MEM)
 	{
-		/*
-		 * ctl structure and directory are preallocated for shared memory
-		 * tables.  Note that HASH_DIRSIZE and HASH_ALLOC had better be set as
-		 * well.
-		 */
-		hashp->hctl = info->hctl;
-		hashp->dir = (HASHSEGMENT *) (((char *) info->hctl) + sizeof(HASHHDR));
 		hashp->hcxt = NULL;
 		hashp->isshared = true;
 
 		/* hash table already exists, we're just attaching to it */
 		if (flags & HASH_ATTACH)
 		{
+			hctl = info->hctl;
+
+			hashp->hctl = hctl;
+			hashp->dir = hctl->dir;
+
 			/* make local copies of some heavily-used values */
-			hctl = hashp->hctl;
 			hashp->keysize = hctl->keysize;
 
 			return hashp;
@@ -515,14 +517,20 @@ hash_create(const char *tabname, int64 nelem, const HASHCTL *info, int flags)
 		hashp->isshared = false;
 	}
 
+	/*
+	 * Allocate the header structure.
+	 *
+	 * XXX: In case of a shared memory hash table, other processes need the
+	 * pointer to the header to re-find the hash table.  There is currently no
+	 * explicit way to pass it back from here; the caller relies on the fact
+	 * that this is the first allocation made with the alloc function.  That's
+	 * a little ugly, but works for now.
+	 */
+	hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR), hashp->alloc_arg);
 	if (!hashp->hctl)
-	{
-		hashp->hctl = (HASHHDR *) hashp->alloc(sizeof(HASHHDR), hashp->alloc_arg);
-		if (!hashp->hctl)
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
-	}
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
 
 	hashp->frozen = false;
 
@@ -725,25 +733,17 @@ init_htab(HTAB *hashp, int64 nelem)
 	nsegs = next_pow2_int(nsegs);
 
 	/*
-	 * Make sure directory is big enough. If pre-allocated directory is too
-	 * small, choke (caller screwed up).
+	 * Make sure directory is big enough.
 	 */
 	if (nsegs > hctl->dsize)
-	{
-		if (!(hashp->dir))
-			hctl->dsize = nsegs;
-		else
-			return false;
-	}
+		hctl->dsize = nsegs;
 
 	/* Allocate a directory */
-	if (!(hashp->dir))
-	{
-		hashp->dir = (HASHSEGMENT *)
-			hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT), hashp->alloc_arg);
-		if (!hashp->dir)
-			return false;
-	}
+	hctl->dir = (HASHSEGMENT *)
+		hashp->alloc(hctl->dsize * sizeof(HASHSEGMENT), hashp->alloc_arg);
+	if (!hctl->dir)
+		return false;
+	hashp->dir = hctl->dir;
 
 	/* Allocate initial segments */
 	for (segp = hashp->dir; hctl->nsegs < nsegs; hctl->nsegs++, segp++)
@@ -832,19 +832,6 @@ hash_select_dirsize(int64 num_entries)
 	return nDirEntries;
 }
 
-/*
- * Compute the required initial memory allocation for a shared-memory
- * hashtable with the given parameters.  We need space for the HASHHDR
- * and for the (non expansible) directory.
- */
-Size
-hash_get_shared_size(HASHCTL *info, int flags)
-{
-	Assert(flags & HASH_DIRSIZE);
-	Assert(info->dsize == info->max_dsize);
-	return sizeof(HASHHDR) + info->dsize * sizeof(HASHSEGMENT);
-}
-
 
 /********************** DESTROY ROUTINES ************************/
 
@@ -1648,6 +1635,7 @@ dir_realloc(HTAB *hashp)
 	{
 		memcpy(p, old_p, old_dirsize);
 		MemSet(((char *) p) + old_dirsize, 0, new_dirsize - old_dirsize);
+		hashp->hctl->dir = p;
 		hashp->dir = p;
 		hashp->hctl->dsize = new_dsize;
 
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 03bc1c171cd..b60ae20acc7 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -80,10 +80,10 @@ typedef struct HASHCTL
 	HashCopyFunc keycopy;		/* key copying function */
 	/* Used if HASH_ALLOC flag is set: */
 	HashAllocFunc alloc;		/* memory allocator */
-	void	   *alloc_arg;		/* opaque argument to pass to allocator function */
+	void	   *alloc_arg;		/* opaque argument to pass to alloc */
 	/* Used if HASH_CONTEXT flag is set: */
 	MemoryContext hcxt;			/* memory context to use for allocations */
-	/* Used if HASH_SHARED_MEM flag is set: */
+	/* Used if HASH_ATTACH flag is set: */
 	HASHHDR    *hctl;			/* location of header in shared mem */
 } HASHCTL;
 
@@ -150,7 +150,6 @@ extern void hash_seq_term(HASH_SEQ_STATUS *status);
 extern void hash_freeze(HTAB *hashp);
 extern Size hash_estimate_size(int64 num_entries, Size entrysize);
 extern int64 hash_select_dirsize(int64 num_entries);
-extern Size hash_get_shared_size(HASHCTL *info, int flags);
 extern void AtEOXact_HashTables(bool isCommit);
 extern void AtEOSubXact_HashTables(bool isCommit, int nestDepth);
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 712d84128ca..5f6502be030 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -4209,6 +4209,7 @@ shm_mq_result
 shm_toc
 shm_toc_entry
 shm_toc_estimator
+shmem_hash_allocator
 shmem_request_hook_type
 shmem_startup_hook_type
 sig_atomic_t
-- 
2.47.3

