From 0e3c671082743f2826a7e8a96a19a071f5c8aeb3 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sat, 15 Mar 2025 16:39:45 +0100
Subject: [PATCH v4 7/8] Use anonymous files to back shared memory segments

Allow using anonymous files for shared memory, instead of plain
anonymous memory. Such an anonymous file is created via memfd_create; it
lives in memory, behaves like a regular file, and is semantically
equivalent to anonymous memory allocated via mmap with MAP_ANONYMOUS.

The advantages of using anonymous files are the following:

* We've got a file descriptor, which could be used for regular file
  operations (modification, truncation, you name it).

* The file could be given a name, which improves readability when it
  comes to process maps. Here is what it looks like:

    7f90cde00000-7f90d5126000 rw-s 00000000 00:01 5463 /memfd:main (deleted)
    7f90d5126000-7f914de00000 ---p 00000000 00:00 0
    7f914de00000-7f9175128000 rw-s 00000000 00:01 5466 /memfd:buffers (deleted)
    7f9175128000-7f944de00000 ---p 00000000 00:00 0
    7f944de00000-7f9455528000 rw-s 00000000 00:01 5469 /memfd:descriptors (deleted)
    7f9455528000-7f94cde00000 ---p 00000000 00:00 0
    7f94cde00000-7f94d5228000 rw-s 00000000 00:01 5472 /memfd:iocv (deleted)
    7f94d5228000-7f954de00000 ---p 00000000 00:00 0
    7f954de00000-7f9555266000 rw-s 00000000 00:01 5475 /memfd:checkpoint (deleted)
    7f9555266000-7f958de00000 ---p 00000000 00:00 0
    7f958de00000-7f95954aa000 rw-s 00000000 00:01 5478 /memfd:strategy (deleted)
    7f95954aa000-7f95cde00000 ---p 00000000 00:00 0

* By default, Linux will not add file-backed shared mappings into a core
  dump, making it more convenient to work with them in PostgreSQL: no
  more huge dumps to process.

The downside is that memfd_create is Linux-specific.
--- src/backend/port/sysv_shmem.c | 73 +++++++++++++++++++++++++++++----- src/backend/port/win32_shmem.c | 2 +- src/backend/storage/ipc/ipci.c | 2 +- src/include/portability/mem.h | 2 +- src/include/storage/pg_shmem.h | 3 +- 5 files changed, 68 insertions(+), 14 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index a3437973784..87000a24eea 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -107,6 +107,7 @@ typedef struct AnonymousMapping Pointer shmem; /* Pointer to the start of the mapped memory */ Pointer seg_addr; /* SysV shared memory for the header */ unsigned long seg_id; /* IPC key */ + int segment_fd; /* fd for the backing anon file */ } AnonymousMapping; static AnonymousMapping Mappings[ANON_MAPPINGS]; @@ -127,7 +128,7 @@ static int next_free_segment = 0; * 00400000-00490000 /path/bin/postgres * ... * 012d9000-0133e000 [heap] - * 7f443a800000-7f470a800000 /dev/zero (deleted) + * 7f443a800000-7f470a800000 /memfd:main (deleted) * 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive * 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2 * ... @@ -150,9 +151,9 @@ static int next_free_segment = 0; * The result would look like this: * * 012d9000-0133e000 [heap] - * 7f4426f54000-7f442e010000 /dev/zero (deleted) + * 7f4426f54000-7f442e010000 /memfd:main (deleted) * 7f442e010000-7f443a800000 # reserved empty space - * 7f443a800000-7f444196c000 /dev/zero (deleted) + * 7f443a800000-7f444196c000 /memfd:buffers (deleted) * 7f444196c000-7f470a800000 # reserved empty space * 7f470a800000-7f471831d000 /usr/lib/locale/locale-archive * 7f4718400000-7f4718401000 /usr/lib64/libicudata.so.74.2 @@ -643,13 +644,14 @@ PGSharedMemoryAttach(IpcMemoryId shmId, * *hugepagesize and *mmap_flags are set to 0. 
*/ void -GetHugePageSize(Size *hugepagesize, int *mmap_flags) +GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags) { #ifdef MAP_HUGETLB Size default_hugepagesize = 0; Size hugepagesize_local = 0; int mmap_flags_local = 0; + int memfd_flags_local = 0; /* * System-dependent code to find out the default huge page size. @@ -708,6 +710,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) } mmap_flags_local = MAP_HUGETLB; + memfd_flags_local = MFD_HUGETLB; /* * On recent enough Linux, also include the explicit page size, if @@ -718,7 +721,16 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) { int shift = pg_ceil_log2_64(hugepagesize_local); - mmap_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } +#endif + +#if defined(MFD_HUGE_MASK) && defined(MFD_HUGE_SHIFT) + if (hugepagesize_local != default_hugepagesize) + { + int shift = pg_ceil_log2_64(hugepagesize_local); + + memfd_flags_local |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; } #endif @@ -727,6 +739,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) *mmap_flags = mmap_flags_local; if (hugepagesize) *hugepagesize = hugepagesize_local; + if (memfd_flags) + *memfd_flags = memfd_flags_local; #else @@ -734,6 +748,8 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) *hugepagesize = 0; if (mmap_flags) *mmap_flags = 0; + if (memfd_flags) + *memfd_flags = 0; #endif /* MAP_HUGETLB */ } @@ -771,7 +787,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base) Size allocsize = mapping->shmem_size; void *ptr = MAP_FAILED; int mmap_errno = 0; - int mmap_flags = PG_MMAP_FLAGS; + int mmap_flags = PG_MMAP_FLAGS, memfd_flags = 0; #ifndef MAP_HUGETLB /* ReserveAnonymousMemory should have dealt with this case */ @@ -785,7 +801,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base) Assert(huge_pages == HUGE_PAGES_ON || huge_pages == HUGE_PAGES_TRY); /* Round up the request size to a suitable 
large value */ - GetHugePageSize(&hugepagesize, &mmap_flags); + GetHugePageSize(&hugepagesize, &mmap_flags, &memfd_flags); if (allocsize % hugepagesize != 0) allocsize += hugepagesize - (allocsize % hugepagesize); @@ -794,6 +810,29 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base) } #endif + /* + * Prepare an anonymous file backing the segment. Its size will be + * specified later via ftruncate. + * + * The file behaves like a regular file, but lives in memory. Once all + * references to the file are dropped, it is automatically released. + * Anonymous memory is used for all backing pages of the file, thus it has + * the same semantics as anonymous memory allocations using mmap with the + * MAP_ANONYMOUS flag. + */ + mapping->segment_fd = memfd_create(MappingName(mapping->shmem_segment), + memfd_flags); + + /* + * Specify the segment file size using allocsize, which contains + * potentially modified size. + */ + if(ftruncate(mapping->segment_fd, allocsize) == -1) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("could not truncase anonymous file for \"%s\": %m", + MappingName(mapping->shmem_segment)))); + elog(DEBUG1, "segment[%s]: mmap(%zu) at address %p", MappingName(mapping->shmem_segment), allocsize, base + reserved_offset); @@ -807,7 +846,7 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base) * a restart. */ ptr = mmap(base + reserved_offset, allocsize, PROT_READ | PROT_WRITE, - mmap_flags | MAP_FIXED, -1, 0); + mmap_flags | MAP_FIXED, mapping->segment_fd, 0); mmap_errno = errno; if (ptr == MAP_FAILED) @@ -817,8 +856,15 @@ CreateAnonymousSegment(AnonymousMapping *mapping, Pointer base) "fallback to the non-resizable allocation", MappingName(mapping->shmem_segment), allocsize, base + reserved_offset); + /* Specify the segment file size using allocsize. 
*/ + if(ftruncate(mapping->segment_fd, allocsize) == -1) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("could not truncase anonymous file for \"%s\": %m", + MappingName(mapping->shmem_segment)))); + ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE, - PG_MMAP_FLAGS, -1, 0); + PG_MMAP_FLAGS, mapping->segment_fd, 0); mmap_errno = errno; } else @@ -889,7 +935,7 @@ ReserveAnonymousMemory(Size reserve_size) Size hugepagesize, total_size = 0; int mmap_flags; - GetHugePageSize(&hugepagesize, &mmap_flags); + GetHugePageSize(&hugepagesize, &mmap_flags, NULL); /* * Figure out how much memory is needed for all segments, keeping in @@ -1070,6 +1116,13 @@ AnonymousShmemResize(void) if (m->shmem_size == new_size) continue; + /* Resize the backing anon file. */ + if(ftruncate(m->segment_fd, new_size) == -1) + ereport(FATAL, + (errcode(ERRCODE_SYSTEM_ERROR), + errmsg("could not truncase anonymous file for \"%s\": %m", + MappingName(m->shmem_segment)))); + /* Clean up some reserved space to resize into */ if (munmap(m->shmem + m->shmem_size, new_size - m->shmem_size) == -1) ereport(FATAL, diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c index ce719f1b412..ba972106de1 100644 --- a/src/backend/port/win32_shmem.c +++ b/src/backend/port/win32_shmem.c @@ -627,7 +627,7 @@ pgwin32_ReserveSharedMemoryRegion(HANDLE hChild) * use GetLargePageMinimum() instead. */ void -GetHugePageSize(Size *hugepagesize, int *mmap_flags) +GetHugePageSize(Size *hugepagesize, int *mmap_flags, int *memfd_flags) { if (hugepagesize) *hugepagesize = 0; diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index abeb91e24fd..dc2b4becf4a 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -396,7 +396,7 @@ InitializeShmemGUCs(void) /* * Calculate the number of huge pages required. 
*/ - GetHugePageSize(&hp_size, NULL); + GetHugePageSize(&hp_size, NULL, NULL); if (hp_size != 0) { Size hp_required; diff --git a/src/include/portability/mem.h b/src/include/portability/mem.h index ef9800732d9..40588ff6968 100644 --- a/src/include/portability/mem.h +++ b/src/include/portability/mem.h @@ -38,7 +38,7 @@ #define MAP_NOSYNC 0 #endif -#define PG_MMAP_FLAGS (MAP_SHARED|MAP_ANONYMOUS|MAP_HASSEMAPHORE) +#define PG_MMAP_FLAGS (MAP_SHARED|MAP_HASSEMAPHORE) /* Some really old systems don't define MAP_FAILED. */ #ifndef MAP_FAILED diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 2e47b222cbb..b9573520d9a 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -124,7 +124,8 @@ extern PGShmemHeader *PGSharedMemoryCreate(Size size, PGShmemHeader **shim, Pointer base); extern bool PGSharedMemoryIsInUse(unsigned long id1, unsigned long id2); extern void PGSharedMemoryDetach(void); -extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags); +extern void GetHugePageSize(Size *hugepagesize, int *mmap_flags, + int *memfd_flags); void *ReserveAnonymousMemory(Size reserve_size); bool ProcessBarrierShmemResize(Barrier *barrier); -- 2.45.1