From 0001d43117dc5cad08fb0908a3e50a00c56f88d3 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Sat, 13 Apr 2024 11:31:46 +0200
Subject: [PATCH v1] Identify huge page accessibility using madvise

Currently, PostgreSQL tries to figure out whether huge pages are
available, so that it can fall back to regular pages when
"huge_pages = try" is set. There is an annoying situation this approach
cannot handle: huge pages are available, but access to them is
restricted via cgroups. If PostgreSQL is running inside a cgroup that
limits huge pages to 0, the mmap allocation succeeds, but the very
first page fault raises SIGBUS.

To handle this situation more gracefully, add a madvise call with the
MADV_POPULATE_READ flag if available (it was introduced in Linux kernel
5.14). This flag tells the kernel to populate page tables by triggering
read faults if required; in the situation described above the call
fails, giving PostgreSQL an opportunity to fall back and proceed
without huge pages. Note that this is not a side effect, but rather
designed behaviour [1].

[1]: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=4ca9b3859dac14bbef0c27d00667bb5b10917adb
---
 src/backend/port/sysv_shmem.c | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 1a6d8fa0fb..cbacf62066 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -600,7 +600,7 @@ CreateAnonymousSegment(Size *size)
 {
 	Size		allocsize = *size;
 	void	   *ptr = MAP_FAILED;
-	int			mmap_errno = 0;
+	int			mmap_errno = 0, madv_errno = 0;
 
 #ifndef MAP_HUGETLB
 	/* PGSharedMemoryCreate should have dealt with this case */
@@ -625,6 +625,28 @@ CreateAnonymousSegment(Size *size)
 		if (huge_pages == HUGE_PAGES_TRY && ptr == MAP_FAILED)
 			elog(DEBUG1, "mmap(%zu) with MAP_HUGETLB failed, huge pages disabled: %m",
 				 allocsize);
+
+#ifdef MADV_POPULATE_READ
+		/*
+		 * Verifying whether huge pages are available is done in two steps:
+		 * first mmap with MAP_HUGETLB, then madvise with MADV_POPULATE_READ.
+		 * The MADV_POPULATE_READ flag tells the kernel to populate page
+		 * tables by triggering read faults if required, revealing potential
+		 * access issues that would otherwise result in SIGBUS.
+		 *
+		 * If mmap fails, no huge pages are available; if it succeeds, there
+		 * is still a possibility that huge pages are limited via cgroups. If
+		 * madvise fails, there are some huge pages, but we cannot access
+		 * them due to cgroup limitations. If both succeed, we're good to go.
+		 */
+		if (ptr != MAP_FAILED && madvise(ptr, allocsize, MADV_POPULATE_READ) != 0)
+		{
+			madv_errno = errno;
+			elog(DEBUG1, "madvise(%zu) with MADV_POPULATE_READ failed, "
+				 "huge pages disabled: %m", allocsize);
+			ptr = MAP_FAILED;
+		}
+#endif
 	}
 #endif
 
@@ -650,7 +672,11 @@ CreateAnonymousSegment(Size *size)
 
 	if (ptr == MAP_FAILED)
 	{
-		errno = mmap_errno;
+		if (mmap_errno != 0)
+			errno = mmap_errno;
+		else
+			errno = madv_errno;
+
 		ereport(FATAL,
 				(errmsg("could not map anonymous shared memory: %m"),
 				 (mmap_errno == ENOMEM) ?

base-commit: 3a4a3537a999932642ba7a459900fe3c4f5cad02
-- 
2.31.1
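
For anyone who wants to try the scenario outside PostgreSQL, here is a
minimal standalone sketch (not part of the patch) of the detection
sequence the patch relies on: mmap with MAP_HUGETLB followed by madvise
with MADV_POPULATE_READ. It assumes a Linux kernel 5.14+ with headers
that define MADV_POPULATE_READ, and the 2 MB huge page size used below
is an assumption that may differ on your system.

	/*
	 * Standalone illustration: map one huge page, then ask the kernel to
	 * populate the page tables with MADV_POPULATE_READ.  If huge pages
	 * exist but are unreachable (e.g. the cgroup's hugetlb limit is 0),
	 * the madvise call fails here instead of a later page fault raising
	 * SIGBUS.
	 */
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>
	#include <sys/mman.h>

	int
	main(void)
	{
		size_t	allocsize = 2 * 1024 * 1024;	/* assumes 2 MB huge pages */
		void   *ptr;

		ptr = mmap(NULL, allocsize, PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
		if (ptr == MAP_FAILED)
		{
			printf("mmap with MAP_HUGETLB failed: %s\n", strerror(errno));
			return 1;
		}

	#ifdef MADV_POPULATE_READ
		if (madvise(ptr, allocsize, MADV_POPULATE_READ) != 0)
		{
			/* huge pages exist, but e.g. a cgroup limit makes them unusable */
			printf("madvise(MADV_POPULATE_READ) failed: %s\n", strerror(errno));
			munmap(ptr, allocsize);
			return 1;
		}
	#endif

		printf("huge pages are usable\n");
		munmap(ptr, allocsize);
		return 0;
	}

Running this under a cgroup whose hugetlb limit is set to 0 (while huge
pages are reserved on the host) should report the madvise failure
rather than crashing with SIGBUS on first access, which is exactly the
fallback opportunity the patch gives to "huge_pages = try".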