From 5e190dc84ce7078b6c680d18736e56be90cecdab Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Thu, 19 Jun 2025 17:38:29 +0200 Subject: [PATCH v5 08/10] Support shrinking shared buffers When shrinking the shared buffers pool, each buffer in the area being shrunk needs to be flushed if it's dirty so as not to lose the changes to that buffer after shrinking. Also, each such buffer needs to be removed from the buffer mapping table so that backends do not access it after shrinking. Buffer eviction requires a separate barrier phase for two reasons: 1. No other backend should map a new page to any of buffers being evicted when eviction is in progress. So they wait while eviction is in progress. 2. Since a pinned buffer has the pin recorded in the backend local memory as well as the buffer descriptor (which is in shared memory), eviction should not coincide with remapping the shared memory of a backend. Otherwise we might lose consistency of local and shared pinning records. Hence it needs to be carried out in ProcessBarrierShmemResize() and not in AnonymousShmemResize() as indicated by the now removed comment. If a buffer being evicted is pinned, we raise a FATAL error but this should improve. There are multiple options 1. to wait for the pinned buffer to get unpinned, 2. the backend is killed or it itself cancels the query or 3. rollback the operation. Note that options 1 and 2 would require the pinning related local and shared records to be accessed. But we do not have the infrastructure to do either of these right now. 
Ashutosh Bapat --- src/backend/port/sysv_shmem.c | 42 +++++--- src/backend/storage/buffer/buf_init.c | 8 +- src/backend/storage/buffer/bufmgr.c | 95 +++++++++++++++++++ src/backend/storage/buffer/freelist.c | 68 +++++++++++++ .../utils/activity/wait_event_names.txt | 1 + src/include/storage/buf_internals.h | 1 + src/include/storage/bufmgr.h | 1 + src/include/storage/pg_shmem.h | 1 + 8 files changed, 202 insertions(+), 15 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index b3c90d15d52..e612a83c9f0 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -1011,14 +1011,6 @@ AnonymousShmemResize(void) */ pending_pm_shmem_resize = false; - /* - * XXX: Currently only increasing of shared_buffers is supported. For - * decreasing something similar has to be done, but buffer blocks with - * data have to be drained first. - */ - if(NBuffersOld > NBuffers) - return false; - #ifndef MAP_HUGETLB /* PrepareHugePages should have dealt with this case */ Assert(huge_pages != HUGE_PAGES_ON && !huge_pages_on); @@ -1112,11 +1104,14 @@ AnonymousShmemResize(void) * all the pointers are still valid, and we only need to update * structures size in the ShmemIndex once -- any other backend * will pick up this shared structure from the index. - * - * XXX: This is the right place for buffer eviction as well. */ BufferManagerShmemInit(NBuffersOld); + /* + * Wipe out the evictor PID so that it can be used for the next + * buffer resizing operation. + */ + ShmemCtrl->evictor_pid = 0; /* If all fine, broadcast the new value */ pg_atomic_write_u32(&ShmemCtrl->NSharedBuffers, NBuffers); } @@ -1169,11 +1164,31 @@ ProcessBarrierShmemResize(Barrier *barrier) * XXX: If we need to be able to abort resizing, this has to be done later, * after the SHMEM_RESIZE_DONE. */ - if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START)) + + /* + * Evict extra buffers when shrinking shared buffers. 
We need to do this + * while the memory for extra buffers is still mapped i.e. before remapping + * the shared memory segments to a smaller memory area. + */ + if (NBuffersOld > NBuffersPending) { - Assert(IsUnderPostmaster); - SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); + BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START); + + /* + * TODO: If the buffer eviction fails for any reason, we should + * gracefully rollback the shared buffer resizing and try again. But the + * infrastructure to do so is not available right now. Hence just raise + * a FATAL so that the system restarts. + */ + if (!EvictExtraBuffers(NBuffersPending, NBuffersOld)) + elog(FATAL, "buffer eviction failed"); + + if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_EVICT)) + SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); } + else + if (BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_START)) + SendPostmasterSignal(PMSIGNAL_SHMEM_RESIZE); AnonymousShmemResize(); @@ -1683,5 +1698,6 @@ ShmemControlInit(void) /* shmem_resizable should be initialized by now */ ShmemCtrl->Resizable = shmem_resizable; + ShmemCtrl->evictor_pid = 0; } } diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 8c1ea623392..6f5743036a2 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -157,8 +157,12 @@ BufferManagerShmemInit(int FirstBufferToInit) } #endif - /* Correct last entry of linked list */ - GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; + /* + * Correct last entry of linked list, when initializing the buffers or when + * expanding the buffers. 
+ */ + if (FirstBufferToInit < NBuffers) + GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; /* Init other shared buffer-management stuff */ StrategyInitialize(!foundDescs); diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 667aa0c0c78..169a44dd9fc 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -57,6 +57,7 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "storage/lmgr.h" +#include "storage/pg_shmem.h" #include "storage/proc.h" #include "storage/read_stream.h" #include "storage/smgr.h" @@ -7453,3 +7454,97 @@ const PgAioHandleCallbacks aio_local_buffer_readv_cb = { .complete_local = local_buffer_readv_complete, .report = buffer_readv_report, }; + +/* + * When shrinking shared buffers pool, evict the buffers which will not be part + * of the shrunk buffer pool. + */ +bool +EvictExtraBuffers(int newBufSize, int oldBufSize) +{ + bool result = true; + + /* + * If the buffer being evicted is locked, this function will need to wait. + * This function should not be called from a Postmaster since it cannot wait on a lock. + */ + Assert(IsUnderPostmaster); + + /* + * Let only one backend perform eviction. We could split the work across all + * the backends but that doesn't seem necessary. + * + * The first backend to acquire ShmemResizeLock sets its own PID as the + * evictor PID for other backends to know that the eviction is in progress or + * has already been performed. The evictor backend releases the lock when it + * finishes eviction. While the eviction is in progress, backends other than + * evictor backend won't be able to take the lock. They won't perform + * eviction. A backend may acquire the lock after eviction has completed, but + * it will not perform eviction since the evictor PID is already set. Evictor + * PID is reset only when the buffer resizing finishes. 
Thus only one backend + * will perform eviction in a given instance of shared buffers resizing. + * + * Any backend which acquires this lock will release it before the eviction + * phase finishes, hence the same lock can be reused for the next phase of + * resizing buffers. + */ + if (LWLockConditionalAcquire(ShmemResizeLock, LW_EXCLUSIVE)) + { + if (ShmemCtrl->evictor_pid == 0) + { + ShmemCtrl->evictor_pid = MyProcPid; + + StrategyPurgeFreeList(newBufSize); + + /* + * TODO: Before evicting any buffer, we should check whether any of the + * buffers are pinned. If we find that a buffer is pinned after evicting + * most of them, that will impact performance since all those evicted + * buffers might need to be read again. + */ + for (Buffer buf = newBufSize + 1; buf <= oldBufSize; buf++) + { + BufferDesc *desc = GetBufferDescriptor(buf - 1); + uint32 buf_state; + bool buffer_flushed; + + buf_state = pg_atomic_read_u32(&desc->state); + + /* + * Nobody is expected to touch the buffers while resizing is + * going on, hence unlocked precheck should be safe and saves + * some cycles. + */ + if (!(buf_state & BM_VALID)) + continue; + + /* + * XXX: Looks like CurrentResourceOwner can be NULL here, find + * another one in that case? + */ + if (CurrentResourceOwner) + ResourceOwnerEnlarge(CurrentResourceOwner); + + ReservePrivateRefCountEntry(); + + LockBufHdr(desc); + + /* + * Now that we have locked the buffer descriptor, make sure that the + * buffer without valid data has been skipped above. 
+ */ + Assert(buf_state & BM_VALID); + + if (!EvictUnpinnedBufferInternal(desc, &buffer_flushed)) + { + elog(WARNING, "could not remove buffer %u, it is pinned", buf); + result = false; + break; + } + } + } + LWLockRelease(ShmemResizeLock); + } + + return result; +} diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index bd390f2709d..7b9ed010e2f 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -527,6 +527,74 @@ StrategyInitialize(bool init) } +/* + * StrategyPurgeFreeList -- remove all buffers with id higher than the number of + * buffers in the buffer pool. + * + * This is called before evicting buffers while shrinking shared buffers, so that + * the free list does not reference a buffer that will be removed. + * + * The function is called after resizing has started and thus nobody should be + * traversing the free list and also not touching the buffers. + */ +void +StrategyPurgeFreeList(int numBuffers) +{ + int firstBuffer = FREENEXT_END_OF_LIST; + int nextFree = StrategyControl->firstFreeBuffer; + BufferDesc *prevValidBuf = NULL; + + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + while (nextFree != FREENEXT_END_OF_LIST) + { + BufferDesc *buf = GetBufferDescriptor(nextFree); + + /* nextFree should be id of buffer being examined. */ + Assert(nextFree == buf->buf_id); + /* The buffer should not be marked as not in the list. */ + Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); + + /* + * If the buffer is within the new size of pool, keep it in the free list + * otherwise discard it. + */ + if (buf->buf_id < numBuffers) + { + if (prevValidBuf != NULL) + prevValidBuf->freeNext = buf->buf_id; + prevValidBuf = buf; + + /* Save the first free buffer in the list if not already known. */ + if (firstBuffer == FREENEXT_END_OF_LIST) + firstBuffer = nextFree; + } + /* Examine the next buffer in the free list. 
 */ + nextFree = buf->freeNext; + } + + /* Update the last valid free buffer, if there's any. */ + if (prevValidBuf != NULL) + { + StrategyControl->lastFreeBuffer = prevValidBuf->buf_id; + prevValidBuf->freeNext = FREENEXT_END_OF_LIST; + } + else + StrategyControl->lastFreeBuffer = FREENEXT_END_OF_LIST; + + /* Update first valid free buffer, if there's any. */ + StrategyControl->firstFreeBuffer = firstBuffer; + + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + + /* + * TODO: following was suggested by AI. Check whether it is required. + * If we removed all buffers from the freelist, reset the clock sweep + * pointer to zero. This is not strictly necessary, but it seems like a + * good idea to avoid confusion. + */ +} + /* ---------------------------------------------------------------- * Backend-private buffer ring management * ---------------------------------------------------------------- diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 691fa14e9e3..0c588b69a90 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -156,6 +156,7 @@ REPLICATION_SLOT_DROP "Waiting for a replication slot to become inactive so it c RESTORE_COMMAND "Waiting for <xref linkend="guc-restore-command"/> to complete." SAFE_SNAPSHOT "Waiting to obtain a valid snapshot for a READ ONLY DEFERRABLE transaction." SHMEM_RESIZE_START "Waiting for other backends to start resizing shared memory." +SHMEM_RESIZE_EVICT "Waiting for other backends to finish buffer eviction phase." SHMEM_RESIZE_DONE "Waiting for other backends to finish resizing shared memory." SYNC_REP "Waiting for confirmation from a remote server during synchronous replication." WAL_BUFFER_INIT "Waiting on WAL buffer to be initialized." 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 0dec7d93b3b..add15e3723b 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -453,6 +453,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); +extern void StrategyPurgeFreeList(int numBuffers); extern bool have_free_buffer(void); /* buf_table.c */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 4239ebe640b..0c554f0b130 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -315,6 +315,7 @@ extern void EvictRelUnpinnedBuffers(Relation rel, int32 *buffers_evicted, int32 *buffers_flushed, int32 *buffers_skipped); +extern bool EvictExtraBuffers(int newBufSize, int oldBufSize); /* in buf_init.c */ extern void BufferManagerShmemInit(int); diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index bccdd45b1f7..9a3e68f76d8 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -64,6 +64,7 @@ extern PGDLLIMPORT ShmemSegment Segments[ANON_MAPPINGS]; typedef struct { pg_atomic_uint32 NSharedBuffers; + pid_t evictor_pid; Barrier Barrier; pg_atomic_uint64 Generation; bool Resizable; -- 2.49.0