From 475fffc314b4d5975faf94d43504f2dcc0f1dc8d Mon Sep 17 00:00:00 2001 From: Ashutosh Bapat Date: Thu, 19 Jun 2025 17:38:51 +0200 Subject: [PATCH v5 09/10] Reinitialize StrategyControl after resizing buffers ... and BgBufferSync and ClockSweepTick adjustments The commit introduces a separate function StrategyReInitialize() instead of reusing StrategyInitialize() since some of the things that the second one does are not required in the first one. Here's list of what StrategyReInitialize() does and how does it differ from StrategyInitialize(). 1. When expanding the buffer pool add new buffers to the free list. 2. When shrinking buffers, we remove any buffers, in the area being shrunk, from the freelist. While doing so we adjust the first and last free buffer pointers in the StrategyControl area. Hence nothing more needed after resizing. 3. Check the sanity of the free buffer list is added after resizing. 4. StrategyControl pointer needn't be fetched again since it should not change. But added an Assert to make sure the pointer is valid. 5. &StrategyControl->buffer_strategy_lock need not be initialized again. 6. nextVictimBuffer, completePasses and numBufferAllocs are viewed in the context of NBuffers. Now that NBuffers itself has changed, those three do not make sense. Reset them as if the server has restarted again. This commit introduces a flag delay_shmem_resize, which postgresql backends and workers can use to signal the coordinator to delay resizing operation. Background writer sets this flag when its scanning buffers. Background writer is blocked when the actual resizing is in progress. But if resizing is about to begin, it does not scan the buffers by returning from BgBufferSync(). It stops a scan in progress when it sees that the resizing has begun. After the resizing is finished, it adjusts the collected statistics according to the new size of the buffer pool at the end of barrier processing. Once the buffer resizing is finished, before resuming the regular operation, bgwriter resets the information saved so far. This information is viewed in the context of NBuffers and hence does not make sense after NBuffer has changed. Ashutosh Bapat --- src/backend/port/sysv_shmem.c | 24 ++++- src/backend/postmaster/bgwriter.c | 2 +- src/backend/storage/buffer/buf_init.c | 11 ++- src/backend/storage/buffer/bufmgr.c | 74 ++++++++++---- src/backend/storage/buffer/freelist.c | 133 ++++++++++++++++++++++++++ src/include/storage/buf_internals.h | 1 + src/include/storage/bufmgr.h | 3 +- src/include/storage/ipc.h | 1 + 8 files changed, 226 insertions(+), 23 deletions(-) diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index e612a83c9f0..e61a557966b 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -115,6 +115,7 @@ static AnonymousMapping Mappings[ANON_MAPPINGS]; /* Flag telling postmaster that resize is needed */ volatile bool pending_pm_shmem_resize = false; +volatile bool delay_shmem_resize = false; /* Keeps track of the previous NBuffers value */ static int NBuffersOld = -1; @@ -1118,6 +1119,12 @@ AnonymousShmemResize(void) LWLockRelease(ShmemResizeLock); } + + /* + * TODO: Shouldn't we call ResizeBufferPool() here as well? Or those + * backend who can not lock the LWLock conditionally won't resize the + * buffers. + */ } return true; @@ -1138,13 +1145,17 @@ ProcessBarrierShmemResize(Barrier *barrier) { Assert(IsUnderPostmaster); - elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d", - NBuffersOld, NBuffersPending, pending_pm_shmem_resize); + elog(DEBUG1, "Handle a barrier for shmem resizing from %d to %d, %d, %d", + NBuffersOld, NBuffersPending, pending_pm_shmem_resize, delay_shmem_resize); /* Wait until we have seen the new NBuffers value */ if (!pending_pm_shmem_resize) return false; + /* Wait till this process becomes ready to resize buffers. */ + if (delay_shmem_resize) + return false; + /* * First thing to do after attaching to the barrier is to wait for others. * We can't simply use BarrierArriveAndWait, because backends might arrive @@ -1195,6 +1206,15 @@ ProcessBarrierShmemResize(Barrier *barrier) /* The second phase means the resize has finished, SHMEM_RESIZE_DONE */ BarrierArriveAndWait(barrier, WAIT_EVENT_SHMEM_RESIZE_DONE); + if (MyBackendType == B_BG_WRITER) + { + /* + * Before resuming regular background writer activity, adjust the + * statistics collected so far. + */ + BgBufferSyncAdjust(NBuffersOld, NBuffers); + } + BarrierDetach(barrier); return true; } diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index 72f5acceec7..32b34f28ead 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -233,7 +233,7 @@ BackgroundWriterMain(const void *startup_data, size_t startup_data_len) /* * Do one cycle of dirty-buffer writing. */ - can_hibernate = BgBufferSync(&wb_context); + can_hibernate = BgBufferSync(&wb_context, false); /* Report pending statistics to the cumulative stats system */ pgstat_report_bgwriter(); diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index 6f5743036a2..62c1672fb70 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -164,8 +164,15 @@ BufferManagerShmemInit(int FirstBufferToInit) if (FirstBufferToInit < NBuffers) GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; - /* Init other shared buffer-management stuff */ - StrategyInitialize(!foundDescs); + /* + * Init other shared buffer-management stuff from scratch configuring buffer + * pool the first time. If we are just resizing buffer pool adjust only the + * required structures. + */ + if (FirstBufferToInit == 0) + StrategyInitialize(!foundDescs); + else + StrategyReInitialize(FirstBufferToInit); /* Initialize per-backend file flush context */ WritebackContextInit(&BackendWritebackContext, diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 169a44dd9fc..590fc737da8 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -3611,6 +3611,32 @@ BufferSync(int flags) TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan); } +/* + * Information saved between BgBufferSync() calls so we can determine the + * strategy point's advance rate and avoid scanning already-cleaned buffers. The + * variables are global instead of static local so that BgBufferSyncAdjust() can + * adjust it when resizing shared buffers. + */ +static bool saved_info_valid = false; +static int prev_strategy_buf_id; +static uint32 prev_strategy_passes; +static int next_to_clean; +static uint32 next_passes; + +/* Moving averages of allocation rate and clean-buffer density */ +static float smoothed_alloc = 0; +static float smoothed_density = 10.0; + +void +BgBufferSyncAdjust(int NBuffersOld, int NBuffersNew) +{ + saved_info_valid = false; +#ifdef BGW_DEBUG + elog(DEBUG2, "invalidated background writer status after resizing buffers from %d to %d", + NBuffersOld, NBuffersNew); +#endif +} + /* * BgBufferSync -- Write out some dirty buffers in the pool. * @@ -3623,27 +3649,13 @@ BufferSync(int flags) * bgwriter_lru_maxpages to 0.) */ bool -BgBufferSync(WritebackContext *wb_context) +BgBufferSync(WritebackContext *wb_context, bool reset) { /* info obtained from freelist.c */ int strategy_buf_id; uint32 strategy_passes; uint32 recent_alloc; - /* - * Information saved between calls so we can determine the strategy - * point's advance rate and avoid scanning already-cleaned buffers. - */ - static bool saved_info_valid = false; - static int prev_strategy_buf_id; - static uint32 prev_strategy_passes; - static int next_to_clean; - static uint32 next_passes; - - /* Moving averages of allocation rate and clean-buffer density */ - static float smoothed_alloc = 0; - static float smoothed_density = 10.0; - /* Potentially these could be tunables, but for now, not */ float smoothing_samples = 16; float scan_whole_pool_milliseconds = 120000.0; @@ -3666,6 +3678,22 @@ BgBufferSync(WritebackContext *wb_context) long new_strategy_delta; uint32 new_recent_alloc; + /* + * If buffer pool is being shrunk the buffer being written out may not remain + * valid. If the buffer pool is being expanded, more buffers will become + * available without even this function writing out any. Hence wait till + * buffer resizing finishes i.e. go into hibernation mode. + */ + if (pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) != NBuffers) + return true; + + /* + * Resizing shared buffers while this function is performing an LRU scan on + * them may lead to wrong results. Indicate that the resizing should wait for + * the LRU scan to complete. + */ + delay_shmem_resize = true; + /* * Find out where the freelist clock sweep currently is, and how many * buffer allocations have happened since our last call. @@ -3842,8 +3870,17 @@ BgBufferSync(WritebackContext *wb_context) num_written = 0; reusable_buffers = reusable_buffers_est; - /* Execute the LRU scan */ - while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) + /* + * Execute the LRU scan. + * + * If buffer pool is being shrunk, the buffer being written may not remain + * valid. If the buffer pool is being expanded, more buffers will become + * available without even this function writing any. Hence stop what we are doing. This + * also unblocks other processes that are waiting for buffer resizing to + * finish. + */ + while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est && + pg_atomic_read_u32(&ShmemCtrl->NSharedBuffers) == NBuffers) { int sync_state = SyncOneBuffer(next_to_clean, true, wb_context); @@ -3902,6 +3939,9 @@ BgBufferSync(WritebackContext *wb_context) #endif } + /* Let the resizing commence. */ + delay_shmem_resize = false; + /* Return true if OK to hibernate */ return (bufs_to_lap == 0 && recent_alloc == 0); } diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 7b9ed010e2f..41641bb3ae6 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -98,6 +98,9 @@ static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state); static void AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf); +#ifdef USE_ASSERT_CHECKING +static void StrategyValidateFreeList(void); +#endif /* USE_ASSERT_CHECKING */ /* * ClockSweepTick - Helper routine for StrategyGetBuffer() @@ -526,6 +529,88 @@ StrategyInitialize(bool init) Assert(!init); } +/* + * StrategyReInitialize -- re-initialize the buffer cache replacement + * strategy. + * + * To be called when resizing buffer manager and only from the coordinator. + * TODO: Assess the differences between this function and StrategyInitialize(). + */ +void +StrategyReInitialize(int FirstBufferIdToInit) +{ + bool found; + + /* + * Resizing memory for buffer pools should not affect the address of + * StrategyControl. + */ + if (StrategyControl != (BufferStrategyControl *) + ShmemInitStructInSegment("Buffer Strategy Status", + sizeof(BufferStrategyControl), + &found, STRATEGY_SHMEM_SEGMENT)) + elog(FATAL, "something went wrong while re-initializing the buffer strategy"); + + Assert(found); + + /* TODO: Buffer lookup table adjustment: There are two options: + * + * 1. Resize the buffer lookup table to match the new number of buffers. But + * this requires rehashing all the entries in the buffer lookup table with + * the new table size. + * + * 2. Allocate maximum size of the buffer lookup table at the beginning and + * never resize it. This leaves sparse buffer lookup table which is + * inefficient from both memory and time perspective. According to David + * Rowley, the sparse entries in the buffer look up table cause frequent + * cacheline reload which affect performance. If the impact of that + * inefficiency in a benchmark is significant, we will need to consider first + * option. + */ + + /* + * When shrinking buffers, we must have adjusted the first and the last free + * buffer when removing the buffers being shrunk from the free list. Nothing + * to be done here. + * + * When expanding the shared buffers, new buffers are added at the end of the + * freelist or they form the new free list if there are no free buffers. + */ + if (FirstBufferIdToInit < NBuffers) + { + if (StrategyControl->firstFreeBuffer == FREENEXT_END_OF_LIST) + StrategyControl->firstFreeBuffer = FirstBufferIdToInit; + else + { + Assert(StrategyControl->lastFreeBuffer >= 0); + GetBufferDescriptor(StrategyControl->lastFreeBuffer - 1)->freeNext = FirstBufferIdToInit; + } + + StrategyControl->lastFreeBuffer = NBuffers - 1; + } + + /* Check free list sanity after resizing. */ +#ifdef USE_ASSERT_CHECKING + StrategyValidateFreeList(); +#endif /* USE_ASSERT_CHECKING */ + + /* + * The clock sweep tick pointer might have got invalidated. Reset it as if + * starting a fresh server. + */ + pg_atomic_write_u32(&StrategyControl->nextVictimBuffer, 0); + + /* + * The old statistics is viewed in the context of the number of shared + * buffers. It does not make sense now that the number of shared buffers + * itself has changed. + */ + StrategyControl->completePasses = 0; + pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); + + /* No pending notification */ + StrategyControl->bgwprocno = -1; +} /* * StrategyPurgeFreeList -- remove all buffers with id higher than the number of @@ -595,6 +680,54 @@ StrategyPurgeFreeList(int numBuffers) */ } +#ifdef USE_ASSERT_CHECKING +/* + * StrategyValidateFreeList-- check sanity of free buffer list. + */ +static void +StrategyValidateFreeList(void) +{ + int nextFree = StrategyControl->firstFreeBuffer; + int numFreeBuffers = 0; + int lastFreeBuffer = FREENEXT_END_OF_LIST; + + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + while (nextFree != FREENEXT_END_OF_LIST) + { + BufferDesc *buf = GetBufferDescriptor(nextFree); + + /* nextFree should be id of buffer being examined. */ + Assert(nextFree == buf->buf_id); + Assert(buf->buf_id < NBuffers); + /* The buffer should not be marked as not in the list. */ + Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); + + /* Update our knowledge of last buffer in the free list. */ + lastFreeBuffer = buf->buf_id; + + numFreeBuffers++; + + /* Avoid infinite recursion in case there are cycles in free list. */ + if (numFreeBuffers > NBuffers) + break; + + nextFree = buf->freeNext; + } + + Assert(numFreeBuffers <= NBuffers); + + /* + * Make sure that the StrategyControl's knowledge of last free buffer + * agrees with what's there in the free list. + */ + if (StrategyControl->firstFreeBuffer != FREENEXT_END_OF_LIST) + Assert(StrategyControl->lastFreeBuffer == lastFreeBuffer); + + SpinLockRelease(&StrategyControl->buffer_strategy_lock); +} +#endif /* USE_ASSERT_CHECKING */ + /* ---------------------------------------------------------------- * Backend-private buffer ring management * ---------------------------------------------------------------- diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index add15e3723b..46949e9d90e 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -454,6 +454,7 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); extern void StrategyPurgeFreeList(int numBuffers); +extern void StrategyReInitialize(int FirstBufferToInit); extern bool have_free_buffer(void); /* buf_table.c */ diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 0c554f0b130..83a75eab844 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -298,7 +298,8 @@ extern bool ConditionalLockBufferForCleanup(Buffer buffer); extern bool IsBufferCleanupOK(Buffer buffer); extern bool HoldingBufferPinThatDelaysRecovery(void); -extern bool BgBufferSync(struct WritebackContext *wb_context); +extern bool BgBufferSync(struct WritebackContext *wb_context, bool reset); +extern void BgBufferSyncAdjust(int NBuffersOld, int NBuffersNew); extern uint32 GetPinLimit(void); extern uint32 GetLocalPinLimit(void); diff --git a/src/include/storage/ipc.h b/src/include/storage/ipc.h index bb7ae4d33b3..7d1c64a9267 100644 --- a/src/include/storage/ipc.h +++ b/src/include/storage/ipc.h @@ -65,6 +65,7 @@ typedef void (*shmem_startup_hook_type) (void); extern PGDLLIMPORT bool proc_exit_inprogress; extern PGDLLIMPORT bool shmem_exit_inprogress; extern PGDLLIMPORT volatile bool pending_pm_shmem_resize; +extern PGDLLIMPORT volatile bool delay_shmem_resize; pg_noreturn extern void proc_exit(int code); extern void shmem_exit(int code); -- 2.49.0