From 5d966e57a2684e6dcb5d94e63668979ae1b94ccb Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Wed, 17 Jun 2026 17:00:08 -0400 Subject: [PATCH v1 2/2] nbtree: resurrect the recovery-side "pin scan" for VACUUM, This patch is for illustration only. The pin scan is the O(N) REDO interlock that commits 687f2cd7, 3e4b7d87, f65b94f6 and 9f83468b deliberately removed, so restoring it as-is reintroduces exactly the replication-delay regression those commits set out to avoid. --- src/include/access/nbtree.h | 5 ++- src/include/access/nbtxlog.h | 1 + src/include/access/xlog_internal.h | 2 +- src/backend/access/nbtree/nbtpage.c | 7 ++--- src/backend/access/nbtree/nbtree.c | 44 ++++++++++++++++++++++++++- src/backend/access/nbtree/nbtxlog.c | 34 ++++++++++++++++++--- src/backend/access/rmgrdesc/nbtdesc.c | 5 +-- 7 files changed, 85 insertions(+), 13 deletions(-) diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 3097e9bb1..3b128dc08 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -335,6 +335,8 @@ typedef struct BTVacState IndexBulkDeleteCallback callback; void *callback_state; BTCycleId cycleid; + BlockNumber lastBlockVacuumed; /* highest blkno we've vacuumed */ + BlockNumber lastBlockLocked; /* highest blkno we've cleanup-locked */ MemoryContext pagedelcontext; /* @@ -1253,7 +1255,8 @@ extern void _bt_upgradelockbufcleanup(Relation rel, Buffer buf); extern void _bt_pageinit(Page page, Size size); extern void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, - BTVacuumPosting *updatable, int nupdatable); + BTVacuumPosting *updatable, int nupdatable, + BlockNumber lastBlockVacuumed); struct TM_IndexDeleteOp; /* avoid including tableam.h here */ extern void _bt_delitems_delete_check(Relation rel, Buffer buf, Relation heapRel, diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h index 3a78ec27f..809e1323a 100644 --- a/src/include/access/nbtxlog.h +++ 
b/src/include/access/nbtxlog.h @@ -222,6 +222,7 @@ typedef struct xl_btree_reuse_page */ typedef struct xl_btree_vacuum { + BlockNumber lastBlockVacuumed; uint16 ndeleted; uint16 nupdated; diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 55663e6f4..be7189934 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -32,7 +32,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD120 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD121 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 054703861..a837539a5 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1181,7 +1181,8 @@ _bt_pageinit(Page page, Size size) void _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber *deletable, int ndeletable, - BTVacuumPosting *updatable, int nupdatable) + BTVacuumPosting *updatable, int nupdatable, + BlockNumber lastBlockVacuumed) { Page page = BufferGetPage(buf); BTPageOpaque opaque; @@ -1191,9 +1192,6 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, OffsetNumber updatedoffsets[MaxIndexTuplesPerPage]; XLogRecPtr recptr; - /* Shouldn't be called unless there's something to do */ - Assert(ndeletable > 0 || nupdatable > 0); - /* Generate new version of posting lists without deleted TIDs */ if (nupdatable > 0) updatedbuf = _bt_delitems_update(updatable, nupdatable, @@ -1256,6 +1254,7 @@ _bt_delitems_vacuum(Relation rel, Buffer buf, { xl_btree_vacuum xlrec_vacuum; + xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed; xlrec_vacuum.ndeleted = ndeletable; xlrec_vacuum.nupdated = nupdatable; diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 3df2c752e..d28f2a111 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ 
-1274,6 +1274,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vstate.callback = callback; vstate.callback_state = callback_state; vstate.cycleid = cycleid; + vstate.lastBlockVacuumed = BTREE_METAPAGE; /* Initialise at first block */ + vstate.lastBlockLocked = BTREE_METAPAGE; /* Create a temporary memory context to run _bt_pagedel in */ vstate.pagedelcontext = AllocSetContextCreate(CurrentMemoryContext, @@ -1383,6 +1385,34 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, /* Set statistics num_pages field to final size of index */ stats->num_pages = num_pages; + /* + * Force one final XLOG_BTREE_VACUUM record if there are leaf pages after + * the last one we vacuumed -- which includes the common case where the + * index's last leaf page had nothing to delete. Its lastBlockVacuumed + * field makes replay pin-scan those trailing leaf pages, so that the + * replay pin scan reaches the end of the index, just as our own scan + * cleanup-locked every leaf page. + */ + if (XLogStandbyInfoActive() && + vstate.lastBlockVacuumed < vstate.lastBlockLocked) + { + Buffer buf; + + /* + * The page should be valid, but we can't use _bt_getbuf() because we + * want to use a nondefault buffer access strategy. Since we aren't + * going to delete any items, getting a cleanup lock again is probably + * overkill, but for consistency do that anyway. + */ + buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked, + RBM_NORMAL, info->strategy); + LockBufferForCleanup(buf); + _bt_checkpage(rel, buf); + _bt_delitems_vacuum(rel, buf, NULL, 0, NULL, 0, + vstate.lastBlockVacuumed); + _bt_relbuf(rel, buf); + } + MemoryContextDelete(vstate.pagedelcontext); /* @@ -1538,6 +1568,14 @@ backtrack: */ _bt_upgradelockbufcleanup(rel, buf); + /* + * Remember the highest leaf page we've cleanup-locked. btvacuumscan() + * uses this to decide whether a final pin-scan record is needed to + * cover the tail of the index during replay (see btree_xlog_vacuum()). 
+ */ + if (blkno > vstate->lastBlockLocked) + vstate->lastBlockLocked = blkno; + /* * Check whether we need to backtrack to earlier pages. What we are * concerned about is a page split that happened since we started the @@ -1639,7 +1677,11 @@ backtrack: { Assert(nhtidsdead >= ndeletable + nupdatable); _bt_delitems_vacuum(rel, buf, deletable, ndeletable, updatable, - nupdatable); + nupdatable, vstate->lastBlockVacuumed); + + /* Remember the highest leaf page we've vacuumed */ + if (blkno > vstate->lastBlockVacuumed) + vstate->lastBlockVacuumed = blkno; stats->tuples_removed += nhtidsdead; /* must recompute maxoff */ diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index dff7d286f..e2346aa14 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -592,11 +592,37 @@ btree_xlog_vacuum(XLogReaderState *record) BTPageOpaque opaque; /* - * We need to take a cleanup lock here, just like btvacuumpage(). However, - * it isn't necessary to exhaustively get a cleanup lock on every block in - * the index during recovery (just getting a cleanup lock on pages with - * items to kill suffices). See nbtree/README for details. + * We need to take a cleanup lock here, just like btvacuumpage(). */ + if (InHotStandby && BlockNumberIsValid(xlrec->lastBlockVacuumed)) + { + RelFileLocator thisrlocator; + BlockNumber thisblkno; + BlockNumber blkno; + + XLogRecGetBlockTag(record, 0, &thisrlocator, NULL, &thisblkno); + + for (blkno = xlrec->lastBlockVacuumed + 1; blkno < thisblkno; blkno++) + { + /* + * We use RBM_NORMAL_NO_LOG mode because it's not an error condition + * to see all-zero pages. The original btvacuumpage() scan would + * have skipped over all-zero pages, noting them in the FSM but not + * bothering to initialize them just yet; so we mustn't throw an + * error here. 
+ */ + Buffer pbuf; + + pbuf = XLogReadBufferExtended(thisrlocator, MAIN_FORKNUM, blkno, + RBM_NORMAL_NO_LOG, InvalidBuffer); + if (BufferIsValid(pbuf)) + { + LockBufferForCleanup(pbuf); + UnlockReleaseBuffer(pbuf); + } + } + } + if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) == BLK_NEEDS_REDO) { diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c index 1d08f9957..5670cf56d 100644 --- a/src/backend/access/rmgrdesc/nbtdesc.c +++ b/src/backend/access/rmgrdesc/nbtdesc.c @@ -59,8 +59,9 @@ btree_desc(StringInfo buf, XLogReaderState *record) { xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec; - appendStringInfo(buf, "ndeleted: %u, nupdated: %u", - xlrec->ndeleted, xlrec->nupdated); + appendStringInfo(buf, "lastBlockVacuumed: %u, ndeleted: %u, nupdated: %u", + xlrec->lastBlockVacuumed, xlrec->ndeleted, + xlrec->nupdated); if (XLogRecHasBlockData(record, 0)) delvacuum_desc(buf, XLogRecGetBlockData(record, 0, NULL), -- 2.53.0