From e409526a007dff663b65f6deeeeec09190dbb6f3 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Sun, 22 Mar 2026 02:22:06 -0400 Subject: [PATCH v28 07/11] heapam: Optimize pin transfers during index scans. Add an xs_lastinblock flag to IndexScanHeapData, to track whether the current item's heap block differs from the next item's heap block ("next" in terms of the current scan direction). When these adjacent blocks differ, heapam_index_heap_fetch will transfer its buffer pin to its table slot instead of incrementing the pin count. This avoids an immediate IncrBufferRefCount call. It also avoids a ReleaseBuffer call later on, during the next call to heapam_index_heap_fetch (when the scan has to return the aforementioned "next" item). Also add an explicit ExecClearTuple to the block-switch path in heapam_index_heap_fetch to release the pin on the slot (which is often the pin transferred to the slot during the previous call). This fixes a performance problem where GetPrivateRefCountEntrySlow is called more often than one would hope. The underlying issue has been tied to the pin in the slot being held, even if we decide to release the buffer and move on: ExecStoreBufferHeapTuple will first fail to hit the backend-local cache for the release of the old pin (because we just pinned and locked the new buffer), causing a cache miss. Author: Peter Geoghegan Suggested-by: Andres Freund Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CAH2-Wz=D4Lru9BkvqaRnFRPDaZbfTOdWcxw13zyG6GVFTtz_vw@mail.gmail.com --- src/include/access/heapam.h | 3 + src/backend/access/heap/heapam_indexscan.c | 73 +++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 5364ce27b..71b6420c9 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -132,6 +132,9 @@ typedef struct IndexScanHeapData bool xs_readonly; /* scan is read-only? */ + /* Plain index scan xs_lastinblock optimization */ + bool xs_lastinblock; /* last TID on this block in current batch? */ + uint16 xs_blkswitch_count; /* number of heap blocks fetched */ /* Per-tuple context for padding "name" columns during index-only scans */ diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c index 5f04041df..e9b1ea851 100644 --- a/src/backend/access/heap/heapam_indexscan.c +++ b/src/backend/access/heap/heapam_indexscan.c @@ -154,6 +154,9 @@ heapam_index_scan_begin(IndexScanDesc scan, uint32 flags) /* Remember if scan is read-only */ hscan->xs_readonly = (flags & SO_HINT_REL_READ_ONLY) != 0; + /* xs_lastinblock optimization state */ + Assert(!hscan->xs_lastinblock); + /* Resolve which xs_getnext_slot implementation to use for this scan */ if (scan->indexRelation->rd_indam->amgetbatch != NULL) { @@ -788,6 +791,15 @@ heapam_index_heap_fetch(IndexScanDesc scan, IndexScanHeapData *hscan, */ hscan->xs_blkswitch_count++; + /* + * Drop the xs_blk pin independently held on by slot (if any) now, + * before calling ReleaseBuffer. This avoids expensive calls to + * GetPrivateRefCountEntrySlow caused by ExecStoreBufferHeapTuple + * failing to hit the backend's cache for the release of the old pin. + */ + if (!index_only) + ExecClearTuple(slot); + if (BufferIsValid(hscan->xs_cbuf)) ReleaseBuffer(hscan->xs_cbuf); @@ -826,7 +838,36 @@ heapam_index_heap_fetch(IndexScanDesc scan, IndexScanHeapData *hscan, *heap_continue = !scan->MVCCScan; slot->tts_tableOid = RelationGetRelid(rel); - ExecStoreBufferHeapTuple(heapTuple, slot, hscan->xs_cbuf); + + /* + * If xs_lastinblock indicates that `tid` is the last TID on the + * current heap block, transfer our buffer pin to the slot rather + * than having the slot increment the pin count. This saves a + * pair of IncrBufferRefCount and ReleaseBuffer calls, since the + * next call here would just release the same pin on xs_cbuf + * anyway. (Actually, this is only true if you assume that the + * scan will continue in the current direction, but it generally + * does. An incorrect prediction costs us little.) + * + * We can only safely do this when heap_continue is false, since + * otherwise the caller will need xs_cbuf to remain valid for the + * next call. + */ + if (hscan->xs_lastinblock && !*heap_continue) + { + ExecStorePinnedBufferHeapTuple(heapTuple, slot, hscan->xs_cbuf); + hscan->xs_cbuf = InvalidBuffer; + hscan->xs_blk = InvalidBlockNumber; + + /* + * Note: the pin now owned by the slot is expected to be + * released on the next call here, via an explicit + * ExecClearTuple. This avoids churn in the backend's private + * refcount cache. + */ + } + else + ExecStoreBufferHeapTuple(heapTuple, slot, hscan->xs_cbuf); } else { @@ -975,10 +1016,40 @@ heapam_index_return_scanpos_tid(IndexScanDesc scan, IndexScanHeapData *hscan, if (all_visible == NULL) { + int nextItem; + bool hasNext; + /* * Plain index scan. + * + * Set xs_lastinblock to indicate whether the next item in the current + * scan direction is on a different heap block to the current item. + * heapam_index_heap_fetch will apply this information about + * scanPos.item's tableTID before we return to the core executor. + * + * Note: We don't set this for index-only scans because it doesn't seem + * worth the trouble of reasoning about all-visible items. + * + * Note: We deliberately don't consider the batch after scanBatch, + * because doing so would add complexity for little benefit. It's + * okay if xs_lastinblock is spuriously set to false. */ Assert(!scan->xs_want_itup); + if (ScanDirectionIsForward(direction)) + { + nextItem = scanPos->item + 1; + hasNext = (nextItem <= scanBatch->lastItem); + } + else + { + nextItem = scanPos->item - 1; + hasNext = (nextItem >= scanBatch->firstItem); + } + + hscan->xs_lastinblock = hasNext && + ItemPointerGetBlockNumber(&scanBatch->items[nextItem].tableTid) != + ItemPointerGetBlockNumber(&scan->xs_heaptid); + return &scan->xs_heaptid; } -- 2.53.0