From 93d7675cb5f56629b948e3e186de7015d67c7070 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Thu, 1 May 2025 20:23:18 +0200
Subject: [PATCH v20250709 2/6] prefetch for btree indexes

Implements the bt_stream_read_next() callback, returning blocks from
the current BTScanOpaque.
---
 src/backend/access/nbtree/nbtree.c    | 162 ++++++++++++++++++++++++++
 src/backend/access/nbtree/nbtsearch.c |  46 ++++++++
 src/include/access/nbtree.h           |   9 ++
 3 files changed, 217 insertions(+)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 619b356e848..0fa4af79dac 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -21,9 +21,11 @@
 #include "access/nbtree.h"
 #include "access/relscan.h"
 #include "access/stratnum.h"
+#include "access/visibilitymap.h"
 #include "commands/progress.h"
 #include "commands/vacuum.h"
 #include "nodes/execnodes.h"
+#include "optimizer/cost.h"
 #include "pgstat.h"
 #include "storage/bulk_write.h"
 #include "storage/condition_variable.h"
@@ -329,6 +331,107 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
 	return ntids;
 }
 
+/*
+ * bt_stream_read_next
+ *		Return the next block to read from the read stream.
+ *
+ * Returns the next block from the current leaf page. The first block is
+ * requested when accessing the first tuple, after already receiving the
+ * TID from the index (for the item itemIndex points at).
+ *
+ * With index-only scans this skips all-visible pages. The visibility info
+ * is stored, so that we can later pass it to the scan (we must not access
+ * the VM again, the bit might have changed, and the read stream would get
+ * out of sync -- we'd get different blocks than we expect).
+ *
+ * Returns the block number to get from the read stream. InvalidBlockNumber
+ * means we've run out of items on the current leaf page - the stream will
+ * end, and we'll need to reset it after reading the next page (or after
+ * changing the scan direction).
+ *
+ * XXX Should skip duplicate blocks (for correlated indexes). But that's
+ * not implemented yet.
+ */
+static BlockNumber
+bt_stream_read_next(ReadStream *stream,
+					void *callback_private_data,
+					void *per_buffer_data)
+{
+	IndexScanDesc	scan = (IndexScanDesc) callback_private_data;
+	BTScanOpaque	so = (BTScanOpaque) scan->opaque;
+	ScanDirection	dir = so->currPos.dir;
+	BlockNumber		block = InvalidBlockNumber;
+
+	/*
+	 * Is this the first request for the read stream (possibly after a reset)?
+	 * If yes, initialize the stream to the current item (itemIndex).
+	 */
+	if (so->currPos.streamIndex == -1)
+		so->currPos.streamIndex = so->currPos.itemIndex;
+
+	/*
+	 * Find the next block to read. For plain index scans we will return the
+	 * very next item, but with index-only scans we skip TIDs from all-visible
+	 * pages (because we won't read those).
+	 */
+	while ((so->currPos.streamIndex >= so->currPos.firstItem) &&
+		   (so->currPos.streamIndex <= so->currPos.lastItem))
+	{
+		ItemPointer		tid;
+		BTScanPosItem  *item;
+
+		item = &so->currPos.items[so->currPos.streamIndex];
+
+		tid = &item->heapTid;
+		block = ItemPointerGetBlockNumber(tid);
+
+		/*
+		 * For index-only scans, check the VM and remember the result. If the page
+		 * is all-visible, don't return the block number, try reading the next one.
+		 *
+		 * XXX Maybe this could use the same logic to check for duplicate blocks,
+		 * and reuse the VM result if possible.
+		 */
+		if (scan->xs_want_itup)
+		{
+			if (!item->allVisibleSet)
+			{
+				item->allVisibleSet = true;
+				item->allVisible = VM_ALL_VISIBLE(scan->heapRelation,
+												  ItemPointerGetBlockNumber(tid),
+												  &so->vmBuffer);
+			}
+
+			/* don't prefetch this all-visible block, try the next one */
+			if (item->allVisible)
+				block = InvalidBlockNumber;
+		}
+
+		/* advance to the next item, assuming the current scan direction */
+		if (ScanDirectionIsForward(dir))
+		{
+			so->currPos.streamIndex++;
+		}
+		else
+		{
+			so->currPos.streamIndex--;
+		}
+
+		/* don't return the same block twice (and remember this one) */
+		if (so->lastBlock == block)
+			block = InvalidBlockNumber;
+
+		/* Did we find a valid block? If yes, we're done. */
+		if (block != InvalidBlockNumber)
+			break;
+	}
+
+	/* remember the block we're returning */
+	so->lastBlock = block;
+
+	return block;
+}
+
 /*
  *	btbeginscan() -- start a scan on a btree index
  */
@@ -364,6 +467,12 @@ btbeginscan(Relation heap, Relation index, int nkeys, int norderbys)
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
 
+	/* buffer for accessing the VM in read_next callback */
+	so->vmBuffer = InvalidBuffer;
+
+	/* nothing returned */
+	so->lastBlock = InvalidBlockNumber;
+
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
@@ -375,6 +484,27 @@ btbeginscan(Relation heap, Relation index, int nkeys, int norderbys)
 
 	scan->opaque = so;
 
+	/*
+	 * Initialize the read stream too, to opt into prefetching.
+	 *
+	 * XXX We create a stream depending on the GUC, and only if the heap rel
+	 * is provided. This means we don't initialize the stream even for bitmap
+	 * scans, which don't use it.
+	 *
+	 * XXX The table has to be already locked by the query, so NoLock. Too
+	 * bad the heapRelation does not get passed here.
+	 */
+	if (enable_indexscan_prefetch && heap)
+	{
+		scan->xs_rs = read_stream_begin_relation(READ_STREAM_DEFAULT,
+												 NULL,
+												 heap,
+												 MAIN_FORKNUM,
+												 bt_stream_read_next,
+												 scan,
+												 0);
+	}
+
 	return scan;
 }
 
@@ -461,6 +591,14 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 		memcpy(scan->keyData, scankey, scan->numberOfKeys * sizeof(ScanKeyData));
 	so->numberOfKeys = 0;		/* until _bt_preprocess_keys sets it */
 	so->numArrayKeys = 0;		/* ditto */
+
+	/* reset the read stream, to start over */
+	if (scan->xs_rs)
+	{
+		so->currPos.streamIndex = -1;
+		so->lastBlock = InvalidBlockNumber;
+		read_stream_reset(scan->xs_rs);
+	}
 }
 
 /*
@@ -483,6 +621,22 @@ btendscan(IndexScanDesc scan)
 	so->markItemIndex = -1;
 	BTScanPosUnpinIfPinned(so->markPos);
 
+	/* release the VM buffer */
+	if (so->vmBuffer != InvalidBuffer)
+	{
+		ReleaseBuffer(so->vmBuffer);
+		so->vmBuffer = InvalidBuffer;
+	}
+
+	/*
+	 * XXX I wonder if maybe the stream should be managed by the indexam.c
+	 * layer, not by each index AM.
+	 */
+
+	/* terminate the read stream (and close the heap) */
+	if (scan->xs_rs)
+		read_stream_end(scan->xs_rs);
+
 	/* No need to invalidate positions, the RAM is about to be freed. */
 
 	/* Release storage */
@@ -581,6 +735,14 @@ btrestrpos(IndexScanDesc scan)
 		else
 			BTScanPosInvalidate(so->currPos);
 	}
+
+	/* we've restored the scan position, reset the read stream */
+	if (scan->xs_rs)
+	{
+		so->currPos.streamIndex = -1;
+		so->lastBlock = InvalidBlockNumber;
+		read_stream_reset(scan->xs_rs);
+	}
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 4af1ff1e9e5..91e4fe5ec95 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -17,6 +17,7 @@
 
 #include "access/nbtree.h"
 #include "access/relscan.h"
+#include "access/visibilitymap.h"
 #include "access/xact.h"
 #include "miscadmin.h"
 #include "pgstat.h"
@@ -2045,6 +2046,21 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum,
 	 */
 	Assert(!pstate.forcenonrequired);
 
+	/*
+	 * Reset the read stream, to restart it for the new page.
+	 *
+	 * XXX Maybe we should not reset prefetch distance to 0, but start from
+	 * a somewhat higher value. We're merely continuing the same scan as
+	 * before ... maybe reduce it a bit, to not harm LIMIT queries, but not
+	 * reset it all the way to 0.
+	 */
+	if (scan->xs_rs)
+	{
+		so->currPos.streamIndex = -1;
+		so->lastBlock = InvalidBlockNumber;
+		read_stream_reset(scan->xs_rs);
+	}
+
 	return (so->currPos.firstItem <= so->currPos.lastItem);
 }
 
@@ -2059,6 +2075,11 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
+
+	/* initialize visibility flags */
+	currItem->allVisibleSet = false;
+	currItem->allVisible = false;
+
 	if (so->currTuples)
 	{
 		Size		itupsz = IndexTupleSize(itup);
@@ -2089,6 +2110,11 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+
+	/* initialize visibility flags */
+	currItem->allVisibleSet = false;
+	currItem->allVisible = false;
+
 	if (so->currTuples)
 	{
 		/* Save base IndexTuple (truncate posting list) */
@@ -2126,6 +2152,10 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
 
+	/* initialize visibility flags */
+	currItem->allVisibleSet = false;
+	currItem->allVisible = false;
+
 	/*
 	 * Have index-only scans return the same base IndexTuple for every TID
 	 * that originates from the same posting list
@@ -2150,6 +2180,22 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
 
 	/* Return next item, per amgettuple contract */
 	scan->xs_heaptid = currItem->heapTid;
+
+	/*
+	 * XXX If this is index-only scan and we haven't checked the VM yet, do
+	 * that now. We need to make sure scan->xs_visible is set correctly even
+	 * if the scan is not using a read stream.
+	 */
+	if (scan->xs_want_itup && !currItem->allVisibleSet)
+	{
+		currItem->allVisibleSet = true;
+		currItem->allVisible
+			= VM_ALL_VISIBLE(scan->heapRelation,
+							 ItemPointerGetBlockNumber(&currItem->heapTid),
+							 &so->vmBuffer);
+	}
+
+	scan->xs_visible = currItem->allVisible;
 	if (so->currTuples)
 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
 }
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index e6e52210b15..e307fd9bf7f 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -957,6 +957,8 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 	ItemPointerData heapTid;	/* TID of referenced heap item */
 	OffsetNumber indexOffset;	/* index item's location within page */
 	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
+	bool allVisibleSet;			/* did we set the VM flag already? */
+	bool allVisible;			/* VM info (for IOS) */
 } BTScanPosItem;
 
 typedef struct BTScanPosData
@@ -995,6 +997,7 @@ typedef struct BTScanPosData
 	int			firstItem;		/* first valid index in items[] */
 	int			lastItem;		/* last valid index in items[] */
 	int			itemIndex;		/* current index in items[] */
+	int			streamIndex;	/* item returned to the read stream */
 
 	BTScanPosItem items[MaxTIDsPerBTreePage];	/* MUST BE LAST */
 } BTScanPosData;
@@ -1067,6 +1070,9 @@ typedef struct BTScanOpaqueData
 	FmgrInfo   *orderProcs;		/* ORDER procs for required equality keys */
 	MemoryContext arrayContext; /* scan-lifespan context for array data */
 
+	/* buffer for accessing VM in index-only scans */
+	Buffer		vmBuffer;
+
 	/* info about killed items if any (killedItems is NULL if never used) */
 	int		   *killedItems;	/* currPos.items indexes of killed items */
 	int			numKilled;		/* number of currently stored items */
@@ -1089,6 +1095,9 @@ typedef struct BTScanOpaqueData
 	 */
 	int			markItemIndex;	/* itemIndex, or -1 if not valid */
 
+	/* last block returned by the read_next stream callback */
+	BlockNumber		lastBlock;
+
 	/* keep these last in struct for efficiency */
 	BTScanPosData currPos;		/* current position data */
 	BTScanPosData markPos;		/* marked position, if any */
-- 
2.50.0

