From 19ff92d7034e726c840c83946d87c980a48a594d Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@vondra.me>
Date: Fri, 25 Apr 2025 14:52:56 +0200
Subject: [PATCH v20250709 5/6] prefetch for spgist indexes

Implements the spg_stream_read_next() callback, returning blocks from
SpGistScanOpaque.

Similar to GiST, this handles both regular and ordered scans, but with
just a single read_next callback.

Note: Right now the batches are always 32 items, which may regress
queries with LIMIT clauses, etc. It should start at 1 and gradually
increase the batch size. Similarly to how prefetch distance grows.

XXX I wonder if GiST could be simplified to use a single callback too,
or if SP-GiST is buggy and needs to use two callbacks.
---
 src/backend/access/spgist/spgscan.c | 187 +++++++++++++++++++++++++++-
 src/include/access/spgist_private.h |  11 ++
 2 files changed, 195 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c
index 655f5cdc1eb..c90703a522e 100644
--- a/src/backend/access/spgist/spgscan.c
+++ b/src/backend/access/spgist/spgscan.c
@@ -18,6 +18,7 @@
 #include "access/genam.h"
 #include "access/relscan.h"
 #include "access/spgist_private.h"
+#include "access/table.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "storage/bufmgr.h"
@@ -300,6 +301,95 @@ spgPrepareScanKeys(IndexScanDesc scan)
 	}
 }
 
+/*
+ * spg_stream_read_next
+ *		Return the next block to read from the read stream.
+ *
+ * Returns the next block from the current leaf page. The first block is
+ * requested when accessing the first tuple, after already receiving the
+ * TID from the index (for the item itemIndex points at).
+ *
+ * With index-only scans this skips all-visible pages. The visibility info
+ * is stored, so that we can later pass it to the scan (we must not access
+ * the VM again, the bit might have changed, and the read stream would get
+ * out of sync, i.e. we'd get different blocks than we expect).
+ *
+ * Returns the block number to get from the read stream. InvalidBlockNumber
+ * means we've run out of items on the current leaf page - the stream will
+ * end, and we'll need to reset it after reading the next page (or after
+ * changing the scan direction).
+ *
+ * XXX Only consecutive duplicate blocks are skipped (via lastBlock); general
+ * deduplication for correlated indexes is not implemented yet.
+ */
+static BlockNumber
+spg_stream_read_next(ReadStream *stream,
+					 void *callback_private_data,
+					 void *per_buffer_data)
+{
+	IndexScanDesc	scan = (IndexScanDesc) callback_private_data;
+	SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque;
+	BlockNumber		block = InvalidBlockNumber;
+
+	/*
+	 * Is this the first request for the read stream (possibly after a reset)?
+	 * If yes, initialize the stream to the current item (itemIndex).
+	 */
+	if (so->sPtr == -1)
+		so->sPtr = (so->iPtr - 1);
+
+	/*
+	 * Find the next block to read. For plain index scans we will return the
+	 * very next item, but with index-only scans we skip TIDs from all-visible
+	 * pages (because we won't read those).
+	 */
+	while (so->sPtr < so->nPtrs)
+	{
+		ItemPointer		tid;
+
+		tid = &so->heapPtrs[so->sPtr];
+		block = ItemPointerGetBlockNumber(tid);
+
+		/*
+		 * For index-only scans, check the VM and remember the result. If the page
+		 * is all-visible, don't return the block number, try reading the next one.
+		 *
+		 * XXX Maybe this could use the same logic to check for duplicate blocks,
+		 * and reuse the VM result if possible.
+		 */
+		if (scan->xs_want_itup)
+		{
+			if (!so->allVisibleSet[so->sPtr])
+			{
+				so->allVisibleSet[so->sPtr] = true;
+				so->allVisible[so->sPtr] = VM_ALL_VISIBLE(scan->heapRelation,
+														  ItemPointerGetBlockNumber(tid),
+														  &so->vmBuffer);
+			}
+
+			/* don't prefetch this all-visible block, try the next one */
+			if (so->allVisible[so->sPtr])
+				block = InvalidBlockNumber;
+		}
+
+		/* advance to the next item (forward scans only) */
+		so->sPtr++;
+
+		/* don't return the same block twice (and remember this one) */
+		if (so->lastBlock == block)
+			block = InvalidBlockNumber;
+
+		/* Did we find a valid block? If yes, we're done. */
+		if (block != InvalidBlockNumber)
+			break;
+	}
+
+	/* remember the block we're returning */
+	so->lastBlock = block;
+
+	return block;
+}
+
 IndexScanDesc
 spgbeginscan(Relation heap, Relation index, int keysz, int orderbysz)
 {
@@ -371,8 +461,30 @@ spgbeginscan(Relation heap, Relation index, int keysz, int orderbysz)
 
 	so->indexCollation = index->rd_indcollation[0];
 
+	/* access to VM for IOS scans (in read_next callback) */
+	so->vmBuffer = InvalidBuffer;
+
+	/* nothing returned */
+	so->lastBlock = InvalidBlockNumber;
+
 	scan->opaque = so;
 
+	/*
+	 * Initialize the read stream to opt-in into prefetching.
+	 *
+	 * XXX See comments in btbeginscan().
+	 */
+	if (enable_indexscan_prefetch && heap)
+	{
+		scan->xs_rs = read_stream_begin_relation(READ_STREAM_DEFAULT,
+												 NULL,
+												 heap,
+												 MAIN_FORKNUM,
+												 spg_stream_read_next,
+												 scan,
+												 0);
+	}
+
 	return scan;
 }
 
@@ -423,6 +535,14 @@ spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	pgstat_count_index_scan(scan->indexRelation);
 	if (scan->instrument)
 		scan->instrument->nsearches++;
+
+	/* reset the stream, so that rescan starts from scratch */
+	if (scan->xs_rs)
+	{
+		so->sPtr = -1;
+		so->lastBlock = InvalidBlockNumber;
+		read_stream_reset(scan->xs_rs);
+	}
 }
 
 void
@@ -453,6 +573,15 @@ spgendscan(IndexScanDesc scan)
 		pfree(scan->xs_orderbynulls);
 	}
 
+	if (so->vmBuffer != InvalidBuffer)
+	{
+		ReleaseBuffer(so->vmBuffer);
+		so->vmBuffer = InvalidBuffer;
+	}
+
+	if (scan->xs_rs)
+		read_stream_end(scan->xs_rs);
+
 	pfree(so);
 }
 
@@ -818,9 +947,25 @@ spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex,
 		storeRes_func storeRes)
 {
 	Buffer		buffer = InvalidBuffer;
-	bool		reportedSome = false;
+	int			reportedCount = 0;
 
-	while (scanWholeIndex || !reportedSome)
+	/*
+	 * XXX Read at least 32 items into the queue, to make prefetching work.
+	 *
+	 * XXX We should gradually increase the number of tuples to load, not read
+	 * 32 tuples from the very beginning, similar to how we increase the
+	 * prefetch distance. That might be harmful for queries with LIMIT clause.
+	 *
+	 * XXX Not sure this is quite safe. The arrays are sized to fit at
+	 * least MaxIndexTuplesPerPage items, but what if there's a page with 31
+	 * items, and then another page with MaxIndexTuplesPerPage? Then we might
+	 * overflow the arrays (in the while loop below), I think.
+	 *
+	 * XXX I wonder if this is actually needed. Maybe it's needed only for
+	 * ordered scans, when we get the items from the pairing heap one by one.
+	 * So maybe we should do this buffering only in that case?
+	 */
+	while (scanWholeIndex || (reportedCount < 32))
 	{
 		SpGistSearchItem *item = spgGetNextQueueItem(so);
 
@@ -838,7 +983,7 @@ redirect:
 			storeRes(so, &item->heapPtr, item->value, item->isNull,
 					 item->leafTuple, item->recheck,
 					 item->recheckDistances, item->distances);
-			reportedSome = true;
+			reportedCount++;
 		}
 		else
 		{
@@ -872,23 +1017,33 @@ redirect:
 
 				if (SpGistBlockIsRoot(blkno))
 				{
+					bool	reportedSome = false;
+
 					/* When root is a leaf, examine all its tuples */
 					for (offset = FirstOffsetNumber; offset <= max; offset++)
 						(void) spgTestLeafTuple(so, item, page, offset,
 												isnull, true,
 												&reportedSome, storeRes);
+
+					if (reportedSome)
+						reportedCount++;
 				}
 				else
 				{
 					/* Normal case: just examine the chain we arrived at */
 					while (offset != InvalidOffsetNumber)
 					{
+						bool	reportedSome = false;
+
 						Assert(offset >= FirstOffsetNumber && offset <= max);
 						offset = spgTestLeafTuple(so, item, page, offset,
 												  isnull, false,
 												  &reportedSome, storeRes);
 						if (offset == SpGistRedirectOffsetNumber)
 							goto redirect;
+
+						if (reportedSome)
+							reportedCount++;
 					}
 				}
 			}
@@ -1042,6 +1197,18 @@ spggettuple(IndexScanDesc scan, ScanDirection dir)
 			scan->xs_recheck = so->recheck[so->iPtr];
 			scan->xs_hitup = so->reconTups[so->iPtr];
 
+			/* determine and store the VM status, if not done already */
+			if (scan->xs_want_itup && !so->allVisibleSet[so->iPtr])
+			{
+				so->allVisibleSet[so->iPtr] = true;
+				so->allVisible[so->iPtr]
+					= VM_ALL_VISIBLE(scan->heapRelation,
+									 ItemPointerGetBlockNumber(&so->heapPtrs[so->iPtr]),
+									 &so->vmBuffer);
+			}
+
+			scan->xs_visible = so->allVisible[so->iPtr];
+
 			if (so->numberOfOrderBys > 0)
 				index_store_float8_orderby_distances(scan, so->orderByTypes,
 													 so->distances[so->iPtr],
@@ -1074,6 +1241,20 @@ spggettuple(IndexScanDesc scan, ScanDirection dir)
 
 		if (so->nPtrs == 0)
 			break;				/* must have completed scan */
+
+		/*
+		 * loaded a leaf page worth of tuples, restart stream
+		 *
+		 * XXX with ordered scans we typically get nPtrs=1, which means the
+		 * prefetch can't really benefit anything. Maybe we should queue a
+		 * couple items and then prefetch those?
+		 */
+		if (scan->xs_rs)
+		{
+			so->sPtr = -1;
+			so->lastBlock = InvalidBlockNumber;
+			read_stream_reset(scan->xs_rs);
+		}
 	}
 
 	return false;
diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h
index cb43a278f46..46c50041ee1 100644
--- a/src/include/access/spgist_private.h
+++ b/src/include/access/spgist_private.h
@@ -16,8 +16,10 @@
 
 #include "access/itup.h"
 #include "access/spgist.h"
+#include "access/visibilitymap.h"
 #include "catalog/pg_am_d.h"
 #include "nodes/tidbitmap.h"
+#include "optimizer/cost.h"
 #include "storage/buf.h"
 #include "utils/geo_decls.h"
 #include "utils/relcache.h"
@@ -226,15 +228,24 @@ typedef struct SpGistScanOpaqueData
 	TupleDesc	reconTupDesc;	/* if so, descriptor for reconstructed tuples */
 	int			nPtrs;			/* number of TIDs found on current page */
 	int			iPtr;			/* index for scanning through same */
+	int			sPtr;			/* index for scanning through same (for stream) */
 	ItemPointerData heapPtrs[MaxIndexTuplesPerPage];	/* TIDs from cur page */
 	bool		recheck[MaxIndexTuplesPerPage]; /* their recheck flags */
 	bool		recheckDistances[MaxIndexTuplesPerPage];	/* distance recheck
 															 * flags */
 	HeapTuple	reconTups[MaxIndexTuplesPerPage];	/* reconstructed tuples */
 
+	/* for IOS */
+	bool		allVisible[MaxIndexTuplesPerPage];
+	bool		allVisibleSet[MaxIndexTuplesPerPage];
+	Buffer		vmBuffer;
+
 	/* distances (for recheck) */
 	IndexOrderByDistance *distances[MaxIndexTuplesPerPage];
 
+	/* last block returned by the read_next stream callback */
+	BlockNumber	lastBlock;
+
 	/*
 	 * Note: using MaxIndexTuplesPerPage above is a bit hokey since
 	 * SpGistLeafTuples aren't exactly IndexTuples; however, they are larger,
-- 
2.50.0

