From 140b1c672f33cf1391745d86e06d8418124d9124 Mon Sep 17 00:00:00 2001
From: Tomas Vondra <tomas@2ndquadrant.com>
Date: Fri, 6 Sep 2024 19:37:15 +0200
Subject: [PATCH v20240906 2/5] PoC: support for mark/restore

---
 src/backend/access/index/indexam.c       |  35 ++++---
 src/backend/access/nbtree/nbtree.c       | 111 +++++++++++++++++++++++
 src/backend/access/nbtree/nbtsearch.c    |   2 +-
 src/backend/executor/nodeIndexonlyscan.c |  11 +--
 src/backend/executor/nodeIndexscan.c     |  11 +--
 src/include/access/nbtree.h              |   2 +
 src/include/access/relscan.h             |   6 ++
 7 files changed, 153 insertions(+), 25 deletions(-)

diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index d170c1d439d..26ffdc9a383 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -497,16 +497,9 @@ index_restrpos(IndexScanDesc scan)
 	scan->indexRelation->rd_indam->amrestrpos(scan);
 
 	/*
-	 * Reset the batch, to make it look empty.
-	 *
-	 * Done after the amrescan() call, in case the AM needs some of the batch
-	 * info (e.g. to properly transfer the killed tuples).
-	 *
-	 * XXX This is a bit misleading, because index_batch_reset does not reset
-	 * the killed tuples. So if that's the only justification, we could have
-	 * done it before the call.
+	 * Don't reset the batch here - amrestrpos should have already loaded
+	 * the new batch, so don't throw that away.
 	 */
-	index_batch_reset(scan);
 }
 
 /*
@@ -1311,9 +1304,17 @@ AssertCheckBatchInfo(IndexScanDesc scan)
 		((scan)->xs_batch->nheaptids == ((scan)->xs_batch->currIndex + 1)) : \
 		((scan)->xs_batch->currIndex == 0))
 
-/* Does the batch items in the requested direction? */
+/*
+ * Does the batch have items in the requested direction? The batch must be non-empty
+ * and we should not have reached the end of the batch (in the direction).
+ * Also, if we just restored the position after mark/restore, there should be
+ * at least one item to process (we won't advance on the next call).
+ *
+ * XXX This is a bit confusing / ugly, probably should rethink how we track
+ * empty batches, and how we handle not advancing after a restore.
+ */
 #define INDEX_BATCH_HAS_ITEMS(scan, direction) \
-	(!INDEX_BATCH_IS_EMPTY(scan) && !INDEX_BATCH_IS_PROCESSED(scan, direction))
+	(!INDEX_BATCH_IS_EMPTY(scan) && (!INDEX_BATCH_IS_PROCESSED(scan, direction) || scan->xs_batch->restored))
 
 
 /* ----------------
@@ -1466,8 +1467,17 @@ index_batch_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 	/*
 	 * Advance to the next batch item - we know it's not empty and there are
 	 * items to process, so this is valid.
+	 *
+	 * However, don't advance if this is the first getnext_tid() call after
+	 * amrestrpos(). That sets the position on the correct item, and advancing
+	 * here would skip it.
+	 *
+	 * XXX The "restored" flag is a bit weird. Can we do this without it? May
+	 * need to rethink when/how we advance the batch index. Not sure.
 	 */
-	if (ScanDirectionIsForward(direction))
+	if (scan->xs_batch->restored)
+		scan->xs_batch->restored = false;
+	else if (ScanDirectionIsForward(direction))
 		scan->xs_batch->currIndex++;
 	else
 		scan->xs_batch->currIndex--;
@@ -1805,6 +1815,7 @@ index_batch_reset(IndexScanDesc scan)
 	scan->xs_batch->nheaptids = 0;
 	scan->xs_batch->prefetchIndex = 0;
 	scan->xs_batch->currIndex = -1;
+	scan->xs_batch->restored = false;
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index a32cc6e7a3a..c3a25cb5372 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -519,6 +519,25 @@ btmarkpos(IndexScanDesc scan)
 	/* There may be an old mark with a pin (but no lock). */
 	BTScanPosUnpinIfPinned(so->markPos);
 
+	/*
+	 * With batched scans, we don't maintain the itemIndex when processing the
+	 * batch, so we need to calculate the current value.
+	 *
+	 * FIXME I don't like that this requires knowledge of batching details,
+	 * I'd very much prefer those to remain isolated in indexam.c. The best
+	 * idea I have is a function that returns the batch index, something like
+	 * index_batch_get_index(). That's close to how other places in AMs talk
+	 * to indexam.c (e.g. when setting distances / orderby in spgist).
+	 */
+	if (scan->xs_batch)
+	{
+		/* the index should be valid in the batch */
+		Assert(scan->xs_batch->currIndex >= 0);
+		Assert(scan->xs_batch->currIndex < scan->xs_batch->nheaptids);
+
+		so->currPos.itemIndex = so->batch.firstIndex + scan->xs_batch->currIndex;
+	}
+
 	/*
 	 * Just record the current itemIndex.  If we later step to next page
 	 * before releasing the marked position, _bt_steppage makes a full copy of
@@ -552,6 +571,59 @@ btrestrpos(IndexScanDesc scan)
 		 * accurate.
 		 */
 		so->currPos.itemIndex = so->markItemIndex;
+
+		/*
+		 * We're restoring to a different position on the same page, but the
+		 * wrong batch may be loaded. Check if the current batch includes
+		 * itemIndex, and load the correct batch if not. We don't know in
+		 * which direction the scan will move, so we try to put the current
+		 * index in the middle of a batch. For indexes close to the end of the
+		 * page we may load fewer items, but that seems acceptable.
+		 *
+		 * Then update the index of the current item, even if we already have
+		 * the correct batch loaded.
+		 *
+		 * FIXME Similar to btmarkpos() - I don't like how this leaks details
+		 * that should be specific to indexam.c. The "restored" flag is weird
+		 * too - but even if we need it, we could set it in indexam.c, right?
+		 */
+		if (scan->xs_batch != NULL)
+		{
+			if ((so->currPos.itemIndex < so->batch.firstIndex) ||
+				(so->currPos.itemIndex > so->batch.lastIndex))
+			{
+				int			start = Max(so->currPos.firstItem,
+										so->currPos.itemIndex - (scan->xs_batch->currSize / 2));
+				int			end = Min(so->currPos.lastItem,
+									  start + (scan->xs_batch->currSize - 1));
+
+				Assert(start <= end);
+				Assert((end - start + 1) <= scan->xs_batch->currSize);
+
+				/* make it look empty */
+				scan->xs_batch->nheaptids = 0;
+				scan->xs_batch->prefetchIndex = -1;
+
+				/*
+				 * XXX the scan direction is bogus / not important. It affects
+				 * only how we advance the currIndex, but we'll override that
+				 * anyway to point at the "correct" entry.
+				 */
+				_bt_copy_batch(scan, ForwardScanDirection, so, start, end);
+			}
+
+			/*
+			 * Set the batch index to the "correct" position in the batch,
+			 * even if we haven't re-loaded it from the page. Also remember we
+			 * just did this, so that the next call to
+			 * index_batch_getnext_tid() does not advance it again.
+			 *
+			 * XXX This is a bit weird. There should be a way to not need the
+			 * "restored" flag I think.
+			 */
+			scan->xs_batch->currIndex = (so->currPos.itemIndex - so->batch.firstIndex);
+			scan->xs_batch->restored = true;
+		}
 	}
 	else
 	{
@@ -589,6 +661,45 @@ btrestrpos(IndexScanDesc scan)
 				_bt_start_array_keys(scan, so->currPos.dir);
 				so->needPrimScan = false;
 			}
+
+			/*
+			 * With batched scans, we know the current batch is definitely
+			 * wrong (we've moved to a different leaf page). So empty the
+			 * batch, and load the right part of the page.
+			 *
+			 * Similarly to the block above, we place the current index in the
+			 * middle of the batch (if possible). And then we update the index
+			 * to point at the correct batch item.
+			 */
+			if (scan->xs_batch != NULL)
+			{
+				int			start = Max(so->currPos.firstItem,
+										so->currPos.itemIndex - (scan->xs_batch->currSize / 2));
+				int			end = Min(so->currPos.lastItem,
+									  start + (scan->xs_batch->currSize - 1));
+
+				Assert(start <= end);
+				Assert((end - start + 1) <= scan->xs_batch->currSize);
+
+				/* make it look empty */
+				scan->xs_batch->nheaptids = 0;
+				scan->xs_batch->prefetchIndex = -1;
+
+				/* XXX the scan direction is bogus */
+				_bt_copy_batch(scan, ForwardScanDirection, so, start, end);
+
+				/*
+				 * Set the batch index to the "correct" position in the batch,
+				 * even if we haven't re-loaded it from the page. Also
+				 * remember we just did this, so that the next call to
+				 * index_batch_getnext_tid() does not advance it again.
+				 *
+				 * XXX This is a bit weird. There should be a way to not need
+				 * the "restored" flag I think.
+				 */
+				scan->xs_batch->currIndex = (so->currPos.itemIndex - so->batch.firstIndex);
+				scan->xs_batch->restored = true;
+			}
 		}
 		else
 			BTScanPosInvalidate(so->currPos);
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 3196ba640d5..146c527cab2 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1543,7 +1543,7 @@ AssertCheckBTBatchInfo(BTScanOpaque so)
  * _bt_copy_batch
  *		Copy a section of the leaf page into the batch.
  */
-static void
+void
 _bt_copy_batch(IndexScanDesc scan, ScanDirection dir, BTScanOpaque so,
 			   int start, int end)
 {
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index dcf5fc5fffc..a93da9d1d24 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -801,14 +801,13 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
 		ExecInitQual(node->recheckqual, (PlanState *) indexstate);
 
 	/*
-	 * Can't do batching (and thus prefetching) when the plan requires mark
-	 * and restore. There's an issue with translating the mark/restore
-	 * positions between the batch in scan descriptor and the original
-	 * position recognized in the index AM.
+	 * All index scans can do batching.
 	 *
-	 * XXX Hopefully just a temporary limitation?
+	 * XXX Maybe this should check if the index AM supports batching, or even
+	 * call something like "amcanbatch" (does not exist yet). Or check the
+	 * enable_indexscan_batching GUC?
 	 */
-	indexstate->ioss_CanBatch = !(eflags & EXEC_FLAG_MARK);
+	indexstate->ioss_CanBatch = true;
 
 	/*
 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 6d5de6e2ce8..9daec76ddc2 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -1000,14 +1000,13 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 		ExecInitExprList(node->indexorderbyorig, (PlanState *) indexstate);
 
 	/*
-	 * Can't do batching (and thus prefetching) when the plan requires mark
-	 * and restore. There's an issue with translating the mark/restore
-	 * positions between the batch in scan descriptor and the original
-	 * position recognized in the index AM.
+	 * All index scans can do batching.
 	 *
-	 * XXX Hopefully just a temporary limitation?
+	 * XXX Maybe this should check if the index AM supports batching, or even
+	 * call something like "amcanbatch" (does not exist yet). Or check the
+	 * enable_indexscan_batching GUC?
 	 */
-	indexstate->iss_CanBatch = !(eflags & EXEC_FLAG_MARK);
+	indexstate->iss_CanBatch = true;
 
 	/*
 	 * If we are just doing EXPLAIN (ie, aren't going to run the plan), stop
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 3326d2d8958..77cff1030b8 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -1292,6 +1292,8 @@ extern bool _bt_first_batch(IndexScanDesc scan, ScanDirection dir);
 extern bool _bt_next_batch(IndexScanDesc scan, ScanDirection dir);
 extern void _bt_kill_batch(IndexScanDesc scan);
 extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost);
+extern void _bt_copy_batch(IndexScanDesc scan, ScanDirection dir, BTScanOpaque so,
+						   int start, int end);
 
 /*
  * prototypes for functions in nbtutils.c
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 9dfd933e22e..417ca03b898 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -131,6 +131,12 @@ typedef struct IndexScanBatchData
 	/* memory context for per-batch data */
 	MemoryContext ctx;
 
+	/*
+	 * Was this batch just restored by restrpos? If yes, we don't advance on
+	 * the first iteration.
+	 */
+	bool		restored;
+
 	/* batch prefetching */
 	int			prefetchTarget; /* current prefetch distance */
 	int			prefetchMaximum;	/* maximum prefetch distance */
-- 
2.46.0

