From 4b5ea0f361e3feda93852bd084fb0d325e808e4c Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Mon, 12 Nov 2018 13:11:21 -0800
Subject: [PATCH 1/1] Add high key "continuescan" optimization.

Teach B-Tree forward index scans to check the high key before moving to
the next page in the hopes of finding that it isn't actually necessary
to move to the next page.  We already opportunistically force a key
check of the last item on leaf pages, even when it's clear that it
cannot be returned to the scan due to being dead-to-all, for the same
reason.  Since forcing the last item to be key checked no longer makes
any difference in the case of forward scans, the existing extra key
check is now only used for backwards scans.  Like the existing check,
the new check won't always work out, but that seems like an acceptable
price to pay.

The new approach is more effective than just checking non-pivot tuples,
especially with composite indexes and non-unique indexes.  The high key
represents an upper bound on all values that can appear on the page,
which is often greater than whatever tuple happens to appear last at the
time of the check.  Also, suffix truncation's new logic for picking a
split point will often result in high keys that are relatively
dissimilar to the other (non-pivot) tuples on the page, and therefore
more likely to indicate that the scan need not proceed to the next page.

Note that even pre-pg_upgrade'd v3 indexes make use of this
optimization.

(This is Heikki's refactored version)
---
 src/backend/access/nbtree/nbtsearch.c |  86 ++++++++++++++++++---
 src/backend/access/nbtree/nbtutils.c  | 103 +++++++++++---------------
 src/include/access/nbtree.h           |   3 +-
 3 files changed, 122 insertions(+), 70 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index af3da3aa5b6..243be6f410d 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1220,7 +1220,6 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	OffsetNumber minoff;
 	OffsetNumber maxoff;
 	int			itemIndex;
-	IndexTuple	itup;
 	bool		continuescan;
 
 	/*
@@ -1241,6 +1240,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			_bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf));
 	}
 
+	continuescan = true;		/* default assumption */
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
@@ -1282,23 +1282,58 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 		while (offnum <= maxoff)
 		{
-			itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
-			if (itup != NULL)
+			ItemId		iid = PageGetItemId(page, offnum);
+			IndexTuple	itup;
+
+			/*
+			 * If the scan specifies not to return killed tuples, then we
+			 * treat a killed tuple as not passing the qual.  Most of the
+			 * time, it's a win to not bother examining the tuple's index
+			 * keys, but just skip to the next tuple.
+			 */
+			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			{
+				offnum = OffsetNumberNext(offnum);
+				continue;
+			}
+
+			itup = (IndexTuple) PageGetItem(page, iid);
+
+			if (_bt_checkkeys(scan, itup, dir, &continuescan))
 			{
 				/* tuple passes all scan key conditions, so remember it */
 				_bt_saveitem(so, itemIndex, offnum, itup);
 				itemIndex++;
 			}
+			/* When !continuescan, there can't be any more matches, so stop */
 			if (!continuescan)
-			{
-				/* there can't be any more matches, so stop */
-				so->currPos.moreRight = false;
 				break;
-			}
 
 			offnum = OffsetNumberNext(offnum);
 		}
 
+		/*
+		 * We don't need to visit page to the right when the high key
+		 * indicates that no more matches will be found there.
+		 *
+		 * Checking the high key like this works out more often than you might
+		 * think.  Leaf page splits pick a split point between the two most
+		 * dissimilar tuples (this is weighed against the need to evenly share
+		 * free space).  Leaf pages with high key attribute values that can
+		 * only appear on non-pivot tuples on the right sibling page are
+		 * common.
+		 */
+		if (continuescan && !P_RIGHTMOST(opaque))
+		{
+			ItemId		iid = PageGetItemId(page, P_HIKEY);
+			IndexTuple	itup = (IndexTuple) PageGetItem(page, iid);
+
+			_bt_checkkeys(scan, itup, dir, &continuescan);
+		}
+
+		if (!continuescan)
+			so->currPos.moreRight = false;
+
 		Assert(itemIndex <= MaxIndexTuplesPerPage);
 		so->currPos.firstItem = 0;
 		so->currPos.lastItem = itemIndex - 1;
@@ -1313,8 +1348,41 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 
 		while (offnum >= minoff)
 		{
-			itup = _bt_checkkeys(scan, page, offnum, dir, &continuescan);
-			if (itup != NULL)
+			ItemId		iid = PageGetItemId(page, offnum);
+			IndexTuple	itup;
+			bool		tuple_alive;
+			bool		passes_quals;
+
+			/*
+			 * If the scan specifies not to return killed tuples, then we
+			 * treat a killed tuple as not passing the qual.  Most of the
+			 * time, it's a win to not bother examining the tuple's index
+			 * keys, but just skip to the next tuple (previous, actually,
+			 * since we're scaning backwards) .  However, if this is the
+			 * first tuple on the page, we do check the index keys, to prevent
+			 * uselessly advancing to the page to the left.  This is similar
+			 * to the high key optimization used by forward scans.
+			 */
+			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			{
+				BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+				Assert(offnum >= P_FIRSTDATAKEY(opaque));
+				if (offnum > P_FIRSTDATAKEY(opaque))
+				{
+					offnum = OffsetNumberPrev(offnum);
+					continue;
+				}
+
+				tuple_alive = false;
+			}
+			else
+				tuple_alive = true;
+
+			itup = (IndexTuple) PageGetItem(page, iid);
+
+			passes_quals = _bt_checkkeys(scan, itup, dir, &continuescan);
+			if (passes_quals && tuple_alive)
 			{
 				/* tuple passes all scan key conditions, so remember it */
 				itemIndex--;
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 2c05fb5e451..1b1d889bba8 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -47,7 +47,7 @@ static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op,
 static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption);
 static void _bt_mark_scankey_required(ScanKey skey);
 static bool _bt_check_rowcompare(ScanKey skey,
-					 IndexTuple tuple, TupleDesc tupdesc,
+					 IndexTuple tuple, int tupnatts, TupleDesc tupdesc,
 					 ScanDirection dir, bool *continuescan);
 
 
@@ -1350,8 +1350,7 @@ _bt_mark_scankey_required(ScanKey skey)
 /*
  * Test whether an indextuple satisfies all the scankey conditions.
  *
- * If so, return the address of the index tuple on the index page.
- * If not, return NULL.
+ * Returns true, if so.
  *
  * If the tuple fails to pass the qual, we also determine whether there's
  * any need to continue the scan beyond this tuple, and set *continuescan
@@ -1359,21 +1358,19 @@ _bt_mark_scankey_required(ScanKey skey)
  * this is done.
  *
  * scan: index scan descriptor (containing a search-type scankey)
- * page: buffer page containing index tuple
- * offnum: offset number of index tuple (must be a valid item!)
+ * tuple: index tuple to test
  * dir: direction we are scanning in
  * continuescan: output parameter (will be set correctly in all cases)
  *
- * Caller must hold pin and lock on the index page.
+ * Caller can pass a high key tuple in the hopes of discovering that the scan
+ * need not continue on to a page to the right.  We don't currently bother
+ * limiting high key comparisons to SK_BT_REQFWD scan keys.
  */
-IndexTuple
-_bt_checkkeys(IndexScanDesc scan,
-			  Page page, OffsetNumber offnum,
+bool
+_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
 			  ScanDirection dir, bool *continuescan)
 {
-	ItemId		iid = PageGetItemId(page, offnum);
-	bool		tuple_alive;
-	IndexTuple	tuple;
+	int			tupnatts;
 	TupleDesc	tupdesc;
 	BTScanOpaque so;
 	int			keysz;
@@ -1382,40 +1379,7 @@ _bt_checkkeys(IndexScanDesc scan,
 
 	*continuescan = true;		/* default assumption */
 
-	/*
-	 * If the scan specifies not to return killed tuples, then we treat a
-	 * killed tuple as not passing the qual.  Most of the time, it's a win to
-	 * not bother examining the tuple's index keys, but just return
-	 * immediately with continuescan = true to proceed to the next tuple.
-	 * However, if this is the last tuple on the page, we should check the
-	 * index keys to prevent uselessly advancing to the next page.
-	 */
-	if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
-	{
-		/* return immediately if there are more tuples on the page */
-		if (ScanDirectionIsForward(dir))
-		{
-			if (offnum < PageGetMaxOffsetNumber(page))
-				return NULL;
-		}
-		else
-		{
-			BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
-			if (offnum > P_FIRSTDATAKEY(opaque))
-				return NULL;
-		}
-
-		/*
-		 * OK, we want to check the keys so we can set continuescan correctly,
-		 * but we'll return NULL even if the tuple passes the key tests.
-		 */
-		tuple_alive = false;
-	}
-	else
-		tuple_alive = true;
-
-	tuple = (IndexTuple) PageGetItem(page, iid);
+	tupnatts = BTreeTupleGetNAtts(tuple, scan->indexRelation);
 
 	tupdesc = RelationGetDescr(scan->indexRelation);
 	so = (BTScanOpaque) scan->opaque;
@@ -1427,13 +1391,25 @@ _bt_checkkeys(IndexScanDesc scan,
 		bool		isNull;
 		Datum		test;
 
-		Assert(key->sk_attno <= BTreeTupleGetNAtts(tuple, scan->indexRelation));
+		/*
+		 * Assume that truncated attribute (from high key) passes the qual.
+		 * The value of a truncated attribute for the first tuple on the right
+		 * page could be any possible value, so we may have to visit the next
+		 * page.
+		 */
+		if (key->sk_attno > tupnatts)
+		{
+			Assert(ScanDirectionIsForward(dir));
+			continue;
+		}
+
 		/* row-comparison keys need special processing */
 		if (key->sk_flags & SK_ROW_HEADER)
 		{
-			if (_bt_check_rowcompare(key, tuple, tupdesc, dir, continuescan))
+			if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir,
+									 continuescan))
 				continue;
-			return NULL;
+			return false;
 		}
 
 		datum = index_getattr(tuple,
@@ -1471,7 +1447,7 @@ _bt_checkkeys(IndexScanDesc scan,
 			/*
 			 * In any case, this indextuple doesn't match the qual.
 			 */
-			return NULL;
+			return false;
 		}
 
 		if (isNull)
@@ -1512,7 +1488,7 @@ _bt_checkkeys(IndexScanDesc scan,
 			/*
 			 * In any case, this indextuple doesn't match the qual.
 			 */
-			return NULL;
+			return false;
 		}
 
 		test = FunctionCall2Coll(&key->sk_func, key->sk_collation,
@@ -1540,16 +1516,12 @@ _bt_checkkeys(IndexScanDesc scan,
 			/*
 			 * In any case, this indextuple doesn't match the qual.
 			 */
-			return NULL;
+			return false;
 		}
 	}
 
-	/* Check for failure due to it being a killed tuple. */
-	if (!tuple_alive)
-		return NULL;
-
 	/* If we get here, the tuple passes all index quals. */
-	return tuple;
+	return true;
 }
 
 /*
@@ -1562,8 +1534,8 @@ _bt_checkkeys(IndexScanDesc scan,
  * This is a subroutine for _bt_checkkeys, which see for more info.
  */
 static bool
-_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
-					 ScanDirection dir, bool *continuescan)
+_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts,
+					 TupleDesc tupdesc, ScanDirection dir, bool *continuescan)
 {
 	ScanKey		subkey = (ScanKey) DatumGetPointer(skey->sk_argument);
 	int32		cmpresult = 0;
@@ -1580,6 +1552,19 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc,
 
 		Assert(subkey->sk_flags & SK_ROW_MEMBER);
 
+		/*
+		 * Assume that truncated attribute (from high key) passes the qual.
+		 * The value of a truncated attribute for the first tuple on the right
+		 * page could be any possible value, so we may have to visit the next
+		 * page.
+		 */
+		if (subkey->sk_attno > tupnatts)
+		{
+			Assert(ScanDirectionIsForward(dir));
+			cmpresult = 0;
+			continue;
+		}
+
 		datum = index_getattr(tuple,
 							  subkey->sk_attno,
 							  tupdesc,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 60622ea7906..53e4e6d194d 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -586,8 +586,7 @@ extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir);
 extern void _bt_mark_array_keys(IndexScanDesc scan);
 extern void _bt_restore_array_keys(IndexScanDesc scan);
 extern void _bt_preprocess_keys(IndexScanDesc scan);
-extern IndexTuple _bt_checkkeys(IndexScanDesc scan,
-			  Page page, OffsetNumber offnum,
+extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple,
 			  ScanDirection dir, bool *continuescan);
 extern void _bt_killitems(IndexScanDesc scan);
 extern BTCycleId _bt_vacuum_cycleid(Relation rel);
-- 
2.20.1

