From 7cdd4cc9621a5818c38b27f26c15c570ab4e83f6 Mon Sep 17 00:00:00 2001
From: Dmitrii Dolgov <9erthalion6@gmail.com>
Date: Fri, 14 May 2021 19:22:06 +0200
Subject: [PATCH v41 6/6] Index skip scan for IndexScan

Introduce Skip Scan support for IndexScan, not only for IndexOnlyScan.
It works in the same way as IndexOnlyScan, but planned has to check that
the chosen index is fully covering specified distinct expressions.

Author: Jesper Pedersen, Dmitry Dolgov
Reviewed-by: Thomas Munro, David Rowley, Floris Van Nee, Kyotaro Horiguchi, Tomas Vondra, Peter Geoghegan
---
 src/backend/commands/explain.c                |  6 ++
 src/backend/executor/nodeIndexscan.c          | 56 +++++++++++++++-
 src/backend/nodes/copyfuncs.c                 |  1 +
 src/backend/nodes/outfuncs.c                  |  1 +
 src/backend/nodes/readfuncs.c                 |  1 +
 src/backend/optimizer/path/indxpath.c         | 59 ++++++++++++++++-
 src/backend/optimizer/plan/createplan.c       | 10 ++-
 src/include/nodes/execnodes.h                 |  4 ++
 src/include/nodes/plannodes.h                 |  2 +
 src/test/regress/expected/select_distinct.out | 64 ++++++++++++++++---
 src/test/regress/sql/select_distinct.sql      | 16 +++++
 11 files changed, 206 insertions(+), 14 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 4f5bd1d678..67d072b010 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1754,6 +1754,12 @@ ExplainNode(PlanState *planstate, List *ancestors,
 	switch (nodeTag(plan))
 	{
 		case T_IndexScan:
+			if (((IndexScan *) plan)->indexskipprefixsize > 0)
+			{
+				IndexScan  *indexscan = (IndexScan *) plan;
+				ExplainPropertyBool("Skip scan", true, es);
+				ExplainIndexSkipScanKeys(indexscan->indexskipprefixsize, es);
+			}
 			show_scan_qual(((IndexScan *) plan)->indexqualorig,
 						   "Index Cond", planstate, ancestors, es);
 			if (((IndexScan *) plan)->indexqualorig)
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 90b2699a96..90aad76b95 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -85,6 +85,13 @@ IndexNext(IndexScanState *node)
 	ScanDirection direction;
 	IndexScanDesc scandesc;
 	TupleTableSlot *slot;
+	IndexScan *indexscan = (IndexScan *) node->ss.ps.plan;
+
+	/*
+	 * tells if the current position was reached via skipping. In this case
+	 * there is no nead for the index_getnext_tid
+	 */
+	bool skipped = false;
 
 	/*
 	 * extract necessary information from index scan node
@@ -92,7 +99,7 @@ IndexNext(IndexScanState *node)
 	estate = node->ss.ps.state;
 	direction = estate->es_direction;
 	/* flip direction if this is an overall backward scan */
-	if (ScanDirectionIsBackward(((IndexScan *) node->ss.ps.plan)->indexorderdir))
+	if (ScanDirectionIsBackward(indexscan->indexorderdir))
 	{
 		if (ScanDirectionIsForward(direction))
 			direction = BackwardScanDirection;
@@ -117,6 +124,12 @@ IndexNext(IndexScanState *node)
 
 		node->iss_ScanDesc = scandesc;
 
+		/* Index skip scan assumes xs_want_itup, so set it to true */
+		if (indexscan->indexskipprefixsize > 0)
+			node->iss_ScanDesc->xs_want_itup = true;
+		else
+			node->iss_ScanDesc->xs_want_itup = false;
+
 		/*
 		 * If no run-time keys to calculate or they are ready, go ahead and
 		 * pass the scankeys to the index AM.
@@ -127,12 +140,48 @@ IndexNext(IndexScanState *node)
 						 node->iss_OrderByKeys, node->iss_NumOrderByKeys);
 	}
 
+	/*
+	 * Check if we need to skip to the next key prefix, because we've been
+	 * asked to implement DISTINCT.
+	 *
+	 * When fetching a cursor in the direction opposite to a general scan
+	 * direction, the result must be what normal fetching should have
+	 * returned, but in reversed order. In other words, return the last or
+	 * first scanned tuple in a DISTINCT set, depending on a cursor direction.
+	 * Due to that we skip also when the first tuple wasn't emitted yet, but
+	 * the directions are opposite.
+	 */
+	if (node->iss_SkipPrefixSize > 0 &&
+		(node->iss_FirstTupleEmitted ||
+		 ScanDirectionsAreOpposite(direction, indexscan->indexorderdir)))
+	{
+		if (!index_skip(scandesc, direction, indexscan->indexorderdir,
+					   !node->iss_FirstTupleEmitted, node->iss_SkipPrefixSize))
+		{
+			/*
+			 * Reached end of index. At this point currPos is invalidated, and
+			 * we need to reset iss_FirstTupleEmitted, since otherwise after
+			 * going backwards, reaching the end of index, and going forward
+			 * again we apply skip again. It would be incorrect and lead to an
+			 * extra skipped item.
+			 */
+			node->iss_FirstTupleEmitted = false;
+			return ExecClearTuple(slot);
+		}
+		else
+		{
+			skipped = true;
+			index_fetch_heap(scandesc, slot);
+		}
+	}
+
 	/*
 	 * ok, now that we have what we need, fetch the next tuple.
 	 */
-	while (index_getnext_slot(scandesc, direction, slot))
+	while (skipped || index_getnext_slot(scandesc, direction, slot))
 	{
 		CHECK_FOR_INTERRUPTS();
+		skipped = false;
 
 		/*
 		 * If the index was lossy, we have to recheck the index quals using
@@ -149,6 +198,7 @@ IndexNext(IndexScanState *node)
 			}
 		}
 
+		node->iss_FirstTupleEmitted = true;
 		return slot;
 	}
 
@@ -914,6 +964,8 @@ ExecInitIndexScan(IndexScan *node, EState *estate, int eflags)
 	indexstate->ss.ps.plan = (Plan *) node;
 	indexstate->ss.ps.state = estate;
 	indexstate->ss.ps.ExecProcNode = ExecIndexScan;
+	indexstate->iss_SkipPrefixSize = node->indexskipprefixsize;
+	indexstate->iss_FirstTupleEmitted = false;
 
 	/*
 	 * Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index fe0d92ad46..8d0f92493f 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -497,6 +497,7 @@ _copyIndexScan(const IndexScan *from)
 	COPY_NODE_FIELD(indexorderbyorig);
 	COPY_NODE_FIELD(indexorderbyops);
 	COPY_SCALAR_FIELD(indexorderdir);
+	COPY_SCALAR_FIELD(indexskipprefixsize);
 
 	return newnode;
 }
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 06eb1a89e9..ad5c223bf8 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -569,6 +569,7 @@ _outIndexScan(StringInfo str, const IndexScan *node)
 	WRITE_NODE_FIELD(indexorderbyorig);
 	WRITE_NODE_FIELD(indexorderbyops);
 	WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+	WRITE_INT_FIELD(indexskipprefixsize);
 }
 
 static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index 169fb408d3..0fefbd5cc8 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1868,6 +1868,7 @@ _readIndexScan(void)
 	READ_NODE_FIELD(indexorderbyorig);
 	READ_NODE_FIELD(indexorderbyops);
 	READ_ENUM_FIELD(indexorderdir, ScanDirection);
+	READ_INT_FIELD(indexskipprefixsize);
 
 	READ_DONE();
 }
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 00526f3476..41444d2216 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -1036,7 +1036,7 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 					   check_index_only(rel, index));
 
 	/* Check if an index skip scan is possible. */
-	can_skip = enable_indexskipscan & index->amcanskip & index_only_scan;
+	can_skip = enable_indexskipscan & index->amcanskip;
 
 	if (can_skip)
 	{
@@ -1099,6 +1099,63 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel,
 				}
 			}
 		}
+
+		/*
+		 * For an index scan verify that index fully covers distinct
+		 * expressions, otherwise there is not enough information for skipping
+		 */
+		if (!index_only_scan && root->query_uniquekeys != NULL)
+		{
+			ListCell *lc;
+
+			foreach(lc, root->query_uniquekeys)
+			{
+				List *uniqExprs = (List *) lfirst(lc);
+				ListCell *lc1;
+
+				foreach(lc1, uniqExprs)
+				{
+					Expr *expr = (Expr *) lfirst(lc1);
+					bool found = false;
+
+					if (!IsA(expr, Var))
+					{
+						ListCell *lc2;
+
+						foreach(lc2, index->indexprs)
+						{
+							if(equal(lfirst(lc1), lfirst(lc2)))
+							{
+								found = true;
+								break;
+							}
+						}
+					}
+					else
+					{
+						Var *var = (Var *) expr;
+
+						for (int i = 0; i < index->ncolumns; i++)
+						{
+							if (index->indexkeys[i] == var->varattno)
+							{
+								found = true;
+								break;
+							}
+						}
+					}
+
+					if (!found)
+					{
+						can_skip = false;
+						break;
+					}
+				}
+
+				if (!can_skip)
+					break;
+			}
+		}
 	}
 
 	/*
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 511dda3a9f..618d1833ce 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -185,7 +185,8 @@ static IndexScan *make_indexscan(List *qptlist, List *qpqual, Index scanrelid,
 								 Oid indexid, List *indexqual, List *indexqualorig,
 								 List *indexorderby, List *indexorderbyorig,
 								 List *indexorderbyops,
-								 ScanDirection indexscandir);
+								 ScanDirection indexscandir,
+								 int skipprefix);
 static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
 										 Index scanrelid, Oid indexid,
 										 List *indexqual, List *recheckqual,
@@ -3121,7 +3122,8 @@ create_indexscan_plan(PlannerInfo *root,
 											fixed_indexorderbys,
 											indexorderbys,
 											indexorderbyops,
-											best_path->indexscandir);
+											best_path->indexscandir,
+											best_path->indexskipprefix);
 
 	copy_generic_path_info(&scan_plan->plan, &best_path->path);
 
@@ -5454,7 +5456,8 @@ make_indexscan(List *qptlist,
 			   List *indexorderby,
 			   List *indexorderbyorig,
 			   List *indexorderbyops,
-			   ScanDirection indexscandir)
+			   ScanDirection indexscandir,
+			   int skipPrefixSize)
 {
 	IndexScan  *node = makeNode(IndexScan);
 	Plan	   *plan = &node->scan.plan;
@@ -5471,6 +5474,7 @@ make_indexscan(List *qptlist,
 	node->indexorderbyorig = indexorderbyorig;
 	node->indexorderbyops = indexorderbyops;
 	node->indexorderdir = indexscandir;
+	node->indexskipprefixsize = skipPrefixSize;
 
 	return node;
 }
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index fa6ee25bc6..0cf02c1cd7 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1486,6 +1486,8 @@ typedef struct IndexScanState
 	ExprContext *iss_RuntimeContext;
 	Relation	iss_RelationDesc;
 	struct IndexScanDescData *iss_ScanDesc;
+	int         iss_SkipPrefixSize;
+	bool		iss_FirstTupleEmitted;
 
 	/* These are needed for re-checking ORDER BY expr ordering */
 	pairingheap *iss_ReorderQueue;
@@ -1517,6 +1519,8 @@ typedef struct IndexScanState
  *		TableSlot		   slot for holding tuples fetched from the table
  *		VMBuffer		   buffer in use for visibility map testing, if any
  *		PscanLen		   size of parallel index-only scan descriptor
+ *		SkipPrefixSize	   number of keys for skip-based DISTINCT
+ *		FirstTupleEmitted  has the first tuple been emitted
  * ----------------
  */
 typedef struct IndexOnlyScanState
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 6b3eefebc6..c0c91d4d09 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -411,6 +411,8 @@ typedef struct IndexScan
 	List	   *indexorderbyorig;	/* the same in original form */
 	List	   *indexorderbyops;	/* OIDs of sort ops for ORDER BY exprs */
 	ScanDirection indexorderdir;	/* forward or backward or don't care */
+	int			indexskipprefixsize;	/* the size of the prefix for distinct
+										 * scans */
 } IndexScan;
 
 /* ----------------
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index 36b3291a7f..b8f747d8d2 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -445,14 +445,12 @@ SELECT DISTINCT ON (a) a, b FROM distinct_a ORDER BY a DESC, b DESC;
 -- test index skip scan for expressions
 EXPLAIN (COSTS OFF)
 SELECT DISTINCT (a + 1) FROM distinct_a ORDER BY (a + 1);
-             QUERY PLAN             
-------------------------------------
- Sort
-   Sort Key: ((a + 1))
-   ->  HashAggregate
-         Group Key: (a + 1)
-         ->  Seq Scan on distinct_a
-(5 rows)
+                     QUERY PLAN                     
+----------------------------------------------------
+ Index Scan using distinct_a_expr_idx on distinct_a
+   Skip scan: true
+   Distinct Prefix: 1
+(3 rows)
 
 SELECT DISTINCT (a + 1) FROM distinct_a ORDER BY (a + 1);
  ?column? 
@@ -693,6 +691,56 @@ FETCH BACKWARD ALL FROM c;
 
 END;
 DROP TABLE distinct_abc;
+-- index skip scan
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a ORDER BY a;
+ a | b | c  
+---+---+----
+ 1 | 1 | 10
+ 2 | 1 | 10
+ 3 | 1 | 10
+ 4 | 1 | 10
+ 5 | 1 | 10
+(5 rows)
+
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a WHERE a = 1 ORDER BY a;
+ a | b | c  
+---+---+----
+ 1 | 1 | 10
+(1 row)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a ORDER BY a;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Index Scan using distinct_a_a_b_idx on distinct_a
+   Skip scan: true
+   Distinct Prefix: 1
+(3 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a WHERE a = 1 ORDER BY a;
+                    QUERY PLAN                     
+---------------------------------------------------
+ Index Scan using distinct_a_a_b_idx on distinct_a
+   Skip scan: true
+   Distinct Prefix: 1
+   Index Cond: (a = 1)
+(4 rows)
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT *
+FROM distinct_a;
+          QUERY PLAN          
+------------------------------
+ HashAggregate
+   Group Key: a, b, c
+   ->  Seq Scan on distinct_a
+(3 rows)
+
 -- check colums order
 SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10;
  a 
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index c9ccf4cc7d..b0d2ee7066 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -294,6 +294,22 @@ END;
 
 DROP TABLE distinct_abc;
 
+-- index skip scan
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a ORDER BY a;
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a WHERE a = 1 ORDER BY a;
+
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a ORDER BY a;
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT ON (a) a, b, c
+FROM distinct_a WHERE a = 1 ORDER BY a;
+EXPLAIN (COSTS OFF)
+SELECT DISTINCT *
+FROM distinct_a;
+
 -- check colums order
 SELECT DISTINCT a FROM distinct_a WHERE b = 2 AND c = 10;
 
-- 
2.32.0

