From 9fce35b14ba11eb3a6a6e7c114e26921cbfd8983 Mon Sep 17 00:00:00 2001
From: erthalion <9erthalion6@gmail.com>
Date: Sat, 15 Sep 2018 21:14:50 +0200
Subject: [PATCH] Index skip scan
Implementation of Index Skip Scan (see Loose Index Scan in the wiki [1])
on top of IndexOnlyScan. To make it suitable both when there is a small
number of distinct values and when there is a significant number of them,
the following approach is taken: instead of searching from the root for
every value, we first search for the next value on the current page, and
only if it is not found there continue searching from the root.
Original patch and design were proposed by Thomas Munro [2], revived and
improved by Jesper Pedersen, and slightly adjusted by Dmitry Dolgov.
[1] https://wiki.postgresql.org/wiki/Loose_indexscan
[2] https://www.postgresql.org/message-id/flat/CADLWmXXbTSBxP-MzJuPAYSsL_2f0iPm5VWPbCvDbVvfX93FKkw%40mail.gmail.com
---
contrib/bloom/blutils.c | 1 +
doc/src/sgml/config.sgml | 16 +++
doc/src/sgml/indexam.sgml | 9 ++
doc/src/sgml/indices.sgml | 193 ++++++++++++++++++++++++++
src/backend/access/brin/brin.c | 1 +
src/backend/access/gin/ginutil.c | 1 +
src/backend/access/gist/gist.c | 1 +
src/backend/access/hash/hash.c | 1 +
src/backend/access/index/indexam.c | 16 +++
src/backend/access/nbtree/nbtree.c | 12 ++
src/backend/access/nbtree/nbtsearch.c | 164 ++++++++++++++++++++++
src/backend/access/spgist/spgutils.c | 1 +
src/backend/commands/explain.c | 12 ++
src/backend/executor/nodeIndexonlyscan.c | 17 +++
src/backend/nodes/copyfuncs.c | 1 +
src/backend/nodes/outfuncs.c | 1 +
src/backend/nodes/readfuncs.c | 1 +
src/backend/optimizer/path/costsize.c | 1 +
src/backend/optimizer/plan/createplan.c | 10 +-
src/backend/optimizer/plan/planner.c | 16 +++
src/backend/optimizer/util/pathnode.c | 39 ++++++
src/backend/optimizer/util/plancat.c | 1 +
src/backend/utils/misc/guc.c | 9 ++
src/backend/utils/misc/postgresql.conf.sample | 1 +
src/include/access/amapi.h | 5 +
src/include/access/genam.h | 1 +
src/include/access/nbtree.h | 5 +
src/include/nodes/execnodes.h | 4 +
src/include/nodes/plannodes.h | 1 +
src/include/nodes/relation.h | 5 +
src/include/optimizer/cost.h | 1 +
src/include/optimizer/pathnode.h | 5 +
src/test/regress/expected/create_index.out | 1 +
src/test/regress/expected/select_distinct.out | 25 ++++
src/test/regress/expected/sysviews.out | 3 +-
src/test/regress/sql/create_index.sql | 2 +
src/test/regress/sql/select_distinct.sql | 7 +
37 files changed, 586 insertions(+), 4 deletions(-)
diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c
index 6458376578..f637635438 100644
--- a/contrib/bloom/blutils.c
+++ b/contrib/bloom/blutils.c
@@ -129,6 +129,7 @@ blhandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = blbulkdelete;
amroutine->amvacuumcleanup = blvacuumcleanup;
amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = blcostestimate;
amroutine->amoptions = bloptions;
amroutine->amproperty = NULL;
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index b6f5822b84..395b7de7e8 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -4284,6 +4284,22 @@ ANY num_sync (
+ enable_indexskipscan (boolean)
+
+ enable_indexskipscan configuration parameter
+
+
+
+
+ Enables or disables the query planner's use of index-skip-scan plan
+ types (see <xref linkend="indexes-index-skip-scans"/>). This parameter requires
+ that enable_indexonlyscan is on.
+ The default is on.
+
+
+
+
enable_material (boolean)
diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml
index 05102724ea..1550fcfb86 100644
--- a/doc/src/sgml/indexam.sgml
+++ b/doc/src/sgml/indexam.sgml
@@ -135,6 +135,7 @@ typedef struct IndexAmRoutine
amendscan_function amendscan;
ammarkpos_function ammarkpos; /* can be NULL */
amrestrpos_function amrestrpos; /* can be NULL */
+ amskip_function amskip; /* can be NULL */
/* interface functions to support parallel index scans */
amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
@@ -666,6 +667,14 @@ amrestrpos (IndexScanDesc scan);
+bool
+amskip (IndexScanDesc scan, ScanDirection direction, int prefix);
+
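+ Skip past all tuples whose first prefix columns have the same value as the last tuple returned in the current scan. Returns false when the end of the index has been reached. The amskip field can be set to NULL if the access method does not support skipping.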
+
+
+
+
Size
amestimateparallelscan (void);
diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml
index 46f427b312..60f306571b 100644
--- a/doc/src/sgml/indices.sgml
+++ b/doc/src/sgml/indices.sgml
@@ -1391,6 +1391,199 @@ CREATE INDEX test1c_content_y_index ON test1c (content COLLATE "y");
+
+ Index-Only Scans
+
+
+ index
+ index-only scans
+
+
+ index-only scan
+
+
+
+ All indexes in PostgreSQL are secondary
+ indexes, meaning that each index is stored separately from the table's
+ main data area (which is called the table's heap
+ in PostgreSQL terminology). This means that in an
+ ordinary index scan, each row retrieval requires fetching data from both
+ the index and the heap. Furthermore, while the index entries that match a
+ given indexable WHERE condition are usually close together in
+ the index, the table rows they reference might be anywhere in the heap.
+ The heap-access portion of an index scan thus involves a lot of random
+ access into the heap, which can be slow, particularly on traditional
+ rotating media. (As described in <xref linkend="indexes-bitmap-scans"/>,
+ bitmap scans try to alleviate this cost by doing the heap accesses in
+ sorted order, but that only goes so far.)
+
+
+
+ To solve this performance problem, PostgreSQL
+ supports index-only scans, which can answer queries from an
+ index alone without any heap access. The basic idea is to return values
+ directly out of each index entry instead of consulting the associated heap
+ entry. There are two fundamental restrictions on when this method can be
+ used:
+
+
+
+
+ The index type must support index-only scans. B-tree indexes always
+ do. GiST and SP-GiST indexes support index-only scans for some
+ operator classes but not others. Other index types have no support.
+ The underlying requirement is that the index must physically store, or
+ else be able to reconstruct, the original data value for each index
+ entry. As a counterexample, GIN indexes cannot support index-only
+ scans because each index entry typically holds only part of the
+ original data value.
+
+
+
+
+
+ The query must reference only columns stored in the index. For
+ example, given an index on columns x and y of a
+ table that also has a column z, these queries could use
+ index-only scans:
+
+SELECT x, y FROM tab WHERE x = 'key';
+SELECT x FROM tab WHERE x = 'key' AND y < 42;
+
+ but these queries could not:
+
+SELECT x, z FROM tab WHERE x = 'key';
+SELECT x FROM tab WHERE x = 'key' AND z < 42;
+
+ (Expression indexes and partial indexes complicate this rule,
+ as discussed below.)
+
+
+
+
+
+
+ If these two fundamental requirements are met, then all the data values
+ required by the query are available from the index, so an index-only scan
+ is physically possible. But there is an additional requirement for any
+ table scan in PostgreSQL: it must verify that each
+ retrieved row be <quote>visible</quote> to the query's MVCC snapshot, as
+ discussed in <xref linkend="mvcc"/>. Visibility information is not stored
+ in index entries, only in heap entries; so at first glance it would seem
+ that every row retrieval would require a heap access anyway. And this is
+ indeed the case, if the table row has been modified recently. However,
+ for seldom-changing data there is a way around this
+ problem. PostgreSQL tracks, for each page in a table's
+ heap, whether all rows stored in that page are old enough to be visible to
+ all current and future transactions. This information is stored in a bit
+ in the table's visibility map. An index-only scan, after
+ finding a candidate index entry, checks the visibility map bit for the
+ corresponding heap page. If it's set, the row is known visible and so the
+ data can be returned with no further work. If it's not set, the heap
+ entry must be visited to find out whether it's visible, so no performance
+ advantage is gained over a standard index scan. Even in the successful
+ case, this approach trades visibility map accesses for heap accesses; but
+ since the visibility map is four orders of magnitude smaller than the heap
+ it describes, far less physical I/O is needed to access it. In most
+ situations the visibility map remains cached in memory all the time.
+
+
+
+ In short, while an index-only scan is possible given the two fundamental
+ requirements, it will be a win only if a significant fraction of the
+ table's heap pages have their all-visible map bits set. But tables in
+ which a large fraction of the rows are unchanging are common enough to
+ make this type of scan very useful in practice.
+
+
+
+ To make effective use of the index-only scan feature, you might choose to
+ create indexes in which only the leading columns are meant to
+ match WHERE clauses, while the trailing columns
+ hold <quote>payload</quote> data to be returned by a query. For example, if
+ you commonly run queries like
+
+SELECT y FROM tab WHERE x = 'key';
+
+ the traditional approach to speeding up such queries would be to create an
+ index on x only. However, an index on (x, y)
+ would offer the possibility of implementing this query as an index-only
+ scan. As previously discussed, such an index would be larger and hence
+ more expensive than an index on x alone, so this is attractive
+ only if the table is known to be mostly static. Note it's important that
+ the index be declared on (x, y) not (y, x), as for
+ most index types (particularly B-trees) searches that do not constrain the
+ leading index columns are not very efficient.
+
+
+
+ In principle, index-only scans can be used with expression indexes.
+ For example, given an index on f(x) where x is a
+ table column, it should be possible to execute
+
+SELECT f(x) FROM tab WHERE f(x) < 1;
+
+ as an index-only scan; and this is very attractive if f() is
+ an expensive-to-compute function. However, PostgreSQL's
+ planner is currently not very smart about such cases. It considers a
+ query to be potentially executable by index-only scan only when
+ all columns needed by the query are available from the index.
+ In this example, x is not needed except in the
+ context f(x), but the planner does not notice that and
+ concludes that an index-only scan is not possible. If an index-only scan
+ seems sufficiently worthwhile, this can be worked around by declaring the
+ index to be on (f(x), x), where the second column is not
+ expected to be used in practice but is just there to convince the planner
+ that an index-only scan is possible. An additional caveat, if the goal is
+ to avoid recalculating f(x), is that the planner won't
+ necessarily match uses of f(x) that aren't in
+ indexable WHERE clauses to the index column. It will usually
+ get this right in simple queries such as shown above, but not in queries
+ that involve joins. These deficiencies may be remedied in future versions
+ of PostgreSQL.
+
+
+
+ Partial indexes also have interesting interactions with index-only scans.
+ Consider the partial index shown in <xref linkend="indexes-partial-ex3"/>:
+
+CREATE UNIQUE INDEX tests_success_constraint ON tests (subject, target)
+ WHERE success;
+
+ In principle, we could do an index-only scan on this index to satisfy a
+ query like
+
+SELECT target FROM tests WHERE subject = 'some-subject' AND success;
+
+ But there's a problem: the WHERE clause refers
+ to success which is not available as a result column of the
+ index. Nonetheless, an index-only scan is possible because the plan does
+ not need to recheck that part of the WHERE clause at run time:
+ all entries found in the index necessarily have success = true
+ so this need not be explicitly checked in the
+ plan. PostgreSQL versions 9.6 and later will recognize
+ such cases and allow index-only scans to be generated, but older versions
+ will not.
+
+
+
+ Index Skip Scans
+
+
+ index
+ index-skip scans
+
+
+ index-skip scan
+
+
+
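+ When a query needs only the distinct values of the leading columns of an index, an index-only scan can skip directly to the next distinct key prefix instead of reading every duplicate entry. Currently only B-tree indexes support this, and the planner considers it only when enable_indexskipscan is on.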
+
+
+
+
+
Examining Index Usage
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c
index 467d91e681..720696f84e 100644
--- a/src/backend/access/brin/brin.c
+++ b/src/backend/access/brin/brin.c
@@ -108,6 +108,7 @@ brinhandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = brinbulkdelete;
amroutine->amvacuumcleanup = brinvacuumcleanup;
amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = brincostestimate;
amroutine->amoptions = brinoptions;
amroutine->amproperty = NULL;
diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c
index afc20232ac..36f32f15a4 100644
--- a/src/backend/access/gin/ginutil.c
+++ b/src/backend/access/gin/ginutil.c
@@ -61,6 +61,7 @@ ginhandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = ginbulkdelete;
amroutine->amvacuumcleanup = ginvacuumcleanup;
amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = gincostestimate;
amroutine->amoptions = ginoptions;
amroutine->amproperty = NULL;
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index b75b3a8dac..11b0a899d3 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -84,6 +84,7 @@ gisthandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = gistbulkdelete;
amroutine->amvacuumcleanup = gistvacuumcleanup;
amroutine->amcanreturn = gistcanreturn;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = gistcostestimate;
amroutine->amoptions = gistoptions;
amroutine->amproperty = gistproperty;
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index f1f01a0956..07d7eeda56 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -79,6 +79,7 @@ hashhandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = hashbulkdelete;
amroutine->amvacuumcleanup = hashvacuumcleanup;
amroutine->amcanreturn = NULL;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = hashcostestimate;
amroutine->amoptions = hashoptions;
amroutine->amproperty = NULL;
diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c
index 4ad30186d9..4f3774128b 100644
--- a/src/backend/access/index/indexam.c
+++ b/src/backend/access/index/indexam.c
@@ -33,6 +33,7 @@
* index_can_return - does index support index-only scans?
* index_getprocid - get a support procedure OID
* index_getprocinfo - get a support procedure's lookup info
+ * index_skip - advance past duplicate key values in a scan
*
* NOTES
* This file contains the index_ routines which used
@@ -792,6 +793,21 @@ index_can_return(Relation indexRelation, int attno)
return indexRelation->rd_indam->amcanreturn(indexRelation, attno);
}
+/* ----------------
+ * index_skip
+ *
+ * Skip past all tuples where the first 'prefix' columns have the same value
+ * as the last tuple returned in the current scan; returns the AM's result.
+ * ----------------
+ */
+bool
+index_skip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+ SCAN_CHECKS;
+ CHECK_SCAN_PROCEDURE(amskip);	/* amskip may be NULL: error out, don't crash */
+ return scan->indexRelation->rd_indam->amskip(scan, direction, prefix);
+}
+
/* ----------------
* index_getprocid
*
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 98917de2ef..134eda34ed 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -130,6 +130,7 @@ bthandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = btbulkdelete;
amroutine->amvacuumcleanup = btvacuumcleanup;
amroutine->amcanreturn = btcanreturn;
+ amroutine->amskip = btskip;
amroutine->amcostestimate = btcostestimate;
amroutine->amoptions = btoptions;
amroutine->amproperty = btproperty;
@@ -378,6 +379,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
*/
so->currTuples = so->markTuples = NULL;
+ so->skipScanKey = NULL;
+
scan->xs_itupdesc = RelationGetDescr(rel);
scan->opaque = so;
@@ -445,6 +448,15 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
_bt_preprocess_array_keys(scan);
}
+/*
+ * btskip() -- btree's amskip routine: skip to the beginning of the next key prefix
+ */
+bool
+btskip(IndexScanDesc scan, ScanDirection direction, int prefix)
+{
+ return _bt_skip(scan, direction, prefix);	/* result is passed through as-is */
+}
+
/*
* btendscan() -- close down a scan
*/
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 92832237a8..a9012dc1d1 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1192,6 +1192,170 @@ _bt_next(IndexScanDesc scan, ScanDirection dir)
return true;
}
+/*
+ * _bt_skip() -- Skip items that share the leading 'prefix' keys of the most
+ * recently fetched index tuple (scan->xs_itup).  Looks on the current page
+ * first, then searches from the root.  Returns true with the scan positioned
+ * so that _bt_next fetches the first tuple with a different prefix, else false.
+ */
+bool
+_bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix)
+{
+ BTScanOpaque so = (BTScanOpaque) scan->opaque;
+ BTStack stack;
+ Buffer buf;
+ OffsetNumber offnum;
+ BTScanPosItem *currItem;
+ Page page;
+ BTPageOpaque opaque;
+ OffsetNumber low, high, compare_offset;
+ Relation indexRel = scan->indexRelation;
+ int compare_value = ScanDirectionIsForward(dir) ? 0 : 1;	/* _bt_compare threshold used below */
+
+ /* The caller must want index tuples and must already have fetched one */
+ Assert(scan->xs_want_itup);
+ Assert(scan->xs_itup);
+
+ /*
+ * Build the skip key from the current tuple.  The first call allocates it
+ * with _bt_mkscankey; later calls only refresh sk_flags / sk_argument of
+ * each key attribute, which avoids a free/realloc cycle on every skip.
+ */
+ if (so->skipScanKey == NULL)
+ {
+ so->skipScanKey = _bt_mkscankey(indexRel, scan->xs_itup);
+ }
+ else
+ {
+ TupleDesc itupdesc;
+ int indnkeyatts;
+ int i;
+
+ itupdesc = RelationGetDescr(indexRel);
+ indnkeyatts = IndexRelationGetNumberOfKeyAttributes(indexRel);
+ for (i = 0; i < indnkeyatts; i++)
+ {
+ Datum datum;
+ bool null;
+ int flags;
+
+ datum = index_getattr(scan->xs_itup, i + 1, itupdesc, &null);
+ flags = (null ? SK_ISNULL : 0) |
+ (indexRel->rd_indoption[i] << SK_BT_INDOPTION_SHIFT);
+ so->skipScanKey[i].sk_flags = flags;
+ so->skipScanKey[i].sk_argument = datum;
+ }
+ }
+
+ /* Check if the next unique key can be found within the current page */
+ buf = so->currPos.buf;
+ /* NOTE(review): the page is examined below before LockBuffer() -- confirm */
+ page = BufferGetPage(buf);
+ opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+ low = P_FIRSTDATAKEY(opaque);
+ high = PageGetMaxOffsetNumber(page);
+ compare_offset = ScanDirectionIsForward(dir) ? high : low;	/* last item if forward, first if backward */
+
+ if(_bt_compare(scan->indexRelation, prefix,
+ so->skipScanKey, page, compare_offset) > compare_value)
+ {
+ bool keyFound = false;
+
+ LockBuffer(buf, BT_READ);
+ offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir));
+
+ /* Lock the page for SERIALIZABLE transactions */
+ PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+
+ /* We know in which direction to look */
+ _bt_initialize_more_data(so, dir);
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* Move back for _bt_next */
+ offnum = OffsetNumberPrev(offnum);
+ }
+
+ /* Now read the data */
+ keyFound = _bt_readpage(scan, dir, offnum);
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+
+ if (keyFound)
+ {
+ /* Expose the current item's heap TID and index tuple to the scan */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+ return true;
+ }
+ }
+
+ if (BTScanPosIsValid(so->currPos))
+ {
+ ReleaseBuffer(so->currPos.buf);	/* done with the current page's buffer */
+ so->currPos.buf = InvalidBuffer;
+ }
+
+ /*
+ * We haven't found the scan key within the current page, so let's search
+ * from the root. Use _bt_search and _bt_binsrch to get the buffer and
+ * offset number.
+ */
+ stack =_bt_search(scan->indexRelation, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir), &buf, BT_READ,
+ scan->xs_snapshot);
+ _bt_freestack(stack);	/* the descent stack is not used afterwards */
+ so->currPos.buf = buf;
+ offnum = _bt_binsrch(scan->indexRelation, buf, prefix, so->skipScanKey,
+ ScanDirectionIsForward(dir));
+
+ /* Lock the page for SERIALIZABLE transactions */
+ PredicateLockPage(scan->indexRelation, BufferGetBlockNumber(buf),
+ scan->xs_snapshot);
+
+ /* We know in which direction to look */
+ _bt_initialize_more_data(so, dir);
+
+ if (ScanDirectionIsForward(dir))
+ {
+ /* Move back for _bt_next */
+ offnum = OffsetNumberPrev(offnum);
+ }
+
+ /* Now read the data */
+ if (!_bt_readpage(scan, dir, offnum))
+ {
+ /*
+ * There's no actually-matching data on this page. Try to advance to
+ * the next page. Return false if there's no matching data at all.
+ */
+ LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK);
+ if (!_bt_steppage(scan, dir))
+ {
+ _bt_freeskey(so->skipScanKey);	/* nothing left: discard the cached skip key */
+ so->skipScanKey = NULL;
+ return false;
+ }
+ }
+ else
+ {
+ /* Drop the lock, and maybe the pin, on the current page */
+ _bt_drop_lock_and_maybe_pin(scan, &so->currPos);
+ }
+
+ /* Expose the current item's heap TID and index tuple to the scan */
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+ scan->xs_ctup.t_self = currItem->heapTid;
+ if (scan->xs_want_itup)
+ scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
+
+ return true;
+}
+
/*
* _bt_readpage() -- Load data from current index page into so->currPos
*
diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c
index de147d7b68..a45edfa94b 100644
--- a/src/backend/access/spgist/spgutils.c
+++ b/src/backend/access/spgist/spgutils.c
@@ -68,6 +68,7 @@ spghandler(PG_FUNCTION_ARGS)
amroutine->ambulkdelete = spgbulkdelete;
amroutine->amvacuumcleanup = spgvacuumcleanup;
amroutine->amcanreturn = spgcanreturn;
+ amroutine->amskip = NULL;
amroutine->amcostestimate = spgcostestimate;
amroutine->amoptions = spgoptions;
amroutine->amproperty = spgproperty;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index ae7f038203..487ffcb407 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1298,6 +1298,14 @@ ExplainNode(PlanState *planstate, List *ancestors,
{
IndexOnlyScan *indexonlyscan = (IndexOnlyScan *) plan;
+ if (indexonlyscan->distinctPrefix > 0)
+ {
+ if (es->format != EXPLAIN_FORMAT_TEXT)
+ ExplainPropertyInteger("Distinct Prefix", NULL,
+ indexonlyscan->distinctPrefix,
+ es);
+ }
+
ExplainIndexScanDetails(indexonlyscan->indexid,
indexonlyscan->indexorderdir,
es);
@@ -1520,6 +1528,10 @@ ExplainNode(PlanState *planstate, List *ancestors,
planstate, es);
break;
case T_IndexOnlyScan:
+ if (((IndexOnlyScan *) plan)->distinctPrefix > 0)
+ {
+ ExplainPropertyText("Scan mode", "Skip scan", es);
+ }
show_scan_qual(((IndexOnlyScan *) plan)->indexqual,
"Index Cond", planstate, ancestors, es);
if (((IndexOnlyScan *) plan)->indexqual)
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index b3f61dd1fc..2d048a9725 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -114,6 +114,19 @@ IndexOnlyNext(IndexOnlyScanState *node)
node->ioss_NumOrderByKeys);
}
+ /*
+ * Check if we need to skip to the next key prefix, because we've been
+ * asked to implement DISTINCT.
+ */
+ if (node->ioss_NumDistinctKeys > 0 && node->ioss_FirstTupleEmitted)
+ {
+ if (!index_skip(scandesc, direction, node->ioss_NumDistinctKeys))
+ {
+ /* Reached end of index. */
+ return ExecClearTuple(slot);
+ }
+ }
+
/*
* OK, now that we have what we need, fetch the next tuple.
*/
@@ -249,6 +262,8 @@ IndexOnlyNext(IndexOnlyScanState *node)
ItemPointerGetBlockNumber(tid),
estate->es_snapshot);
+ node->ioss_FirstTupleEmitted = true;
+
return slot;
}
@@ -505,6 +520,8 @@ ExecInitIndexOnlyScan(IndexOnlyScan *node, EState *estate, int eflags)
indexstate->ss.ps.plan = (Plan *) node;
indexstate->ss.ps.state = estate;
indexstate->ss.ps.ExecProcNode = ExecIndexOnlyScan;
+ indexstate->ioss_NumDistinctKeys = node->distinctPrefix;
+ indexstate->ioss_FirstTupleEmitted = false;
/*
* Miscellaneous initialization
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index 3eb7e95d64..5fcac97f2b 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -514,6 +514,7 @@ _copyIndexOnlyScan(const IndexOnlyScan *from)
COPY_NODE_FIELD(indexorderby);
COPY_NODE_FIELD(indextlist);
COPY_SCALAR_FIELD(indexorderdir);
+ COPY_SCALAR_FIELD(distinctPrefix);
return newnode;
}
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index 0fde876c77..e24aa415f6 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -572,6 +572,7 @@ _outIndexOnlyScan(StringInfo str, const IndexOnlyScan *node)
WRITE_NODE_FIELD(indexorderby);
WRITE_NODE_FIELD(indextlist);
WRITE_ENUM_FIELD(indexorderdir, ScanDirection);
+ WRITE_INT_FIELD(distinctPrefix);
}
static void
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index ec6f2569ab..c2bf1bbd89 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1799,6 +1799,7 @@ _readIndexOnlyScan(void)
READ_NODE_FIELD(indexorderby);
READ_NODE_FIELD(indextlist);
READ_ENUM_FIELD(indexorderdir, ScanDirection);
+ READ_INT_FIELD(distinctPrefix);
READ_DONE();
}
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 99c5ad9b4a..8d00f00c17 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -122,6 +122,7 @@ int max_parallel_workers_per_gather = 2;
bool enable_seqscan = true;
bool enable_indexscan = true;
bool enable_indexonlyscan = true;
+bool enable_indexskipscan = true;
bool enable_bitmapscan = true;
bool enable_tidscan = true;
bool enable_sort = true;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index 97d0c28132..aac8d2e796 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -171,7 +171,8 @@ static IndexOnlyScan *make_indexonlyscan(List *qptlist, List *qpqual,
Index scanrelid, Oid indexid,
List *indexqual, List *indexorderby,
List *indextlist,
- ScanDirection indexscandir);
+ ScanDirection indexscandir,
+ int skipprefix);
static BitmapIndexScan *make_bitmap_indexscan(Index scanrelid, Oid indexid,
List *indexqual,
List *indexqualorig);
@@ -2722,7 +2723,8 @@ create_indexscan_plan(PlannerInfo *root,
fixed_indexquals,
fixed_indexorderbys,
best_path->indexinfo->indextlist,
- best_path->indexscandir);
+ best_path->indexscandir,
+ best_path->indexskipprefix);
else
scan_plan = (Scan *) make_indexscan(tlist,
qpqual,
@@ -4996,7 +4998,8 @@ make_indexonlyscan(List *qptlist,
List *indexqual,
List *indexorderby,
List *indextlist,
- ScanDirection indexscandir)
+ ScanDirection indexscandir,
+ int skipprefix)
{
IndexOnlyScan *node = makeNode(IndexOnlyScan);
Plan *plan = &node->scan.plan;
@@ -5011,6 +5014,7 @@ make_indexonlyscan(List *qptlist,
node->indexorderby = indexorderby;
node->indextlist = indextlist;
node->indexorderdir = indexscandir;
+ node->distinctPrefix = skipprefix;
return node;
}
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 4465f002c8..a679bbbbde 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -4710,6 +4710,22 @@ create_distinct_paths(PlannerInfo *root,
path,
list_length(root->distinct_pathkeys),
numDistinctRows));
+
+ /* Also consider a skip scan, if possible: index-only path, AM with amskip. */
+ if (IsA(path, IndexPath) &&
+ path->pathtype == T_IndexOnlyScan &&
+ enable_indexskipscan &&
+ ((IndexPath *) path)->indexinfo->amcanskip &&
+ root->distinct_pathkeys != NIL)	/* a List pointer: test against NIL */
+ {
+ Path *subpath = (Path *)
+ create_skipscan_unique_path(root,
+ distinct_rel,
+ path,
+ list_length(root->distinct_pathkeys),
+ numDistinctRows);
+ add_path(distinct_rel, subpath);
+ }
}
}
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index b2637d0e89..fcb6d140b7 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -2769,6 +2769,45 @@ create_upper_unique_path(PlannerInfo *root,
return pathnode;
}
+/*
+ * create_skipscan_unique_path
+ * Build a copy of an existing IndexPath that yields distinct values by
+ * skipping over duplicates of the leading 'numCols' index columns.  The
+ * result may or may not be cheaper than create_upper_unique_path.
+ *
+ * 'subpath' must be an IndexPath on an index whose AM supports amskip.
+ */
+IndexPath *
+create_skipscan_unique_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ int numCols,
+ double numGroups)
+{
+ IndexPath *skippath;
+
+ Assert(IsA(subpath, IndexPath));
+ Assert(numCols > 0);
+
+ /* Work on a private copy so that the caller's subpath stays untouched. */
+ skippath = makeNode(IndexPath);
+ memcpy(skippath, subpath, sizeof(IndexPath));
+ Assert(skippath->indexinfo->amcanskip);
+
+ /* Number of leading key columns whose repeats are skipped. */
+ skippath->indexskipprefix = numCols;
+
+ /*
+ * Model each skip as costing about as much as locating the first key, so
+ * the total is that startup cost multiplied by the expected number of
+ * distinct values; one row is produced per distinct value.
+ */
+ skippath->path.rows = numGroups;
+ skippath->path.startup_cost = subpath->startup_cost;
+ skippath->path.total_cost = skippath->path.startup_cost * numGroups;
+ return skippath;
+}
+
/*
* create_agg_path
* Creates a pathnode that represents performing aggregation/grouping
diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c
index 261492e6b7..b20faeaa50 100644
--- a/src/backend/optimizer/util/plancat.c
+++ b/src/backend/optimizer/util/plancat.c
@@ -269,6 +269,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent,
info->amoptionalkey = amroutine->amoptionalkey;
info->amsearcharray = amroutine->amsearcharray;
info->amsearchnulls = amroutine->amsearchnulls;
+ info->amcanskip = (amroutine->amskip != NULL);
info->amcanparallel = amroutine->amcanparallel;
info->amhasgettuple = (amroutine->amgettuple != NULL);
info->amhasgetbitmap = (amroutine->amgetbitmap != NULL);
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index c216ed0922..71f31bbfeb 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -893,6 +893,15 @@ static struct config_bool ConfigureNamesBool[] =
true,
NULL, NULL, NULL
},
+ {
+ {"enable_indexskipscan", PGC_USERSET, QUERY_TUNING_METHOD,
+ gettext_noop("Enables the planner's use of index-skip-scan plans."),
+ NULL
+ },
+ &enable_indexskipscan,
+ true,
+ NULL, NULL, NULL
+ },
{
{"enable_bitmapscan", PGC_USERSET, QUERY_TUNING_METHOD,
gettext_noop("Enables the planner's use of bitmap-scan plans."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index a21865a77f..834a775773 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -345,6 +345,7 @@
#enable_hashjoin = on
#enable_indexscan = on
#enable_indexonlyscan = on
+#enable_indexskipscan = on
#enable_material = on
#enable_mergejoin = on
#enable_nestloop = on
diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h
index 653ddc976b..082a9bb0d6 100644
--- a/src/include/access/amapi.h
+++ b/src/include/access/amapi.h
@@ -127,6 +127,10 @@ typedef void (*amrescan_function) (IndexScanDesc scan,
typedef bool (*amgettuple_function) (IndexScanDesc scan,
ScanDirection direction);
+/* skip past duplicates in a given prefix */
+typedef bool (*amskip_function) (IndexScanDesc scan,
+ ScanDirection dir, int prefix);
+
/* fetch all valid tuples */
typedef int64 (*amgetbitmap_function) (IndexScanDesc scan,
TIDBitmap *tbm);
@@ -221,6 +225,7 @@ typedef struct IndexAmRoutine
amendscan_function amendscan;
ammarkpos_function ammarkpos; /* can be NULL */
amrestrpos_function amrestrpos; /* can be NULL */
+ amskip_function amskip; /* can be NULL */
/* interface functions to support parallel index scans */
amestimateparallelscan_function amestimateparallelscan; /* can be NULL */
diff --git a/src/include/access/genam.h b/src/include/access/genam.h
index c4aba39496..a9bf4f58a9 100644
--- a/src/include/access/genam.h
+++ b/src/include/access/genam.h
@@ -170,6 +170,7 @@ extern IndexBulkDeleteResult *index_bulk_delete(IndexVacuumInfo *info,
extern IndexBulkDeleteResult *index_vacuum_cleanup(IndexVacuumInfo *info,
IndexBulkDeleteResult *stats);
extern bool index_can_return(Relation indexRelation, int attno);
+extern bool index_skip(IndexScanDesc scan, ScanDirection direction, int prefix);
extern RegProcedure index_getprocid(Relation irel, AttrNumber attnum,
uint16 procnum);
extern FmgrInfo *index_getprocinfo(Relation irel, AttrNumber attnum,
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 4fb92d60a1..e74149d1a4 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -470,6 +470,9 @@ typedef struct BTScanOpaqueData
*/
int markItemIndex; /* itemIndex, or -1 if not valid */
+ /* Work space for _bt_skip */
+ ScanKey skipScanKey; /* used to control skipping */
+
/* keep these last in struct for efficiency */
BTScanPosData currPos; /* current position data */
BTScanPosData markPos; /* marked position, if any */
@@ -570,6 +573,7 @@ extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
Page page, OffsetNumber offnum);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
+extern bool _bt_skip(IndexScanDesc scan, ScanDirection dir, int prefix);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
Snapshot snapshot);
@@ -597,6 +601,7 @@ extern void _bt_end_vacuum_callback(int code, Datum arg);
extern Size BTreeShmemSize(void);
extern void BTreeShmemInit(void);
extern bytea *btoptions(Datum reloptions, bool validate);
+extern bool btskip(IndexScanDesc scan, ScanDirection dir, int prefix);
extern bool btproperty(Oid index_oid, int attno,
IndexAMProperty prop, const char *propname,
bool *res, bool *isnull);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 7cae085177..6fbc023246 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1390,6 +1390,8 @@ typedef struct IndexScanState
* RelationDesc index relation descriptor
* ScanDesc index scan descriptor
* VMBuffer buffer in use for visibility map testing, if any
+ * NumDistinctKeys number of keys for skip-based DISTINCT
+ * FirstTupleEmitted true once the first tuple has been emitted
* ioss_PscanLen Size of parallel index-only scan descriptor
* ----------------
*/
@@ -1408,6 +1410,8 @@ typedef struct IndexOnlyScanState
Relation ioss_RelationDesc;
struct IndexScanDescData *ioss_ScanDesc;
Buffer ioss_VMBuffer;
+ int ioss_NumDistinctKeys;
+ bool ioss_FirstTupleEmitted;
Size ioss_PscanLen;
} IndexOnlyScanState;
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index 6d087c268f..632b05a84f 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -431,6 +431,7 @@ typedef struct IndexOnlyScan
List *indexorderby; /* list of index ORDER BY exprs */
List *indextlist; /* TargetEntry list describing index's cols */
ScanDirection indexorderdir; /* forward or backward or don't care */
+ int distinctPrefix; /* the size of the prefix for distinct scans */
} IndexOnlyScan;
/* ----------------
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 3430061361..fd7b5996d9 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -810,6 +810,7 @@ typedef struct IndexOptInfo
bool amsearchnulls; /* can AM search for NULL/NOT NULL entries? */
bool amhasgettuple; /* does AM have amgettuple interface? */
bool amhasgetbitmap; /* does AM have amgetbitmap interface? */
+ bool amcanskip; /* can AM skip duplicate values? */
bool amcanparallel; /* does AM support parallel scan? */
/* Rather than include amapi.h here, we declare amcostestimate like this */
void (*amcostestimate) (); /* AM's cost estimator */
@@ -1160,6 +1161,9 @@ typedef struct Path
* we need not recompute them when considering using the same index in a
* bitmap index/heap scan (see BitmapHeapPath). The costs of the IndexPath
* itself represent the costs of an IndexScan or IndexOnlyScan plan type.
+ *
+ * 'indexskipprefix' is the number of leading index columns to consider for
+ * skip scans.
*----------
*/
typedef struct IndexPath
@@ -1174,6 +1178,7 @@ typedef struct IndexPath
ScanDirection indexscandir;
Cost indextotalcost;
Selectivity indexselectivity;
+ int indexskipprefix;
} IndexPath;
/*
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index e7005b4a0c..acfa5416f2 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -58,6 +58,7 @@ extern PGDLLIMPORT int max_parallel_workers_per_gather;
extern PGDLLIMPORT bool enable_seqscan;
extern PGDLLIMPORT bool enable_indexscan;
extern PGDLLIMPORT bool enable_indexonlyscan;
+extern PGDLLIMPORT bool enable_indexskipscan;
extern PGDLLIMPORT bool enable_bitmapscan;
extern PGDLLIMPORT bool enable_tidscan;
extern PGDLLIMPORT bool enable_sort;
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index bd905d3328..38b99cd41b 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -186,6 +186,11 @@ extern UpperUniquePath *create_upper_unique_path(PlannerInfo *root,
Path *subpath,
int numCols,
double numGroups);
+extern IndexPath *create_skipscan_unique_path(PlannerInfo *root,
+ RelOptInfo *rel,
+ Path *subpath,
+ int numCols,
+ double numGroups);
extern AggPath *create_agg_path(PlannerInfo *root,
RelOptInfo *rel,
Path *subpath,
diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out
index 46deb55c67..c9acae96d7 100644
--- a/src/test/regress/expected/create_index.out
+++ b/src/test/regress/expected/create_index.out
@@ -19,6 +19,7 @@ CREATE INDEX tenk1_unique1 ON tenk1 USING btree(unique1 int4_ops);
CREATE INDEX tenk1_unique2 ON tenk1 USING btree(unique2 int4_ops);
CREATE INDEX tenk1_hundred ON tenk1 USING btree(hundred int4_ops);
CREATE INDEX tenk1_thous_tenthous ON tenk1 (thousand, tenthous);
+CREATE INDEX tenk1_four ON tenk1 (four);
CREATE INDEX tenk2_unique1 ON tenk2 USING btree(unique1 int4_ops);
CREATE INDEX tenk2_unique2 ON tenk2 USING btree(unique2 int4_ops);
CREATE INDEX tenk2_hundred ON tenk2 USING btree(hundred int4_ops);
diff --git a/src/test/regress/expected/select_distinct.out b/src/test/regress/expected/select_distinct.out
index f3696c6d1d..38c9bc4b9b 100644
--- a/src/test/regress/expected/select_distinct.out
+++ b/src/test/regress/expected/select_distinct.out
@@ -244,3 +244,28 @@ SELECT null IS NOT DISTINCT FROM null as "yes";
t
(1 row)
+-- index skip scan
+SELECT DISTINCT four FROM tenk1;
+ four
+------
+ 0
+ 1
+ 2
+ 3
+(4 rows)
+
+SELECT DISTINCT four FROM tenk1 WHERE four = 1;
+ four
+------
+ 1
+(1 row)
+
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT DISTINCT four FROM tenk1;
+ QUERY PLAN
+--------------------------------------------------
+ Index Only Scan using tenk1_four on public.tenk1
+ Output: four
+ Scan mode: Skip scan
+(3 rows)
+
diff --git a/src/test/regress/expected/sysviews.out b/src/test/regress/expected/sysviews.out
index a1c90eb905..bd3b373515 100644
--- a/src/test/regress/expected/sysviews.out
+++ b/src/test/regress/expected/sysviews.out
@@ -78,6 +78,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_hashjoin | on
enable_indexonlyscan | on
enable_indexscan | on
+ enable_indexskipscan | on
enable_material | on
enable_mergejoin | on
enable_nestloop | on
@@ -89,7 +90,7 @@ select name, setting from pg_settings where name like 'enable%';
enable_seqscan | on
enable_sort | on
enable_tidscan | on
-(17 rows)
+(18 rows)
-- Test that the pg_timezone_names and pg_timezone_abbrevs views are
-- more-or-less working. We can't test their contents in any great detail
diff --git a/src/test/regress/sql/create_index.sql b/src/test/regress/sql/create_index.sql
index 59da6b6592..588616446e 100644
--- a/src/test/regress/sql/create_index.sql
+++ b/src/test/regress/sql/create_index.sql
@@ -26,6 +26,8 @@ CREATE INDEX tenk1_hundred ON tenk1 USING btree(hundred int4_ops);
CREATE INDEX tenk1_thous_tenthous ON tenk1 (thousand, tenthous);
+CREATE INDEX tenk1_four ON tenk1 (four);
+
CREATE INDEX tenk2_unique1 ON tenk2 USING btree(unique1 int4_ops);
CREATE INDEX tenk2_unique2 ON tenk2 USING btree(unique2 int4_ops);
diff --git a/src/test/regress/sql/select_distinct.sql b/src/test/regress/sql/select_distinct.sql
index a605e86449..992e8d7c4d 100644
--- a/src/test/regress/sql/select_distinct.sql
+++ b/src/test/regress/sql/select_distinct.sql
@@ -73,3 +73,10 @@ SELECT 1 IS NOT DISTINCT FROM 2 as "no";
SELECT 2 IS NOT DISTINCT FROM 2 as "yes";
SELECT 2 IS NOT DISTINCT FROM null as "no";
SELECT null IS NOT DISTINCT FROM null as "yes";
+
+-- index skip scan
+SELECT DISTINCT four FROM tenk1;
+SELECT DISTINCT four FROM tenk1 WHERE four = 1;
+
+EXPLAIN (VERBOSE, COSTS OFF)
+SELECT DISTINCT four FROM tenk1;
--
2.16.4