From 9dbca43fca102e3684166067e901d2ce10d99d18 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 1 Jun 2026 19:35:47 -0400 Subject: [PATCH v28 05/11] WIP: Adopt amgetbatch interface in GiST index AM. Replace gistgettuple with gistgetbatch, a function that implements the new amgetbatch interface added by commit FIXME. Plain index scans of GiST indexes now return matching items in batches consisting of all of the matches from a given leaf page. This gives the table AM the ability to perform optimizations like index prefetching during GiST index scans. The amgetbatch interface requires that index AMs take the same standardized approach to pin management for pins that are used to prevent unsafe concurrent TID recycling by VACUUM (that way prefetching can hold open multiple batches without it affecting the read stream). For an ordinary GiST batch this interlock pin is the pin on its single leaf page, held only for as long as the table AM still needs it as an interlock (just like during nbtree and hash scans). Nearest-neighbor (ordered) scans are handled quite differently, because their matches don't naturally arrive one leaf page at a time. Here gistgetbatch instead drains the scan's distance-ordered pairing heap, packing the matching leaf items into a single "virtual" batch in distance order, typically spanning many leaf pages. We're effectively pretending that the matches we found were in useful order, together on the same leaf page -- though that isn't really true. Virtual batches come with restrictions that make the pretense safe: an ordered scan is never planned as an index-only scan, and gistkillitemsbatch does nothing for a virtual batch. A virtual batch therefore never holds a TID recycling interlock pin at all; the pin on each underlying leaf page is instead dropped right away, as the page is scanned into the queue. The interlock pin also fixes a pre-existing bug in which GiST index-only scans could return wrong answers [1]. 
An index-only scan trusts the visibility map instead of fetching the heap tuple, so it must keep VACUUM from recycling a heap TID between the moment it reads an index entry and the moment it consults the visibility map; otherwise it can report indexed values that belong to an unrelated, since-recycled heap tuple. The retained leaf-page buffer pin is that interlock -- but only if VACUUM honors it. gistvacuumpage therefore now acquires a cleanup lock on each page (rather than a plain exclusive lock), so a concurrent scan's pin holds VACUUM off from recycling that page's TIDs until the scan has finished its visibility checks. This same interlock requirement is why ordered scans cannot be index-only: a virtual batch drops each leaf page's pin as soon as the page is scanned, so it has no bounded pin to offer as the recycling interlock that an index-only scan depends on. Rather than work around that (which seems prohibitively complicated), the planner never builds an index-only scan that uses ordering operators; ordered scans must be plain index scans, which fetch and recheck the heap tuple and so were never subject to the bug. This warrants an incompatibility item in the Postgres 20 release notes (note that both GiST and SP-GiST are affected). The gistgetbatch implementation makes use of new batch-related core infrastructure. GiST now registers an amgettransform callback, which sets the scan descriptor's per-tuple recheck flags. It also sets order-by distances, and reconstructs a heap tuple for index-only scans. It is called just before table_index_getnext_slot returns another tuple. Like nbtree, the scan uses a currTuples storage area to store IndexTuple structs in their original on-disk representation. Unlike nbtree, GiST uses amgettransform to convert the representation of the tuples into a heap tuple representation of the underlying indexed type. 
This scheme also relies on a new facility that allows index AMs to request their own separate dynamically sized area for supplemental metadata (GiST opclasses have the ability to represent that any tuple needs a recheck, so we have to shuttle that information around with the batch). [1] https://postgr.es/m/CAH2-Wz=jjiNL9FCh8C1L-GUH15f4WFTWub2x+_NucngcDDcHKw@mail.gmail.com Author: Peter Geoghegan --- src/include/access/amapi.h | 5 + src/include/access/gist_private.h | 77 +- src/include/access/gistxlog.h | 13 +- src/include/access/indexbatch.h | 25 + src/include/access/relscan.h | 5 + src/backend/access/brin/brin.c | 1 + src/backend/access/gin/ginutil.c | 1 + src/backend/access/gist/README | 94 ++- src/backend/access/gist/gist.c | 9 +- src/backend/access/gist/gistget.c | 661 ++++++++++-------- src/backend/access/gist/gistscan.c | 45 +- src/backend/access/gist/gistutil.c | 9 +- src/backend/access/gist/gistvacuum.c | 17 +- src/backend/access/gist/gistxlog.c | 37 +- src/backend/access/hash/hash.c | 1 + src/backend/access/heap/heapam_indexscan.c | 22 +- src/backend/access/index/amapi.c | 1 + src/backend/access/index/genam.c | 1 + src/backend/access/index/indexbatch.c | 5 +- src/backend/access/nbtree/nbtree.c | 1 + src/backend/access/rmgrdesc/gistdesc.c | 4 + src/backend/access/spgist/spgutils.c | 1 + src/backend/executor/nodeIndexonlyscan.c | 12 - src/backend/optimizer/path/indxpath.c | 5 +- contrib/bloom/blutils.c | 1 + contrib/btree_gist/expected/cash.out | 6 +- contrib/btree_gist/expected/date.out | 6 +- contrib/btree_gist/expected/float4.out | 6 +- contrib/btree_gist/expected/float8.out | 2 +- contrib/btree_gist/expected/int2.out | 6 +- contrib/btree_gist/expected/int4.out | 6 +- contrib/btree_gist/expected/int8.out | 2 +- contrib/btree_gist/expected/interval.out | 2 +- contrib/btree_gist/expected/time.out | 2 +- contrib/btree_gist/expected/timestamp.out | 2 +- contrib/btree_gist/expected/timestamptz.out | 2 +- doc/src/sgml/indexam.sgml | 200 +++++- 
.../modules/dummy_index_am/dummy_index_am.c | 1 + .../modules/index/expected/killtuples.out | 79 ++- src/test/modules/index/specs/killtuples.spec | 18 +- src/test/regress/expected/create_index.out | 14 +- .../regress/expected/create_index_spgist.out | 18 +- src/test/regress/expected/gist.out | 52 +- src/test/regress/sql/gist.sql | 8 +- src/tools/pgindent/typedefs.list | 2 + 45 files changed, 1029 insertions(+), 458 deletions(-) diff --git a/src/include/access/amapi.h b/src/include/access/amapi.h index 02793a115..157c1a8df 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -212,6 +212,10 @@ typedef void (*amunguardbatch_function) (IndexScanDesc scan, typedef void (*amkillitemsbatch_function) (IndexScanDesc scan, IndexScanBatch batch); +/* Set scan's per-tuple outputs (xs_recheck, xs_hitup, etc) for batch item */ +typedef void (*amgettransform_function) (IndexScanDesc scan, + IndexScanBatch batch, int item); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -326,6 +330,7 @@ typedef struct IndexAmRoutine amgetbatch_function amgetbatch; /* can be NULL */ amunguardbatch_function amunguardbatch; /* can be NULL */ amkillitemsbatch_function amkillitemsbatch; /* can be NULL */ + amgettransform_function amgettransform; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; amposreset_function amposreset; /* can be NULL */ diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 44514f1cb..534e3b4ca 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -16,6 +16,7 @@ #include "access/amapi.h" #include "access/gist.h" +#include "access/indexbatch.h" #include "access/itup.h" #include "lib/pairingheap.h" #include "storage/bufmgr.h" @@ -120,10 +121,6 @@ typedef struct GISTSearchHeapItem ItemPointerData heapPtr; bool recheck; /* T if quals must be rechecked */ bool recheckDistances; /* T if 
distances must be rechecked */ - HeapTuple recontup; /* data reconstructed from the index, used in - * index-only scans */ - OffsetNumber offnum; /* track offset in page to mark tuple as - * LP_DEAD */ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ @@ -148,6 +145,55 @@ typedef struct GISTSearchItem (offsetof(GISTSearchItem, distances) + \ sizeof(IndexOrderByDistance) * (n_distances)) +/* Per-batch data private to the GiST index AM */ +typedef struct GISTBatchData +{ + /* leaf page's buffer pin */ + Buffer buf; + /* leaf page's block number (InvalidBlockNumber means "virtual" batch) */ + BlockNumber blkno; +} GISTBatchData; + +/* Access the GiST-private per-batch data from an IndexScanBatch pointer */ +#define GISTBatchGetData(scan, batch) \ + index_scan_batch_index_opaque_static(scan, batch, GISTBatchData) + +/* + * Per-item private GiST data. We lay out the index AM's dynamic opaque area + * as an array of these, one per batch item, and subscript it via + * GISTBatchGetItem. + * + * GiST matching is potentially lossy, and the Consistent function's recheck + * flag varies from one item to the next, so every batch item records its own + * qual recheck flag; gistgettransform reports it as the item's xs_recheck. + * + * Note: Unordered scans only need a recheck flag, so their dynamic opaque + * area is just a bool array, subscripted via GISTBatchGetRecheck. 
+ */ +typedef struct GISTBatchItem +{ + bool recheck; /* T if quals must be rechecked */ + bool recheckDistances; /* T if distances are lossy lower bounds */ + /* numberOfOrderBys entries */ + IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER]; +} GISTBatchItem; + +#define SizeOfGISTBatchItem(n_distances) \ + (offsetof(GISTBatchItem, distances) + \ + sizeof(IndexOrderByDistance) * (n_distances)) + +/* Get an item from dynamic area during an ordered scan */ +#define GISTBatchGetItem(scan, batch, item) \ + (AssertMacro((scan)->numberOfOrderBys > 0), \ + AssertMacro((item) >= 0 && (item) < MaxIndexTuplesPerPage), \ + (GISTBatchItem *) ((char *) index_scan_batch_index_opaque_dyn((scan), (batch)) + \ + (Size) (item) * SizeOfGISTBatchItem((scan)->numberOfOrderBys))) + +/* Get an item from dynamic area during a non-ordered scan */ +#define GISTBatchGetRecheck(scan, batch) \ + (AssertMacro((scan)->numberOfOrderBys == 0), \ + (bool *) index_scan_batch_index_opaque_dyn((scan), (batch))) + /* * GISTScanOpaqueData: private state for a scan of a GiST index */ @@ -159,23 +205,9 @@ typedef struct GISTScanOpaqueData pairingheap *queue; /* queue of unvisited items */ MemoryContext queueCxt; /* context holding the queue */ bool qual_ok; /* false if qual can never be satisfied */ - bool firstCall; /* true until first gistgettuple call */ /* pre-allocated workspace arrays */ IndexOrderByDistance *distances; /* output area for gistindex_keytest */ - - /* info about killed items if any (killedItems is NULL if never used) */ - OffsetNumber *killedItems; /* offset numbers of killed items */ - int numKilled; /* number of currently stored items */ - BlockNumber curBlkno; /* current number of block */ - GistNSN curPageLSN; /* pos in the WAL stream when page was read */ - - /* In a non-ordered search, returnable heap items are stored here: */ - GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; - OffsetNumber nPageData; /* number of valid items in array */ - OffsetNumber 
curPageData; /* next item to return */ - MemoryContext pageDataCxt; /* context holding the fetched tuples, for - * index-only scans */ } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; @@ -448,6 +480,9 @@ extern XLogRecPtr gistXLogUpdate(Buffer buffer, IndexTuple *itup, int ituplen, Buffer leftchildbuf); +extern XLogRecPtr gistXLogVacuum(Buffer buffer, + OffsetNumber *todelete, int ntodelete); + extern XLogRecPtr gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete, TransactionId snapshotConflictHorizon, Relation heaprel); @@ -458,7 +493,11 @@ extern XLogRecPtr gistXLogSplit(bool page_is_leaf, Buffer leftchildbuf, bool markfollowright); /* gistget.c */ -extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir); +extern void gistkillitemsbatch(IndexScanDesc scan, IndexScanBatch batch); +extern IndexScanBatch gistgetbatch(IndexScanDesc scan, IndexScanBatch priorbatch, + ScanDirection dir); +extern void gistunguardbatch(IndexScanDesc scan, IndexScanBatch batch); +extern void gistgettransform(IndexScanDesc scan, IndexScanBatch batch, int item); extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern bool gistcanreturn(Relation index, int attno); diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h index 1c2cf6e81..86e5e1f86 100644 --- a/src/include/access/gistxlog.h +++ b/src/include/access/gistxlog.h @@ -18,17 +18,24 @@ #include "lib/stringinfo.h" #define XLOG_GIST_PAGE_UPDATE 0x00 -#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples for a - * page */ +#define XLOG_GIST_DELETE 0x10 /* delete leaf index tuples marked + * as LP_DEAD during normal index + * tuple insertion */ #define XLOG_GIST_PAGE_REUSE 0x20 /* old page is about to be reused * from FSM */ #define XLOG_GIST_PAGE_SPLIT 0x30 - /* #define XLOG_GIST_INSERT_COMPLETE 0x40 */ /* not used anymore */ +#define XLOG_GIST_PAGE_VACUUM 0x40 /* delete leaf index tuples during + * VACUUM */ /* #define XLOG_GIST_CREATE_INDEX 0x50 */ /* not 
used anymore */ #define XLOG_GIST_PAGE_DELETE 0x60 /* #define XLOG_GIST_ASSIGN_LSN 0x70 */ /* not used anymore */ /* + * Used by both XLOG_GIST_PAGE_UPDATE and XLOG_GIST_PAGE_VACUUM. VACUUM only + * ever deletes tuples (ntoinsert is 0, and there is no left child), but the + * page-level changes are otherwise the same; the records differ only in that + * replaying a VACUUM record takes a cleanup lock on the target page. + * * Backup Blk 0: updated page. * Backup Blk 1: If this operation completes a page split, by inserting a * downlink for the split page, the left half of the split diff --git a/src/include/access/indexbatch.h b/src/include/access/indexbatch.h index 9471a9db5..24b531705 100644 --- a/src/include/access/indexbatch.h +++ b/src/include/access/indexbatch.h @@ -101,6 +101,8 @@ index_scan_batch_append(IndexScanDesc scan, IndexScanBatch batch) * * [table AM opaque area] <- table AM area (batch_table_opaque_size), * optionally requested by table AM + * [index AM dyn opaque] <- index AM area (batch_index_opaque_dyn), + * optionally requested by index AM * [index AM static opaque] <- index AM area (batch_index_opaque_static), * mandatory fixed-size index AM area * [IndexScanBatchData] <- batch pointer, returned by amgetbatch @@ -129,6 +131,17 @@ index_scan_batch_append(IndexScanDesc scan, IndexScanBatch batch) * area. Access to the area is cheap (a compile-time-constant subtraction), * but its size cannot vary from scan to scan. Index AMs typically use this * area to store things like index page sibling link block numbers. + * + * Index AMs can use a second, optional dynamically-sized private area + * (batch_index_opaque_dyn) that sits just before the static area. Its size + * is chosen at scan start rather than at compile time. It is accessed via + * index_scan_batch_index_opaque_dyn. This second area is generally only used + * during scans where large amounts of supplemental metadata are required, + * that cannot reasonably be allocated for every scan. 
Typically, this is + * granular information about the batch's items for use by the index AM's + * amgettransform routine (the tuples themselves are stored separately, in + * on-disk format, in the currTuples workspace; amgettransform converts each + * one into the scan's returnable tuple). * ---------------------------------------------------------------------------- */ @@ -165,6 +178,18 @@ index_scan_batch_table_area(IndexScanDesc scan, IndexScanBatch batch) (AssertMacro((scan)->batch_index_opaque_static == MAXALIGN(sizeof(type))), \ ((type *) ((char *) (batch) - MAXALIGN(sizeof(type))))) +/* + * Return a pointer to the index AM's dynamic opaque area + */ +static inline void * +index_scan_batch_index_opaque_dyn(IndexScanDesc scan, IndexScanBatch batch) +{ + Assert(scan->batch_index_opaque_dyn > 0); + + return (char *) batch - scan->batch_index_opaque_static - + MAXALIGN(scan->batch_index_opaque_dyn); +} + /* ---------------------------------------------------------------------------- * Elementary batch position operations * ---------------------------------------------------------------------------- diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index f2f66e367..3a1e616d3 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -377,6 +377,11 @@ typedef struct IndexScanDescData */ uint32 batch_table_opaque_size; /* table AM opaque area size */ + /* + * Optional dynamic opaque size, also set by index AM in ambeginscan + */ + uint32 batch_index_opaque_dyn; + /* * Offset used by index_scan_batch_base (set on first batch alloc). See * access/indexbatch.h. 
diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 2d9d04aa3..4799a40b7 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -302,6 +302,7 @@ brinhandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = bringetbitmap, .amendscan = brinendscan, .amposreset = NULL, diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 0e8b6a549..ceb9cb447 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -87,6 +87,7 @@ ginhandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = gingetbitmap, .amendscan = ginendscan, .amposreset = NULL, diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 75445b074..8864a3faf 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -48,7 +48,7 @@ The original algorithms were modified in several ways: * They had to be adapted to PostgreSQL conventions. For example, the SEARCH algorithm was considerably changed, because in PostgreSQL the search function - should return one tuple (next), not all tuples at once. Also, it should + returns matching tuples incrementally, not all at once. Also, it should release page locks between calls. * Since we added support for variable length keys, it's not possible to guarantee enough free space for all keys on pages after splitting. User @@ -71,20 +71,24 @@ was not touched in the paper. Search Algorithm ---------------- -The search code maintains a queue of unvisited items, where an "item" is -either a heap tuple known to satisfy the search conditions, or an index -page that is consistent with the search conditions according to inspection -of its parent page's downlink item. Initially the root page is searched -to find unvisited items in it. 
Then we pull items from the queue. A -heap tuple pointer is just returned immediately; an index page entry -causes that page to be searched, generating more queue entries. +The search code maintains a queue of unvisited items. For a plain index +scan an "item" is always an index page that is consistent with the search +conditions according to inspection of its parent page's downlink item; +matching heap tuples are not queued, but are gathered into a batch as each +leaf page is scanned (see "Returning matches in batches", below). For a +nearest-neighbor (ordered) scan the queue additionally holds heap tuples +known to satisfy the search conditions, so that heap tuples and index +pages can be interleaved in distance order. Initially the root page is +added to the queue. Then we pull items from the queue: an index page +entry causes that page to be scanned, generating more queue entries, while +a heap tuple entry (ordered scans only) is a match to be returned. -The queue is kept ordered with heap tuple items at the front, then -index page entries, with any newly-added index page entry inserted -before existing index page entries. This ensures depth-first traversal -of the index, and in particular causes the first few heap tuples to be -returned as soon as possible. That is helpful in case there is a LIMIT -that requires only a few tuples to be produced. +The queue is kept ordered so that we perform a depth-first traversal of +the index: any newly-added index page entry is inserted before existing +index page entries, and (for ordered scans) heap tuple items are kept at +the front. This causes the first few matching heap tuples to be returned +as soon as possible, which is helpful in case there is a LIMIT that +requires only a few tuples to be produced. 
To implement nearest-neighbor search, the queue entries are augmented with distance data: heap tuple entries are labeled with exact distance @@ -94,17 +98,18 @@ queue entries are retrieved in smallest-distance-first order, with entries having identical distances managed as stated in the previous paragraph. -The search algorithm keeps an index page locked only long enough to scan -its entries and queue those that satisfy the search conditions. Since -insertions can occur concurrently with searches, it is possible for an -index child page to be split between the time we make a queue entry for it -(while visiting its parent page) and the time we actually reach and scan -the child page. To avoid missing the entries that were moved to the right -sibling, we detect whether a split has occurred by comparing the child -page's NSN (node sequence number, a special-purpose LSN) to the LSN that -the parent had when visited. If it did, the sibling page is immediately -added to the front of the queue, ensuring that its items will be scanned -in the same order as if they were still on the original child page. +The search algorithm keeps an index page locked only long enough to scan its +entries -- queueing the child pages that satisfy the search conditions, and +gathering any matching heap tuples (into a batch, or onto the queue for an +ordered scan). Since insertions can occur concurrently with searches, it is +possible for an index child page to be split between the time we make a queue +entry for it (while visiting its parent page) and the time we actually reach +and scan the child page. To avoid missing the entries that were moved to the +right sibling, we detect whether a split has occurred by comparing the child +page's NSN (node sequence number, a special-purpose LSN) to the LSN that the +parent had when visited. 
If it did, the sibling page is immediately added to +the front of the queue, ensuring that its items will be scanned in the same +order as if they were still on the original child page. As is usual in Postgres, the search algorithm only guarantees to find index entries that existed before the scan started; index entries added during @@ -116,6 +121,36 @@ Any such enlargement would be to add child items that we aren't interested in returning anyway. +Returning matches in batches +---------------------------- + +GiST implements the amgetbatch index AM interface, whose contract is +documented in doc/src/sgml/indexam.sgml (see also +src/backend/access/nbtree/README). Each call hands the table AM a batch of +matching TIDs rather than a single TID. GiST forms two kinds of batch: + +* A plain (non-ordered) scan returns one "conventional" batch per leaf + page, holding all of that page's matching TIDs in physical order. As in + nbtree and hash, the batch retains the leaf page's buffer pin (though not + its content lock) as the interlock against concurrent TID recycling by + VACUUM. + +* A nearest-neighbor (ordered) scan returns a single "virtual" batch. Its + matches don't arrive one leaf page at a time, so instead we drain the + distance-ordered queue, copying matching TIDs into the batch in distance + order -- typically spanning many leaf pages. A virtual batch retains no + buffer pin; each leaf page's pin is dropped as soon as the page is scanned. + +VACUUM honors a batch's pin by taking a cleanup lock on the leaf page (see +"Bulk delete algorithm (VACUUM)", below), just as nbtree does. 
Because a +virtual batch holds no such pin, ordered scans come with two restrictions, +both also seen in bitmap (amgetbitmap) scans and both explained in +doc/src/sgml/indexam.sgml: they never set LP_DEAD bits (gistkillitemsbatch +does nothing for a virtual batch), and they are never planned as index-only +scans (a virtual batch has no pin to offer as the TID-recycling interlock +that index-only scans depend on). + + Insert Algorithm ---------------- @@ -452,6 +487,15 @@ B-tree VACUUM uses, but because we already have NSNs on pages, to detect page splits during searches, we don't need a "vacuum cycle ID" concept for that like B-tree does. +We take a full cleanup lock on every leaf page as we scan it, even leaf +pages with no deletable tuples. As in nbtree, this is the interlock that +protects concurrent scans from TID recycling; see "Returning matches in +batches", above. Replay of the resulting XLOG_GIST_PAGE_VACUUM records +takes the same cleanup lock, so that the interlock also protects index-only +scans running on a hot standby. Recovery only needs the cleanup lock on +pages that actually have items to delete (the only pages that generate a +record), not on every leaf page. + While we scan all the pages, we also make note of any completely empty leaf pages. We will try to unlink them from the tree after the scan.
We also record the block numbers of all internal pages; they are needed to locate parents of diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 67b16053a..88b8a4ddf 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -103,10 +103,11 @@ gisthandler(PG_FUNCTION_ARGS) .amadjustmembers = gistadjustmembers, .ambeginscan = gistbeginscan, .amrescan = gistrescan, - .amgettuple = gistgettuple, - .amgetbatch = NULL, - .amunguardbatch = NULL, - .amkillitemsbatch = NULL, + .amgettuple = NULL, + .amgetbatch = gistgetbatch, + .amunguardbatch = gistunguardbatch, + .amkillitemsbatch = gistkillitemsbatch, + .amgettransform = gistgettransform, .amgetbitmap = gistgetbitmap, .amendscan = gistendscan, .amposreset = NULL, diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 4d7c100d7..d6c268084 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -27,84 +27,84 @@ #include "utils/rel.h" /* - * gistkillitems() -- set LP_DEAD state for items an indexscan caller has - * told us were killed. - * - * We re-read page here, so it's important to check page LSN. If the page - * has been modified since the last read (as determined by LSN), we cannot - * flag any entries because it is possible that the old entry was vacuumed - * away and the TID was re-used by a completely different heap tuple. 
+ * gistkillitemsbatch() -- Mark dead items' index tuples LP_DEAD */ -static void -gistkillitems(IndexScanDesc scan) +void +gistkillitemsbatch(IndexScanDesc scan, IndexScanBatch batch) { - GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - Buffer buffer; + GISTBatchData *gbatch = GISTBatchGetData(scan, batch); + Relation rel = scan->indexRelation; + Buffer buf; Page page; - OffsetNumber offnum; - ItemId iid; - int i; bool killedsomething = false; + XLogRecPtr latestlsn; - Assert(so->curBlkno != InvalidBlockNumber); - Assert(XLogRecPtrIsValid(so->curPageLSN)); - Assert(so->killedItems != NULL); + Assert(batch->numDead > 0); - buffer = ReadBuffer(scan->indexRelation, so->curBlkno); - if (!BufferIsValid(buffer)) + /* + * Skip virtual (ordered-scan) batches, since there's no practical way to + * visit all of the index pages that these tuples really came from + */ + if (gbatch->blkno == InvalidBlockNumber) return; - LockBuffer(buffer, GIST_SHARE); - gistcheckpage(scan->indexRelation, buffer); - page = BufferGetPage(buffer); + buf = ReadBuffer(rel, gbatch->blkno); + LockBuffer(buf, GIST_SHARE); + gistcheckpage(rel, buf); + page = BufferGetPage(buf); - /* - * If page LSN differs it means that the page was modified since the last - * read. killedItems could be not valid so LP_DEAD hints applying is not - * safe. - */ - if (BufferGetLSNAtomic(buffer) != so->curPageLSN) - goto unlock; - - Assert(GistPageIsLeaf(page)); - - /* - * Mark all killedItems as dead. We need no additional recheck, because, - * if page was modified, curPageLSN must have changed. - */ - for (i = 0; i < so->numKilled; i++) + latestlsn = BufferGetLSNAtomic(buf); + Assert(batch->lsn <= latestlsn); + if (batch->lsn != latestlsn) { - if (!killedsomething) - { - /* - * Use the hint bit infrastructure to check if we can update the - * page while just holding a share lock. If we are not allowed, - * there's no point continuing. 
- */ - if (!BufferBeginSetHintBits(buffer)) - goto unlock; - } + /* Modified, give up on hinting */ + UnlockReleaseBuffer(buf); + return; + } - offnum = so->killedItems[i]; - iid = PageGetItemId(page, offnum); - ItemIdMarkDead(iid); - killedsomething = true; + /* Iterate through batch->deadItems[] in index page order */ + for (int i = 0; i < batch->numDead; i++) + { + int itemIndex = batch->deadItems[i]; + OffsetNumber offnum = batch->items[itemIndex].indexOffset; + ItemId iid = PageGetItemId(page, offnum); + + Assert(itemIndex >= batch->firstItem && itemIndex <= batch->lastItem); + Assert(i == 0 || + offnum > batch->items[batch->deadItems[i - 1]].indexOffset); + Assert(offnum <= PageGetMaxOffsetNumber(page)); + Assert(ItemPointerEquals(&((IndexTuple) PageGetItem(page, iid))->t_tid, + &batch->items[itemIndex].tableTid)); + + /* Mark index item as dead, if it isn't already */ + if (!ItemIdIsDead(iid)) + { + if (!killedsomething) + { + /* + * Use the hint bit infrastructure to check if we can update + * the page while just holding a share lock. If we are not + * allowed, there's no point continuing. + */ + if (!BufferBeginSetHintBits(buf)) + { + UnlockReleaseBuffer(buf); + return; + } + } + + ItemIdMarkDead(iid); + killedsomething = true; + } } if (killedsomething) { GistMarkPageHasGarbage(page); - BufferFinishSetHintBits(buffer, true, true); + BufferFinishSetHintBits(buf, true, true); } -unlock: - UnlockReleaseBuffer(buffer); - - /* - * Always reset the scan state, so we don't look for same items on other - * pages. 
- */ - so->numKilled = 0; + UnlockReleaseBuffer(buf); } /* @@ -318,16 +318,25 @@ gistindex_keytest(IndexScanDesc scan, * scan: index scan we are executing * pageItem: search queue item identifying an index page to scan * myDistances: distances array associated with pageItem, or NULL at the root - * tbm: if not NULL, gistgetbitmap's output bitmap - * ntids: if not NULL, gistgetbitmap's output tuple counter + * newbatch: caller's batch to fill, for a non-ordered scan; NULL when ordered * - * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap - * tuples should be reported directly into the bitmap. If they are NULL, - * we're doing a plain or ordered indexscan. For a plain indexscan, heap - * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, - * heap tuple TIDs are pushed into individual search queue items. In an - * index-only scan, reconstructed index tuples are returned along with the - * TIDs. + * For a non-ordered scan (newbatch isn't NULL, which is the case for both + * unordered gistgetbatch and gistgetbitmap), matching item TIDs from a leaf + * page are stored into caller's newbatch to return via gistgetbatch. If we + * don't save any items in newbatch, caller needs to find the next leaf page + * that has matches and save its items in newbatch instead (if there is none + * then caller should release newbatch). + * + * For an ordered (nearest-neighbor) scan (newbatch is NULL), matching leaf heap + * tuples are pushed onto the search queue as GISTSearchItems carrying their + * distances, so the queue can later be drained in distance order. The page's + * buffer pin is dropped before returning. This can only happen during + * batchImmediateUnguard scans, which is what makes it safe. Groups of enqueued + * items will eventually be returned (in the expected order) as "virtual + * batches", but we don't do that here. + * + * In all cases, lower index pages are pushed onto the search queue to be + * visited later. 
* * If we detect that the index page has split since we saw its downlink * in the parent, we push its new right sibling onto the queue so the @@ -335,10 +344,9 @@ gistindex_keytest(IndexScanDesc scan, */ static void gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, - IndexOrderByDistance *myDistances, TIDBitmap *tbm, int64 *ntids) + IndexOrderByDistance *myDistances, IndexScanBatch newbatch) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - GISTSTATE *giststate = so->giststate; Relation r = scan->indexRelation; Buffer buffer; Page page; @@ -347,7 +355,12 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, OffsetNumber i; MemoryContext oldcxt; + /* state used when saving matching items into caller's newbatch */ + int itemIndex = 0; + int tupleOffset = 0; + Assert(!GISTSearchItemIsHeap(*pageItem)); + Assert((scan->numberOfOrderBys == 0) == (newbatch != NULL)); buffer = ReadBuffer(scan->indexRelation, pageItem->blkno); LockBuffer(buffer, GIST_SHARE); @@ -399,22 +412,11 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, */ if (GistPageIsDeleted(page)) { + Assert(!newbatch || newbatch->firstItem > newbatch->lastItem); UnlockReleaseBuffer(buffer); return; } - so->nPageData = so->curPageData = 0; - scan->xs_hitup = NULL; /* might point into pageDataCxt */ - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); - - /* - * We save the LSN of the page as we read it, so that we know whether it - * is safe to apply LP_DEAD hints to the page later. This allows us to - * drop the pin for MVCC scans, which allows vacuum to avoid blocking. 
- */ - so->curPageLSN = BufferGetLSNAtomic(buffer); - /* * check all tuples on page */ @@ -452,36 +454,28 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, if (!match) continue; - if (tbm && GistPageIsLeaf(page)) + if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) { /* - * getbitmap scan, so just push heap tuple TIDs into the bitmap - * without worrying about ordering + * Non-ordered scan (unordered amgetbatch or bitmap), so just + * store another matching item in caller's batch without worrying + * about ordering */ - tbm_add_tuples(tbm, &it->t_tid, 1, recheck); - (*ntids)++; - } - else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) - { - /* - * Non-ordered scan, so report tuples in so->pageData[] - */ - so->pageData[so->nPageData].heapPtr = it->t_tid; - so->pageData[so->nPageData].recheck = recheck; - so->pageData[so->nPageData].offnum = i; + newbatch->items[itemIndex].tableTid = it->t_tid; + newbatch->items[itemIndex].indexOffset = i; + newbatch->items[itemIndex].tupleOffset = 0; + GISTBatchGetRecheck(scan, newbatch)[itemIndex] = recheck; - /* - * In an index-only scan, also fetch the data from the tuple. The - * reconstructed tuples are stored in pageDataCxt. 
- */ if (scan->xs_want_itup) { - oldcxt = MemoryContextSwitchTo(so->pageDataCxt); - so->pageData[so->nPageData].recontup = - gistFetchTuple(giststate, r, it); - MemoryContextSwitchTo(oldcxt); + /* Copy on-disk format index tuple into currTuples */ + Size itupsz = IndexTupleSize(it); + + newbatch->items[itemIndex].tupleOffset = tupleOffset; + memcpy(newbatch->currTuples + tupleOffset, it, itupsz); + tupleOffset += MAXALIGN(itupsz); } - so->nPageData++; + itemIndex++; } else { @@ -500,17 +494,15 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, if (GistPageIsLeaf(page)) { - /* Creating heap-tuple GISTSearchItem */ + /* Creating heap-tuple GISTSearchItem for ordered search */ + Assert(scan->numberOfOrderBys > 0); + Assert(newbatch == NULL); + Assert(scan->batchImmediateUnguard); + item->blkno = InvalidBlockNumber; item->data.heap.heapPtr = it->t_tid; item->data.heap.recheck = recheck; item->data.heap.recheckDistances = recheck_distances; - - /* - * In an index-only scan, also fetch the data from the tuple. 
- */ - if (scan->xs_want_itup) - item->data.heap.recontup = gistFetchTuple(giststate, r, it); } else { @@ -535,6 +527,30 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, } } + if (newbatch) + { + /* Finalize result batch during a non-ordered scan */ + Assert(scan->numberOfOrderBys == 0); + + newbatch->firstItem = 0; + newbatch->lastItem = itemIndex - 1; + + if (itemIndex > 0) + { + GISTBatchData *gnewbatch; + + Assert(GistPageIsLeaf(page)); + + gnewbatch = GISTBatchGetData(scan, newbatch); + gnewbatch->buf = buffer; + gnewbatch->blkno = BufferGetBlockNumber(buffer); + + indexam_util_unlock_batch(scan, newbatch, buffer); + return; + } + /* else caller needs to find another page to fill newbatch */ + } + UnlockReleaseBuffer(buffer); } @@ -563,22 +579,111 @@ getNextGISTSearchItem(GISTScanOpaque so) } /* - * Fetch next heap tuple in an ordered search + * gistScanStart() -- begin a scan by queueing its root page + * + * Called on the first amgetbatch/amgetbitmap call of a scan (the caller having + * already checked that the qual is satisfiable). Counts the scan for stats and + * queues the root page as the first work item, so the scan drivers are + * otherwise pure queue drainers. The root carries a zeroed parentlsn (it has + * no parent, so gistScanPage's split-detection is a no-op for it) and zeroed + * distances (so it sorts first in an ordered scan). + * + * Starting the scan here, rather than in gistrescan, follows the convention + * that amrescan only sets up scan keys while the scan proper (counting it, + * reading index pages) begins on the first fetch. 
*/ -static bool -getNextNearest(IndexScanDesc scan) +static void +gistScanStart(IndexScanDesc scan) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - bool res = false; + GISTSearchItem *root; + MemoryContext oldcxt; - if (scan->xs_hitup) + pgstat_count_index_scan(scan->indexRelation); + if (scan->instrument) + scan->instrument->nsearches++; + + oldcxt = MemoryContextSwitchTo(so->queueCxt); + root = palloc(SizeOfGISTSearchItem(scan->numberOfOrderBys)); + root->blkno = GIST_ROOT_BLKNO; + memset(&root->data.parentlsn, 0, sizeof(GistNSN)); + memset(root->distances, 0, + sizeof(root->distances[0]) * scan->numberOfOrderBys); + pairingheap_add(so->queue, &root->phNode); + MemoryContextSwitchTo(oldcxt); +} + +/* + * getNextBatch() -- read the next leaf page with matches into a fresh batch + * + * gistgetbatch's non-ordered walker, also driven by gistgetbitmap. Allocates a + * batch and drains the queue, scanning each queued index page until one + * produces matching leaf items, then returns that batch. When the queue is + * exhausted without a match, releases the batch and returns NULL. 
+ */ +static IndexScanBatch +getNextBatch(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + IndexScanBatch newbatch = indexam_util_alloc_batch(scan); + + /* GiST only ever scans forward; set the batch's direction up front */ + newbatch->dir = ForwardScanDirection; + + for (;;) { - /* free previously returned tuple */ - pfree(scan->xs_hitup); - scan->xs_hitup = NULL; + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (item == NULL) + { + /* No more index pages to scan; the scan is exhausted */ + indexam_util_release_batch(scan, newbatch); + return NULL; + } + + CHECK_FOR_INTERRUPTS(); + + /* Scan this queued index page; matching leaf items go into the batch */ + gistScanPage(scan, item, item->distances, newbatch); + pfree(item); + + /* If this leaf page produced matching items, return the batch */ + if (newbatch->firstItem <= newbatch->lastItem) + return newbatch; } - do + pg_unreachable(); + + return NULL; +} + +/* + * getNextNearestBatch() -- drain the queue into a fresh batch in distance order + * + * gistgetbatch's ordered (nearest-neighbor) walker. The pairing-heap queue + * (so->queue) holds both unvisited index pages and matching leaf heap tuples, + * ordered by (lower-bound) distance. We pop items in that order, dispatching + * on the item type. A popped heap tuple is appended to the batch. We stop + * once the batch is full (maxitemsbatch items) or the queue is exhausted, + * leaving any remaining items queued for the next call. + * + * Because the queue is drained in nondecreasing distance order across the whole + * scan (a downlink's distance is a lower bound on its subtree, so items pushed + * while scanning a page never sort ahead of items already popped), the + * batches we emit are globally distance-ordered. 
+ */ +static IndexScanBatch +getNextNearestBatch(IndexScanDesc scan) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + IndexScanBatch newbatch = indexam_util_alloc_batch(scan); + GISTBatchData *gnewbatch; + int nitems = 0; + + /* GiST only ever scans forward; set the batch's direction up front */ + newbatch->dir = ForwardScanDirection; + + for (;;) { GISTSearchItem *item = getNextGISTSearchItem(so); @@ -588,37 +693,67 @@ getNextNearest(IndexScanDesc scan) if (GISTSearchItemIsHeap(*item)) { /* found a heap item at currently minimal distance */ - scan->xs_heaptid = item->data.heap.heapPtr; - scan->xs_recheck = item->data.heap.recheck; + GISTBatchItem *bitem = GISTBatchGetItem(scan, newbatch, nitems); - index_store_float8_orderby_distances(scan, so->orderByTypes, - item->distances, - item->data.heap.recheckDistances); + newbatch->items[nitems].tableTid = item->data.heap.heapPtr; + newbatch->items[nitems].indexOffset = -1; /* meaningless here */ + newbatch->items[nitems].tupleOffset = 0; - /* in an index-only scan, also return the reconstructed tuple. 
*/ - if (scan->xs_want_itup) - scan->xs_hitup = item->data.heap.recontup; - res = true; + bitem->recheck = item->data.heap.recheck; + bitem->recheckDistances = item->data.heap.recheckDistances; + memcpy(bitem->distances, item->distances, + sizeof(item->distances[0]) * scan->numberOfOrderBys); + + nitems++; + pfree(item); + + if (nitems == scan->maxitemsbatch) + break; /* batch full; remaining items stay queued */ } else { /* visit an index page, extract its items into queue */ CHECK_FOR_INTERRUPTS(); - gistScanPage(scan, item, item->distances, NULL, NULL); + gistScanPage(scan, item, item->distances, NULL); + pfree(item); } + } - pfree(item); - } while (!res); + if (nitems == 0) + { + /* No matching items remain: the scan is exhausted */ + indexam_util_release_batch(scan, newbatch); + return NULL; + } - return res; + /* + * An ordered batch is "virtual": its items come from many leaf pages, + * whose pins gistScanPage already dropped, so it holds no TID recycling + * interlock. It has no single originating page, and we don't track those + * index pages in any case (gistkillitemsbatch will just skip it). + */ + Assert(!newbatch->isGuarded); + + newbatch->firstItem = 0; + newbatch->lastItem = nitems - 1; + + gnewbatch = GISTBatchGetData(scan, newbatch); + gnewbatch->buf = InvalidBuffer; + gnewbatch->blkno = InvalidBlockNumber; + + return newbatch; } /* - * gistgettuple() -- Get the next tuple in the scan + * gistgetbatch() -- Get the first or next batch of items in a scan + * + * Dispatches to the ordered or non-ordered walker. Persistent traversal state + * lives in so->queue, so priorbatch is unused except to recognize the scan's + * first call, when we queue the root page (gistScanStart). 
*/ -bool -gistgettuple(IndexScanDesc scan, ScanDirection dir) +IndexScanBatch +gistgetbatch(IndexScanDesc scan, IndexScanBatch priorbatch, ScanDirection dir) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; @@ -626,124 +761,111 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir) elog(ERROR, "GiST only supports forward scan direction"); if (!so->qual_ok) - return false; + return NULL; - if (so->firstCall) - { - /* Begin the scan by processing the root page */ - GISTSearchItem fakeItem; - - pgstat_count_index_scan(scan->indexRelation); - if (scan->instrument) - scan->instrument->nsearches++; - - so->firstCall = false; - so->curPageData = so->nPageData = 0; - scan->xs_hitup = NULL; - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); - - fakeItem.blkno = GIST_ROOT_BLKNO; - memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); - gistScanPage(scan, &fakeItem, NULL, NULL, NULL); - } + if (priorbatch == NULL) + gistScanStart(scan); + if (scan->numberOfOrderBys > 0) + return getNextNearestBatch(scan); + + return getNextBatch(scan); +} + +/* + * gistunguardbatch() -- Drop a batch's TID recycling interlock (buffer pin) + * + * Called by the table AM when it's safe to drop the buffer pin held to + * prevent concurrent TID recycling by VACUUM. + */ +void +gistunguardbatch(IndexScanDesc scan, IndexScanBatch batch) +{ + GISTBatchData *gbatch = GISTBatchGetData(scan, batch); + + /* Should be called exactly once iff !batchImmediateUnguard */ + Assert(!scan->batchImmediateUnguard); + Assert(batch->isGuarded); + + ReleaseBuffer(gbatch->buf); +} + +/* + * gistgettransform() -- Set up the scan's per-tuple output for one batch item + * + * Implements the amgettransform interface. The table AM calls this as it + * returns each item of a GiST scan, to set the scan descriptor's per-tuple + * output from the item's per-item data. + * + * - We always apply the item's qual recheck flag to scan->xs_recheck. 
+ * - For ordered scans, we report the item's own ORDER BY distances (stored in + * the per-item index AM area by getNextNearestBatch) as xs_orderbyvals. + * They are flagged for recheck only when the distance function was lossy + * for that item; an exact distance is reported as final, while a lossy + * lower bound is rechecked by the executor's reorder queue to recompute + * the true order. + * - For index-only scans, we reconstruct the originally indexed values from + * the stored on-disk index tuple into a heap tuple, exposed as xs_hitup. + * + * The reconstructed tuple lives in the scan's memory context and only needs to + * outlive a single table_index_getnext_slot call (the executor copies it into + * the scan slot). We free the previously returned tuple before building the + * next one. + */ +void +gistgettransform(IndexScanDesc scan, IndexScanBatch batch, int item) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + + Assert(item >= batch->firstItem && item <= batch->lastItem); + + /* Ordered scan (must be a plain index scan) */ if (scan->numberOfOrderBys > 0) { - /* Must fetch tuples in strict distance order */ - return getNextNearest(scan); + GISTBatchItem *bitem = GISTBatchGetItem(scan, batch, item); + + Assert(!scan->xs_want_itup); + + /* Apply this item's qual recheck flag */ + scan->xs_recheck = bitem->recheck; + + /* + * Note: This is a "virtual" batch. The items from caller's batch + * were stored in the batch in distance order by getNextNearestBatch, + * right before gistgetbatch returned it. + */ + Assert(GISTBatchGetData(scan, batch)->blkno == InvalidBlockNumber); + index_store_float8_orderby_distances(scan, so->orderByTypes, + bitem->distances, + bitem->recheckDistances); + return; } - else + + /* + * Unordered scan. + * + * Always uses simple bool array for item recheck flags. 
+ */ + scan->xs_recheck = GISTBatchGetRecheck(scan, batch)[item]; + + /* Index-only scan */ + if (scan->xs_want_itup) { - /* Fetch tuples index-page-at-a-time */ - for (;;) + /* Reconstruct a returnable heap tuple from stashed index tuple */ + IndexTuple itup = (IndexTuple) (batch->currTuples + + batch->items[item].tupleOffset); + MemoryContext oldcxt; + + if (scan->xs_hitup) { - if (so->curPageData < so->nPageData) - { - if (scan->kill_prior_tuple && so->curPageData > 0) - { - - if (so->killedItems == NULL) - { - MemoryContext oldCxt = - MemoryContextSwitchTo(so->giststate->scanCxt); - - so->killedItems = - (OffsetNumber *) palloc(MaxIndexTuplesPerPage - * sizeof(OffsetNumber)); - - MemoryContextSwitchTo(oldCxt); - } - if (so->numKilled < MaxIndexTuplesPerPage) - so->killedItems[so->numKilled++] = - so->pageData[so->curPageData - 1].offnum; - } - /* continuing to return tuples from a leaf page */ - scan->xs_heaptid = so->pageData[so->curPageData].heapPtr; - scan->xs_recheck = so->pageData[so->curPageData].recheck; - - /* in an index-only scan, also return the reconstructed tuple */ - if (scan->xs_want_itup) - scan->xs_hitup = so->pageData[so->curPageData].recontup; - - so->curPageData++; - - return true; - } - - /* - * Check the last returned tuple and add it to killedItems if - * necessary - */ - if (scan->kill_prior_tuple - && so->curPageData > 0 - && so->curPageData == so->nPageData) - { - - if (so->killedItems == NULL) - { - MemoryContext oldCxt = - MemoryContextSwitchTo(so->giststate->scanCxt); - - so->killedItems = - (OffsetNumber *) palloc(MaxIndexTuplesPerPage - * sizeof(OffsetNumber)); - - MemoryContextSwitchTo(oldCxt); - } - if (so->numKilled < MaxIndexTuplesPerPage) - so->killedItems[so->numKilled++] = - so->pageData[so->curPageData - 1].offnum; - } - /* find and process the next index page */ - do - { - GISTSearchItem *item; - - if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0)) - gistkillitems(scan); - - item = getNextGISTSearchItem(so); 
- - if (!item) - return false; - - CHECK_FOR_INTERRUPTS(); - - /* save current item BlockNumber for next gistkillitems() call */ - so->curBlkno = item->blkno; - - /* - * While scanning a leaf page, ItemPointers of matching heap - * tuples are stored in so->pageData. If there are any on - * this page, we fall out of the inner "do" and loop around to - * return them. - */ - gistScanPage(scan, item, item->distances, NULL, NULL); - - pfree(item); - } while (so->nPageData == 0); + pfree(scan->xs_hitup); + scan->xs_hitup = NULL; } + + /* reconstruct the originally indexed values as a heap tuple */ + oldcxt = MemoryContextSwitchTo(so->giststate->scanCxt); + scan->xs_hitup = gistFetchTuple(so->giststate, scan->indexRelation, itup); + MemoryContextSwitchTo(oldcxt); } } @@ -755,41 +877,34 @@ gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; int64 ntids = 0; - GISTSearchItem fakeItem; + IndexScanBatch batch; if (!so->qual_ok) return 0; - pgstat_count_index_scan(scan->indexRelation); - if (scan->instrument) - scan->instrument->nsearches++; - - /* Begin the scan by processing the root page */ - so->curPageData = so->nPageData = 0; - scan->xs_hitup = NULL; - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); - - fakeItem.blkno = GIST_ROOT_BLKNO; - memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); - gistScanPage(scan, &fakeItem, NULL, tbm, &ntids); + /* Begin the scan by queueing the root page */ + gistScanStart(scan); /* - * While scanning a leaf page, ItemPointers of matching heap tuples will - * be stored directly into tbm, so we don't need to deal with them here. + * Drive the same non-ordered walker as gistgetbatch, one leaf page at a + * time, draining each batch into the bitmap and releasing it before + * fetching the next, so only one batch is ever live (cf. spggetbitmap). 
*/ - for (;;) + while ((batch = getNextBatch(scan)) != NULL) { - GISTSearchItem *item = getNextGISTSearchItem(so); + bool *recheck = GISTBatchGetRecheck(scan, batch); - if (!item) - break; + for (int i = batch->firstItem; i <= batch->lastItem; i++) + { + tbm_add_tuples(tbm, &batch->items[i].tableTid, 1, recheck[i]); + ntids++; + } - CHECK_FOR_INTERRUPTS(); - - gistScanPage(scan, item, item->distances, tbm, &ntids); - - pfree(item); + /* + * Return the batch to the single-slot bitmap cache, to be reused by + * the next getNextBatch + */ + indexam_util_release_batch(scan, batch); } return ntids; diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index c65f93abd..3ec405379 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -104,12 +104,34 @@ gistbeginscan(Relation r, int nkeys, int norderbys) scan->xs_orderbyvals = palloc0_array(Datum, scan->numberOfOrderBys); scan->xs_orderbynulls = palloc_array(bool, scan->numberOfOrderBys); memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys); - } - so->killedItems = NULL; /* until needed */ - so->numKilled = 0; - so->curBlkno = InvalidBlockNumber; - so->curPageLSN = InvalidXLogRecPtr; + /* + * Ordered scans fill a "virtual" batch by draining the + * distance-ordered queue, so the batch size is a tuning knob with no + * natural value. Testing has shown that a very small size will + * increase per-batch overhead (and likely instruction-cache misses), + * while a large size (such as MaxIndexTuplesPerPage) risks producing + * many tuples that a LIMIT node never consumes. This maxitemsbatch + * is a compromise. 
+ */ + scan->maxitemsbatch = MaxIndexTuplesPerPage / 32; + } + else + scan->maxitemsbatch = MaxIndexTuplesPerPage; + + scan->batch_index_opaque_static = MAXALIGN(sizeof(GISTBatchData)); + + /* + * Use second opaque area for our per-item data: a GISTBatchItem array + * (with room for each item's ORDER BY distances) for ordered scans, or + * just an array of qual recheck flags for unordered scans + */ + if (scan->numberOfOrderBys > 0) + scan->batch_index_opaque_dyn = + SizeOfGISTBatchItem(scan->numberOfOrderBys) * scan->maxitemsbatch; + else + scan->batch_index_opaque_dyn = sizeof(bool) * scan->maxitemsbatch; + scan->batch_tuples_workspace = BLCKSZ; scan->opaque = so; @@ -168,8 +190,7 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, /* * If we're doing an index-only scan, on the first call, also initialize a - * tuple descriptor to represent the returned index tuples and create a - * memory context to hold them during the scan. + * tuple descriptor to represent the returned index tuples. 
*/ if (scan->xs_want_itup && !scan->xs_hitupdesc) { @@ -203,11 +224,6 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, } TupleDescFinalize(so->giststate->fetchTupdesc); scan->xs_hitupdesc = so->giststate->fetchTupdesc; - - /* Also create a memory context that will hold the returned tuples */ - so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, - "GiST page data context", - ALLOCSET_DEFAULT_SIZES); } /* create new, empty pairing heap for search queue */ @@ -215,8 +231,6 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); MemoryContextSwitchTo(oldCxt); - so->firstCall = true; - /* Update scan key, if a new one is given */ if (key && scan->numberOfKeys > 0) { @@ -340,7 +354,8 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, pfree(fn_extras); } - /* any previous xs_hitup will have been pfree'd in context resets above */ + if (scan->xs_hitup) + pfree(scan->xs_hitup); scan->xs_hitup = NULL; } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 0f58f6187..a687718e7 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -23,6 +23,7 @@ #include "utils/float.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" +#include "utils/memutils.h" #include "utils/rel.h" #include "utils/snapmgr.h" #include "utils/syscache.h" @@ -670,6 +671,7 @@ gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) Datum fetchatt[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; int i; + HeapTuple htup; for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) { @@ -717,7 +719,12 @@ gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) } MemoryContextSwitchTo(oldcxt); - return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); + htup = heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); + + /* cleanup */ + MemoryContextReset(giststate->tempCxt); + + return htup; } 
float diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 686a04180..6b8dc2178 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -326,10 +326,17 @@ restart: recurse_to = InvalidBlockNumber; /* - * We are not going to stay here for a long time, aggressively grab an - * exclusive lock. + * Get a full cleanup lock on this page. We must get such a lock on every + * leaf page over the course of the vacuum scan, whether or not it + * actually contains any deletable tuples. + * + * Note: we could avoid this for internal pages, but not for the root + * page. The root page can start out as a leaf page, but subsequently + * become an internal page, even while a scan holds an interlock pin on + * that page (this isn't possible in nbtree because root splits always + * create a new root page, stored within a separate block number). */ - LockBuffer(buffer, GIST_EXCLUSIVE); + LockBufferForCleanup(buffer); page = BufferGetPage(buffer); if (gistPageRecyclable(page)) @@ -407,9 +414,7 @@ restart: { XLogRecPtr recptr; - recptr = gistXLogUpdate(buffer, - todelete, ntodelete, - NULL, 0, InvalidBuffer); + recptr = gistXLogVacuum(buffer, todelete, ntodelete); PageSetLSN(page, recptr); } else diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index ae538dc81..f9f651261 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -67,14 +67,15 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id) * redo any page update (except page split) */ static void -gistRedoPageUpdateRecord(XLogReaderState *record) +gistRedoPageUpdateRecord(XLogReaderState *record, bool get_cleanup_lock) { XLogRecPtr lsn = record->EndRecPtr; gistxlogPageUpdate *xldata = (gistxlogPageUpdate *) XLogRecGetData(record); Buffer buffer; Page page; - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + if (XLogReadBufferForRedoExtended(record, 0, 
RBM_NORMAL, get_cleanup_lock, + &buffer) == BLK_NEEDS_REDO) { char *begin; char *data; @@ -407,7 +408,10 @@ gist_redo(XLogReaderState *record) switch (info) { case XLOG_GIST_PAGE_UPDATE: - gistRedoPageUpdateRecord(record); + gistRedoPageUpdateRecord(record, false); + break; + case XLOG_GIST_PAGE_VACUUM: + gistRedoPageUpdateRecord(record, true); break; case XLOG_GIST_DELETE: gistRedoDeleteRecord(record); @@ -637,6 +641,33 @@ gistXLogUpdate(Buffer buffer, return recptr; } +/* + * Write XLOG record describing a VACUUM deletion of leaf index tuples. + * + * This uses the same record format as gistXLogUpdate() (a gistxlogPageUpdate + * that only deletes items from one leaf page), but is logged under a distinct + * record type so that replay knows to take a cleanup lock on the target page. + */ +XLogRecPtr +gistXLogVacuum(Buffer buffer, OffsetNumber *todelete, int ntodelete) +{ + gistxlogPageUpdate xlrec; + XLogRecPtr recptr; + + xlrec.ntodelete = ntodelete; + xlrec.ntoinsert = 0; + + XLogBeginInsert(); + XLogRegisterData(&xlrec, sizeof(gistxlogPageUpdate)); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + XLogRegisterBufData(0, todelete, sizeof(OffsetNumber) * ntodelete); + + recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_VACUUM); + + return recptr; +} + /* * Write XLOG record describing a delete of leaf index tuples marked as DEAD * during new tuple insertion.
One may think that this case is already covered diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 76e3193d9..103a0833b 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -118,6 +118,7 @@ hashhandler(PG_FUNCTION_ARGS) .amgetbatch = hashgetbatch, .amunguardbatch = hashunguardbatch, .amkillitemsbatch = hashkillitemsbatch, + .amgettransform = NULL, .amgetbitmap = hashgetbitmap, .amendscan = hashendscan, .amposreset = NULL, diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c index 323c245cd..5f04041df 100644 --- a/src/backend/access/heap/heapam_indexscan.c +++ b/src/backend/access/heap/heapam_indexscan.c @@ -956,11 +956,23 @@ heapam_index_return_scanpos_tid(IndexScanDesc scan, IndexScanHeapData *hscan, BatchRingItemPos *scanPos, bool *all_visible) { + amgettransform_function amgettransform = + scan->indexRelation->rd_indam->amgettransform; HeapBatchData *hbatch; /* Set xs_heaptid, which caller (and core executor) will need */ scan->xs_heaptid = scanBatch->items[scanPos->item].tableTid; + /* + * Let the index AM set this item's per-tuple output. An AM that provides + * amgettransform uses it to set the item's qual recheck flag + * (scan->xs_recheck), an ordered scan's ORDER BY distances + * (xs_orderbyvals/xs_recheckorderby), and an index-only scan's returnable + * tuple (xs_hitup). + */ + if (amgettransform != NULL) + amgettransform(scan, scanBatch, scanPos->item); + if (all_visible == NULL) { /* @@ -973,8 +985,14 @@ heapam_index_return_scanpos_tid(IndexScanDesc scan, IndexScanHeapData *hscan, /* Index-only scan */ Assert(scan->xs_want_itup); - scan->xs_itup = (IndexTuple) (scanBatch->currTuples + - scanBatch->items[scanPos->item].tupleOffset); + /* + * Unless the index AM provides amgettransform (and so was responsible for + * setting xs_hitup above), point xs_itup at the original index tuple that + * amgetbatch stored in currTuples.
+ */ + if (amgettransform == NULL) + scan->xs_itup = (IndexTuple) (scanBatch->currTuples + + scanBatch->items[scanPos->item].tupleOffset); /* * Set visibility info for the current scanPos item (plus possibly some diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index d4adbbeb2..9886f49ff 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -58,6 +58,7 @@ GetIndexAmRoutine(Oid amhandler) /* Assert that AM doesn't have an invalid combination of callbacks */ Assert((routine->amgetbatch != NULL) == (routine->amunguardbatch != NULL)); Assert(routine->amkillitemsbatch == NULL || routine->amgetbatch != NULL); + Assert(routine->amgettransform == NULL || routine->amgetbatch != NULL); Assert(routine->amgetbatch != NULL || routine->amposreset == NULL); return routine; diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index ca9bae803..1927faeab 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -133,6 +133,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->batch_index_opaque_static = 0; scan->batch_tuples_workspace = 0; scan->batch_table_opaque_size = 0; + scan->batch_index_opaque_dyn = 0; scan->batch_base_offset = 0; scan->xs_name_cstring_attnums = NULL; diff --git a/src/backend/access/index/indexbatch.c b/src/backend/access/index/indexbatch.c index e58e09897..2e2ccf6a9 100644 --- a/src/backend/access/index/indexbatch.c +++ b/src/backend/access/index/indexbatch.c @@ -632,6 +632,7 @@ indexam_util_alloc_batch(IndexScanDesc scan) { /* We lazily compute batch_base_offset on scan's first call */ size_t table_area = 0; + size_t index_dyn_area = MAXALIGN(scan->batch_index_opaque_dyn); if (scan->usebatchring) { @@ -642,8 +643,8 @@ indexam_util_alloc_batch(IndexScanDesc scan) table_area = MAXALIGN(scan->batch_table_opaque_size); } - /* ...though we always need an index AM area */ - scan->batch_base_offset = table_area + + 
/* ...though we always need index AM areas */ + scan->batch_base_offset = table_area + index_dyn_area + scan->batch_index_opaque_static; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b83926f9f..6ace65508 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -166,6 +166,7 @@ bthandler(PG_FUNCTION_ARGS) .amgetbatch = btgetbatch, .amunguardbatch = btunguardbatch, .amkillitemsbatch = btkillitemsbatch, + .amgettransform = NULL, .amgetbitmap = btgetbitmap, .amendscan = btendscan, .amposreset = btposreset, diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c index 67789e025..021f72fa0 100644 --- a/src/backend/access/rmgrdesc/gistdesc.c +++ b/src/backend/access/rmgrdesc/gistdesc.c @@ -66,6 +66,7 @@ gist_desc(StringInfo buf, XLogReaderState *record) switch (info) { case XLOG_GIST_PAGE_UPDATE: + case XLOG_GIST_PAGE_VACUUM: out_gistxlogPageUpdate(buf, (gistxlogPageUpdate *) rec); break; case XLOG_GIST_PAGE_REUSE: @@ -93,6 +94,9 @@ gist_identify(uint8 info) case XLOG_GIST_PAGE_UPDATE: id = "PAGE_UPDATE"; break; + case XLOG_GIST_PAGE_VACUUM: + id = "PAGE_VACUUM"; + break; case XLOG_GIST_DELETE: id = "DELETE"; break; diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 745435da3..47153b4b0 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -92,6 +92,7 @@ spghandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = spggetbitmap, .amendscan = spgendscan, .amposreset = NULL, diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index 84a97b71d..a6a8a96e7 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -132,18 +132,6 @@ IndexOnlyNext(IndexOnlyScanState *node) } } - /* - * We don't currently support 
rechecking ORDER BY distances. (In - * principle, if the index can support retrieval of the originally - * indexed value, it should be able to produce an exact distance - * calculation too. So it's not clear that adding code here for - * recheck/re-sort would be worth the trouble. But we should at least - * throw an error if someone tries it.) - */ - if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("lossy distance functions are not supported in index-only scans"))); return slot; } diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 94fedf32c..624b6d0f8 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -951,9 +951,12 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, /* * 3. Check if an index-only scan is possible. If we're not building * plain indexscans, this isn't relevant since bitmap scans don't support - * index data retrieval anyway. + * index data retrieval anyway. If there are ordering operators then we + * assume that an index-only scan is unsafe due to the difficulty with + * holding index page pins sufficient to avoid concurrent TID recycling. 
*/ index_only_scan = (scantype != ST_BITMAPSCAN && + orderbyclauses == NIL && check_index_only(rel, index)); /* diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 249af48e6..168842bc7 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -150,6 +150,7 @@ blhandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = blgetbitmap, .amendscan = blendscan, .amposreset = NULL, diff --git a/contrib/btree_gist/expected/cash.out b/contrib/btree_gist/expected/cash.out index 7fbc73559..56fd1eb49 100644 --- a/contrib/btree_gist/expected/cash.out +++ b/contrib/btree_gist/expected/cash.out @@ -74,10 +74,10 @@ SELECT count(*) FROM moneytmp WHERE a > '22649.64'::money; EXPLAIN (COSTS OFF) SELECT a, a <-> '21472.79' FROM moneytmp ORDER BY a <-> '21472.79' LIMIT 3; - QUERY PLAN --------------------------------------------------- + QUERY PLAN +----------------------------------------------- Limit - -> Index Only Scan using moneyidx on moneytmp + -> Index Scan using moneyidx on moneytmp Order By: (a <-> '$21,472.79'::money) (3 rows) diff --git a/contrib/btree_gist/expected/date.out b/contrib/btree_gist/expected/date.out index 5db864bb8..4a360bea6 100644 --- a/contrib/btree_gist/expected/date.out +++ b/contrib/btree_gist/expected/date.out @@ -74,10 +74,10 @@ SELECT count(*) FROM datetmp WHERE a > '2001-02-13'::date; EXPLAIN (COSTS OFF) SELECT a, a <-> '2001-02-13' FROM datetmp ORDER BY a <-> '2001-02-13' LIMIT 3; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +---------------------------------------------- Limit - -> Index Only Scan using dateidx on datetmp + -> Index Scan using dateidx on datetmp Order By: (a <-> '02-13-2001'::date) (3 rows) diff --git a/contrib/btree_gist/expected/float4.out b/contrib/btree_gist/expected/float4.out index dfe732049..8878a317c 100644 --- a/contrib/btree_gist/expected/float4.out +++ 
b/contrib/btree_gist/expected/float4.out @@ -74,10 +74,10 @@ SELECT count(*) FROM float4tmp WHERE a > -179.0::float4; EXPLAIN (COSTS OFF) SELECT a, a <-> '-179.0' FROM float4tmp ORDER BY a <-> '-179.0' LIMIT 3; - QUERY PLAN ----------------------------------------------------- + QUERY PLAN +----------------------------------------------- Limit - -> Index Only Scan using float4idx on float4tmp + -> Index Scan using float4idx on float4tmp Order By: (a <-> '-179'::real) (3 rows) diff --git a/contrib/btree_gist/expected/float8.out b/contrib/btree_gist/expected/float8.out index ebd0ef3d6..763091b5c 100644 --- a/contrib/btree_gist/expected/float8.out +++ b/contrib/btree_gist/expected/float8.out @@ -77,7 +77,7 @@ SELECT a, a <-> '-1890.0' FROM float8tmp ORDER BY a <-> '-1890.0' LIMIT 3; QUERY PLAN ----------------------------------------------------- Limit - -> Index Only Scan using float8idx on float8tmp + -> Index Scan using float8idx on float8tmp Order By: (a <-> '-1890'::double precision) (3 rows) diff --git a/contrib/btree_gist/expected/int2.out b/contrib/btree_gist/expected/int2.out index 50a332939..245fa4be6 100644 --- a/contrib/btree_gist/expected/int2.out +++ b/contrib/btree_gist/expected/int2.out @@ -74,10 +74,10 @@ SELECT count(*) FROM int2tmp WHERE a > 237::int2; EXPLAIN (COSTS OFF) SELECT a, a <-> '237' FROM int2tmp ORDER BY a <-> '237' LIMIT 3; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +------------------------------------------- Limit - -> Index Only Scan using int2idx on int2tmp + -> Index Scan using int2idx on int2tmp Order By: (a <-> '237'::smallint) (3 rows) diff --git a/contrib/btree_gist/expected/int4.out b/contrib/btree_gist/expected/int4.out index 6bbdc7c3f..41bed1f6e 100644 --- a/contrib/btree_gist/expected/int4.out +++ b/contrib/btree_gist/expected/int4.out @@ -74,10 +74,10 @@ SELECT count(*) FROM int4tmp WHERE a > 237::int4; EXPLAIN (COSTS OFF) SELECT a, a <-> '237' FROM int4tmp ORDER BY a <-> '237' LIMIT 3; - 
QUERY PLAN ------------------------------------------------- + QUERY PLAN +------------------------------------------- Limit - -> Index Only Scan using int4idx on int4tmp + -> Index Scan using int4idx on int4tmp Order By: (a <-> 237) (3 rows) diff --git a/contrib/btree_gist/expected/int8.out b/contrib/btree_gist/expected/int8.out index eff77c26b..2bbdd7657 100644 --- a/contrib/btree_gist/expected/int8.out +++ b/contrib/btree_gist/expected/int8.out @@ -77,7 +77,7 @@ SELECT a, a <-> '464571291354841' FROM int8tmp ORDER BY a <-> '464571291354841' QUERY PLAN ----------------------------------------------------- Limit - -> Index Only Scan using int8idx on int8tmp + -> Index Scan using int8idx on int8tmp Order By: (a <-> '464571291354841'::bigint) (3 rows) diff --git a/contrib/btree_gist/expected/interval.out b/contrib/btree_gist/expected/interval.out index 4c3d494e4..4ed196198 100644 --- a/contrib/btree_gist/expected/interval.out +++ b/contrib/btree_gist/expected/interval.out @@ -77,7 +77,7 @@ SELECT a, a <-> '199 days 21:21:23' FROM intervaltmp ORDER BY a <-> '199 days 21 QUERY PLAN --------------------------------------------------------------------------- Limit - -> Index Only Scan using intervalidx on intervaltmp + -> Index Scan using intervalidx on intervaltmp Order By: (a <-> '@ 199 days 21 hours 21 mins 23 secs'::interval) (3 rows) diff --git a/contrib/btree_gist/expected/time.out b/contrib/btree_gist/expected/time.out index ec95ef77c..1b9da4e19 100644 --- a/contrib/btree_gist/expected/time.out +++ b/contrib/btree_gist/expected/time.out @@ -77,7 +77,7 @@ SELECT a, a <-> '10:57:11' FROM timetmp ORDER BY a <-> '10:57:11' LIMIT 3; QUERY PLAN -------------------------------------------------------------- Limit - -> Index Only Scan using timeidx on timetmp + -> Index Scan using timeidx on timetmp Order By: (a <-> '10:57:11'::time without time zone) (3 rows) diff --git a/contrib/btree_gist/expected/timestamp.out b/contrib/btree_gist/expected/timestamp.out index 
0d94f2f24..cc3624f08 100644 --- a/contrib/btree_gist/expected/timestamp.out +++ b/contrib/btree_gist/expected/timestamp.out @@ -77,7 +77,7 @@ SELECT a, a <-> '2004-10-26 08:55:08' FROM timestamptmp ORDER BY a <-> '2004-10- QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Only Scan using timestampidx on timestamptmp + -> Index Scan using timestampidx on timestamptmp Order By: (a <-> 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) (3 rows) diff --git a/contrib/btree_gist/expected/timestamptz.out b/contrib/btree_gist/expected/timestamptz.out index 75a15a425..88d2404c4 100644 --- a/contrib/btree_gist/expected/timestamptz.out +++ b/contrib/btree_gist/expected/timestamptz.out @@ -197,7 +197,7 @@ SELECT a, a <-> '2018-12-18 10:59:54 GMT+2' FROM timestamptztmp ORDER BY a <-> ' QUERY PLAN ------------------------------------------------------------------------------------ Limit - -> Index Only Scan using timestamptzidx on timestamptztmp + -> Index Scan using timestamptzidx on timestamptztmp Order By: (a <-> 'Tue Dec 18 04:59:54 2018 PST'::timestamp with time zone) (3 rows) diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 6e1e51169..75c0704cc 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -172,6 +172,7 @@ typedef struct IndexAmRoutine amgetbatch_function amgetbatch; /* can be NULL */ amunguardbatch_function amunguardbatch; /* can be NULL */ amkillitemsbatch_function amkillitemsbatch; /* can be NULL */ + amgettransform_function amgettransform; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; amposreset_function amposreset; /* can be NULL */ @@ -716,27 +717,58 @@ ambeginscan (Relation indexRelation, and sibling page links). + + + scan->batch_index_opaque_dyn: the size of an + optional second per-batch opaque area, or 0 if the index AM does not need + one. 
Unlike the area above, its size need not be known at compile time; + the index AM may choose it at the start of each scan. It sits immediately + before the static area, and core code treats it as a single opaque + allocation that the index AM lays out however it likes (for example, to + carry per-item match metadata, such as a recheck flag or order-by + distances, that must travel with the batch). + + scan->batch_tuples_workspace: the size in bytes of the per-batch tuple storage workspace used for index-only scans (typically BLCKSZ), or 0 if the index AM does not - support index-only scans. The workspace is accessible via - batch->currTuples. + support index-only scans. The workspace is accessible via the batch's + currTuples field. The index AM stores each + matching tuple here in its on-disk format (an + IndexTuple, or another on-disk tuple form used by + the AM); it is either exposed directly as + scan->xs_itup, or converted to the returnable tuple + later, by amgettransform (see below). + + These batch fields are usually set in ambeginscan, but an + index access method may instead set any of them in + amrescan when their value cannot be determined until then. + For example, the size of the dynamic opaque area might depend on whether this + is an index-only scan + (scan->xs_want_itup), which core code only sets after + ambeginscan has returned; such an access method sizes + scan->batch_index_opaque_dyn in + amrescan instead. This is safe because no batch is ever + allocated before the first amrescan call. + + An amgetbatch access method whose recheck requirement is a fixed property of the whole scan (rather than something that varies from one matching item to the next) should also set scan->xs_recheck here, in - ambeginscan, since the value applies to every item the - scan returns. The value set here persists across any subsequent - amrescan calls. B-tree (always false) and hash (always - true) work this way. 
+ ambeginscan: the value then applies to every item the + scan returns, and persists across any subsequent + amrescan calls. See amgetbatch + below, which describes both this whole-scan case and the per-item case in + detail. @@ -758,6 +790,13 @@ amrescan (IndexScanDesc scan, remains the same. + + amrescan is also where an + amgetbatch access method sets any of the batch fields + described under ambeginscan above whose value could not + be determined until now. + + bool @@ -894,23 +933,75 @@ amgetbatch (IndexScanDesc scan, - Index access methods using amgetbatch must set - scan->xs_recheck to indicate whether rechecking of - scan keys is required, in the same way as amgettuple - does. However, scan->xs_recheck must be set consistently - for an entire scan rather than varying on a per-tuple basis. This is a key - difference from amgettuple, which can set - scan->xs_recheck independently for each tuple it returns. - Index access methods that require granular control over - scan->xs_recheck must use the amgettuple - interface instead of amgetbatch. + Index access methods using amgetbatch must convey + whether the scan keys need to be rechecked, via + scan->xs_recheck, just as + amgettuple access methods do. Unlike + amgettuple, however, an + amgetbatch access method cannot set + scan->xs_recheck at the point an individual item is + returned, because the interface decouples the order of + amgetbatch calls from the order in which items are + later returned to the scan. When the recheck requirement is a fixed + property of the whole scan, the index access method instead sets + scan->xs_recheck once, at scan start (in its + ambeginscan routine): B-tree always sets it false, and + hash always sets it true. 
When the requirement instead varies from one + matching item to the next, the index access method records the per-item + value in the batch and provides an amgettransform + callback (see below), which the table AM invokes for each returned item to + set scan->xs_recheck from that recorded state; GiST + works this way. - Similarly, the amgetbatch interface does not currently - support index-only scans that return data in the form of a - HeapTuple pointer stored in - scan->xs_hitup. + An amgetbatch access method that supports index-only + scans must supply the scan's returnable tuple for each matching item, and + must do so in one of exactly two ways. Which one applies is a fixed + property of the access method, determined by whether it provides an + amgettransform callback: + + + + If the index access method does not provide + amgettransform, it must store each matching tuple + in the batch's currTuples workspace as an + on-disk IndexTuple whose layout is exactly what + scan->xs_itupdesc describes. The table AM exposes + those stored bytes directly as scan->xs_itup and + deforms them against xs_itupdesc (just as for an + amgettuple index-only scan); the index access + method must not set scan->xs_itup itself. Among + the core access methods only B-tree uses this path, because it stores + the original indexed values unchanged, so the stored tuple already + matches xs_itupdesc. + + + + + Otherwise the index access method must provide an + amgettransform callback that produces the + returnable tuple in scan->xs_hitup (a + HeapTuple matching + scan->xs_hitupdesc). This gives the access method + complete freedom to form that tuple from whatever it stored in + currTuples, in whatever on-disk format suits + it. GiST uses this path, because the representation it + stores differs from the indexed value and so could not satisfy the + xs_itupdesc layout directly. 
+ + + + The first path is generic, but useful only to an access method that — + like B-tree — already stores tuples in exactly the indexed-attribute + format; an access method that stores some other representation must take + the second path. The two paths are mutually exclusive: an + amgetbatch access method takes one or the other, never + both. (For historical reasons an amgettuple access + method is allowed to set both scan->xs_itup and + scan->xs_hitup for the same scan — the + heap-tuple form is then used — but that latitude is a legacy quirk + that amgetbatch deliberately does not repeat.) @@ -940,8 +1031,10 @@ amunguardbatch (IndexScanDesc scan, is not even required to use the standard helper indexam_util_unlock_batch to manage it. In practice, though, most or all index AMs will use that helper and hold the simplest - possible interlock: each guarded B-tree or hash batch keeps a single - buffer pin on the one index page the batch came from. See <xref + possible interlock: each guarded B-tree, hash, or GiST batch keeps a single + buffer pin on the one index page the batch came from. (The + virtual nearest-neighbor batches that GiST uses for ordered + scans are not guarded, and hold no such pin.) See <xref linkend="index-locking"/> for details on buffer pin management during index scans. This function will be called at most once for each guarded batch; it is not called when the index AM has already unguarded the batch @@ -985,8 +1078,8 @@ amkillitemsbatch (IndexScanDesc scan, amgetbatch index AMs (those that don't can leave the field set to NULL), but doing so is recommended for performance, as it allows future scans to skip known-dead index entries. - Both core index access methods that currently support - amgetbatch (B-tree and hash) implement + All three core index access methods that currently support + amgetbatch (B-tree, hash, and GiST) implement LP_DEAD marking, though third-party index access methods are free to choose whether to implement this feature. 
The table AM may call tableam_util_scanpos_killitem to mark dead items as @@ -1028,7 +1121,7 @@ amkillitemsbatch (IndexScanDesc scan, VACUUM recycling table TIDs — so it would be unsafe to assume that index entries still point to the same heap/table tuples. Since LP_DEAD marking is only an optimization - hint, it is always safe to skip it. Both B-tree and hash use this + hint, it is always safe to skip it. B-tree, hash, and GiST use this approach. @@ -1067,6 +1160,41 @@ amkillitemsbatch (IndexScanDesc scan, +void +amgettransform (IndexScanDesc scan, + IndexScanBatch batch, + int item); + + Called by the table AM as it returns each matching item + (item is an index into the batch's + items array) of an amgetbatch + scan, to set up the scan's per-tuple output from per-item state that the + access method recorded in the batch. This is needed when that output cannot + be a fixed property of the whole scan. An access method may use it to set + scan->xs_recheck (when the need to recheck the scan + conditions varies from one matching item to the next), to set + xs_orderbyvals and + xs_recheckorderby for an ordered + (nearest-neighbor) scan, and to set scan->xs_hitup for + an index-only scan whose returnable tuple must be reconstructed rather than + returned directly as a stored index tuple. + + + + Implementing amgettransform is optional, and is only + meaningful together with amgetbatch. An access method + need only provide it when some part of its per-tuple output varies from one + matching item to the next. When every such output is instead a fixed + property of the whole scan — or, for index-only scans, is the on-disk + index tuple returned directly via scan->xs_itup — + the field can be left NULL, as B-tree and hash do. GiST + provides one because parts of its per-tuple output (the recheck flag, the + ORDER BY distances, and the reconstructed index-only + tuples) vary per matching item, as described above. 
+ + + + int64 amgetbitmap (IndexScanDesc scan, TIDBitmap *tbm); @@ -1364,8 +1492,26 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); - Note that amgetbatch scans do not currently support - ordering operators. + An amgetbatch access method can support ordering + operators by providing an amgettransform callback: it + records each matching item's ordering values in the batch, and the table AM + calls amgettransform as it returns each item to set + xs_orderbyvals and + xs_recheckorderby from that recorded state. GiST + uses this for nearest-neighbor scans. As with + scan->xs_recheck, these values cannot be set directly as + items are returned. + + + + Scans that use ordering operators are never planned as index-only scans. + Because an ordered scan can collect matching items from many index leaf + pages without retaining a buffer pin on any of them (GiST's + virtual nearest-neighbor batches work this way), it has no + pin to serve as the interlock against concurrent TID recycling that an + index-only scan depends on (see <xref linkend="index-locking"/>). The + planner therefore only builds plain index scan paths for such scans; these + always fetch the heap tuple, and can recheck against it when required. 
diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 3f5be6082..c6990cab5 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -338,6 +338,7 @@ dihandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = NULL, .amendscan = diendscan, .amposreset = NULL, diff --git a/src/test/modules/index/expected/killtuples.out b/src/test/modules/index/expected/killtuples.out index a3db2c409..110c3d445 100644 --- a/src/test/modules/index/expected/killtuples.out +++ b/src/test/modules/index/expected/killtuples.out @@ -152,6 +152,83 @@ f step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; step drop_ext_btree_gist: DROP EXTENSION btree_gist; +starting permutation: create_table fill_500 create_ext_btree_gist create_gist flush disable_seq disable_bitmap measure access_ordered flush result measure access_ordered flush result delete flush measure access_ordered flush result measure access_ordered flush result drop_table drop_ext_btree_gist +step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); +step fill_500: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 500) g(i); +step create_ext_btree_gist: CREATE EXTENSION btree_gist; +step create_gist: CREATE INDEX kill_prior_tuple_gist ON kill_prior_tuple USING gist (key); +step flush: SELECT FROM pg_stat_force_next_flush(); +step disable_seq: SET enable_seqscan = false; +step disable_bitmap: SET enable_bitmapscan = false; +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access_ordered: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; +QUERY PLAN 
+--------------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=500.00 loops=1) + Order By: (key <-> 1) + Index Searches: 1 +(3 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +has_new_heap_accesses +--------------------- +t +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access_ordered: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; +QUERY PLAN +--------------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=500.00 loops=1) + Order By: (key <-> 1) + Index Searches: 1 +(3 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +has_new_heap_accesses +--------------------- +t +(1 row) + +step delete: DELETE FROM kill_prior_tuple; +step flush: SELECT FROM pg_stat_force_next_flush(); +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access_ordered: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; +QUERY PLAN +------------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=0.00 loops=1) + Order By: (key <-> 1) + Index Searches: 1 +(3 rows) + +step 
flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +has_new_heap_accesses +--------------------- +t +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access_ordered: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; +QUERY PLAN +------------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=0.00 loops=1) + Order By: (key <-> 1) + Index Searches: 1 +(3 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +has_new_heap_accesses +--------------------- +t +(1 row) + +step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; +step drop_ext_btree_gist: DROP EXTENSION btree_gist; + starting permutation: create_table fill_10 create_ext_btree_gist create_gist flush disable_seq disable_bitmap measure access flush result measure access flush result delete flush measure access flush result measure access flush result drop_table drop_ext_btree_gist step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); step fill_10: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 10) g(i); @@ -223,7 +300,7 @@ step flush: SELECT FROM pg_stat_force_next_flush(); step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; has_new_heap_accesses --------------------- -t +f (1 
row) step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; diff --git a/src/test/modules/index/specs/killtuples.spec b/src/test/modules/index/specs/killtuples.spec index 3b98ff9f7..f5d2fd773 100644 --- a/src/test/modules/index/specs/killtuples.spec +++ b/src/test/modules/index/specs/killtuples.spec @@ -47,6 +47,9 @@ step result { SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > step access { EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; } +# nearest-neighbor (order-by operator) scan (cannot set LP_DEAD bits) +step access_ordered { EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; } + step delete { DELETE FROM kill_prior_tuple; } step drop_table { DROP TABLE IF EXISTS kill_prior_tuple; } @@ -96,7 +99,20 @@ permutation measure access flush result drop_table drop_ext_btree_gist -# Test gist, but with fewer rows - shows that killitems doesn't work anymore! 
+# GiST doesn't set LP_DEAD bits for ordered scans, so every access re-visits +# the heap +permutation + create_table fill_500 create_ext_btree_gist create_gist flush + disable_seq disable_bitmap + measure access_ordered flush result + measure access_ordered flush result + delete flush + measure access_ordered flush result + measure access_ordered flush result + drop_table drop_ext_btree_gist + +# Test gist with fewer rows, exercising the case where all the dead tuples are +# on a single page permutation create_table fill_10 create_ext_btree_gist create_gist flush disable_seq disable_bitmap diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 55538c4c4..970b857c6 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -475,9 +475,9 @@ SELECT count(*) FROM point_tbl p WHERE p.f1 ~= '(-5, -12)'; EXPLAIN (COSTS OFF) SELECT * FROM point_tbl ORDER BY f1 <-> '0,1'; - QUERY PLAN ----------------------------------------------- - Index Only Scan using gpointind on point_tbl + QUERY PLAN +----------------------------------------- + Index Scan using gpointind on point_tbl Order By: (f1 <-> '(0,1)'::point) (2 rows) @@ -513,9 +513,9 @@ SELECT * FROM point_tbl WHERE f1 IS NULL; EXPLAIN (COSTS OFF) SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1'; - QUERY PLAN ----------------------------------------------- - Index Only Scan using gpointind on point_tbl + QUERY PLAN +----------------------------------------- + Index Scan using gpointind on point_tbl Index Cond: (f1 IS NOT NULL) Order By: (f1 <-> '(0,1)'::point) (3 rows) @@ -539,7 +539,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; QUERY PLAN ------------------------------------------------ - Index Only Scan using gpointind on point_tbl + Index Scan using gpointind on point_tbl Index Cond: (f1 <@ '(10,10),(-10,-10)'::box) Order By: (f1 <-> 
'(0,1)'::point) (3 rows) diff --git a/src/test/regress/expected/create_index_spgist.out b/src/test/regress/expected/create_index_spgist.out index c6beb0efa..ddffca2e7 100644 --- a/src/test/regress/expected/create_index_spgist.out +++ b/src/test/regress/expected/create_index_spgist.out @@ -333,7 +333,7 @@ FROM quad_point_tbl; ---------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(0,0)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_quad_ind on quad_point_tbl + -> Index Scan using sp_quad_ind on quad_point_tbl Order By: (p <-> '(0,0)'::point) (4 rows) @@ -354,7 +354,7 @@ FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; ---------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(0,0)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_quad_ind on quad_point_tbl + -> Index Scan using sp_quad_ind on quad_point_tbl Index Cond: (p <@ '(1000,1000),(200,200)'::box) Order By: (p <-> '(0,0)'::point) (5 rows) @@ -376,7 +376,7 @@ FROM quad_point_tbl WHERE p IS NOT NULL; -------------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(333,400)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_quad_ind on quad_point_tbl + -> Index Scan using sp_quad_ind on quad_point_tbl Index Cond: (p IS NOT NULL) Order By: (p <-> '(333,400)'::point) (5 rows) @@ -503,7 +503,7 @@ FROM kd_point_tbl; ---------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(0,0)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_kd_ind on kd_point_tbl + -> Index Scan using sp_kd_ind on kd_point_tbl Order By: (p <-> '(0,0)'::point) (4 rows) @@ -524,7 +524,7 @@ FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; ---------------------------------------------------------------------------- WindowAgg Window: 
w1 AS (ORDER BY (p <-> '(0,0)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_kd_ind on kd_point_tbl + -> Index Scan using sp_kd_ind on kd_point_tbl Index Cond: (p <@ '(1000,1000),(200,200)'::box) Order By: (p <-> '(0,0)'::point) (5 rows) @@ -546,7 +546,7 @@ FROM kd_point_tbl WHERE p IS NOT NULL; -------------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(333,400)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_kd_ind on kd_point_tbl + -> Index Scan using sp_kd_ind on kd_point_tbl Index Cond: (p IS NOT NULL) Order By: (p <-> '(333,400)'::point) (5 rows) @@ -567,10 +567,10 @@ SET extra_float_digits = 0; CREATE INDEX ON quad_point_tbl_ord_seq1 USING spgist(p) INCLUDE(dist); EXPLAIN (COSTS OFF) SELECT p, dist FROM quad_point_tbl_ord_seq1 ORDER BY p <-> '0,0' LIMIT 10; - QUERY PLAN -------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------- Limit - -> Index Only Scan using quad_point_tbl_ord_seq1_p_dist_idx on quad_point_tbl_ord_seq1 + -> Index Scan using quad_point_tbl_ord_seq1_p_dist_idx on quad_point_tbl_ord_seq1 Order By: (p <-> '(0,0)'::point) (3 rows) diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out index c75bbb23b..810db8b8f 100644 --- a/src/test/regress/expected/gist.out +++ b/src/test/regress/expected/gist.out @@ -74,13 +74,13 @@ select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)); (0.5,0.5) (11 rows) --- Also test an index-only knn-search +-- Also test a knn-search explain (costs off) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) order by p <-> point(0.201, 0.201); - QUERY PLAN --------------------------------------------------------- - Index Only Scan using gist_tbl_point_index on gist_tbl + QUERY PLAN +--------------------------------------------------- + 
Index Scan using gist_tbl_point_index on gist_tbl Index Cond: (p <@ '(0.5,0.5),(0,0)'::box) Order By: (p <-> '(0.201,0.201)'::point) (3 rows) @@ -106,9 +106,9 @@ order by p <-> point(0.201, 0.201); explain (costs off) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) order by point(0.101, 0.101) <-> p; - QUERY PLAN --------------------------------------------------------- - Index Only Scan using gist_tbl_point_index on gist_tbl + QUERY PLAN +--------------------------------------------------- + Index Scan using gist_tbl_point_index on gist_tbl Index Cond: (p <@ '(0.5,0.5),(0,0)'::box) Order By: (p <-> '(0.101,0.101)'::point) (3 rows) @@ -138,12 +138,12 @@ select p from (box(point(0.8,0.8), point(1.0,1.0)))) as v(bb) cross join lateral (select p from gist_tbl where p <@ bb order by p <-> bb[0] limit 2) ss; - QUERY PLAN --------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------- Nested Loop -> Values Scan on "*VALUES*" -> Limit - -> Index Only Scan using gist_tbl_point_index on gist_tbl + -> Index Scan using gist_tbl_point_index on gist_tbl Index Cond: (p <@ "*VALUES*".column1) Order By: (p <-> ("*VALUES*".column1)[0]) (6 rows) @@ -203,13 +203,13 @@ select b from gist_tbl where b <@ box(point(5,5), point(6,6)); (6,6),(6,6) (21 rows) --- Also test an index-only knn-search +-- Also test a knn-search explain (costs off) select b from gist_tbl where b <@ box(point(5,5), point(6,6)) order by b <-> point(5.2, 5.91); - QUERY PLAN ------------------------------------------------------- - Index Only Scan using gist_tbl_box_index on gist_tbl + QUERY PLAN +------------------------------------------------- + Index Scan using gist_tbl_box_index on gist_tbl Index Cond: (b <@ '(6,6),(5,5)'::box) Order By: (b <-> '(5.2,5.91)'::point) (3 rows) @@ -245,9 +245,9 @@ order by b <-> point(5.2, 5.91); explain (costs off) select b from gist_tbl where b <@ box(point(5,5), point(6,6)) 
order by point(5.2, 5.91) <-> b; - QUERY PLAN ------------------------------------------------------- - Index Only Scan using gist_tbl_box_index on gist_tbl + QUERY PLAN +------------------------------------------------- + Index Scan using gist_tbl_box_index on gist_tbl Index Cond: (b <@ '(6,6),(5,5)'::box) Order By: (b <-> '(5.2,5.91)'::point) (3 rows) @@ -373,20 +373,26 @@ select count(*) from gist_tbl; 10001 (1 row) --- This case isn't supported, but it should at least EXPLAIN correctly. +-- An ordering-operator (nearest-neighbor) scan is never planned as an +-- index-only scan, so this lossy-distance case runs as a plain index scan that +-- rechecks the distances against the heap tuple. explain (verbose, costs off) select p from gist_tbl order by circle(p,1) <-> point(0,0) limit 1; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------- Limit Output: p, ((circle(p, '1'::double precision) <-> '(0,0)'::point)) - -> Index Only Scan using gist_tbl_multi_index on public.gist_tbl + -> Index Scan using gist_tbl_multi_index on public.gist_tbl Output: p, (circle(p, '1'::double precision) <-> '(0,0)'::point) - Order By: ((circle(gist_tbl.p, '1'::double precision)) <-> '(0,0)'::point) + Order By: (circle(gist_tbl.p, '1'::double precision) <-> '(0,0)'::point) (5 rows) select p from gist_tbl order by circle(p,1) <-> point(0,0) limit 1; -ERROR: lossy distance functions are not supported in index-only scans + p +------- + (0,0) +(1 row) + -- Force an index build using buffering. 
create index gist_tbl_box_index_forcing_buffering on gist_tbl using gist (p) with (buffering=on, fillfactor=50); diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql index 6f1fc65f1..369eb4576 100644 --- a/src/test/regress/sql/gist.sql +++ b/src/test/regress/sql/gist.sql @@ -65,7 +65,7 @@ select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)); -- execute the same select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)); --- Also test an index-only knn-search +-- Also test a knn-search explain (costs off) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) order by p <-> point(0.201, 0.201); @@ -109,7 +109,7 @@ select b from gist_tbl where b <@ box(point(5,5), point(6,6)); -- execute the same select b from gist_tbl where b <@ box(point(5,5), point(6,6)); --- Also test an index-only knn-search +-- Also test a knn-search explain (costs off) select b from gist_tbl where b <@ box(point(5,5), point(6,6)) order by b <-> point(5.2, 5.91); @@ -164,7 +164,9 @@ explain (verbose, costs off) select count(*) from gist_tbl; select count(*) from gist_tbl; --- This case isn't supported, but it should at least EXPLAIN correctly. +-- An ordering-operator (nearest-neighbor) scan is never planned as an +-- index-only scan, so this lossy-distance case runs as a plain index scan that +-- rechecks the distances against the heap tuple. explain (verbose, costs off) select p from gist_tbl order by circle(p,1) <-> point(0,0) limit 1; select p from gist_tbl order by circle(p,1) <-> point(0,0) limit 1; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 446e68a84..d3ab27607 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1063,6 +1063,8 @@ GBT_NUMKEY_R GBT_VARKEY GBT_VARKEY_R GENERAL_NAME +GISTBatchData +GISTBatchItem GISTBuildBuffers GISTBuildState GISTDeletedPageContents -- 2.53.0