From 08f0567c430bdbee0785051ceb5004abcea76545 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 1 Jun 2026 19:35:47 -0400 Subject: [PATCH v26 4/9] WIP: Adopt amgetbatch interface in GiST index AM. Replace gistgettuple with gistgetbatch, a function that implements the new amgetbatch interface added by commit FIXME. Plain index scans of GiST indexes now return matching items in batches consisting of all of the matches from a given leaf page. This gives the table AM the ability to perform optimizations like index prefetching during GiST index scans. Returned batches hold no buffer pins on index pages, per the amgetbatch interface's contract. This greatly simplifies resource management during index prefetching, where the read stream is expected to hold many pins on heap pages. The amgetbatch interface requires that index AMs take the same standardized approach to pin management for pins that are used to prevent unsafe concurrent TID recycling by VACUUM (that way prefetching can hold open multiple batches without it affecting the read stream). For an ordinary GiST batch this interlock pin is the pin on its single leaf page, held only for as long as the table AM still needs it as an interlock (just like during nbtree and hash scans). Nearest-neighbor (ordered) scans are handled quite differently, because their matches don't naturally arrive one leaf page at a time. Here gistgetbatch instead drains the scan's distance-ordered pairing heap, packing the matching leaf items into a single "virtual" batch in distance order, typically spanning many leaf pages. We're effectively pretending that the matches we found were in useful order, together on the same leaf page -- though that isn't really true. Virtual batches come with restrictions that make the pretense safe: an ordered scan is never planned as an index-only scan, and gistkillitemsbatch does nothing for a virtual batch. 
A virtual batch therefore never holds a TID recycling interlock pin at all; the pin on each underlying leaf page is instead dropped right away, as the page is scanned into the queue. The interlock pin also fixes a pre-existing bug in which GiST index-only scans could return wrong answers [1]. An index-only scan trusts the visibility map instead of fetching the heap tuple, so it must keep VACUUM from recycling a heap TID between the moment it reads an index entry and the moment it consults the visibility map; otherwise it can report indexed values that belong to an unrelated, since-recycled heap tuple. The retained leaf-page buffer pin is that interlock -- but only if VACUUM honors it. gistvacuumpage therefore now acquires a cleanup lock on each leaf page (rather than a plain exclusive lock), so a concurrent scan's pin holds VACUUM off from recycling that page's TIDs until the scan has finished its visibility checks. This same interlock requirement is why ordered scans cannot be index-only: a virtual batch drops each leaf page's pin as soon as the page is scanned, so it has no bounded pin to offer as the recycling interlock that an index-only scan depends on. Rather than work around that (which seems prohibitively complicated), the planner never builds an index-only scan that uses ordering operators; those scans are costed and executed as plain index scans, which fetch and recheck the heap tuple and so were never subject to the bug. gistkillitemsbatch (the GiST implementation of the new amkillitemsbatch interface) performs LP_DEAD marking of dead index entries, following the same approach as the other amgetbatch AMs: it compares the batch's saved LSN against the current leaf page LSN to detect concurrent page modifications. It does nothing for virtual batches, so nearest-neighbor scans don't set LP_DEAD bits -- though that isn't really new. 
GiST does now set LP_DEAD bits during scans that only ever touch a single leaf page, fixing a longstanding oversight in its previous approach, which was unable to mark dead items in that case (see changed output to an existing killtuples.out test case). The gistgetbatch implementation makes use of new batch-related core infrastructure. GiST now registers an amgettransform callback, which sets the scan descriptor's per-tuple recheck flags. It also sets order-by distances, and reconstructs a heap tuple for index-only scans. It is called just before table_index_getnext_slot returns another tuple. Like nbtree, the scan uses a currTuples storage area to store IndexTuple structs in their original on-disk representation. Unlike nbtree, GiST uses amgettransform to convert the representation of the tuples into a heap tuple representation of the underlying indexed type. This scheme also relies on a new facility that allows index AMs to request their own separate dynamically sized area for supplemental metadata (GiST opclasses have the ability to represent that any tuple needs a recheck, so we have to shuttle that information around with the batch). 
[1] https://postgr.es/m/CAH2-Wz=jjiNL9FCh8C1L-GUH15f4WFTWub2x+_NucngcDDcHKw@mail.gmail.com Author: Peter Geoghegan --- src/include/access/amapi.h | 5 + src/include/access/gist_private.h | 64 +- src/include/access/relscan.h | 51 +- src/backend/access/brin/brin.c | 1 + src/backend/access/gin/ginutil.c | 1 + src/backend/access/gist/README | 90 ++- src/backend/access/gist/gist.c | 9 +- src/backend/access/gist/gistget.c | 579 +++++++++++------- src/backend/access/gist/gistscan.c | 43 +- src/backend/access/gist/gistutil.c | 9 +- src/backend/access/gist/gistvacuum.c | 7 +- src/backend/access/hash/hash.c | 1 + src/backend/access/heap/heapam_indexscan.c | 22 +- src/backend/access/index/amapi.c | 1 + src/backend/access/index/genam.c | 1 + src/backend/access/index/indexbatch.c | 20 +- src/backend/access/nbtree/nbtree.c | 1 + src/backend/access/spgist/spgutils.c | 1 + src/backend/executor/nodeIndexonlyscan.c | 12 - src/backend/optimizer/path/indxpath.c | 5 +- contrib/bloom/blutils.c | 1 + contrib/btree_gist/expected/cash.out | 6 +- contrib/btree_gist/expected/date.out | 6 +- contrib/btree_gist/expected/float4.out | 6 +- contrib/btree_gist/expected/float8.out | 2 +- contrib/btree_gist/expected/int2.out | 6 +- contrib/btree_gist/expected/int4.out | 6 +- contrib/btree_gist/expected/int8.out | 2 +- contrib/btree_gist/expected/interval.out | 2 +- contrib/btree_gist/expected/time.out | 2 +- contrib/btree_gist/expected/timestamp.out | 2 +- contrib/btree_gist/expected/timestamptz.out | 2 +- doc/src/sgml/indexam.sgml | 136 +++- .../modules/dummy_index_am/dummy_index_am.c | 1 + .../modules/index/expected/killtuples.out | 79 ++- src/test/modules/index/specs/killtuples.spec | 18 +- src/test/regress/expected/create_index.out | 14 +- .../regress/expected/create_index_spgist.out | 18 +- src/test/regress/expected/gist.out | 52 +- src/test/regress/sql/gist.sql | 8 +- src/tools/pgindent/typedefs.list | 2 + 41 files changed, 875 insertions(+), 419 deletions(-) diff --git 
a/src/include/access/amapi.h b/src/include/access/amapi.h index 02793a115..157c1a8df 100644 --- a/src/include/access/amapi.h +++ b/src/include/access/amapi.h @@ -212,6 +212,10 @@ typedef void (*amunguardbatch_function) (IndexScanDesc scan, typedef void (*amkillitemsbatch_function) (IndexScanDesc scan, IndexScanBatch batch); +/* Set up the scan's xs_hitup output tuple for the given batch item */ +typedef void (*amgettransform_function) (IndexScanDesc scan, + IndexScanBatch batch, int item); + /* fetch all valid tuples */ typedef int64 (*amgetbitmap_function) (IndexScanDesc scan, TIDBitmap *tbm); @@ -326,6 +330,7 @@ typedef struct IndexAmRoutine amgetbatch_function amgetbatch; /* can be NULL */ amunguardbatch_function amunguardbatch; /* can be NULL */ amkillitemsbatch_function amkillitemsbatch; /* can be NULL */ + amgettransform_function amgettransform; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; amposreset_function amposreset; /* can be NULL */ diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 44514f1cb..4225b079e 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -16,6 +16,7 @@ #include "access/amapi.h" #include "access/gist.h" +#include "access/indexbatch.h" #include "access/itup.h" #include "lib/pairingheap.h" #include "storage/bufmgr.h" @@ -120,10 +121,6 @@ typedef struct GISTSearchHeapItem ItemPointerData heapPtr; bool recheck; /* T if quals must be rechecked */ bool recheckDistances; /* T if distances must be rechecked */ - HeapTuple recontup; /* data reconstructed from the index, used in - * index-only scans */ - OffsetNumber offnum; /* track offset in page to mark tuple as - * LP_DEAD */ } GISTSearchHeapItem; /* Unvisited item, either index page or heap tuple */ @@ -148,6 +145,45 @@ typedef struct GISTSearchItem (offsetof(GISTSearchItem, distances) + \ sizeof(IndexOrderByDistance) * (n_distances)) +/* Per-batch data private to 
the GiST index AM */ +typedef struct GISTBatchData +{ + /* leaf page's buffer pin */ + Buffer buf; + /* leaf page's block number (InvalidBlockNumber means "virtual" batch) */ + BlockNumber blkno; +} GISTBatchData; + +/* Access the GiST-private per-batch data from an IndexScanBatch pointer */ +#define GISTBatchGetData(scan, batch) \ + indexam_util_batch_get_amdata(scan, batch, GISTBatchData) + +/* + * Per-item private GiST data. We lay out the index AM's dynamic opaque area + * as an array of these, one per batch item, and subscript it via + * GISTBatchGetItem. + * + * GiST matching is potentially lossy, and the Consistent function's recheck + * flag varies from one item to the next, so every batch item records its own + * qual recheck flag here; gistgettransform reports it as the item's xs_recheck. + */ +typedef struct GISTBatchItem +{ + bool recheck; /* T if quals must be rechecked */ + bool recheckDistances; /* T if distances are lossy lower bounds */ + /* numberOfOrderBys entries */ + IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER]; +} GISTBatchItem; + +#define SizeOfGISTBatchItem(n_distances) \ + (offsetof(GISTBatchItem, distances) + \ + sizeof(IndexOrderByDistance) * (n_distances)) + +#define GISTBatchGetItem(scan, batch, item) \ + (AssertMacro((item) >= 0 && (item) < MaxIndexTuplesPerPage), \ + (GISTBatchItem *) ((char *) index_scan_batch_index_opaque_dyn((scan), (batch)) + \ + (Size) (item) * SizeOfGISTBatchItem((scan)->numberOfOrderBys))) + /* * GISTScanOpaqueData: private state for a scan of a GiST index */ @@ -159,23 +195,9 @@ typedef struct GISTScanOpaqueData pairingheap *queue; /* queue of unvisited items */ MemoryContext queueCxt; /* context holding the queue */ bool qual_ok; /* false if qual can never be satisfied */ - bool firstCall; /* true until first gistgettuple call */ /* pre-allocated workspace arrays */ IndexOrderByDistance *distances; /* output area for gistindex_keytest */ - - /* info about killed items if any (killedItems is NULL if 
never used) */ - OffsetNumber *killedItems; /* offset numbers of killed items */ - int numKilled; /* number of currently stored items */ - BlockNumber curBlkno; /* current number of block */ - GistNSN curPageLSN; /* pos in the WAL stream when page was read */ - - /* In a non-ordered search, returnable heap items are stored here: */ - GISTSearchHeapItem pageData[BLCKSZ / sizeof(IndexTupleData)]; - OffsetNumber nPageData; /* number of valid items in array */ - OffsetNumber curPageData; /* next item to return */ - MemoryContext pageDataCxt; /* context holding the fetched tuples, for - * index-only scans */ } GISTScanOpaqueData; typedef GISTScanOpaqueData *GISTScanOpaque; @@ -458,7 +480,11 @@ extern XLogRecPtr gistXLogSplit(bool page_is_leaf, Buffer leftchildbuf, bool markfollowright); /* gistget.c */ -extern bool gistgettuple(IndexScanDesc scan, ScanDirection dir); +extern void gistkillitemsbatch(IndexScanDesc scan, IndexScanBatch batch); +extern IndexScanBatch gistgetbatch(IndexScanDesc scan, IndexScanBatch priorbatch, + ScanDirection dir); +extern void gistunguardbatch(IndexScanDesc scan, IndexScanBatch batch); +extern void gistgettransform(IndexScanDesc scan, IndexScanBatch batch, int item); extern int64 gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm); extern bool gistcanreturn(Relation index, int attno); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 7d6a5f5d4..db56605b6 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -157,6 +157,7 @@ typedef struct BatchMatchingItem * * [table AM opaque area] <- allocation base, at -(batch_base_offset) * [table AM per-item area] <- supplemental flexible array per-item data + * [index AM dyn opaque] <- optional, dynamically sized * [index AM static opaque] <- at -(batch_index_opaque_static) * [IndexScanBatchData] <- batch pointer, returned by amgetbatch * [items[maxitemsbatch]] @@ -164,12 +165,12 @@ typedef struct BatchMatchingItem * index-only scans 
(batch_tuples_workspace) * * batch_base_offset combines the table AM opaque area (its fixed-size header - * plus its per-item area), and the static index AM opaque area into a single - * offset from the batch pointer to the true allocation base. The - * indexbatch.c utilities pfree a batch by passing pfree a pointer returned by - * index_scan_batch_base. We rely on the assumption that batches have a fixed - * layout for the duration of an index scan (batches are cached for reuse to - * avoid palloc churn). + * plus its per-item area), the optional dynamic index AM opaque area, and the + * static index AM opaque area into a single offset from the batch pointer to + * the true allocation base. The indexbatch.c utilities pfree a batch by + * passing pfree a pointer returned by index_scan_batch_base. We rely on the + * assumption that batches have a fixed layout for the duration of an index + * scan (batches are cached for reuse to avoid palloc churn). * * The table AM accesses its opaque area using the index_scan_batch_table_area * shim accessor. The area is a single contiguous block: a fixed-size header @@ -184,11 +185,22 @@ typedef struct BatchMatchingItem * Bitmap scans involving an amgetbitmap routine that finds it convenient to * reuse batch infrastructure internally never get a table AM opaque area. * - * An index AM gets a mandatory static area (batch_index_opaque_static), which - * has a size known at compile time -- MAXALIGN(sizeof(the AM's struct)) -- - * and is accessed via indexam_util_batch_get_amdata at that fixed offset. - * This is more efficient but less flexible than the table AM scheme: every - * index AM uses the same generic fixed-size header. + * An index AM gets two opaque areas, both before the batch pointer, divided by + * what is known when. 
The mandatory static area (batch_index_opaque_static) + * has a size known at compile time -- MAXALIGN(sizeof(the AM's struct)) -- and + * is accessed via indexam_util_batch_get_amdata at that fixed offset. This is + * more efficient but less flexible than the table AM scheme: every index AM + * uses the same generic fixed-size header. + * + * Index AMs can use a second, optional dynamically-sized private area + * (batch_index_opaque_dyn) that sits just before the static area. Its size + * is chosen at scan start rather than at compile time. It is accessed via + * index_scan_batch_index_opaque_dyn. This second area is generally only used + * during scans where large amounts of supplemental metadata are required, + * more than can reasonably be allocated for every scan. Typically, this is + * granular information about the batch's items for use by the index AM's + * amgettransform routine. Index AMs cannot expect this space to be allocated + * during bitmap index scans. */ typedef struct IndexScanBatchData { @@ -398,6 +410,7 @@ typedef struct IndexScanDescData /* batch size information, set once by index AM in ambeginscan */ uint16 maxitemsbatch; /* size of each batch's items[] array */ uint16 batch_index_opaque_static; /* compile-time opaque size */ + Size batch_index_opaque_dyn; /* optional dynamic opaque size */ uint16 batch_tuples_workspace; /* currTuples workspace size */ /* @@ -549,6 +562,22 @@ index_scan_batch_table_area(IndexScanDescData *scan, IndexScanBatch batch) return index_scan_batch_base(scan, batch); } +/* + * Return a pointer to the index AM's dynamic opaque area. + * + * This optional area (sized batch_index_opaque_dyn) sits immediately before + * the index AM's static opaque area. Core code treats it as a single opaque + * allocation; the index AM alone decides its internal structure. 
+ */ +static inline void * +index_scan_batch_index_opaque_dyn(IndexScanDescData *scan, IndexScanBatch batch) +{ + Assert(scan->batch_index_opaque_dyn > 0); + + return (char *) batch - scan->batch_index_opaque_static - + MAXALIGN(scan->batch_index_opaque_dyn); +} + /* * Advance position to its next item in the batch. * diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 2d9d04aa3..4799a40b7 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -302,6 +302,7 @@ brinhandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = bringetbitmap, .amendscan = brinendscan, .amposreset = NULL, diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 0e8b6a549..ceb9cb447 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -87,6 +87,7 @@ ginhandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = gingetbitmap, .amendscan = ginendscan, .amposreset = NULL, diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README index 75445b074..00b94d9f7 100644 --- a/src/backend/access/gist/README +++ b/src/backend/access/gist/README @@ -48,7 +48,7 @@ The original algorithms were modified in several ways: * They had to be adapted to PostgreSQL conventions. For example, the SEARCH algorithm was considerably changed, because in PostgreSQL the search function - should return one tuple (next), not all tuples at once. Also, it should + returns matching tuples incrementally, not all at once. Also, it should release page locks between calls. * Since we added support for variable length keys, it's not possible to guarantee enough free space for all keys on pages after splitting. User @@ -71,20 +71,24 @@ was not touched in the paper. 
Search Algorithm ---------------- -The search code maintains a queue of unvisited items, where an "item" is -either a heap tuple known to satisfy the search conditions, or an index -page that is consistent with the search conditions according to inspection -of its parent page's downlink item. Initially the root page is searched -to find unvisited items in it. Then we pull items from the queue. A -heap tuple pointer is just returned immediately; an index page entry -causes that page to be searched, generating more queue entries. +The search code maintains a queue of unvisited items. For a plain index +scan an "item" is always an index page that is consistent with the search +conditions according to inspection of its parent page's downlink item; +matching heap tuples are not queued, but are gathered into a batch as each +leaf page is scanned (see "Returning matches in batches", below). For a +nearest-neighbor (ordered) scan the queue additionally holds heap tuples +known to satisfy the search conditions, so that heap tuples and index +pages can be interleaved in distance order. Initially the root page is +added to the queue. Then we pull items from the queue: an index page +entry causes that page to be scanned, generating more queue entries, while +a heap tuple entry (ordered scans only) is a match to be returned. -The queue is kept ordered with heap tuple items at the front, then -index page entries, with any newly-added index page entry inserted -before existing index page entries. This ensures depth-first traversal -of the index, and in particular causes the first few heap tuples to be -returned as soon as possible. That is helpful in case there is a LIMIT -that requires only a few tuples to be produced. +The queue is kept ordered so that we perform a depth-first traversal of +the index: any newly-added index page entry is inserted before existing +index page entries, and (for ordered scans) heap tuple items are kept at +the front. 
This causes the first few matching heap tuples to be returned +as soon as possible, which is helpful in case there is a LIMIT that +requires only a few tuples to be produced. To implement nearest-neighbor search, the queue entries are augmented with distance data: heap tuple entries are labeled with exact distance @@ -94,17 +98,18 @@ queue entries are retrieved in smallest-distance-first order, with entries having identical distances managed as stated in the previous paragraph. -The search algorithm keeps an index page locked only long enough to scan -its entries and queue those that satisfy the search conditions. Since -insertions can occur concurrently with searches, it is possible for an -index child page to be split between the time we make a queue entry for it -(while visiting its parent page) and the time we actually reach and scan -the child page. To avoid missing the entries that were moved to the right -sibling, we detect whether a split has occurred by comparing the child -page's NSN (node sequence number, a special-purpose LSN) to the LSN that -the parent had when visited. If it did, the sibling page is immediately -added to the front of the queue, ensuring that its items will be scanned -in the same order as if they were still on the original child page. +The search algorithm keeps an index page locked only long enough to scan its +entries -- queueing the child pages that satisfy the search conditions, and +gathering any matching heap tuples (into a batch, or onto the queue for an +ordered scan). Since insertions can occur concurrently with searches, it is +possible for an index child page to be split between the time we make a queue +entry for it (while visiting its parent page) and the time we actually reach +and scan the child page. 
To avoid missing the entries that were moved to the +right sibling, we detect whether a split has occurred by comparing the child +page's NSN (node sequence number, a special-purpose LSN) to the LSN that the +parent had when visited. If it did, the sibling page is immediately added to +the front of the queue, ensuring that its items will be scanned in the same +order as if they were still on the original child page. As is usual in Postgres, the search algorithm only guarantees to find index entries that existed before the scan started; index entries added during @@ -116,6 +121,36 @@ Any such enlargement would be to add child items that we aren't interested in returning anyway. +Returning matches in batches +---------------------------- + +GiST implements the amgetbatch index AM interface, whose contract is +documented in doc/src/sgml/indexam.sgml (see also +src/backend/access/nbtree/README). Each call hands the table AM a batch of +matching TIDs rather than a single TID. GiST forms two kinds of batch: + +* A plain (non-ordered) scan returns one "conventional" batch per leaf + page, holding all of that page's matching TIDs in physical order. As in + nbtree and hash, the batch retains the leaf page's buffer pin (though not + its content lock) as the interlock against concurrent TID recycling by + VACUUM. + +* A nearest-neighbor (ordered) scan returns a single "virtual" batch. Its + matches don't arrive one leaf page at a time, so instead we drain the + distance-ordered queue, copying matching TIDs into the batch in distance + order -- typically spanning many leaf pages. A virtual batch retains no + buffer pin; each leaf page's pin is dropped as soon as the page is scanned. + +VACUUM honors a batch's pin by taking a cleanup lock on the leaf page (see +"Bulk delete algorithm (VACUUM)", below), just as nbtree does. 
Because a +virtual batch holds no such pin, ordered scans come with two restrictions, +both also seen in bitmap (amgetbitmap) scans and both explained in +doc/src/sgml/indexam.sgml: they never set LP_DEAD bits (gistkillitemsbatch +does nothing for a virtual batch), and they are never planned as index-only +scans (a virtual batch has no pin to offer as the TID-recycling interlock +that index-only scans depend on). + + Insert Algorithm ---------------- @@ -452,6 +487,11 @@ B-tree VACUUM uses, but because we already have NSNs on pages, to detect page splits during searches, we don't need a "vacuum cycle ID" concept for that like B-tree does. +We take a full cleanup lock on every leaf page as we scan it, even leaf +pages with no deletable tuples. As in nbtree, this is the interlock that +protects concurrent scans from unsafe TID recycling; see "Returning matches in +batches", above. + While we scan all the pages, we also make note of any completely empty leaf pages. We will try to unlink them from the tree after the scan. 
We also record the block numbers of all internal pages; they are needed to locate parents of diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c index 67b16053a..88b8a4ddf 100644 --- a/src/backend/access/gist/gist.c +++ b/src/backend/access/gist/gist.c @@ -103,10 +103,11 @@ gisthandler(PG_FUNCTION_ARGS) .amadjustmembers = gistadjustmembers, .ambeginscan = gistbeginscan, .amrescan = gistrescan, - .amgettuple = gistgettuple, - .amgetbatch = NULL, - .amunguardbatch = NULL, - .amkillitemsbatch = NULL, + .amgettuple = NULL, + .amgetbatch = gistgetbatch, + .amunguardbatch = gistunguardbatch, + .amkillitemsbatch = gistkillitemsbatch, + .amgettransform = gistgettransform, .amgetbitmap = gistgetbitmap, .amendscan = gistendscan, .amposreset = NULL, diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c index 4d7c100d7..dd5ca0bf5 100644 --- a/src/backend/access/gist/gistget.c +++ b/src/backend/access/gist/gistget.c @@ -27,68 +27,78 @@ #include "utils/rel.h" /* - * gistkillitems() -- set LP_DEAD state for items an indexscan caller has - * told us were killed. - * - * We re-read page here, so it's important to check page LSN. If the page - * has been modified since the last read (as determined by LSN), we cannot - * flag any entries because it is possible that the old entry was vacuumed - * away and the TID was re-used by a completely different heap tuple. 
+ * gistkillitemsbatch() -- Mark dead items' index tuples LP_DEAD */ -static void -gistkillitems(IndexScanDesc scan) +void +gistkillitemsbatch(IndexScanDesc scan, IndexScanBatch batch) { - GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTBatchData *gbatch = GISTBatchGetData(scan, batch); + Relation rel = scan->indexRelation; Buffer buffer; Page page; - OffsetNumber offnum; - ItemId iid; - int i; bool killedsomething = false; + XLogRecPtr latestlsn; - Assert(so->curBlkno != InvalidBlockNumber); - Assert(XLogRecPtrIsValid(so->curPageLSN)); - Assert(so->killedItems != NULL); + Assert(batch->numDead > 0); - buffer = ReadBuffer(scan->indexRelation, so->curBlkno); - if (!BufferIsValid(buffer)) + /* + * Skip virtual (ordered-scan) batches, since there's no practical way to + * visit all of the index pages that these tuples really came from + */ + if (gbatch->blkno == InvalidBlockNumber) return; + buffer = ReadBuffer(rel, gbatch->blkno); LockBuffer(buffer, GIST_SHARE); - gistcheckpage(scan->indexRelation, buffer); + gistcheckpage(rel, buffer); page = BufferGetPage(buffer); - /* - * If page LSN differs it means that the page was modified since the last - * read. killedItems could be not valid so LP_DEAD hints applying is not - * safe. - */ - if (BufferGetLSNAtomic(buffer) != so->curPageLSN) - goto unlock; - - Assert(GistPageIsLeaf(page)); - - /* - * Mark all killedItems as dead. We need no additional recheck, because, - * if page was modified, curPageLSN must have changed. - */ - for (i = 0; i < so->numKilled; i++) + latestlsn = BufferGetLSNAtomic(buffer); + Assert(batch->lsn <= latestlsn); + if (batch->lsn != latestlsn) { - if (!killedsomething) - { - /* - * Use the hint bit infrastructure to check if we can update the - * page while just holding a share lock. If we are not allowed, - * there's no point continuing. 
- */ - if (!BufferBeginSetHintBits(buffer)) - goto unlock; - } + /* Modified, give up on hinting */ + UnlockReleaseBuffer(buffer); + return; + } - offnum = so->killedItems[i]; - iid = PageGetItemId(page, offnum); - ItemIdMarkDead(iid); - killedsomething = true; + /* Iterate through batch->deadItems[] in index page order */ + for (int i = 0; i < batch->numDead; i++) + { + int itemIndex = batch->deadItems[i]; + OffsetNumber offnum = batch->items[itemIndex].indexOffset; + ItemId iid = PageGetItemId(page, offnum); + + Assert(itemIndex >= batch->firstItem && itemIndex <= batch->lastItem); + Assert(i == 0 || + offnum > batch->items[batch->deadItems[i - 1]].indexOffset); + Assert(offnum <= PageGetMaxOffsetNumber(page)); + Assert(ItemPointerEquals(&((IndexTuple) PageGetItem(page, iid))->t_tid, + &batch->items[itemIndex].tableTid)); + + /* + * Mark index item as dead, if it isn't already. Since this happens + * while holding a shared buffer lock, it's possible that multiple + * processes attempt to do this simultaneously, leading to multiple + * full-page images being sent to WAL (if wal_log_hints or data + * checksums are enabled), which is undesirable. + */ + if (!ItemIdIsDead(iid)) + { + if (!killedsomething) + { + /* + * Use the hint bit infrastructure to check if we can update + * the page while just holding a share lock. If we are not + * allowed, there's no point continuing. + */ + if (!BufferBeginSetHintBits(buffer)) + goto unlock; + } + + ItemIdMarkDead(iid); + killedsomething = true; + } } if (killedsomething) @@ -99,12 +109,6 @@ gistkillitems(IndexScanDesc scan) unlock: UnlockReleaseBuffer(buffer); - - /* - * Always reset the scan state, so we don't look for same items on other - * pages. 
- */ - so->numKilled = 0; } /* @@ -320,14 +324,29 @@ gistindex_keytest(IndexScanDesc scan, * myDistances: distances array associated with pageItem, or NULL at the root * tbm: if not NULL, gistgetbitmap's output bitmap * ntids: if not NULL, gistgetbitmap's output tuple counter + * newbatch: if not NULL, ordinary (unordered) gistgetbatch scan * - * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and heap - * tuples should be reported directly into the bitmap. If they are NULL, - * we're doing a plain or ordered indexscan. For a plain indexscan, heap - * tuple TIDs are returned into so->pageData[]. For an ordered indexscan, - * heap tuple TIDs are pushed into individual search queue items. In an - * index-only scan, reconstructed index tuples are returned along with the - * TIDs. + * If tbm/ntids aren't NULL, we are doing an amgetbitmap scan, and matching + * heap tuples are reported directly into the bitmap. + * + * Otherwise, if newbatch isn't NULL, we're doing a non-ordered amgetbatch + * scan: matching items' TIDs from a leaf page are stored into caller's + * newbatch to return via gistgetbatch. If we don't save any items in + * newbatch, caller needs to find the next leaf page that has matches and save + * its items in newbatch instead (if there is none then caller should release + * newbatch). + * + * Otherwise (newbatch and tbm both NULL) we're scanning a page for an ordered + * (nearest-neighbor) gistgetbatch scan: matching leaf heap tuples are pushed onto + * the search queue as GISTSearchItems carrying their distances, so the queue + * can later be drained in distance order. The page's buffer pin is dropped + * before returning. This can only happen during batchImmediateUnguard scans, + * which is what makes it safe. Groups of enqueued items will eventually be + * returned (in the expected order) as "virtual batches", but we don't do that + * here. + * + * In all cases, lower index pages are pushed onto the search queue to be + * visited later. 
* * If we detect that the index page has split since we saw its downlink * in the parent, we push its new right sibling onto the queue so the @@ -335,10 +354,10 @@ gistindex_keytest(IndexScanDesc scan, */ static void gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, - IndexOrderByDistance *myDistances, TIDBitmap *tbm, int64 *ntids) + IndexOrderByDistance *myDistances, TIDBitmap *tbm, int64 *ntids, + IndexScanBatch newbatch) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - GISTSTATE *giststate = so->giststate; Relation r = scan->indexRelation; Buffer buffer; Page page; @@ -347,7 +366,12 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, OffsetNumber i; MemoryContext oldcxt; + /* state used when saving matching items into caller's newbatch */ + int itemIndex = 0; + int tupleOffset = 0; + Assert(!GISTSearchItemIsHeap(*pageItem)); + Assert(!newbatch || tbm == NULL); buffer = ReadBuffer(scan->indexRelation, pageItem->blkno); LockBuffer(buffer, GIST_SHARE); @@ -403,18 +427,6 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, return; } - so->nPageData = so->curPageData = 0; - scan->xs_hitup = NULL; /* might point into pageDataCxt */ - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); - - /* - * We save the LSN of the page as we read it, so that we know whether it - * is safe to apply LP_DEAD hints to the page later. This allows us to - * drop the pin for MVCC scans, which allows vacuum to avoid blocking. 
- */ - so->curPageLSN = BufferGetLSNAtomic(buffer); - /* * check all tuples on page */ @@ -464,24 +476,24 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, else if (scan->numberOfOrderBys == 0 && GistPageIsLeaf(page)) { /* - * Non-ordered scan, so report tuples in so->pageData[] + * unordered amgetbatch scan, so just store another matching item + * in caller's batch without worrying about ordering */ - so->pageData[so->nPageData].heapPtr = it->t_tid; - so->pageData[so->nPageData].recheck = recheck; - so->pageData[so->nPageData].offnum = i; + newbatch->items[itemIndex].tableTid = it->t_tid; + newbatch->items[itemIndex].indexOffset = i; + newbatch->items[itemIndex].tupleOffset = 0; + GISTBatchGetItem(scan, newbatch, itemIndex)->recheck = recheck; - /* - * In an index-only scan, also fetch the data from the tuple. The - * reconstructed tuples are stored in pageDataCxt. - */ if (scan->xs_want_itup) { - oldcxt = MemoryContextSwitchTo(so->pageDataCxt); - so->pageData[so->nPageData].recontup = - gistFetchTuple(giststate, r, it); - MemoryContextSwitchTo(oldcxt); + /* Copy on-disk format index tuple into currTuples */ + Size itupsz = IndexTupleSize(it); + + newbatch->items[itemIndex].tupleOffset = tupleOffset; + memcpy(newbatch->currTuples + tupleOffset, it, itupsz); + tupleOffset += MAXALIGN(itupsz); } - so->nPageData++; + itemIndex++; } else { @@ -500,17 +512,15 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, if (GistPageIsLeaf(page)) { - /* Creating heap-tuple GISTSearchItem */ + /* Creating heap-tuple GISTSearchItem for ordered search */ + Assert(scan->numberOfOrderBys > 0); + Assert(newbatch == NULL); + Assert(scan->batchImmediateUnguard); + item->blkno = InvalidBlockNumber; item->data.heap.heapPtr = it->t_tid; item->data.heap.recheck = recheck; item->data.heap.recheckDistances = recheck_distances; - - /* - * In an index-only scan, also fetch the data from the tuple. 
- */ - if (scan->xs_want_itup) - item->data.heap.recontup = gistFetchTuple(giststate, r, it); } else { @@ -535,6 +545,31 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, } } + if (newbatch) + { + /* Finalize result batch during a non-ordered amgetbatch scan */ + Assert(scan->numberOfOrderBys == 0 && tbm == NULL); + + newbatch->dir = ForwardScanDirection; + newbatch->firstItem = 0; + newbatch->lastItem = itemIndex - 1; + + if (itemIndex > 0) + { + GISTBatchData *gnewbatch; + + Assert(GistPageIsLeaf(page)); + + gnewbatch = GISTBatchGetData(scan, newbatch); + gnewbatch->buf = buffer; + gnewbatch->blkno = BufferGetBlockNumber(buffer); + + indexam_util_unlock_batch(scan, newbatch, buffer); + return; + } + /* else caller needs to find another page to fill newbatch */ + } + UnlockReleaseBuffer(buffer); } @@ -563,22 +598,29 @@ getNextGISTSearchItem(GISTScanOpaque so) } /* - * Fetch next heap tuple in an ordered search + * gistgetbatch_ordered() -- drain the queue into caller's newbatch in + * distance order + * + * Helper for gistgetbatch's ordered (nearest-neighbor) path. The pairing-heap + * queue (so->queue) holds both unvisited index pages and matching leaf heap + * tuples, ordered by (lower-bound) distance. We pop items in that order, + * dispatching on the item type. A popped heap tuple is appended to the + * batch. We stop once the batch is full (maxitemsbatch items) or the queue + * is exhausted, leaving any remaining items queued for the next call. + * + * Because the queue is drained in nondecreasing distance order across the whole + * scan (a downlink's distance is a lower bound on its subtree, so items pushed + * while scanning a page never sort ahead of items already popped), the + * batches we emit are globally distance-ordered. 
*/ -static bool -getNextNearest(IndexScanDesc scan) +static IndexScanBatch +gistgetbatch_ordered(IndexScanDesc scan, IndexScanBatch newbatch) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; - bool res = false; + GISTBatchData *gnewbatch; + int nitems = 0; - if (scan->xs_hitup) - { - /* free previously returned tuple */ - pfree(scan->xs_hitup); - scan->xs_hitup = NULL; - } - - do + for (;;) { GISTSearchItem *item = getNextGISTSearchItem(so); @@ -588,162 +630,233 @@ getNextNearest(IndexScanDesc scan) if (GISTSearchItemIsHeap(*item)) { /* found a heap item at currently minimal distance */ - scan->xs_heaptid = item->data.heap.heapPtr; - scan->xs_recheck = item->data.heap.recheck; + GISTBatchItem *bitem = GISTBatchGetItem(scan, newbatch, nitems); - index_store_float8_orderby_distances(scan, so->orderByTypes, - item->distances, - item->data.heap.recheckDistances); + newbatch->items[nitems].tableTid = item->data.heap.heapPtr; + newbatch->items[nitems].indexOffset = -1; /* meaningless here */ + newbatch->items[nitems].tupleOffset = 0; - /* in an index-only scan, also return the reconstructed tuple. 
*/ - if (scan->xs_want_itup) - scan->xs_hitup = item->data.heap.recontup; - res = true; + bitem->recheck = item->data.heap.recheck; + bitem->recheckDistances = item->data.heap.recheckDistances; + memcpy(bitem->distances, item->distances, + sizeof(item->distances[0]) * scan->numberOfOrderBys); + + nitems++; + pfree(item); + + if (nitems == scan->maxitemsbatch) + break; /* batch full; remaining items stay queued */ } else { /* visit an index page, extract its items into queue */ CHECK_FOR_INTERRUPTS(); - gistScanPage(scan, item, item->distances, NULL, NULL); + gistScanPage(scan, item, item->distances, NULL, NULL, NULL); + pfree(item); } + } - pfree(item); - } while (!res); + if (nitems == 0) + { + /* No matching items remain: the scan is exhausted */ + indexam_util_release_batch(scan, newbatch); + return NULL; + } - return res; + /* + * An ordered batch is "virtual": its items come from many leaf pages, + * whose pins gistScanPage already dropped, so it holds no TID recycling + * interlock. It has no single originating page, and we don't track those + * index pages in any case (gistkillitemsbatch will just skip it). 
+ */ + Assert(!newbatch->isGuarded); + + newbatch->dir = ForwardScanDirection; + newbatch->firstItem = 0; + newbatch->lastItem = nitems - 1; + + gnewbatch = GISTBatchGetData(scan, newbatch); + gnewbatch->buf = InvalidBuffer; + gnewbatch->blkno = InvalidBlockNumber; + + return newbatch; } /* - * gistgettuple() -- Get the next tuple in the scan + * gistgetbatch() -- Get the first or next batch of items in a scan */ -bool -gistgettuple(IndexScanDesc scan, ScanDirection dir) +IndexScanBatch +gistgetbatch(IndexScanDesc scan, IndexScanBatch priorbatch, ScanDirection dir) { GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + bool firstCall = (priorbatch == NULL); + IndexScanBatch newbatch; + GISTSearchItem fakeItem; if (dir != ForwardScanDirection) elog(ERROR, "GiST only supports forward scan direction"); if (!so->qual_ok) - return false; + return NULL; - if (so->firstCall) + /* Allocate a batch to pass to gistScanPage or gistgetbatch_ordered */ + newbatch = indexam_util_alloc_batch(scan); + + if (firstCall) { - /* Begin the scan by processing the root page */ - GISTSearchItem fakeItem; - + /* Begin the scan by preparing to process the root page */ pgstat_count_index_scan(scan->indexRelation); if (scan->instrument) scan->instrument->nsearches++; - so->firstCall = false; - so->curPageData = so->nPageData = 0; - scan->xs_hitup = NULL; - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); + Assert(scan->xs_hitup == NULL); + /* gistScanPage root page call setup */ fakeItem.blkno = GIST_ROOT_BLKNO; memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); - gistScanPage(scan, &fakeItem, NULL, NULL, NULL); } + /* + * Ordered (nearest-neighbor) scan, which returns "virtual" batches + */ if (scan->numberOfOrderBys > 0) { - /* Must fetch tuples in strict distance order */ - return getNextNearest(scan); - } - else - { - /* Fetch tuples index-page-at-a-time */ - for (;;) + if (firstCall) { - if (so->curPageData < so->nPageData) - { - if (scan->kill_prior_tuple && 
so->curPageData > 0) - { - - if (so->killedItems == NULL) - { - MemoryContext oldCxt = - MemoryContextSwitchTo(so->giststate->scanCxt); - - so->killedItems = - (OffsetNumber *) palloc(MaxIndexTuplesPerPage - * sizeof(OffsetNumber)); - - MemoryContextSwitchTo(oldCxt); - } - if (so->numKilled < MaxIndexTuplesPerPage) - so->killedItems[so->numKilled++] = - so->pageData[so->curPageData - 1].offnum; - } - /* continuing to return tuples from a leaf page */ - scan->xs_heaptid = so->pageData[so->curPageData].heapPtr; - scan->xs_recheck = so->pageData[so->curPageData].recheck; - - /* in an index-only scan, also return the reconstructed tuple */ - if (scan->xs_want_itup) - scan->xs_hitup = so->pageData[so->curPageData].recontup; - - so->curPageData++; - - return true; - } - - /* - * Check the last returned tuple and add it to killedItems if - * necessary - */ - if (scan->kill_prior_tuple - && so->curPageData > 0 - && so->curPageData == so->nPageData) - { - - if (so->killedItems == NULL) - { - MemoryContext oldCxt = - MemoryContextSwitchTo(so->giststate->scanCxt); - - so->killedItems = - (OffsetNumber *) palloc(MaxIndexTuplesPerPage - * sizeof(OffsetNumber)); - - MemoryContextSwitchTo(oldCxt); - } - if (so->numKilled < MaxIndexTuplesPerPage) - so->killedItems[so->numKilled++] = - so->pageData[so->curPageData - 1].offnum; - } - /* find and process the next index page */ - do - { - GISTSearchItem *item; - - if ((so->curBlkno != InvalidBlockNumber) && (so->numKilled > 0)) - gistkillitems(scan); - - item = getNextGISTSearchItem(so); - - if (!item) - return false; - - CHECK_FOR_INTERRUPTS(); - - /* save current item BlockNumber for next gistkillitems() call */ - so->curBlkno = item->blkno; - - /* - * While scanning a leaf page, ItemPointers of matching heap - * tuples are stored in so->pageData. If there are any on - * this page, we fall out of the inner "do" and loop around to - * return them. 
- */ - gistScanPage(scan, item, item->distances, NULL, NULL); - - pfree(item); - } while (so->nPageData == 0); + /* Actually process the root page */ + gistScanPage(scan, &fakeItem, NULL, NULL, NULL, NULL); } + + /* else save matches below the root into newbatch */ + return gistgetbatch_ordered(scan, newbatch); + } + + /* + * Non-ordered scan, which returns guardable batches in the order that the + * scan visits their leaf pages in + */ + if (firstCall) + { + /* Actually process the root page */ + gistScanPage(scan, &fakeItem, NULL, NULL, NULL, newbatch); + if (newbatch->firstItem <= newbatch->lastItem) + return newbatch; + } + /* else save matches below the root into newbatch */ + + for (;;) + { + GISTSearchItem *item = getNextGISTSearchItem(so); + + if (item == NULL) + { + /* No more index pages to scan; the scan is exhausted */ + indexam_util_release_batch(scan, newbatch); + return NULL; + } + + CHECK_FOR_INTERRUPTS(); + + /* Scan this queued index page; matching leaf items go into the batch */ + gistScanPage(scan, item, item->distances, NULL, NULL, newbatch); + pfree(item); + + /* If this leaf page produced matching items, return the batch */ + if (newbatch->firstItem <= newbatch->lastItem) + return newbatch; + } + + pg_unreachable(); + + return NULL; +} + +/* + * gistunguardbatch() -- Drop a batch's TID recycling interlock (buffer pin) + * + * Called by the table AM when it's safe to drop the buffer pin held to + * prevent concurrent TID recycling by VACUUM. + */ +void +gistunguardbatch(IndexScanDesc scan, IndexScanBatch batch) +{ + GISTBatchData *gbatch = GISTBatchGetData(scan, batch); + + /* Should be called exactly once iff !batchImmediateUnguard */ + Assert(!scan->batchImmediateUnguard); + Assert(batch->isGuarded); + + ReleaseBuffer(gbatch->buf); +} + +/* + * gistgettransform() -- Set up the scan's per-tuple output for one batch item + * + * Implements the amgettransform interface. 
The table AM calls this as it + * returns each item of a GiST scan, to set the scan descriptor's per-tuple + * output from the item's per-item data. + * + * - We always apply the item's qual recheck flag to scan->xs_recheck. + * - For ordered scans, we report the item's own ORDER BY distances (stored in + * the per-item index AM area by gistgetbatch_ordered) as xs_orderbyvals. + * They are flagged for recheck only when the distance function was lossy for + * that item; an exact distance is reported as final, while a lossy lower + * bound is rechecked by the executor's reorder queue to recompute the true + * order. + * - For index-only scans, we reconstruct the originally indexed values from + * the stored on-disk index tuple into a heap tuple, exposed as xs_hitup. + * + * The reconstructed tuple lives in the scan's memory context and only needs to + * outlive a single table_index_getnext_slot call (the executor copies it into + * the scan slot). We free the previously returned tuple before building the + * next one. 
+ */ +void +gistgettransform(IndexScanDesc scan, IndexScanBatch batch, int item) +{ + GISTScanOpaque so = (GISTScanOpaque) scan->opaque; + GISTBatchItem *bitem = GISTBatchGetItem(scan, batch, item); + + Assert(item >= batch->firstItem && item <= batch->lastItem); + + /* Apply this item's qual recheck flag */ + scan->xs_recheck = bitem->recheck; + + /* Index-only scan (can't be ordered) */ + if (scan->xs_want_itup) + { + /* Reconstruct a returnable heap tuple from stashed index tuple */ + IndexTuple itup = (IndexTuple) (batch->currTuples + + batch->items[item].tupleOffset); + MemoryContext oldcxt; + + Assert(scan->numberOfOrderBys == 0); + + if (scan->xs_hitup) + { + pfree(scan->xs_hitup); + scan->xs_hitup = NULL; + } + + /* reconstruct the originally indexed values as a heap tuple */ + oldcxt = MemoryContextSwitchTo(so->giststate->scanCxt); + scan->xs_hitup = gistFetchTuple(so->giststate, scan->indexRelation, itup); + MemoryContextSwitchTo(oldcxt); + } + + /* Ordered scan (must be a plain index scan) */ + else if (scan->numberOfOrderBys > 0) + { + /* + * Note: This is a "virtual" batch. The items from caller's batch + * were stored in the batch in distance order by gistgetbatch_ordered, + * right before gistgetbatch returned it. 
+ */ + Assert(GISTBatchGetData(scan, batch)->blkno == InvalidBlockNumber); + index_store_float8_orderby_distances(scan, so->orderByTypes, + bitem->distances, + bitem->recheckDistances); } } @@ -765,19 +878,10 @@ gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) scan->instrument->nsearches++; /* Begin the scan by processing the root page */ - so->curPageData = so->nPageData = 0; - scan->xs_hitup = NULL; - if (so->pageDataCxt) - MemoryContextReset(so->pageDataCxt); - fakeItem.blkno = GIST_ROOT_BLKNO; memset(&fakeItem.data.parentlsn, 0, sizeof(GistNSN)); - gistScanPage(scan, &fakeItem, NULL, tbm, &ntids); + gistScanPage(scan, &fakeItem, NULL, tbm, &ntids, NULL); - /* - * While scanning a leaf page, ItemPointers of matching heap tuples will - * be stored directly into tbm, so we don't need to deal with them here. - */ for (;;) { GISTSearchItem *item = getNextGISTSearchItem(so); @@ -787,7 +891,8 @@ gistgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) CHECK_FOR_INTERRUPTS(); - gistScanPage(scan, item, item->distances, tbm, &ntids); + /* Scan this queued index page; matching leaf items go into tbm */ + gistScanPage(scan, item, item->distances, tbm, &ntids, NULL); pfree(item); } diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c index c65f93abd..9fdc03bd3 100644 --- a/src/backend/access/gist/gistscan.c +++ b/src/backend/access/gist/gistscan.c @@ -104,12 +104,32 @@ gistbeginscan(Relation r, int nkeys, int norderbys) scan->xs_orderbyvals = palloc0_array(Datum, scan->numberOfOrderBys); scan->xs_orderbynulls = palloc_array(bool, scan->numberOfOrderBys); memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys); - } - so->killedItems = NULL; /* until needed */ - so->numKilled = 0; - so->curBlkno = InvalidBlockNumber; - so->curPageLSN = InvalidXLogRecPtr; + /* + * Ordered scans fill a "virtual" batch by draining the + * distance-ordered queue, so the batch size is a tuning knob with no + * natural value. 
Testing has shown that a very small size will + * increase per-batch overhead (and likely instruction-cache misses), + * while a large size (such as MaxIndexTuplesPerPage) risks producing + * many tuples that a LIMIT node never consumes. This maxitemsbatch + * is a compromise. + */ + scan->maxitemsbatch = MaxIndexTuplesPerPage / 32; + } + else + scan->maxitemsbatch = MaxIndexTuplesPerPage; + + scan->batch_index_opaque_static = MAXALIGN(sizeof(GISTBatchData)); + + /* + * Use second opaque area for our per-item GISTBatchItem array + * + * XXX Most scans only need to store a recheck flag. We could save memory + * in that common case by just asking for an array of booleans here. + */ + scan->batch_index_opaque_dyn = + SizeOfGISTBatchItem(scan->numberOfOrderBys) * scan->maxitemsbatch; + scan->batch_tuples_workspace = BLCKSZ; scan->opaque = so; @@ -168,8 +188,7 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, /* * If we're doing an index-only scan, on the first call, also initialize a - * tuple descriptor to represent the returned index tuples and create a - * memory context to hold them during the scan. + * tuple descriptor to represent the returned index tuples. 
*/ if (scan->xs_want_itup && !scan->xs_hitupdesc) { @@ -203,11 +222,6 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, } TupleDescFinalize(so->giststate->fetchTupdesc); scan->xs_hitupdesc = so->giststate->fetchTupdesc; - - /* Also create a memory context that will hold the returned tuples */ - so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, - "GiST page data context", - ALLOCSET_DEFAULT_SIZES); } /* create new, empty pairing heap for search queue */ @@ -215,8 +229,6 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); MemoryContextSwitchTo(oldCxt); - so->firstCall = true; - /* Update scan key, if a new one is given */ if (key && scan->numberOfKeys > 0) { @@ -340,7 +352,8 @@ gistrescan(IndexScanDesc scan, ScanKey key, int nkeys, pfree(fn_extras); } - /* any previous xs_hitup will have been pfree'd in context resets above */ + if (scan->xs_hitup) + pfree(scan->xs_hitup); scan->xs_hitup = NULL; } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 0f58f6187..a687718e7 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -23,6 +23,7 @@ #include "utils/float.h" #include "utils/fmgrprotos.h" #include "utils/lsyscache.h" +#include "utils/memutils.h" #include "utils/rel.h" #include "utils/snapmgr.h" #include "utils/syscache.h" @@ -670,6 +671,7 @@ gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) Datum fetchatt[INDEX_MAX_KEYS]; bool isnull[INDEX_MAX_KEYS]; int i; + HeapTuple htup; for (i = 0; i < IndexRelationGetNumberOfKeyAttributes(r); i++) { @@ -717,7 +719,12 @@ gistFetchTuple(GISTSTATE *giststate, Relation r, IndexTuple tuple) } MemoryContextSwitchTo(oldcxt); - return heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); + htup = heap_form_tuple(giststate->fetchTupdesc, fetchatt, isnull); + + /* cleanup */ + MemoryContextReset(giststate->tempCxt); + + return htup; } 
float diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 686a04180..ad0b7a71f 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -326,10 +326,11 @@ restart: recurse_to = InvalidBlockNumber; /* - * We are not going to stay here for a long time, aggressively grab an - * exclusive lock. + * Get a full cleanup lock on this page. We must get such a lock on every + * leaf page over the course of the vacuum scan, whether or not it + * actually contains any deletable tuples. */ - LockBuffer(buffer, GIST_EXCLUSIVE); + LockBufferForCleanup(buffer); page = BufferGetPage(buffer); if (gistPageRecyclable(page)) diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index a97a068cf..4b08b285d 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -118,6 +118,7 @@ hashhandler(PG_FUNCTION_ARGS) .amgetbatch = hashgetbatch, .amunguardbatch = hashunguardbatch, .amkillitemsbatch = hashkillitemsbatch, + .amgettransform = NULL, .amgetbitmap = hashgetbitmap, .amendscan = hashendscan, .amposreset = NULL, diff --git a/src/backend/access/heap/heapam_indexscan.c b/src/backend/access/heap/heapam_indexscan.c index e137ec1c6..75653d1a0 100644 --- a/src/backend/access/heap/heapam_indexscan.c +++ b/src/backend/access/heap/heapam_indexscan.c @@ -997,11 +997,23 @@ heapam_index_return_scanpos_tid(IndexScanDesc scan, IndexFetchHeapData *hscan, BatchRingItemPos *scanPos, bool *all_visible) { + amgettransform_function amgettransform = + scan->indexRelation->rd_indam->amgettransform; HeapBatchData *hbatch; /* Set xs_heaptid, which caller (and core executor) will need */ scan->xs_heaptid = scanBatch->items[scanPos->item].tableTid; + /* + * Let the index AM set this item's per-tuple output. 
An AM that provides + * amgettransform uses it to set the item's qual recheck flag + * (scan->xs_recheck), an ordered scan's ORDER BY distances + * (xs_orderbyvals/xs_recheckorderby), and an index-only scan's returnable + * tuple (xs_hitup). + */ + if (amgettransform != NULL) + amgettransform(scan, scanBatch, scanPos->item); + if (all_visible == NULL) { /* @@ -1014,8 +1026,14 @@ heapam_index_return_scanpos_tid(IndexScanDesc scan, IndexFetchHeapData *hscan, /* Index-only scan */ Assert(scan->xs_want_itup); - scan->xs_itup = (IndexTuple) (scanBatch->currTuples + - scanBatch->items[scanPos->item].tupleOffset); + /* + * Unless the index AM already produced the returnable tuple via + * amgettransform above (in xs_hitup), set the original index tuple that + * amgetbatch stored in currTuples in xs_itup. + */ + if (amgettransform == NULL) + scan->xs_itup = (IndexTuple) (scanBatch->currTuples + + scanBatch->items[scanPos->item].tupleOffset); /* * Set visibility info for the current scanPos item (plus possibly some diff --git a/src/backend/access/index/amapi.c b/src/backend/access/index/amapi.c index d4adbbeb2..9886f49ff 100644 --- a/src/backend/access/index/amapi.c +++ b/src/backend/access/index/amapi.c @@ -58,6 +58,7 @@ GetIndexAmRoutine(Oid amhandler) /* Assert that AM doesn't have an invalid combination of callbacks */ Assert((routine->amgetbatch != NULL) == (routine->amunguardbatch != NULL)); Assert(routine->amkillitemsbatch == NULL || routine->amgetbatch != NULL); + Assert(routine->amgettransform == NULL || routine->amgetbatch != NULL); Assert(routine->amgetbatch != NULL || routine->amposreset == NULL); return routine; diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 30af4e412..220f23e57 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -131,6 +131,7 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys) scan->xs_getnext_slot = NULL; scan->batch_index_opaque_static = 0; + 
scan->batch_index_opaque_dyn = 0; scan->batch_tuples_workspace = 0; scan->batch_opaque_size = 0; scan->batch_per_item_size = 0; diff --git a/src/backend/access/index/indexbatch.c b/src/backend/access/index/indexbatch.c index 75b9bb45f..793fb2687 100644 --- a/src/backend/access/index/indexbatch.c +++ b/src/backend/access/index/indexbatch.c @@ -593,13 +593,15 @@ indexam_util_alloc_batch(IndexScanDesc scan) * * This combines the table AM opaque area (its fixed-size header plus * any per-item area used during index-only scans) and the index AM's - * static opaque area into a single offset used to find the true - * allocation base from the batch pointer. (This is also where the - * fixed-size table AM opaque area can be found.) + * opaque areas (static plus the optional dynamic) into a single + * offset used to find the true allocation base from the batch + * pointer. (This is also where the fixed-size table AM opaque area + * can be found.) */ if (scan->batch_base_offset == 0) { Size table_area = 0; + Size index_dyn_area; /* * The table AM opaque area is a single contiguous block: a @@ -619,8 +621,18 @@ indexam_util_alloc_batch(IndexScanDesc scan) (Size) scan->batch_per_item_size * scan->maxitemsbatch); + /* + * The optional dynamic index AM opaque area + * (batch_index_opaque_dyn bytes) sits between the table AM area + * and the static index AM opaque area + */ + index_dyn_area = MAXALIGN(scan->batch_index_opaque_dyn); - scan->batch_base_offset = table_area + + /* + * index_dyn_area is allowed to be very large, so we're careful to + * not let it overflow + */ + scan->batch_base_offset = table_area + index_dyn_area + scan->batch_index_opaque_static; } diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 3d334484d..cd2b2cdce 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -166,6 +166,7 @@ bthandler(PG_FUNCTION_ARGS) .amgetbatch = btgetbatch, .amunguardbatch = btunguardbatch, 
.amkillitemsbatch = btkillitemsbatch, + .amgettransform = NULL, .amgetbitmap = btgetbitmap, .amendscan = btendscan, .amposreset = btposreset, diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 745435da3..47153b4b0 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -92,6 +92,7 @@ spghandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = spggetbitmap, .amendscan = spgendscan, .amposreset = NULL, diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index eb4c8ca5a..a29c911cc 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -132,18 +132,6 @@ IndexOnlyNext(IndexOnlyScanState *node) } } - /* - * We don't currently support rechecking ORDER BY distances. (In - * principle, if the index can support retrieval of the originally - * indexed value, it should be able to produce an exact distance - * calculation too. So it's not clear that adding code here for - * recheck/re-sort would be worth the trouble. But we should at least - * throw an error if someone tries it.) - */ - if (scandesc->numberOfOrderBys > 0 && scandesc->xs_recheckorderby) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("lossy distance functions are not supported in index-only scans"))); return slot; } diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 94fedf32c..624b6d0f8 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -951,9 +951,12 @@ build_index_paths(PlannerInfo *root, RelOptInfo *rel, /* * 3. Check if an index-only scan is possible. If we're not building * plain indexscans, this isn't relevant since bitmap scans don't support - * index data retrieval anyway. + * index data retrieval anyway. 
If there are ordering operators then we + * assume that an index-only scan is unsafe due to the difficulty with + * holding index page pins sufficient to avoid concurrent TID recycling. */ index_only_scan = (scantype != ST_BITMAPSCAN && + orderbyclauses == NIL && check_index_only(rel, index)); /* diff --git a/contrib/bloom/blutils.c b/contrib/bloom/blutils.c index 249af48e6..168842bc7 100644 --- a/contrib/bloom/blutils.c +++ b/contrib/bloom/blutils.c @@ -150,6 +150,7 @@ blhandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = blgetbitmap, .amendscan = blendscan, .amposreset = NULL, diff --git a/contrib/btree_gist/expected/cash.out b/contrib/btree_gist/expected/cash.out index 7fbc73559..56fd1eb49 100644 --- a/contrib/btree_gist/expected/cash.out +++ b/contrib/btree_gist/expected/cash.out @@ -74,10 +74,10 @@ SELECT count(*) FROM moneytmp WHERE a > '22649.64'::money; EXPLAIN (COSTS OFF) SELECT a, a <-> '21472.79' FROM moneytmp ORDER BY a <-> '21472.79' LIMIT 3; - QUERY PLAN --------------------------------------------------- + QUERY PLAN +----------------------------------------------- Limit - -> Index Only Scan using moneyidx on moneytmp + -> Index Scan using moneyidx on moneytmp Order By: (a <-> '$21,472.79'::money) (3 rows) diff --git a/contrib/btree_gist/expected/date.out b/contrib/btree_gist/expected/date.out index 5db864bb8..4a360bea6 100644 --- a/contrib/btree_gist/expected/date.out +++ b/contrib/btree_gist/expected/date.out @@ -74,10 +74,10 @@ SELECT count(*) FROM datetmp WHERE a > '2001-02-13'::date; EXPLAIN (COSTS OFF) SELECT a, a <-> '2001-02-13' FROM datetmp ORDER BY a <-> '2001-02-13' LIMIT 3; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +---------------------------------------------- Limit - -> Index Only Scan using dateidx on datetmp + -> Index Scan using dateidx on datetmp Order By: (a <-> '02-13-2001'::date) (3 rows) diff --git 
a/contrib/btree_gist/expected/float4.out b/contrib/btree_gist/expected/float4.out index dfe732049..8878a317c 100644 --- a/contrib/btree_gist/expected/float4.out +++ b/contrib/btree_gist/expected/float4.out @@ -74,10 +74,10 @@ SELECT count(*) FROM float4tmp WHERE a > -179.0::float4; EXPLAIN (COSTS OFF) SELECT a, a <-> '-179.0' FROM float4tmp ORDER BY a <-> '-179.0' LIMIT 3; - QUERY PLAN ----------------------------------------------------- + QUERY PLAN +----------------------------------------------- Limit - -> Index Only Scan using float4idx on float4tmp + -> Index Scan using float4idx on float4tmp Order By: (a <-> '-179'::real) (3 rows) diff --git a/contrib/btree_gist/expected/float8.out b/contrib/btree_gist/expected/float8.out index ebd0ef3d6..763091b5c 100644 --- a/contrib/btree_gist/expected/float8.out +++ b/contrib/btree_gist/expected/float8.out @@ -77,7 +77,7 @@ SELECT a, a <-> '-1890.0' FROM float8tmp ORDER BY a <-> '-1890.0' LIMIT 3; QUERY PLAN ----------------------------------------------------- Limit - -> Index Only Scan using float8idx on float8tmp + -> Index Scan using float8idx on float8tmp Order By: (a <-> '-1890'::double precision) (3 rows) diff --git a/contrib/btree_gist/expected/int2.out b/contrib/btree_gist/expected/int2.out index 50a332939..245fa4be6 100644 --- a/contrib/btree_gist/expected/int2.out +++ b/contrib/btree_gist/expected/int2.out @@ -74,10 +74,10 @@ SELECT count(*) FROM int2tmp WHERE a > 237::int2; EXPLAIN (COSTS OFF) SELECT a, a <-> '237' FROM int2tmp ORDER BY a <-> '237' LIMIT 3; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +------------------------------------------- Limit - -> Index Only Scan using int2idx on int2tmp + -> Index Scan using int2idx on int2tmp Order By: (a <-> '237'::smallint) (3 rows) diff --git a/contrib/btree_gist/expected/int4.out b/contrib/btree_gist/expected/int4.out index 6bbdc7c3f..41bed1f6e 100644 --- a/contrib/btree_gist/expected/int4.out +++ 
b/contrib/btree_gist/expected/int4.out @@ -74,10 +74,10 @@ SELECT count(*) FROM int4tmp WHERE a > 237::int4; EXPLAIN (COSTS OFF) SELECT a, a <-> '237' FROM int4tmp ORDER BY a <-> '237' LIMIT 3; - QUERY PLAN ------------------------------------------------- + QUERY PLAN +------------------------------------------- Limit - -> Index Only Scan using int4idx on int4tmp + -> Index Scan using int4idx on int4tmp Order By: (a <-> 237) (3 rows) diff --git a/contrib/btree_gist/expected/int8.out b/contrib/btree_gist/expected/int8.out index eff77c26b..2bbdd7657 100644 --- a/contrib/btree_gist/expected/int8.out +++ b/contrib/btree_gist/expected/int8.out @@ -77,7 +77,7 @@ SELECT a, a <-> '464571291354841' FROM int8tmp ORDER BY a <-> '464571291354841' QUERY PLAN ----------------------------------------------------- Limit - -> Index Only Scan using int8idx on int8tmp + -> Index Scan using int8idx on int8tmp Order By: (a <-> '464571291354841'::bigint) (3 rows) diff --git a/contrib/btree_gist/expected/interval.out b/contrib/btree_gist/expected/interval.out index 4c3d494e4..4ed196198 100644 --- a/contrib/btree_gist/expected/interval.out +++ b/contrib/btree_gist/expected/interval.out @@ -77,7 +77,7 @@ SELECT a, a <-> '199 days 21:21:23' FROM intervaltmp ORDER BY a <-> '199 days 21 QUERY PLAN --------------------------------------------------------------------------- Limit - -> Index Only Scan using intervalidx on intervaltmp + -> Index Scan using intervalidx on intervaltmp Order By: (a <-> '@ 199 days 21 hours 21 mins 23 secs'::interval) (3 rows) diff --git a/contrib/btree_gist/expected/time.out b/contrib/btree_gist/expected/time.out index ec95ef77c..1b9da4e19 100644 --- a/contrib/btree_gist/expected/time.out +++ b/contrib/btree_gist/expected/time.out @@ -77,7 +77,7 @@ SELECT a, a <-> '10:57:11' FROM timetmp ORDER BY a <-> '10:57:11' LIMIT 3; QUERY PLAN -------------------------------------------------------------- Limit - -> Index Only Scan using timeidx on timetmp + -> Index Scan 
using timeidx on timetmp Order By: (a <-> '10:57:11'::time without time zone) (3 rows) diff --git a/contrib/btree_gist/expected/timestamp.out b/contrib/btree_gist/expected/timestamp.out index 0d94f2f24..cc3624f08 100644 --- a/contrib/btree_gist/expected/timestamp.out +++ b/contrib/btree_gist/expected/timestamp.out @@ -77,7 +77,7 @@ SELECT a, a <-> '2004-10-26 08:55:08' FROM timestamptmp ORDER BY a <-> '2004-10- QUERY PLAN ----------------------------------------------------------------------------------- Limit - -> Index Only Scan using timestampidx on timestamptmp + -> Index Scan using timestampidx on timestamptmp Order By: (a <-> 'Tue Oct 26 08:55:08 2004'::timestamp without time zone) (3 rows) diff --git a/contrib/btree_gist/expected/timestamptz.out b/contrib/btree_gist/expected/timestamptz.out index 75a15a425..88d2404c4 100644 --- a/contrib/btree_gist/expected/timestamptz.out +++ b/contrib/btree_gist/expected/timestamptz.out @@ -197,7 +197,7 @@ SELECT a, a <-> '2018-12-18 10:59:54 GMT+2' FROM timestamptztmp ORDER BY a <-> ' QUERY PLAN ------------------------------------------------------------------------------------ Limit - -> Index Only Scan using timestamptzidx on timestamptztmp + -> Index Scan using timestamptzidx on timestamptztmp Order By: (a <-> 'Tue Dec 18 04:59:54 2018 PST'::timestamp with time zone) (3 rows) diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 38b6a4004..27fcd04d5 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -172,6 +172,7 @@ typedef struct IndexAmRoutine amgetbatch_function amgetbatch; /* can be NULL */ amunguardbatch_function amunguardbatch; /* can be NULL */ amkillitemsbatch_function amkillitemsbatch; /* can be NULL */ + amgettransform_function amgettransform; /* can be NULL */ amgetbitmap_function amgetbitmap; /* can be NULL */ amendscan_function amendscan; amposreset_function amposreset; /* can be NULL */ @@ -716,6 +717,18 @@ ambeginscan (Relation indexRelation, and sibling page 
links). + + + scan->batch_index_opaque_dyn: the size of an + optional second per-batch opaque area, or 0 if the index AM does not need + one. Unlike the area above, its size need not be known at compile time; + the index AM may choose it at the start of each scan. It sits immediately + before the static area, and core code treats it as a single opaque + allocation that the index AM lays out however it likes (for example, to + carry per-item match metadata, such as a recheck flag or order-by + distances, that must travel with the batch). + + scan->batch_tuples_workspace: the size in bytes @@ -736,7 +749,8 @@ ambeginscan (Relation indexRelation, ambeginscan, since the value applies to every item the scan returns. The value set here persists across any subsequent amrescan calls. B-tree (always false) and hash (always - true) work this way. + true) work this way; see amgetbatch below for the case + where the recheck requirement varies per item. @@ -890,20 +904,46 @@ amgetbatch (IndexScanDesc scan, Index access methods using amgetbatch must set scan->xs_recheck to indicate whether rechecking of scan keys is required, in the same way as amgettuple - does. However, scan->xs_recheck must be set consistently - for an entire scan rather than varying on a per-tuple basis. This is a key - difference from amgettuple, which can set - scan->xs_recheck independently for each tuple it returns. - Index access methods that require granular control over - scan->xs_recheck must use the amgettuple - interface instead of amgetbatch. + does. An amgetbatch access method cannot set + scan->xs_recheck as it returns each item the way + amgettuple does, because its interface decouples the + order of amgetbatch calls from the order in which + items are later returned to the scan. 
When the recheck requirement is a + fixed property of the whole scan, the access method simply sets + scan->xs_recheck once, at scan start (in its + ambeginscan routine): B-tree always sets it false, and + hash always sets it true. When it instead varies from one matching + item to the next, the access method records the per-item value in the batch + and provides an amgettransform callback (see below), + which the table AM invokes for each returned item to set + scan->xs_recheck from that recorded state; GiST works + this way. - Similarly, the amgetbatch interface does not currently - support index-only scans that return data in the form of a - HeapTuple pointer stored in - scan->xs_hitup. + For index-only scans, an amgetbatch access method + normally returns the matching on-disk IndexTuple + directly: the batch infrastructure copies IndexTuple + records from index pages into a local buffer associated with each batch, and + the table AM exposes the current one via scan->xs_itup. + xs_itupdesc works in the same way as already described for + amgettuple, and the index access method must not set the + scan->xs_itup field itself. + + + + An access method whose returnable values must be reconstructed (rather than + returned as a stored IndexTuple) instead provides an + amgettransform callback that sets + scan->xs_hitup to a reconstructed + HeapTuple. Because prefetching keeps several + batches open and consumes their items asynchronously, there is no automatic + single-tuple lifetime as there is with amgettuple; the + access method manages this itself, typically by reconstructing into a + scan-lifetime memory context and freeing the previously returned tuple each + time amgettransform is called, so that only one + reconstructed tuple is valid at a time. GiST uses this approach for its + index-only scans. @@ -930,9 +970,10 @@ amunguardbatch (IndexScanDesc scan, leaf page, which prevents concurrent TID recycling by VACUUM. 
Formally, an index AM may hold a different kind of interlock, or multiple - interlocks, in its per-batch opaque area, but in practice both built-in - index AMs that support amgetbatch — B-tree and - hash — hold a single buffer pin. See + interlocks, in its per-batch opaque area, but in practice the built-in + index AMs that support amgetbatch hold a single buffer + pin: B-tree, hash, and GiST batches each retain the pin on the batch's one + index leaf page (GiST's virtual nearest-neighbor batches hold no pin at all). See for details on buffer pin management during index scans. This function will be called at most once for each guarded batch; it is not called when the index AM has already unguarded the batch itself (as it does when @@ -975,8 +1016,8 @@ amkillitemsbatch (IndexScanDesc scan, amgetbatch index AMs (those that don't can leave the field set to NULL), but doing so is recommended for performance, as it allows future scans to skip known-dead index entries. - Both core index access methods that currently support - amgetbatch (B-tree and hash) implement + All three core index access methods that currently support + amgetbatch (B-tree, hash, and GiST) implement LP_DEAD marking, though third-party index access methods are free to choose whether to implement this feature. The table AM may call tableam_util_scanpos_killitem to mark dead items as @@ -1016,7 +1057,7 @@ amkillitemsbatch (IndexScanDesc scan, VACUUM recycling table TIDs — so it would be unsafe to assume that index entries still point to the same heap/table tuples. Since LP_DEAD marking is only an optimization - hint, it is always safe to skip it. Both B-tree and hash use this + hint, it is always safe to skip it. B-tree, hash, and GiST use this approach.
@@ -1055,6 +1096,41 @@ amkillitemsbatch (IndexScanDesc scan, +void +amgettransform (IndexScanDesc scan, + IndexScanBatch batch, + int item); + + Called by the table AM as it returns each matching item + (item is an index into the batch's + items array) of an amgetbatch + scan, to set up the scan's per-tuple output from per-item state that the + access method recorded in the batch. This is needed when that output cannot + be a fixed property of the whole scan. An access method may use it to set + xs_recheck (when the need to recheck the scan + conditions varies from one matching item to the next), to set + xs_orderbyvals and + xs_recheckorderby for an ordered + (nearest-neighbor) scan, and to set xs_hitup for + an index-only scan whose returnable tuple must be reconstructed rather than + returned directly as a stored index tuple. + + + + Implementing amgettransform is optional, and is only + meaningful together with amgetbatch. An access method + need only provide it when some part of its per-tuple output varies from one + matching item to the next. When every such output is instead a fixed + property of the whole scan — or, for index-only scans, is the on-disk + index tuple returned directly via xs_itup — + the field can be left NULL, as B-tree and hash do. GiST + provides one because parts of its per-tuple output (the recheck flag, the + ORDER BY distances, and the reconstructed index-only + tuples) vary per matching item, as described above. + + + + int64 amgetbitmap (IndexScanDesc scan, TIDBitmap *tbm); @@ -1352,8 +1428,28 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); - Note that amgetbatch scans do not currently support - ordering operators. 
+ An amgetbatch access method can support ordering + operators by providing an amgettransform callback: it + records each matching item's ordering values in the batch, and the table AM + calls amgettransform as it returns each item to set + xs_orderbyvals and + xs_recheckorderby from that recorded state. GiST + uses this for nearest-neighbor scans. As with + scan->xs_recheck, these values cannot be set directly as + items are returned, because prefetching decouples the order of + amgetbatch calls from the order in which + items are returned to the scan. + + + + Scans that use ordering operators are never planned as index-only scans. + Because an ordered scan can collect matching items from many index leaf + pages without retaining a buffer pin on any of them (GiST's + virtual nearest-neighbor batches work this way), it has no + pin to serve as the interlock against concurrent TID recycling that an + index-only scan depends on (see ). Such + scans are therefore costed and executed as plain index scans, which + always fetch and recheck the heap tuple.
diff --git a/src/test/modules/dummy_index_am/dummy_index_am.c b/src/test/modules/dummy_index_am/dummy_index_am.c index 3f5be6082..c6990cab5 100644 --- a/src/test/modules/dummy_index_am/dummy_index_am.c +++ b/src/test/modules/dummy_index_am/dummy_index_am.c @@ -338,6 +338,7 @@ dihandler(PG_FUNCTION_ARGS) .amgetbatch = NULL, .amunguardbatch = NULL, .amkillitemsbatch = NULL, + .amgettransform = NULL, .amgetbitmap = NULL, .amendscan = diendscan, .amposreset = NULL, diff --git a/src/test/modules/index/expected/killtuples.out b/src/test/modules/index/expected/killtuples.out index a3db2c409..110c3d445 100644 --- a/src/test/modules/index/expected/killtuples.out +++ b/src/test/modules/index/expected/killtuples.out @@ -152,6 +152,83 @@ f step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; step drop_ext_btree_gist: DROP EXTENSION btree_gist; +starting permutation: create_table fill_500 create_ext_btree_gist create_gist flush disable_seq disable_bitmap measure access_ordered flush result measure access_ordered flush result delete flush measure access_ordered flush result measure access_ordered flush result drop_table drop_ext_btree_gist +step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); +step fill_500: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 500) g(i); +step create_ext_btree_gist: CREATE EXTENSION btree_gist; +step create_gist: CREATE INDEX kill_prior_tuple_gist ON kill_prior_tuple USING gist (key); +step flush: SELECT FROM pg_stat_force_next_flush(); +step disable_seq: SET enable_seqscan = false; +step disable_bitmap: SET enable_bitmapscan = false; +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access_ordered: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; +QUERY PLAN 
+--------------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=500.00 loops=1) + Order By: (key <-> 1) + Index Searches: 1 +(3 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +has_new_heap_accesses +--------------------- +t +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access_ordered: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; +QUERY PLAN +--------------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=500.00 loops=1) + Order By: (key <-> 1) + Index Searches: 1 +(3 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +has_new_heap_accesses +--------------------- +t +(1 row) + +step delete: DELETE FROM kill_prior_tuple; +step flush: SELECT FROM pg_stat_force_next_flush(); +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access_ordered: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; +QUERY PLAN +------------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=0.00 loops=1) + Order By: (key <-> 1) + Index Searches: 1 +(3 rows) + +step 
flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +has_new_heap_accesses +--------------------- +t +(1 row) + +step measure: UPDATE counter SET heap_accesses = (SELECT heap_blks_read + heap_blks_hit FROM pg_statio_all_tables WHERE relname = 'kill_prior_tuple'); +step access_ordered: EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; +QUERY PLAN +------------------------------------------------------------------------------------- +Index Scan using kill_prior_tuple_gist on kill_prior_tuple (actual rows=0.00 loops=1) + Order By: (key <-> 1) + Index Searches: 1 +(3 rows) + +step flush: SELECT FROM pg_stat_force_next_flush(); +step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; +has_new_heap_accesses +--------------------- +t +(1 row) + +step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; +step drop_ext_btree_gist: DROP EXTENSION btree_gist; + starting permutation: create_table fill_10 create_ext_btree_gist create_gist flush disable_seq disable_bitmap measure access flush result measure access flush result delete flush measure access flush result measure access flush result drop_table drop_ext_btree_gist step create_table: CREATE TEMPORARY TABLE kill_prior_tuple(key int not null, cat text not null); step fill_10: INSERT INTO kill_prior_tuple(key, cat) SELECT g.i, 'a' FROM generate_series(1, 10) g(i); @@ -223,7 +300,7 @@ step flush: SELECT FROM pg_stat_force_next_flush(); step result: SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > 0) AS has_new_heap_accesses FROM counter, pg_statio_all_tables WHERE relname = 'kill_prior_tuple'; has_new_heap_accesses --------------------- -t +f (1 
row) step drop_table: DROP TABLE IF EXISTS kill_prior_tuple; diff --git a/src/test/modules/index/specs/killtuples.spec b/src/test/modules/index/specs/killtuples.spec index 3b98ff9f7..f5d2fd773 100644 --- a/src/test/modules/index/specs/killtuples.spec +++ b/src/test/modules/index/specs/killtuples.spec @@ -47,6 +47,9 @@ step result { SELECT ((heap_blks_read + heap_blks_hit - counter.heap_accesses) > step access { EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple WHERE key = 1; } +# nearest-neighbor (order-by operator) scan (cannot set LP_DEAD bits) +step access_ordered { EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF, BUFFERS OFF) SELECT * FROM kill_prior_tuple ORDER BY key <-> 1; } + step delete { DELETE FROM kill_prior_tuple; } step drop_table { DROP TABLE IF EXISTS kill_prior_tuple; } @@ -96,7 +99,20 @@ permutation measure access flush result drop_table drop_ext_btree_gist -# Test gist, but with fewer rows - shows that killitems doesn't work anymore! 
+# GiST doesn't set LP_DEAD bits for ordered scans, so every access re-visits +# the heap +permutation + create_table fill_500 create_ext_btree_gist create_gist flush + disable_seq disable_bitmap + measure access_ordered flush result + measure access_ordered flush result + delete flush + measure access_ordered flush result + measure access_ordered flush result + drop_table drop_ext_btree_gist + +# Test gist with fewer rows, exercising the case where all the dead tuples are +# on a single page permutation create_table fill_10 create_ext_btree_gist create_gist flush disable_seq disable_bitmap diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index 55538c4c4..970b857c6 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -475,9 +475,9 @@ SELECT count(*) FROM point_tbl p WHERE p.f1 ~= '(-5, -12)'; EXPLAIN (COSTS OFF) SELECT * FROM point_tbl ORDER BY f1 <-> '0,1'; - QUERY PLAN ----------------------------------------------- - Index Only Scan using gpointind on point_tbl + QUERY PLAN +----------------------------------------- + Index Scan using gpointind on point_tbl Order By: (f1 <-> '(0,1)'::point) (2 rows) @@ -513,9 +513,9 @@ SELECT * FROM point_tbl WHERE f1 IS NULL; EXPLAIN (COSTS OFF) SELECT * FROM point_tbl WHERE f1 IS NOT NULL ORDER BY f1 <-> '0,1'; - QUERY PLAN ----------------------------------------------- - Index Only Scan using gpointind on point_tbl + QUERY PLAN +----------------------------------------- + Index Scan using gpointind on point_tbl Index Cond: (f1 IS NOT NULL) Order By: (f1 <-> '(0,1)'::point) (3 rows) @@ -539,7 +539,7 @@ EXPLAIN (COSTS OFF) SELECT * FROM point_tbl WHERE f1 <@ '(-10,-10),(10,10)':: box ORDER BY f1 <-> '0,1'; QUERY PLAN ------------------------------------------------ - Index Only Scan using gpointind on point_tbl + Index Scan using gpointind on point_tbl Index Cond: (f1 <@ '(10,10),(-10,-10)'::box) Order By: (f1 <-> 
'(0,1)'::point) (3 rows) diff --git a/src/test/regress/expected/create_index_spgist.out b/src/test/regress/expected/create_index_spgist.out index c6beb0efa..ddffca2e7 100644 --- a/src/test/regress/expected/create_index_spgist.out +++ b/src/test/regress/expected/create_index_spgist.out @@ -333,7 +333,7 @@ FROM quad_point_tbl; ---------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(0,0)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_quad_ind on quad_point_tbl + -> Index Scan using sp_quad_ind on quad_point_tbl Order By: (p <-> '(0,0)'::point) (4 rows) @@ -354,7 +354,7 @@ FROM quad_point_tbl WHERE p <@ box '(200,200,1000,1000)'; ---------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(0,0)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_quad_ind on quad_point_tbl + -> Index Scan using sp_quad_ind on quad_point_tbl Index Cond: (p <@ '(1000,1000),(200,200)'::box) Order By: (p <-> '(0,0)'::point) (5 rows) @@ -376,7 +376,7 @@ FROM quad_point_tbl WHERE p IS NOT NULL; -------------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(333,400)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_quad_ind on quad_point_tbl + -> Index Scan using sp_quad_ind on quad_point_tbl Index Cond: (p IS NOT NULL) Order By: (p <-> '(333,400)'::point) (5 rows) @@ -503,7 +503,7 @@ FROM kd_point_tbl; ---------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(0,0)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_kd_ind on kd_point_tbl + -> Index Scan using sp_kd_ind on kd_point_tbl Order By: (p <-> '(0,0)'::point) (4 rows) @@ -524,7 +524,7 @@ FROM kd_point_tbl WHERE p <@ box '(200,200,1000,1000)'; ---------------------------------------------------------------------------- WindowAgg Window: 
w1 AS (ORDER BY (p <-> '(0,0)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_kd_ind on kd_point_tbl + -> Index Scan using sp_kd_ind on kd_point_tbl Index Cond: (p <@ '(1000,1000),(200,200)'::box) Order By: (p <-> '(0,0)'::point) (5 rows) @@ -546,7 +546,7 @@ FROM kd_point_tbl WHERE p IS NOT NULL; -------------------------------------------------------------------------------- WindowAgg Window: w1 AS (ORDER BY (p <-> '(333,400)'::point) ROWS UNBOUNDED PRECEDING) - -> Index Only Scan using sp_kd_ind on kd_point_tbl + -> Index Scan using sp_kd_ind on kd_point_tbl Index Cond: (p IS NOT NULL) Order By: (p <-> '(333,400)'::point) (5 rows) @@ -567,10 +567,10 @@ SET extra_float_digits = 0; CREATE INDEX ON quad_point_tbl_ord_seq1 USING spgist(p) INCLUDE(dist); EXPLAIN (COSTS OFF) SELECT p, dist FROM quad_point_tbl_ord_seq1 ORDER BY p <-> '0,0' LIMIT 10; - QUERY PLAN -------------------------------------------------------------------------------------------- + QUERY PLAN +-------------------------------------------------------------------------------------- Limit - -> Index Only Scan using quad_point_tbl_ord_seq1_p_dist_idx on quad_point_tbl_ord_seq1 + -> Index Scan using quad_point_tbl_ord_seq1_p_dist_idx on quad_point_tbl_ord_seq1 Order By: (p <-> '(0,0)'::point) (3 rows) diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out index c75bbb23b..810db8b8f 100644 --- a/src/test/regress/expected/gist.out +++ b/src/test/regress/expected/gist.out @@ -74,13 +74,13 @@ select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)); (0.5,0.5) (11 rows) --- Also test an index-only knn-search +-- Also test a knn-search explain (costs off) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) order by p <-> point(0.201, 0.201); - QUERY PLAN --------------------------------------------------------- - Index Only Scan using gist_tbl_point_index on gist_tbl + QUERY PLAN +--------------------------------------------------- + 
Index Scan using gist_tbl_point_index on gist_tbl Index Cond: (p <@ '(0.5,0.5),(0,0)'::box) Order By: (p <-> '(0.201,0.201)'::point) (3 rows) @@ -106,9 +106,9 @@ order by p <-> point(0.201, 0.201); explain (costs off) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) order by point(0.101, 0.101) <-> p; - QUERY PLAN --------------------------------------------------------- - Index Only Scan using gist_tbl_point_index on gist_tbl + QUERY PLAN +--------------------------------------------------- + Index Scan using gist_tbl_point_index on gist_tbl Index Cond: (p <@ '(0.5,0.5),(0,0)'::box) Order By: (p <-> '(0.101,0.101)'::point) (3 rows) @@ -138,12 +138,12 @@ select p from (box(point(0.8,0.8), point(1.0,1.0)))) as v(bb) cross join lateral (select p from gist_tbl where p <@ bb order by p <-> bb[0] limit 2) ss; - QUERY PLAN --------------------------------------------------------------------- + QUERY PLAN +--------------------------------------------------------------- Nested Loop -> Values Scan on "*VALUES*" -> Limit - -> Index Only Scan using gist_tbl_point_index on gist_tbl + -> Index Scan using gist_tbl_point_index on gist_tbl Index Cond: (p <@ "*VALUES*".column1) Order By: (p <-> ("*VALUES*".column1)[0]) (6 rows) @@ -203,13 +203,13 @@ select b from gist_tbl where b <@ box(point(5,5), point(6,6)); (6,6),(6,6) (21 rows) --- Also test an index-only knn-search +-- Also test a knn-search explain (costs off) select b from gist_tbl where b <@ box(point(5,5), point(6,6)) order by b <-> point(5.2, 5.91); - QUERY PLAN ------------------------------------------------------- - Index Only Scan using gist_tbl_box_index on gist_tbl + QUERY PLAN +------------------------------------------------- + Index Scan using gist_tbl_box_index on gist_tbl Index Cond: (b <@ '(6,6),(5,5)'::box) Order By: (b <-> '(5.2,5.91)'::point) (3 rows) @@ -245,9 +245,9 @@ order by b <-> point(5.2, 5.91); explain (costs off) select b from gist_tbl where b <@ box(point(5,5), point(6,6)) 
order by point(5.2, 5.91) <-> b; - QUERY PLAN ------------------------------------------------------- - Index Only Scan using gist_tbl_box_index on gist_tbl + QUERY PLAN +------------------------------------------------- + Index Scan using gist_tbl_box_index on gist_tbl Index Cond: (b <@ '(6,6),(5,5)'::box) Order By: (b <-> '(5.2,5.91)'::point) (3 rows) @@ -373,20 +373,26 @@ select count(*) from gist_tbl; 10001 (1 row) --- This case isn't supported, but it should at least EXPLAIN correctly. +-- An ordering-operator (nearest-neighbor) scan is never planned as an +-- index-only scan, so this lossy-distance case runs as a plain index scan that +-- rechecks the distances against the heap tuple. explain (verbose, costs off) select p from gist_tbl order by circle(p,1) <-> point(0,0) limit 1; - QUERY PLAN ------------------------------------------------------------------------------------- + QUERY PLAN +---------------------------------------------------------------------------------- Limit Output: p, ((circle(p, '1'::double precision) <-> '(0,0)'::point)) - -> Index Only Scan using gist_tbl_multi_index on public.gist_tbl + -> Index Scan using gist_tbl_multi_index on public.gist_tbl Output: p, (circle(p, '1'::double precision) <-> '(0,0)'::point) - Order By: ((circle(gist_tbl.p, '1'::double precision)) <-> '(0,0)'::point) + Order By: (circle(gist_tbl.p, '1'::double precision) <-> '(0,0)'::point) (5 rows) select p from gist_tbl order by circle(p,1) <-> point(0,0) limit 1; -ERROR: lossy distance functions are not supported in index-only scans + p +------- + (0,0) +(1 row) + -- Force an index build using buffering. 
create index gist_tbl_box_index_forcing_buffering on gist_tbl using gist (p) with (buffering=on, fillfactor=50); diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql index 6f1fc65f1..369eb4576 100644 --- a/src/test/regress/sql/gist.sql +++ b/src/test/regress/sql/gist.sql @@ -65,7 +65,7 @@ select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)); -- execute the same select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)); --- Also test an index-only knn-search +-- Also test a knn-search explain (costs off) select p from gist_tbl where p <@ box(point(0,0), point(0.5, 0.5)) order by p <-> point(0.201, 0.201); @@ -109,7 +109,7 @@ select b from gist_tbl where b <@ box(point(5,5), point(6,6)); -- execute the same select b from gist_tbl where b <@ box(point(5,5), point(6,6)); --- Also test an index-only knn-search +-- Also test a knn-search explain (costs off) select b from gist_tbl where b <@ box(point(5,5), point(6,6)) order by b <-> point(5.2, 5.91); @@ -164,7 +164,9 @@ explain (verbose, costs off) select count(*) from gist_tbl; select count(*) from gist_tbl; --- This case isn't supported, but it should at least EXPLAIN correctly. +-- An ordering-operator (nearest-neighbor) scan is never planned as an +-- index-only scan, so this lossy-distance case runs as a plain index scan that +-- rechecks the distances against the heap tuple. explain (verbose, costs off) select p from gist_tbl order by circle(p,1) <-> point(0,0) limit 1; select p from gist_tbl order by circle(p,1) <-> point(0,0) limit 1; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 267ac1462..e42095682 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1063,6 +1063,8 @@ GBT_NUMKEY_R GBT_VARKEY GBT_VARKEY_R GENERAL_NAME +GISTBatchData +GISTBatchItem GISTBuildBuffers GISTBuildState GISTDeletedPageContents -- 2.53.0