From 7d85357c00b4b768af436bbfe38539bff75d1e4a Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Thu, 4 Jun 2026 23:09:37 -0400 Subject: [PATCH v28 06/11] WIP: Adopt amgetbatch interface in SP-GiST index AM. Replace spggettuple with spggetbatch, which implements the amgetbatch interface added by commit FIXME. Plain index scans of SP-GiST indexes now return matching items in batches consisting of all of the matches from a given leaf page, giving the table AM the ability to perform optimizations like index prefetching during SP-GiST index scans. As in nbtree, hash, and GiST, an ordinary batch's only retained buffer pin is the one on its single leaf page, held as the standardized interlock against unsafe concurrent TID recycling by VACUUM, for as long as the table AM still needs it. Nearest-neighbor (ordered) scans work as in GiST: spggetbatch drains the distance-ordered queue into one "virtual" batch spanning many leaf pages. The interlock pin also fixes a pre-existing bug in which SP-GiST index-only scans could return wrong answers. This is exactly the same race condition that commit FIXME (which taught GiST to use the amgetbatch interface) fixed in GiST. As with GiST, we rely on the planner disallowing ordered SP-GiST scans to close the gap there (SP-GiST also uses "virtual batches" during ordered scans, which make a conventional leaf page pin interlock impractical, just like in GiST). There is an additional restriction on index-only scans, which is a separate issue that is peculiar to SP-GiST: index-only scans are now disabled for "long values" opclasses such as the text radix opclass. These opclasses use reconstructed values whose size is essentially unbounded. The prefix cannot reliably fit into a fixed per-batch reconstruction workspace. There doesn't appear to be a simple way to solve that resource management problem within the confines of the amgetbatch design, and inventing new infrastructure to make it work doesn't seem likely to pay for itself. 
This warrants a separate SP-GiST only incompatibility item in the Postgres 20 release notes (in addition to an item about GiST _and_ SP-GiST not supporting ordered index-only scans anymore). Author: Peter Geoghegan --- src/include/access/spgist.h | 5 +- src/include/access/spgist_private.h | 102 +- src/backend/access/spgist/README | 11 +- src/backend/access/spgist/spgscan.c | 908 +++++++++++++----- src/backend/access/spgist/spgutils.c | 8 +- src/backend/access/spgist/spgvacuum.c | 68 +- src/backend/access/spgist/spgxlog.c | 12 +- doc/src/sgml/indexam.sgml | 40 +- doc/src/sgml/spgist.sgml | 7 +- .../expected/spgist_name_ops.out | 4 +- src/test/regress/expected/amutils.out | 2 +- .../regress/expected/create_index_spgist.out | 50 +- src/tools/pgindent/typedefs.list | 3 +- 13 files changed, 853 insertions(+), 367 deletions(-) diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index 083d93f8f..3c2582e76 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -208,7 +208,10 @@ extern void spgendscan(IndexScanDesc scan); extern void spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys); extern int64 spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm); -extern bool spggettuple(IndexScanDesc scan, ScanDirection dir); +extern IndexScanBatch spggetbatch(IndexScanDesc scan, IndexScanBatch priorbatch, + ScanDirection dir); +extern void spgunguardbatch(IndexScanDesc scan, IndexScanBatch batch); +extern void spggettransform(IndexScanDesc scan, IndexScanBatch batch, int item); extern bool spgcanreturn(Relation index, int attno); /* spgvacuum.c */ diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index ec6d6f5f7..ff8920140 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -14,6 +14,7 @@ #ifndef SPGIST_PRIVATE_H #define SPGIST_PRIVATE_H +#include "access/indexbatch.h" #include "access/itup.h" #include "access/spgist.h" 
#include "catalog/pg_am_d.h" @@ -183,6 +184,81 @@ typedef struct SpGistSearchItem #define SizeOfSpGistSearchItem(n_distances) \ (offsetof(SpGistSearchItem, distances) + sizeof(double) * (n_distances)) +/* + * Per-batch data private to the SP-GiST index AM (the static index AM opaque + * area of an IndexScanBatch). + * + * A non-ordered batch holds all matches from a single leaf page, and its buffer + * pin is the TID recycling interlock. An ordered (nearest-neighbor) scan + * instead returns a "virtual" batch, drained from the distance-ordered queue + * and spanning many leaf pages; it holds no pin (blkno == InvalidBlockNumber). + * + * reconValue/level/isNull are the shared inputs spggettransform uses to + * reconstruct an index-only scan's values; the prefix is the same for every + * match in a non-ordered batch. (Ordered scans are never index-only.) + */ +typedef struct SpGistBatchData +{ + Buffer buf; /* leaf page's pin (InvalidBuffer if virtual) */ + BlockNumber blkno; /* leaf blkno (InvalidBlockNumber == virtual) */ + Datum reconValue; /* prefix; into the recon area when + * by-reference */ + int level; + bool isNull; /* batch came from a nulls page */ +} SpGistBatchData; + +#define SpGistBatchGetData(scan, batch) \ + index_scan_batch_index_opaque_static(scan, batch, SpGistBatchData) + +/* + * Per-item data for an ordered (virtual) batch: an array in the dynamic opaque + * area, subscripted via SpGistBatchGetItem. Each item has its own recheck flag + * (SP-GiST matching is lossy, varying per item) plus its ORDER BY distances. + * + * A non-ordered batch needs only a recheck flag per item, so its dynamic opaque + * area is a plain bool array, subscripted via SpGistBatchGetRecheck. 
+ */ +typedef struct SpGistBatchItem +{ + bool recheck; /* T if quals must be rechecked */ + bool recheckDistances; /* T if distances are lossy lower bounds */ + IndexOrderByDistance distances[FLEXIBLE_ARRAY_MEMBER]; /* numberOfOrderBys */ +} SpGistBatchItem; + +#define SizeOfSpGistBatchItem(n_distances) \ + (offsetof(SpGistBatchItem, distances) + \ + sizeof(IndexOrderByDistance) * (n_distances)) + +/* Subscript an ordered (virtual) batch's item array */ +#define SpGistBatchGetItem(scan, batch, item) \ + (AssertMacro(((SpGistScanOpaque) (scan)->opaque)->numberOfNonNullOrderBys > 0), \ + AssertMacro((item) >= 0 && (item) < MaxIndexTuplesPerPage), \ + (SpGistBatchItem *) ((char *) index_scan_batch_index_opaque_dyn((scan), (batch)) + \ + (Size) (item) * SizeOfSpGistBatchItem((scan)->numberOfOrderBys))) + +/* Subscript a non-ordered batch's recheck-flag array */ +#define SpGistBatchGetRecheck(scan, batch) \ + (AssertMacro(((SpGistScanOpaque) (scan)->opaque)->numberOfNonNullOrderBys == 0), \ + (bool *) index_scan_batch_index_opaque_dyn((scan), (batch))) + +/* Size of each layout's per-item array within the dynamic opaque area */ +#define SpGistBatchItemArraySize(scan) \ + MAXALIGN(SizeOfSpGistBatchItem((scan)->numberOfOrderBys) * (scan)->maxitemsbatch) +#define SpGistBatchRecheckArraySize(scan) \ + MAXALIGN(sizeof(bool) * (scan)->maxitemsbatch) + +/* + * For an index-only scan, the shared by-reference reconstruction prefix + * (SpGistBatchData.reconValue) is stored after the per-item array in the + * dynamic opaque area, not in currTuples: the prefix is reconstructed from + * ancestor inner pages, so it isn't bounded by the one leaf page that + * currTuples is sized for. Index-only scans are always non-ordered, so it + * follows the recheck array. 
+ */ +#define SpGistBatchGetReconArea(scan, batch) \ + ((char *) index_scan_batch_index_opaque_dyn((scan), (batch)) + \ + SpGistBatchRecheckArraySize(scan)) + /* * Private state of an index scan */ @@ -217,29 +293,9 @@ typedef struct SpGistScanOpaqueData double *zeroDistances; double *infDistances; - /* These fields are only used in amgetbitmap scans: */ - TIDBitmap *tbm; /* bitmap being filled */ - int64 ntids; /* number of TIDs passed to bitmap */ - - /* These fields are only used in amgettuple scans: */ - bool want_itup; /* are we reconstructing tuples? */ - TupleDesc reconTupDesc; /* if so, descriptor for reconstructed tuples */ - int nPtrs; /* number of TIDs found on current page */ - int iPtr; /* index for scanning through same */ - ItemPointerData heapPtrs[MaxIndexTuplesPerPage]; /* TIDs from cur page */ - bool recheck[MaxIndexTuplesPerPage]; /* their recheck flags */ - bool recheckDistances[MaxIndexTuplesPerPage]; /* distance recheck - * flags */ - HeapTuple reconTups[MaxIndexTuplesPerPage]; /* reconstructed tuples */ - - /* distances (for recheck) */ - IndexOrderByDistance *distances[MaxIndexTuplesPerPage]; - - /* - * Note: using MaxIndexTuplesPerPage above is a bit hokey since - * SpGistLeafTuples aren't exactly IndexTuples; however, they are larger, - * so this is safe. - */ + /* These fields are only used in amgetbatch scans: */ + TupleDesc reconTupDesc; /* descriptor for reconstructed tuples */ + MemoryContext reconCxt; /* context for lazily reconstructed xs_hitup */ } SpGistScanOpaqueData; typedef SpGistScanOpaqueData *SpGistScanOpaque; diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README index 7117e02c7..e37240992 100644 --- a/src/backend/access/spgist/README +++ b/src/backend/access/spgist/README @@ -352,8 +352,8 @@ target TID is not acceptable, so we have to extend the algorithm to cope with such cases. 
We recognize that such a move might have occurred when we see a leaf-page REDIRECT tuple whose XID indicates it might have been created after the VACUUM scan started. We add the redirection target TID -to a "pending list" of places we need to recheck. Between pages of the -main sequential scan, we empty the pending list by visiting each listed +to a "pending list" of places we need to recheck. During the main +sequential scan, we empty the pending list by visiting each listed TID. If it points to an inner tuple (from a PickSplit), add each downlink TID to the pending list. If it points to a leaf page, vacuum that page. (We could just vacuum the single pointed-to chain, but vacuuming the @@ -365,6 +365,13 @@ only after we've completed all pending-list processing; instead we just mark items as done after processing them. Adding a TID that's already in the list is a no-op, whether or not that item is marked done yet. +On a leaf page, VACUUM takes a cleanup lock rather than a plain exclusive lock. +This is the interlock that makes index-only scans safe against concurrent TID +recycling: such a scan keeps a pin on the leaf page it read heap TIDs from until +it has consulted the visibility map, and VACUUM cannot make any of that page's +TIDs recyclable until spgbulkdelete returns, which it cannot do until it has +cleanup-locked that page behind the scan's pin. + spgbulkdelete also updates the index's free space map. 
Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was diff --git a/src/backend/access/spgist/spgscan.c b/src/backend/access/spgist/spgscan.c index 2cc5f06f5..e6b15d2cc 100644 --- a/src/backend/access/spgist/spgscan.c +++ b/src/backend/access/spgist/spgscan.c @@ -28,10 +28,12 @@ #include "utils/memutils.h" #include "utils/rel.h" -typedef void (*storeRes_func) (SpGistScanOpaque so, ItemPointer heapPtr, - Datum leafValue, bool isNull, - SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances); +static Buffer spgReadItemPage(IndexScanDesc scan, SpGistSearchItem *item, + Buffer buffer); +static void spgProcessInnerPage(IndexScanDesc scan, SpGistSearchItem *item, + Page page); +static void spgProcessLeafPage(IndexScanDesc scan, SpGistSearchItem *item, + Page page, IndexScanBatch batch); /* * Pairing heap comparison function for the SpGistSearchItem queue. @@ -172,26 +174,6 @@ resetSpGistScanOpaque(SpGistScanOpaque so) spgAddStartItem(so, false); MemoryContextSwitchTo(oldCtx); - - if (so->numberOfOrderBys > 0) - { - /* Must pfree distances to avoid memory leak */ - int i; - - for (i = 0; i < so->nPtrs; i++) - if (so->distances[i]) - pfree(so->distances[i]); - } - - if (so->want_itup) - { - /* Must pfree reconstructed tuples to avoid memory leak */ - int i; - - for (i = 0; i < so->nPtrs; i++) - pfree(so->reconTups[i]); - } - so->iPtr = so->nPtrs = 0; } /* @@ -332,6 +314,9 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) */ so->reconTupDesc = scan->xs_hitupdesc = getSpGistTupleDesc(rel, &so->state.attType); + so->reconCxt = AllocSetContextCreate(CurrentMemoryContext, + "SP-GiST reconstruction context", + ALLOCSET_SMALL_SIZES); /* Allocate various arrays needed for order-by scans */ if (scan->numberOfOrderBys > 0) @@ -354,6 +339,27 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) scan->xs_orderbynulls = palloc_array(bool, scan->numberOfOrderBys); memset(scan->xs_orderbynulls, true, sizeof(bool) * scan->numberOfOrderBys); 
+ + /* + * Ordered scans fill a "virtual" batch by draining the + * distance-ordered queue, so the batch size is a tuning knob with no + * natural value. Testing has shown that a very small size will + * increase per-batch overhead (and likely instruction-cache misses), + * while a large size (such as MaxIndexTuplesPerPage) risks producing + * many tuples that a LIMIT node never consumes. This maxitemsbatch + * is a compromise. + */ + scan->maxitemsbatch = MaxIndexTuplesPerPage / 32; + } + else + { + /* + * A non-ordered batch holds all of the matches from a single leaf + * page, so one page's worth of items is the natural cap. Using + * MaxIndexTuplesPerPage is a bit hokey since SpGistLeafTuples aren't + * exactly IndexTuples; however, they are larger, so this is safe. + */ + scan->maxitemsbatch = MaxIndexTuplesPerPage; } fmgr_info_copy(&so->innerConsistentFn, @@ -366,6 +372,9 @@ spgbeginscan(Relation rel, int keysz, int orderbysz) so->indexCollation = rel->rd_indcollation[0]; + scan->batch_index_opaque_static = MAXALIGN(sizeof(SpGistBatchData)); + scan->batch_tuples_workspace = BLCKSZ; + scan->opaque = so; return scan; @@ -411,9 +420,31 @@ spgrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, /* preprocess scankeys, set up the representation in *so */ spgPrepareScanKeys(scan); + /* + * Size the dynamic opaque area now that the scan keys (and xs_want_itup) + * are known. Ordered (virtual) batches need a full SpGistBatchItem array + * (each item's ORDER BY distances included); non-ordered batches need + * only a recheck flag per item. Index-only scans (always non-ordered) + * need extra room after the array for the reconstruction prefix (see + * SpGistBatchGetReconArea and spgcanreturn). + * + * We do this here rather than in spgbeginscan because xs_want_itup is set + * by index_beginscan only after ambeginscan returns. 
+ */ + if (so->numberOfNonNullOrderBys > 0) + scan->batch_index_opaque_dyn = SpGistBatchItemArraySize(scan); + else + scan->batch_index_opaque_dyn = SpGistBatchRecheckArraySize(scan); + if (scan->xs_want_itup) + scan->batch_index_opaque_dyn += BLCKSZ; + /* set up starting queue entries */ resetSpGistScanOpaque(so); + /* discard any index-only tuple reconstructed by a previous scan */ + MemoryContextReset(so->reconCxt); + scan->xs_hitup = NULL; + /* count an indexscan for stats */ pgstat_count_index_scan(scan->indexRelation); if (scan->instrument) @@ -427,6 +458,7 @@ spgendscan(IndexScanDesc scan) MemoryContextDelete(so->tempCxt); MemoryContextDelete(so->traversalCxt); + MemoryContextDelete(so->reconCxt); if (so->keyData) pfree(so->keyData); @@ -455,10 +487,11 @@ spgendscan(IndexScanDesc scan) * Leaf SpGistSearchItem constructor, called in queue context */ static SpGistSearchItem * -spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, +spgNewHeapItem(IndexScanDesc scan, int level, SpGistLeafTuple leafTuple, Datum leafValue, bool recheck, bool recheckDistances, bool isnull, double *distances) { + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; SpGistSearchItem *item = spgAllocSearchItem(so, isnull, distances); item->level = level; @@ -470,7 +503,7 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, * if we didn't ask it to, and mildly-broken methods might supply one of * the wrong type. The correct leafValue type is attType not leafType. */ - if (so->want_itup) + if (scan->xs_want_itup) { item->value = isnull ? (Datum) 0 : datumCopy(leafValue, so->state.attType.attbyval, @@ -502,16 +535,18 @@ spgNewHeapItem(SpGistScanOpaque so, int level, SpGistLeafTuple leafTuple, } /* - * Test whether a leaf tuple satisfies all the scan keys + * Test whether a leaf tuple satisfies all the scan keys. 
* - * *reportedSome is set to true if: - * the scan is not ordered AND the item satisfies the scankeys + * When a match is found, an ordered scan queues the heap tuple for later + * distance-ordered draining. A non-ordered scan appends it to batch. + * + * 'batch' arg is NULL for ordered scans. */ static bool -spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, - SpGistLeafTuple leafTuple, bool isnull, - bool *reportedSome, storeRes_func storeRes) +spgLeafTest(IndexScanDesc scan, SpGistSearchItem *item, + SpGistLeafTuple leafTuple, bool isnull, IndexScanBatch batch) { + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; Datum leafValue; double *distances; bool result; @@ -544,7 +579,7 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, in.reconstructedValue = item->value; in.traversalValue = item->traversalValue; in.level = item->level; - in.returnData = so->want_itup; + in.returnData = scan->xs_want_itup; in.leafDatum = SGLTDATUM(leafTuple, &so->state); out.leafValue = (Datum) 0; @@ -569,15 +604,16 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, /* item passes the scankeys */ if (so->numberOfNonNullOrderBys > 0) { - /* the scan is ordered -> add the item to the queue */ - MemoryContext oldCxt = MemoryContextSwitchTo(so->traversalCxt); - SpGistSearchItem *heapItem = spgNewHeapItem(so, item->level, - leafTuple, - leafValue, - recheck, - recheckDistances, - isnull, - distances); + /* The scan is ordered; add the item to the queue */ + MemoryContext oldCxt; + SpGistSearchItem *heapItem; + + Assert(scan->batchImmediateUnguard); + + oldCxt = MemoryContextSwitchTo(so->traversalCxt); + heapItem = spgNewHeapItem(scan, item->level, leafTuple, leafValue, + recheck, recheckDistances, isnull, + distances); spgAddSearchItemToQueue(so, heapItem); @@ -585,11 +621,41 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, } else { - /* non-ordered scan, so report the item right away */ + /* + * The scan is non-ordered; add the item to caller's 
batch + * directly. + */ + int i = ++batch->lastItem; + Assert(!recheckDistances); - storeRes(so, &leafTuple->heapPtr, leafValue, isnull, - leafTuple, recheck, false, NULL); - *reportedSome = true; + Assert(i < scan->maxitemsbatch); + + batch->items[i].tableTid = leafTuple->heapPtr; + batch->items[i].indexOffset = InvalidOffsetNumber; /* meaningless */ + batch->items[i].tupleOffset = 0; + + SpGistBatchGetRecheck(scan, batch)[i] = recheck; + + if (scan->xs_want_itup) + { + Size sz = leafTuple->size; + int off = 0; + + if (i > batch->firstItem) + { + int prev = batch->items[i - 1].tupleOffset; + + /* + * Copy tuple to point immediately after most recently + * appended tuple + */ + off = prev + ((SpGistLeafTuple) (batch->currTuples + prev))->size; + } + + batch->items[i].tupleOffset = off; + memcpy(batch->currTuples + off, leafTuple, sz); + Assert(off + sz <= scan->batch_tuples_workspace); + } } } @@ -599,10 +665,12 @@ spgLeafTest(SpGistScanOpaque so, SpGistSearchItem *item, /* A bundle initializer for inner_consistent methods */ static void spgInitInnerConsistentIn(spgInnerConsistentIn *in, - SpGistScanOpaque so, + IndexScanDesc scan, SpGistSearchItem *item, SpGistInnerTuple innerTuple) { + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + in->scankeys = so->keyData; in->orderbys = so->orderByData; in->nkeys = so->numberOfKeys; @@ -612,7 +680,7 @@ spgInitInnerConsistentIn(spgInnerConsistentIn *in, in->traversalMemoryContext = so->traversalCxt; in->traversalValue = item->traversalValue; in->level = item->level; - in->returnData = so->want_itup; + in->returnData = scan->xs_want_itup; in->allTheSame = innerTuple->allTheSame; in->hasPrefix = (innerTuple->prefixSize > 0); in->prefixDatum = SGITDATUM(innerTuple, &so->state); @@ -659,9 +727,10 @@ spgMakeInnerItem(SpGistScanOpaque so, } static void -spgInnerTest(SpGistScanOpaque so, SpGistSearchItem *item, +spgInnerTest(IndexScanDesc scan, SpGistSearchItem *item, SpGistInnerTuple innerTuple, bool isnull) { + 
SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; MemoryContext oldCxt = MemoryContextSwitchTo(so->tempCxt); spgInnerConsistentOut out; int nNodes = innerTuple->nNodes; @@ -673,7 +742,7 @@ spgInnerTest(SpGistScanOpaque so, SpGistSearchItem *item, { spgInnerConsistentIn in; - spgInitInnerConsistentIn(&in, so, item, innerTuple); + spgInitInnerConsistentIn(&in, scan, item, innerTuple); /* use user-defined inner consistent method */ FunctionCall2Coll(&so->innerConsistentFn, @@ -755,12 +824,11 @@ enum SpGistSpecialOffsetNumbers }; static OffsetNumber -spgTestLeafTuple(SpGistScanOpaque so, +spgTestLeafTuple(IndexScanDesc scan, SpGistSearchItem *item, Page page, OffsetNumber offset, bool isnull, bool isroot, - bool *reportedSome, - storeRes_func storeRes) + IndexScanBatch batch) { SpGistLeafTuple leafTuple = (SpGistLeafTuple) PageGetItem(page, PageGetItemId(page, offset)); @@ -796,117 +864,91 @@ spgTestLeafTuple(SpGistScanOpaque so, Assert(ItemPointerIsValid(&leafTuple->heapPtr)); - spgLeafTest(so, item, leafTuple, isnull, reportedSome, storeRes); + spgLeafTest(scan, item, leafTuple, isnull, batch); return SGLT_GET_NEXTOFFSET(leafTuple); } /* - * Walk the tree and report all tuples passing the scan quals to the storeRes - * subroutine. + * Walk the tree and return the next batch of matching tuples. * - * If scanWholeIndex is true, we'll do just that. If not, we'll stop at the - * next page boundary once we have reported at least one tuple. + * Main driver of spggetbitmap and non-ordered spggetbatch scans. 
*/ -static void -spgWalk(Relation index, SpGistScanOpaque so, bool scanWholeIndex, - storeRes_func storeRes) +static IndexScanBatch +spgWalk(IndexScanDesc scan) { + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + IndexScanBatch batch; Buffer buffer = InvalidBuffer; - bool reportedSome = false; - while (scanWholeIndex || !reportedSome) + batch = indexam_util_alloc_batch(scan); + + /* SP-GiST only ever scans forward; set the batch's direction up front */ + batch->dir = ForwardScanDirection; + + /* Walk until a leaf page yields matches, or the index is exhausted */ + while (batch->firstItem > batch->lastItem) { SpGistSearchItem *item = spgGetNextQueueItem(so); + Page page; if (item == NULL) break; /* No more items in queue -> done */ -redirect: - /* Check for interrupts, just in case of infinite loop */ - CHECK_FOR_INTERRUPTS(); + /* Heap items only occur in ordered scans (see spgWalkOrdered) */ + Assert(!item->isLeaf); - if (item->isLeaf) - { - /* We store heap items in the queue only in case of ordered search */ - Assert(so->numberOfNonNullOrderBys > 0); - storeRes(so, &item->heapPtr, item->value, item->isNull, - item->leafTuple, item->recheck, - item->recheckDistances, item->distances); - reportedSome = true; - } + /* + * Navigate to the item's live page, then process its contents. + * + * Note: spgReadItemPage calls CHECK_FOR_INTERRUPTS(). 
+ */ + buffer = spgReadItemPage(scan, item, buffer); + page = BufferGetPage(buffer); + + if (SpGistPageIsLeaf(page)) + spgProcessLeafPage(scan, item, page, batch); else + spgProcessInnerPage(scan, item, page); + + if (batch->firstItem <= batch->lastItem) { - BlockNumber blkno = ItemPointerGetBlockNumber(&item->heapPtr); - OffsetNumber offset = ItemPointerGetOffsetNumber(&item->heapPtr); - Page page; - bool isnull; + /* batch has matching items to return */ + SpGistBatchData *sbatch = SpGistBatchGetData(scan, batch); - if (buffer == InvalidBuffer) + Assert(BufferIsValid(buffer)); + Assert(SpGistPageIsLeaf(BufferGetPage(buffer))); + + sbatch->buf = buffer; + sbatch->blkno = BufferGetBlockNumber(buffer); + + if (scan->xs_want_itup) { - buffer = ReadBuffer(index, blkno); - LockBuffer(buffer, BUFFER_LOCK_SHARE); - } - else if (blkno != BufferGetBlockNumber(buffer)) - { - UnlockReleaseBuffer(buffer); - buffer = ReadBuffer(index, blkno); - LockBuffer(buffer, BUFFER_LOCK_SHARE); - } + /* + * Stash the shared reconstruction prefix for spggettransform, + * which runs after item is freed. The prefix (item->value) + * is the same for every match in the batch. It can be NULL + * when the opclass reconstructs entirely from the leaf datum + * (e.g. quad/kd-tree) or at the root level. + */ + sbatch->level = item->level; + sbatch->isNull = item->isNull; - /* else new pointer points to the same page, no work needed */ - - page = BufferGetPage(buffer); - - isnull = SpGistPageStoresNulls(page) ? 
true : false; - - if (SpGistPageIsLeaf(page)) - { - /* Page is a leaf - that is, all its tuples are heap items */ - OffsetNumber max = PageGetMaxOffsetNumber(page); - - if (SpGistBlockIsRoot(blkno)) + if (so->state.attLeafType.attbyval || item->isNull || + DatumGetPointer(item->value) == NULL) { - /* When root is a leaf, examine all its tuples */ - for (offset = FirstOffsetNumber; offset <= max; offset++) - (void) spgTestLeafTuple(so, item, page, offset, - isnull, true, - &reportedSome, storeRes); + sbatch->reconValue = item->value; } else { - /* Normal case: just examine the chain we arrived at */ - while (offset != InvalidOffsetNumber) - { - Assert(offset >= FirstOffsetNumber && offset <= max); - offset = spgTestLeafTuple(so, item, page, offset, - isnull, false, - &reportedSome, storeRes); - if (offset == SpGistRedirectOffsetNumber) - goto redirect; - } - } - } - else /* page is inner */ - { - SpGistInnerTuple innerTuple = (SpGistInnerTuple) - PageGetItem(page, PageGetItemId(page, offset)); + /* pass-by-reference prefix: copy it into the recon area */ + Size sz = datumGetSize(item->value, false, + so->state.attLeafType.attlen); + char *dest = SpGistBatchGetReconArea(scan, batch); - if (innerTuple->tupstate != SPGIST_LIVE) - { - if (innerTuple->tupstate == SPGIST_REDIRECT) - { - /* transfer attention to redirect point */ - item->heapPtr = ((SpGistDeadTuple) innerTuple)->pointer; - Assert(ItemPointerGetBlockNumber(&item->heapPtr) != - SPGIST_METAPAGE_BLKNO); - goto redirect; - } - elog(ERROR, "unexpected SPGiST tuple state: %d", - innerTuple->tupstate); + memcpy(dest, DatumGetPointer(item->value), sz); + sbatch->reconValue = PointerGetDatum(dest); } - - spgInnerTest(so, item, innerTuple, isnull); } } @@ -916,175 +958,511 @@ redirect: MemoryContextReset(so->tempCxt); } - if (buffer != InvalidBuffer) - UnlockReleaseBuffer(buffer); + if (batch->firstItem > batch->lastItem) + { + /* queue exhausted without finding any matches: end of scan */ + if (buffer != 
InvalidBuffer) + UnlockReleaseBuffer(buffer); + indexam_util_release_batch(scan, batch); + return NULL; + } + + indexam_util_unlock_batch(scan, batch, buffer); + + return batch; } - -/* storeRes subroutine for getbitmap case */ +/* + * Convert an ordered heap item's flattened distances into the batch item's + * IndexOrderByDistance array, honoring nonNullOrderByOffsets. + */ static void -storeBitmap(SpGistScanOpaque so, ItemPointer heapPtr, - Datum leafValue, bool isnull, - SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *distances) +spgFillBatchItemDistances(SpGistScanOpaque so, SpGistBatchItem *bitem, + SpGistSearchItem *item) { - Assert(!recheckDistances && !distances); - tbm_add_tuples(so->tbm, heapPtr, 1, recheck); - so->ntids++; + if (item->isNull || so->numberOfNonNullOrderBys <= 0) + { + for (int i = 0; i < so->numberOfOrderBys; i++) + { + bitem->distances[i].value = 0.0; + bitem->distances[i].isnull = true; + } + return; + } + + for (int i = 0; i < so->numberOfOrderBys; i++) + { + int offset = so->nonNullOrderByOffsets[i]; + + if (offset >= 0) + { + bitem->distances[i].value = item->distances[offset]; + bitem->distances[i].isnull = false; + } + else + { + bitem->distances[i].value = 0.0; + bitem->distances[i].isnull = true; + } + } +} + +/* + * spgWalkOrdered() -- drain the distance queue into one virtual batch + * + * Pop items from so->scanQueue in (lower-bound) distance order: index pages are + * scanned (pushing children and matching heap tuples back onto the queue), heap + * tuples are appended to the batch, until the batch fills or the queue empties. + * The result is a "virtual" batch spanning many leaf pages, holding no pin (and + * never index-only, which the planner forbids for ordered scans). 
+ */ +static IndexScanBatch +spgWalkOrdered(IndexScanDesc scan) +{ + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + IndexScanBatch batch = indexam_util_alloc_batch(scan); + SpGistBatchData *sbatch; + Buffer buffer = InvalidBuffer; + int nitems = 0; + + /* SP-GiST only ever scans forward; set the batch's direction up front */ + batch->dir = ForwardScanDirection; + + for (;;) + { + SpGistSearchItem *item = spgGetNextQueueItem(so); + + if (item == NULL) + break; /* queue exhausted (end of scan) */ + + if (item->isLeaf) + { + /* matching heap tuple: append to the batch in distance order */ + SpGistBatchItem *bitem = SpGistBatchGetItem(scan, batch, nitems); + + batch->items[nitems].tableTid = item->heapPtr; + batch->items[nitems].indexOffset = InvalidOffsetNumber; + batch->items[nitems].tupleOffset = 0; + + bitem->recheck = item->recheck; + bitem->recheckDistances = item->recheckDistances; + spgFillBatchItemDistances(so, bitem, item); + + spgFreeSearchItem(so, item); + MemoryContextReset(so->tempCxt); + + if (++nitems == scan->maxitemsbatch) + break; /* batch full; remaining items stay queued */ + } + else + { + Page page; + + /* + * Index page: scan it, pushing children/heap items onto the + * queue. + * + * Note: spgReadItemPage calls CHECK_FOR_INTERRUPTS(). 
+ */ + buffer = spgReadItemPage(scan, item, buffer); + page = BufferGetPage(buffer); + + if (SpGistPageIsLeaf(page)) + { + /* leaf page: queue matching heap items (batch unused) */ + spgProcessLeafPage(scan, item, page, NULL); + } + else + spgProcessInnerPage(scan, item, page); + + spgFreeSearchItem(so, item); + MemoryContextReset(so->tempCxt); + } + } + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); + + if (nitems == 0) + { + /* no matching items remain: the scan is exhausted */ + indexam_util_release_batch(scan, batch); + return NULL; + } + + /* an ordered batch is "virtual" and holds no interlock pin */ + sbatch = SpGistBatchGetData(scan, batch); + sbatch->buf = InvalidBuffer; + sbatch->blkno = InvalidBlockNumber; + + batch->firstItem = 0; + batch->lastItem = nitems - 1; + + Assert(!batch->isGuarded); + + return batch; +} + +/* + * Navigate to the live page that 'item' points at, following inner-tuple and + * leaf-head REDIRECTs. + * + * 'buffer' is the buffer the caller already has locked (or InvalidBuffer). We + * keep that lock while the item stays on the same block, releasing and + * re-acquiring only when the block changes. + * + * The returned buffer is pinned and share-locked, holding either a live inner + * tuple or a leaf page (whose chain head is not a redirect) at item->heapPtr. 
+ */ +static Buffer +spgReadItemPage(IndexScanDesc scan, SpGistSearchItem *item, Buffer buffer) +{ + Relation index = scan->indexRelation; + + Assert(!item->isLeaf); /* heap items are handled by the caller */ + + for (;;) + { + BlockNumber blkno = ItemPointerGetBlockNumber(&item->heapPtr); + OffsetNumber offset = ItemPointerGetOffsetNumber(&item->heapPtr); + Page page; + + /* Release the page we hold if the item moved to a different block */ + if (buffer != InvalidBuffer && blkno != BufferGetBlockNumber(buffer)) + { + UnlockReleaseBuffer(buffer); + buffer = InvalidBuffer; + } + + /* Acquire the page if we're not already holding it */ + if (buffer == InvalidBuffer) + { + CHECK_FOR_INTERRUPTS(); + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + } + + page = BufferGetPage(buffer); + + if (SpGistPageIsLeaf(page)) + { + ItemId iid; + SpGistLeafTuple head; + + /* When root is a leaf, all its tuples are live: no redirect */ + if (SpGistBlockIsRoot(blkno)) + return buffer; + + /* + * A leaf REDIRECT is always the head of its chain; follow it to + * the live tuples' page before the caller reports any match. A + * live or dead head is left for spgProcessLeafPage to deal with. 
+ */ + iid = PageGetItemId(page, offset); + head = (SpGistLeafTuple) PageGetItem(page, iid); + + if (head->tupstate == SPGIST_REDIRECT) + { + item->heapPtr = ((SpGistDeadTuple) head)->pointer; + Assert(ItemPointerGetBlockNumber(&item->heapPtr) != + SPGIST_METAPAGE_BLKNO); + continue; + } + + return buffer; + } + else /* page is inner */ + { + ItemId iid; + SpGistInnerTuple innerTuple; + + iid = PageGetItemId(page, offset); + innerTuple = (SpGistInnerTuple) PageGetItem(page, iid); + + if (innerTuple->tupstate != SPGIST_LIVE) + { + if (innerTuple->tupstate == SPGIST_REDIRECT) + { + /* transfer attention to redirect point */ + item->heapPtr = ((SpGistDeadTuple) innerTuple)->pointer; + Assert(ItemPointerGetBlockNumber(&item->heapPtr) != + SPGIST_METAPAGE_BLKNO); + continue; + } + elog(ERROR, "unexpected SPGiST tuple state: %d", + innerTuple->tupstate); + } + + return buffer; + } + } +} + +/* + * Descend a live inner tuple reached by spgReadItemPage: run inner_consistent + * and push the matching child nodes onto the scan queue. + * + * When we're called, buffer containing 'page' is share-locked. The tuple at + * item->heapPtr must be live. + */ +static void +spgProcessInnerPage(IndexScanDesc scan, SpGistSearchItem *item, Page page) +{ + OffsetNumber offset = ItemPointerGetOffsetNumber(&item->heapPtr); + ItemId iid; + SpGistInnerTuple innerTuple; + + Assert(!SpGistPageIsLeaf(page)); + + iid = PageGetItemId(page, offset); + innerTuple = (SpGistInnerTuple) PageGetItem(page, iid); + Assert(innerTuple->tupstate == SPGIST_LIVE); + + spgInnerTest(scan, item, innerTuple, SpGistPageStoresNulls(page)); +} + +/* + * Examine a leaf page reached by spgReadItemPage, acting on matching tuples: + * a non-ordered scan appends them to batch; an ordered scan queues them, with + * batch NULL. + * + * When we're called, buffer containing 'page' is share-locked. + * spgReadItemPage must have already followed any leaf-head redirect, so the + * chain examined here contains no redirect. 
+ */ +static void +spgProcessLeafPage(IndexScanDesc scan, SpGistSearchItem *item, Page page, + IndexScanBatch batch) +{ + BlockNumber blkno = ItemPointerGetBlockNumber(&item->heapPtr); + OffsetNumber offset = ItemPointerGetOffsetNumber(&item->heapPtr); + bool isnull = SpGistPageStoresNulls(page); + OffsetNumber max = PageGetMaxOffsetNumber(page); + + Assert(SpGistPageIsLeaf(page)); + + if (SpGistBlockIsRoot(blkno)) + { + /* When root is a leaf, examine all its tuples */ + for (offset = FirstOffsetNumber; offset <= max; offset++) + (void) spgTestLeafTuple(scan, item, page, offset, + isnull, true, batch); + } + else + { + /* Normal case: just examine the chain we arrived at */ + while (offset != InvalidOffsetNumber) + { + Assert(offset >= FirstOffsetNumber && offset <= max); + offset = spgTestLeafTuple(scan, item, page, offset, + isnull, false, batch); + /* spgReadItemPage already resolved any leaf-head redirect */ + Assert(offset != SpGistRedirectOffsetNumber); + } + } } int64 spggetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { - SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + int64 ntids = 0; + IndexScanBatch batch; - /* Copy want_itup to *so so we don't need to pass it around separately */ - so->want_itup = false; - - so->tbm = tbm; - so->ntids = 0; - - spgWalk(scan->indexRelation, so, true, storeBitmap); - - return so->ntids; -} - -/* storeRes subroutine for gettuple case */ -static void -storeGettuple(SpGistScanOpaque so, ItemPointer heapPtr, - Datum leafValue, bool isnull, - SpGistLeafTuple leafTuple, bool recheck, - bool recheckDistances, double *nonNullDistances) -{ - Assert(so->nPtrs < MaxIndexTuplesPerPage); - so->heapPtrs[so->nPtrs] = *heapPtr; - so->recheck[so->nPtrs] = recheck; - so->recheckDistances[so->nPtrs] = recheckDistances; - - if (so->numberOfOrderBys > 0) + /* + * Drive spgWalk one leaf page at a time, draining each batch into the + * bitmap and releasing it before fetching the next, so only one batch is + * ever live (cf. 
hashgetbitmap). + */ + while ((batch = spgWalk(scan)) != NULL) { - if (isnull || so->numberOfNonNullOrderBys <= 0) - so->distances[so->nPtrs] = NULL; - else + bool *recheck = SpGistBatchGetRecheck(scan, batch); + + for (int i = batch->firstItem; i <= batch->lastItem; i++) { - IndexOrderByDistance *distances = palloc_array(IndexOrderByDistance, - so->numberOfOrderBys); - int i; - - for (i = 0; i < so->numberOfOrderBys; i++) - { - int offset = so->nonNullOrderByOffsets[i]; - - if (offset >= 0) - { - /* Copy non-NULL distance value */ - distances[i].value = nonNullDistances[offset]; - distances[i].isnull = false; - } - else - { - /* Set distance's NULL flag. */ - distances[i].value = 0.0; - distances[i].isnull = true; - } - } - - so->distances[so->nPtrs] = distances; + tbm_add_tuples(tbm, &batch->items[i].tableTid, 1, recheck[i]); + ntids++; } - } - if (so->want_itup) - { /* - * Reconstruct index data. We have to copy the datum out of the temp - * context anyway, so we may as well create the tuple here. 
+ * Return the batch to the single-slot bitmap cache, to be reused by + * the next spgWalk */ - Datum leafDatums[INDEX_MAX_KEYS]; - bool leafIsnulls[INDEX_MAX_KEYS]; - - /* We only need to deform the old tuple if it has INCLUDE attributes */ - if (so->state.leafTupDesc->natts > 1) - spgDeformLeafTuple(leafTuple, so->state.leafTupDesc, - leafDatums, leafIsnulls, isnull); - - leafDatums[spgKeyColumn] = leafValue; - leafIsnulls[spgKeyColumn] = isnull; - - so->reconTups[so->nPtrs] = heap_form_tuple(so->reconTupDesc, - leafDatums, - leafIsnulls); + indexam_util_release_batch(scan, batch); } - so->nPtrs++; + + return ntids; } -bool -spggettuple(IndexScanDesc scan, ScanDirection dir) +IndexScanBatch +spggetbatch(IndexScanDesc scan, IndexScanBatch priorbatch, ScanDirection dir) { SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + /* + * Note: Persistent traversal state lives in so->scanQueue, so we have no + * use for priorbatch here + */ if (dir != ForwardScanDirection) elog(ERROR, "SP-GiST only supports forward scan direction"); - /* Copy want_itup to *so so we don't need to pass it around separately */ - so->want_itup = scan->xs_want_itup; + if (so->numberOfNonNullOrderBys > 0) + return spgWalkOrdered(scan); - for (;;) + return spgWalk(scan); +} + +/* + * spgunguardbatch() -- Drop a batch's TID recycling interlock (buffer pin) + */ +void +spgunguardbatch(IndexScanDesc scan, IndexScanBatch batch) +{ + SpGistBatchData *sbatch = SpGistBatchGetData(scan, batch); + + /* Should be called exactly once iff !batchImmediateUnguard */ + Assert(!scan->batchImmediateUnguard); + Assert(batch->isGuarded); + + ReleaseBuffer(sbatch->buf); +} + +/* + * spggettransform() -- Set up the scan's per-tuple output for one batch item + * + * Applies the item's recheck flag, and either reconstructs the index-only heap + * tuple (xs_hitup) or reports the item's ORDER BY distances. 
+ */ +void +spggettransform(IndexScanDesc scan, IndexScanBatch batch, int item) +{ + SpGistScanOpaque so = (SpGistScanOpaque) scan->opaque; + + Assert(item >= batch->firstItem && item <= batch->lastItem); + + /* Ordered (virtual) batch: recheck flag and distances live in the item */ + if (so->numberOfNonNullOrderBys > 0) { - if (so->iPtr < so->nPtrs) - { - /* continuing to return reported tuples */ - scan->xs_heaptid = so->heapPtrs[so->iPtr]; - scan->xs_recheck = so->recheck[so->iPtr]; - scan->xs_hitup = so->reconTups[so->iPtr]; + SpGistBatchItem *bitem = SpGistBatchGetItem(scan, batch, item); - if (so->numberOfOrderBys > 0) - index_store_float8_orderby_distances(scan, so->orderByTypes, - so->distances[so->iPtr], - so->recheckDistances[so->iPtr]); - so->iPtr++; - return true; - } + Assert(!scan->xs_want_itup); + Assert(SpGistBatchGetData(scan, batch)->blkno == InvalidBlockNumber); - if (so->numberOfOrderBys > 0) - { - /* Must pfree distances to avoid memory leak */ - int i; - - for (i = 0; i < so->nPtrs; i++) - if (so->distances[i]) - pfree(so->distances[i]); - } - - if (so->want_itup) - { - /* Must pfree reconstructed tuples to avoid memory leak */ - int i; - - for (i = 0; i < so->nPtrs; i++) - pfree(so->reconTups[i]); - } - so->iPtr = so->nPtrs = 0; - - spgWalk(scan->indexRelation, so, false, storeGettuple); - - if (so->nPtrs == 0) - break; /* must have completed scan */ + scan->xs_recheck = bitem->recheck; + index_store_float8_orderby_distances(scan, so->orderByTypes, + bitem->distances, + bitem->recheckDistances); + return; } - return false; + /* Non-ordered batch: recheck flags live in a bool array */ + scan->xs_recheck = SpGistBatchGetRecheck(scan, batch)[item]; + + if (scan->xs_want_itup) + { + /* Index-only scan */ + SpGistBatchData *sbatch = SpGistBatchGetData(scan, batch); + SpGistLeafTuple leafTuple; + Datum leafDatums[INDEX_MAX_KEYS]; + bool leafIsnulls[INDEX_MAX_KEYS]; + Datum leafValue = (Datum) 0; + MemoryContext oldcxt; + + 
Assert(scan->numberOfOrderBys == 0); + Assert(sbatch->blkno != InvalidBlockNumber); + + /* Reconstruct the key value via leaf_consistent */ + leafTuple = (SpGistLeafTuple) (batch->currTuples + + batch->items[item].tupleOffset); + if (!sbatch->isNull) + { + spgLeafConsistentIn in; + spgLeafConsistentOut out; + + oldcxt = MemoryContextSwitchTo(so->tempCxt); + + in.scankeys = so->keyData; + in.orderbys = NULL; + in.nkeys = so->numberOfKeys; + in.norderbys = 0; + in.reconstructedValue = sbatch->reconValue; + in.traversalValue = NULL; + in.level = sbatch->level; + in.returnData = true; + in.leafDatum = SGLTDATUM(leafTuple, &so->state); + + out.leafValue = (Datum) 0; + out.recheck = false; + out.distances = NULL; + out.recheckDistances = false; + + (void) FunctionCall2Coll(&so->leafConsistentFn, so->indexCollation, + PointerGetDatum(&in), PointerGetDatum(&out)); + leafValue = out.leafValue; + + MemoryContextSwitchTo(oldcxt); + } + + /* free the previously returned reconstructed tuple, if any */ + if (scan->xs_hitup) + { + pfree(scan->xs_hitup); + scan->xs_hitup = NULL; + } + + /* build the returnable heap tuple in the scan-lifetime context */ + oldcxt = MemoryContextSwitchTo(so->reconCxt); + + /* Only deform the leaf tuple if it has INCLUDE attributes */ + if (so->state.leafTupDesc->natts > 1) + spgDeformLeafTuple(leafTuple, so->state.leafTupDesc, + leafDatums, leafIsnulls, sbatch->isNull); + + leafDatums[spgKeyColumn] = leafValue; + leafIsnulls[spgKeyColumn] = sbatch->isNull; + + scan->xs_hitup = heap_form_tuple(so->reconTupDesc, + leafDatums, leafIsnulls); + + MemoryContextSwitchTo(oldcxt); + + /* clean up after the leaf_consistent call */ + MemoryContextReset(so->tempCxt); + } + else if (scan->numberOfOrderBys > 0) + { + /* all-NULL order-by arguments: report NULL distances */ + index_store_float8_orderby_distances(scan, so->orderByTypes, + NULL, false); + } } bool spgcanreturn(Relation index, int attno) { - SpGistCache *cache; + SpGistCache *cache = 
spgGetCache(index); - /* INCLUDE attributes can always be fetched for index-only scans */ + /* + * Forbid index-only scans for "long values" opclasses (e.g. text radix): + * the key is reconstructed from the prefix accumulated during the descent + * (see spggettransform) and is bounded only by the field-size limit, so + * it won't fit the fixed per-batch reconstruction workspace + * (spgbeginscan). + */ + if (cache->config.longValuesOK) + return false; + + /* + * else INCLUDE attributes can always be fetched for index-only scans. + * + * Note: for "long values" opclasses (rejected above) we deliberately + * give up on INCLUDE-only index-only scans too, even though an INCLUDE + * column comes straight from the bounded leaf tuple and needs no key + * reconstruction: spggettransform reconstructs the key unconditionally, + * and recheck of a key-column qual would need the key value regardless. + */ + if (attno > 1) return true; /* We can do it if the opclass config function says so */ - cache = spgGetCache(index); - return cache->config.canReturnData; } diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index 47153b4b0..260a16490 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -88,11 +88,11 @@ spghandler(PG_FUNCTION_ARGS) .amadjustmembers = spgadjustmembers, .ambeginscan = spgbeginscan, .amrescan = spgrescan, - .amgettuple = spggettuple, - .amgetbatch = NULL, - .amunguardbatch = NULL, + .amgettuple = NULL, + .amgetbatch = spggetbatch, + .amunguardbatch = spgunguardbatch, .amkillitemsbatch = NULL, - .amgettransform = NULL, + .amgettransform = spggettransform, .amgetbitmap = spggetbitmap, .amendscan = spgendscan, .amposreset = NULL, diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index c461f8dc0..39b9bfcbd 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -625,7 +625,18 @@ spgvacuumpage(spgBulkDeleteState *bds, Buffer buffer) BlockNumber blkno 
= BufferGetBlockNumber(buffer); Page page; - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + /* + * Get a full cleanup lock on this page. We must get such a lock on every + * leaf page over the course of the vacuum scan, whether or not it + * actually contains any deletable tuples. + * + * Note: we could avoid this for inner pages, but not for the root page. + * The root page can start out as a leaf page, but subsequently become an + * inner page, even while a scan holds an interlock pin on that page (this + * isn't possible in nbtree because root splits always create a new root + * page, stored within a separate block number). + */ + LockBufferForCleanup(buffer); page = BufferGetPage(buffer); if (PageIsNew(page)) @@ -706,7 +717,7 @@ spgprocesspending(spgBulkDeleteState *bds) blkno = ItemPointerGetBlockNumber(&pitem->tid); buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, RBM_NORMAL, bds->info->strategy); - LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + LockBufferForCleanup(buffer); page = BufferGetPage(buffer); if (PageIsNew(page) || SpGistPageIsDeleted(page)) @@ -793,6 +804,16 @@ spgprocesspending(spgBulkDeleteState *bds) spgClearPendingList(bds); } +/* + * Chunk size for the main bulkdelete scan: the pending list is drained at each + * chunk boundary, the only point where the read stream is idle (see + * spgvacuumscan). This trades read-ahead against memory -- a larger interval + * lets the stream prefetch further between resets, but lets the pending list + * grow larger before it is bounded. 4096 keeps prefetch effective while + * capping the list at a few thousand entries under heavy concurrent insertion. + */ +#define SPGIST_VACUUM_DRAIN_INTERVAL 4096 + /* * Perform a bulkdelete scan */ @@ -845,22 +866,29 @@ spgvacuumscan(spgBulkDeleteState *bds) * delete some deletable tuples. See more extensive comments about this * in btvacuumscan(). 
*/ + num_pages = 0; /* 0 forces an initial length check below */ for (;;) { - /* Get the current relation length */ - if (needLock) - LockRelationForExtension(index, ExclusiveLock); - num_pages = RelationGetNumberOfBlocks(index); - if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); - - /* Quit if we've scanned the whole relation */ + /* Refresh the relation length once we have caught up to it */ if (p.current_blocknum >= num_pages) - break; + { + if (needLock) + LockRelationForExtension(index, ExclusiveLock); + num_pages = RelationGetNumberOfBlocks(index); + if (needLock) + UnlockRelationForExtension(index, ExclusiveLock); - p.last_exclusive = num_pages; + /* Quit if we've scanned the whole relation */ + if (p.current_blocknum >= num_pages) + break; + } + + /* Give the stream the next chunk; see SPGIST_VACUUM_DRAIN_INTERVAL */ + if (num_pages - p.current_blocknum > SPGIST_VACUUM_DRAIN_INTERVAL) + p.last_exclusive = p.current_blocknum + SPGIST_VACUUM_DRAIN_INTERVAL; + else + p.last_exclusive = num_pages; - /* Iterate over pages, then loop back to recheck length */ while (true) { Buffer buf; @@ -874,18 +902,18 @@ spgvacuumscan(spgBulkDeleteState *bds) break; spgvacuumpage(bds, buf); - - /* empty the pending-list after each page */ - if (bds->pendingList != NULL) - spgprocesspending(bds); } /* - * We have to reset the read stream to use it again. After returning - * InvalidBuffer, the read stream API won't invoke our callback again - * until the stream has been reset. + * Reset the read stream for the next chunk (after returning + * InvalidBuffer it won't call our callback again until reset). Now + * that it is idle, drain the pending list: spgprocesspending revisits + * redirect-relocated tuples under the same cleanup-lock interlock. 
*/ read_stream_reset(stream); + + if (bds->pendingList != NULL) + spgprocesspending(bds); } read_stream_end(stream); diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 55e8066a7..0eac1d1a1 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -771,7 +771,9 @@ spgRedoVacuumLeaf(XLogReaderState *record) ptr += sizeof(OffsetNumber) * xldata->nChain; chainDest = (OffsetNumber *) ptr; - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + /* We must take a cleanup lock here, just like spgvacuumpage() */ + if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) + == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); @@ -834,7 +836,9 @@ spgRedoVacuumRoot(XLogReaderState *record) toDelete = xldata->offsets; - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + /* Take a cleanup lock, as in spgRedoVacuumLeaf() */ + if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) + == BLK_NEEDS_REDO) { page = BufferGetPage(buffer); @@ -873,7 +877,9 @@ spgRedoVacuumRedirect(XLogReaderState *record) locator); } - if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + /* Take a cleanup lock, as in spgRedoVacuumLeaf() */ + if (XLogReadBufferForRedoExtended(record, 0, RBM_NORMAL, true, &buffer) + == BLK_NEEDS_REDO) { Page page = BufferGetPage(buffer); SpGistPageOpaque opaque = SpGistPageGetOpaque(page); diff --git a/doc/src/sgml/indexam.sgml b/doc/src/sgml/indexam.sgml index 75c0704cc..490431f70 100644 --- a/doc/src/sgml/indexam.sgml +++ b/doc/src/sgml/indexam.sgml @@ -951,7 +951,7 @@ amgetbatch (IndexScanDesc scan, value in the batch and provides an amgettransform callback (see below), which the table AM invokes for each returned item to set scan->xs_recheck from that recorded state; GiST - works this way. + and SP-GiST work this way. @@ -986,8 +986,8 @@ amgetbatch (IndexScanDesc scan, scan->xs_hitupdesc). 
This gives the access method complete freedom to form that tuple from whatever it stored in currTuples, in whatever on-disk format suits - it. GiST uses this path, because the representation it - stores differs from the indexed value and so could not satisfy the + it. GiST and SP-GiST use this path, because the representation they + store differs from the indexed value and so could not satisfy the xs_itupdesc layout directly. @@ -1031,10 +1031,10 @@ amunguardbatch (IndexScanDesc scan, is not even required to use the standard helper indexam_util_unlock_batch to manage it. In practice, though, most or all index AMs will use that helper and hold the simplest - possible interlock: each guarded B-tree, hash, or GiST batch keeps a - single buffer pin on the one index page the batch came from. (The - virtual nearest-neighbor batches that GiST uses for ordered - scans are not guarded, and hold no such pin.) See virtual nearest-neighbor batches that GiST and SP-GiST use + for ordered scans are not guarded, and hold no such pin.) See for details on buffer pin management during index scans. This function will be called at most once for each guarded batch; it is not called when the index AM has already unguarded the batch @@ -1078,10 +1078,12 @@ amkillitemsbatch (IndexScanDesc scan, amgetbatch index AMs (those that don't can leave the field set to NULL), but doing so is recommended for performance, as it allows future scans to skip known-dead index entries. - All three core index access methods that currently support - amgetbatch (B-tree, hash, and GiST) implement - LP_DEAD marking, though third-party index access methods - are free to choose whether to implement this feature. 
The table AM may + B-tree, hash, and GiST implement LP_DEAD marking; SP-GiST + is an example of a core amgetbatch access method that + leaves it unimplemented (it still holds the leaf-page interlock pin for + index-only scans, but never sets LP_DEAD bits), and + third-party index access methods are likewise free to choose whether to + implement this feature. The table AM may call tableam_util_scanpos_killitem to mark dead items as the scan progresses. If the batch contains any such dead items, the batch's deadItems array will have been sorted and @@ -1188,9 +1190,9 @@ amgettransform (IndexScanDesc scan, property of the whole scan — or, for index-only scans, is the on-disk index tuple returned directly via scan->xs_itup — the field can be left NULL, as B-tree and hash do. GiST - provides one because parts of its per-tuple output (the recheck flag, the - ORDER BY distances, and the reconstructed index-only - tuples) vary per matching item, as described above. + and SP-GiST provide one because parts of their per-tuple output (the recheck + flag, the ORDER BY distances, and the reconstructed + index-only tuples) vary per matching item, as described above. @@ -1498,7 +1500,7 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); calls amgettransform as it returns each item to set xs_orderbyvals and xs_recheckorderby from that recorded state. GiST - uses this for nearest-neighbor scans. As with + and SP-GiST use this for nearest-neighbor scans. As with scan->xs_recheck, these values cannot be set directly as items are returned. @@ -1506,10 +1508,10 @@ amtranslatecmptype (CompareType cmptype, Oid opfamily, Oid opcintype); Scans that use ordering operators are never planned as index-only scans. 
Because an ordered scan can collect matching items from many index leaf - pages without retaining a buffer pin on any of them (GiST's - virtual nearest-neighbor batches work this way), it has no - pin to serve as the interlock against concurrent TID recycling that an - index-only scan depends on (see ). The + pages without retaining a buffer pin on any of them (the + virtual nearest-neighbor batches of GiST and SP-GiST work + this way), it has no pin to serve as the interlock against concurrent TID + recycling that an index-only scan depends on (see ). The planner therefore costs and executes such scans as plain index scans, which always fetch and recheck the heap tuple. diff --git a/doc/src/sgml/spgist.sgml b/doc/src/sgml/spgist.sgml index 6af93719b..0011a458b 100644 --- a/doc/src/sgml/spgist.sgml +++ b/doc/src/sgml/spgist.sgml @@ -336,7 +336,12 @@ typedef struct spgConfigOut longValuesOK should be set true only when the attType is of variable length and the operator class is capable of segmenting long values by repeated suffixing - (see ). + (see ). Setting it true disables + index-only scans for the operator class, even if + canReturnData is also set: reconstructing the + indexed value can then require materializing an arbitrarily large prefix + (a long value is stored by spreading its prefix across many inner tuples), + so such queries are executed as regular index scans instead. 
diff --git a/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out b/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out index 1ee65ede2..ae0ef9933 100644 --- a/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out +++ b/src/test/modules/spgist_name_ops/expected/spgist_name_ops.out @@ -41,7 +41,7 @@ select * from t --------------------------------------------------------------------------------------------------- Sort Sort Key: f1 - -> Index Only Scan using t_f1_f2_f3_idx on t + -> Index Scan using t_f1_f2_f3_idx on t Index Cond: ((f1 > 'binary_upgrade_set_n'::name) AND (f1 < 'binary_upgrade_set_p'::name)) (4 rows) @@ -90,7 +90,7 @@ select * from t --------------------------------------------------------------------------------------------------- Sort Sort Key: f1 - -> Index Only Scan using t_f1_f2_f3_idx on t + -> Index Scan using t_f1_f2_f3_idx on t Index Cond: ((f1 > 'binary_upgrade_set_n'::name) AND (f1 < 'binary_upgrade_set_p'::name)) (4 rows) diff --git a/src/test/regress/expected/amutils.out b/src/test/regress/expected/amutils.out index 7ab6113c6..2d26f7f99 100644 --- a/src/test/regress/expected/amutils.out +++ b/src/test/regress/expected/amutils.out @@ -101,7 +101,7 @@ select prop, nulls_last | t | f | f | f | f | f | f orderable | t | f | f | f | f | f | f distance_orderable | f | f | t | f | t | f | f - returnable | t | f | f | t | t | f | f + returnable | t | f | f | f | t | f | f search_array | t | f | f | f | f | f | f search_nulls | t | f | t | t | t | f | t bogus | | | | | | | diff --git a/src/test/regress/expected/create_index_spgist.out b/src/test/regress/expected/create_index_spgist.out index ddffca2e7..da730dd47 100644 --- a/src/test/regress/expected/create_index_spgist.out +++ b/src/test/regress/expected/create_index_spgist.out @@ -602,10 +602,10 @@ FROM (VALUES (point '1,2'), (NULL), ('1234,5678')) pts(pt); EXPLAIN (COSTS OFF) SELECT count(*) FROM radix_text_tbl WHERE t = 'P0123456789abcdef'; - QUERY PLAN 
------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t = 'P0123456789abcdef'::text) (3 rows) @@ -617,10 +617,10 @@ SELECT count(*) FROM radix_text_tbl WHERE t = 'P0123456789abcdef'; EXPLAIN (COSTS OFF) SELECT count(*) FROM radix_text_tbl WHERE t = 'P0123456789abcde'; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t = 'P0123456789abcde'::text) (3 rows) @@ -632,10 +632,10 @@ SELECT count(*) FROM radix_text_tbl WHERE t = 'P0123456789abcde'; EXPLAIN (COSTS OFF) SELECT count(*) FROM radix_text_tbl WHERE t = 'P0123456789abcdefF'; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t = 'P0123456789abcdefF'::text) (3 rows) @@ -650,7 +650,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t < 'Aztec QUERY PLAN ---------------------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t < 'Aztec Ct '::text) (3 rows) @@ -665,7 +665,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t ~<~ 'Aztec QUERY PLAN ------------------------------------------------------------------------ Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t ~<~ 'Aztec Ct '::text) (3 rows) @@ -680,7 +680,7 @@ SELECT count(*) FROM radix_text_tbl 
WHERE t <= 'Aztec QUERY PLAN ----------------------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t <= 'Aztec Ct '::text) (3 rows) @@ -695,7 +695,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t ~<=~ 'Aztec QUERY PLAN ------------------------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t ~<=~ 'Aztec Ct '::text) (3 rows) @@ -710,7 +710,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t = 'Aztec QUERY PLAN ---------------------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t = 'Aztec Ct '::text) (3 rows) @@ -725,7 +725,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t = 'Worth QUERY PLAN ---------------------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t = 'Worth St '::text) (3 rows) @@ -740,7 +740,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t >= 'Worth QUERY PLAN ----------------------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t >= 'Worth St '::text) (3 rows) @@ -755,7 +755,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t ~>=~ 'Worth QUERY PLAN ------------------------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t ~>=~ 'Worth St '::text) (3 rows) @@ -770,7 +770,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t > 'Worth QUERY PLAN 
---------------------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t > 'Worth St '::text) (3 rows) @@ -785,7 +785,7 @@ SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth QUERY PLAN ------------------------------------------------------------------------ Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t ~>~ 'Worth St '::text) (3 rows) @@ -797,10 +797,10 @@ SELECT count(*) FROM radix_text_tbl WHERE t ~>~ 'Worth EXPLAIN (COSTS OFF) SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth'; - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t ^@ 'Worth'::text) (3 rows) @@ -812,10 +812,10 @@ SELECT count(*) FROM radix_text_tbl WHERE t ^@ 'Worth'; EXPLAIN (COSTS OFF) SELECT count(*) FROM radix_text_tbl WHERE starts_with(t, 'Worth'); - QUERY PLAN ------------------------------------------------------------- + QUERY PLAN +------------------------------------------------------- Aggregate - -> Index Only Scan using sp_radix_ind on radix_text_tbl + -> Index Scan using sp_radix_ind on radix_text_tbl Index Cond: (t ^@ 'Worth'::text) Filter: starts_with(t, 'Worth'::text) (4 rows) diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index d3ab27607..cbfcde303 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -2971,6 +2971,8 @@ SortSupportData SortTuple SortTupleComparator SortedPoint +SpGistBatchData +SpGistBatchItem SpGistBuildState SpGistCache SpGistDeadTuple @@ -4345,7 +4347,6 @@ standard_qp_extra stemmer_module stmtCacheEntry storeInfo -storeRes_func 
stream_stop_callback string substitute_actual_parameters_context -- 2.53.0