From 065720d1e6ad31dd4b092f3ceb10c6a88200683e Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Fri, 25 Apr 2025 14:57:06 +0200 Subject: [PATCH v02 1/4] Expose visibility checking shim for index usage This will be the backbone of a fix for visibility issues in gist and sp-gist's index-only scan code in later commits. The issue to be solved is index scans caching tuples in memory that are being removed by a concurrently running vacuum; to the point that the VACUUM process has removed the tuple and marked its page as ALL_VISIBLE by the time the IOS code returns that tuple through amgettuple. While the structure of IndexScanDesc is updated, this new field is located in an alignment gap, thus making this new field safe to use. Backpatch-through: 14 --- src/backend/access/index/indexam.c | 6 ++ src/backend/executor/nodeIndexonlyscan.c | 86 ++++++++++++++++-------- src/backend/utils/adt/selfuncs.c | 81 ++++++++++++++-------- src/include/access/relscan.h | 1 + src/include/access/tableam.h | 53 +++++++++++++++ 5 files changed, 169 insertions(+), 58 deletions(-) diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index 7967e939847..d4b457803d4 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -606,6 +606,12 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* XXX: we should assert that a snapshot is pushed or registered */ Assert(TransactionIdIsValid(RecentXmin)); + /* + * Reset xs_visrecheck, so we don't confuse the next tuple's visibility + * state with that of the previous. + */ + scan->xs_visrecheck = TMVC_Unchecked; + /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_heaptid. It should also set diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c index d52012e8a69..50dc9682970 100644 --- a/src/backend/executor/nodeIndexonlyscan.c +++ b/src/backend/executor/nodeIndexonlyscan.c @@ -124,13 +124,16 @@ IndexOnlyNext(IndexOnlyScanState *node) while ((tid = index_getnext_tid(scandesc, direction)) != NULL) { bool tuple_from_heap = false; + TMVC_Result vischeck = scandesc->xs_visrecheck; CHECK_FOR_INTERRUPTS(); /* * We can skip the heap fetch if the TID references a heap page on * which all tuples are known visible to everybody. In any case, - * we'll use the index tuple not the heap tuple as the data source. + * we'll use the index tuple not the heap tuple as the data source. We + * can skip the VM lookup if the index has already issued that VM + * lookup, indicated by a non-Unchecked value in xs_visrecheck. * * Note on Memory Ordering Effects: visibilitymap_get_status does not * lock the visibility map buffer, and therefore the result we read @@ -160,37 +163,62 @@ IndexOnlyNext(IndexOnlyScanState *node) * * It's worth going through this complexity to avoid needing to lock * the VM buffer, which could cause significant contention. + * + * The index doing these checks for us doesn't materially change these + * considerations. */ - if (!VM_ALL_VISIBLE(scandesc->heapRelation, - ItemPointerGetBlockNumber(tid), - &node->ioss_VMBuffer)) - { - /* - * Rats, we have to visit the heap to check visibility. - */ - InstrCountTuples2(node, 1); - if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) - continue; /* no visible tuple, try next index entry */ + if (vischeck == TMVC_Unchecked) + vischeck = table_index_vischeck_tuple(scandesc->heapRelation, + &node->ioss_VMBuffer, + tid); - ExecClearTuple(node->ioss_TableSlot); + Assert(vischeck != TMVC_Unchecked); - /* - * Only MVCC snapshots are supported here, so there should be no - * need to keep following the HOT chain once a visible entry has - * been found. If we did want to allow that, we'd need to keep - * more state to remember not to call index_getnext_tid next time. - */ - if (scandesc->xs_heap_continue) - elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); - - /* - * Note: at this point we are holding a pin on the heap page, as - * recorded in scandesc->xs_cbuf. We could release that pin now, - * but it's not clear whether it's a win to do so. The next index - * entry might require a visit to the same heap page. - */ - - tuple_from_heap = true; + switch (vischeck) + { + case TMVC_Unchecked: + elog(ERROR, "Failed to check visibility for tuple"); + + /* + * In case of compilers that don't undertand that elog(ERROR) + * doens't exit, but which do have a functional + * -Wimplicit-fallthrough warning: + */ + /* fallthrough */ + case TMVC_MaybeVisible: + { + /* + * Rats, we have to visit the heap to check visibility. + */ + InstrCountTuples2(node, 1); + if (!index_fetch_heap(scandesc, node->ioss_TableSlot)) + continue; /* no visible tuple, try next index entry */ + + ExecClearTuple(node->ioss_TableSlot); + + /* + * Only MVCC snapshots are supported here, so there should + * be no need to keep following the HOT chain once a + * visible entry has been found. If we did want to allow + * that, we'd need to keep more state to remember not to + * call index_getnext_tid next time. + */ + if (scandesc->xs_heap_continue) + elog(ERROR, "non-MVCC snapshots are not supported in index-only scans"); + + /* + * Note: at this point we are holding a pin on the heap + * page, as recorded in scandesc->xs_cbuf. We could + * release that pin now, but it's not clear whether it's a + * win to do so. The next index entry might require a + * visit to the same heap page. + */ + + tuple_from_heap = true; + break; + } + case TMVC_Visible: + break; } /* diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index b9449b4574a..c30a22e5640 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -7195,49 +7195,71 @@ get_actual_variable_endpoint(Relation heapRel, /* Set it up for index-only scan */ index_scan->xs_want_itup = true; index_rescan(index_scan, scankeys, 1, NULL, 0); + index_scan->xs_visrecheck = TMVC_Unchecked; /* Fetch first/next tuple in specified direction */ while ((tid = index_getnext_tid(index_scan, indexscandir)) != NULL) { BlockNumber block = ItemPointerGetBlockNumber(tid); + TMVC_Result visres = index_scan->xs_visrecheck; - if (!VM_ALL_VISIBLE(heapRel, - block, - &vmbuffer)) + if (visres == TMVC_Unchecked) + visres = table_index_vischeck_tuple(heapRel, &vmbuffer, tid); + + Assert(visres != TMVC_Unchecked); + + switch (visres) { - /* Rats, we have to visit the heap to check visibility */ - if (!index_fetch_heap(index_scan, tableslot)) - { + case TMVC_Unchecked: + elog(ERROR, "Failed to check visibility for tuple"); + /* - * No visible tuple for this index entry, so we need to - * advance to the next entry. Before doing so, count heap - * page fetches and give up if we've done too many. - * - * We don't charge a page fetch if this is the same heap page - * as the previous tuple. This is on the conservative side, - * since other recently-accessed pages are probably still in - * buffers too; but it's good enough for this heuristic. + * In case of compilers that don't undertand that elog(ERROR) + * doens't exit, and which have -Wimplicit-fallthrough: */ + /* fallthrough */ + case TMVC_MaybeVisible: + { + /* Rats, we have to visit the heap to check visibility */ + if (!index_fetch_heap(index_scan, tableslot)) + { + /* + * No visible tuple for this index entry, so we need + * to advance to the next entry. Before doing so, + * count heap page fetches and give up if we've done + * too many. + * + * We don't charge a page fetch if this is the same + * heap page as the previous tuple. This is on the + * conservative side, since other recently-accessed + * pages are probably still in buffers too; but it's + * good enough for this heuristic. + */ #define VISITED_PAGES_LIMIT 100 - if (block != last_heap_block) - { - last_heap_block = block; - n_visited_heap_pages++; - if (n_visited_heap_pages > VISITED_PAGES_LIMIT) - break; - } + if (block != last_heap_block) + { + last_heap_block = block; + n_visited_heap_pages++; + if (n_visited_heap_pages > VISITED_PAGES_LIMIT) + goto exit_loop; + } - continue; /* no visible tuple, try next index entry */ - } + continue; /* no visible tuple, try next index entry */ + } - /* We don't actually need the heap tuple for anything */ - ExecClearTuple(tableslot); + /* We don't actually need the heap tuple for anything */ + ExecClearTuple(tableslot); - /* - * We don't care whether there's more than one visible tuple in - * the HOT chain; if any are visible, that's good enough. - */ + /* + * We don't care whether there's more than one visible + * tuple in the HOT chain; if any are visible, that's good + * enough. + */ + break; + } + case TMVC_Visible: + break; } /* @@ -7270,6 +7292,7 @@ get_actual_variable_endpoint(Relation heapRel, have_data = true; break; } +exit_loop: if (vmbuffer != InvalidBuffer) ReleaseBuffer(vmbuffer); diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h index 2ea06a67a63..a2c96111617 100644 --- a/src/include/access/relscan.h +++ b/src/include/access/relscan.h @@ -188,6 +188,7 @@ typedef struct IndexScanDescData IndexFetchTableData *xs_heapfetch; bool xs_recheck; /* T means scan keys must be rechecked */ + uint8 xs_visrecheck; /* TMVC_Result; see tableam.h */ /* * When fetching with an ordering operator, the values of the ORDER BY diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index f2c36696bca..77a91df22e3 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -24,6 +24,7 @@ #include "storage/read_stream.h" #include "utils/rel.h" #include "utils/snapshot.h" +#include "visibilitymap.h" #define DEFAULT_TABLE_ACCESS_METHOD "heap" @@ -276,6 +277,28 @@ typedef struct TM_IndexDeleteOp TM_IndexStatus *status; } TM_IndexDeleteOp; +/* + * Output of table_index_vischeck_tuple() + * + * Index-only scans need to know the visibility of the associated table tuples + * before they can return the index tuple. If the index tuple is known to be + * visible with a cheap check, we can return it directly without requesting + * the visibility info from the table AM directly. + * + * This AM API exposes a cheap visibility checking API to indexes, allowing + * these indexes to check multiple tuples worth of visibility info at once, + * and allowing the AM to store these checks, improving the pinning ergonomics + * of index AMs by allowing a scan to cache index tuples in memory without + * holding pins on those index tuples' pages until the index tuples were + * returned. + */ +typedef enum TMVC_Result +{ + TMVC_Unchecked, + TMVC_MaybeVisible, + TMVC_Visible, +} TMVC_Result; + /* * "options" flag bits for table_tuple_insert. Access methods may define * their own bits for internal use, as long as they don't collide with these. @@ -1414,6 +1437,36 @@ table_index_delete_tuples(Relation rel, TM_IndexDeleteOp *delstate) return rel->rd_tableam->index_delete_tuples(rel, delstate); } +/* + * Determine rough visibility information of index tuples based on each TID. + * + * Determines which entries from index AM caller's TM_IndexVisibilityCheckOp + * state point to TMVC_VISIBLE or TMVC_MAYBE_VISIBLE table tuples, at low IO + * overhead. For the heap AM, the implementation is effectively a wrapper + * around VM_ALL_FROZEN. + * + * On return, all TM_VisChecks indicated by checkop->checktids will have been + * updated with the correct visibility status. + * + * Note that there is no value for "definitely dead" tuples, as the Heap AM + * doesn't have an efficient method to determine that a tuple is dead to all + * users, as it would have to go into the heap. If and when AMs are built + * that would support VM checks with an equivalent to VM_ALL_DEAD this + * decision can be reconsidered. + */ +static inline TMVC_Result +table_index_vischeck_tuple(Relation rel, Buffer *vmbuffer, ItemPointer tid) +{ + BlockNumber blkno = ItemPointerGetBlockNumberNoCheck(tid); + TMVC_Result res; + + if (VM_ALL_VISIBLE(rel, blkno, vmbuffer)) + res = TMVC_Visible; + else + res = TMVC_MaybeVisible; + + return res; +} /* ---------------------------------------------------------------------------- * Functions for manipulations of physical tuples. -- 2.50.1 (Apple Git-155)