From c8e63c35e9ac09b71d53ddc4e5d4dd2b1ec31cb6 Mon Sep 17 00:00:00 2001
From: nkey
Date: Sat, 30 Nov 2024 17:41:29 +0100
Subject: [PATCH v2 3/4] Allow advancing xmin during non-unique, non-parallel
 concurrent index builds by periodically resetting snapshots

Long-running transactions like those used by CREATE INDEX CONCURRENTLY
and REINDEX CONCURRENTLY can hold back the global xmin horizon,
preventing VACUUM from cleaning up dead tuples and potentially leading
to transaction ID wraparound issues.

In PostgreSQL 14, commit d9d076222f5b attempted to allow VACUUM to
ignore indexing transactions with CONCURRENTLY to mitigate this
problem. However, this was reverted in commit e28bb8851969 because it
could cause indexes to miss heap tuples that were HOT-updated and
HOT-pruned during the index creation, leading to index corruption.

This patch introduces a safe alternative by periodically resetting the
snapshot used during non-unique, non-parallel concurrent index builds.
By resetting the snapshot every N pages during the heap scan, we allow
the xmin horizon to advance without risking index corruption. This
approach is safe for non-unique index builds because they do not
enforce uniqueness constraints that require a consistent snapshot
across the entire scan.

Currently, this technique is applied to:

- non-parallel index builds: parallel index builds are not yet
  supported and will be addressed in a future commit;
- non-unique indexes: unique index builds still require a consistent
  snapshot to enforce uniqueness constraints, and support for them may
  be added in the future;
- only the first scan of the heap: the second scan, during index
  validation, still uses a single snapshot to ensure index correctness.

To implement this, a new scan option SO_RESET_SNAPSHOT is introduced.
When set, it causes the snapshot to be reset every
SO_RESET_SNAPSHOT_EACH_N_PAGE pages during the scan. The heap scan code
is adjusted to support this option, and the index build code is
modified to use it for applicable concurrent index builds that are not
on system catalogs and not using parallel workers.

This addresses the issues that led to the reversion of commit
d9d076222f5b, providing a safe way to allow xmin advancement during
long-running non-unique, non-parallel concurrent index builds while
ensuring index correctness.

Regression tests are added to verify the behavior.
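In condensed form, the mechanism looks like this (an illustrative
sketch only, simplified from the heapam.c hunks below;
heap_reset_scan_snapshot in the diff is the authoritative version):

    /*
     * Every SO_RESET_SNAPSHOT_EACH_N_PAGE pages, swap the scan's MVCC
     * snapshot for a fresh one.  The scan's snapshot is neither
     * registered nor otherwise pinned, so while it is popped the
     * backend advertises no xmin and the global horizon may advance.
     */
    #define SO_RESET_SNAPSHOT_EACH_N_PAGE 64

    if ((scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) &&
        (scan->rs_cblock % SO_RESET_SNAPSHOT_EACH_N_PAGE == 0))
    {
        PopActiveSnapshot();         /* drop the old snapshot */
        InvalidateCatalogSnapshot(); /* and the catalog snapshot too */
        PushActiveSnapshot(GetLatestSnapshot());  /* take a fresh one */
        scan->rs_base.rs_snapshot = GetActiveSnapshot();
    }

This is also why the approach is limited to non-unique builds: a
uniqueness check needs one consistent snapshot across the whole scan,
whereas here the scan sees a series of progressively newer snapshots.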
--- contrib/amcheck/verify_nbtree.c | 3 +- contrib/pgstattuple/pgstattuple.c | 2 +- src/backend/access/brin/brin.c | 14 +++ src/backend/access/heap/heapam.c | 46 ++++++++ src/backend/access/heap/heapam_handler.c | 57 ++++++++-- src/backend/access/index/genam.c | 2 +- src/backend/access/nbtree/nbtsort.c | 14 +++ src/backend/catalog/index.c | 30 +++++- src/backend/commands/indexcmds.c | 14 +-- src/backend/optimizer/plan/planner.c | 9 ++ src/include/access/tableam.h | 28 ++++- src/test/modules/injection_points/Makefile | 2 +- .../expected/cic_reset_snapshots.out | 102 ++++++++++++++++++ src/test/modules/injection_points/meson.build | 1 + .../sql/cic_reset_snapshots.sql | 82 ++++++++++++++ 15 files changed, 375 insertions(+), 31 deletions(-) create mode 100644 src/test/modules/injection_points/expected/cic_reset_snapshots.out create mode 100644 src/test/modules/injection_points/sql/cic_reset_snapshots.sql diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index ffe4f721672..7fb052ce3de 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -689,7 +689,8 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - true); /* syncscan OK? */ + true, /* syncscan OK? */ + false); /* * Scan will behave as the first scan of a CREATE INDEX CONCURRENTLY diff --git a/contrib/pgstattuple/pgstattuple.c b/contrib/pgstattuple/pgstattuple.c index 48cb8f59c4f..ff7cc07df99 100644 --- a/contrib/pgstattuple/pgstattuple.c +++ b/contrib/pgstattuple/pgstattuple.c @@ -332,7 +332,7 @@ pgstat_heap(Relation rel, FunctionCallInfo fcinfo) errmsg("only heap AM is supported"))); /* Disable syncscan because we assume we scan from block zero upwards */ - scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false); + scan = table_beginscan_strat(rel, SnapshotAny, 0, NULL, true, false, false); hscan = (HeapScanDesc) scan; InitDirtySnapshot(SnapshotDirty); diff --git a/src/backend/access/brin/brin.c b/src/backend/access/brin/brin.c index 3aedec882cd..d69859ac4df 100644 --- a/src/backend/access/brin/brin.c +++ b/src/backend/access/brin/brin.c @@ -2366,6 +2366,7 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -2391,9 +2392,16 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index, * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(GetTransactionSnapshot()); + } /* * Estimate size for our own PARALLEL_KEY_BRIN_SHARED workspace. 
@@ -2436,6 +2444,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
 	/* If no DSM segment was available, back out (do serial build) */
 	if (pcxt->seg == NULL)
 	{
+		if (need_pop_active_snapshot)
+			PopActiveSnapshot();
 		if (IsMVCCSnapshot(snapshot))
 			UnregisterSnapshot(snapshot);
 		DestroyParallelContext(pcxt);
@@ -2515,6 +2525,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
 	/* If no workers were successfully launched, back out (do serial build) */
 	if (pcxt->nworkers_launched == 0)
 	{
+		if (need_pop_active_snapshot)
+			PopActiveSnapshot();
 		_brin_end_parallel(brinleader, NULL);
 		return;
 	}
@@ -2531,6 +2543,8 @@ _brin_begin_parallel(BrinBuildState *buildstate, Relation heap, Relation index,
 	 * sure that the failure-to-start case will not hang forever.
 	 */
 	WaitForParallelWorkersToAttach(pcxt);
+	if (need_pop_active_snapshot)
+		PopActiveSnapshot();
 }
 
 /*
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index d00300c5dcb..1fdfdf96482 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -51,6 +51,7 @@
 #include "utils/datum.h"
 #include "utils/inval.h"
 #include "utils/spccache.h"
+#include "utils/injection_point.h"
 
 static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
@@ -566,6 +567,36 @@ heap_prepare_pagescan(TableScanDesc sscan)
 	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
 }
 
+/*
+ * Reset the active snapshot during a scan.
+ * This ensures the xmin horizon can advance while maintaining safe tuple visibility.
+ * Note: no other snapshot should be active during this operation.
+ */
+static inline void
+heap_reset_scan_snapshot(TableScanDesc sscan)
+{
+	/* Make sure no other snapshot was set as active. */
+	Assert(GetActiveSnapshot() == sscan->rs_snapshot);
+	/* And make sure the active snapshot is not registered. */
+	Assert(GetActiveSnapshot()->regd_count == 0);
+	PopActiveSnapshot();
+
+	sscan->rs_snapshot = InvalidSnapshot;	/* just to be tidy */
+	Assert(!HaveRegisteredOrActiveSnapshot());
+	InvalidateCatalogSnapshot();
+
+	/* The goal of the snapshot reset is to let the horizon advance. */
+	Assert(!TransactionIdIsValid(MyProc->xmin));
+#if USE_INJECTION_POINTS
+	/* In some cases that is still not possible, because an xid is assigned. */
+	if (!TransactionIdIsValid(MyProc->xid))
+		INJECTION_POINT("heap_reset_scan_snapshot_effective");
+#endif
+
+	PushActiveSnapshot(GetLatestSnapshot());
+	sscan->rs_snapshot = GetActiveSnapshot();
+}
+
 /*
  * heap_fetch_next_buffer - read and pin the next block from MAIN_FORKNUM.
  *
@@ -607,7 +638,13 @@ heap_fetch_next_buffer(HeapScanDesc scan, ScanDirection dir)
 
 	scan->rs_cbuf = read_stream_next_buffer(scan->rs_read_stream, NULL);
 	if (BufferIsValid(scan->rs_cbuf))
+	{
 		scan->rs_cblock = BufferGetBlockNumber(scan->rs_cbuf);
+#define SO_RESET_SNAPSHOT_EACH_N_PAGE 64
+		if ((scan->rs_base.rs_flags & SO_RESET_SNAPSHOT) &&
+			(scan->rs_cblock % SO_RESET_SNAPSHOT_EACH_N_PAGE == 0))
+			heap_reset_scan_snapshot((TableScanDesc) scan);
+	}
 }
 
 /*
@@ -1233,6 +1270,15 @@ heap_endscan(TableScanDesc sscan)
 	if (scan->rs_parallelworkerdata != NULL)
 		pfree(scan->rs_parallelworkerdata);
 
+	if (scan->rs_base.rs_flags & SO_RESET_SNAPSHOT)
+	{
+		Assert(!(scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT));
+		/* Make sure no other snapshot was set as active. */
+		Assert(GetActiveSnapshot() == sscan->rs_snapshot);
+		/* And make sure the snapshot is not registered. */
+		Assert(GetActiveSnapshot()->regd_count == 0);
+	}
+
 	if (scan->rs_base.rs_flags & SO_TEMP_SNAPSHOT)
 		UnregisterSnapshot(scan->rs_base.rs_snapshot);
 
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index a8d95e0f1c1..980c51e32b9 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -1190,6 +1190,8 @@ heapam_index_build_range_scan(Relation heapRelation,
 	ExprContext *econtext;
 	Snapshot	snapshot;
 	bool		need_unregister_snapshot = false;
+	bool		need_pop_active_snapshot = false;
+	bool		reset_snapshots = false;
 	TransactionId OldestXmin;
 	BlockNumber previous_blkno = InvalidBlockNumber;
 	BlockNumber root_blkno = InvalidBlockNumber;
@@ -1224,9 +1226,6 @@ heapam_index_build_range_scan(Relation heapRelation,
 	/* Arrange for econtext's scan tuple to be the tuple under test */
 	econtext->ecxt_scantuple = slot;
 
-	/* Set up execution state for predicate, if any. */
-	predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
-
 	/*
 	 * Prepare for scan of the base relation.  In a normal index build, we use
 	 * SnapshotAny because we must retrieve all tuples and do our own time
@@ -1236,6 +1235,15 @@ heapam_index_build_range_scan(Relation heapRelation,
 	 */
 	OldestXmin = InvalidTransactionId;
 
+	/*
+	 * For a unique index we need a consistent snapshot for the whole scan.
+	 * For a parallel scan, some additional infrastructure is required to
+	 * perform the scan with SO_RESET_SNAPSHOT; it is not ready yet.
+	 */
+	reset_snapshots = indexInfo->ii_Concurrent &&
+		!indexInfo->ii_Unique &&
+		!is_system_catalog;		/* just in case */
+
 	/* okay to ignore lazy VACUUMs here */
 	if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
 		OldestXmin = GetOldestNonRemovableTransactionId(heapRelation);
@@ -1244,24 +1252,41 @@ heapam_index_build_range_scan(Relation heapRelation,
 	{
 		/*
 		 * Serial index build.
-		 *
-		 * Must begin our own heap scan in this case.  We may also need to
-		 * register a snapshot whose lifetime is under our direct control.
 		 */
 		if (!TransactionIdIsValid(OldestXmin))
 		{
-			snapshot = RegisterSnapshot(GetTransactionSnapshot());
-			need_unregister_snapshot = true;
+			snapshot = GetTransactionSnapshot();
+			/*
+			 * Must begin our own heap scan in this case.  We may also need to
+			 * register a snapshot whose lifetime is under our direct control.
+			 * When the snapshot is reset during the scan, registration is not
+			 * allowed, because the snapshot is going to be replaced every so
+			 * often.
+			 */
+			if (!reset_snapshots)
+			{
+				snapshot = RegisterSnapshot(snapshot);
+				need_unregister_snapshot = true;
+			}
+			Assert(!ActiveSnapshotSet());
+			PushActiveSnapshot(snapshot);
+			/* store a link to the snapshot because it may be copied */
+			snapshot = GetActiveSnapshot();
+			need_pop_active_snapshot = true;
 		}
 		else
+		{
+			Assert(!indexInfo->ii_Concurrent);
 			snapshot = SnapshotAny;
+		}
 
 		scan = table_beginscan_strat(heapRelation,	/* relation */
 									 snapshot,	/* snapshot */
 									 0, /* number of keys */
 									 NULL,	/* scan key */
 									 true,	/* buffer access strategy OK */
-									 allow_sync);	/* syncscan OK? */
+									 allow_sync,	/* syncscan OK? */
+									 reset_snapshots /* reset snapshots? */);
 	}
 	else
 	{
@@ -1275,6 +1300,8 @@ heapam_index_build_range_scan(Relation heapRelation,
 		Assert(!IsBootstrapProcessingMode());
 		Assert(allow_sync);
 		snapshot = scan->rs_snapshot;
+		PushActiveSnapshot(snapshot);
+		need_pop_active_snapshot = true;
 	}
 
 	hscan = (HeapScanDesc) scan;
@@ -1289,6 +1316,13 @@ heapam_index_build_range_scan(Relation heapRelation,
 	Assert(snapshot == SnapshotAny ?
TransactionIdIsValid(OldestXmin) : !TransactionIdIsValid(OldestXmin)); Assert(snapshot == SnapshotAny || !anyvisible); + Assert(snapshot == SnapshotAny || ActiveSnapshotSet()); + + /* Set up execution state for predicate, if any. */ + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + /* Clear reference to snapshot since it may be changed by the scan itself. */ + if (reset_snapshots) + snapshot = InvalidSnapshot; /* Publish number of blocks to scan */ if (progress) @@ -1724,6 +1758,8 @@ heapam_index_build_range_scan(Relation heapRelation, table_endscan(scan); + if (need_pop_active_snapshot) + PopActiveSnapshot(); /* we can now forget our snapshot, if set and registered by us */ if (need_unregister_snapshot) UnregisterSnapshot(snapshot); @@ -1796,7 +1832,8 @@ heapam_index_validate_scan(Relation heapRelation, 0, /* number of keys */ NULL, /* scan key */ true, /* buffer access strategy OK */ - false); /* syncscan not OK */ + false, /* syncscan not OK */ + false); hscan = (HeapScanDesc) scan; pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL, diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c index 60c61039d66..777df91972e 100644 --- a/src/backend/access/index/genam.c +++ b/src/backend/access/index/genam.c @@ -461,7 +461,7 @@ systable_beginscan(Relation heapRelation, */ sysscan->scan = table_beginscan_strat(heapRelation, snapshot, nkeys, key, - true, false); + true, false, false); sysscan->iscan = NULL; } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 17a352d040c..5c4581afb1a 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1410,6 +1410,7 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) WalUsage *walusage; BufferUsage *bufferusage; bool leaderparticipates = true; + bool need_pop_active_snapshot = true; int querylen; #ifdef DISABLE_LEADER_PARTICIPATION @@ -1435,9 +1436,16 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * live according to that. */ if (!isconcurrent) + { + Assert(ActiveSnapshotSet()); snapshot = SnapshotAny; + need_pop_active_snapshot = false; + } else + { snapshot = RegisterSnapshot(GetTransactionSnapshot()); + PushActiveSnapshot(snapshot); + } /* * Estimate size for our own PARALLEL_KEY_BTREE_SHARED workspace, and @@ -1491,6 +1499,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no DSM segment was available, back out (do serial build) */ if (pcxt->seg == NULL) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); if (IsMVCCSnapshot(snapshot)) UnregisterSnapshot(snapshot); DestroyParallelContext(pcxt); @@ -1585,6 +1595,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) /* If no workers were successfully launched, back out (do serial build) */ if (pcxt->nworkers_launched == 0) { + if (need_pop_active_snapshot) + PopActiveSnapshot(); _bt_end_parallel(btleader); return; } @@ -1601,6 +1613,8 @@ _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request) * sure that the failure-to-start case will not hang forever. 
 	 */
 	WaitForParallelWorkersToAttach(pcxt);
+	if (need_pop_active_snapshot)
+		PopActiveSnapshot();
 }
 
 /*
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 1c3a9e06d37..f581a743aae 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -79,6 +79,7 @@
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
 #include "utils/tuplesort.h"
+#include "storage/proc.h"
 
 /* Potentially set by pg_upgrade_support functions */
 Oid			binary_upgrade_next_index_pg_class_oid = InvalidOid;
@@ -1490,8 +1491,8 @@ index_concurrently_build(Oid heapRelationId,
 	Relation	indexRelation;
 	IndexInfo  *indexInfo;
 
-	/* This had better make sure that a snapshot is active */
-	Assert(ActiveSnapshotSet());
+	Assert(!TransactionIdIsValid(MyProc->xmin));
+	Assert(!TransactionIdIsValid(MyProc->xid));
 
 	/* Open and lock the parent heap relation */
 	heapRel = table_open(heapRelationId, ShareUpdateExclusiveLock);
@@ -1509,19 +1510,28 @@ index_concurrently_build(Oid heapRelationId,
 
 	indexRelation = index_open(indexRelationId, RowExclusiveLock);
 
+	/* BuildIndexInfo may require a snapshot for expressions and predicates */
+	PushActiveSnapshot(GetTransactionSnapshot());
 	/*
 	 * We have to re-build the IndexInfo struct, since it was lost in the
 	 * commit of the transaction where this concurrent index was created at
 	 * the catalog level.
 	 */
 	indexInfo = BuildIndexInfo(indexRelation);
+	/* Done with the snapshot */
+	PopActiveSnapshot();
 	Assert(!indexInfo->ii_ReadyForInserts);
 	indexInfo->ii_Concurrent = true;
 	indexInfo->ii_BrokenHotChain = false;
 
+	Assert(!TransactionIdIsValid(MyProc->xmin));
 	/* Now build the index */
 	index_build(heapRel, indexRelation, indexInfo, false, true);
 
+	/* Invalidate the catalog snapshot, just for the assert below */
+	InvalidateCatalogSnapshot();
+	Assert((indexInfo->ii_ParallelWorkers || indexInfo->ii_Unique) || !TransactionIdIsValid(MyProc->xmin));
+
 	/* Roll back any GUC changes executed by index functions */
 	AtEOXact_GUC(false, save_nestlevel);
 
@@ -1532,12 +1542,19 @@ index_concurrently_build(Oid heapRelationId,
 	table_close(heapRel, NoLock);
 	index_close(indexRelation, NoLock);
 
+	/*
+	 * Updating pg_index might involve TOAST table access, so ensure we
+	 * have a valid snapshot.
+	 */
+	PushActiveSnapshot(GetTransactionSnapshot());
 	/*
 	 * Update the pg_index row to mark the index as ready for inserts. Once we
 	 * commit this transaction, any new transactions that open the table must
 	 * insert new entries into the index for insertions and non-HOT updates.
 	 */
 	index_set_state_flags(indexRelationId, INDEX_CREATE_SET_READY);
+	/* we can do away with our snapshot */
+	PopActiveSnapshot();
 }
 
 /*
@@ -3205,7 +3222,8 @@ IndexCheckExclusion(Relation heapRelation,
 								 0, /* number of keys */
 								 NULL,	/* scan key */
 								 true,	/* buffer access strategy OK */
-								 true); /* syncscan OK */
+								 true,	/* syncscan OK */
+								 false);
 
 	while (table_scan_getnextslot(scan, ForwardScanDirection, slot))
 	{
@@ -3268,12 +3286,16 @@ IndexCheckExclusion(Relation heapRelation,
  * as of the start of the scan (see table_index_build_scan), whereas a normal
  * build takes care to include recently-dead tuples.  This is OK because
  * we won't mark the index valid until all transactions that might be able
- * to see those tuples are gone.  The reason for doing that is to avoid
+ * to see those tuples are gone.  One of the reasons for doing that is to avoid
 * bogus unique-index failures due to concurrent UPDATEs (we might see
 * different versions of the same row as being valid when we pass over them,
 * if we used HeapTupleSatisfiesVacuum).  This leaves us with an index that
 * does not contain any tuples added to the table while we built the index.
 *
+ * Furthermore, in the case of a non-unique index we set SO_RESET_SNAPSHOT for
+ * the scan, which causes a fresh snapshot to be set as active every so often.
+ * The reason for that is to move the xmin horizon forward.
+ *
 * Next, we mark the index "indisready" (but still not "indisvalid") and
 * commit the second transaction and start a third.  Again we wait for all
 * transactions that could have been modifying the table to terminate.  Now
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index 932854d6c60..6c1fce8ed25 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -1670,23 +1670,17 @@ DefineIndex(Oid tableId,
 	 * chains can be created where the new tuple and the old tuple in the
 	 * chain have different index keys.
 	 *
-	 * We now take a new snapshot, and build the index using all tuples that
-	 * are visible in this snapshot.  We can be sure that any HOT updates to
+	 * We build the index using all tuples visible under a single snapshot or
+	 * under periodically refreshed snapshots.  We can be sure any HOT updates to
 	 * these tuples will be compatible with the index, since any updates made
 	 * by transactions that didn't know about the index are now committed or
 	 * rolled back.  Thus, each visible tuple is either the end of its
 	 * HOT-chain or the extension of the chain is HOT-safe for this index.
 	 */
 
-	/* Set ActiveSnapshot since functions in the indexes may need it */
-	PushActiveSnapshot(GetTransactionSnapshot());
-
 	/* Perform concurrent build of index */
 	index_concurrently_build(tableId, indexRelationId);
 
-	/* we can do away with our snapshot */
-	PopActiveSnapshot();
-
 	/*
 	 * Commit this transaction to make the indisready update visible.
 	 */
@@ -4084,9 +4078,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const ReindexParams *params)
 		if (newidx->safe)
 			set_indexsafe_procflags();
 
-		/* Set ActiveSnapshot since functions in the indexes may need it */
-		PushActiveSnapshot(GetTransactionSnapshot());
-
 		/*
 		 * Update progress for the index to build, with the correct parent
 		 * table involved.
@@ -4101,7 +4092,6 @@ ReindexRelationConcurrently(const ReindexStmt *stmt, Oid relationOid, const ReindexParams *params)
 		/* Perform concurrent build of new index */
 		index_concurrently_build(newidx->tableId, newidx->indexId);
 
-		PopActiveSnapshot();
 		CommitTransactionCommand();
 	}
 
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index b665a7762ec..d9de16af81d 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -62,6 +62,7 @@
 #include "utils/lsyscache.h"
 #include "utils/rel.h"
 #include "utils/selfuncs.h"
+#include "utils/snapmgr.h"
 
 /* GUC parameters */
 double		cursor_tuple_fraction = DEFAULT_CURSOR_TUPLE_FRACTION;
@@ -6942,6 +6943,7 @@ plan_create_index_workers(Oid tableOid, Oid indexOid)
 	Relation	heap;
 	Relation	index;
 	RelOptInfo *rel;
+	bool		need_pop_active_snapshot = false;
 	int			parallel_workers;
 	BlockNumber heap_blocks;
 	double		reltuples;
@@ -6997,6 +6999,11 @@ plan_create_index_workers(Oid tableOid, Oid indexOid)
 	heap = table_open(tableOid, NoLock);
 	index = index_open(indexOid, NoLock);
 
+	/* Set ActiveSnapshot since functions in the indexes may need it */
+	if (!ActiveSnapshotSet()) {
+		PushActiveSnapshot(GetTransactionSnapshot());
+		need_pop_active_snapshot = true;
+	}
 
 	/*
 	 * Determine if it's safe to proceed.
 	 *
@@ -7054,6 +7061,8 @@ plan_create_index_workers(Oid tableOid, Oid indexOid)
 	parallel_workers--;
 
 done:
+	if (need_pop_active_snapshot)
+		PopActiveSnapshot();
 	index_close(index, NoLock);
 	table_close(heap, NoLock);
 
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index adb478a93ca..f4c7d2a92bf 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -24,6 +24,7 @@
 #include "storage/read_stream.h"
 #include "utils/rel.h"
 #include "utils/snapshot.h"
+#include "utils/injection_point.h"
 
 #define DEFAULT_TABLE_ACCESS_METHOD "heap"
 
@@ -69,6 +70,17 @@ typedef enum ScanOptions
 	 * needed. If table data may be needed, set SO_NEED_TUPLES.
	 */
 	SO_NEED_TUPLES = 1 << 10,
+	/*
+	 * Reset the scan and catalog snapshots every so often?  If so, every
+	 * SO_RESET_SNAPSHOT_EACH_N_PAGE pages the active snapshot is popped, the
+	 * catalog snapshot is invalidated, and the latest snapshot is pushed as
+	 * the active one.  The snapshot is not popped at the end of the scan.
+	 *
+	 * The goal of this mode is to keep the xmin horizon moving forward.
+	 *
+	 * See heap_reset_scan_snapshot for details.
+	 */
+	SO_RESET_SNAPSHOT = 1 << 11,
 } ScanOptions;
 
 /*
@@ -935,7 +947,8 @@ extern TableScanDesc table_beginscan_catalog(Relation relation, int nkeys,
 static inline TableScanDesc
 table_beginscan_strat(Relation rel, Snapshot snapshot,
 					  int nkeys, struct ScanKeyData *key,
-					  bool allow_strat, bool allow_sync)
+					  bool allow_strat, bool allow_sync,
+					  bool reset_snapshot)
 {
 	uint32		flags = SO_TYPE_SEQSCAN | SO_ALLOW_PAGEMODE;
 
@@ -943,6 +956,15 @@ table_beginscan_strat(Relation rel, Snapshot snapshot,
 		flags |= SO_ALLOW_STRAT;
 	if (allow_sync)
 		flags |= SO_ALLOW_SYNC;
+	if (reset_snapshot)
+	{
+		INJECTION_POINT("table_beginscan_strat_reset_snapshots");
+		/* An active snapshot is required at the start. */
+		Assert(GetActiveSnapshot() == snapshot);
+		/* The active snapshot must not be registered, so xmin can advance. */
+		Assert(GetActiveSnapshot()->regd_count == 0);
+		flags |= (SO_RESET_SNAPSHOT);
+	}
 
 	return rel->rd_tableam->scan_begin(rel, snapshot, nkeys, key, NULL, flags);
 }
 
@@ -1779,6 +1801,10 @@ table_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
 * very hard to detect whether they're really incompatible with the chain tip.
 * This only really makes sense for heap AM, it might need to be generalized
 * for other AMs later.
+ *
+ * In the case of a non-unique index and a non-parallel concurrent build,
+ * SO_RESET_SNAPSHOT is applied to the scan.  That causes the snapshot to be
+ * replaced on the fly, allowing the xmin horizon to advance.
*/ static inline double table_index_build_scan(Relation table_rel, diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile index f8f86e8f3b6..73893d351bb 100644 --- a/src/test/modules/injection_points/Makefile +++ b/src/test/modules/injection_points/Makefile @@ -10,7 +10,7 @@ EXTENSION = injection_points DATA = injection_points--1.0.sql PGFILEDESC = "injection_points - facility for injection points" -REGRESS = injection_points reindex_conc +REGRESS = injection_points reindex_conc cic_reset_snapshots REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress ISOLATION = basic inplace \ diff --git a/src/test/modules/injection_points/expected/cic_reset_snapshots.out b/src/test/modules/injection_points/expected/cic_reset_snapshots.out new file mode 100644 index 00000000000..4cfbbb05923 --- /dev/null +++ b/src/test/modules/injection_points/expected/cic_reset_snapshots.out @@ -0,0 +1,102 @@ +CREATE EXTENSION injection_points; +SELECT injection_points_set_local(); + injection_points_set_local +---------------------------- + +(1 row) + +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + injection_points_attach +------------------------- + +(1 row) + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice 
triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +NOTICE: notice triggered for injection point table_beginscan_strat_reset_snapshots +NOTICE: notice triggered for injection point heap_reset_scan_snapshot_effective +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; +DROP SCHEMA cic_reset_snap CASCADE; +NOTICE: drop cascades to 3 other objects +DETAIL: drop cascades to table cic_reset_snap.tbl +drop cascades to function cic_reset_snap.predicate_stable(integer) +drop cascades to function cic_reset_snap.predicate_stable_no_param() +DROP EXTENSION injection_points; diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build index 91fc8ce687f..f288633da4f 100644 --- a/src/test/modules/injection_points/meson.build +++ b/src/test/modules/injection_points/meson.build @@ -35,6 +35,7 @@ tests += { 'sql': [ 'injection_points', 'reindex_conc', + 'cic_reset_snapshots', ], 'regress_args': ['--dlpath', meson.build_root() / 'src/test/regress'], # The injection points are cluster-wide, so disable installcheck diff --git a/src/test/modules/injection_points/sql/cic_reset_snapshots.sql b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql new file mode 100644 index 00000000000..4fef5a47431 --- /dev/null +++ b/src/test/modules/injection_points/sql/cic_reset_snapshots.sql @@ -0,0 +1,82 @@ +CREATE EXTENSION injection_points; + 
+SELECT injection_points_set_local(); +SELECT injection_points_attach('heap_reset_scan_snapshot_effective', 'notice'); +SELECT injection_points_attach('table_beginscan_strat_reset_snapshots', 'notice'); + + +CREATE SCHEMA cic_reset_snap; +CREATE TABLE cic_reset_snap.tbl(i int primary key, j int); +INSERT INTO cic_reset_snap.tbl SELECT i, i * I FROM generate_series(1, 200) s(i); + +CREATE FUNCTION cic_reset_snap.predicate_stable(integer) RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN MOD($1, 2) = 0; +END; $$; + +CREATE FUNCTION cic_reset_snap.predicate_stable_no_param() RETURNS bool IMMUTABLE + LANGUAGE plpgsql AS $$ +BEGIN + EXECUTE 'SELECT txid_current()'; + RETURN false; +END; $$; + +---------------- +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=0); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +-- The same in parallel mode +ALTER TABLE cic_reset_snap.tbl SET (parallel_workers=2); + +CREATE UNIQUE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(MOD(i, 2), j) WHERE MOD(i, 2) = 0; +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl(i, j) WHERE cic_reset_snap.predicate_stable_no_param(); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +CREATE INDEX CONCURRENTLY idx ON cic_reset_snap.tbl USING BRIN(i); +REINDEX INDEX CONCURRENTLY cic_reset_snap.idx; +DROP INDEX CONCURRENTLY cic_reset_snap.idx; + +DROP SCHEMA cic_reset_snap CASCADE; + +DROP EXTENSION injection_points; -- 2.43.0