From 5022571ba9b95cc86715e7f34acc37f99b5e0153 Mon Sep 17 00:00:00 2001
From: Michail Nikolaev
Date: Sat, 15 Jan 2022 16:21:51 +0300
Subject: [PATCH v9 1/3] code

---
 src/backend/access/common/bufmask.c      | 25 ++++++++
 src/backend/access/gist/gistget.c        | 43 +++++++++++--
 src/backend/access/gist/gistxlog.c       | 15 +++++
 src/backend/access/hash/hash.c           |  4 +-
 src/backend/access/hash/hash_xlog.c      | 17 +++++
 src/backend/access/hash/hashsearch.c     | 18 ++++--
 src/backend/access/hash/hashutil.c       | 33 +++++++++-
 src/backend/access/heap/heapam.c         | 42 +++++++++---
 src/backend/access/heap/heapam_handler.c |  5 +-
 src/backend/access/index/genam.c         | 20 +++---
 src/backend/access/index/indexam.c       | 81 +++++++++++++++++++++---
 src/backend/access/nbtree/nbtinsert.c    | 22 +++++--
 src/backend/access/nbtree/nbtree.c       |  4 +-
 src/backend/access/nbtree/nbtsearch.c    | 14 +++-
 src/backend/access/nbtree/nbtutils.c     | 33 +++++++++-
 src/backend/access/nbtree/nbtxlog.c      | 16 +++++
 src/backend/access/table/tableam.c       |  4 +-
 src/backend/access/transam/rmgr.c        |  4 +-
 src/backend/access/transam/xlogutils.c   |  6 ++
 src/backend/storage/ipc/standby.c        |  6 ++
 src/bin/pg_rewind/parsexlog.c            |  2 +-
 src/bin/pg_waldump/rmgrdesc.c            |  2 +-
 src/include/access/bufmask.h             |  1 +
 src/include/access/gist.h                |  5 ++
 src/include/access/gistxlog.h            |  1 +
 src/include/access/hash.h                |  2 +
 src/include/access/hash_xlog.h           |  1 +
 src/include/access/heapam.h              |  2 +-
 src/include/access/nbtree.h              |  2 +
 src/include/access/nbtxlog.h             |  1 +
 src/include/access/relscan.h             | 15 ++++-
 src/include/access/rmgr.h                |  2 +-
 src/include/access/rmgrlist.h            | 46 +++++++-------
 src/include/access/tableam.h             | 14 ++--
 src/include/access/xlog_internal.h       |  4 ++
 35 files changed, 422 insertions(+), 90 deletions(-)

diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c
index 4e953bfd61..22026482ad 100644
--- a/src/backend/access/common/bufmask.c
+++ b/src/backend/access/common/bufmask.c
@@ -128,3 +128,28 @@ mask_page_content(Page page)
 	memset(&((PageHeader) page)->pd_upper, MASK_MARKER,
 		   sizeof(uint16));
 }
+
+/*
+ * mask_lp_dead
+ *
+ * In some index AMs, line pointer flags can be modified without emitting any
+ * WAL record. Sometimes LP_DEAD flags that were set on the primary must be
+ * masked so that the standby can set its own values.
+ */
+void
+mask_lp_dead(Page page)
+{
+	OffsetNumber offnum,
+				maxoff;
+
+	maxoff = PageGetMaxOffsetNumber(page);
+	for (offnum = FirstOffsetNumber;
+		 offnum <= maxoff;
+		 offnum = OffsetNumberNext(offnum))
+	{
+		ItemId		itemId = PageGetItemId(page, offnum);
+
+		if (ItemIdHasStorage(itemId) && ItemIdIsDead(itemId))
+			itemId->lp_flags = LP_NORMAL;
+	}
+}
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index adbf622c83..1905c04c51 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -14,6 +14,7 @@
 */
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/genam.h"
 #include "access/gist_private.h"
 #include "access/relscan.h"
@@ -49,6 +50,7 @@ gistkillitems(IndexScanDesc scan)
 	Assert(so->curBlkno != InvalidBlockNumber);
 	Assert(!XLogRecPtrIsInvalid(so->curPageLSN));
 	Assert(so->killedItems != NULL);
+	Assert(so->numKilled > 0);
 
 	buffer = ReadBuffer(scan->indexRelation, so->curBlkno);
 	if (!BufferIsValid(buffer))
@@ -62,8 +64,13 @@ gistkillitems(IndexScanDesc scan)
 	 * If page LSN differs it means that the page was modified since the last
 	 * read. killedItems could be not valid so LP_DEAD hints applying is not
 	 * safe.
+	 *
+	 * Another case: the standby was promoted after the start of the current
+	 * transaction. Skipping is not required for correctness here, but it is
+	 * better to just skip everything.
 	 */
-	if (BufferGetLSNAtomic(buffer) != so->curPageLSN)
+	if ((BufferGetLSNAtomic(buffer) != so->curPageLSN) ||
+		(scan->xactStartedInRecovery && !RecoveryInProgress()))
 	{
 		UnlockReleaseBuffer(buffer);
 		so->numKilled = 0;		/* reset counter */
@@ -71,6 +78,20 @@ gistkillitems(IndexScanDesc scan)
 	}
 
 	Assert(GistPageIsLeaf(page));
+	if (GistPageHasLpSafeOnStandby(page) && !scan->xactStartedInRecovery)
+	{
+		/* The server seems to have been promoted some time ago;
+		 * clear the flag just for accuracy. */
+		GistClearPageHasLpSafeOnStandby(page);
+	}
+	else if (!GistPageHasLpSafeOnStandby(page) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by the primary. Clear them
+		 * and allow the standby to set its own. */
+		mask_lp_dead(page);
+		pg_memory_barrier();
+		GistMarkPageHasLpSafeOnStandby(page);
+	}
 
 	/*
 	 * Mark all killedItems as dead. We need no additional recheck, because,
@@ -338,6 +359,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	OffsetNumber maxoff;
 	OffsetNumber i;
 	MemoryContext oldcxt;
+	bool		ignore_killed_tuples;
 
 	Assert(!GISTSearchItemIsHeap(*pageItem));
 
@@ -412,6 +434,15 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 	 * check all tuples on page
 	 */
 	maxoff = PageGetMaxOffsetNumber(page);
+	/*
+	 * Check whether it is allowed to see LP_DEAD bits: always true on the
+	 * primary; on a standby we must avoid flags that were set by the
+	 * primary. After a promotion, xactStartedInRecovery may still be true
+	 * on the primary, in which case the old standby-safe bits are used (the
+	 * case of an old transaction in a promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+		GistPageHasLpSafeOnStandby(page);
 	for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
 	{
 		ItemId		iid = PageGetItemId(page, i);
@@ -424,7 +455,7 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem,
 		 * If the scan specifies not to return killed tuples, then we treat a
 		 * killed tuple as not passing the qual.
 		 */
-		if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+		if (ignore_killed_tuples && ItemIdIsDead(iid))
 			continue;
 
 		it = (IndexTuple) PageGetItem(page, iid);
@@ -651,7 +682,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 	{
 		if (so->curPageData < so->nPageData)
 		{
-			if (scan->kill_prior_tuple && so->curPageData > 0)
+			if (scan->kill_prior_tuple && so->curPageData > 0 &&
+				(XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+				 scan->kill_prior_tuple_min_lsn < so->curPageLSN))
 			{
 
 				if (so->killedItems == NULL)
@@ -688,7 +721,9 @@ gistgettuple(IndexScanDesc scan, ScanDirection dir)
 			 */
 			if (scan->kill_prior_tuple
 				&& so->curPageData > 0
-				&& so->curPageData == so->nPageData)
+				&& so->curPageData == so->nPageData
+				&& (XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+					scan->kill_prior_tuple_min_lsn < so->curPageLSN))
 			{
 
 				if (so->killedItems == NULL)
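The gate above is the AM-side half of a contract that recurs in every index AM this patch touches: kill_prior_tuple alone is no longer sufficient, because on a standby the hint may only be recorded once the index page's LSN has passed kill_prior_tuple_min_lsn. A minimal sketch of the pattern — page_lsn stands for whatever LSN the AM remembered when it read the page (so->curPageLSN here, so->currPos.lsn in btree), and remember_killed_item() is a hypothetical stand-in for the AM's killedItems[] bookkeeping:

    /* Sketch only, not patch code. */
    if (scan->kill_prior_tuple &&
        (XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
         scan->kill_prior_tuple_min_lsn < page_lsn))
        remember_killed_item(scan);

An invalid kill_prior_tuple_min_lsn means the kill is unconditionally allowed (always the case on the primary); otherwise the page LSN must be strictly greater, which ensures the WAL that justifies the hint is flushed before the hinted page itself can be written out.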
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index df70f906b4..cb2893093f 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -451,6 +451,20 @@ gist_xlog_cleanup(void)
 	MemoryContextDelete(opCtx);
 }
 
+/*
+ * Mark a GiST page's LP_DEAD bits as unsafe for use on the standby.
+ */
+void
+gist_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+
+	if (GistPageIsLeaf(page))
+	{
+		GistClearPageHasLpSafeOnStandby(page);
+	}
+}
+
 /*
 * Mask a Gist page before running consistency checks on it.
 */
@@ -459,6 +473,7 @@ gist_mask(char *pagedata, BlockNumber blkno)
 {
 	Page		page = (Page) pagedata;
 
+	gist_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index d48c8a4549..c25cc4d8ad 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -296,8 +296,10 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir)
 	{
 		/*
 		 * Check to see if we should kill the previously-fetched tuple.
+		 * If the tuple is marked dead but requires a min LSN, treat it as alive.
 		 */
-		if (scan->kill_prior_tuple)
+		if (scan->kill_prior_tuple &&
+			XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn))
 		{
 			/*
 			 * Yes, so remember it for later. (We'll deal with all such tuples
diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c
index 55937b9a68..d4f759f0c1 100644
--- a/src/backend/access/hash/hash_xlog.c
+++ b/src/backend/access/hash/hash_xlog.c
@@ -1101,6 +1101,22 @@ hash_redo(XLogReaderState *record)
 	}
 }
 
+/*
+ * Mark a hash page's LP_DEAD bits as unsafe for use on the standby.
+ */
+void
+hash_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	HashPageOpaque opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+	int			pagetype = opaque->hasho_flag & LH_PAGE_TYPE;
+
+	if (pagetype == LH_BUCKET_PAGE || pagetype == LH_OVERFLOW_PAGE)
+	{
+		opaque->hasho_flag &= ~LH_LP_SAFE_ON_STANDBY;
+	}
+}
+
 /*
 * Mask a hash page before performing consistency checks on it.
 */
@@ -1111,6 +1127,7 @@ hash_mask(char *pagedata, BlockNumber blkno)
 	HashPageOpaque opaque;
 	int			pagetype;
 
+	hash_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 7ca542a3fb..7a60281e64 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -612,9 +612,21 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 	IndexTuple	itup;
 	int			itemIndex;
 	OffsetNumber maxoff;
+	bool		ignore_killed_tuples;
+	HashPageOpaque bucket_opaque;
 
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+	/*
+	 * Check whether it is allowed to see LP_DEAD bits: always true on the
+	 * primary; on a standby we must avoid flags that were set by the
+	 * primary. After a promotion, xactStartedInRecovery may still be true
+	 * on the primary, in which case the old standby-safe bits are used (the
+	 * case of an old transaction in a promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+		H_LP_SAFE_ON_STANDBY(bucket_opaque);
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -632,8 +644,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			 */
 			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
 				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
-				(scan->ignore_killed_tuples &&
-				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
+				(ignore_killed_tuples && (ItemIdIsDead(PageGetItemId(page, offnum)))))
 			{
 				offnum = OffsetNumberNext(offnum);	/* move forward */
 				continue;
@@ -678,8 +689,7 @@ _hash_load_qualified_items(IndexScanDesc scan, Page page,
 			 */
 			if ((so->hashso_buc_populated && !so->hashso_buc_split &&
 				 (itup->t_info & INDEX_MOVED_BY_SPLIT_MASK)) ||
-				(scan->ignore_killed_tuples &&
-				 (ItemIdIsDead(PageGetItemId(page, offnum)))))
+				(ignore_killed_tuples && (ItemIdIsDead(PageGetItemId(page, offnum)))))
 			{
 				offnum = OffsetNumberPrev(offnum);	/* move back */
 				continue;
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index edb6fa968f..00274f7c09 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -14,6 +14,7 @@
 */
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/hash.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -547,6 +548,7 @@ _hash_kill_items(IndexScanDesc scan)
 	int			numKilled = so->numKilled;
 	int			i;
 	bool		killedsomething = false;
+	bool		dirty = false;
 	bool		havePin = false;
 
 	Assert(so->numKilled > 0);
@@ -559,6 +561,15 @@ _hash_kill_items(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
+	/*
+	 * The standby was promoted after the start of the current transaction.
+	 * Not required for correctness, but it is better to just skip everything.
+	 */
+	if (scan->xactStartedInRecovery && !RecoveryInProgress())
+	{
+		return;
+	}
+
 	blkno = so->currPos.currPage;
 	if (HashScanPosIsPinned(so->currPos))
 	{
@@ -577,6 +588,23 @@ _hash_kill_items(IndexScanDesc scan)
 	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	if (H_LP_SAFE_ON_STANDBY(opaque) && !scan->xactStartedInRecovery)
+	{
+		/* The server seems to have been promoted some time ago;
+		 * clear the flag just for accuracy. */
+		opaque->hasho_flag &= ~LH_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+	else if (!H_LP_SAFE_ON_STANDBY(opaque) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by the primary. Clear them
+		 * and allow the standby to set its own. */
+		mask_lp_dead(page);
+		pg_memory_barrier();
+		opaque->hasho_flag |= LH_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+
 	for (i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
@@ -596,7 +624,7 @@ _hash_kill_items(IndexScanDesc scan)
 			{
 				/* found the item */
 				ItemIdMarkDead(iid);
-				killedsomething = true;
+				killedsomething = dirty = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
@@ -611,6 +639,9 @@ _hash_kill_items(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->hasho_flag |= LH_PAGE_HAS_DEAD_TUPLES;
+	}
+	if (dirty)
+	{
 		MarkBufferDirtyHint(buf, true);
 	}
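The clear-then-claim dance just above appears nearly verbatim in gistkillitems(), _hash_kill_items() and _bt_killitems(). A sketch of how the three copies could share one helper — everything here is hypothetical, not part of the patch, and flagsp/safe_bit stand for each AM's opaque flags word and its LP-safe bit (e.g. LH_LP_SAFE_ON_STANDBY):

    /*
     * Sketch only.  Returns true if the page was changed and the caller
     * must mark the buffer dirty.
     */
    static bool
    maintain_lp_safe_on_standby(Page page, uint16 *flagsp, uint16 safe_bit,
                                bool xactStartedInRecovery)
    {
        if ((*flagsp & safe_bit) && !xactStartedInRecovery)
        {
            /* Promoted some time ago; clear the flag just for accuracy. */
            *flagsp &= ~safe_bit;
            return true;
        }
        if (!(*flagsp & safe_bit) && xactStartedInRecovery)
        {
            /* LP_DEAD bits came from the primary: wipe them, then publish
             * the page as carrying standby-safe bits only. */
            mask_lp_dead(page);
            pg_memory_barrier();
            *flagsp |= safe_bit;
            return true;
        }
        return false;
    }

The pg_memory_barrier() keeps the wipe ordered before the flag store.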
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 98230aac49..a1ecd704ba 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1698,9 +1698,11 @@ heap_fetch(Relation relation,
 * the tuple here, in addition to updating *tid. If no match is found, the
 * contents of this buffer on return are undefined.
 *
- * If all_dead is not NULL, we check non-visible tuples to see if they are
- * globally dead; *all_dead is set true if all members of the HOT chain
- * are vacuumable, false if not.
+ * If deadness is not NULL, we check non-visible tuples to see if they are
+ * globally dead; deadness->all_dead is set true if all members of the HOT
+ * chain are vacuumable, false if not.  deadness->latest_removed_xid is set
+ * to the latest removed xid in the HOT chain, if known, and
+ * deadness->page_lsn is set to the current page LSN.
 *
 * Unlike heap_fetch, the caller must already have pin and (at least) share
 * lock on the buffer; it is still pinned/locked at exit. Also unlike
@@ -1709,7 +1711,7 @@ heap_fetch(Relation relation,
 bool
 heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 					   Snapshot snapshot, HeapTuple heapTuple,
-					   bool *all_dead, bool first_call)
+					   TupleDeadnessData *deadness, bool first_call)
 {
 	Page		dp = (Page) BufferGetPage(buffer);
 	TransactionId prev_xmax = InvalidTransactionId;
@@ -1721,8 +1723,12 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 	GlobalVisState *vistest = NULL;
 
 	/* If this is not the first call, previous call returned a (live!) tuple */
-	if (all_dead)
-		*all_dead = first_call;
+	if (deadness)
+	{
+		deadness->all_dead = first_call;
+		deadness->latest_removed_xid = InvalidTransactionId;
+		deadness->page_lsn = PageGetLSN(dp);
+	}
 
 	blkno = ItemPointerGetBlockNumber(tid);
 	offnum = ItemPointerGetOffsetNumber(tid);
@@ -1755,6 +1761,13 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 				at_chain_start = false;
 				continue;
 			}
+			/*
+			 * Even if all items are dead, we cannot be sure of the
+			 * latest_removed_xid value: newer chain items could in theory
+			 * be vacuumed while older ones are not (pure paranoia, probably).
+			 */
+			if (deadness)
+				deadness->latest_removed_xid = InvalidTransactionId;
 			/* else must be end of chain */
 			break;
 		}
@@ -1804,8 +1817,11 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 			ItemPointerSetOffsetNumber(tid, offnum);
 			PredicateLockTID(relation, &heapTuple->t_self, snapshot,
 							 HeapTupleHeaderGetXmin(heapTuple->t_data));
-			if (all_dead)
-				*all_dead = false;
+			if (deadness)
+			{
+				deadness->all_dead = false;
+				deadness->latest_removed_xid = InvalidTransactionId;
+			}
 			return true;
 		}
 	}
@@ -1819,13 +1835,19 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer,
 	 * Note: if you change the criterion here for what is "dead", fix the
 	 * planner's get_actual_variable_range() function to match.
 	 */
-	if (all_dead && *all_dead)
+	if (deadness && deadness->all_dead)
 	{
 		if (!vistest)
 			vistest = GlobalVisTestFor(relation);
 
 		if (!HeapTupleIsSurelyDead(heapTuple, vistest))
-			*all_dead = false;
+		{
+			deadness->all_dead = false;
+			deadness->latest_removed_xid = InvalidTransactionId;
+		}
+		else
+			HeapTupleHeaderAdvanceLatestRemovedXid(heapTuple->t_data,
+												   &deadness->latest_removed_xid);
 	}
 
 	/*
diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c
index 39ef8a0b77..b6bce376b7 100644
--- a/src/backend/access/heap/heapam_handler.c
+++ b/src/backend/access/heap/heapam_handler.c
@@ -113,7 +113,8 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 						 ItemPointer tid,
 						 Snapshot snapshot,
 						 TupleTableSlot *slot,
-						 bool *call_again, bool *all_dead)
+						 bool *call_again,
+						 TupleDeadnessData *deadness)
 {
 	IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
 	BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
@@ -145,7 +146,7 @@ heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
 											hscan->xs_cbuf,
 											snapshot,
 											&bslot->base.tupdata,
-											all_dead,
+											deadness,
 											!*call_again);
 		bslot->base.tupdata.t_self = *tid;
 		LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/access/index/genam.c b/src/backend/access/index/genam.c
index 98af5347b9..98653b2a4b 100644
--- a/src/backend/access/index/genam.c
+++ b/src/backend/access/index/genam.c
@@ -106,18 +106,18 @@ RelationGetIndexScan(Relation indexRelation, int nkeys, int norderbys)
 	scan->xs_want_itup = false; /* may be set later */
 
 	/*
-	 * During recovery we ignore killed tuples and don't bother to kill them
-	 * either. We do this because the xmin on the primary node could easily be
-	 * later than the xmin on the standby node, so that what the primary
-	 * thinks is killed is supposed to be visible on standby. So for correct
-	 * MVCC for queries during recovery we must ignore these hints and check
-	 * all tuples. Do *not* set ignore_killed_tuples to true when running in a
-	 * transaction that was started during recovery. xactStartedInRecovery
-	 * should not be altered by index AMs.
-	 */
+	 * For correct MVCC during recovery, queries could use index LP_DEAD bits
+	 * just as on the primary. But an index AM must consider that such bits
+	 * may arrive as part of an FPI. The xmin on the primary node could
+	 * easily be later than the xmin on the standby node, so what the primary
+	 * thinks is killed may still have to be visible on the standby.
+	 *
+	 * So, for correct MVCC during recovery, we must mask these FPI hints and
+	 * check all tuples until standby-safe hints are set.
+ */ scan->kill_prior_tuple = false; + scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr; scan->xactStartedInRecovery = TransactionStartedDuringRecovery(); - scan->ignore_killed_tuples = !scan->xactStartedInRecovery; scan->opaque = NULL; diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index fe80b8b0ba..5eeda12e71 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -309,6 +309,7 @@ index_rescan(IndexScanDesc scan, table_index_fetch_reset(scan->xs_heapfetch); scan->kill_prior_tuple = false; /* for safety */ + scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr; scan->xs_heap_continue = false; scan->indexRelation->rd_indam->amrescan(scan, keys, nkeys, @@ -386,6 +387,7 @@ index_restrpos(IndexScanDesc scan) table_index_fetch_reset(scan->xs_heapfetch); scan->kill_prior_tuple = false; /* for safety */ + scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr; scan->xs_heap_continue = false; scan->indexRelation->rd_indam->amrestrpos(scan); @@ -534,6 +536,7 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) /* Reset kill flag immediately for safety */ scan->kill_prior_tuple = false; + scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr; scan->xs_heap_continue = false; /* If we're out of index entries, we're done */ @@ -553,6 +556,61 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) return &scan->xs_heaptid; } +/* + * is_index_lp_dead_maybe_allowed + * + * Checks whether it allowed setting LP_DEAD hint bit for the tuple in the index. + * + * minLsn is used as output for LSN value that need to be compared with + * page LSN for decision in case of true as result value. + * + * if ->minLsn is InvalidXLogRecPtr then just return value taken into account. + */ +static bool +is_index_lp_dead_maybe_allowed(TupleDeadnessData *deadness, + XLogRecPtr *minLsn) +{ + *minLsn = InvalidXLogRecPtr; + if (!deadness->all_dead) + return false; + /* It is always allowed on primary if ->all_dead. */ + if (!RecoveryInProgress()) + return true; + + if (TransactionIdIsValid(deadness->latest_removed_xid)) { + /* + * If latest_removed_xid is known - make sure its commit record + * less than minRecoveryPoint to avoid MVCC failure after crash recovery. + */ + XLogRecPtr commitLSN + = TransactionIdGetCommitLSN(deadness->latest_removed_xid); + + if (XLogNeedsFlush(commitLSN)) + { + /* LSN not flushed - allow iff index LSN is greater. */ + *minLsn = commitLSN; + } + } else { + /* + * Looks like it is tuple cleared by heap_page_prune_execute, + * we must be sure if LSN of XLOG_HEAP2_PRUNE (or any subsequent + * updates) less than minRecoveryPoint to avoid MVCC failure + * after crash recovery. + * + * Another possible case is transaction rollback or tuple updated + * by inserting transaction. Such tuple never will be seen, so it + * is safe to set LP_DEAD. It is related to the logic of + * HeapTupleHeaderAdvanceLatestRemovedXid. + */ + if (XLogNeedsFlush(deadness->page_lsn)) + { + /* LSN not flushed - allow iff index LSN is greater. 
 /* ----------------
 *		index_fetch_heap - get the scan's next heap tuple
 *
@@ -574,12 +632,17 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction)
 bool
 index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 {
-	bool		all_dead = false;
-	bool		found;
+	TupleDeadnessData deadness;
+	bool		found;
+
+	deadness.all_dead = false;
+	deadness.latest_removed_xid = InvalidTransactionId;
+	deadness.page_lsn = InvalidXLogRecPtr;
 
 	found = table_index_fetch_tuple(scan->xs_heapfetch, &scan->xs_heaptid,
 									scan->xs_snapshot, slot,
-									&scan->xs_heap_continue, &all_dead);
+									&scan->xs_heap_continue,
+									&deadness);
 
 	if (found)
 		pgstat_count_heap_fetch(scan->indexRelation);
@@ -587,13 +650,12 @@ index_fetch_heap(IndexScanDesc scan, TupleTableSlot *slot)
 	/*
 	 * If we scanned a whole HOT chain and found only dead tuples, tell index
 	 * AM to kill its entry for that TID (this will take effect in the next
-	 * amgettuple call, in index_getnext_tid). We do not do this when in
-	 * recovery because it may violate MVCC to do so. See comments in
-	 * RelationGetIndexScan().
+	 * amgettuple call, in index_getnext_tid). During recovery we do this
+	 * only under certain conditions, because it may otherwise violate MVCC.
 	 */
-	if (!scan->xactStartedInRecovery)
-		scan->kill_prior_tuple = all_dead;
-
+	scan->kill_prior_tuple =
+		is_index_lp_dead_maybe_allowed(&deadness,
+									   &scan->kill_prior_tuple_min_lsn);
 	return found;
 }
 
@@ -667,6 +729,7 @@ index_getbitmap(IndexScanDesc scan, TIDBitmap *bitmap)
 
 	/* just make sure this is false... */
 	scan->kill_prior_tuple = false;
+	scan->kill_prior_tuple_min_lsn = InvalidXLogRecPtr;
 
 	/*
 	 * have the am's getbitmap proc do all the work.
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 62746c4721..cdde00ce58 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -15,6 +15,7 @@
 
 #include "postgres.h"
 
+#include "access/bufmask.h"
 #include "access/nbtree.h"
 #include "access/nbtxlog.h"
 #include "access/transam.h"
@@ -503,7 +504,11 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 			if (inposting || !ItemIdIsDead(curitemid))
 			{
 				ItemPointerData htid;
-				bool		all_dead = false;
+				TupleDeadnessData deadness;
+
+				deadness.all_dead = false;
+				deadness.latest_removed_xid = InvalidTransactionId;
+				deadness.page_lsn = InvalidXLogRecPtr;
 
 				if (!inposting)
 				{
@@ -557,7 +562,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 */
 				else if (table_index_fetch_tuple_check(heapRel, &htid,
 													   &SnapshotDirty,
-													   &all_dead))
+													   &deadness))
 				{
 					TransactionId xwait;
 
@@ -671,8 +676,8 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 									 RelationGetRelationName(rel))));
 					}
 				}
-				else if (all_dead && (!inposting ||
-									  (prevalldead &&
+				else if (deadness.all_dead && (!inposting ||
+											   (prevalldead &&
 									   curposti == BTreeTupleGetNPosting(curitup) - 1)))
 				{
 					/*
@@ -680,6 +685,13 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 					 * all posting list TIDs) is dead to everyone, so mark the
 					 * index entry killed.
 					 */
+					Assert(!RecoveryInProgress());
+					if (P_LP_SAFE_ON_STANDBY(opaque))
+					{
+						/* The server seems to have been promoted some time
+						 * ago; clear the flag just for accuracy. */
+						opaque->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+					}
 					ItemIdMarkDead(curitemid);
 					opaque->btpo_flags |= BTP_HAS_GARBAGE;
 
@@ -697,7 +709,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
 				 * Remember if posting list tuple has even a single HOT chain
 				 * whose members are not all dead
 				 */
-				if (!all_dead && inposting)
+				if (!deadness.all_dead && inposting)
 					prevalldead = false;
 			}
 		}
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 13024af2fa..a987521f11 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -245,7 +245,9 @@ btgettuple(IndexScanDesc scan, ScanDirection dir)
 			/*
 			 * Check to see if we should kill the previously-fetched tuple.
 			 */
-			if (scan->kill_prior_tuple)
+			if (scan->kill_prior_tuple &&
+				(XLogRecPtrIsInvalid(scan->kill_prior_tuple_min_lsn) ||
+				 scan->kill_prior_tuple_min_lsn < so->currPos.lsn))
 			{
 				/*
 				 * Yes, remember it for later. (We'll deal with all such
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 9d82d4904d..13803f33ec 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -1528,6 +1528,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	int			itemIndex;
 	bool		continuescan;
 	int			indnatts;
+	bool		ignore_killed_tuples;
 
 	/*
 	 * We must have the buffer pinned and locked, but the usual macro can't be
@@ -1581,6 +1582,15 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 	 */
 	Assert(BTScanPosIsPinned(so->currPos));
 
+	/*
+	 * Check whether it is allowed to see LP_DEAD bits: always true on the
+	 * primary; on a standby we must avoid flags that were set by the
+	 * primary. After a promotion, xactStartedInRecovery may still be true
+	 * on the primary, in which case the old standby-safe bits are used (the
+	 * case of an old transaction in a promoted server).
+	 */
+	ignore_killed_tuples = !scan->xactStartedInRecovery ||
+		P_LP_SAFE_ON_STANDBY(opaque);
 	if (ScanDirectionIsForward(dir))
 	{
 		/* load items[] in ascending order */
@@ -1597,7 +1607,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			 * If the scan specifies not to return killed tuples, then we
 			 * treat a killed tuple as not passing the qual
 			 */
-			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			if (ignore_killed_tuples && ItemIdIsDead(iid))
 			{
 				offnum = OffsetNumberNext(offnum);
 				continue;
@@ -1697,7 +1707,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
 			 * uselessly advancing to the page to the left. This is similar
 			 * to the high key optimization used by forward scans.
 			 */
-			if (scan->ignore_killed_tuples && ItemIdIsDead(iid))
+			if (ignore_killed_tuples && ItemIdIsDead(iid))
 			{
 				Assert(offnum >= P_FIRSTDATAKEY(opaque));
 				if (offnum > P_FIRSTDATAKEY(opaque))
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index ed67863c56..72b0fabe58 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -17,6 +17,7 @@
 
 #include <time.h>
 
+#include "access/bufmask.h"
 #include "access/nbtree.h"
 #include "access/reloptions.h"
 #include "access/relscan.h"
@@ -1725,6 +1726,7 @@ _bt_killitems(IndexScanDesc scan)
 	int			i;
 	int			numKilled = so->numKilled;
 	bool		killedsomething = false;
+	bool		dirty = false;
 	bool		droppedpin PG_USED_FOR_ASSERTS_ONLY;
 
 	Assert(BTScanPosIsValid(so->currPos));
@@ -1735,6 +1737,15 @@ _bt_killitems(IndexScanDesc scan)
 	 */
 	so->numKilled = 0;
 
+	/*
+	 * The standby was promoted after the start of the current transaction.
+	 * Not required for correctness, but it is better to just skip everything.
+	 */
+	if (scan->xactStartedInRecovery && !RecoveryInProgress())
+	{
+		return;
+	}
+
 	if (BTScanPosIsPinned(so->currPos))
 	{
 		/*
@@ -1771,6 +1782,23 @@ _bt_killitems(IndexScanDesc scan)
 	minoff = P_FIRSTDATAKEY(opaque);
 	maxoff = PageGetMaxOffsetNumber(page);
 
+	if (P_LP_SAFE_ON_STANDBY(opaque) && !scan->xactStartedInRecovery)
+	{
+		/* The server seems to have been promoted some time ago;
+		 * clear the flag just for accuracy. */
+		opaque->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+	else if (!P_LP_SAFE_ON_STANDBY(opaque) && scan->xactStartedInRecovery)
+	{
+		/* LP_DEAD flags were set by the primary. Clear them
+		 * and allow the standby to set its own. */
+		mask_lp_dead(page);
+		pg_memory_barrier();
+		opaque->btpo_flags |= BTP_LP_SAFE_ON_STANDBY;
+		dirty = true;
+	}
+
 	for (i = 0; i < numKilled; i++)
 	{
 		int			itemIndex = so->killedItems[i];
@@ -1866,7 +1894,7 @@ _bt_killitems(IndexScanDesc scan)
 			{
 				/* found the item/all posting list items */
 				ItemIdMarkDead(iid);
-				killedsomething = true;
+				killedsomething = dirty = true;
 				break;			/* out of inner search loop */
 			}
 			offnum = OffsetNumberNext(offnum);
@@ -1883,6 +1911,9 @@ _bt_killitems(IndexScanDesc scan)
 	if (killedsomething)
 	{
 		opaque->btpo_flags |= BTP_HAS_GARBAGE;
+	}
+	if (dirty)
+	{
 		MarkBufferDirtyHint(so->currPos.buf, true);
 	}
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 611f412ba8..68330f6498 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -1083,6 +1083,21 @@ btree_xlog_cleanup(void)
 	opCtx = NULL;
 }
 
+/*
+ * Mark a btree page's LP_DEAD bits as unsafe for use on the standby.
+ */
+void
+btree_fpi_mask(char *pagedata, BlockNumber blkno)
+{
+	Page		page = (Page) pagedata;
+	BTPageOpaque maskopaq = (BTPageOpaque) PageGetSpecialPointer(page);
+
+	if (P_ISLEAF(maskopaq))
+	{
+		maskopaq->btpo_flags &= ~BTP_LP_SAFE_ON_STANDBY;
+	}
+}
+
 /*
 * Mask a btree page before performing consistency checks on it.
 */
@@ -1092,6 +1107,7 @@ btree_mask(char *pagedata, BlockNumber blkno)
 	Page		page = (Page) pagedata;
 	BTPageOpaque maskopaq;
 
+	btree_fpi_mask(pagedata, blkno);
 	mask_page_lsn_and_checksum(page);
 
 	mask_page_hint_bits(page);
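Every fpi_mask function added by this patch has the same shape: during redo it runs right after a full-page image has been restored (see the XLogReadBufferForRedoExtended() hunk below) and makes a single non-logged, idempotent change. A template for a hypothetical additional AM — FooPageOpaque, FooPageIsLeaf and FOO_LP_SAFE_ON_STANDBY are placeholders, not real symbols:

    /* Sketch only; compare btree_fpi_mask(), hash_fpi_mask(), gist_fpi_mask(). */
    void
    foo_fpi_mask(char *pagedata, BlockNumber blkno)
    {
        Page            page = (Page) pagedata;
        FooPageOpaque   opaque = (FooPageOpaque) PageGetSpecialPointer(page);

        /* Only leaf pages carry LP_DEAD hints worth distrusting. */
        if (FooPageIsLeaf(page))
            opaque->flags &= ~FOO_LP_SAFE_ON_STANDBY;
    }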
diff --git a/src/backend/access/table/tableam.c b/src/backend/access/table/tableam.c
index 4f20c6ac12..1b642f0cd0 100644
--- a/src/backend/access/table/tableam.c
+++ b/src/backend/access/table/tableam.c
@@ -219,7 +219,7 @@ bool
 table_index_fetch_tuple_check(Relation rel,
 							  ItemPointer tid,
 							  Snapshot snapshot,
-							  bool *all_dead)
+							  TupleDeadnessData *deadness)
 {
 	IndexFetchTableData *scan;
 	TupleTableSlot *slot;
@@ -229,7 +229,7 @@ table_index_fetch_tuple_check(Relation rel,
 	slot = table_slot_create(rel, NULL);
 	scan = table_index_fetch_begin(rel);
 	found = table_index_fetch_tuple(scan, tid, snapshot, slot, &call_again,
-									all_dead);
+									deadness);
 	table_index_fetch_end(scan);
 	ExecDropSingleTupleTableSlot(slot);
 
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index f8847d5aeb..0549d97b29 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -31,8 +31,8 @@
 #include "utils/relmapper.h"
 
 /* must be kept in sync with RmgrData definition in xlog_internal.h */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \
-	{ name, redo, desc, identify, startup, cleanup, mask, decode },
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,fpi_mask) \
+	{ name, redo, desc, identify, startup, cleanup, mask, decode, fpi_mask },
 
 const RmgrData RmgrTable[RM_MAX_ID + 1] = {
 #include "access/rmgrlist.h"
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 90e1c48390..89dca6da6c 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -352,6 +352,7 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 							  Buffer *buf)
 {
 	XLogRecPtr	lsn = record->EndRecPtr;
+	RmgrId		rmid = XLogRecGetRmid(record);
 	RelFileNode rnode;
 	ForkNumber	forknum;
 	BlockNumber blkno;
@@ -393,6 +394,11 @@ XLogReadBufferForRedoExtended(XLogReaderState *record,
 		if (!PageIsNew(page))
 		{
 			PageSetLSN(page, lsn);
+			/* If an FPI mask function is defined, apply it to the page. */
+			if (RmgrTable[rmid].rm_fpi_mask)
+			{
+				RmgrTable[rmid].rm_fpi_mask(page, blkno);
+			}
 		}
 
 		MarkBufferDirty(*buf);
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 87ac0f74b2..ec8f47aca7 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -1137,6 +1137,12 @@ standby_redo(XLogReaderState *record)
 		running.xids = xlrec->xids;
 
 		ProcArrayApplyRecoveryInfo(&running);
+		if (InHotStandby)
+		{
+			/* Move minRecoveryPoint forward to let the standby set hint
+			 * bits and index LP_DEAD bits more aggressively. */
+			XLogFlush(record->currRecPtr);
+		}
 	}
 	else if (info == XLOG_INVALIDATIONS)
 	{
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c
index f6cfee4ce8..4ac3ffc8c1 100644
--- a/src/bin/pg_rewind/parsexlog.c
+++ b/src/bin/pg_rewind/parsexlog.c
@@ -28,7 +28,7 @@
 * RmgrNames is an array of resource manager names, to make error messages
 * a bit nicer.
 
*/ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,fpi_mask) \ name, static const char *RmgrNames[RM_MAX_ID + 1] = { diff --git a/src/bin/pg_waldump/rmgrdesc.c b/src/bin/pg_waldump/rmgrdesc.c index 6a4ebd1310..65b7525c14 100644 --- a/src/bin/pg_waldump/rmgrdesc.c +++ b/src/bin/pg_waldump/rmgrdesc.c @@ -32,7 +32,7 @@ #include "storage/standbydefs.h" #include "utils/relmapper.h" -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,fpi_mask) \ { name, desc, identify}, const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = { diff --git a/src/include/access/bufmask.h b/src/include/access/bufmask.h index 7ce9f67eff..cac41968ed 100644 --- a/src/include/access/bufmask.h +++ b/src/include/access/bufmask.h @@ -28,5 +28,6 @@ extern void mask_page_hint_bits(Page page); extern void mask_unused_space(Page page); extern void mask_lp_flags(Page page); extern void mask_page_content(Page page); +extern void mask_lp_dead(Page page); #endif diff --git a/src/include/access/gist.h b/src/include/access/gist.h index a3337627b8..3939ef386b 100644 --- a/src/include/access/gist.h +++ b/src/include/access/gist.h @@ -50,6 +50,7 @@ #define F_FOLLOW_RIGHT (1 << 3) /* page to the right has no downlink */ #define F_HAS_GARBAGE (1 << 4) /* some tuples on the page are dead, * but not deleted yet */ +#define F_LP_SAFE_ON_STANDBY (1 << 5) /* LP bits are safe to use on standby */ /* * NSN (node sequence number) is a special-purpose LSN which is stored on each @@ -179,6 +180,10 @@ typedef struct GISTENTRY #define GistMarkPageHasGarbage(page) ( GistPageGetOpaque(page)->flags |= F_HAS_GARBAGE) #define GistClearPageHasGarbage(page) ( GistPageGetOpaque(page)->flags &= ~F_HAS_GARBAGE) +#define GistPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags & F_LP_SAFE_ON_STANDBY) +#define GistMarkPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags |= F_LP_SAFE_ON_STANDBY) +#define GistClearPageHasLpSafeOnStandby(page) ( GistPageGetOpaque(page)->flags &= ~F_LP_SAFE_ON_STANDBY) + #define GistFollowRight(page) ( GistPageGetOpaque(page)->flags & F_FOLLOW_RIGHT) #define GistMarkFollowRight(page) ( GistPageGetOpaque(page)->flags |= F_FOLLOW_RIGHT) #define GistClearFollowRight(page) ( GistPageGetOpaque(page)->flags &= ~F_FOLLOW_RIGHT) diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h index 4537e67eba..c46d20e9b6 100644 --- a/src/include/access/gistxlog.h +++ b/src/include/access/gistxlog.h @@ -110,5 +110,6 @@ extern const char *gist_identify(uint8 info); extern void gist_xlog_startup(void); extern void gist_xlog_cleanup(void); extern void gist_mask(char *pagedata, BlockNumber blkno); +extern void gist_fpi_mask(char *pagedata, BlockNumber blkno); #endif diff --git a/src/include/access/hash.h b/src/include/access/hash.h index cd7b2a53d8..91fe12a043 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -59,6 +59,7 @@ typedef uint32 Bucket; #define LH_BUCKET_BEING_SPLIT (1 << 5) #define LH_BUCKET_NEEDS_SPLIT_CLEANUP (1 << 6) #define LH_PAGE_HAS_DEAD_TUPLES (1 << 7) +#define LH_LP_SAFE_ON_STANDBY (1 << 8) #define LH_PAGE_TYPE \ (LH_OVERFLOW_PAGE | LH_BUCKET_PAGE | LH_BITMAP_PAGE | LH_META_PAGE) @@ -89,6 +90,7 @@ typedef HashPageOpaqueData *HashPageOpaque; #define H_BUCKET_BEING_SPLIT(opaque) (((opaque)->hasho_flag & LH_BUCKET_BEING_SPLIT) != 0) #define H_BUCKET_BEING_POPULATED(opaque) 
(((opaque)->hasho_flag & LH_BUCKET_BEING_POPULATED) != 0) #define H_HAS_DEAD_TUPLES(opaque) (((opaque)->hasho_flag & LH_PAGE_HAS_DEAD_TUPLES) != 0) +#define H_LP_SAFE_ON_STANDBY(opaque) (((opaque)->hasho_flag & LH_LP_SAFE_ON_STANDBY) != 0) /* * The page ID is for the convenience of pg_filedump and similar utilities, diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h index 59230706bb..77bd27cf0f 100644 --- a/src/include/access/hash_xlog.h +++ b/src/include/access/hash_xlog.h @@ -263,5 +263,6 @@ extern void hash_redo(XLogReaderState *record); extern void hash_desc(StringInfo buf, XLogReaderState *record); extern const char *hash_identify(uint8 info); extern void hash_mask(char *pagedata, BlockNumber blkno); +extern void hash_fpi_mask(char *pagedata, BlockNumber blkno); #endif /* HASH_XLOG_H */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 0ad87730e1..3c45de2f60 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -136,7 +136,7 @@ extern bool heap_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf); extern bool heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, Snapshot snapshot, HeapTuple heapTuple, - bool *all_dead, bool first_call); + TupleDeadnessData *deadness, bool first_call); extern void heap_get_latest_tid(TableScanDesc scan, ItemPointer tid); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 9fec6fb1a8..cbd6b003ce 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -80,6 +80,7 @@ typedef BTPageOpaqueData *BTPageOpaque; #define BTP_HAS_GARBAGE (1 << 6) /* page has LP_DEAD tuples (deprecated) */ #define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */ #define BTP_HAS_FULLXID (1 << 8) /* contains BTDeletedPageData */ +#define BTP_LP_SAFE_ON_STANDBY (1 << 9) /* LP bits are safe to use on standby */ /* * The max allowed value of a cycle ID is a bit less than 64K. 
This is
@@ -225,6 +226,7 @@ typedef struct BTMetaPageData
 #define P_HAS_GARBAGE(opaque)	(((opaque)->btpo_flags & BTP_HAS_GARBAGE) != 0)
 #define P_INCOMPLETE_SPLIT(opaque)	(((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0)
 #define P_HAS_FULLXID(opaque)	(((opaque)->btpo_flags & BTP_HAS_FULLXID) != 0)
+#define P_LP_SAFE_ON_STANDBY(opaque)	(((opaque)->btpo_flags & BTP_LP_SAFE_ON_STANDBY) != 0)
 
 /*
 * BTDeletedPageData is the page contents of a deleted page
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index de362d3cb9..e13a6f1b6e 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -347,5 +347,6 @@ extern const char *btree_identify(uint8 info);
 extern void btree_xlog_startup(void);
 extern void btree_xlog_cleanup(void);
 extern void btree_mask(char *pagedata, BlockNumber blkno);
+extern void btree_fpi_mask(char *pagedata, BlockNumber blkno);
 
 #endif							/* NBTXLOG_H */
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index 53a93ccbe7..55f138cae0 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -126,9 +126,10 @@ typedef struct IndexScanDescData
 
 	/* signaling to index AM about killing index tuples */
 	bool		kill_prior_tuple;	/* last-returned tuple is dead */
-	bool		ignore_killed_tuples;	/* do not return killed entries */
-	bool		xactStartedInRecovery;	/* prevents killing/seeing killed
-										 * tuples */
+	XLogRecPtr	kill_prior_tuple_min_lsn;	/* the hint applies only if the
+											 * index page LSN exceeds this */
+	bool		xactStartedInRecovery;	/* prevents trusting LP_DEAD bits
+										 * set by the primary */
 
 	/* index access method's private state */
 	void	   *opaque;			/* access-method-specific info */
@@ -188,4 +189,12 @@ typedef struct SysScanDescData
 	struct TupleTableSlot *slot;
 } SysScanDescData;
 
+/* Data about the deadness of a tuple, for index hint-bit decisions */
+typedef struct TupleDeadnessData
+{
+	bool		all_dead;		/* guaranteed not visible to any backend */
+	TransactionId latest_removed_xid;	/* latest removed xid, if known */
+	XLogRecPtr	page_lsn;		/* LSN of the page holding the dead tuple */
+} TupleDeadnessData;
+
 #endif							/* RELSCAN_H */
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index d9b512630c..6143bf5454 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -19,7 +19,7 @@ typedef uint8 RmgrId;
 * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG
 * file format.
 */
-#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode) \
+#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask,decode,fpi_mask) \
 	symname,
 
 typedef enum RmgrIds
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index 9a74721c97..173956ca1c 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -24,26 +24,26 @@
 * Changes to this list possibly need an XLOG_PAGE_MAGIC bump.
 
*/ -/* symbol name, textual name, redo, desc, identify, startup, cleanup */ -PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode) -PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode) -PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, standby_decode) -PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, heap2_decode) -PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, heap_decode) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, NULL) -PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, NULL) -PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL) -PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, NULL) -PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL) -PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL) -PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL) -PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL) -PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL) -PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode) +/* symbol name, textual name, redo, desc, identify, startup, cleanup, mask, decode, fpi_mask */ +PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL, xlog_decode, NULL) +PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL, xact_decode, NULL) +PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL, standby_decode, NULL) +PG_RMGR(RM_HEAP2_ID, 
"Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask, heap2_decode, NULL) +PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask, heap_decode, NULL) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, btree_xlog_startup, btree_xlog_cleanup, btree_mask, NULL, btree_fpi_mask) +PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, hash_mask, NULL, hash_fpi_mask) +PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask, NULL, NULL) +PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask, NULL, gist_fpi_mask) +PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask, NULL, NULL) +PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask, NULL, NULL) +PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask, NULL, NULL) +PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL, NULL, NULL) +PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask, NULL, NULL) +PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL, logicalmsg_decode, NULL) diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h index bb365736b7..bcd99e0242 100644 --- a/src/include/access/tableam.h +++ b/src/include/access/tableam.h @@ -425,7 +425,7 @@ typedef struct TableAmRoutine * needs to be set to true by index_fetch_tuple, signaling to the caller * that index_fetch_tuple should be called again for the same tid. * - * *all_dead, if all_dead is not NULL, should be set to true by + * *deadness, if value is not NULL, should be filled by * index_fetch_tuple iff it is guaranteed that no backend needs to see * that tuple. Index AMs can use that to avoid returning that tid in * future searches. @@ -434,7 +434,8 @@ typedef struct TableAmRoutine ItemPointer tid, Snapshot snapshot, TupleTableSlot *slot, - bool *call_again, bool *all_dead); + bool *call_again, + TupleDeadnessData *deadness); /* ------------------------------------------------------------------------ @@ -1196,7 +1197,7 @@ table_index_fetch_end(struct IndexFetchTableData *scan) * will be set to true, signaling that table_index_fetch_tuple() should be called * again for the same tid. * - * *all_dead, if all_dead is not NULL, will be set to true by + * *deadness, if value is not NULL, will be filled by * table_index_fetch_tuple() iff it is guaranteed that no backend needs to see * that tuple. Index AMs can use that to avoid returning that tid in future * searches. 
@@ -1213,7 +1214,8 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 							ItemPointer tid,
 							Snapshot snapshot,
 							TupleTableSlot *slot,
-							bool *call_again, bool *all_dead)
+							bool *call_again,
+							TupleDeadnessData *deadness)
 {
 	/*
 	 * We don't expect direct calls to table_index_fetch_tuple with valid
@@ -1225,7 +1227,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 
 	return scan->rel->rd_tableam->index_fetch_tuple(scan, tid, snapshot,
 													slot, call_again,
-													all_dead);
+													deadness);
 }
 
 /*
@@ -1237,7 +1239,7 @@ table_index_fetch_tuple(struct IndexFetchTableData *scan,
 extern bool table_index_fetch_tuple_check(Relation rel,
 										  ItemPointer tid,
 										  Snapshot snapshot,
-										  bool *all_dead);
+										  TupleDeadnessData *deadness);
 
 
 /* ------------------------------------------------------------------------
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 849954a8e5..de5134c4fc 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -304,6 +304,9 @@ struct XLogRecordBuffer;
 * rm_mask takes as input a page modified by the resource manager and masks
 * out bits that shouldn't be flagged by wal_consistency_checking.
 *
+ * rm_fpi_mask takes a restored full-page image and applies non-logged,
+ * AM-specific changes, e.g. marking index LP_DEAD bits as unsafe on standby.
+ *
 * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h).
 */
typedef struct RmgrData
@@ -317,6 +320,7 @@ typedef struct RmgrData
 	void		(*rm_mask) (char *pagedata, BlockNumber blkno);
 	void		(*rm_decode) (struct LogicalDecodingContext *ctx,
 							  struct XLogRecordBuffer *buf);
+	void		(*rm_fpi_mask) (char *pagedata, BlockNumber blkno);
 } RmgrData;
 
 extern const RmgrData RmgrTable[];
-- 
2.33.1