From 983a660f8847d5f7ed6359c0718d58ae3e9c120b Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot
Date: Fri, 31 Mar 2023 08:21:16 +0000
Subject: [PATCH v54 2/7] Add info in WAL records in preparation for logical
 slot conflict handling.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Overall design:

1. We want to enable logical decoding on standbys, but replay of WAL
   from the primary might remove data that is needed by logical
   decoding, causing errors on the standby.  To prevent those errors, a
   new replication conflict scenario needs to be addressed, just as hot
   standby already addresses the existing ones.

2. Our chosen strategy for dealing with this type of replication
   conflict is to invalidate logical slots for which needed data has
   been removed.

3. To do this we need the latestRemovedXid for each change, just as we
   do for physical replication conflicts, but we also need to know
   whether any particular change was to data that logical replication
   might access.  That way, during WAL replay, we know when there is a
   risk of conflict and, if so, whether there actually is one.

4. We can't rely on the standby's relcache entries for this purpose in
   any way, because the startup process can't access catalog contents.

5. Therefore every WAL record that potentially removes data from the
   index or heap must carry a flag indicating whether or not it is one
   that might be accessed during logical decoding.

Why do we need this for logical decoding on standby?

First, let's forget about logical decoding on standby and recall that,
on a primary database, any catalog rows that may be needed by a logical
decoding replication slot are not removed.  This is ensured by the
catalog_xmin associated with the logical replication slot.

With logical decoding on standby, however, in the following cases:

- hot_standby_feedback is off;
- hot_standby_feedback is on but there is no physical slot between the
  primary and the standby (hot_standby_feedback then works, but only
  while the connection is alive; for example, a node restart would
  break it);

the primary may delete system catalog rows that could be needed by
logical decoding on the standby, as it does not know about the
catalog_xmin on the standby.

So it is mandatory to identify those rows and invalidate the slots that
may need them, if any.  Identifying those rows is the purpose of this
commit.

Implementation:

When WAL replay on the standby indicates that a catalog table tuple is
to be deleted by an xid that is greater than a logical slot's
catalog_xmin, that means the slot's catalog_xmin conflicts with the
xid, and we need to handle the conflict.  While subsequent commits will
do the actual conflict handling, this commit adds a new field
isCatalogRel to such WAL records (and a new bit to the xl_heap_visible
flags field) that is true for catalog tables, so as to arrange for
conflict handling; see the sketches below.

The affected WAL records are the ones that already contain the
snapshotConflictHorizon field, namely:

- gistxlogDelete
- gistxlogPageReuse
- xl_hash_vacuum_one_page
- xl_heap_prune
- xl_heap_freeze_page
- xl_heap_visible
- xl_btree_reuse_page
- xl_btree_delete
- spgxlogVacuumRedirect

Due to this new field being added, xl_hash_vacuum_one_page and
gistxlogDelete now carry the offsets to be deleted as a
FLEXIBLE_ARRAY_MEMBER.  This is needed to ensure correct alignment of
the offset array, as illustrated in the layout sketch below.  It is not
needed in the other structs to which isCatalogRel has been added.
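
To illustrate the alignment point, here is the new gistxlogDelete
layout (byte offsets assume the usual 4-byte TransactionId and 2-byte
OffsetNumber and are shown for illustration only):

    typedef struct gistxlogDelete
    {
        TransactionId snapshotConflictHorizon;      /* bytes 0..3 */
        uint16        ntodelete;                    /* bytes 4..5 */
        bool          isCatalogRel;                 /* byte  6    */

        /*
         * The old scheme appended the offsets immediately after the
         * struct, which would now mean byte 7: a misaligned address for
         * an OffsetNumber (uint16).  With a flexible array member, the
         * compiler places the array at the next suitably aligned offset
         * (byte 8), and SizeOfGistxlogDelete =
         * offsetof(gistxlogDelete, offsets) accounts for the padding
         * automatically.
         */
        OffsetNumber  offsets[FLEXIBLE_ARRAY_MEMBER];   /* bytes 8.. */
    } gistxlogDelete;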
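
For xl_heap_visible, a spare bit in the existing flags field is used
instead of a new struct member.  Condensed from the diff below, the two
sides look like this; note that the redo side must mask the XLOG-only
bit out before calling visibilitymap_set(), which only accepts
VISIBILITYMAP_VALID_BITS:

    /* Insert side, log_heap_visible(): tag records for catalog rels. */
    xlrec.flags = vmflags;
    if (RelationIsAccessibleInLogicalDecoding(rel))
        xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL;

    /* Redo side, heap_xlog_visible(): keep only the visibilitymap bits. */
    vmbits = (xlrec->flags & VISIBILITYMAP_VALID_BITS);
    visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
                      xlrec->snapshotConflictHorizon, vmbits);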
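
The check that later patches in this series can build on top of the new
flag might look roughly like the following.  This is a simplified,
hypothetical sketch only: check_logical_decoding_conflict(),
invalidate_conflicting_slot() and the slot_catalog_xmin parameter are
invented for illustration and are not code from this series.

    /* Hypothetical illustration only -- not part of this patch. */
    extern void invalidate_conflicting_slot(void);      /* placeholder */

    static void
    check_logical_decoding_conflict(xl_heap_prune *xlrec,
                                    TransactionId slot_catalog_xmin)
    {
        /* Rows of non-catalog tables are never needed by logical decoding. */
        if (!xlrec->isCatalogRel)
            return;

        /*
         * The record removes rows deleted by xids up to
         * snapshotConflictHorizon.  If that horizon has reached the
         * slot's catalog_xmin, catalog rows the slot may still need are
         * gone: the slot conflicts and must be invalidated.
         */
        if (TransactionIdIsValid(xlrec->snapshotConflictHorizon) &&
            TransactionIdFollowsOrEquals(xlrec->snapshotConflictHorizon,
                                         slot_catalog_xmin))
            invalidate_conflicting_slot();
    }
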
Author: Andres Freund (in an older version), Amit Khandekar, Bertrand Drouvot
Reviewed-By: Bertrand Drouvot, Andres Freund, Robert Haas, Fabrizio de Royes Mello, Melanie Plageman
---
 src/backend/access/gist/gistxlog.c     | 12 ++++--------
 src/backend/access/hash/hash_xlog.c    | 12 +++---------
 src/backend/access/hash/hashinsert.c   |  1 +
 src/backend/access/heap/heapam.c       |  9 ++++++++-
 src/backend/access/heap/pruneheap.c    |  1 +
 src/backend/access/nbtree/nbtpage.c    |  2 ++
 src/backend/access/spgist/spgvacuum.c  |  1 +
 src/include/access/gistxlog.h          | 11 ++++++++---
 src/include/access/hash_xlog.h         |  8 +++++---
 src/include/access/heapam_xlog.h       |  8 ++++++--
 src/include/access/nbtxlog.h           |  8 ++++++--
 src/include/access/spgxlog.h           |  2 ++
 src/include/access/visibilitymapdefs.h | 17 +++++++++++++----
 13 files changed, 60 insertions(+), 32 deletions(-)
 10.1% src/backend/access/gist/
  8.6% src/backend/access/hash/
 10.4% src/backend/access/heap/
  3.2% src/backend/access/nbtree/
 65.9% src/include/access/

diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 4b52719765..b7678f3c14 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -177,6 +177,7 @@ gistRedoDeleteRecord(XLogReaderState *record)
 	gistxlogDelete *xldata = (gistxlogDelete *) XLogRecGetData(record);
 	Buffer		buffer;
 	Page		page;
+	OffsetNumber *toDelete = xldata->offsets;
 
 	/*
 	 * If we have any conflict processing to do, it must happen before we
@@ -203,14 +204,7 @@ gistRedoDeleteRecord(XLogReaderState *record)
 	{
 		page = (Page) BufferGetPage(buffer);
 
-		if (XLogRecGetDataLen(record) > SizeOfGistxlogDelete)
-		{
-			OffsetNumber *todelete;
-
-			todelete = (OffsetNumber *) ((char *) xldata + SizeOfGistxlogDelete);
-
-			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
-		}
+		PageIndexMultiDelete(page, toDelete, xldata->ntodelete);
 
 		GistClearPageHasGarbage(page);
 		GistMarkTuplesDeleted(page);
@@ -609,6 +603,7 @@ gistXLogPageReuse(Relation rel, Relation heaprel,
 	 */
 
 	/* XLOG stuff */
+	xlrec_reuse.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel);
 	xlrec_reuse.locator = rel->rd_locator;
 	xlrec_reuse.block = blkno;
 	xlrec_reuse.snapshotConflictHorizon = deleteXid;
@@ -678,6 +673,7 @@ gistXLogDelete(Buffer buffer, OffsetNumber *todelete, int ntodelete,
 	gistxlogDelete xlrec;
 	XLogRecPtr	recptr;
 
+	xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel);
 	xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
 	xlrec.ntodelete = ntodelete;
 
diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c
index f38b42efb9..08ceb91288 100644
--- a/src/backend/access/hash/hash_xlog.c
+++ b/src/backend/access/hash/hash_xlog.c
@@ -980,8 +980,10 @@ hash_xlog_vacuum_one_page(XLogReaderState *record)
 	Page		page;
 	XLogRedoAction action;
 	HashPageOpaque pageopaque;
+	OffsetNumber *toDelete;
 
 	xldata = (xl_hash_vacuum_one_page *) XLogRecGetData(record);
+	toDelete = xldata->offsets;
 
 	/*
 	 * If we have any conflict processing to do, it must happen before we
@@ -1010,15 +1012,7 @@ hash_xlog_vacuum_one_page(XLogReaderState *record)
 	{
 		page = (Page) BufferGetPage(buffer);
 
-		if (XLogRecGetDataLen(record) > SizeOfHashVacuumOnePage)
-		{
-			OffsetNumber *unused;
-
-			unused = (OffsetNumber *) ((char *) xldata + SizeOfHashVacuumOnePage);
-
-			PageIndexMultiDelete(page, unused, xldata->ntuples);
-		}
-
+		PageIndexMultiDelete(page, toDelete, xldata->ntuples);
 		/*
 		 * Mark the page as not containing any LP_DEAD items. See comments in
 		 * _hash_vacuum_one_page() for details.
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index a604e31891..22656b24e2 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -432,6 +432,7 @@ _hash_vacuum_one_page(Relation rel, Relation hrel, Buffer metabuf, Buffer buf)
 		xl_hash_vacuum_one_page xlrec;
 		XLogRecPtr	recptr;
 
+		xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(hrel);
 		xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
 		xlrec.ntuples = ndeletable;
 
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 9662e38254..08546b35c7 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -6698,6 +6698,7 @@ heap_freeze_execute_prepared(Relation rel, Buffer buffer,
 	nplans = heap_log_freeze_plan(tuples, ntuples, plans, offsets);
 
 	xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
+	xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(rel);
 	xlrec.nplans = nplans;
 
 	XLogBeginInsert();
@@ -8280,6 +8281,8 @@ log_heap_visible(Relation rel, Buffer heap_buffer, Buffer vm_buffer,
 
 	xlrec.snapshotConflictHorizon = snapshotConflictHorizon;
 	xlrec.flags = vmflags;
+	if (RelationIsAccessibleInLogicalDecoding(rel))
+		xlrec.flags |= VISIBILITYMAP_XLOG_CATALOG_REL;
 	XLogBeginInsert();
 	XLogRegisterData((char *) &xlrec, SizeOfHeapVisible);
 
@@ -8870,6 +8873,8 @@ heap_xlog_visible(XLogReaderState *record)
 	BlockNumber blkno;
 	XLogRedoAction action;
 
+	Assert((xlrec->flags & VISIBILITYMAP_XLOG_VALID_BITS) == xlrec->flags);
+
 	XLogRecGetBlockTag(record, 1, &rlocator, NULL, &blkno);
 
 	/*
@@ -8956,6 +8961,7 @@ heap_xlog_visible(XLogReaderState *record)
 	{
 		Page		vmpage = BufferGetPage(vmbuffer);
 		Relation	reln;
+		uint8		vmbits;
 
 		/* initialize the page if it was read as zeros */
 		if (PageIsNew(vmpage))
@@ -8969,9 +8975,10 @@ heap_xlog_visible(XLogReaderState *record)
 		reln = CreateFakeRelcacheEntry(rlocator);
 		visibilitymap_pin(reln, blkno, &vmbuffer);
 
+		vmbits = (xlrec->flags & VISIBILITYMAP_VALID_BITS);
 		visibilitymap_set(reln, blkno, InvalidBuffer, lsn, vmbuffer,
-						  xlrec->snapshotConflictHorizon, xlrec->flags);
+						  xlrec->snapshotConflictHorizon, vmbits);
 
 		ReleaseBuffer(vmbuffer);
 		FreeFakeRelcacheEntry(reln);
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index 4e65cbcadf..3f0342351f 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -418,6 +418,7 @@ heap_page_prune(Relation relation, Buffer buffer,
 		xl_heap_prune xlrec;
 		XLogRecPtr	recptr;
 
+		xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(relation);
 		xlrec.snapshotConflictHorizon = prstate.snapshotConflictHorizon;
 		xlrec.nredirected = prstate.nredirected;
 		xlrec.ndead = prstate.ndead;
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index ee996b5660..151ad37a54 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -836,6 +836,7 @@ _bt_log_reuse_page(Relation rel, Relation heaprel, BlockNumber blkno,
 	 */
 
 	/* XLOG stuff */
+	xlrec_reuse.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel);
 	xlrec_reuse.locator = rel->rd_locator;
 	xlrec_reuse.block = blkno;
 	xlrec_reuse.snapshotConflictHorizon = safexid;
@@ -1358,6 +1359,7 @@ _bt_delitems_delete(Relation rel, Relation heaprel, Buffer buf,
 		XLogRecPtr	recptr;
 		xl_btree_delete xlrec_delete;
 
+		xlrec_delete.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel);
 		xlrec_delete.snapshotConflictHorizon = snapshotConflictHorizon;
 		xlrec_delete.ndeleted = ndeletable;
 		xlrec_delete.nupdated = nupdatable;
diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c
index 3cff71e720..2f4a4aad24 100644
--- a/src/backend/access/spgist/spgvacuum.c
+++ b/src/backend/access/spgist/spgvacuum.c
@@ -503,6 +503,7 @@ vacuumRedirectAndPlaceholder(Relation index, Relation heaprel, Buffer buffer)
 	spgxlogVacuumRedirect xlrec;
 	GlobalVisState *vistest;
 
+	xlrec.isCatalogRel = RelationIsAccessibleInLogicalDecoding(heaprel);
 	xlrec.nToPlaceholder = 0;
 	xlrec.snapshotConflictHorizon = InvalidTransactionId;
 
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 2ce9366277..93fb9d438a 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -51,11 +51,14 @@ typedef struct gistxlogDelete
 {
 	TransactionId snapshotConflictHorizon;
 	uint16		ntodelete;		/* number of deleted offsets */
+	bool		isCatalogRel;	/* to handle recovery conflict during logical
+								 * decoding on standby */
 
-	/* TODELETE OFFSET NUMBER ARRAY FOLLOWS */
+	/* TODELETE OFFSET NUMBERS */
+	OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
 } gistxlogDelete;
 
-#define SizeOfGistxlogDelete	(offsetof(gistxlogDelete, ntodelete) + sizeof(uint16))
+#define SizeOfGistxlogDelete	offsetof(gistxlogDelete, offsets)
 
 /*
  * Backup Blk 0: If this operation completes a page split, by inserting a
@@ -98,9 +101,11 @@ typedef struct gistxlogPageReuse
 	RelFileLocator locator;
 	BlockNumber block;
 	FullTransactionId snapshotConflictHorizon;
+	bool		isCatalogRel;	/* to handle recovery conflict during logical
+								 * decoding on standby */
 } gistxlogPageReuse;
 
-#define SizeOfGistxlogPageReuse	(offsetof(gistxlogPageReuse, snapshotConflictHorizon) + sizeof(FullTransactionId))
+#define SizeOfGistxlogPageReuse	(offsetof(gistxlogPageReuse, isCatalogRel) + sizeof(bool))
 
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h
index 9894ab9afe..6c5535fe73 100644
--- a/src/include/access/hash_xlog.h
+++ b/src/include/access/hash_xlog.h
@@ -252,12 +252,14 @@ typedef struct xl_hash_vacuum_one_page
 {
 	TransactionId snapshotConflictHorizon;
 	uint16		ntuples;
+	bool		isCatalogRel;	/* to handle recovery conflict during logical
+								 * decoding on standby */
 
-	/* TARGET OFFSET NUMBERS FOLLOW AT THE END */
+	/* TARGET OFFSET NUMBERS */
+	OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
 } xl_hash_vacuum_one_page;
 
-#define SizeOfHashVacuumOnePage \
-	(offsetof(xl_hash_vacuum_one_page, ntuples) + sizeof(uint16))
+#define SizeOfHashVacuumOnePage offsetof(xl_hash_vacuum_one_page, offsets)
 
 extern void hash_redo(XLogReaderState *record);
 extern void hash_desc(StringInfo buf, XLogReaderState *record);
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index 42620bbdc9..08db7e62dd 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -245,10 +245,12 @@ typedef struct xl_heap_prune
 	TransactionId snapshotConflictHorizon;
 	uint16		nredirected;
 	uint16		ndead;
+	bool		isCatalogRel;	/* to handle recovery conflict during logical
+								 * decoding on standby */
 	/* OFFSET NUMBERS are in the block reference 0 */
 } xl_heap_prune;
 
-#define SizeOfHeapPrune (offsetof(xl_heap_prune, ndead) + sizeof(uint16))
+#define SizeOfHeapPrune (offsetof(xl_heap_prune, isCatalogRel) + sizeof(bool))
 
 /*
  * The vacuum page record is similar to the prune record, but can only mark
@@ -344,13 +346,15 @@ typedef struct xl_heap_freeze_page
 {
 	TransactionId snapshotConflictHorizon;
 	uint16		nplans;
+	bool		isCatalogRel;	/* to handle recovery conflict during logical
+								 * decoding on standby */
 
 	/*
 	 * In payload of blk 0 : FREEZE PLANS and OFFSET NUMBER ARRAY
 	 */
 } xl_heap_freeze_page;
 
-#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, nplans) + sizeof(uint16))
+#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, isCatalogRel) + sizeof(bool))
 
 /*
  * This is what we need to know about setting a visibility map bit
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index 7dd67257f2..823c700dee 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -188,9 +188,11 @@ typedef struct xl_btree_reuse_page
 	RelFileLocator locator;
 	BlockNumber block;
 	FullTransactionId snapshotConflictHorizon;
+	bool		isCatalogRel;	/* to handle recovery conflict during logical
+								 * decoding on standby */
 } xl_btree_reuse_page;
 
-#define SizeOfBtreeReusePage (sizeof(xl_btree_reuse_page))
+#define SizeOfBtreeReusePage (offsetof(xl_btree_reuse_page, isCatalogRel) + sizeof(bool))
 
 /*
  * xl_btree_vacuum and xl_btree_delete records describe deletion of index
@@ -235,6 +237,8 @@ typedef struct xl_btree_delete
 	TransactionId snapshotConflictHorizon;
 	uint16		ndeleted;
 	uint16		nupdated;
+	bool		isCatalogRel;	/* to handle recovery conflict during logical
+								 * decoding on standby */
 
 	/*----
 	 * In payload of blk 0 :
@@ -245,7 +249,7 @@ typedef struct xl_btree_delete
 	 */
 } xl_btree_delete;
 
-#define SizeOfBtreeDelete	(offsetof(xl_btree_delete, nupdated) + sizeof(uint16))
+#define SizeOfBtreeDelete	(offsetof(xl_btree_delete, isCatalogRel) + sizeof(bool))
 
 /*
  * The offsets that appear in xl_btree_update metadata are offsets into the
diff --git a/src/include/access/spgxlog.h b/src/include/access/spgxlog.h
index b9d6753533..75267a4914 100644
--- a/src/include/access/spgxlog.h
+++ b/src/include/access/spgxlog.h
@@ -240,6 +240,8 @@ typedef struct spgxlogVacuumRedirect
 	uint16		nToPlaceholder; /* number of redirects to make placeholders */
 	OffsetNumber firstPlaceholder;	/* first placeholder tuple to remove */
 	TransactionId snapshotConflictHorizon;	/* newest XID of removed redirects */
+	bool		isCatalogRel;	/* to handle recovery conflict during logical
+								 * decoding on standby */
 
 	/* offsets of redirect tuples to make placeholders follow */
 	OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER];
diff --git a/src/include/access/visibilitymapdefs.h b/src/include/access/visibilitymapdefs.h
index 9165b9456b..1fe1fe092d 100644
--- a/src/include/access/visibilitymapdefs.h
+++ b/src/include/access/visibilitymapdefs.h
@@ -17,9 +17,18 @@
 #define BITS_PER_HEAPBLOCK 2
 
 /* Flags for bit map */
-#define VISIBILITYMAP_ALL_VISIBLE	0x01
-#define VISIBILITYMAP_ALL_FROZEN	0x02
-#define VISIBILITYMAP_VALID_BITS	0x03	/* OR of all valid visibilitymap
-											 * flags bits */
+#define VISIBILITYMAP_ALL_VISIBLE		0x01
+#define VISIBILITYMAP_ALL_FROZEN		0x02
+#define VISIBILITYMAP_VALID_BITS		0x03	/* OR of all valid visibilitymap
+												 * flags bits */
+/*
+ * To detect recovery conflicts during logical decoding on a standby, we need
+ * to know if a table is a user catalog table. For that we add an additional
+ * bit into xl_heap_visible.flags, in addition to the above.
+ *
+ * NB: VISIBILITYMAP_XLOG_* may not be passed to visibilitymap_set().
+ */
+#define VISIBILITYMAP_XLOG_CATALOG_REL	0x04
+#define VISIBILITYMAP_XLOG_VALID_BITS	(VISIBILITYMAP_VALID_BITS | VISIBILITYMAP_XLOG_CATALOG_REL)
 
 #endif							/* VISIBILITYMAPDEFS_H */
-- 
2.34.1