diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index b66a2b6..c027f68 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -24,17 +24,21 @@
  * since it would get completely confused if someone inquired about a bogus
  * MultiXactId that pointed to an intermediate slot containing an XID.)
  *
- * XLOG interactions: this module generates an XLOG record whenever a new
- * OFFSETs or MEMBERs page is initialized to zeroes, as well as an XLOG record
- * whenever a new MultiXactId is defined. This allows us to completely
- * rebuild the data entered since the last checkpoint during XLOG replay.
- * Because this is possible, we need not follow the normal rule of
- * "write WAL before data"; the only correctness guarantee needed is that
- * we flush and sync all dirty OFFSETs and MEMBERs pages to disk before a
- * checkpoint is considered complete. If a page does make it to disk ahead
- * of corresponding WAL records, it will be forcibly zeroed before use anyway.
- * Therefore, we don't need to mark our pages with LSN information; we have
- * enough synchronization already.
+ * XLOG interactions: this module generates a record whenever a new OFFSETs or
+ * MEMBERs page is initialized to zeroes, as well as an
+ * XLOG_MULTIXACT_CREATE_ID record whenever a new MultiXactId is defined.
+ * This module ignores the WAL rule "write xlog before data," because it
+ * suffices that actions recording a MultiXactId in a heap xmax do follow that
+ * rule. The only way for the MXID to be referenced from any data page is for
+ * heap_lock_tuple() or heap_update() to have put it there, and each generates
+ * an XLOG record that must follow ours. The normal LSN interlock between the
+ * data page and that XLOG record will ensure that our XLOG record reaches
+ * disk first. If the SLRU members/offsets data reaches disk sooner than the
+ * XLOG records, we do not care; after recovery, no xmax will refer to it. On
+ * the flip side, to ensure that all referenced entries _do_ reach disk, this
+ * module's XLOG records completely rebuild the data entered since the last
+ * checkpoint. We flush and sync all dirty OFFSETs and MEMBERs pages to disk
+ * before each checkpoint is considered complete.
  *
  * Like clog.c, and unlike subtrans.c, we have to preserve state across
  * crashes and ensure that MXID and offset numbering increases monotonically
@@ -795,19 +799,7 @@ MultiXactIdCreateFromMembers(int nmembers, MultiXactMember *members)
 	 */
 	multi = GetNewMultiXactId(nmembers, &offset);
 
-	/*
-	 * Make an XLOG entry describing the new MXID.
-	 *
-	 * Note: we need not flush this XLOG entry to disk before proceeding. The
-	 * only way for the MXID to be referenced from any data page is for
-	 * heap_lock_tuple() to have put it there, and heap_lock_tuple() generates
-	 * an XLOG record that must follow ours. The normal LSN interlock between
-	 * the data page and that XLOG record will ensure that our XLOG record
-	 * reaches disk first. If the SLRU members/offsets data reaches disk
-	 * sooner than the XLOG record, we do not care because we'll overwrite it
-	 * with zeroes unless the XLOG record is there too; see notes at top of
-	 * this file.
-	 */
+	/* Make an XLOG entry describing the new MXID. */
 	xlrec.mid = multi;
 	xlrec.moff = offset;
 	xlrec.nmembers = nmembers;
@@ -2037,7 +2029,11 @@ TrimMultiXact(void)
 
 	/*
 	 * Zero out the remainder of the current offsets page. See notes in
-	 * TrimCLOG() for motivation.
+	 * TrimCLOG() for background. Unlike CLOG, some WAL record covers every
+	 * pg_multixact SLRU mutation. Since, also unlike CLOG, we ignore the WAL
+	 * rule "write xlog before data," nextMXact successors may carry obsolete,
+	 * nonzero offset values. Zero those so case 2 of GetMultiXactIdMembers()
+	 * operates normally.
 	 */
 	entryno = MultiXactIdToOffsetEntry(nextMXact);
 	if (entryno != 0)