From 5a05a3751ae278ba8eb7b79f50a4f7b652a1179c Mon Sep 17 00:00:00 2001 From: Michael Paquier Date: Wed, 23 Jul 2014 23:11:10 +0900 Subject: [PATCH] Add facility to check FPW consistency at WAL replay The consistency check on FPW is made of two phases each time a FPW is restored in place of an existing page: - Apply a mask on the FPW and the current page to eliminate potential conflicts like hint bits for example. - Check that the FPW is consistent with the current page, aka the current page does not contain any new information that the FPW taken has not. This is done by checking the masked portions of the FPW and the current page. - If an inconsistency is found, a WARNING is simply logged. This check is done only if the current page is not empty and if the database has reached a consistent state. --- src/backend/access/transam/xlog.c | 104 ++++++++++++ src/backend/commands/sequence.c | 34 ++-- src/backend/storage/buffer/Makefile | 2 +- src/backend/storage/buffer/bufmask.c | 301 +++++++++++++++++++++++++++++++++++ src/include/commands/sequence.h | 13 ++ src/include/storage/bufmask.h | 21 +++ 6 files changed, 452 insertions(+), 23 deletions(-) create mode 100644 src/backend/storage/buffer/bufmask.c create mode 100644 src/include/storage/bufmask.h diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e4f9595..a383126 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -47,6 +47,7 @@ #include "replication/walsender.h" #include "storage/barrier.h" #include "storage/bufmgr.h" +#include "storage/bufmask.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/large_object.h" @@ -4065,6 +4066,109 @@ RestoreBackupBlockContents(XLogRecPtr lsn, BkpBlock bkpb, char *blk, page = (Page) BufferGetPage(buffer); + /* + * Before saving the new contents of the backup block ensure that + * it is consistent with the existing one. 
Apply masking to it and + then perform a comparison with what is going to be added. Do + only this operation once a consistent state has been reached by + the server and only if the page that is being rewritten is not empty + to have a clean base for comparison. + */ + if (reachedConsistency && + !PageIsEmpty(page)) + { + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + char *norm_new_page, *norm_old_page; + char *blk_data = (char *) palloc(BLCKSZ); + char old_buf[BLCKSZ * 2]; + char new_buf[BLCKSZ * 2]; + int j = 0; + int i; + bool inconsistent = false; + + /* + * Block data needs to be reformatted correctly, especially if + * it has a hole. This is the same processing as below but + * replicating this code saves memcpy calls if consistency + * checks cannot be done on this backup block. + */ + if (bkpb.hole_length == 0) + { + memcpy(blk_data, blk, BLCKSZ); + } + else + { + memcpy(blk_data, blk, bkpb.hole_offset); + /* must zero-fill the hole */ + MemSet(blk_data + bkpb.hole_offset, 0, bkpb.hole_length); + memcpy(blk_data + (bkpb.hole_offset + bkpb.hole_length), + blk + bkpb.hole_offset, + BLCKSZ - (bkpb.hole_offset + bkpb.hole_length)); + } + + /* Mask pages */ + BufferGetTag(buffer, &rnode, &forknum, &blkno); + norm_new_page = mask_page(blk_data, blkno); + norm_old_page = mask_page((char *) page, blkno); + + /* + * Convert the pages to be compared into hex format to facilitate + * their comparison and make potential diffs more readable while + * debugging. + */ + for (i = 0; i < BLCKSZ; i++) + { + const char *digits = "0123456789ABCDEF"; + uint8 byte_new = (uint8) norm_new_page[i]; + uint8 byte_old = (uint8) norm_old_page[i]; + + new_buf[j] = digits[byte_new >> 4]; + old_buf[j] = digits[byte_old >> 4]; + /* + * Do an inclusive comparison: if the new buffer has a mask + * marker where the old buffer does not, the pages are + * inconsistent, as this would mean that the old page has + * content that the new buffer has not. 
+ */ + if (new_buf[j] == 'F' && old_buf[j] != 'F') + { + inconsistent = true; + break; + } + j++; + new_buf[j] = digits[byte_new & 0x0F]; + old_buf[j] = digits[byte_old & 0x0F]; + if (new_buf[j] == 'F' && old_buf[j] != 'F') + { + inconsistent = true; + break; + } + j++; + } + + /* Time to compare the old and new contents */ + if (inconsistent) + elog(WARNING, + "Inconsistent pages found for record %X/%X, rel %u/%u/%u, " + "forknum %u, blkno %u", + (uint32) (lsn >> 32), (uint32) lsn, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + else + elog(DEBUG1, + "Consistent pages found for record %X/%X, rel %u/%u/%u, " + "forknum %u, blkno %u", + (uint32) (lsn >> 32), (uint32) lsn, + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + + pfree(blk_data); + pfree(norm_new_page); + pfree(norm_old_page); + } + if (bkpb.hole_length == 0) { memcpy((char *) page, blk, BLCKSZ); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index e608420..802aac7 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -46,16 +46,6 @@ #define SEQ_LOG_VALS 32 /* - * The "special area" of a sequence's buffer page looks like this. - */ -#define SEQ_MAGIC 0x1717 - -typedef struct sequence_magic -{ - uint32 magic; -} sequence_magic; - -/* * We store a SeqTable item for every sequence we have touched in the current * session. This is needed to hold onto nextval/currval state. 
(We can't * rely on the relcache, since it's only, well, a cache, and may decide to @@ -306,7 +296,7 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) { Buffer buf; Page page; - sequence_magic *sm; + SequencePageOpaqueData *sm; OffsetNumber offnum; /* Initialize first page of relation with special magic number */ @@ -316,9 +306,9 @@ fill_seq_with_data(Relation rel, HeapTuple tuple) page = BufferGetPage(buf); - PageInit(page, BufferGetPageSize(buf), sizeof(sequence_magic)); - sm = (sequence_magic *) PageGetSpecialPointer(page); - sm->magic = SEQ_MAGIC; + PageInit(page, BufferGetPageSize(buf), sizeof(SequencePageOpaqueData)); + sm = (SequencePageOpaqueData *) PageGetSpecialPointer(page); + sm->seq_page_id = SEQ_MAGIC; /* Now insert sequence tuple */ @@ -1066,18 +1056,18 @@ read_seq_tuple(SeqTable elm, Relation rel, Buffer *buf, HeapTuple seqtuple) { Page page; ItemId lp; - sequence_magic *sm; + SequencePageOpaqueData *sm; Form_pg_sequence seq; *buf = ReadBuffer(rel, 0); LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE); page = BufferGetPage(*buf); - sm = (sequence_magic *) PageGetSpecialPointer(page); + sm = (SequencePageOpaqueData *) PageGetSpecialPointer(page); - if (sm->magic != SEQ_MAGIC) + if (sm->seq_page_id != SEQ_MAGIC) elog(ERROR, "bad magic number in sequence \"%s\": %08X", - RelationGetRelationName(rel), sm->magic); + RelationGetRelationName(rel), sm->seq_page_id); lp = PageGetItemId(page, FirstOffsetNumber); Assert(ItemIdIsNormal(lp)); @@ -1541,7 +1531,7 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record) char *item; Size itemsz; xl_seq_rec *xlrec = (xl_seq_rec *) XLogRecGetData(record); - sequence_magic *sm; + SequencePageOpaqueData *sm; /* Backup blocks are not used in seq records */ Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK)); @@ -1564,9 +1554,9 @@ seq_redo(XLogRecPtr lsn, XLogRecord *record) */ localpage = (Page) palloc(BufferGetPageSize(buffer)); - PageInit(localpage, BufferGetPageSize(buffer), sizeof(sequence_magic)); - sm = (sequence_magic *) 
PageGetSpecialPointer(localpage); - sm->magic = SEQ_MAGIC; + PageInit(localpage, BufferGetPageSize(buffer), sizeof(SequencePageOpaqueData)); + sm = (SequencePageOpaqueData *) PageGetSpecialPointer(localpage); + sm->seq_page_id = SEQ_MAGIC; item = (char *) xlrec + sizeof(xl_seq_rec); itemsz = record->xl_len - sizeof(xl_seq_rec); diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile index 2c10fba..8630dca 100644 --- a/src/backend/storage/buffer/Makefile +++ b/src/backend/storage/buffer/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o +OBJS = buf_table.o buf_init.o bufmask.o bufmgr.o freelist.o localbuf.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/bufmask.c b/src/backend/storage/buffer/bufmask.c new file mode 100644 index 0000000..7e0077c --- /dev/null +++ b/src/backend/storage/buffer/bufmask.c @@ -0,0 +1,301 @@ +/*------------------------------------------------------------------------- + * + * bufmask.c + * Routines for buffer masking, used to ensure that buffers used for + * comparison across nodes are in a consistent state. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * Most pages cannot be compared directly, because some parts of the + * page are not expected to be byte-by-byte identical. For example, + * hint bits or unused space in the page. The strategy is to normalize + * all pages by creating a mask of those bits that are not expected to + * match. 
+ * + * IDENTIFICATION + * src/backend/storage/buffer/bufmask.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/nbtree.h" +#include "access/gist.h" +#include "access/gin_private.h" +#include "access/htup_details.h" +#include "access/spgist_private.h" +#include "commands/sequence.h" +#include "storage/bufmask.h" +#include "storage/bufmgr.h" + +/* Marker used to mask pages consistently */ +#define MASK_MARKER 0xFF + +static void mask_unused_space(Page page); +static void mask_heap_page(Page page); +static void mask_spgist_page(Page page); +static void mask_gist_page(Page page); +static void mask_gin_page(Page page, BlockNumber blkno); +static void mask_sequence_page(Page page); +static void mask_btree_page(Page page); + +/* + * Mask the unused space of a page between pd_lower and pd_upper. + */ +static void +mask_unused_space(Page page) +{ + int pd_lower = ((PageHeader) page)->pd_lower; + int pd_upper = ((PageHeader) page)->pd_upper; + int pd_special = ((PageHeader) page)->pd_special; + + /* Sanity check */ + if (pd_lower > pd_upper || pd_special < pd_upper || + pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + { + elog(ERROR, "invalid page at %X/%08X", + ((PageHeader) page)->pd_lsn.xlogid, + ((PageHeader) page)->pd_lsn.xrecoff); + } + + memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower); +} + +/* + * Mask a heap page + */ +static void +mask_heap_page(Page page) +{ + OffsetNumber off; + PageHeader phdr = (PageHeader) page; + + mask_unused_space(page); + + /* Ignore prune_xid (it's like a hint-bit) */ + phdr->pd_prune_xid = 0xFFFFFFFF; + + /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints */ + phdr->pd_flags |= PD_PAGE_FULL | PD_HAS_FREE_LINES; + + /* + * Also mask the all-visible flag. + * + * XXX: It is unfortunate that we have to do this. If the flag is set + * incorrectly, that's serious, and we would like to catch it. 
If the flag + * is cleared incorrectly, that's serious too. But redo of HEAP_CLEAN + * records don't currently set the flag, even though it is set in the + * master, so we must silence failures that that causes. + */ + phdr->pd_flags |= PD_ALL_VISIBLE; + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + /* + * Ignore hint bits and command ID. + */ + if (ItemIdIsNormal(iid)) + { + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + page_htup->t_infomask = + HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | + HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID; + page_htup->t_infomask |= HEAP_COMBOCID; + page_htup->t_choice.t_heap.t_field3.t_cid = 0xFFFFFFFF; + } + + /* + * Ignore any padding bytes after the tuple, when the length of + * the item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} + +/* + * Mask a SpGist page + */ +static void +mask_spgist_page(Page page) +{ + mask_unused_space(page); +} + +/* + * Mask a GIST page + */ +static void +mask_gist_page(Page page) +{ + mask_unused_space(page); +} + +/* + * Mask a Gin page + */ +static void +mask_gin_page(Page page, BlockNumber blkno) +{ + /* GIN metapage doesn't use pd_lower/pd_upper. Other page types do. */ + if (blkno != 0) + mask_unused_space(page); +} + +/* + * Mask a sequence page + */ +static void +mask_sequence_page(Page page) +{ + /* + * FIXME: currently, we just ignore sequence records altogether. nextval + * records a different value in the WAL record than it writes to the + * buffer. Ideally we would only mask out the value in the tuple. 
+ */ + memset(page, MASK_MARKER, BLCKSZ); +} + +/* + * Mask a btree page + */ +static void +mask_btree_page(Page page) +{ + OffsetNumber off; + OffsetNumber maxoff; + BTPageOpaque maskopaq = (BTPageOpaque) + (((char *) page) + ((PageHeader) page)->pd_special); + + /* + * Mark unused space before any processing. This is important as it + * uses pd_lower and pd_upper that may be masked on this page + * afterwards if it is a deleted page. + */ + mask_unused_space(page); + + /* + * Mask everything on a DELETED page. + */ + if (((BTPageOpaque) PageGetSpecialPointer(page))->btpo_flags & BTP_DELETED) + { + /* Page content, between standard page header and opaque struct */ + memset(page + SizeOfPageHeaderData, MASK_MARKER, + BLCKSZ - MAXALIGN(sizeof(BTPageOpaqueData))); + + /* pd_lower and upper */ + memset(&((PageHeader) page)->pd_lower, MASK_MARKER, sizeof(uint16)); + memset(&((PageHeader) page)->pd_upper, MASK_MARKER, sizeof(uint16)); + } + else + { + /* + * Mask some line pointer bits, particularly those marked as + * used on a master and unused on a standby. + * XXX: This could be refined. + */ + maxoff = PageGetMaxOffsetNumber(page); + for (off = 1; off <= maxoff; off++) + { + ItemId iid = PageGetItemId(page, off); + + if (ItemIdIsUsed(iid)) + iid->lp_flags = LP_UNUSED; + } + } + + /* + * Mask BTP_HAS_GARBAGE flag. This needs to be done at the end + * of process as previous masking operations could generate some + * garbage. + */ + maskopaq->btpo_flags |= BTP_HAS_GARBAGE; +} + +/* + * mask_page + * + * Mask a given page. First try to find what kind of page it is + * and then normalize it. This function returns a normalized page + * palloc'ed. So caller should free the normalized page correctly when + * using this function. Tracking blkno is needed for gin pages as their + * metapage does not use pd_lower and pd_upper. + * Before calling this function, it is assumed that caller has already + * taken a proper lock on the page being masked. 
+ */ +char * +mask_page(const char *page, BlockNumber blkno) +{ + Page page_norm; + uint16 tail; + + page_norm = (Page) palloc(BLCKSZ); + memcpy(page_norm, page, BLCKSZ); + + /* + * Look at the size of the special area, and the last two bytes in + * it, to detect what kind of a page it is. Then call the appropriate + * masking function. + */ + memcpy(&tail, &page[BLCKSZ - 2], 2); + if (PageGetSpecialSize(page) == 0) + { + /* Case of a normal relation, it has an empty special area */ + mask_heap_page(page_norm); + } + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(GISTPageOpaqueData)) && + tail == GIST_PAGE_ID) + { + /* Gist page */ + mask_gist_page(page_norm); + } + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(BTPageOpaqueData)) && + tail <= MAX_BT_CYCLE_ID) + { + /* btree page */ + mask_btree_page(page_norm); + } + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(SpGistPageOpaqueData)) && + tail == SPGIST_PAGE_ID) + { + /* SpGist page */ + mask_spgist_page(page_norm); + } + else if (PageGetSpecialSize(page) == MAXALIGN(sizeof(GinPageOpaqueData)) || + PageGetSpecialSize(page) == MAXALIGN(sizeof(SequencePageOpaqueData))) + { + /* + * The page found here is used either for a Gin index or a sequence. + * Gin index pages do not have a proper identifier, so check if the page + * is used by a sequence or not. If it is not the case, this page is used + * by a gin index. It is still possible that a gin page contains an area + * with exactly the same value as SEQ_MAGIC, but this is unlikely to happen. 
*/ + if (((SequencePageOpaqueData *) PageGetSpecialPointer(page))->seq_page_id == SEQ_MAGIC) + mask_sequence_page(page_norm); + else + mask_gin_page(page_norm, blkno); + } + else + { + /* Should not come here */ + Assert(0); + } + + /* Return normalized page */ + return (char *) page_norm; +} diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 8819c00..2455878 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -18,6 +18,19 @@ #include "nodes/parsenodes.h" #include "storage/relfilenode.h" +/* + * Page opaque data in a sequence page + */ +typedef struct SequencePageOpaqueData +{ + uint32 seq_page_id; +} SequencePageOpaqueData; + +/* + * This page ID is for the convenience of being able to identify if a page + * is being used by a sequence. + */ +#define SEQ_MAGIC 0x1717 typedef struct FormData_pg_sequence { diff --git a/src/include/storage/bufmask.h b/src/include/storage/bufmask.h new file mode 100644 index 0000000..1dd5a67 --- /dev/null +++ b/src/include/storage/bufmask.h @@ -0,0 +1,21 @@ +/*------------------------------------------------------------------------- + * + * bufmask.h + * Buffer masking definitions. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/bufmask.h + */ + +#ifndef BUFMASK_H +#define BUFMASK_H + +#include "postgres.h" +#include "storage/block.h" + +/* Entry point for page masking */ +extern char *mask_page(const char *page, BlockNumber blkno); + +#endif -- 2.0.2