*** a/src/backend/postmaster/postmaster.c
--- b/src/backend/postmaster/postmaster.c
***************
*** 312,317 **** extern char *optarg;
--- 312,320 ----
  extern int	optind,
  			opterr;
  
+ extern bool page_checksum;
+ extern bool fullPageWrites;
+ 
  #ifdef HAVE_INT_OPTRESET
  extern int	optreset;			/* might not be declared by system headers */
  #endif
***************
*** 736,741 **** PostmasterMain(int argc, char *argv[])
--- 739,765 ----
  				(errmsg("WAL streaming (max_wal_senders > 0) requires wal_level \"archive\" or \"hot_standby\"")));
  
  	/*
+ 	 * The idea here is that there will be checksum mismatches if there
+ 	 * are partial writes to pages during hardware crashes.  So, the user
+ 	 * should have full_page_writes enabled if page_checksum is enabled,
+ 	 * so that these pages are automatically fixed, else Postgres may
+ 	 * often get checksum errors after crashes (on pages that are in fact
+ 	 * partially written and hence corrupted).  With full_page_writes
+ 	 * enabled, Postgres will replace each page without ever looking at
+ 	 * the partially-written page and seeing an incorrect checksum.
+ 	 * Hence, checksums will detect only real disk corruptions (where the
+ 	 * disk reported a successful write but the data was still corrupted
+ 	 * at some point).
+ 	 *
+ 	 * Alternatively, we may want to leave this check out, for those
+ 	 * sophisticated users who know that their hardware/software setup
+ 	 * can never produce partial writes during crashes.
+ 	 */
+ 	if (page_checksum && !fullPageWrites)
+ 		ereport(ERROR,
+ 				(errmsg("full_page_writes must be enabled if page_checksum is enabled")));
+ 
+ 	/*
  	 * Other one-time internal sanity checks can go here, if they are fast.
  	 * (Put any slow processing further down, after postmaster.pid creation.)
  	 */
*** a/src/backend/storage/smgr/smgr.c
--- b/src/backend/storage/smgr/smgr.c
***************
*** 81,86 **** static const int NSmgr = lengthof(smgrsw);
--- 81,99 ----
   */
  static HTAB *SMgrRelationHash = NULL;
  
+ /*
+  * Page checksumming.
+  *
+  * tempbuf is a private copy of the page being written; the checksum is
+  * computed over that copy and the copy is what gets written out.
+  * INVALID_CKSUM is stored in the page header when page_checksum is off,
+  * and smgrread() skips verification of pages that carry it.
+  */
+ static uint64 tempbuf[BLCKSZ/sizeof(uint64)];
+ extern bool page_checksum;
+ 
+ #define INVALID_CKSUM 0x1b0af034
+ 
  /* local function prototypes */
  static void smgrshutdown(int code, Datum arg);
  static void smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum,
***************
*** 375,380 **** smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum,
--- 388,454 ----
  }
  
  /*
+  * The initial value when computing the checksum for a data page: the sum
+  * of the block number and the fork number.  reln is currently unused.
+  */
+ static inline uint64
+ ChecksumInit(SMgrRelation reln, ForkNumber f, BlockNumber b)
+ {
+ 	return b + f;
+ }
+ 
+ /*
+  * Compute a checksum of buffer (with length len), using initial value
+  * cksum.  We use a relatively simple checksum calculation to avoid
+  * overhead, but could replace with some kind of CRC calculation.
+  *
+  * The loop consumes four uint64 words per iteration, so len must be a
+  * multiple of 4 * sizeof(uint64).  The 64-bit sum is folded (high half
+  * added to low half) and then truncated to 32 bits on return.
+  */
+ static inline uint32
+ ComputeChecksum(uint64 *buffer, uint32 len, uint64 cksum)
+ {
+ 	uint32 i;
+ 
+ 	for (i = 0; i < len/sizeof(uint64); i += 4) {
+ 		cksum += (cksum << 5) + *buffer;
+ 		cksum += (cksum << 5) + *(buffer+1);
+ 		cksum += (cksum << 5) + *(buffer+2);
+ 		cksum += (cksum << 5) + *(buffer+3);
+ 		buffer += 4;
+ 	}
+ 	cksum = (cksum & 0xFFFFFFFF) + (cksum >> 32);
+ 	return (uint32) cksum;
+ }
+ 
+ /*
+  * Copy buffer to dst and compute the checksum during the copy (so that
+  * the checksum is correct for the final contents of dst).
+  *
+  * The arithmetic must stay identical to ComputeChecksum(), which is what
+  * smgrread() uses to verify the value stored here.
+  */
+ static inline uint32
+ CopyAndComputeChecksum(uint64 *dst, volatile uint64 *buffer,
+ 					   uint32 len, uint64 cksum)
+ {
+ 	uint32 i;
+ 
+ 	for (i = 0; i < len/sizeof(uint64); i += 4) {
+ 		cksum += (cksum << 5) + (*dst = *buffer);
+ 		cksum += (cksum << 5) + (*(dst+1) = *(buffer+1));
+ 		cksum += (cksum << 5) + (*(dst+2) = *(buffer+2));
+ 		cksum += (cksum << 5) + (*(dst+3) = *(buffer+3));
+ 		dst += 4;
+ 		buffer += 4;
+ 	}
+ 	cksum = (cksum & 0xFFFFFFFF) + (cksum >> 32);
+ 	return (uint32) cksum;
+ }
+ 
+ 
+ /*
   *	smgrextend() -- Add a new block to a file.
   *
   *		The semantics are nearly the same as smgrwrite(): write at the
***************
*** 387,394 **** void
  smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  		   char *buffer, bool isTemp)
  {
! 	(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum,
! 											   buffer, isTemp);
  }
  
  /*
--- 461,485 ----
  smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  		   char *buffer, bool isTemp)
  {
! 	PageHeader p;
! 	Assert(PageGetPageLayoutVersion(((PageHeader)buffer)) == PG_PAGE_LAYOUT_VERSION ||
! 		   PageIsNew(buffer));
! 	if (page_checksum) {
! 		p = (PageHeader)tempbuf;
! 		((PageHeader)buffer)->cksum = 0;
! 		/*
! 		 * We copy and compute the checksum, and then write out the data
! 		 * from the copy, so that we avoid any problem with hint bits
! 		 * changing after we compute the checksum.
! 		 */
! 		p->cksum = CopyAndComputeChecksum(tempbuf, (uint64 *)buffer, BLCKSZ,
! 										  ChecksumInit(reln, forknum, blocknum));
! 	} else {
! 		p = (PageHeader)buffer;
! 		p->cksum = INVALID_CKSUM;
! 	}
! 	(*(smgrsw[reln->smgr_which].smgr_extend))(reln, forknum, blocknum,
! 											  (char *)p, isTemp);
  }
  
  /*
***************
*** 412,418 **** void
--- 503,538 ----
  smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  		 char *buffer)
  {
+ 	PageHeader p = (PageHeader) buffer;
+ 
  	(*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
+ 	Assert(PageIsNew(p) || PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION);
+ 
+ 	/*
+ 	 * Verify the checksum, unless the page was written with checksumming
+ 	 * disabled (INVALID_CKSUM) or is new.  A new page may never have been
+ 	 * stamped by smgrwrite/smgrextend (e.g. it is still all zeroes), so
+ 	 * it carries no stored checksum to compare against.
+ 	 */
+ 	if (page_checksum && !PageIsNew(p) && p->cksum != INVALID_CKSUM) {
+ 		const uint32 diskCksum = p->cksum;
+ 		uint32 cksum;
+ 
+ 		p->cksum = 0;
+ 		cksum = ComputeChecksum((uint64 *)buffer, BLCKSZ,
+ 								ChecksumInit(reln, forknum, blocknum));
+ 		if (cksum != diskCksum) {
+ 			ereport(PANIC,
+ 					(errcode(ERRCODE_DATA_CORRUPTED),
+ 					 errmsg("checksum mismatch: disk has %#x, should be %#x",
+ 							diskCksum, cksum),
+ 					 errdetail("filename %s, BlockNum %u, block specifier %u/%u/%u/%d/%u",
+ 							   relpath(reln->smgr_rnode, forknum), blocknum,
+ 							   reln->smgr_rnode.spcNode,
+ 							   reln->smgr_rnode.dbNode,
+ 							   reln->smgr_rnode.relNode, forknum, blocknum)));
+ 		}
+ 	}
  }
  
  /*
***************
*** 434,441 **** void
  smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  		  char *buffer, bool isTemp)
  {
! 	(*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum,
! 											  buffer, isTemp);
  }
  
  /*
--- 554,578 ----
  smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  		  char *buffer, bool isTemp)
  {
! 	PageHeader p;
! 
! 	if (page_checksum) {
! 		p = (PageHeader)tempbuf;
! 		((PageHeader)buffer)->cksum = 0;
! 		/*
! 		 * We copy and compute the checksum, and then write out the data
! 		 * from the copy, so that we avoid any problem with hint bits
! 		 * changing after we compute the checksum.
! 		 */
! 		p->cksum = CopyAndComputeChecksum(tempbuf, (uint64 *)buffer, BLCKSZ,
! 										  ChecksumInit(reln, forknum, blocknum));
! 	} else {
! 		p = (PageHeader)buffer;
! 		p->cksum = INVALID_CKSUM;
! 	}
! 	Assert(PageGetPageLayoutVersion(p) == PG_PAGE_LAYOUT_VERSION);
! 	(*(smgrsw[reln->smgr_which].smgr_write))(reln, forknum, blocknum,
! 											 (char *)p, isTemp);
  }
  
  /*
*** a/src/backend/utils/misc/guc.c
--- b/src/backend/utils/misc/guc.c
***************
*** 367,372 **** bool default_with_oids = false;
--- 367,374 ----
  bool		SQL_inheritance = true;
  
  bool		Password_encryption = true;
  
+ bool		page_checksum = true;
+ 
  int			log_min_error_statement = ERROR;
  int			log_min_messages = WARNING;
***************
*** 1270,1275 **** static struct config_bool ConfigureNamesBool[] =
--- 1272,1285 ----
  		false, NULL, NULL
  	},
  
+ 	{
+ 		{"page_checksum", PGC_POSTMASTER, CUSTOM_OPTIONS,
+ 			gettext_noop("enable disk page checksumming"),
+ 			NULL
+ 		},
+ 		&page_checksum, true, NULL, NULL
+ 	},
+ 
  	/* End-of-list marker */
  	{
  		{NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL
*** a/src/backend/utils/misc/postgresql.conf.sample
--- b/src/backend/utils/misc/postgresql.conf.sample
***************
*** 526,528 ****
--- 526,529 ----
  #------------------------------------------------------------------------------
  
  #custom_variable_classes = ''		# list of custom variable class names
+ #page_checksum = on			# (change requires restart)
*** a/src/include/storage/bufpage.h
--- b/src/include/storage/bufpage.h
***************
*** 132,137 **** typedef struct PageHeaderData
--- 132,138 ----
  	LocationIndex pd_special;	/* offset to start of special space */
  	uint16		pd_pagesize_version;
  	TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
+ 	uint32		cksum;			/* page checksum */
  	ItemIdData	pd_linp[1];		/* beginning of line pointer array */
  } PageHeaderData;
  
***************
*** 154,160 **** typedef PageHeaderData *PageHeader;
  										 * tuple? */
  #define PD_ALL_VISIBLE		0x0004		/* all tuples on page are visible to
  										 * everyone */
  
- #define PD_VALID_FLAG_BITS	0x0007		/* OR of all valid pd_flags bits */
  
  /*
--- 155,160 ----
***************
*** 165,172 **** typedef PageHeaderData *PageHeader;
   * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
   *		added the pd_flags field (by stealing some bits from pd_tli),
   *		as well as adding the pd_prune_xid field (which enlarges the header).
   */
! #define PG_PAGE_LAYOUT_VERSION		4
  
  
  /* ----------------------------------------------------------------
--- 165,173 ----
   * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
   *		added the pd_flags field (by stealing some bits from pd_tli),
   *		as well as adding the pd_prune_xid field (which enlarges the header).
+  * Release x.y uses 5; we added checksums to heap/index/fsm files.
   */
! #define PG_PAGE_LAYOUT_VERSION		5
  
  
  /* ----------------------------------------------------------------