diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c index 9f735a2c07..fe7285f7a2 100644 --- a/src/backend/replication/basebackup.c +++ b/src/backend/replication/basebackup.c @@ -1258,6 +1258,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf int i; pgoff_t len = 0; char page[BLCKSZ]; + bool block_retry = false; size_t pad; PageHeader phdr; int segmentno = 0; @@ -1341,6 +1342,50 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf phdr = (PageHeader) page; if (phdr->pd_checksum != checksum) { + /* + * Retry the block on the first failure. It's possible + * that we read the first 4K page of the block just + * before postgres updated the entire block so it ends + * up looking torn to us. We only need to retry once + * because the LSN should be updated to something we can + * ignore on the next pass. If the error happens again + * then it is a true validation failure. + */ + if (block_retry == false) + { + /* Reread the failed block */ + if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fseek in file \"%s\": %m", + readfilename))); + } + + if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not reread block %d of file \"%s\": %m", + blkno, readfilename))); + } + + if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1) + { + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not fseek in file \"%s\": %m", + readfilename))); + } + + /* Set flag so we know a retry was attempted */ + block_retry = true; + + /* Reset loop to validate the block again */ + i--; + continue; + } + ereport(WARNING, (errmsg("checksum verification failed in file " "\"%s\", block %d: calculated %X but " @@ -1350,6 +1395,7 @@ sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf checksum_failure = true; } } + block_retry = false; blkno++; } }