diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c
index 0cd1160ceb..229ca8ca3f 100644
--- a/contrib/pg_visibility/pg_visibility.c
+++ b/contrib/pg_visibility/pg_visibility.c
@@ -398,6 +398,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
 	if (BlockNumberIsValid(block))
 	{
 		fork = VISIBILITYMAP_FORKNUM;
+		MarkTruncateBuffers(rel->rd_smgr->smgr_rnode, &fork, 1, &block);
 		smgrtruncate(rel->rd_smgr, &fork, 1, &block);
 	}
 
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index fddfbf1d8c..f859037881 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -275,6 +275,41 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 		}
 	}
 
+	/*
+	 * 1) WAL-log the truncation of the file.
+	 * 2) Discard buffers of the pages to be removed.
+	 * 3) Truncate the pages from the file.
+	 *
+	 * We have seen corruption when there is a crash in the midst of these
+	 * operations; treat them all as critical and raise PANIC on failure to
+	 * avoid discrepancies between the WAL log, the buffer copy and the
+	 * on-disk image.
+	 */
+
+	/*
+	 * It seems logical and apt to put a critical section here, but there are
+	 * two issues.  The minor one: palloc() operations further down are
+	 * restricted inside a critical section.  The major one (as the comment
+	 * below suggests): for a repeatable truncation error, WAL replay would
+	 * never end and the server would be hosed.  The former could be resolved
+	 * by reworking the palloc()s or by using a TRY/CATCH block instead.  The
+	 * counter-argument to the latter is that the lack of a critical section
+	 * is exactly what is pushing us into the scenario we want to avoid by
+	 * omitting it: the missing critical section makes WAL replay hit PANIC
+	 * persistently with invalid-offset-on-the-disk-page ERRORs.
+	 */
+
+	/* START_CRIT_SECTION(); */
+
+	/*
+	 * We are going to truncate (shrink) the file, so the contents of the
+	 * corresponding buffers are useless and need to be discarded; but a
+	 * background task might flush them to disk right after we truncate and
+	 * before we discard them.  Prevent that by marking the buffers as
+	 * BM_IO_IN_PROGRESS (though we don't do any real IO).
+	 */
+	MarkTruncateBuffers(rel->rd_smgr->smgr_rnode, forks, nforks, blocks);
+
 	/*
 	 * We WAL-log the truncation before actually truncating, which means
 	 * trouble if the truncation fails. If we then crash, the WAL replay
@@ -309,13 +344,14 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 		 * with a truncated heap, but the FSM or visibility map would still
 		 * contain entries for the non-existent heap pages.
 		 */
-		if (fsm || vm)
-			XLogFlush(lsn);
+		XLogFlush(lsn);
 	}
 
 	/* Do the real work to truncate relation forks */
 	smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
 
+	/* END_CRIT_SECTION(); */
+
 	/*
 	 * Update upper-level FSM pages to account for the truncation.
 	 * This is important because the just-truncated pages were likely
@@ -689,7 +725,10 @@ smgr_redo(XLogReaderState *record)
 
 		/* Do the real work to truncate relation forks */
 		if (nforks > 0)
+		{
+			MarkTruncateBuffers(reln->smgr_rnode, forks, nforks, blocks);
 			smgrtruncate(reln, forks, nforks, blocks);
+		}
 
 		/*
 		 * Update upper-level FSM pages to account for the truncation.
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e05e2b3456..564f365f7e 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -141,6 +141,10 @@ static bool IsForInput;
 /* local state for LockBufferForCleanup */
 static BufferDesc *PinCountWaitBuf = NULL;
 
+/* buffers marked BM_IO_IN_PROGRESS by MarkTruncateBuffers(), pending drop */
+static BufferDesc **truncatedBufs = NULL;
+static int	numTruncatedBufs = 0;
+
 /*
  * Backend-Private refcount management:
  *
@@ -3986,6 +3990,9 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 void
 AbortBufferIO(void)
 {
+	int			i;
+	uint32		buf_state;
+
 	BufferDesc *buf = InProgressBuf;
 
 	if (buf)
@@ -4031,6 +4038,23 @@ AbortBufferIO(void)
 		}
 		TerminateBufferIO(buf, false, BM_IO_ERROR);
 	}
+
+	/* On error, also release any buffers pinned down for a pending truncate */
+	if (numTruncatedBufs)
+	{
+		Assert(truncatedBufs);
+		for (i = 0; i < numTruncatedBufs; i++)
+		{
+			buf_state = LockBufHdr(truncatedBufs[i]);
+			Assert(buf_state & BM_IO_IN_PROGRESS);
+			buf_state &= ~BM_IO_IN_PROGRESS;
+			UnlockBufHdr(truncatedBufs[i], buf_state);
+		}
+
+		pfree(truncatedBufs);
+		numTruncatedBufs = 0;
+		truncatedBufs = NULL;
+	}
 }
 
 /*
@@ -4370,3 +4394,142 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
 				(errcode(ERRCODE_SNAPSHOT_TOO_OLD),
 				 errmsg("snapshot too old")));
 }
+
+/*
+ * DropTruncatedBuffers
+ *
+ * Remove from the buffer pool all pages in the saved list of buffers
+ * that were marked BM_IO_IN_PROGRESS just before the truncation.
+ * Dirty pages are simply dropped, without bothering to write them out
+ * first.
+ *
+ * Currently, this is called only from smgr.c smgrtruncate(), where the
+ * underlying file was just truncated.  It is the responsibility of
+ * MarkTruncateBuffers() to ensure that the list of truncated buffers is
+ * sane and points to the right set of buffers.  It is also the
+ * responsibility of higher-level code to ensure that no other process
+ * could be trying to load more pages of the relation into buffers.
+ */
+void
+DropTruncatedBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+					 int nforks, BlockNumber *firstDelBlock)
+{
+	int			i;
+	int			j;
+
+	/* If it's a local relation, it's localbuf.c's problem. */
+	if (RelFileNodeBackendIsTemp(rnode))
+	{
+		if (rnode.backend == MyBackendId)
+		{
+			for (j = 0; j < nforks; j++)
+				DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
+											firstDelBlock[j]);
+		}
+		return;
+	}
+
+	if (!numTruncatedBufs)
+		return;					/* nothing to clean up */
+
+	for (i = 0; i < numTruncatedBufs; i++)
+	{
+		uint32		buf_state;
+
+		buf_state = LockBufHdr(truncatedBufs[i]);
+		Assert(buf_state & BM_IO_IN_PROGRESS);
+		InvalidateBuffer(truncatedBufs[i]);		/* releases spinlock */
+	}
+
+	pfree(truncatedBufs);
+	numTruncatedBufs = 0;
+	truncatedBufs = NULL;
+}
+
+/*
+ * MarkTruncateBuffers
+ *
+ * Find the buffers (pages) of an about-to-be-truncated file and prevent
+ * them from being written to disk by marking them BM_IO_IN_PROGRESS.
+ * Though the buffers are marked for IO, they will never be written to
+ * disk; the flag merely stops other processes from doing IO on them.
+ *
+ * Note: buffers might stay marked BM_IO_IN_PROGRESS longer than usual
+ * because of the extra code path exercised before we drop them, but
+ * there shouldn't be any regular backends waiting on these pages (they
+ * are empty and in the process of being discarded).
+ */
+void
+MarkTruncateBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+					int nforks, BlockNumber *firstDelBlock)
+{
+	int			i;
+	int			j;
+
+	/* XXX: do we care about local buffers, i.e. temp relations? */
+
+	for (i = 0; i < NBuffers; i++)
+	{
+		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		buf_state;
+
+		/*
+		 * Since we take AccessExclusiveLock on the relation during truncate,
+		 * it's safe to check without the lock, and it saves a lot of lock
+		 * acquisitions in typical cases.
+		 */
+		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+			continue;
+
+		buf_state = LockBufHdr(bufHdr);
+
+recheck:
+		for (j = 0; j < nforks; j++)
+		{
+			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+				bufHdr->tag.forkNum == forkNum[j] &&
+				bufHdr->tag.blockNum >= firstDelBlock[j])
+				break;
+		}
+
+		if (j == nforks)
+		{
+			/* no fork matches; leave this buffer alone */
+			UnlockBufHdr(bufHdr, buf_state);
+			continue;
+		}
+
+		if (buf_state & BM_IO_IN_PROGRESS)
+		{
+			/*
+			 * Wait for the in-flight IO to finish, then recheck the tag:
+			 * the buffer may have been reused for another page while we
+			 * were waiting.
+			 */
+			UnlockBufHdr(bufHdr, buf_state);
+			WaitIO(bufHdr);
+			buf_state = LockBufHdr(bufHdr);
+			goto recheck;
+		}
+
+		/*
+		 * The easy path would be to allocate NBuffers entries, but that
+		 * might lead to wastage; start with 100 and grow in increments
+		 * of 100.
+		 */
+		if (truncatedBufs == NULL)
+			truncatedBufs = (BufferDesc **)
+				palloc0(100 * sizeof(BufferDesc *));
+		else if ((numTruncatedBufs % 100) == 0)
+			truncatedBufs = (BufferDesc **)
+				repalloc(truncatedBufs,
+						 (numTruncatedBufs + 100) * sizeof(BufferDesc *));
+
+		/*
+		 * Add to the list; this will be used either to clean up (on error)
+		 * or to invalidate the buffers after the truncate.  Mark the buffer
+		 * and release the header lock exactly once per buffer.
+		 */
+		truncatedBufs[numTruncatedBufs++] = bufHdr;
+		buf_state |= BM_IO_IN_PROGRESS;
+		UnlockBufHdr(bufHdr, buf_state);
+	}
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 360b5bf5bf..d1f066a220 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -572,12 +572,6 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 {
 	int			i;
 
-	/*
-	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
-	 * just drop them without bothering to write the contents.
-	 */
-	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
-
 	/*
 	 * Send a shared-inval message to force other backends to close any smgr
 	 * references they may have for this rel.  This is useful because they
@@ -608,6 +602,13 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 		if (forknum[i] == VISIBILITYMAP_FORKNUM)
 			reln->smgr_vm_nblocks = nblocks[i];
 	}
+
+	/*
+	 * Now that we have successfully truncated the table (shrunk the file),
+	 * get rid of any buffers for the just-deleted blocks.  Bufmgr will
+	 * just drop them without bothering to write the contents.
+	 */
+	DropTruncatedBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
 }
 
 /*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index d2a5b52f6e..9889193bd9 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -189,6 +189,10 @@ extern void FlushRelationBuffers(Relation rel);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
 								   int nforks, BlockNumber *firstDelBlock);
+extern void DropTruncatedBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+								 int nforks, BlockNumber *firstDelBlock);
+extern void MarkTruncateBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+								int nforks, BlockNumber *firstDelBlock);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);