From fadd72afcf78a55a2cfd32217b317f17a9147962 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 16 May 2023 16:10:48 +0200 Subject: [PATCH] WIP: Improve smgr source code comments --- src/backend/storage/smgr/md.c | 501 ++++++++++++++++---------------- src/backend/storage/smgr/smgr.c | 251 ++++++++-------- src/include/storage/md.h | 8 +- src/include/storage/smgr.h | 8 +- 4 files changed, 382 insertions(+), 386 deletions(-) diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index e982a8dd7f..4115a24b3f 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -154,7 +154,7 @@ _mdfd_open_flags(void) } /* - * mdinit() -- Initialize private state for magnetic disk storage manager. + * mdinit() -- Initialize private state for magnetic disk storage manager. */ void mdinit(void) @@ -165,7 +165,7 @@ mdinit(void) } /* - * mdexists() -- Does the physical file exist? + * mdexists() -- Does the physical file exist? * * Note: this will return true for lingering files, with pending deletions */ @@ -184,7 +184,7 @@ mdexists(SMgrRelation reln, ForkNumber forknum) } /* - * mdcreate() -- Create a new relation on magnetic disk. + * mdcreate() -- Create a new relation on magnetic disk. * * If isRedo is true, it's okay for the relation to exist already. */ @@ -242,7 +242,7 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) } /* - * mdunlink() -- Unlink a relation. + * mdunlink() -- Unlink a relation. * * Note that we're passed a RelFileLocatorBackend --- by the time this is called, * there won't be an SMgrRelation hashtable entry anymore. @@ -447,183 +447,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) } /* - * mdextend() -- Add a block to the specified relation. - * - * The semantics are nearly the same as mdwrite(): write at the - * specified position. However, this is to be used for the case of - * extending a relation (i.e., blocknum is at or beyond the current - * EOF). 
Note that we assume writing a block beyond current EOF - * causes intervening file space to become filled with zeroes. - */ -void -mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - const void *buffer, bool skipFsync) -{ - off_t seekpos; - int nbytes; - MdfdVec *v; - - /* If this build supports direct I/O, the buffer must be I/O aligned. */ - if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) - Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); - - /* This assert is too expensive to have on normally ... */ -#ifdef CHECK_WRITE_VS_EXTEND - Assert(blocknum >= mdnblocks(reln, forknum)); -#endif - - /* - * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any - * more --- we mustn't create a block whose number actually is - * InvalidBlockNumber. (Note that this failure should be unreachable - * because of upstream checks in bufmgr.c.) - */ - if (blocknum == InvalidBlockNumber) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("cannot extend file \"%s\" beyond %u blocks", - relpath(reln->smgr_rlocator, forknum), - InvalidBlockNumber))); - - v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); - - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - - Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); - - if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) - { - if (nbytes < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), - errhint("Check free disk space."))); - /* short write: complain appropriately */ - ereport(ERROR, - (errcode(ERRCODE_DISK_FULL), - errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", - FilePathName(v->mdfd_vfd), - nbytes, BLCKSZ, blocknum), - errhint("Check free disk space."))); - } - - if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); - - Assert(_mdnblocks(reln, 
forknum, v) <= ((BlockNumber) RELSEG_SIZE)); -} - -/* - * mdzeroextend() -- Add new zeroed out blocks to the specified relation. - * - * Similar to mdextend(), except the relation can be extended by multiple - * blocks at once and the added blocks will be filled with zeroes. - */ -void -mdzeroextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync) -{ - MdfdVec *v; - BlockNumber curblocknum = blocknum; - int remblocks = nblocks; - - Assert(nblocks > 0); - - /* This assert is too expensive to have on normally ... */ -#ifdef CHECK_WRITE_VS_EXTEND - Assert(blocknum >= mdnblocks(reln, forknum)); -#endif - - /* - * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any - * more --- we mustn't create a block whose number actually is - * InvalidBlockNumber or larger. - */ - if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("cannot extend file \"%s\" beyond %u blocks", - relpath(reln->smgr_rlocator, forknum), - InvalidBlockNumber))); - - while (remblocks > 0) - { - BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); - off_t seekpos = (off_t) BLCKSZ * segstartblock; - int numblocks; - - if (segstartblock + remblocks > RELSEG_SIZE) - numblocks = RELSEG_SIZE - segstartblock; - else - numblocks = remblocks; - - v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); - - Assert(segstartblock < RELSEG_SIZE); - Assert(segstartblock + numblocks <= RELSEG_SIZE); - - /* - * If available and useful, use posix_fallocate() (via FileAllocate()) - * to extend the relation. That's often more efficient than using - * write(), as it commonly won't cause the kernel to allocate page - * cache space for the extended pages. - * - * However, we don't use FileAllocate() for small extensions, as it - * defeats delayed allocation on some filesystems. Not clear where - * that decision should be made though? 
For now just use a cutoff of - * 8, anything between 4 and 8 worked OK in some local testing. - */ - if (numblocks > 8) - { - int ret; - - ret = FileFallocate(v->mdfd_vfd, - seekpos, (off_t) BLCKSZ * numblocks, - WAIT_EVENT_DATA_FILE_EXTEND); - if (ret != 0) - { - ereport(ERROR, - errcode_for_file_access(), - errmsg("could not extend file \"%s\" with FileFallocate(): %m", - FilePathName(v->mdfd_vfd)), - errhint("Check free disk space.")); - } - } - else - { - int ret; - - /* - * Even if we don't want to use fallocate, we can still extend a - * bit more efficiently than writing each 8kB block individually. - * pg_pwrite_zeros() (via FileZero()) uses - * pg_pwritev_with_retry() to avoid multiple writes or needing a - * zeroed buffer for the whole length of the extension. - */ - ret = FileZero(v->mdfd_vfd, - seekpos, (off_t) BLCKSZ * numblocks, - WAIT_EVENT_DATA_FILE_EXTEND); - if (ret < 0) - ereport(ERROR, - errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), - errhint("Check free disk space.")); - } - - if (!skipFsync && !SmgrIsTemp(reln)) - register_dirty_segment(reln, forknum, v); - - Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); - - remblocks -= numblocks; - curblocknum += numblocks; - } -} - -/* - * mdopenfork() -- Open one fork of the specified relation. + * mdopenfork() -- Open one fork of the specified relation. * * Note we only open the first segment, when there are multiple segments. * @@ -673,7 +497,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) } /* - * mdopen() -- Initialize newly-opened relation. + * mdopen() -- Initialize newly-opened relation. */ void mdopen(SMgrRelation reln) @@ -684,7 +508,7 @@ mdopen(SMgrRelation reln) } /* - * mdclose() -- Close the specified relation, if it isn't closed already. + * mdclose() -- Close the specified relation, if it isn't closed already. 
*/ void mdclose(SMgrRelation reln, ForkNumber forknum) @@ -707,7 +531,7 @@ mdclose(SMgrRelation reln, ForkNumber forknum) } /* - * mdprefetch() -- Initiate asynchronous read of the specified block of a relation + * mdprefetch() -- Initiate asynchronous read of the specified block of a relation */ bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) @@ -734,64 +558,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) } /* - * mdwriteback() -- Tell the kernel to write pages back to storage. - * - * This accepts a range of blocks because flushing several pages at once is - * considerably more efficient than doing so individually. - */ -void -mdwriteback(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, BlockNumber nblocks) -{ - Assert((io_direct_flags & IO_DIRECT_DATA) == 0); - - /* - * Issue flush requests in as few requests as possible; have to split at - * segment boundaries though, since those are actually separate files. - */ - while (nblocks > 0) - { - BlockNumber nflush = nblocks; - off_t seekpos; - MdfdVec *v; - int segnum_start, - segnum_end; - - v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , - EXTENSION_DONT_OPEN); - - /* - * We might be flushing buffers of already removed relations, that's - * ok, just ignore that case. If the segment file wasn't open already - * (ie from a recent mdwrite()), then we don't want to re-open it, to - * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave - * us with a descriptor to a file that is about to be unlinked. 
- */ - if (!v) - return; - - /* compute offset inside the current segment */ - segnum_start = blocknum / RELSEG_SIZE; - - /* compute number of desired writes within the current segment */ - segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE; - if (segnum_start != segnum_end) - nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)); - - Assert(nflush >= 1); - Assert(nflush <= nblocks); - - seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); - - FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); - - nblocks -= nflush; - blocknum += nflush; - } -} - -/* - * mdread() -- Read the specified block from a relation. + * mdread() -- Read the specified block from a relation. */ void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, @@ -856,11 +623,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * mdwrite() -- Write the supplied block at the appropriate location. - * - * This is to be used only for updating already-existing blocks of a - * relation (ie, those before the current EOF). To extend a relation, - * use mdextend(). + * mdwrite() -- Write the supplied block at the appropriate location. */ void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, @@ -924,12 +687,242 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * mdnblocks() -- Get the number of blocks stored in a relation. + * mdextend() -- Add a block to the specified relation. + * + * This is to be used for the case of extending a relation (i.e., blocknum is + * at or beyond the current EOF). Note that writing a block beyond current + * EOF must cause the intervening file space to become filled with zeroes. + * The POSIX file system APIs do that automatically, so we don't need to do + * anything about that. 
+ */ +void +mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync) +{ + off_t seekpos; + int nbytes; + MdfdVec *v; + + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber. (Note that this failure should be unreachable + * because of upstream checks in bufmgr.c.) + */ + if (blocknum == InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rlocator, forknum), + InvalidBlockNumber))); + + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + + if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space."))); + /* short write: complain appropriately */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", + FilePathName(v->mdfd_vfd), + nbytes, BLCKSZ, blocknum), + errhint("Check free disk space."))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); +} + +/* + * mdzeroextend() -- Add new zeroed out blocks to the specified 
relation. + */ +void +mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + Assert(nblocks > 0); + + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. + */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rlocator, forknum), + InvalidBlockNumber))); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + /* + * If available and useful, use posix_fallocate() (via FileFallocate()) + * to extend the relation. That's often more efficient than using + * write(), as it commonly won't cause the kernel to allocate page + * cache space for the extended pages. + * + * However, we don't use FileFallocate() for small extensions, as it + * defeats delayed allocation on some filesystems. Not clear where + * that decision should be made though? For now just use a cutoff of + * 8, anything between 4 and 8 worked OK in some local testing.
+ */ + if (numblocks > 8) + { + int ret; + + ret = FileFallocate(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + if (ret != 0) + { + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\" with FileFallocate(): %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + } + else + { + int ret; + + /* + * Even if we don't want to use fallocate, we can still extend a + * bit more efficiently than writing each 8kB block individually. + * pg_pwrite_zeros() (via FileZero()) uses + * pg_pwritev_with_retry() to avoid multiple writes or needing a + * zeroed buffer for the whole length of the extension. + */ + ret = FileZero(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + + remblocks -= numblocks; + curblocknum += numblocks; + } +} + +/* + * mdwriteback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + + /* + * Issue flush requests in as few requests as possible; have to split at + * segment boundaries though, since those are actually separate files. 
+ */ + while (nblocks > 0) + { + BlockNumber nflush = nblocks; + off_t seekpos; + MdfdVec *v; + int segnum_start, + segnum_end; + + v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , + EXTENSION_DONT_OPEN); + + /* + * We might be flushing buffers of already removed relations, that's + * ok, just ignore that case. If the segment file wasn't open already + * (ie from a recent mdwrite()), then we don't want to re-open it, to + * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave + * us with a descriptor to a file that is about to be unlinked. + */ + if (!v) + return; + + /* compute offset inside the current segment */ + segnum_start = blocknum / RELSEG_SIZE; + + /* compute number of desired writes within the current segment */ + segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE; + if (segnum_start != segnum_end) + nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(nflush >= 1); + Assert(nflush <= nblocks); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); + + nblocks -= nflush; + blocknum += nflush; + } +} + +/* + * mdnblocks() -- Get the number of blocks stored in a relation. * - * Important side effect: all active segments of the relation are opened - * and added to the md_seg_fds array. If this routine has not been - * called, then only segments up to the last one actually touched - * are present in the array. + * Important side effect: all active segments of the relation are opened + * and added to the md_seg_fds array. If this routine has not been + * called, then only segments up to the last one actually touched + * are present in the array. */ BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum) @@ -986,7 +979,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) } /* - * mdtruncate() -- Truncate relation to specified number of blocks. 
+ * mdtruncate() -- Truncate relation to specified number of blocks. */ void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) @@ -1080,7 +1073,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) } /* - * mdimmedsync() -- Immediately sync a relation to stable storage. + * mdimmedsync() -- Immediately sync a relation to stable storage. * * Note that only writes already issued are synced; this routine knows * nothing of dirty buffers that may exist inside the buffer manager. We diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 70d0d570b1..d983a30475 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -30,7 +30,9 @@ /* * This struct of function pointers defines the API between smgr.c and - * any individual storage manager module. Note that smgr subfunctions are + * any individual storage manager module. + * + * Note that smgr subfunctions are * generally expected to report problems via elog(ERROR). 
An exception is * that smgr_unlink should use elog(WARNING), rather than erroring out, * because we normally unlink relations during post-commit/abort cleanup, @@ -49,16 +51,16 @@ typedef struct f_smgr bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo); - void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); - void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); @@ -104,8 +106,7 @@ static void smgrshutdown(int code, Datum arg); /* - * smgrinit(), smgrshutdown() -- Initialize or shut down storage - * managers. + * smgrinit() -- Initialize storage managers. * * Note: smgrinit is called during backend startup (normal or standalone * case), *not* during postmaster start. Therefore, any resources created @@ -142,9 +143,11 @@ smgrshutdown(int code, Datum arg) } /* - * smgropen() -- Return an SMgrRelation object, creating it if need be. + * smgropen() -- Return an SMgrRelation object, creating it if need be. + * + * "backend" is for temporary tables, otherwise InvalidBackendId. * - * This does not attempt to actually open the underlying file. 
+ * This does not attempt to actually open the underlying file. */ SMgrRelation smgropen(RelFileLocator rlocator, BackendId backend) @@ -245,7 +248,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln) } /* - * smgrexists() -- Does the underlying file for a fork exist? + * smgrexists() -- Does the underlying file for a fork exist? */ bool smgrexists(SMgrRelation reln, ForkNumber forknum) @@ -254,7 +257,7 @@ smgrexists(SMgrRelation reln, ForkNumber forknum) } /* - * smgrclose() -- Close and delete an SMgrRelation object. + * smgrclose() -- Close and delete an SMgrRelation object. */ void smgrclose(SMgrRelation reln) @@ -284,9 +287,9 @@ smgrclose(SMgrRelation reln) } /* - * smgrrelease() -- Release all resources used by this object. + * smgrrelease() -- Release all resources used by this object. * - * The object remains valid. + * The object remains valid. */ void smgrrelease(SMgrRelation reln) @@ -299,9 +302,9 @@ smgrrelease(SMgrRelation reln) } /* - * smgrreleaseall() -- Release resources used by all objects. + * smgrreleaseall() -- Release resources used by all objects. * - * This is called for PROCSIGNAL_BARRIER_SMGRRELEASE. + * This is called for PROCSIGNAL_BARRIER_SMGRRELEASE. */ void smgrreleaseall(void) @@ -320,7 +323,7 @@ smgrreleaseall(void) } /* - * smgrcloseall() -- Close all existing SMgrRelation objects. + * smgrcloseall() -- Close all existing SMgrRelation objects. */ void smgrcloseall(void) @@ -339,8 +342,8 @@ smgrcloseall(void) } /* - * smgrcloserellocator() -- Close SMgrRelation object for given RelFileLocator, - * if one exists. + * smgrcloserellocator() -- Close SMgrRelation object for given + * RelFileLocator, if one exists. * * This has the same effects as smgrclose(smgropen(rlocator)), but it avoids * uselessly creating a hashtable entry only to drop it again when no @@ -363,11 +366,13 @@ smgrcloserellocator(RelFileLocatorBackend rlocator) } /* - * smgrcreate() -- Create a new relation. + * smgrcreate() -- Create a new relation. 
* - * Given an already-created (but presumably unused) SMgrRelation, - * cause the underlying disk file or other storage for the fork - * to be created. + * Given an already-created (but presumably unused) SMgrRelation, cause the + * underlying disk file or other storage for the fork to be created. + * + * isRedo is true during recovery. In that case, the underlying storage may + * already exist. */ void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) @@ -376,13 +381,13 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) } /* - * smgrdosyncall() -- Immediately sync all forks of all given relations + * smgrdosyncall() -- Immediately sync all forks of all given relations * - * All forks of all given relations are synced out to the store. + * All forks of all given relations are synced out to the store. * - * This is equivalent to FlushRelationBuffers() for each smgr relation, - * then calling smgrimmedsync() for all forks of each relation, but it's - * significantly quicker so should be preferred when possible. + * This is equivalent to FlushRelationBuffers() for each smgr relation, then + * calling smgrimmedsync() for all forks of each relation, but it's + * significantly quicker so should be preferred when possible. */ void smgrdosyncall(SMgrRelation *rels, int nrels) @@ -411,14 +416,13 @@ smgrdosyncall(SMgrRelation *rels, int nrels) } /* - * smgrdounlinkall() -- Immediately unlink all forks of all given relations + * smgrdounlinkall() -- Immediately unlink all forks of all given relations * - * All forks of all given relations are removed from the store. This - * should not be used during transactional operations, since it can't be - * undone. + * All forks of all given relations are removed from the store. This should + * not be used during transactional operations, since it can't be undone. * - * If isRedo is true, it is okay for the underlying file(s) to be gone - * already. 
+ * If isRedo is true, it is okay for the underlying file(s) to be gone + * already. */ void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) @@ -483,15 +487,64 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) pfree(rlocators); } +/* + * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation. + * + * In recovery only, this can return false to indicate that a file doesn't + * exist (presumably it has been dropped by a later WAL record). + */ +bool +smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); +} + +/* + * smgrread() -- read a particular block from a relation into the supplied + * buffer. + * + * This routine is called from the buffer manager in order to instantiate + * pages in the shared buffer cache. All storage managers return pages in the + * format that POSTGRES expects. + */ +void +smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void *buffer) +{ + smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); +} + +/* + * smgrwrite() -- Write the supplied buffer out. + * + * This is to be used only for updating already-existing blocks of a relation + * (ie, those before the current EOF). To extend a relation, use + * smgrextend(). + * + * This is not a synchronous write -- the block is not necessarily on disk at + * return, only dumped out to the kernel. However, provisions will be made to + * fsync the write before the next checkpoint. + * + * skipFsync indicates that the caller will make other provisions to fsync the + * relation, so we needn't bother. Temporary relations also do not require + * fsync. + */ +void +smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync) +{ + smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, + buffer, skipFsync); +} /* - * smgrextend() -- Add a new block to a file. 
+ * smgrextend() -- Add a new block to a file. * - * The semantics are nearly the same as smgrwrite(): write at the - * specified position. However, this is to be used for the case of - * extending a relation (i.e., blocknum is at or beyond the current - * EOF). Note that we assume writing a block beyond current EOF - * causes intervening file space to become filled with zeroes. + * The semantics are nearly the same as smgrwrite(): write at the specified + * position. However, this is to be used for the case of extending a relation + * (i.e., blocknum is at or beyond the current EOF). Writing a block beyond + * current EOF must cause the intervening file space to become filled with + * zeroes. */ void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, @@ -512,11 +565,13 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * smgrzeroextend() -- Add new zeroed out blocks to a file. + * smgrzeroextend() -- Add new zeroed out blocks to a file. + * + * Extend the relation by the given number of blocks, which will be filled + * with zeroes. This is similar to smgrextend() but only extends and does not + * write a buffer of data. * - * Similar to smgrextend(), except the relation can be extended by - * multiple blocks at once and the added blocks will be filled with - * zeroes. + * FIXME: why both blocknum and nblocks */ void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, @@ -537,60 +592,8 @@ smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation. - * - * In recovery only, this can return false to indicate that a file - * doesn't exist (presumably it has been dropped by a later WAL - * record). 
- */ -bool -smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) -{ - return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); -} - -/* - * smgrread() -- read a particular block from a relation into the supplied - * buffer. - * - * This routine is called from the buffer manager in order to - * instantiate pages in the shared buffer cache. All storage managers - * return pages in the format that POSTGRES expects. - */ -void -smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - void *buffer) -{ - smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); -} - -/* - * smgrwrite() -- Write the supplied buffer out. - * - * This is to be used only for updating already-existing blocks of a - * relation (ie, those before the current EOF). To extend a relation, - * use smgrextend(). - * - * This is not a synchronous write -- the block is not necessarily - * on disk at return, only dumped out to the kernel. However, - * provisions will be made to fsync the write before the next checkpoint. - * - * skipFsync indicates that the caller will make other provisions to - * fsync the relation, so we needn't bother. Temporary relations also - * do not require fsync. - */ -void -smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, - const void *buffer, bool skipFsync) -{ - smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, - buffer, skipFsync); -} - - -/* - * smgrwriteback() -- Trigger kernel writeback for the supplied range of - * blocks. + * smgrwriteback() -- Trigger kernel writeback for the supplied range of + * blocks. */ void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, @@ -601,8 +604,8 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, } /* - * smgrnblocks() -- Calculate the number of blocks in the - * supplied relation. + * smgrnblocks() -- Calculate the number of blocks in the + * supplied relation. 
*/ BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum) @@ -622,8 +625,8 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum) } /* - * smgrnblocks_cached() -- Get the cached number of blocks in the supplied - * relation. + * smgrnblocks_cached() -- Get the cached number of blocks in the supplied + * relation. * * Returns an InvalidBlockNumber when not in recovery and when the relation * fork size is not cached. @@ -642,8 +645,8 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum) } /* - * smgrtruncate() -- Truncate the given forks of supplied relation to - * each specified numbers of blocks + * smgrtruncate() -- Truncate the given forks of supplied relation to + * each specified numbers of blocks * * The truncation is done immediately, so this can't be rolled back. * @@ -694,27 +697,27 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb } /* - * smgrimmedsync() -- Force the specified relation to stable storage. - * - * Synchronously force all previous writes to the specified relation - * down to disk. - * - * This is useful for building completely new relations (eg, new - * indexes). Instead of incrementally WAL-logging the index build - * steps, we can just write completed index pages to disk with smgrwrite - * or smgrextend, and then fsync the completed index file before - * committing the transaction. (This is sufficient for purposes of - * crash recovery, since it effectively duplicates forcing a checkpoint - * for the completed index. But it is *not* sufficient if one wishes - * to use the WAL log for PITR or replication purposes: in that case - * we have to make WAL entries as well.) - * - * The preceding writes should specify skipFsync = true to avoid - * duplicative fsyncs. - * - * Note that you need to do FlushRelationBuffers() first if there is - * any possibility that there are dirty buffers for the relation; - * otherwise the sync is not very meaningful. 
+ * smgrimmedsync() -- Force the specified relation to stable storage. + * + * Synchronously force all previous writes to the specified relation + * down to disk. + * + * This is useful for building completely new relations (eg, new + * indexes). Instead of incrementally WAL-logging the index build + * steps, we can just write completed index pages to disk with smgrwrite + * or smgrextend, and then fsync the completed index file before + * committing the transaction. (This is sufficient for purposes of + * crash recovery, since it effectively duplicates forcing a checkpoint + * for the completed index. But it is *not* sufficient if one wishes + * to use the WAL log for PITR or replication purposes: in that case + * we have to make WAL entries as well.) + * + * The preceding writes should specify skipFsync = true to avoid + * duplicative fsyncs. + * + * Note that you need to do FlushRelationBuffers() first if there is + * any possibility that there are dirty buffers for the relation; + * otherwise the sync is not very meaningful. 
*/ void smgrimmedsync(SMgrRelation reln, ForkNumber forknum) diff --git a/src/include/storage/md.h b/src/include/storage/md.h index 941879ee6a..8af34e4155 100644 --- a/src/include/storage/md.h +++ b/src/include/storage/md.h @@ -26,16 +26,16 @@ extern void mdclose(SMgrRelation reln, ForkNumber forknum); extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern bool mdexists(SMgrRelation reln, ForkNumber forknum); extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo); -extern void mdextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, const void *buffer, bool skipFsync); -extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void mdextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a9a179aaba..896512f1bc 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -90,16 +90,16 @@ extern void smgrreleaseall(void); extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo); extern void smgrdosyncall(SMgrRelation *rels, int nrels); extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo); -extern void smgrextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber 
blocknum, const void *buffer, bool skipFsync); -extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, - BlockNumber blocknum, int nblocks, bool skipFsync); extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum); extern void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, void *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void smgrextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); +extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); -- 2.40.1