From fadd72afcf78a55a2cfd32217b317f17a9147962 Mon Sep 17 00:00:00 2001
From: Peter Eisentraut <peter@eisentraut.org>
Date: Tue, 16 May 2023 16:10:48 +0200
Subject: [PATCH] WIP: Improve smgr source code comments

---
 src/backend/storage/smgr/md.c   | 501 ++++++++++++++++----------------
 src/backend/storage/smgr/smgr.c | 251 ++++++++--------
 src/include/storage/md.h        |   8 +-
 src/include/storage/smgr.h      |   8 +-
 4 files changed, 382 insertions(+), 386 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index e982a8dd7f..4115a24b3f 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -154,7 +154,7 @@ _mdfd_open_flags(void)
 }
 
 /*
- *	mdinit() -- Initialize private state for magnetic disk storage manager.
+ * mdinit() -- Initialize private state for magnetic disk storage manager.
  */
 void
 mdinit(void)
@@ -165,7 +165,7 @@ mdinit(void)
 }
 
 /*
- *	mdexists() -- Does the physical file exist?
+ * mdexists() -- Does the physical file exist?
  *
  * Note: this will return true for lingering files, with pending deletions
  */
@@ -184,7 +184,7 @@ mdexists(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
- *	mdcreate() -- Create a new relation on magnetic disk.
+ * mdcreate() -- Create a new relation on magnetic disk.
  *
  * If isRedo is true, it's okay for the relation to exist already.
  */
@@ -242,7 +242,7 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 }
 
 /*
- *	mdunlink() -- Unlink a relation.
+ * mdunlink() -- Unlink a relation.
  *
  * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
  * there won't be an SMgrRelation hashtable entry anymore.
@@ -447,183 +447,7 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
 }
 
 /*
- *	mdextend() -- Add a block to the specified relation.
- *
- *		The semantics are nearly the same as mdwrite(): write at the
- *		specified position.  However, this is to be used for the case of
- *		extending a relation (i.e., blocknum is at or beyond the current
- *		EOF).  Note that we assume writing a block beyond current EOF
- *		causes intervening file space to become filled with zeroes.
- */
-void
-mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		 const void *buffer, bool skipFsync)
-{
-	off_t		seekpos;
-	int			nbytes;
-	MdfdVec    *v;
-
-	/* If this build supports direct I/O, the buffer must be I/O aligned. */
-	if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
-		Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
-
-	/* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
-	Assert(blocknum >= mdnblocks(reln, forknum));
-#endif
-
-	/*
-	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
-	 * more --- we mustn't create a block whose number actually is
-	 * InvalidBlockNumber.  (Note that this failure should be unreachable
-	 * because of upstream checks in bufmgr.c.)
-	 */
-	if (blocknum == InvalidBlockNumber)
-		ereport(ERROR,
-				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("cannot extend file \"%s\" beyond %u blocks",
-						relpath(reln->smgr_rlocator, forknum),
-						InvalidBlockNumber)));
-
-	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
-
-	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
-
-	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
-
-	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
-	{
-		if (nbytes < 0)
-			ereport(ERROR,
-					(errcode_for_file_access(),
-					 errmsg("could not extend file \"%s\": %m",
-							FilePathName(v->mdfd_vfd)),
-					 errhint("Check free disk space.")));
-		/* short write: complain appropriately */
-		ereport(ERROR,
-				(errcode(ERRCODE_DISK_FULL),
-				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
-						FilePathName(v->mdfd_vfd),
-						nbytes, BLCKSZ, blocknum),
-				 errhint("Check free disk space.")));
-	}
-
-	if (!skipFsync && !SmgrIsTemp(reln))
-		register_dirty_segment(reln, forknum, v);
-
-	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
-}
-
-/*
- *	mdzeroextend() -- Add new zeroed out blocks to the specified relation.
- *
- *		Similar to mdextend(), except the relation can be extended by multiple
- *		blocks at once and the added blocks will be filled with zeroes.
- */
-void
-mdzeroextend(SMgrRelation reln, ForkNumber forknum,
-			 BlockNumber blocknum, int nblocks, bool skipFsync)
-{
-	MdfdVec    *v;
-	BlockNumber curblocknum = blocknum;
-	int			remblocks = nblocks;
-
-	Assert(nblocks > 0);
-
-	/* This assert is too expensive to have on normally ... */
-#ifdef CHECK_WRITE_VS_EXTEND
-	Assert(blocknum >= mdnblocks(reln, forknum));
-#endif
-
-	/*
-	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
-	 * more --- we mustn't create a block whose number actually is
-	 * InvalidBlockNumber or larger.
-	 */
-	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
-		ereport(ERROR,
-				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("cannot extend file \"%s\" beyond %u blocks",
-						relpath(reln->smgr_rlocator, forknum),
-						InvalidBlockNumber)));
-
-	while (remblocks > 0)
-	{
-		BlockNumber	segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
-		off_t		seekpos = (off_t) BLCKSZ * segstartblock;
-		int			numblocks;
-
-		if (segstartblock + remblocks > RELSEG_SIZE)
-			numblocks = RELSEG_SIZE - segstartblock;
-		else
-			numblocks = remblocks;
-
-		v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
-
-		Assert(segstartblock < RELSEG_SIZE);
-		Assert(segstartblock + numblocks <= RELSEG_SIZE);
-
-		/*
-		 * If available and useful, use posix_fallocate() (via FileAllocate())
-		 * to extend the relation. That's often more efficient than using
-		 * write(), as it commonly won't cause the kernel to allocate page
-		 * cache space for the extended pages.
-		 *
-		 * However, we don't use FileAllocate() for small extensions, as it
-		 * defeats delayed allocation on some filesystems. Not clear where
-		 * that decision should be made though? For now just use a cutoff of
-		 * 8, anything between 4 and 8 worked OK in some local testing.
-		 */
-		if (numblocks > 8)
-		{
-			int			ret;
-
-			ret = FileFallocate(v->mdfd_vfd,
-								seekpos, (off_t) BLCKSZ * numblocks,
-								WAIT_EVENT_DATA_FILE_EXTEND);
-			if (ret != 0)
-			{
-				ereport(ERROR,
-						errcode_for_file_access(),
-						errmsg("could not extend file \"%s\" with FileFallocate(): %m",
-							   FilePathName(v->mdfd_vfd)),
-						errhint("Check free disk space."));
-			}
-		}
-		else
-		{
-			int			ret;
-
-			/*
-			 * Even if we don't want to use fallocate, we can still extend a
-			 * bit more efficiently than writing each 8kB block individually.
-			 * pg_pwrite_zeros() (via FileZero()) uses
-			 * pg_pwritev_with_retry() to avoid multiple writes or needing a
-			 * zeroed buffer for the whole length of the extension.
-			 */
-			ret = FileZero(v->mdfd_vfd,
-						   seekpos, (off_t) BLCKSZ * numblocks,
-						   WAIT_EVENT_DATA_FILE_EXTEND);
-			if (ret < 0)
-				ereport(ERROR,
-						errcode_for_file_access(),
-						errmsg("could not extend file \"%s\": %m",
-							   FilePathName(v->mdfd_vfd)),
-						errhint("Check free disk space."));
-		}
-
-		if (!skipFsync && !SmgrIsTemp(reln))
-			register_dirty_segment(reln, forknum, v);
-
-		Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
-
-		remblocks -= numblocks;
-		curblocknum += numblocks;
-	}
-}
-
-/*
- *	mdopenfork() -- Open one fork of the specified relation.
+ * mdopenfork() -- Open one fork of the specified relation.
  *
  * Note we only open the first segment, when there are multiple segments.
  *
@@ -673,7 +497,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
 }
 
 /*
- *  mdopen() -- Initialize newly-opened relation.
+ * mdopen() -- Initialize newly-opened relation.
  */
 void
 mdopen(SMgrRelation reln)
@@ -684,7 +508,7 @@ mdopen(SMgrRelation reln)
 }
 
 /*
- *	mdclose() -- Close the specified relation, if it isn't closed already.
+ * mdclose() -- Close the specified relation, if it isn't closed already.
  */
 void
 mdclose(SMgrRelation reln, ForkNumber forknum)
@@ -707,7 +531,7 @@ mdclose(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
- *	mdprefetch() -- Initiate asynchronous read of the specified block of a relation
+ * mdprefetch() -- Initiate asynchronous read of the specified block of a relation
  */
 bool
 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
@@ -734,64 +558,7 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 }
 
 /*
- * mdwriteback() -- Tell the kernel to write pages back to storage.
- *
- * This accepts a range of blocks because flushing several pages at once is
- * considerably more efficient than doing so individually.
- */
-void
-mdwriteback(SMgrRelation reln, ForkNumber forknum,
-			BlockNumber blocknum, BlockNumber nblocks)
-{
-	Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
-
-	/*
-	 * Issue flush requests in as few requests as possible; have to split at
-	 * segment boundaries though, since those are actually separate files.
-	 */
-	while (nblocks > 0)
-	{
-		BlockNumber nflush = nblocks;
-		off_t		seekpos;
-		MdfdVec    *v;
-		int			segnum_start,
-					segnum_end;
-
-		v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
-						 EXTENSION_DONT_OPEN);
-
-		/*
-		 * We might be flushing buffers of already removed relations, that's
-		 * ok, just ignore that case.  If the segment file wasn't open already
-		 * (ie from a recent mdwrite()), then we don't want to re-open it, to
-		 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
-		 * us with a descriptor to a file that is about to be unlinked.
-		 */
-		if (!v)
-			return;
-
-		/* compute offset inside the current segment */
-		segnum_start = blocknum / RELSEG_SIZE;
-
-		/* compute number of desired writes within the current segment */
-		segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
-		if (segnum_start != segnum_end)
-			nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
-
-		Assert(nflush >= 1);
-		Assert(nflush <= nblocks);
-
-		seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
-
-		FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
-
-		nblocks -= nflush;
-		blocknum += nflush;
-	}
-}
-
-/*
- *	mdread() -- Read the specified block from a relation.
+ * mdread() -- Read the specified block from a relation.
  */
 void
 mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -856,11 +623,7 @@ mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 }
 
 /*
- *	mdwrite() -- Write the supplied block at the appropriate location.
- *
- *		This is to be used only for updating already-existing blocks of a
- *		relation (ie, those before the current EOF).  To extend a relation,
- *		use mdextend().
+ * mdwrite() -- Write the supplied block at the appropriate location.
  */
 void
 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -924,12 +687,242 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 }
 
 /*
- *	mdnblocks() -- Get the number of blocks stored in a relation.
+ * mdextend() -- Add a block to the specified relation.
+ *
+ * This is to be used for the case of extending a relation (i.e., blocknum is
+ * at or beyond the current EOF).  Note that writing a block beyond current
+ * EOF must cause the intervening file space to become filled with zeroes.
+ * The POSIX file system APIs do that automatically, so we don't need to do
+ * anything about that.
+ */
+void
+mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		 const void *buffer, bool skipFsync)
+{
+	off_t		seekpos;
+	int			nbytes;
+	MdfdVec    *v;
+
+	/* If this build supports direct I/O, the buffer must be I/O aligned. */
+	if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+		Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
+	/* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+	Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+	/*
+	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+	 * more --- we mustn't create a block whose number actually is
+	 * InvalidBlockNumber.  (Note that this failure should be unreachable
+	 * because of upstream checks in bufmgr.c.)
+	 */
+	if (blocknum == InvalidBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot extend file \"%s\" beyond %u blocks",
+						relpath(reln->smgr_rlocator, forknum),
+						InvalidBlockNumber)));
+
+	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
+
+	seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+	{
+		if (nbytes < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not extend file \"%s\": %m",
+							FilePathName(v->mdfd_vfd)),
+					 errhint("Check free disk space.")));
+		/* short write: complain appropriately */
+		ereport(ERROR,
+				(errcode(ERRCODE_DISK_FULL),
+				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
+						FilePathName(v->mdfd_vfd),
+						nbytes, BLCKSZ, blocknum),
+				 errhint("Check free disk space.")));
+	}
+
+	if (!skipFsync && !SmgrIsTemp(reln))
+		register_dirty_segment(reln, forknum, v);
+
+	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+}
+
+/*
+ * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
+ */
+void
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+			 BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+	MdfdVec    *v;
+	BlockNumber curblocknum = blocknum;
+	int			remblocks = nblocks;
+
+	Assert(nblocks > 0);
+
+	/* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+	Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+	/*
+	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+	 * more --- we mustn't create a block whose number actually is
+	 * InvalidBlockNumber or larger.
+	 */
+	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot extend file \"%s\" beyond %u blocks",
+						relpath(reln->smgr_rlocator, forknum),
+						InvalidBlockNumber)));
+
+	while (remblocks > 0)
+	{
+		BlockNumber	segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
+		off_t		seekpos = (off_t) BLCKSZ * segstartblock;
+		int			numblocks;
+
+		if (segstartblock + remblocks > RELSEG_SIZE)
+			numblocks = RELSEG_SIZE - segstartblock;
+		else
+			numblocks = remblocks;
+
+		v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+
+		Assert(segstartblock < RELSEG_SIZE);
+		Assert(segstartblock + numblocks <= RELSEG_SIZE);
+
+		/*
+		 * If available and useful, use posix_fallocate() (via FileAllocate())
+		 * to extend the relation. That's often more efficient than using
+		 * write(), as it commonly won't cause the kernel to allocate page
+		 * cache space for the extended pages.
+		 *
+		 * However, we don't use FileAllocate() for small extensions, as it
+		 * defeats delayed allocation on some filesystems. Not clear where
+		 * that decision should be made though? For now just use a cutoff of
+		 * 8, anything between 4 and 8 worked OK in some local testing.
+		 */
+		if (numblocks > 8)
+		{
+			int			ret;
+
+			ret = FileFallocate(v->mdfd_vfd,
+								seekpos, (off_t) BLCKSZ * numblocks,
+								WAIT_EVENT_DATA_FILE_EXTEND);
+			if (ret != 0)
+			{
+				ereport(ERROR,
+						errcode_for_file_access(),
+						errmsg("could not extend file \"%s\" with FileFallocate(): %m",
+							   FilePathName(v->mdfd_vfd)),
+						errhint("Check free disk space."));
+			}
+		}
+		else
+		{
+			int			ret;
+
+			/*
+			 * Even if we don't want to use fallocate, we can still extend a
+			 * bit more efficiently than writing each 8kB block individually.
+			 * pg_pwrite_zeros() (via FileZero()) uses
+			 * pg_pwritev_with_retry() to avoid multiple writes or needing a
+			 * zeroed buffer for the whole length of the extension.
+			 */
+			ret = FileZero(v->mdfd_vfd,
+						   seekpos, (off_t) BLCKSZ * numblocks,
+						   WAIT_EVENT_DATA_FILE_EXTEND);
+			if (ret < 0)
+				ereport(ERROR,
+						errcode_for_file_access(),
+						errmsg("could not extend file \"%s\": %m",
+							   FilePathName(v->mdfd_vfd)),
+						errhint("Check free disk space."));
+		}
+
+		if (!skipFsync && !SmgrIsTemp(reln))
+			register_dirty_segment(reln, forknum, v);
+
+		Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+		remblocks -= numblocks;
+		curblocknum += numblocks;
+	}
+}
+
+/*
+ * mdwriteback() -- Tell the kernel to write pages back to storage.
+ *
+ * This accepts a range of blocks because flushing several pages at once is
+ * considerably more efficient than doing so individually.
+ */
+void
+mdwriteback(SMgrRelation reln, ForkNumber forknum,
+			BlockNumber blocknum, BlockNumber nblocks)
+{
+	Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
+	/*
+	 * Issue flush requests in as few requests as possible; have to split at
+	 * segment boundaries though, since those are actually separate files.
+	 */
+	while (nblocks > 0)
+	{
+		BlockNumber nflush = nblocks;
+		off_t		seekpos;
+		MdfdVec    *v;
+		int			segnum_start,
+					segnum_end;
+
+		v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
+						 EXTENSION_DONT_OPEN);
+
+		/*
+		 * We might be flushing buffers of already removed relations, that's
+		 * ok, just ignore that case.  If the segment file wasn't open already
+		 * (ie from a recent mdwrite()), then we don't want to re-open it, to
+		 * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
+		 * us with a descriptor to a file that is about to be unlinked.
+		 */
+		if (!v)
+			return;
+
+		/* compute offset inside the current segment */
+		segnum_start = blocknum / RELSEG_SIZE;
+
+		/* compute number of desired writes within the current segment */
+		segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
+		if (segnum_start != segnum_end)
+			nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+		Assert(nflush >= 1);
+		Assert(nflush <= nblocks);
+
+		seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+		FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
+
+		nblocks -= nflush;
+		blocknum += nflush;
+	}
+}
+
+/*
+ * mdnblocks() -- Get the number of blocks stored in a relation.
  *
- *		Important side effect: all active segments of the relation are opened
- *		and added to the md_seg_fds array.  If this routine has not been
- *		called, then only segments up to the last one actually touched
- *		are present in the array.
+ * Important side effect: all active segments of the relation are opened
+ * and added to the md_seg_fds array.  If this routine has not been
+ * called, then only segments up to the last one actually touched
+ * are present in the array.
  */
 BlockNumber
 mdnblocks(SMgrRelation reln, ForkNumber forknum)
@@ -986,7 +979,7 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
- *	mdtruncate() -- Truncate relation to specified number of blocks.
+ * mdtruncate() -- Truncate relation to specified number of blocks.
  */
 void
 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
@@ -1080,7 +1073,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 }
 
 /*
- *	mdimmedsync() -- Immediately sync a relation to stable storage.
+ * mdimmedsync() -- Immediately sync a relation to stable storage.
  *
  * Note that only writes already issued are synced; this routine knows
  * nothing of dirty buffers that may exist inside the buffer manager.  We
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 70d0d570b1..d983a30475 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -30,7 +30,9 @@
 
 /*
  * This struct of function pointers defines the API between smgr.c and
- * any individual storage manager module.  Note that smgr subfunctions are
+ * any individual storage manager module.
+ *
+ * Note that smgr subfunctions are
  * generally expected to report problems via elog(ERROR).  An exception is
  * that smgr_unlink should use elog(WARNING), rather than erroring out,
  * because we normally unlink relations during post-commit/abort cleanup,
@@ -49,16 +51,16 @@ typedef struct f_smgr
 	bool		(*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
 								bool isRedo);
-	void		(*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
-								BlockNumber blocknum, const void *buffer, bool skipFsync);
-	void		(*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
-									BlockNumber blocknum, int nblocks, bool skipFsync);
 	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber blocknum);
 	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
 							  BlockNumber blocknum, void *buffer);
 	void		(*smgr_write) (SMgrRelation reln, ForkNumber forknum,
 							   BlockNumber blocknum, const void *buffer, bool skipFsync);
+	void		(*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
+								BlockNumber blocknum, const void *buffer, bool skipFsync);
+	void		(*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+									BlockNumber blocknum, int nblocks, bool skipFsync);
 	void		(*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
 								   BlockNumber blocknum, BlockNumber nblocks);
 	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
@@ -104,8 +106,7 @@ static void smgrshutdown(int code, Datum arg);
 
 
 /*
- *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
- *								  managers.
+ * smgrinit() -- Initialize storage managers.
  *
  * Note: smgrinit is called during backend startup (normal or standalone
  * case), *not* during postmaster start.  Therefore, any resources created
@@ -142,9 +143,11 @@ smgrshutdown(int code, Datum arg)
 }
 
 /*
- *	smgropen() -- Return an SMgrRelation object, creating it if need be.
+ * smgropen() -- Return an SMgrRelation object, creating it if need be.
+ *
+ * "backend" is for temporary tables, otherwise InvalidBackendId.
  *
- *		This does not attempt to actually open the underlying file.
+ * This does not attempt to actually open the underlying file.
  */
 SMgrRelation
 smgropen(RelFileLocator rlocator, BackendId backend)
@@ -245,7 +248,7 @@ smgrclearowner(SMgrRelation *owner, SMgrRelation reln)
 }
 
 /*
- *	smgrexists() -- Does the underlying file for a fork exist?
+ * smgrexists() -- Does the underlying file for a fork exist?
  */
 bool
 smgrexists(SMgrRelation reln, ForkNumber forknum)
@@ -254,7 +257,7 @@ smgrexists(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
- *	smgrclose() -- Close and delete an SMgrRelation object.
+ * smgrclose() -- Close and delete an SMgrRelation object.
  */
 void
 smgrclose(SMgrRelation reln)
@@ -284,9 +287,9 @@ smgrclose(SMgrRelation reln)
 }
 
 /*
- *	smgrrelease() -- Release all resources used by this object.
+ * smgrrelease() -- Release all resources used by this object.
  *
- *	The object remains valid.
+ * The object remains valid.
  */
 void
 smgrrelease(SMgrRelation reln)
@@ -299,9 +302,9 @@ smgrrelease(SMgrRelation reln)
 }
 
 /*
- *	smgrreleaseall() -- Release resources used by all objects.
+ * smgrreleaseall() -- Release resources used by all objects.
  *
- *	This is called for PROCSIGNAL_BARRIER_SMGRRELEASE.
+ * This is called for PROCSIGNAL_BARRIER_SMGRRELEASE.
  */
 void
 smgrreleaseall(void)
@@ -320,7 +323,7 @@ smgrreleaseall(void)
 }
 
 /*
- *	smgrcloseall() -- Close all existing SMgrRelation objects.
+ * smgrcloseall() -- Close all existing SMgrRelation objects.
  */
 void
 smgrcloseall(void)
@@ -339,8 +342,8 @@ smgrcloseall(void)
 }
 
 /*
- *	smgrcloserellocator() -- Close SMgrRelation object for given RelFileLocator,
- *					   if one exists.
+ * smgrcloserellocator() -- Close SMgrRelation object for given
+ *							RelFileLocator, if one exists.
  *
  * This has the same effects as smgrclose(smgropen(rlocator)), but it avoids
  * uselessly creating a hashtable entry only to drop it again when no
@@ -363,11 +366,13 @@ smgrcloserellocator(RelFileLocatorBackend rlocator)
 }
 
 /*
- *	smgrcreate() -- Create a new relation.
+ * smgrcreate() -- Create a new relation.
  *
- *		Given an already-created (but presumably unused) SMgrRelation,
- *		cause the underlying disk file or other storage for the fork
- *		to be created.
+ * Given an already-created (but presumably unused) SMgrRelation, cause the
+ * underlying disk file or other storage for the fork to be created.
+ *
+ * isRedo is true during recovery.  In that case, the underlying storage may
+ * already exist.
  */
 void
 smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
@@ -376,13 +381,13 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 }
 
 /*
- *	smgrdosyncall() -- Immediately sync all forks of all given relations
+ * smgrdosyncall() -- Immediately sync all forks of all given relations
  *
- *		All forks of all given relations are synced out to the store.
+ * All forks of all given relations are synced out to the store.
  *
- *		This is equivalent to FlushRelationBuffers() for each smgr relation,
- *		then calling smgrimmedsync() for all forks of each relation, but it's
- *		significantly quicker so should be preferred when possible.
+ * This is equivalent to FlushRelationBuffers() for each smgr relation, then
+ * calling smgrimmedsync() for all forks of each relation, but it's
+ * significantly quicker so should be preferred when possible.
  */
 void
 smgrdosyncall(SMgrRelation *rels, int nrels)
@@ -411,14 +416,13 @@ smgrdosyncall(SMgrRelation *rels, int nrels)
 }
 
 /*
- *	smgrdounlinkall() -- Immediately unlink all forks of all given relations
+ * smgrdounlinkall() -- Immediately unlink all forks of all given relations
  *
- *		All forks of all given relations are removed from the store.  This
- *		should not be used during transactional operations, since it can't be
- *		undone.
+ * All forks of all given relations are removed from the store.  This should
+ * not be used during transactional operations, since it can't be undone.
  *
- *		If isRedo is true, it is okay for the underlying file(s) to be gone
- *		already.
+ * If isRedo is true, it is okay for the underlying file(s) to be gone
+ * already.
  */
 void
 smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
@@ -483,15 +487,64 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
 	pfree(rlocators);
 }
 
+/*
+ * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
+ *
+ * In recovery only, this can return false to indicate that a file doesn't
+ * exist (presumably it has been dropped by a later WAL record).
+ */
+bool
+smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
+}
+
+/*
+ * smgrread() -- read a particular block from a relation into the supplied
+ *				 buffer.
+ *
+ * This routine is called from the buffer manager in order to instantiate
+ * pages in the shared buffer cache.  All storage managers return pages in the
+ * format that POSTGRES expects.
+ */
+void
+smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		 void *buffer)
+{
+	smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
+}
+
+/*
+ * smgrwrite() -- Write the supplied buffer out.
+ *
+ * This is to be used only for updating already-existing blocks of a relation
+ * (ie, those before the current EOF).  To extend a relation, use
+ * smgrextend().
+ *
+ * This is not a synchronous write -- the block is not necessarily on disk at
+ * return, only dumped out to the kernel.  However, provisions will be made to
+ * fsync the write before the next checkpoint.
+ *
+ * skipFsync indicates that the caller will make other provisions to fsync the
+ * relation, so we needn't bother.  Temporary relations also do not require
+ * fsync.
+ */
+void
+smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		  const void *buffer, bool skipFsync)
+{
+	smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum,
+										buffer, skipFsync);
+}
 
 /*
- *	smgrextend() -- Add a new block to a file.
+ * smgrextend() -- Add a new block to a file.
  *
- *		The semantics are nearly the same as smgrwrite(): write at the
- *		specified position.  However, this is to be used for the case of
- *		extending a relation (i.e., blocknum is at or beyond the current
- *		EOF).  Note that we assume writing a block beyond current EOF
- *		causes intervening file space to become filled with zeroes.
+ * The semantics are nearly the same as smgrwrite(): write at the specified
+ * position.  However, this is to be used for the case of extending a relation
+ * (i.e., blocknum is at or beyond the current EOF).  Writing a block beyond
+ * current EOF must cause the intervening file space to become filled with
+ * zeroes.
  */
 void
 smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -512,11 +565,13 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 }
 
 /*
- *	smgrzeroextend() -- Add new zeroed out blocks to a file.
+ * smgrzeroextend() -- Add new zeroed out blocks to a file.
+ *
+ * Extend the relation by the given number of blocks, which will be filled
+ * with zeroes.  This is similar to smgrextend() but only extends and does not
+ * write a buffer of data.
  *
- *		Similar to smgrextend(), except the relation can be extended by
- *		multiple blocks at once and the added blocks will be filled with
- *		zeroes.
+ * FIXME: why both blocknum and nblocks
  */
 void
 smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -537,60 +592,8 @@ smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 }
 
 /*
- *	smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
- *
- *		In recovery only, this can return false to indicate that a file
- *		doesn't	exist (presumably it has been dropped by a later WAL
- *		record).
- */
-bool
-smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
-{
-	return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
-}
-
-/*
- *	smgrread() -- read a particular block from a relation into the supplied
- *				  buffer.
- *
- *		This routine is called from the buffer manager in order to
- *		instantiate pages in the shared buffer cache.  All storage managers
- *		return pages in the format that POSTGRES expects.
- */
-void
-smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		 void *buffer)
-{
-	smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
-}
-
-/*
- *	smgrwrite() -- Write the supplied buffer out.
- *
- *		This is to be used only for updating already-existing blocks of a
- *		relation (ie, those before the current EOF).  To extend a relation,
- *		use smgrextend().
- *
- *		This is not a synchronous write -- the block is not necessarily
- *		on disk at return, only dumped out to the kernel.  However,
- *		provisions will be made to fsync the write before the next checkpoint.
- *
- *		skipFsync indicates that the caller will make other provisions to
- *		fsync the relation, so we needn't bother.  Temporary relations also
- *		do not require fsync.
- */
-void
-smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
-		  const void *buffer, bool skipFsync)
-{
-	smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum,
-										buffer, skipFsync);
-}
-
-
-/*
- *	smgrwriteback() -- Trigger kernel writeback for the supplied range of
- *					   blocks.
+ * smgrwriteback() -- Trigger kernel writeback for the supplied range of
+ *					  blocks.
  */
 void
 smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
@@ -601,8 +604,8 @@ smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 }
 
 /*
- *	smgrnblocks() -- Calculate the number of blocks in the
- *					 supplied relation.
+ * smgrnblocks() -- Calculate the number of blocks in the
+ *					supplied relation.
  */
 BlockNumber
 smgrnblocks(SMgrRelation reln, ForkNumber forknum)
@@ -622,8 +625,8 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
- *	smgrnblocks_cached() -- Get the cached number of blocks in the supplied
- *							relation.
+ * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
+ *						   relation.
  *
  * Returns an InvalidBlockNumber when not in recovery and when the relation
  * fork size is not cached.
@@ -642,8 +645,8 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
 }
 
 /*
- *	smgrtruncate() -- Truncate the given forks of supplied relation to
- *					  each specified numbers of blocks
+ * smgrtruncate() -- Truncate the given forks of supplied relation to
+ *					 each specified numbers of blocks
  *
  * The truncation is done immediately, so this can't be rolled back.
  *
@@ -694,27 +697,27 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 }
 
 /*
- *	smgrimmedsync() -- Force the specified relation to stable storage.
- *
- *		Synchronously force all previous writes to the specified relation
- *		down to disk.
- *
- *		This is useful for building completely new relations (eg, new
- *		indexes).  Instead of incrementally WAL-logging the index build
- *		steps, we can just write completed index pages to disk with smgrwrite
- *		or smgrextend, and then fsync the completed index file before
- *		committing the transaction.  (This is sufficient for purposes of
- *		crash recovery, since it effectively duplicates forcing a checkpoint
- *		for the completed index.  But it is *not* sufficient if one wishes
- *		to use the WAL log for PITR or replication purposes: in that case
- *		we have to make WAL entries as well.)
- *
- *		The preceding writes should specify skipFsync = true to avoid
- *		duplicative fsyncs.
- *
- *		Note that you need to do FlushRelationBuffers() first if there is
- *		any possibility that there are dirty buffers for the relation;
- *		otherwise the sync is not very meaningful.
+ * smgrimmedsync() -- Force the specified relation to stable storage.
+ *
+ * Synchronously force all previous writes to the specified relation
+ * down to disk.
+ *
+ * This is useful for building completely new relations (eg, new
+ * indexes).  Instead of incrementally WAL-logging the index build
+ * steps, we can just write completed index pages to disk with smgrwrite
+ * or smgrextend, and then fsync the completed index file before
+ * committing the transaction.  (This is sufficient for purposes of
+ * crash recovery, since it effectively duplicates forcing a checkpoint
+ * for the completed index.  But it is *not* sufficient if one wishes
+ * to use the WAL log for PITR or replication purposes: in that case
+ * we have to make WAL entries as well.)
+ *
+ * The preceding writes should specify skipFsync = true to avoid
+ * duplicative fsyncs.
+ *
+ * Note that you need to do FlushRelationBuffers() first if there is
+ * any possibility that there are dirty buffers for the relation;
+ * otherwise the sync is not very meaningful.
  */
 void
 smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 941879ee6a..8af34e4155 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -26,16 +26,16 @@ extern void mdclose(SMgrRelation reln, ForkNumber forknum);
 extern void mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
-extern void mdextend(SMgrRelation reln, ForkNumber forknum,
-					 BlockNumber blocknum, const void *buffer, bool skipFsync);
-extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
-						 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				   void *buffer);
 extern void mdwrite(SMgrRelation reln, ForkNumber forknum,
 					BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void mdextend(SMgrRelation reln, ForkNumber forknum,
+					 BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+						 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
 						BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a9a179aaba..896512f1bc 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -90,16 +90,16 @@ extern void smgrreleaseall(void);
 extern void smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo);
 extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
-extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
-					   BlockNumber blocknum, const void *buffer, bool skipFsync);
-extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
-						   BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
 					 BlockNumber blocknum, void *buffer);
 extern void smgrwrite(SMgrRelation reln, ForkNumber forknum,
 					  BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
+					   BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blocknum, int nblocks, bool skipFsync);
 extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 						  BlockNumber blocknum, BlockNumber nblocks);
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
-- 
2.40.1