From ad8d317335092201682f9759cf88ee6382918f7d Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sun, 23 Oct 2022 14:25:46 -0700
Subject: [PATCH v1 04/12] Add smgrzeroextend(), FileZero(), FileFallocate()

smgrzeroextend() uses FileFallocate() to efficiently extend files by multiple
blocks. When extending by a small number of blocks, use FileZero() instead, as
using posix_fallocate() for small numbers of blocks is inefficient for some
file systems / operating systems. FileZero() is also used as the fallback for
FileFallocate() on platforms / filesystems that don't support fallocate.

Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/include/storage/fd.h        |   3 +
 src/include/storage/md.h        |   2 +
 src/include/storage/smgr.h      |   2 +
 src/backend/storage/file/fd.c   | 105 ++++++++++++++++++++++++++++++++
 src/backend/storage/smgr/md.c   | 103 +++++++++++++++++++++++++++++++
 src/backend/storage/smgr/smgr.c |  21 +++++++
 6 files changed, 236 insertions(+)

diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index c0a212487d9..3d309edb152 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -106,6 +106,9 @@ extern int	FilePrefetch(File file, off_t offset, int amount, uint32 wait_event_i
 extern int	FileRead(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
 extern int	FileWrite(File file, char *buffer, int amount, off_t offset, uint32 wait_event_info);
 extern int	FileSync(File file, uint32 wait_event_info);
+extern int	FileZero(File file, off_t offset, off_t len, uint32 wait_event_info);
+extern int	FileFallocate(File file, off_t offset, off_t len, uint32 wait_event_info);
+
 extern off_t FileSize(File file);
 extern int	FileTruncate(File file, off_t offset, uint32 wait_event_info);
 extern void FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info);
diff --git a/src/include/storage/md.h b/src/include/storage/md.h
index 10aa1b0109b..d0597f8a575 100644
--- a/src/include/storage/md.h
+++ b/src/include/storage/md.h
@@ -28,6 +28,8 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum);
 extern void mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
 extern void mdextend(SMgrRelation reln, ForkNumber forknum,
 					 BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+						 BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool mdprefetch(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum);
 extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a07715356ba..503310e82ba 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -92,6 +92,8 @@ extern void smgrdosyncall(SMgrRelation *rels, int nrels);
 extern void smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo);
 extern void smgrextend(SMgrRelation reln, ForkNumber forknum,
 					   BlockNumber blocknum, char *buffer, bool skipFsync);
+extern void smgrzeroextend(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blocknum, int nblocks, bool skipFsync);
 extern bool smgrprefetch(SMgrRelation reln, ForkNumber forknum,
 						 BlockNumber blocknum);
 extern void smgrread(SMgrRelation reln, ForkNumber forknum,
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 4151cafec54..47b620649aa 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -93,6 +93,7 @@
 #include "common/pg_prng.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/pg_iovec.h"
 #include "portability/mem.h"
 #include "postmaster/startup.h"
 #include "storage/fd.h"
@@ -2205,6 +2206,105 @@ FileSync(File file, uint32 wait_event_info)
 	return returnCode;
 }
 
+/* Shared all-zeroes block, so that FileZero() never has to clear a buffer */
+static const PGAlignedBlock zerobuf = {0};
+
+/*
+ * FileZero --- write len bytes of zeroes to file, starting at offset.
+ *
+ * Returns 0 on success and a negative value on failure.
+ *
+ * FIXME: Quick-and-dirty implementation, to be replaced by pg_pwrite_zeros()
+ * from https://postgr.es/m/Y1oc%2BFjiyVjNZa%2BL%40paquier.xyz -- otherwise
+ * it'd not at all be ok to rely on len being a multiple of BLCKSZ.
+ */
+int
+FileZero(File file, off_t offset, off_t len, uint32 wait_event_info)
+{
+	struct iovec zeros[PG_IOV_MAX];
+	int			blocks_left;
+	int			rc;
+
+	Assert((len % BLCKSZ) == 0);
+	Assert(FileIsValid(file));
+
+	rc = FileAccess(file);
+	if (rc < 0)
+		return rc;
+
+	blocks_left = len / BLCKSZ;
+	/* Aim every iovec entry we can possibly need at the shared zero block */
+	for (int i = 0; i < Min(blocks_left, lengthof(zeros)); i++)
+	{
+		zeros[i].iov_base = (char *) zerobuf.data;
+		zeros[i].iov_len = BLCKSZ;
+	}
+
+	while (blocks_left > 0)
+	{
+		int			iovcnt = Min(blocks_left, lengthof(zeros));
+		ssize_t		written;
+
+		pgstat_report_wait_start(wait_event_info);
+		written = pg_pwritev_with_retry(VfdCache[file].fd, zeros, iovcnt, offset);
+		pgstat_report_wait_end();
+
+		if (written < 0)
+			return -1;
+
+		Assert(written == iovcnt * BLCKSZ);
+		offset += iovcnt * BLCKSZ;
+		blocks_left -= iovcnt;
+	}
+
+	return 0;
+}
+
+/*
+ * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
+ * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
+ * use FileZero() instead.
+ *
+ * Note that at least glibc implements posix_fallocate() in userspace if not
+ * implemented by the filesystem. That's not the case for all environments
+ * though.
+ *
+ * Returns 0 on success and a nonzero value on failure: the FileAccess() or
+ * FileZero() result, or the error number returned by posix_fallocate(), which
+ * is also stored in errno. Callers should therefore test for "!= 0".
+ */
+int
+FileFallocate(File file, off_t offset, off_t len, uint32 wait_event_info)
+{
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+#ifdef HAVE_POSIX_FALLOCATE
+	pgstat_report_wait_start(wait_event_info);
+	returnCode = posix_fallocate(VfdCache[file].fd, offset, len);
+	pgstat_report_wait_end();
+
+	if (returnCode == 0)
+		return 0;
+
+	/* for compatibility with %m printing etc */
+	errno = returnCode;
+
+	/*
+	 * Return in cases of a "real" failure, if fallocate is not supported,
+	 * fall through to the FileZero() backed implementation.
+	 */
+	if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
+		return returnCode;
+#endif
+
+	return FileZero(file, offset, len, wait_event_info);
+}
+
 off_t
 FileSize(File file)
 {
@@ -2277,6 +2377,11 @@ int
 FileGetRawDesc(File file)
 {
 	Assert(FileIsValid(file));
+
+	/* Report failure rather than hand out a descriptor if FileAccess() fails */
+	if (FileAccess(file) < 0)
+		return -1;
+
 	return VfdCache[file].fd;
 }
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index a515bb36ac1..eee3cae7c4e 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -28,6 +28,7 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "commands/tablespace.h"
+#include "common/file_utils.h"
 #include "miscadmin.h"
 #include "pg_trace.h"
 #include "pgstat.h"
@@ -486,6 +487,108 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 }
 
+/*
+ *	mdzeroextend() -- Extend a relation by nblocks blocks at blocknum.
+ *
+ * The new range may cross segment boundaries; segments are extended one at
+ * a time. Failures are reported with ereport(ERROR).
+ */
+void
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+			 BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+	BlockNumber nextblock = blocknum;
+	int			blocks_left = nblocks;
+
+	Assert(nblocks > 0);
+
+	/* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+	Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+	/*
+	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+	 * more --- we mustn't create a block whose number actually is
+	 * InvalidBlockNumber or larger.
+	 */
+	if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot extend file \"%s\" beyond %u blocks",
+						relpath(reln->smgr_rlocator, forknum),
+						InvalidBlockNumber)));
+
+	while (blocks_left > 0)
+	{
+		MdfdVec    *seg;
+		int			segstart = nextblock % ((BlockNumber) RELSEG_SIZE);
+		int			segend = (nextblock % ((BlockNumber) RELSEG_SIZE)) + blocks_left;
+		off_t		startpos = (off_t) BLCKSZ * segstart;
+		int			extendby;
+		int			rc;
+
+		/* Clamp this round to the end of the current segment */
+		if (segend > RELSEG_SIZE)
+			segend = RELSEG_SIZE;
+		extendby = segend - segstart;
+
+		seg = _mdfd_getseg(reln, forknum, nextblock, skipFsync, EXTENSION_CREATE);
+
+		Assert(segstart < RELSEG_SIZE);
+		Assert(segend <= RELSEG_SIZE);
+
+		if (extendby <= 8)
+		{
+			/*
+			 * Small extension: don't use fallocate(), it defeats delayed
+			 * allocation on some filesystems. Not clear where that decision
+			 * should be made though? For now just use a cutoff of 8, anything
+			 * between 4 and 8 worked OK in some local testing.
+			 *
+			 * FileZero() can still extend a bit more efficiently than writing
+			 * each 8kB block individually: it uses pg_pwritev_with_retry()
+			 * with a single zeroed buffer, avoiding the need for a zeroed
+			 * buffer covering the whole length of the extension.
+			 */
+			rc = FileZero(seg->mdfd_vfd, startpos,
+						  (off_t) BLCKSZ * extendby,
+						  WAIT_EVENT_DATA_FILE_EXTEND);
+			if (rc < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not extend file \"%s\": %m",
+								FilePathName(seg->mdfd_vfd)),
+						 errhint("Check free disk space.")));
+		}
+		else
+		{
+			/*
+			 * Larger extension: use posix_fallocate() if available, which is
+			 * often more efficient than write() as it commonly won't cause the
+			 * kernel to allocate page cache space for the extended pages.
+			 */
+			rc = FileFallocate(seg->mdfd_vfd, startpos,
+							   (off_t) BLCKSZ * extendby,
+							   WAIT_EVENT_DATA_FILE_EXTEND);
+			if (rc != 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not extend file \"%s\" with posix_fallocate(): %m",
+								FilePathName(seg->mdfd_vfd)),
+						 errhint("Check free disk space.")));
+		}
+
+		if (!skipFsync && !SmgrIsTemp(reln))
+			register_dirty_segment(reln, forknum, seg);
+
+		Assert(_mdnblocks(reln, forknum, seg) <= ((BlockNumber) RELSEG_SIZE));
+
+		blocks_left -= extendby;
+		nextblock += extendby;
+	}
+}
+
 /*
  *	mdopenfork() -- Open one fork of the specified relation.
  *
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index c1a5febcbfd..6fb693cb062 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -50,6 +50,8 @@ typedef struct f_smgr
 								bool isRedo);
 	void		(*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
 								BlockNumber blocknum, char *buffer, bool skipFsync);
+	void		(*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+									BlockNumber blocknum, int nblocks, bool skipFsync);
 	bool		(*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber blocknum);
 	void		(*smgr_read) (SMgrRelation reln, ForkNumber forknum,
@@ -75,6 +77,7 @@ static const f_smgr smgrsw[] = {
 		.smgr_exists = mdexists,
 		.smgr_unlink = mdunlink,
 		.smgr_extend = mdextend,
+		.smgr_zeroextend = mdzeroextend,
 		.smgr_prefetch = mdprefetch,
 		.smgr_read = mdread,
 		.smgr_write = mdwrite,
@@ -507,6 +510,24 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
 }
 
+/*
+ *	smgrzeroextend() -- Extend a relation by nblocks blocks at blocknum.
+ */
+void
+smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			   int nblocks, bool skipFsync)
+{
+	smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
+											 nblocks, skipFsync);
+
+	/*
+	 * Advance the cached size if it was as expected, else invalidate it.
+	 */
+	reln->smgr_cached_nblocks[forknum] =
+		(reln->smgr_cached_nblocks[forknum] == blocknum) ?
+		blocknum + nblocks : InvalidBlockNumber;
+}
+
 /*
  *	smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
  *
-- 
2.38.0

