From 0176762890bc8ddc02e26696ba4414c6c8bac2e5 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Thu, 31 Aug 2023 07:59:52 -0700
Subject: [PATCH v9 3/4] add support for syncfs in frontend support functions

---
 src/bin/initdb/initdb.c               |   5 +-
 src/bin/pg_basebackup/pg_basebackup.c |   5 +-
 src/bin/pg_checksums/pg_checksums.c   |   3 +-
 src/bin/pg_dump/pg_backup.h           |   4 +-
 src/bin/pg_dump/pg_backup_archiver.c  |  14 ++-
 src/bin/pg_dump/pg_backup_archiver.h  |   1 +
 src/bin/pg_dump/pg_backup_directory.c |   2 +-
 src/bin/pg_dump/pg_dump.c             |   3 +-
 src/bin/pg_rewind/file_ops.c          |   2 +-
 src/bin/pg_rewind/pg_rewind.c         |   1 +
 src/bin/pg_rewind/pg_rewind.h         |   2 +
 src/common/file_utils.c               | 170 +++++++++++++++++++++-----
 src/include/common/file_utils.h       |   5 +-
 13 files changed, 172 insertions(+), 45 deletions(-)

diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 905b979947..bbea1e412b 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -165,6 +165,7 @@ static bool show_setting = false;
 static bool data_checksums = false;
 static char *xlog_dir = NULL;
 static int	wal_segment_size_mb = (DEFAULT_XLOG_SEG_SIZE) / (1024 * 1024);
+static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
 
 
 /* internal vars */
@@ -3333,7 +3334,7 @@ main(int argc, char *argv[])
 
 		fputs(_("syncing data to disk ... "), stdout);
 		fflush(stdout);
-		fsync_pgdata(pg_data, PG_VERSION_NUM);
+		fsync_pgdata(pg_data, PG_VERSION_NUM, sync_method);
 		check_ok();
 		return 0;
 	}
@@ -3396,7 +3397,7 @@ main(int argc, char *argv[])
 	{
 		fputs(_("syncing data to disk ... "), stdout);
 		fflush(stdout);
-		fsync_pgdata(pg_data, PG_VERSION_NUM);
+		fsync_pgdata(pg_data, PG_VERSION_NUM, sync_method);
 		check_ok();
 	}
 	else
diff --git a/src/bin/pg_basebackup/pg_basebackup.c b/src/bin/pg_basebackup/pg_basebackup.c
index 1dc8efe0cb..1a6eacf6d5 100644
--- a/src/bin/pg_basebackup/pg_basebackup.c
+++ b/src/bin/pg_basebackup/pg_basebackup.c
@@ -148,6 +148,7 @@ static bool verify_checksums = true;
 static bool manifest = true;
 static bool manifest_force_encode = false;
 static char *manifest_checksums = NULL;
+static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
 
 static bool success = false;
 static bool made_new_pgdata = false;
@@ -2199,11 +2200,11 @@ BaseBackup(char *compression_algorithm, char *compression_detail,
 		if (format == 't')
 		{
 			if (strcmp(basedir, "-") != 0)
-				(void) fsync_dir_recurse(basedir);
+				(void) fsync_dir_recurse(basedir, sync_method);
 		}
 		else
 		{
-			(void) fsync_pgdata(basedir, serverVersion);
+			(void) fsync_pgdata(basedir, serverVersion, sync_method);
 		}
 	}
 
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 9011a19b4e..123450f483 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -44,6 +44,7 @@ static char *only_filenode = NULL;
 static bool do_sync = true;
 static bool verbose = false;
 static bool showprogress = false;
+static DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
 
 typedef enum
 {
@@ -623,7 +624,7 @@ main(int argc, char *argv[])
 		if (do_sync)
 		{
 			pg_log_info("syncing data directory");
-			fsync_pgdata(DataDir, PG_VERSION_NUM);
+			fsync_pgdata(DataDir, PG_VERSION_NUM, sync_method);
 		}
 
 		pg_log_info("updating control file");
diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index aba780ef4b..3a57cdd97d 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -24,6 +24,7 @@
 #define PG_BACKUP_H
 
 #include "common/compression.h"
+#include "common/file_utils.h"
 #include "fe_utils/simple_list.h"
 #include "libpq-fe.h"
 
@@ -307,7 +308,8 @@ extern Archive *OpenArchive(const char *FileSpec, const ArchiveFormat fmt);
 extern Archive *CreateArchive(const char *FileSpec, const ArchiveFormat fmt,
 							  const pg_compress_specification compression_spec,
 							  bool dosync, ArchiveMode mode,
-							  SetupWorkerPtrType setupDumpWorker);
+							  SetupWorkerPtrType setupDumpWorker,
+							  DataDirSyncMethod sync_method);
 
 /* The --list option */
 extern void PrintTOCSummary(Archive *AHX);
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 39ebcfec32..4d83381d84 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -66,7 +66,8 @@ typedef struct _parallelReadyList
 static ArchiveHandle *_allocAH(const char *FileSpec, const ArchiveFormat fmt,
 							   const pg_compress_specification compression_spec,
 							   bool dosync, ArchiveMode mode,
-							   SetupWorkerPtrType setupWorkerPtr);
+							   SetupWorkerPtrType setupWorkerPtr,
+							   DataDirSyncMethod sync_method);
 static void _getObjectDescription(PQExpBuffer buf, const TocEntry *te);
 static void _printTocEntry(ArchiveHandle *AH, TocEntry *te, bool isData);
 static char *sanitize_line(const char *str, bool want_hyphen);
@@ -238,11 +239,12 @@ Archive *
 CreateArchive(const char *FileSpec, const ArchiveFormat fmt,
 			  const pg_compress_specification compression_spec,
 			  bool dosync, ArchiveMode mode,
-			  SetupWorkerPtrType setupDumpWorker)
+			  SetupWorkerPtrType setupDumpWorker,
+			  DataDirSyncMethod sync_method)
 
 {
 	ArchiveHandle *AH = _allocAH(FileSpec, fmt, compression_spec,
-								 dosync, mode, setupDumpWorker);
+								 dosync, mode, setupDumpWorker, sync_method);
 
 	return (Archive *) AH;
 }
@@ -257,7 +259,8 @@ OpenArchive(const char *FileSpec, const ArchiveFormat fmt)
 
 	compression_spec.algorithm = PG_COMPRESSION_NONE;
 	AH = _allocAH(FileSpec, fmt, compression_spec, true,
-				  archModeRead, setupRestoreWorker);
+				  archModeRead, setupRestoreWorker,
+				  DATA_DIR_SYNC_METHOD_FSYNC);
 
 	return (Archive *) AH;
 }
@@ -2233,7 +2236,7 @@ static ArchiveHandle *
 _allocAH(const char *FileSpec, const ArchiveFormat fmt,
 		 const pg_compress_specification compression_spec,
 		 bool dosync, ArchiveMode mode,
-		 SetupWorkerPtrType setupWorkerPtr)
+		 SetupWorkerPtrType setupWorkerPtr, DataDirSyncMethod sync_method)
 {
 	ArchiveHandle *AH;
 	CompressFileHandle *CFH;
@@ -2287,6 +2290,7 @@ _allocAH(const char *FileSpec, const ArchiveFormat fmt,
 	AH->mode = mode;
 	AH->compression_spec = compression_spec;
 	AH->dosync = dosync;
+	AH->sync_method = sync_method;
 
 	memset(&(AH->sqlparse), 0, sizeof(AH->sqlparse));
 
diff --git a/src/bin/pg_dump/pg_backup_archiver.h b/src/bin/pg_dump/pg_backup_archiver.h
index 18b38c17ab..b07673933d 100644
--- a/src/bin/pg_dump/pg_backup_archiver.h
+++ b/src/bin/pg_dump/pg_backup_archiver.h
@@ -312,6 +312,7 @@ struct _archiveHandle
 	pg_compress_specification compression_spec; /* Requested specification for
 												 * compression */
 	bool		dosync;			/* data requested to be synced on sight */
+	DataDirSyncMethod sync_method;
 	ArchiveMode mode;			/* File mode - r or w */
 	void	   *formatData;		/* Header data specific to file format */
 
diff --git a/src/bin/pg_dump/pg_backup_directory.c b/src/bin/pg_dump/pg_backup_directory.c
index 7f2ac7c7fd..6faa3a511f 100644
--- a/src/bin/pg_dump/pg_backup_directory.c
+++ b/src/bin/pg_dump/pg_backup_directory.c
@@ -613,7 +613,7 @@ _CloseArchive(ArchiveHandle *AH)
 		 * individually. Just recurse once through all the files generated.
 		 */
 		if (AH->dosync)
-			fsync_dir_recurse(ctx->directory);
+			fsync_dir_recurse(ctx->directory, AH->sync_method);
 	}
 	AH->FH = NULL;
 }
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 65f64c282d..39a468b131 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -357,6 +357,7 @@ main(int argc, char **argv)
 	char	   *compression_algorithm_str = "none";
 	char	   *error_detail = NULL;
 	bool		user_compression_defined = false;
+	DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
 
 	static DumpOptions dopt;
 
@@ -777,7 +778,7 @@ main(int argc, char **argv)
 
 	/* Open the output file */
 	fout = CreateArchive(filename, archiveFormat, compression_spec,
-						 dosync, archiveMode, setupDumpWorker);
+						 dosync, archiveMode, setupDumpWorker, sync_method);
 
 	/* Make dump options accessible right away */
 	SetArchiveOptions(fout, &dopt, NULL);
diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c
index 25996b4da4..451fb1856e 100644
--- a/src/bin/pg_rewind/file_ops.c
+++ b/src/bin/pg_rewind/file_ops.c
@@ -296,7 +296,7 @@ sync_target_dir(void)
 	if (!do_sync || dry_run)
 		return;
 
-	fsync_pgdata(datadir_target, PG_VERSION_NUM);
+	fsync_pgdata(datadir_target, PG_VERSION_NUM, sync_method);
 }
 
 
diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c
index 7f69f02441..bdfacf3263 100644
--- a/src/bin/pg_rewind/pg_rewind.c
+++ b/src/bin/pg_rewind/pg_rewind.c
@@ -74,6 +74,7 @@ bool		showprogress = false;
 bool		dry_run = false;
 bool		do_sync = true;
 bool		restore_wal = false;
+DataDirSyncMethod sync_method = DATA_DIR_SYNC_METHOD_FSYNC;
 
 /* Target history */
 TimeLineHistoryEntry *targetHistory;
diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h
index ef8bdc1fbb..05729adfef 100644
--- a/src/bin/pg_rewind/pg_rewind.h
+++ b/src/bin/pg_rewind/pg_rewind.h
@@ -13,6 +13,7 @@
 
 #include "access/timeline.h"
 #include "common/logging.h"
+#include "common/file_utils.h"
 #include "datapagemap.h"
 #include "libpq-fe.h"
 #include "storage/block.h"
@@ -24,6 +25,7 @@ extern bool showprogress;
 extern bool dry_run;
 extern bool do_sync;
 extern int	WalSegSz;
+extern DataDirSyncMethod sync_method;
 
 /* Target history */
 extern TimeLineHistoryEntry *targetHistory;
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 74833c4acb..05c73c0bb7 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -51,6 +51,31 @@ static void walkdir(const char *path,
 					int (*action) (const char *fname, bool isdir),
 					bool process_symlinks);
 
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+	int			fd;
+
+	fd = open(path, O_RDONLY, 0);
+
+	if (fd < 0)
+	{
+		pg_log_error("could not open file \"%s\": %m", path);
+		return;
+	}
+
+	if (syncfs(fd) < 0)
+	{
+		pg_log_error("could not synchronize file system for file \"%s\": %m", path);
+		(void) close(fd);
+		exit(EXIT_FAILURE);
+	}
+
+	(void) close(fd);
+}
+#endif
+
 /*
  * Issue fsync recursively on PGDATA and all its contents.
  *
@@ -63,7 +88,8 @@ static void walkdir(const char *path,
  */
 void
 fsync_pgdata(const char *pg_data,
-			 int serverVersion)
+			 int serverVersion,
+			 DataDirSyncMethod sync_method)
 {
 	bool		xlog_is_symlink;
 	char		pg_wal[MAXPGPATH];
@@ -89,30 +115,93 @@ fsync_pgdata(const char *pg_data,
 			xlog_is_symlink = true;
 	}
 
-	/*
-	 * If possible, hint to the kernel that we're soon going to fsync the data
-	 * directory and its contents.
-	 */
+	switch (sync_method)
+	{
+		case DATA_DIR_SYNC_METHOD_SYNCFS:
+			{
+#ifndef HAVE_SYNCFS
+				pg_log_error("this build does not support sync method \"%s\"",
+							 "syncfs");
+				exit(EXIT_FAILURE);
+#else
+				DIR		   *dir;
+				struct dirent *de;
+
+				/*
+				 * On Linux, we don't have to open every single file one by
+				 * one.  We can use syncfs() to sync whole filesystems.  We
+				 * only expect filesystem boundaries to exist where we
+				 * tolerate symlinks, namely pg_wal and the tablespaces, so we
+				 * call syncfs() for each of those directories.
+				 */
+
+				/* Sync the top level pgdata directory. */
+				do_syncfs(pg_data);
+
+				/* If any tablespaces are configured, sync each of those. */
+				dir = opendir(pg_tblspc);
+				if (dir == NULL)
+					pg_log_error("could not open directory \"%s\": %m",
+								 pg_tblspc);
+				else
+				{
+					while (errno = 0, (de = readdir(dir)) != NULL)
+					{
+						char		subpath[MAXPGPATH * 2];
+
+						if (strcmp(de->d_name, ".") == 0 ||
+							strcmp(de->d_name, "..") == 0)
+							continue;
+
+						snprintf(subpath, sizeof(subpath), "%s/%s",
+								 pg_tblspc, de->d_name);
+						do_syncfs(subpath);
+					}
+
+					if (errno)
+						pg_log_error("could not read directory \"%s\": %m",
+									 pg_tblspc);
+
+					(void) closedir(dir);
+				}
+
+				/* If pg_wal is a symlink, process that too. */
+				if (xlog_is_symlink)
+					do_syncfs(pg_wal);
+#endif							/* HAVE_SYNCFS */
+			}
+			break;
+
+		case DATA_DIR_SYNC_METHOD_FSYNC:
+			{
+				/*
+				 * If possible, hint to the kernel that we're soon going to
+				 * fsync the data directory and its contents.
+				 */
 #ifdef PG_FLUSH_DATA_WORKS
-	walkdir(pg_data, pre_sync_fname, false);
-	if (xlog_is_symlink)
-		walkdir(pg_wal, pre_sync_fname, false);
-	walkdir(pg_tblspc, pre_sync_fname, true);
+				walkdir(pg_data, pre_sync_fname, false);
+				if (xlog_is_symlink)
+					walkdir(pg_wal, pre_sync_fname, false);
+				walkdir(pg_tblspc, pre_sync_fname, true);
 #endif
 
-	/*
-	 * Now we do the fsync()s in the same order.
-	 *
-	 * The main call ignores symlinks, so in addition to specially processing
-	 * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
-	 * process_symlinks = true.  Note that if there are any plain directories
-	 * in pg_tblspc, they'll get fsync'd twice.  That's not an expected case
-	 * so we don't worry about optimizing it.
-	 */
-	walkdir(pg_data, fsync_fname, false);
-	if (xlog_is_symlink)
-		walkdir(pg_wal, fsync_fname, false);
-	walkdir(pg_tblspc, fsync_fname, true);
+				/*
+				 * Now we do the fsync()s in the same order.
+				 *
+				 * The main call ignores symlinks, so in addition to specially
+				 * processing pg_wal if it's a symlink, pg_tblspc has to be
+				 * visited separately with process_symlinks = true.  Note that
+				 * if there are any plain directories in pg_tblspc, they'll
+				 * get fsync'd twice. That's not an expected case so we don't
+				 * worry about optimizing it.
+				 */
+				walkdir(pg_data, fsync_fname, false);
+				if (xlog_is_symlink)
+					walkdir(pg_wal, fsync_fname, false);
+				walkdir(pg_tblspc, fsync_fname, true);
+			}
+			break;
+	}
 }
 
 /*
@@ -121,17 +210,40 @@ fsync_pgdata(const char *pg_data,
  * This is a convenient wrapper on top of walkdir().
  */
 void
-fsync_dir_recurse(const char *dir)
+fsync_dir_recurse(const char *dir, DataDirSyncMethod sync_method)
 {
-	/*
-	 * If possible, hint to the kernel that we're soon going to fsync the data
-	 * directory and its contents.
-	 */
+	switch (sync_method)
+	{
+		case DATA_DIR_SYNC_METHOD_SYNCFS:
+			{
+#ifndef HAVE_SYNCFS
+				pg_log_error("this build does not support sync method \"%s\"",
+							 "syncfs");
+				exit(EXIT_FAILURE);
+#else
+				/*
+				 * On Linux, we don't have to open every single file one by
+				 * one.  We can use syncfs() to sync the whole filesystem.
+				 */
+				do_syncfs(dir);
+#endif							/* HAVE_SYNCFS */
+			}
+			break;
+
+		case DATA_DIR_SYNC_METHOD_FSYNC:
+			{
+				/*
+				 * If possible, hint to the kernel that we're soon going to
+				 * fsync the data directory and its contents.
+				 */
 #ifdef PG_FLUSH_DATA_WORKS
-	walkdir(dir, pre_sync_fname, false);
+				walkdir(dir, pre_sync_fname, false);
 #endif
 
-	walkdir(dir, fsync_fname, false);
+				walkdir(dir, fsync_fname, false);
+			}
+			break;
+	}
 }
 
 /*
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 7da21f15e6..cae6159bf6 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -34,8 +34,9 @@ struct iovec;					/* avoid including port/pg_iovec.h here */
 
 #ifdef FRONTEND
 extern int	fsync_fname(const char *fname, bool isdir);
-extern void fsync_pgdata(const char *pg_data, int serverVersion);
-extern void fsync_dir_recurse(const char *dir);
+extern void fsync_pgdata(const char *pg_data, int serverVersion,
+						 DataDirSyncMethod sync_method);
+extern void fsync_dir_recurse(const char *dir, DataDirSyncMethod sync_method);
 extern int	durable_rename(const char *oldfile, const char *newfile);
 extern int	fsync_parent_path(const char *fname);
 #endif
-- 
2.25.1

