>From 088b80eb0825339eca688e4347a4ef547edcadbb Mon Sep 17 00:00:00 2001
From: Abhijit Menon-Sen <ams@2ndQuadrant.com>
Date: Thu, 6 Nov 2014 00:45:56 +0530
Subject: Recursively fsync PGDATA at startup after a crash

This is so that we don't lose older unflushed writes in the event of
a power failure after crash recovery, where more recent writes are
preserved.

See 20140918083148.GA17265@alap3.anarazel.de for details.
---
 src/backend/access/transam/xlog.c |  49 +++++++++++++++++
 src/backend/storage/file/fd.c     | 112 ++++++++++++++++++++++++++++++++++++++
 src/include/storage/fd.h          |   2 +
 3 files changed, 163 insertions(+)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 6cf4415..084174d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -845,6 +845,8 @@ static void WALInsertLockAcquireExclusive(void);
 static void WALInsertLockRelease(void);
 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
 
+static void fsync_pgdata(char *datadir);
+
 /*
  * Insert an XLOG record represented by an already-constructed chain of data
  * chunks.  This is a low-level routine; to construct the WAL record header
@@ -5878,6 +5880,25 @@ StartupXLOG(void)
 		ereport(FATAL,
 				(errmsg("control file contains invalid data")));
 
+	/*
+	 * If we need to perform crash recovery, we issue an fsync on the
+	 * data directory and its contents to try to ensure that any data
+	 * written before the crash are flushed to disk. Otherwise a power
+	 * failure in the near future might cause earlier unflushed writes
+	 * to be lost, even though more recent data written to disk from
+	 * here on would be persisted.
+	 *
+	 * We also do this if the control file indicates that fsync was
+	 * disabled at some point while the server was running earlier.
+	 */
+
+	if (enableFsync &&
+		(ControlFile->state != DB_SHUTDOWNED &&
+		 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY))
+	{
+		fsync_pgdata(data_directory);
+	}
+
 	if (ControlFile->state == DB_SHUTDOWNED)
 	{
 		/* This is the expected case, so don't be chatty in standalone mode */
@@ -11123,3 +11144,31 @@ SetWalWriterSleeping(bool sleeping)
 	XLogCtl->WalWriterSleeping = sleeping;
 	SpinLockRelease(&XLogCtl->info_lck);
 }
+
+/*
+ * Issue fsync recursively on PGDATA and all its contents.
+ */
+static void
+fsync_pgdata(char *datadir)
+{
+	if (!enableFsync)
+		return;
+
+	/*
+	 * If possible, hint to the kernel that we're soon going to fsync
+	 * the data directory and its contents.
+	 */
+#if defined(HAVE_SYNC_FILE_RANGE) || \
+	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
+	walkdir(datadir, pre_sync_fname);
+#endif
+
+	/*
+	 * Now we do the fsync()s in the same order.
+	 *
+	 * It's important to fsync the destination directory itself as individual
+	 * file fsyncs don't guarantee that the directory entry for the file is
+	 * synced.
+	 */
+	walkdir(datadir, fsync_fname);
+}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index f796717..aba12ca 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2439,3 +2439,115 @@ looks_like_temp_rel_name(const char *name)
 		return false;
 	return true;
 }
+
+/*
+ * Hint to the OS that it should get ready to fsync() this file.
+ *
+ * Adapted from pre_sync_fname in initdb.c
+ */
+void
+pre_sync_fname(char *fname, bool isdir)
+{
+	int			fd;
+
+	fd = open(fname, O_RDONLY | PG_BINARY);
+
+	/*
+	 * Some OSs don't allow us to open directories at all (Windows returns
+	 * EACCES)
+	 */
+	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+		return;
+
+	if (fd < 0)
+		ereport(FATAL,
+				(errmsg("could not open file \"%s\" before fsync",
+						fname)));
+
+	pg_flush_data(fd, 0, 0);
+
+	close(fd);
+}
+
+/*
+ * walkdir: recursively walk a directory, applying the action to each
+ * regular file and directory (including the named directory itself)
+ * and following symbolic links.
+ */
+void
+walkdir(char *path, void (*action) (char *fname, bool isdir))
+{
+	DIR		   *dir;
+	struct dirent *de;
+
+	dir = AllocateDir(path);
+	while ((de = ReadDir(dir, path)) != NULL)
+	{
+		char		subpath[MAXPGPATH];
+		struct stat fst;
+
+		CHECK_FOR_INTERRUPTS();
+
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
+
+		if (lstat(subpath, &fst) < 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", subpath)));
+
+		if (S_ISREG(fst.st_mode))
+			(*action) (subpath, false);
+		else if (S_ISDIR(fst.st_mode))
+			walkdir(subpath, action);
+#ifndef WIN32
+		else if (S_ISLNK(fst.st_mode))
+#else
+		else if (pg_win32_is_junction(subpath))
+#endif
+		{
+#if defined(HAVE_READLINK) || defined(WIN32)
+			char		linkpath[MAXPGPATH];
+			int			len;
+			struct stat lst;
+
+			len = readlink(subpath, linkpath, sizeof(linkpath)-1);
+			if (len < 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not read symbolic link \"%s\": %m",
+								subpath)));
+
+			if (len >= sizeof(linkpath)-1)
+				ereport(ERROR,
+						(errmsg("symbolic link \"%s\" target is too long",
+								subpath)));
+
+			linkpath[len] = '\0';
+
+			if (lstat(linkpath, &lst) == 0)
+			{
+				if (S_ISREG(lst.st_mode))
+					(*action) (linkpath, false);
+				else if (S_ISDIR(lst.st_mode))
+					walkdir(subpath, action);
+			}
+			else if (errno != ENOENT)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not stat file \"%s\": %m", linkpath)));
+#else
+			ereport(WARNING,
+					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+					 errmsg("this platform does not support symbolic links; ignoring \"%s\"",
+							subpath)));
+#endif
+		}
+	}
+	FreeDir(dir);
+
+	(*action) (path, true);
+}
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 5e9571c..5b563a6 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -114,6 +114,8 @@ extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern int	pg_flush_data(int fd, off_t offset, off_t amount);
 extern void fsync_fname(char *fname, bool isdir);
+extern void pre_sync_fname(char *fname, bool isdir);
+extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
-- 
1.9.1

