From 54112d952592b9a1d3858384b5c79b6334b20ac6 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@enterprisedb.com>
Date: Mon, 31 Dec 2018 15:25:16 +1300
Subject: [PATCH 2/2] Refactor the fsync machinery to support future SMGR
 implementations.

In anticipation of proposed block storage managers alongside md.c that
map bufmgr.c blocks to files optimised for different usage patterns:

1.  Move the system for requesting fsyncs out of md.c into a new
translation unit smgrsync.c.

2.  Have smgrsync.c perform the actual fsync() calls via the existing
polymorphic smgrimmedsync() interface, extended to allow an individual
segment number to be specified.

3.  Teach the checkpointer how to forget individual segments that are
unlinked from the 'front' after having been dropped from shared
buffers.

4.  Move the request tracking from a bitmapset into a sorted vector,
because the proposed block storage managers are not anchored at zero
and use potentially very large and sparse integers.

Author: Thomas Munro
Reviewed-by:
Discussion: https://postgr.es/m/CAEepm=2gTANm=e3ARnJT=n0h8hf88wqmaZxk0JYkxw+b21fNrw@mail.gmail.com
---
 contrib/bloom/blinsert.c              |   2 +-
 src/backend/access/heap/heapam.c      |   4 +-
 src/backend/access/nbtree/nbtree.c    |   2 +-
 src/backend/access/nbtree/nbtsort.c   |   4 +-
 src/backend/access/spgist/spginsert.c |   2 +-
 src/backend/access/transam/xlog.c     |   2 +
 src/backend/catalog/heap.c            |   2 +-
 src/backend/commands/dbcommands.c     |   2 +-
 src/backend/commands/tablecmds.c      |   2 +-
 src/backend/commands/tablespace.c     |   1 -
 src/backend/postmaster/checkpointer.c |  21 +-
 src/backend/storage/buffer/bufmgr.c   |   1 +
 src/backend/storage/smgr/Makefile     |   2 +-
 src/backend/storage/smgr/md.c         | 776 ++----------------------------
 src/backend/storage/smgr/smgr.c       | 117 +++--
 src/backend/storage/smgr/smgrsync.c   | 855 ++++++++++++++++++++++++++++++++++
 src/backend/tcop/utility.c            |   1 -
 src/backend/utils/misc/guc.c          |   1 +
 src/include/postmaster/bgwriter.h     |  24 +-
 src/include/postmaster/checkpointer.h |  39 ++
 src/include/storage/smgr.h            |  31 +-
 src/include/storage/smgrsync.h        |  35 ++
 22 files changed, 1063 insertions(+), 863 deletions(-)
 create mode 100644 src/backend/storage/smgr/smgrsync.c
 create mode 100644 src/include/postmaster/checkpointer.h
 create mode 100644 src/include/storage/smgrsync.h

diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
index e43fbe0005..6fa07db4f8 100644
--- a/contrib/bloom/blinsert.c
+++ b/contrib/bloom/blinsert.c
@@ -188,7 +188,7 @@ blbuildempty(Relation index)
 	 * write did not go through shared_buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
 	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	smgrimmedsync(index->rd_smgr, INIT_FORKNUM, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index dc3499349b..1b32b0128d 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8980,7 +8980,7 @@ heap_sync(Relation rel)
 	/* main heap */
 	FlushRelationBuffers(rel);
 	/* FlushRelationBuffers will have opened rd_smgr */
-	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
+	smgrimmedsyncrel(rel->rd_smgr, MAIN_FORKNUM);
 
 	/* FSM is not critical, don't bother syncing it */
 
@@ -8991,7 +8991,7 @@ heap_sync(Relation rel)
 
 		toastrel = table_open(rel->rd_rel->reltoastrelid, AccessShareLock);
 		FlushRelationBuffers(toastrel);
-		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsyncrel(toastrel->rd_smgr, MAIN_FORKNUM);
 		table_close(toastrel, AccessShareLock);
 	}
 }
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 98917de2ef..1bccc7a8df 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -178,7 +178,7 @@ btbuildempty(Relation index)
 	 * write did not go through shared_buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
 	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	smgrimmedsyncrel(index->rd_smgr, INIT_FORKNUM);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index dc398e1186..3b555eff89 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -34,7 +34,7 @@
  * Since the index will never be used unless it is completely built,
  * from a crash-recovery point of view there is no need to WAL-log the
  * steps of the build.  After completing the index build, we can just sync
- * the whole file to disk using smgrimmedsync() before exiting this module.
+ * the whole file to disk using smgrimmedsyncrel() before exiting this module.
  * This can be seen to be sufficient for crash recovery by considering that
  * it's effectively equivalent to what would happen if a CHECKPOINT occurred
  * just after the index build.  However, it is clearly not sufficient if the
@@ -1208,7 +1208,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	if (RelationNeedsWAL(wstate->index))
 	{
 		RelationOpenSmgr(wstate->index);
-		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsyncrel(wstate->index->rd_smgr, MAIN_FORKNUM);
 	}
 }
 
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index f428a15138..0c5ab317cb 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -205,7 +205,7 @@ spgbuildempty(Relation index)
 	 * writes did not go through shared buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
 	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	smgrimmedsyncrel(index->rd_smgr, INIT_FORKNUM);
 }
 
 /*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ecd12fc53a..87d1172373 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -44,6 +44,7 @@
 #include "pgstat.h"
 #include "port/atomics.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "postmaster/walwriter.h"
 #include "postmaster/startup.h"
 #include "replication/basebackup.h"
@@ -64,6 +65,7 @@
 #include "storage/procarray.h"
 #include "storage/reinit.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index 06d18a1cfb..e0a38a1144 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -1421,7 +1421,7 @@ heap_create_init_fork(Relation rel)
 	RelationOpenSmgr(rel);
 	smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
 	log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
-	smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
+	smgrimmedsyncrel(rel->rd_smgr, INIT_FORKNUM);
 }
 
 /*
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index d207cd899f..1be6c874a1 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -47,7 +47,6 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
-#include "postmaster/bgwriter.h"
 #include "replication/slot.h"
 #include "storage/copydir.h"
 #include "storage/fd.h"
@@ -55,6 +54,7 @@
 #include "storage/ipc.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 715c6a221c..4e662bdf92 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -11788,7 +11788,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 	 * here, they might still not be on disk when the crash occurs.
 	 */
 	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
-		smgrimmedsync(dst, forkNum);
+		smgrimmedsyncrel(dst, forkNum);
 }
 
 /*
diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c
index 4afd178e97..e450e161a0 100644
--- a/src/backend/commands/tablespace.c
+++ b/src/backend/commands/tablespace.c
@@ -70,7 +70,6 @@
 #include "commands/tablespace.h"
 #include "common/file_perm.h"
 #include "miscadmin.h"
-#include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/lmgr.h"
 #include "storage/standby.h"
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fe96c41359..da233b1cdc 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -47,6 +47,7 @@
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/postmaster.h"
 #include "replication/syncrep.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
@@ -56,6 +57,7 @@
 #include "storage/proc.h"
 #include "storage/shmem.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "storage/spin.h"
 #include "utils/guc.h"
 #include "utils/memutils.h"
@@ -108,10 +110,10 @@
  */
 typedef struct
 {
-	RelFileNode rnode;
+	int			type;
+	RelFileNode	rnode;
 	ForkNumber	forknum;
-	BlockNumber segno;			/* see md.c for special values */
-	/* might add a real request-type field later; not needed yet */
+	SegmentNumber segno;
 } CheckpointerRequest;
 
 typedef struct
@@ -1077,9 +1079,7 @@ RequestCheckpoint(int flags)
  * RelFileNodeBackend.
  *
  * segno specifies which segment (not block!) of the relation needs to be
- * fsync'd.  (Since the valid range is much less than BlockNumber, we can
- * use high values for special flags; that's all internal to md.c, which
- * see for details.)
+ * fsync'd.
  *
  * To avoid holding the lock for longer than necessary, we normally write
  * to the requests[] queue without checking for duplicates.  The checkpointer
@@ -1092,13 +1092,14 @@ RequestCheckpoint(int flags)
  * let the backend know by returning false.
  */
 bool
-ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+ForwardFsyncRequest(int type, RelFileNode rnode, ForkNumber forknum,
+					SegmentNumber segno)
 {
 	CheckpointerRequest *request;
 	bool		too_full;
 
 	if (!IsUnderPostmaster)
-		return false;			/* probably shouldn't even get here */
+		elog(ERROR, "ForwardFsyncRequest must not be called in single user mode");
 
 	if (AmCheckpointerProcess())
 		elog(ERROR, "ForwardFsyncRequest must not be called in checkpointer");
@@ -1130,6 +1131,7 @@ ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
 
 	/* OK, insert request */
 	request = &CheckpointerShmem->requests[CheckpointerShmem->num_requests++];
+	request->type = type;
 	request->rnode = rnode;
 	request->forknum = forknum;
 	request->segno = segno;
@@ -1314,7 +1316,8 @@ AbsorbFsyncRequests(void)
 	LWLockRelease(CheckpointerCommLock);
 
 	for (request = requests; n > 0; request++, n--)
-		RememberFsyncRequest(request->rnode, request->forknum, request->segno);
+		RememberFsyncRequest(request->type, request->rnode, request->forknum,
+							 request->segno);
 
 	END_CRIT_SECTION();
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 273e2f385f..b75021cce3 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -47,6 +47,7 @@
 #include "storage/ipc.h"
 #include "storage/proc.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "storage/standby.h"
 #include "utils/rel.h"
 #include "utils/resowner_private.h"
diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile
index 2b95cb0df1..c9c4be325e 100644
--- a/src/backend/storage/smgr/Makefile
+++ b/src/backend/storage/smgr/Makefile
@@ -12,6 +12,6 @@ subdir = src/backend/storage/smgr
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = md.o smgr.o smgrtype.o
+OBJS = md.o smgr.o smgrsync.o smgrtype.o
 
 include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2aba2dfe91..73c68e56bf 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -30,37 +30,23 @@
 #include "access/xlog.h"
 #include "pgstat.h"
 #include "portability/instr_time.h"
-#include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/bufmgr.h"
 #include "storage/relfilenode.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "utils/hsearch.h"
 #include "utils/memutils.h"
 #include "pg_trace.h"
 
 
-/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
-#define FSYNCS_PER_ABSORB		10
-#define UNLINKS_PER_ABSORB		10
-
-/*
- * Special values for the segno arg to RememberFsyncRequest.
- *
- * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
- * fsync request from the queue if an identical, subsequent request is found.
- * See comments there before making changes here.
- */
-#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
-#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
-#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
 
 /*
  * On Windows, we have to interpret EACCES as possibly meaning the same as
  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  * that's what you get.  Ugh.  This code is designed so that we don't
  * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting canceled ... see mdsync).
+ * a pending fsync request getting canceled ... see smgrsync).
  */
 #ifndef WIN32
 #define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
@@ -134,30 +120,9 @@ static MemoryContext MdCxt;		/* context for all MdfdVec objects */
  * (Regular backends do not track pending operations locally, but forward
  * them to the checkpointer.)
  */
-typedef uint16 CycleCtr;		/* can be any convenient integer size */
-
-typedef struct
-{
-	RelFileNode rnode;			/* hash table key (must be first!) */
-	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
-	/* requests[f] has bit n set if we need to fsync segment n of fork f */
-	Bitmapset  *requests[MAX_FORKNUM + 1];
-	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
-	bool		canceled[MAX_FORKNUM + 1];
-} PendingOperationEntry;
-
-typedef struct
-{
-	RelFileNode rnode;			/* the dead relation to delete */
-	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
-} PendingUnlinkEntry;
+typedef uint32 CycleCtr;		/* can be any convenient integer size */
 
-static HTAB *pendingOpsTable = NULL;
-static List *pendingUnlinks = NIL;
-static MemoryContext pendingOpsCxt; /* context for the above  */
 
-static CycleCtr mdsync_cycle_ctr = 0;
-static CycleCtr mdckpt_cycle_ctr = 0;
 
 
 /*** behavior for mdopen & _mdfd_getseg ***/
@@ -184,8 +149,7 @@ static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
 			 bool isRedo);
 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
-					   MdfdVec *seg);
-static void register_unlink(RelFileNodeBackend rnode);
+							   MdfdVec *seg);
 static void _fdvec_resize(SMgrRelation reln,
 			  ForkNumber forknum,
 			  int nseg);
@@ -208,64 +172,6 @@ mdinit(void)
 	MdCxt = AllocSetContextCreate(TopMemoryContext,
 								  "MdSmgr",
 								  ALLOCSET_DEFAULT_SIZES);
-
-	/*
-	 * Create pending-operations hashtable if we need it.  Currently, we need
-	 * it if we are standalone (not under a postmaster) or if we are a startup
-	 * or checkpointer auxiliary process.
-	 */
-	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
-	{
-		HASHCTL		hash_ctl;
-
-		/*
-		 * XXX: The checkpointer needs to add entries to the pending ops table
-		 * when absorbing fsync requests.  That is done within a critical
-		 * section, which isn't usually allowed, but we make an exception. It
-		 * means that there's a theoretical possibility that you run out of
-		 * memory while absorbing fsync requests, which leads to a PANIC.
-		 * Fortunately the hash table is small so that's unlikely to happen in
-		 * practice.
-		 */
-		pendingOpsCxt = AllocSetContextCreate(MdCxt,
-											  "Pending ops context",
-											  ALLOCSET_DEFAULT_SIZES);
-		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
-
-		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(RelFileNode);
-		hash_ctl.entrysize = sizeof(PendingOperationEntry);
-		hash_ctl.hcxt = pendingOpsCxt;
-		pendingOpsTable = hash_create("Pending Ops Table",
-									  100L,
-									  &hash_ctl,
-									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-		pendingUnlinks = NIL;
-	}
-}
-
-/*
- * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
- * already created the pendingOpsTable during initialization of the startup
- * process.  Calling this function drops the local pendingOpsTable so that
- * subsequent requests will be forwarded to checkpointer.
- */
-void
-SetForwardFsyncRequests(void)
-{
-	/* Perform any pending fsyncs we may have queued up, then drop table */
-	if (pendingOpsTable)
-	{
-		mdsync();
-		hash_destroy(pendingOpsTable);
-	}
-	pendingOpsTable = NULL;
-
-	/*
-	 * We should not have any pending unlink requests, since mdunlink doesn't
-	 * queue unlink requests when isRedo.
-	 */
-	Assert(pendingUnlinks == NIL);
 }
 
 /*
@@ -382,7 +288,7 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 {
 	/*
 	 * We have to clean out any pending fsync requests for the doomed
-	 * relation, else the next mdsync() will fail.  There can't be any such
+	 * relation, else the next smgrsync() will fail.  There can't be any such
 	 * requests for a temp relation, though.  We can send just one request
 	 * even when deleting multiple forks, since the fsync queuing code accepts
 	 * the "InvalidForkNumber = all forks" convention.
@@ -442,7 +348,7 @@ mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 					 errmsg("could not truncate file \"%s\": %m", path)));
 
 		/* Register request to unlink first segment later */
-		register_unlink(rnode);
+		UnlinkAfterCheckpoint(rnode);
 	}
 
 	/*
@@ -972,13 +878,15 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 }
 
 /*
- *	mdimmedsync() -- Immediately sync a relation to stable storage.
+ *	mdimmedsyncrel() -- Immediately sync a relation to stable storage.
  *
  * Note that only writes already issued are synced; this routine knows
  * nothing of dirty buffers that may exist inside the buffer manager.
+ *
+ * See smgrimmedsyncrel comment for contract.
  */
-void
-mdimmedsync(SMgrRelation reln, ForkNumber forknum)
+bool
+mdimmedsyncrel(SMgrRelation reln, ForkNumber forknum)
 {
 	int			segno;
 
@@ -992,407 +900,57 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 
 	while (segno > 0)
 	{
-		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
+		MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
 
 		if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
-			ereport(data_sync_elevel(ERROR),
+			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
 							FilePathName(v->mdfd_vfd))));
 		segno--;
 	}
-}
-
-/*
- *	mdsync() -- Sync previous writes to stable storage.
- */
-void
-mdsync(void)
-{
-	static bool mdsync_in_progress = false;
-
-	HASH_SEQ_STATUS hstat;
-	PendingOperationEntry *entry;
-	int			absorb_counter;
-
-	/* Statistics on sync times */
-	int			processed = 0;
-	instr_time	sync_start,
-				sync_end,
-				sync_diff;
-	uint64		elapsed;
-	uint64		longest = 0;
-	uint64		total_elapsed = 0;
-
-	/*
-	 * This is only called during checkpoints, and checkpoints should only
-	 * occur in processes that have created a pendingOpsTable.
-	 */
-	if (!pendingOpsTable)
-		elog(ERROR, "cannot sync without a pendingOpsTable");
-
-	/*
-	 * If we are in the checkpointer, the sync had better include all fsync
-	 * requests that were queued by backends up to this point.  The tightest
-	 * race condition that could occur is that a buffer that must be written
-	 * and fsync'd for the checkpoint could have been dumped by a backend just
-	 * before it was visited by BufferSync().  We know the backend will have
-	 * queued an fsync request before clearing the buffer's dirtybit, so we
-	 * are safe as long as we do an Absorb after completing BufferSync().
-	 */
-	AbsorbFsyncRequests();
-
-	/*
-	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
-	 * checkpoint), we want to ignore fsync requests that are entered into the
-	 * hashtable after this point --- they should be processed next time,
-	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
-	 * ones: new ones will have cycle_ctr equal to the incremented value of
-	 * mdsync_cycle_ctr.
-	 *
-	 * In normal circumstances, all entries present in the table at this point
-	 * will have cycle_ctr exactly equal to the current (about to be old)
-	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
-	 * fsync'ing loop, then older values of cycle_ctr might remain when we
-	 * come back here to try again.  Repeated checkpoint failures would
-	 * eventually wrap the counter around to the point where an old entry
-	 * might appear new, causing us to skip it, possibly allowing a checkpoint
-	 * to succeed that should not have.  To forestall wraparound, any time the
-	 * previous mdsync() failed to complete, run through the table and
-	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
-	 *
-	 * Think not to merge this loop with the main loop, as the problem is
-	 * exactly that that loop may fail before having visited all the entries.
-	 * From a performance point of view it doesn't matter anyway, as this path
-	 * will never be taken in a system that's functioning normally.
-	 */
-	if (mdsync_in_progress)
-	{
-		/* prior try failed, so update any stale cycle_ctr values */
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
-		}
-	}
-
-	/* Advance counter so that new hashtable entries are distinguishable */
-	mdsync_cycle_ctr++;
-
-	/* Set flag to detect failure if we don't reach the end of the loop */
-	mdsync_in_progress = true;
 
-	/* Now scan the hashtable for fsync requests to process */
-	absorb_counter = FSYNCS_PER_ABSORB;
-	hash_seq_init(&hstat, pendingOpsTable);
-	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-	{
-		ForkNumber	forknum;
-
-		/*
-		 * If the entry is new then don't process it this time; it might
-		 * contain multiple fsync-request bits, but they are all new.  Note
-		 * "continue" bypasses the hash-remove call at the bottom of the loop.
-		 */
-		if (entry->cycle_ctr == mdsync_cycle_ctr)
-			continue;
-
-		/* Else assert we haven't missed it */
-		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
-
-		/*
-		 * Scan over the forks and segments represented by the entry.
-		 *
-		 * The bitmap manipulations are slightly tricky, because we can call
-		 * AbsorbFsyncRequests() inside the loop and that could result in
-		 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
-		 * So we detach it, but if we fail we'll merge it with any new
-		 * requests that have arrived in the meantime.
-		 */
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			Bitmapset  *requests = entry->requests[forknum];
-			int			segno;
-
-			entry->requests[forknum] = NULL;
-			entry->canceled[forknum] = false;
-
-			segno = -1;
-			while ((segno = bms_next_member(requests, segno)) >= 0)
-			{
-				int			failures;
-
-				/*
-				 * If fsync is off then we don't have to bother opening the
-				 * file at all.  (We delay checking until this point so that
-				 * changing fsync on the fly behaves sensibly.)
-				 */
-				if (!enableFsync)
-					continue;
-
-				/*
-				 * If in checkpointer, we want to absorb pending requests
-				 * every so often to prevent overflow of the fsync request
-				 * queue.  It is unspecified whether newly-added entries will
-				 * be visited by hash_seq_search, but we don't care since we
-				 * don't need to process them anyway.
-				 */
-				if (--absorb_counter <= 0)
-				{
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB;
-				}
-
-				/*
-				 * The fsync table could contain requests to fsync segments
-				 * that have been deleted (unlinked) by the time we get to
-				 * them. Rather than just hoping an ENOENT (or EACCES on
-				 * Windows) error can be ignored, what we do on error is
-				 * absorb pending requests and then retry.  Since mdunlink()
-				 * queues a "cancel" message before actually unlinking, the
-				 * fsync request is guaranteed to be marked canceled after the
-				 * absorb if it really was this case. DROP DATABASE likewise
-				 * has to tell us to forget fsync requests before it starts
-				 * deletions.
-				 */
-				for (failures = 0;; failures++) /* loop exits at "break" */
-				{
-					SMgrRelation reln;
-					MdfdVec    *seg;
-					char	   *path;
-					int			save_errno;
-
-					/*
-					 * Find or create an smgr hash entry for this relation.
-					 * This may seem a bit unclean -- md calling smgr?	But
-					 * it's really the best solution.  It ensures that the
-					 * open file reference isn't permanently leaked if we get
-					 * an error here. (You may say "but an unreferenced
-					 * SMgrRelation is still a leak!" Not really, because the
-					 * only case in which a checkpoint is done by a process
-					 * that isn't about to shut down is in the checkpointer,
-					 * and it will periodically do smgrcloseall(). This fact
-					 * justifies our not closing the reln in the success path
-					 * either, which is a good thing since in non-checkpointer
-					 * cases we couldn't safely do that.)
-					 */
-					reln = smgropen(entry->rnode, InvalidBackendId);
-
-					/* Attempt to open and fsync the target segment */
-					seg = _mdfd_getseg(reln, forknum,
-									   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
-									   false,
-									   EXTENSION_RETURN_NULL
-									   | EXTENSION_DONT_CHECK_SIZE);
-
-					INSTR_TIME_SET_CURRENT(sync_start);
-
-					if (seg != NULL &&
-						FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
-					{
-						/* Success; update statistics about sync timing */
-						INSTR_TIME_SET_CURRENT(sync_end);
-						sync_diff = sync_end;
-						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
-						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
-						if (elapsed > longest)
-							longest = elapsed;
-						total_elapsed += elapsed;
-						processed++;
-						requests = bms_del_member(requests, segno);
-						if (log_checkpoints)
-							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
-								 processed,
-								 FilePathName(seg->mdfd_vfd),
-								 (double) elapsed / 1000);
-
-						break;	/* out of retry loop */
-					}
-
-					/* Compute file name for use in message */
-					save_errno = errno;
-					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
-					errno = save_errno;
-
-					/*
-					 * It is possible that the relation has been dropped or
-					 * truncated since the fsync request was entered.
-					 * Therefore, allow ENOENT, but only if we didn't fail
-					 * already on this file.  This applies both for
-					 * _mdfd_getseg() and for FileSync, since fd.c might have
-					 * closed the file behind our back.
-					 *
-					 * XXX is there any point in allowing more than one retry?
-					 * Don't see one at the moment, but easy to change the
-					 * test here if so.
-					 */
-					if (!FILE_POSSIBLY_DELETED(errno) ||
-						failures > 0)
-					{
-						Bitmapset  *new_requests;
-
-						/*
-						 * We need to merge these unsatisfied requests with
-						 * any others that have arrived since we started.
-						 */
-						new_requests = entry->requests[forknum];
-						entry->requests[forknum] =
-							bms_join(new_requests, requests);
-
-						errno = save_errno;
-						ereport(data_sync_elevel(ERROR),
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\": %m",
-										path)));
-					}
-					else
-						ereport(DEBUG1,
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\" but retrying: %m",
-										path)));
-					pfree(path);
-
-					/*
-					 * Absorb incoming requests and check to see if a cancel
-					 * arrived for this relation fork.
-					 */
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
-
-					if (entry->canceled[forknum])
-						break;
-				}				/* end retry loop */
-			}
-			bms_free(requests);
-		}
-
-		/*
-		 * We've finished everything that was requested before we started to
-		 * scan the entry.  If no new requests have been inserted meanwhile,
-		 * remove the entry.  Otherwise, update its cycle counter, as all the
-		 * requests now in it must have arrived during this cycle.
-		 */
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			if (entry->requests[forknum] != NULL)
-				break;
-		}
-		if (forknum <= MAX_FORKNUM)
-			entry->cycle_ctr = mdsync_cycle_ctr;
-		else
-		{
-			/* Okay to remove it */
-			if (hash_search(pendingOpsTable, &entry->rnode,
-							HASH_REMOVE, NULL) == NULL)
-				elog(ERROR, "pendingOpsTable corrupted");
-		}
-	}							/* end loop over hashtable entries */
-
-	/* Return sync performance metrics for report at checkpoint end */
-	CheckpointStats.ckpt_sync_rels = processed;
-	CheckpointStats.ckpt_longest_sync = longest;
-	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
-
-	/* Flag successful completion of mdsync */
-	mdsync_in_progress = false;
+	return true;
 }
 
 /*
- * mdpreckpt() -- Do pre-checkpoint work
- *
- * To distinguish unlink requests that arrived before this checkpoint
- * started from those that arrived during the checkpoint, we use a cycle
- * counter similar to the one we use for fsync requests. That cycle
- * counter is incremented here.
+ *	mdimmedsyncseg() -- Immediately sync a relation segment to stable storage.
  *
- * This must be called *before* the checkpoint REDO point is determined.
- * That ensures that we won't delete files too soon.
- *
- * Note that we can't do anything here that depends on the assumption
- * that the checkpoint will be completed.
- */
-void
-mdpreckpt(void)
-{
-	/*
-	 * Any unlink requests arriving after this point will be assigned the next
-	 * cycle counter, and won't be unlinked until next checkpoint.
-	 */
-	mdckpt_cycle_ctr++;
-}
-
-/*
- * mdpostckpt() -- Do post-checkpoint work
+ * Note that only writes already issued are synced; this routine knows
+ * nothing of dirty buffers that may exist inside the buffer manager.
  *
- * Remove any lingering files that can now be safely removed.
+ * See smgrimmedsyncseg comment for contract.
  */
-void
-mdpostckpt(void)
+bool
+mdimmedsyncseg(SMgrRelation reln, ForkNumber forknum, SegmentNumber segno)
 {
-	int			absorb_counter;
+	MdfdVec	   *v;
 
-	absorb_counter = UNLINKS_PER_ABSORB;
-	while (pendingUnlinks != NIL)
+	if (segno != InvalidSegmentNumber)
 	{
-		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
-		char	   *path;
-
 		/*
-		 * New entries are appended to the end, so if the entry is new we've
-		 * reached the end of old entries.
-		 *
-		 * Note: if just the right number of consecutive checkpoints fail, we
-		 * could be fooled here by cycle_ctr wraparound.  However, the only
-		 * consequence is that we'd delay unlinking for one more checkpoint,
-		 * which is perfectly tolerable.
+		 * Get the specified segment, or report failure if it doesn't seem to
+		 * exist.
 		 */
-		if (entry->cycle_ctr == mdckpt_cycle_ctr)
-			break;
-
-		/* Unlink the file */
-		path = relpathperm(entry->rnode, MAIN_FORKNUM);
-		if (unlink(path) < 0)
-		{
-			/*
-			 * There's a race condition, when the database is dropped at the
-			 * same time that we process the pending unlink requests. If the
-			 * DROP DATABASE deletes the file before we do, we will get ENOENT
-			 * here. rmtree() also has to ignore ENOENT errors, to deal with
-			 * the possibility that we delete the file first.
-			 */
-			if (errno != ENOENT)
-				ereport(WARNING,
-						(errcode_for_file_access(),
-						 errmsg("could not remove file \"%s\": %m", path)));
-		}
-		pfree(path);
+		v = _mdfd_openseg(reln, forknum, segno * RELSEG_SIZE,
+								 EXTENSION_RETURN_NULL);
+		if (v == NULL)
+			return false;
 
-		/* And remove the list entry */
-		pendingUnlinks = list_delete_first(pendingUnlinks);
-		pfree(entry);
+		if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
+			ereport(data_sync_elevel(ERROR),
+					(errcode_for_file_access(),
+					 errmsg("could not fsync file \"%s\": %m",
+							FilePathName(v->mdfd_vfd))));
 
-		/*
-		 * As in mdsync, we don't want to stop absorbing fsync requests for a
-		 * long time when there are many deletions to be done.  We can safely
-		 * call AbsorbFsyncRequests() at this point in the loop (note it might
-		 * try to delete list entries).
-		 */
-		if (--absorb_counter <= 0)
-		{
-			AbsorbFsyncRequests();
-			absorb_counter = UNLINKS_PER_ABSORB;
-		}
+		return true;
 	}
+
+	return false;
 }
 
 /*
  * register_dirty_segment() -- Mark a relation segment as needing fsync
- *
- * If there is a local pending-ops table, just make an entry in it for
- * mdsync to process later.  Otherwise, try to pass off the fsync request
- * to the checkpointer process.  If that fails, just do the fsync
- * locally before returning (we hope this will not happen often enough
- * to be a performance problem).
  */
 static void
 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
@@ -1400,16 +958,8 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 	/* Temp relations should never be fsync'd */
 	Assert(!SmgrIsTemp(reln));
 
-	if (pendingOpsTable)
+	if (!FsyncAtCheckpoint(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
 	{
-		/* push it into local pending-ops table */
-		RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
-	}
-	else
-	{
-		if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
-			return;				/* passed it off successfully */
-
 		ereport(DEBUG1,
 				(errmsg("could not forward fsync request because request queue is full")));
 
@@ -1421,258 +971,6 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 	}
 }
 
-/*
- * register_unlink() -- Schedule a file to be deleted after next checkpoint
- *
- * We don't bother passing in the fork number, because this is only used
- * with main forks.
- *
- * As with register_dirty_segment, this could involve either a local or
- * a remote pending-ops table.
- */
-static void
-register_unlink(RelFileNodeBackend rnode)
-{
-	/* Should never be used with temp relations */
-	Assert(!RelFileNodeBackendIsTemp(rnode));
-
-	if (pendingOpsTable)
-	{
-		/* push it into local pending-ops table */
-		RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
-							 UNLINK_RELATION_REQUEST);
-	}
-	else
-	{
-		/*
-		 * Notify the checkpointer about it.  If we fail to queue the request
-		 * message, we have to sleep and try again, because we can't simply
-		 * delete the file now.  Ugly, but hopefully won't happen often.
-		 *
-		 * XXX should we just leave the file orphaned instead?
-		 */
-		Assert(IsUnderPostmaster);
-		while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
-									UNLINK_RELATION_REQUEST))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-	}
-}
-
-/*
- * RememberFsyncRequest() -- callback from checkpointer side of fsync request
- *
- * We stuff fsync requests into the local hash table for execution
- * during the checkpointer's next checkpoint.  UNLINK requests go into a
- * separate linked list, however, because they get processed separately.
- *
- * The range of possible segment numbers is way less than the range of
- * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define three:
- * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
- *	 either for one fork, or all forks if forknum is InvalidForkNumber
- * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
- * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
- *	 checkpoint.
- * Note also that we're assuming real segment numbers don't exceed INT_MAX.
- *
- * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
- * table has to be searched linearly, but dropping a database is a pretty
- * heavyweight operation anyhow, so we'll live with it.)
- */
-void
-RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
-{
-	Assert(pendingOpsTable);
-
-	if (segno == FORGET_RELATION_FSYNC)
-	{
-		/* Remove any pending requests for the relation (one or all forks) */
-		PendingOperationEntry *entry;
-
-		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &rnode,
-													  HASH_FIND,
-													  NULL);
-		if (entry)
-		{
-			/*
-			 * We can't just delete the entry since mdsync could have an
-			 * active hashtable scan.  Instead we delete the bitmapsets; this
-			 * is safe because of the way mdsync is coded.  We also set the
-			 * "canceled" flags so that mdsync can tell that a cancel arrived
-			 * for the fork(s).
-			 */
-			if (forknum == InvalidForkNumber)
-			{
-				/* remove requests for all forks */
-				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-				{
-					bms_free(entry->requests[forknum]);
-					entry->requests[forknum] = NULL;
-					entry->canceled[forknum] = true;
-				}
-			}
-			else
-			{
-				/* remove requests for single fork */
-				bms_free(entry->requests[forknum]);
-				entry->requests[forknum] = NULL;
-				entry->canceled[forknum] = true;
-			}
-		}
-	}
-	else if (segno == FORGET_DATABASE_FSYNC)
-	{
-		/* Remove any pending requests for the entire database */
-		HASH_SEQ_STATUS hstat;
-		PendingOperationEntry *entry;
-		ListCell   *cell,
-				   *prev,
-				   *next;
-
-		/* Remove fsync requests */
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			if (entry->rnode.dbNode == rnode.dbNode)
-			{
-				/* remove requests for all forks */
-				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-				{
-					bms_free(entry->requests[forknum]);
-					entry->requests[forknum] = NULL;
-					entry->canceled[forknum] = true;
-				}
-			}
-		}
-
-		/* Remove unlink requests */
-		prev = NULL;
-		for (cell = list_head(pendingUnlinks); cell; cell = next)
-		{
-			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
-			next = lnext(cell);
-			if (entry->rnode.dbNode == rnode.dbNode)
-			{
-				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
-				pfree(entry);
-			}
-			else
-				prev = cell;
-		}
-	}
-	else if (segno == UNLINK_RELATION_REQUEST)
-	{
-		/* Unlink request: put it in the linked list */
-		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
-		PendingUnlinkEntry *entry;
-
-		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
-		Assert(forknum == MAIN_FORKNUM);
-
-		entry = palloc(sizeof(PendingUnlinkEntry));
-		entry->rnode = rnode;
-		entry->cycle_ctr = mdckpt_cycle_ctr;
-
-		pendingUnlinks = lappend(pendingUnlinks, entry);
-
-		MemoryContextSwitchTo(oldcxt);
-	}
-	else
-	{
-		/* Normal case: enter a request to fsync this segment */
-		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
-		PendingOperationEntry *entry;
-		bool		found;
-
-		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &rnode,
-													  HASH_ENTER,
-													  &found);
-		/* if new entry, initialize it */
-		if (!found)
-		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
-			MemSet(entry->requests, 0, sizeof(entry->requests));
-			MemSet(entry->canceled, 0, sizeof(entry->canceled));
-		}
-
-		/*
-		 * NB: it's intentional that we don't change cycle_ctr if the entry
-		 * already exists.  The cycle_ctr must represent the oldest fsync
-		 * request that could be in the entry.
-		 */
-
-		entry->requests[forknum] = bms_add_member(entry->requests[forknum],
-												  (int) segno);
-
-		MemoryContextSwitchTo(oldcxt);
-	}
-}
-
-/*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
- *
- * forknum == InvalidForkNumber means all forks, although this code doesn't
- * actually know that, since it's just forwarding the request elsewhere.
- */
-void
-ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
-{
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/*
-		 * Notify the checkpointer about it.  If we fail to queue the cancel
-		 * message, we have to sleep and try again ... ugly, but hopefully
-		 * won't happen often.
-		 *
-		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
-		 * error would leave the no-longer-used file still present on disk,
-		 * which would be bad, so I'm inclined to assume that the checkpointer
-		 * will always empty the queue soon.
-		 */
-		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-
-		/*
-		 * Note we don't wait for the checkpointer to actually absorb the
-		 * cancel message; see mdsync() for the implications.
-		 */
-	}
-}
-
-/*
- * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
- */
-void
-ForgetDatabaseFsyncRequests(Oid dbid)
-{
-	RelFileNode rnode;
-
-	rnode.dbNode = dbid;
-	rnode.spcNode = 0;
-	rnode.relNode = 0;
-
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/* see notes in ForgetRelationFsyncRequests */
-		while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
-									FORGET_DATABASE_FSYNC))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-	}
-}
-
 /*
  * DropRelationFiles -- drop files of all given relations
  */
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 0c0bba4ab3..de93d92bb7 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -21,6 +21,7 @@
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/smgr.h"
+#include "storage/smgrsync.h"
 #include "utils/hsearch.h"
 #include "utils/inval.h"
 
@@ -58,10 +59,9 @@ typedef struct f_smgr
 	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber nblocks);
-	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
-	void		(*smgr_pre_ckpt) (void);	/* may be NULL */
-	void		(*smgr_sync) (void);	/* may be NULL */
-	void		(*smgr_post_ckpt) (void);	/* may be NULL */
+	bool		(*smgr_immedsyncrel) (SMgrRelation reln, ForkNumber forknum);
+	bool		(*smgr_immedsyncseg) (SMgrRelation reln, ForkNumber forknum,
+								   SegmentNumber segno);
 } f_smgr;
 
 
@@ -81,10 +81,8 @@ static const f_smgr smgrsw[] = {
 		.smgr_writeback = mdwriteback,
 		.smgr_nblocks = mdnblocks,
 		.smgr_truncate = mdtruncate,
-		.smgr_immedsync = mdimmedsync,
-		.smgr_pre_ckpt = mdpreckpt,
-		.smgr_sync = mdsync,
-		.smgr_post_ckpt = mdpostckpt
+		.smgr_immedsyncrel = mdimmedsyncrel,
+		.smgr_immedsyncseg = mdimmedsyncseg
 	}
 };
 
@@ -104,6 +102,14 @@ static void smgrshutdown(int code, Datum arg);
 static void add_to_unowned_list(SMgrRelation reln);
 static void remove_from_unowned_list(SMgrRelation reln);
 
+/*
+ * For now there is only one implementation.
+ */
+static inline int
+which_for_relfilenode(RelFileNode rnode)
+{
+	return 0;	/* we only have md.c at present */
+}
 
 /*
  *	smgrinit(), smgrshutdown() -- Initialize or shut down storage
@@ -118,6 +124,8 @@ smgrinit(void)
 {
 	int			i;
 
+	smgrsync_init();
+
 	for (i = 0; i < NSmgr; i++)
 	{
 		if (smgrsw[i].smgr_init)
@@ -185,7 +193,7 @@ smgropen(RelFileNode rnode, BackendId backend)
 		reln->smgr_targblock = InvalidBlockNumber;
 		reln->smgr_fsm_nblocks = InvalidBlockNumber;
 		reln->smgr_vm_nblocks = InvalidBlockNumber;
-		reln->smgr_which = 0;	/* we only have md.c at present */
+		reln->smgr_which = which_for_relfilenode(rnode);
 
 		/* mark it not open */
 		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
@@ -723,20 +731,23 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 }
 
 /*
- *	smgrimmedsync() -- Force the specified relation to stable storage.
+ *	smgrimmedsyncrel() -- Force the specified relation to stable storage.
  *
  *		Synchronously force all previous writes to the specified relation
- *		down to disk.
- *
- *		This is useful for building completely new relations (eg, new
- *		indexes).  Instead of incrementally WAL-logging the index build
- *		steps, we can just write completed index pages to disk with smgrwrite
- *		or smgrextend, and then fsync the completed index file before
- *		committing the transaction.  (This is sufficient for purposes of
- *		crash recovery, since it effectively duplicates forcing a checkpoint
- *		for the completed index.  But it is *not* sufficient if one wishes
- *		to use the WAL log for PITR or replication purposes: in that case
- *		we have to make WAL entries as well.)
+ *		down to disk.  If segnum is >= 0, only applies to data in
+ *		one segment file.
+ *
+ *		Used for checkpointing dirty files.
+ *
+ *		This can also be used for building completely new relations (eg, new
+ *		indexes).  Instead of incrementally WAL-logging the index build steps,
+ *		we can just write completed index pages to disk with smgrwrite or
+ *		smgrextend, and then fsync the completed index file before committing
+ *		the transaction.  (This is sufficient for purposes of crash recovery,
+ *		since it effectively duplicates forcing a checkpoint for the completed
+ *		index.  But it is *not* sufficient if one wishes to use the WAL log
+ *		for PITR or replication purposes: in that case we have to make WAL
+ *		entries as well.)
  *
  *		The preceding writes should specify skipFsync = true to avoid
  *		duplicative fsyncs.
@@ -744,57 +755,33 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
  *		Note that you need to do FlushRelationBuffers() first if there is
  *		any possibility that there are dirty buffers for the relation;
  *		otherwise the sync is not very meaningful.
+ *
+ *		Fail to fsync raises an error, but non-existence of a requested
+ *		segment is reported with a false return value.
  */
-void
-smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
-{
-	smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
-}
-
-
-/*
- *	smgrpreckpt() -- Prepare for checkpoint.
- */
-void
-smgrpreckpt(void)
-{
-	int			i;
-
-	for (i = 0; i < NSmgr; i++)
-	{
-		if (smgrsw[i].smgr_pre_ckpt)
-			smgrsw[i].smgr_pre_ckpt();
-	}
-}
-
-/*
- *	smgrsync() -- Sync files to disk during checkpoint.
- */
-void
-smgrsync(void)
+bool
+smgrimmedsyncrel(SMgrRelation reln, ForkNumber forknum)
 {
-	int			i;
-
-	for (i = 0; i < NSmgr; i++)
-	{
-		if (smgrsw[i].smgr_sync)
-			smgrsw[i].smgr_sync();
-	}
+	return smgrsw[reln->smgr_which].smgr_immedsyncrel(reln, forknum);
 }
 
 /*
- *	smgrpostckpt() -- Post-checkpoint cleanup.
+ *	smgrimmedsyncseg() -- Force the specified relation segment to stable storage.
+ *
+ *		Synchronously force all previous writes to the specified relation
+ *		segment down to disk.
+ *
+ *		The preceding writes should specify skipFsync = true to avoid
+ *		duplicative fsyncs.
+ *
+ *		Fail to fsync raises an error, but non-existence of a requested
+ *		segment is reported with a false return value.
  */
-void
-smgrpostckpt(void)
+bool
+smgrimmedsyncseg(SMgrRelation reln, ForkNumber forknum, SegmentNumber segno)
 {
-	int			i;
-
-	for (i = 0; i < NSmgr; i++)
-	{
-		if (smgrsw[i].smgr_post_ckpt)
-			smgrsw[i].smgr_post_ckpt();
-	}
+	Assert(segno != InvalidSegmentNumber);
+	return smgrsw[reln->smgr_which].smgr_immedsyncseg(reln, forknum, segno);
 }
 
 /*
diff --git a/src/backend/storage/smgr/smgrsync.c b/src/backend/storage/smgr/smgrsync.c
new file mode 100644
index 0000000000..6ac901eb78
--- /dev/null
+++ b/src/backend/storage/smgr/smgrsync.c
@@ -0,0 +1,855 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgrsync.c
+ *	  management of file synchronization.
+ *
+ * This modules tracks which files need to be fsynced or unlinked at the
+ * next checkpoint, and performs those actions.  Normally the work is done
+ * when called by the checkpointer, but it is also done in standalone mode
+ * and startup.
+ *
+ * Originally this logic was inside md.c, but it is now made more general,
+ * for reuse by other SMGR implementations that work with files.
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/smgr/smgrsync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "nodes/pg_list.h"
+#include "pgstat.h"
+#include "portability/instr_time.h"
+#include "postmaster/bgwriter.h"
+#include "storage/relfilenode.h"
+#include "storage/smgrsync.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+static MemoryContext pendingOpsCxt; /* context for the pending ops state  */
+
+#define SV_PREFIX segnum_vector
+#define SV_DECLARE
+#define SV_DEFINE
+#define SV_ELEMENT_TYPE BlockNumber
+#define SV_SCOPE static inline
+#define SV_GLOBAL_MEMORY_CONTEXT pendingOpsCxt
+#include "lib/simplevector.h"
+
+#define SA_PREFIX segnum_array
+#define SA_COMPARE(a,b) (*a < *b ? -1 : *a == *b ? 0 : 1)
+#define SA_DECLARE
+#define SA_DEFINE
+#define SA_ELEMENT_TYPE SV_ELEMENT_TYPE
+#define SA_SCOPE static inline
+#include "lib/sort_utils.h"
+
+/*
+ * In some contexts (currently, standalone backends and the checkpointer)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint.  A hash
+ * table remembers the pending operations.  We use a hash table mostly as
+ * a convenient way of merging duplicate requests.
+ *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the checkpointer.)
+ */
+
+typedef uint32 CycleCtr;		/* can be any convenient integer size */
+
+/*
+ * Values for the "type" member of CheckpointerRequest.
+ *
+ * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
+#define FSYNC_SEGMENT_REQUEST	1
+#define FORGET_SEGMENT_FSYNC	2
+#define FORGET_RELATION_FSYNC	3
+#define FORGET_DATABASE_FSYNC	4
+#define UNLINK_RELATION_REQUEST 5
+#define UNLINK_SEGMENT_REQUEST	6
+
+/* intervals for calling AbsorbFsyncRequests in smgrsync and smgrpostckpt */
+#define FSYNCS_PER_ABSORB		10
+#define UNLINKS_PER_ABSORB		10
+
+/*
+ * An entry in the hash table of files that need to be flushed for the next
+ * checkpoint.
+ */
+typedef struct PendingFsyncEntry
+{
+	RelFileNode	rnode;
+	segnum_vector requests[MAX_FORKNUM + 1];
+	segnum_vector requests_in_progress[MAX_FORKNUM + 1];
+	CycleCtr	cycle_ctr;
+} PendingFsyncEntry;
+
+typedef struct PendingUnlinkEntry
+{
+	RelFileNode rnode;			/* the dead relation to delete */
+	CycleCtr	cycle_ctr;		/* ckpt_cycle_ctr when request was made */
+} PendingUnlinkEntry;
+
+static bool sync_in_progress = false;
+static CycleCtr sync_cycle_ctr = 0;
+static CycleCtr ckpt_cycle_ctr = 0;
+
+static HTAB *pendingFsyncTable = NULL;
+static List *pendingUnlinks = NIL;
+
+/*
+ * Initialize the pending operations state, if necessary.
+ */
+void
+smgrsync_init(void)
+{
+	/*
+	 * Create pending-operations hashtable if we need it.  Currently, we need
+	 * it if we are standalone (not under a postmaster) or if we are a startup
+	 * or checkpointer auxiliary process.
+	 */
+	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
+	{
+		HASHCTL		hash_ctl;
+
+		/*
+		 * XXX: The checkpointer needs to add entries to the pending ops table
+		 * when absorbing fsync requests.  That is done within a critical
+		 * section, which isn't usually allowed, but we make an exception. It
+		 * means that there's a theoretical possibility that you run out of
+		 * memory while absorbing fsync requests, which leads to a PANIC.
+		 * Fortunately the hash table is small so that's unlikely to happen in
+		 * practice.
+		 */
+		pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
+											  "Pending ops context",
+											  ALLOCSET_DEFAULT_SIZES);
+		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
+		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+		hash_ctl.keysize = sizeof(RelFileNode);
+		hash_ctl.entrysize = sizeof(PendingFsyncEntry);
+		hash_ctl.hcxt = pendingOpsCxt;
+		pendingFsyncTable = hash_create("Pending Ops Table",
+									  100L,
+									  &hash_ctl,
+									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+		pendingUnlinks = NIL;
+	}
+}
+
+/*
+ * Do pre-checkpoint work.
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+smgrpreckpt(void)
+{
+	/*
+	 * Any unlink requests arriving after this point will be assigned the next
+	 * cycle counter, and won't be unlinked until next checkpoint.
+	 */
+	ckpt_cycle_ctr++;
+}
+
+/*
+ * Sync previous writes to stable storage.
+ */
+void
+smgrsync(void)
+{
+	HASH_SEQ_STATUS hstat;
+	PendingFsyncEntry *entry;
+	int			absorb_counter;
+
+	/* Statistics on sync times */
+	instr_time	sync_start,
+				sync_end,
+				sync_diff;
+	uint64		elapsed;
+	int			processed = CheckpointStats.ckpt_sync_rels;
+	uint64		longest = CheckpointStats.ckpt_longest_sync;
+	uint64		total_elapsed = CheckpointStats.ckpt_agg_sync_time;
+
+	/*
+	 * This is only called during checkpoints, and checkpoints should only
+	 * occur in processes that have created a pendingFsyncTable.
+	 */
+	if (!pendingFsyncTable)
+		elog(ERROR, "cannot sync without a pendingFsyncTable");
+
+	/*
+	 * If we are in the checkpointer, the sync had better include all fsync
+	 * requests that were queued by backends up to this point.  The tightest
+	 * race condition that could occur is that a buffer that must be written
+	 * and fsync'd for the checkpoint could have been dumped by a backend just
+	 * before it was visited by BufferSync().  We know the backend will have
+	 * queued an fsync request before clearing the buffer's dirtybit, so we
+	 * are safe as long as we do an Absorb after completing BufferSync().
+	 */
+	AbsorbFsyncRequests();
+
+	/*
+	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+	 * checkpoint), we want to ignore fsync requests that are entered into the
+	 * hashtable after this point --- they should be processed next time,
+	 * instead.  We use sync_cycle_ctr to tell old entries apart from new
+	 * ones: new ones will have cycle_ctr equal to the incremented value of
+	 * sync_cycle_ctr.
+	 *
+	 * In normal circumstances, all entries present in the table at this point
+	 * will have cycle_ctr exactly equal to the current (about to be old)
+	 * value of sync_cycle_ctr.  However, if we fail partway through the
+	 * fsync'ing loop, then older values of cycle_ctr might remain when we
+	 * come back here to try again.  Repeated checkpoint failures would
+	 * eventually wrap the counter around to the point where an old entry
+	 * might appear new, causing us to skip it, possibly allowing a checkpoint
+	 * to succeed that should not have.  To forestall wraparound, any time the
+	 * previous smgrsync() failed to complete, run through the table and
+	 * forcibly set cycle_ctr = sync_cycle_ctr.
+	 *
+	 * Think not to merge this loop with the main loop, as the problem is
+	 * exactly that that loop may fail before having visited all the entries.
+	 * From a performance point of view it doesn't matter anyway, as this path
+	 * will never be taken in a system that's functioning normally.
+	 */
+	if (sync_in_progress)
+	{
+		/* prior try failed, so update any stale cycle_ctr values */
+		hash_seq_init(&hstat, pendingFsyncTable);
+		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			ForkNumber		forknum;
+
+			entry->cycle_ctr = sync_cycle_ctr;
+
+			/*
+			 * If any requests remain unprocessed, they need to be merged with
+			 * the segment numbers that have arrived since.
+			 */
+			for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+			{
+				segnum_vector *requests = &entry->requests[forknum];
+				segnum_vector *requests_in_progress =
+					&entry->requests_in_progress[forknum];
+
+				if (!segnum_vector_empty(requests_in_progress))
+				{
+					/* Append the unfinished requests that were not yet handled. */
+					segnum_vector_append_n(requests,
+										   segnum_vector_data(requests_in_progress),
+										   segnum_vector_size(requests_in_progress));
+					segnum_vector_reset(requests_in_progress);
+
+					/* Sort and make unique. */
+					segnum_array_sort(segnum_vector_begin(requests),
+									  segnum_vector_end(requests));
+					segnum_vector_resize(requests,
+									 segnum_array_unique(segnum_vector_begin(requests),
+														 segnum_vector_end(requests)) -
+										 segnum_vector_begin(requests));
+				}
+			}
+		}
+	}
+
+	/* Advance counter so that new hashtable entries are distinguishable */
+	sync_cycle_ctr++;
+
+	/* Set flag to detect failure if we don't reach the end of the loop */
+	sync_in_progress = true;
+
+	/* Now scan the hashtable for fsync requests to process */
+	absorb_counter = FSYNCS_PER_ABSORB;
+	hash_seq_init(&hstat, pendingFsyncTable);
+	while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)))
+	{
+		ForkNumber forknum;
+		SMgrRelation reln;
+
+		/*
+		 * If the entry is new then don't process it this time; it might
+		 * contain multiple fsync requests, but they are all new.  Note
+		 * "continue" bypasses the hash-remove call at the bottom of the loop.
+		 */
+		if (entry->cycle_ctr == sync_cycle_ctr)
+			continue;
+
+		/* Else assert we haven't missed it */
+		Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
+
+		/*
+		 * Scan over the forks and segments represented by the entry.
+		 *
+		 * The vector manipulations are slightly tricky, because we can call
+		 * AbsorbFsyncRequests() inside the loop and that could result in new
+		 * segment numbers being added.  So we swap the contents of "requests"
+		 * with "requests_in_progress", and if we fail we'll merge it with any
+		 * new requests that have arrived in the meantime.
+		 */
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			segnum_vector *requests_in_progress =
+				&entry->requests_in_progress[forknum];
+
+			/*
+			 * Transfer the current set of segment numbers into the "in
+			 * progress" vector (which must be empty initially).
+			 */
+			Assert(segnum_vector_empty(requests_in_progress));
+			segnum_vector_swap(&entry->requests[forknum], requests_in_progress);
+
+			/*
+			 * If fsync is off then we don't have to bother opening the
+			 * files at all.  (We delay checking until this point so that
+			 * changing fsync on the fly behaves sensibly.)
+			 */
+			if (!enableFsync)
+				segnum_vector_clear(requests_in_progress);
+
+			/* Loop until all requests have been handled. */
+			while (!segnum_vector_empty(requests_in_progress))
+			{
+				SegmentNumber	segno = *segnum_vector_back(requests_in_progress);
+
+				INSTR_TIME_SET_CURRENT(sync_start);
+
+				reln = smgropen(entry->rnode, InvalidBackendId);
+				Assert(segno != InvalidSegmentNumber);
+				if (!smgrimmedsyncseg(reln, forknum, segno))
+				{
+					/*
+					 * The underlying file couldn't be found.  Check if a
+					 * later message in the queue reports that it has been
+					 * unlinked; if so it will be removed from the vector,
+					 * indicating that we can safely skip it.
+					 */
+					AbsorbFsyncRequests();
+					if (!segnum_array_binary_search(segnum_vector_begin(requests_in_progress),
+													segnum_vector_end(requests_in_progress),
+													&segno))
+						continue;
+
+					/* Otherwise it's an unexpectedly missing file. */
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not open backing file to fsync: %u/%u/%u",
+									entry->rnode.dbNode,
+									entry->rnode.relNode,
+									segno)));
+				}
+
+				/* Success; update statistics about sync timing */
+				INSTR_TIME_SET_CURRENT(sync_end);
+				sync_diff = sync_end;
+				INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+				elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+				if (elapsed > longest)
+					longest = elapsed;
+				total_elapsed += elapsed;
+				processed++;
+
+				/* Remove this segment number. */
+				Assert(segno == *segnum_vector_back(requests_in_progress));
+				segnum_vector_pop_back(requests_in_progress);
+
+				if (log_checkpoints)
+					ereport(DEBUG1,
+							(errmsg("checkpoint sync: number=%d db=%u rel=%u seg=%u time=%.3f msec",
+									processed,
+									entry->rnode.dbNode,
+									entry->rnode.relNode,
+									segno,
+									(double) elapsed / 1000),
+							 errhidestmt(true),
+							 errhidecontext(true)));
+
+				/*
+				 * If in checkpointer, we want to absorb pending requests
+				 * every so often to prevent overflow of the fsync request
+				 * queue.  It is unspecified whether newly-added entries will
+				 * be visited by hash_seq_search, but we don't care since we
+				 * don't need to process them anyway.
+				 */
+				if (--absorb_counter <= 0)
+				{
+					AbsorbFsyncRequests();
+					absorb_counter = FSYNCS_PER_ABSORB;
+				}
+			}
+		}
+
+		/*
+		 * We've finished everything that was requested before we started to
+		 * scan the entry.  If no new requests have been inserted meanwhile,
+		 * remove the entry.  Otherwise, update its cycle counter, as all the
+		 * requests now in it must have arrived during this cycle.
+		 */
+		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+		{
+			Assert(segnum_vector_empty(&entry->requests_in_progress[forknum]));
+			if (!segnum_vector_empty(&entry->requests[forknum]))
+				break;
+			segnum_vector_reset(&entry->requests[forknum]);
+		}
+		if (forknum <= MAX_FORKNUM)
+			entry->cycle_ctr = sync_cycle_ctr;
+		else
+		{
+			/* Okay to remove it */
+			if (hash_search(pendingFsyncTable, &entry->rnode,
+							HASH_REMOVE, NULL) == NULL)
+				elog(ERROR, "pendingOpsTable corrupted");
+		}
+	}							/* end loop over hashtable entries */
+
+	/* Maintain sync performance metrics for report at checkpoint end */
+	CheckpointStats.ckpt_sync_rels = processed;
+	CheckpointStats.ckpt_longest_sync = longest;
+	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
+	/* Flag successful completion of smgrsync */
+	sync_in_progress = false;
+}
+
+/*
+ * Do post-checkpoint work.
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+void
+smgrpostckpt(void)
+{
+	int			absorb_counter;
+
+	absorb_counter = UNLINKS_PER_ABSORB;
+	while (pendingUnlinks != NIL)
+	{
+		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
+		char	   *path;
+
+		/*
+		 * New entries are appended to the end, so if the entry is new we've
+		 * reached the end of old entries.
+		 *
+		 * Note: if just the right number of consecutive checkpoints fail, we
+		 * could be fooled here by cycle_ctr wraparound.  However, the only
+		 * consequence is that we'd delay unlinking for one more checkpoint,
+		 * which is perfectly tolerable.
+		 */
+		if (entry->cycle_ctr == ckpt_cycle_ctr)
+			break;
+
+		/* Unlink the file */
+		path = relpathperm(entry->rnode, MAIN_FORKNUM);
+		if (unlink(path) < 0)
+		{
+			/*
+			 * There's a race condition, when the database is dropped at the
+			 * same time that we process the pending unlink requests. If the
+			 * DROP DATABASE deletes the file before we do, we will get ENOENT
+			 * here. rmtree() also has to ignore ENOENT errors, to deal with
+			 * the possibility that we delete the file first.
+			 */
+			if (errno != ENOENT)
+				ereport(WARNING,
+						(errcode_for_file_access(),
+						 errmsg("could not remove file \"%s\": %m", path)));
+		}
+		pfree(path);
+
+		/* And remove the list entry */
+		pendingUnlinks = list_delete_first(pendingUnlinks);
+		pfree(entry);
+
+		/*
+		 * As in smgrsync, we don't want to stop absorbing fsync requests for a
+		 * long time when there are many deletions to be done.  We can safely
+		 * call AbsorbFsyncRequests() at this point in the loop (note it might
+		 * try to delete list entries).
+		 */
+		if (--absorb_counter <= 0)
+		{
+			AbsorbFsyncRequests();
+			absorb_counter = UNLINKS_PER_ABSORB;
+		}
+	}
+}
+
+
+/*
+ * Mark a file as needing fsync.
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * smgrsync to process later.  Otherwise, try to pass off the fsync request to
+ * the checkpointer process.
+ *
+ * Returns true on success, but false if the queue was full and we couldn't
+ * pass the request to the the checkpointer, meaning that the caller must
+ * perform the fsync.
+ */
+bool
+FsyncAtCheckpoint(RelFileNode rnode, ForkNumber forknum, SegmentNumber segno)
+{
+	if (pendingFsyncTable)
+	{
+		RememberFsyncRequest(FSYNC_SEGMENT_REQUEST, rnode, forknum, segno);
+		return true;
+	}
+	else
+		return ForwardFsyncRequest(FSYNC_SEGMENT_REQUEST, rnode, forknum,
+								   segno);
+}
+
+/*
+ * Schedule a file to be deleted after next checkpoint.
+ *
+ * As with FsyncAtCheckpoint, this could involve either a local or a remote
+ * pending-ops table.
+ */
+void
+UnlinkAfterCheckpoint(RelFileNodeBackend rnode)
+{
+	/* Should never be used with temp relations */
+	Assert(!RelFileNodeBackendIsTemp(rnode));
+
+	if (pendingFsyncTable)
+	{
+		/* push it into local pending-ops table */
+		RememberFsyncRequest(UNLINK_RELATION_REQUEST,
+							 rnode.node,
+							 MAIN_FORKNUM,
+							 InvalidSegmentNumber);
+	}
+	else
+	{
+		/* Notify the checkpointer about it. */
+		Assert(IsUnderPostmaster);
+
+		ForwardFsyncRequest(UNLINK_RELATION_REQUEST,
+							rnode.node,
+							MAIN_FORKNUM,
+							InvalidSegmentNumber);
+	}
+}
+
+/*
+ * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
+ * already created the pendingFsyncTable during initialization of the startup
+ * process.  Calling this function drops the local pendingFsyncTable so that
+ * subsequent requests will be forwarded to checkpointer.
+ */
+void
+SetForwardFsyncRequests(void)
+{
+	/* Perform any pending fsyncs we may have queued up, then drop table */
+	if (pendingFsyncTable)
+	{
+		smgrsync();
+		hash_destroy(pendingFsyncTable);
+	}
+	pendingFsyncTable = NULL;
+
+	/*
+	 * We should not have any pending unlink requests, since mdunlink doesn't
+	 * queue unlink requests when isRedo.
+	 */
+	Assert(pendingUnlinks == NIL);
+}
+
+/*
+ * Find and remove a segment number by binary search.
+ */
+static inline void
+delete_segno(segnum_vector *vec, SegmentNumber segno)
+{
+	SegmentNumber *position =
+		segnum_array_lower_bound(segnum_vector_begin(vec),
+								 segnum_vector_end(vec),
+								 &segno);
+
+	if (position != segnum_vector_end(vec) &&
+		*position == segno)
+		segnum_vector_erase(vec, position);
+}
+
+/*
+ * Add a segment number by binary search.  Hopefully these tend to be added a
+ * the high end, which is cheap.
+ */
+static inline void
+insert_segno(segnum_vector *vec, SegmentNumber segno)
+{
+	segnum_vector_insert(vec,
+						 segnum_array_lower_bound(segnum_vector_begin(vec),
+												  segnum_vector_end(vec),
+												  &segno),
+						 &segno);
+}
+
+/*
+ * RememberFsyncRequest() -- callback from checkpointer side of fsync request
+ *
+ * We stuff fsync requests into the local hash table for execution
+ * during the checkpointer's next checkpoint.  UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
+ *
+ * Valid valid values for 'type':
+ * - FSYNC_SEGMENT_REQUEST means to schedule an fsync
+ * - FORGET_SEGMENT_FSYNC means to cancel pending fsyncs for one segment
+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
+ *	 either for one fork, or all forks if forknum is InvalidForkNumber
+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
+ *	 checkpoint.
+ * Note also that we're assuming real segment numbers don't exceed INT_MAX.
+ *
+ * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
+ * table has to be searched linearly, but dropping a database is a pretty
+ * heavyweight operation anyhow, so we'll live with it.)
+ */
+void
+RememberFsyncRequest(int type, RelFileNode rnode, ForkNumber forknum,
+					 SegmentNumber segno)
+{
+	Assert(pendingFsyncTable);
+
+	if (type == FORGET_SEGMENT_FSYNC || type == FORGET_RELATION_FSYNC)
+	{
+		PendingFsyncEntry *entry;
+
+		entry = hash_search(pendingFsyncTable, &rnode, HASH_FIND, NULL);
+		if (entry)
+		{
+			if (type == FORGET_SEGMENT_FSYNC)
+			{
+				delete_segno(&entry->requests[forknum], segno);
+				delete_segno(&entry->requests_in_progress[forknum], segno);
+			}
+			else if (forknum == InvalidForkNumber)
+			{
+				/* Remove requests for all forks. */
+				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+				{
+					segnum_vector_reset(&entry->requests[forknum]);
+					segnum_vector_reset(&entry->requests_in_progress[forknum]);
+				}
+			}
+			else
+			{
+				/* Forget about all segments for one fork. */
+				segnum_vector_reset(&entry->requests[forknum]);
+				segnum_vector_reset(&entry->requests_in_progress[forknum]);
+			}
+		}
+	}
+	else if (type == FORGET_DATABASE_FSYNC)
+	{
+		HASH_SEQ_STATUS hstat;
+		PendingFsyncEntry *entry;
+
+		/* Remove fsync requests */
+		hash_seq_init(&hstat, pendingFsyncTable);
+		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (rnode.dbNode == entry->rnode.dbNode)
+			{
+				/* Remove requests for all forks. */
+				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+				{
+					segnum_vector_reset(&entry->requests[forknum]);
+					segnum_vector_reset(&entry->requests_in_progress[forknum]);
+				}
+			}
+		}
+
+		/* Remove unlink requests */
+		if (segno == FORGET_DATABASE_FSYNC)
+		{
+			ListCell   *cell,
+					   *next,
+					   *prev;
+
+			prev = NULL;
+			for (cell = list_head(pendingUnlinks); cell; cell = next)
+			{
+				PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
+
+				next = lnext(cell);
+				if (rnode.dbNode == entry->rnode.dbNode)
+				{
+					pendingUnlinks = list_delete_cell(pendingUnlinks, cell,
+													  prev);
+					pfree(entry);
+				}
+				else
+					prev = cell;
+			}
+		}
+	}
+	else if (type == UNLINK_RELATION_REQUEST)
+	{
+		/* Unlink request: put it in the linked list */
+		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+		PendingUnlinkEntry *entry;
+
+		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
+		Assert(forknum == MAIN_FORKNUM);
+
+		entry = palloc(sizeof(PendingUnlinkEntry));
+		entry->rnode = rnode;
+		entry->cycle_ctr = ckpt_cycle_ctr;
+
+		pendingUnlinks = lappend(pendingUnlinks, entry);
+
+		MemoryContextSwitchTo(oldcxt);
+	}
+	else if (type == FSYNC_SEGMENT_REQUEST)
+	{
+		/* Normal case: enter a request to fsync this segment */
+		PendingFsyncEntry *entry;
+		bool		found;
+
+		entry = (PendingFsyncEntry *) hash_search(pendingFsyncTable,
+												  &rnode,
+												  HASH_ENTER,
+												  &found);
+		/* if new entry, initialize it */
+		if (!found)
+		{
+			ForkNumber	f;
+
+			entry->cycle_ctr = ckpt_cycle_ctr;
+			for (f = 0; f <= MAX_FORKNUM; f++)
+			{
+				segnum_vector_init(&entry->requests[f]);
+				segnum_vector_init(&entry->requests_in_progress[f]);
+			}
+		}
+
+		/*
+		 * NB: it's intentional that we don't change cycle_ctr if the entry
+		 * already exists.  The cycle_ctr must represent the oldest fsync
+		 * request that could be in the entry.
+		 */
+
+		insert_segno(&entry->requests[forknum], segno);
+	}
+}
+
+/*
+ * ForgetSegmentFsyncRequests -- forget any fsyncs for one segment of a
+ * relation fork
+ *
+ * forknum == InvalidForkNumber means all forks, although this code doesn't
+ * actually know that, since it's just forwarding the request elsewhere.
+ */
+void
+ForgetSegmentFsyncRequests(RelFileNode rnode, ForkNumber forknum,
+						   SegmentNumber segno)
+{
+	if (pendingFsyncTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(FORGET_SEGMENT_FSYNC, rnode, forknum, segno);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/* Notify the checkpointer about it. */
+		while (!ForwardFsyncRequest(FORGET_SEGMENT_FSYNC, rnode, forknum,
+									segno))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+
+		/*
+		 * Note we don't wait for the checkpointer to actually absorb the
+		 * cancel message; see smgrsync() for the implications.
+		 */
+	}
+}
+
+/*
+ * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
+ *
+ * forknum == InvalidForkNumber means all forks, although this code doesn't
+ * actually know that, since it's just forwarding the request elsewhere.
+ */
+void
+ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
+{
+	if (pendingFsyncTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(FORGET_RELATION_FSYNC, rnode, forknum,
+							 InvalidSegmentNumber);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/* Notify the checkpointer about it. */
+		while (!ForwardFsyncRequest(FORGET_RELATION_FSYNC, rnode, forknum,
+									InvalidSegmentNumber))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+
+		/*
+		 * Note we don't wait for the checkpointer to actually absorb the
+		 * cancel message; see smgrsync() for the implications.
+		 */
+	}
+}
+
+/*
+ * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
+ */
+void
+ForgetDatabaseFsyncRequests(Oid dbid)
+{
+	RelFileNode rnode;
+
+	rnode.dbNode = dbid;
+	rnode.spcNode = 0;
+	rnode.relNode = 0;
+
+	if (pendingFsyncTable)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(FORGET_DATABASE_FSYNC, rnode, 0,
+							 InvalidSegmentNumber);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/* see notes in ForgetRelationFsyncRequests */
+		while (!ForwardFsyncRequest(FORGET_DATABASE_FSYNC, rnode, 0,
+									InvalidSegmentNumber))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+	}
+}
diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c
index 6ec795f1b4..9ed06a32e6 100644
--- a/src/backend/tcop/utility.c
+++ b/src/backend/tcop/utility.c
@@ -59,7 +59,6 @@
 #include "commands/view.h"
 #include "miscadmin.h"
 #include "parser/parse_utilcmd.h"
-#include "postmaster/bgwriter.h"
 #include "rewrite/rewriteDefine.h"
 #include "rewrite/rewriteRemove.h"
 #include "storage/fd.h"
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 41d477165c..8869e730dc 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -61,6 +61,7 @@
 #include "postmaster/autovacuum.h"
 #include "postmaster/bgworker_internals.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/checkpointer.h"
 #include "postmaster/postmaster.h"
 #include "postmaster/syslogger.h"
 #include "postmaster/walwriter.h"
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 53b8f5fe3c..585ce52667 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -1,10 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * bgwriter.h
- *	  Exports from postmaster/bgwriter.c and postmaster/checkpointer.c.
- *
- * The bgwriter process used to handle checkpointing duties too.  Now
- * there is a separate process, but we did not bother to split this header.
+ *	  Exports from postmaster/bgwriter.c.
  *
  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
  *
@@ -15,29 +12,10 @@
 #ifndef _BGWRITER_H
 #define _BGWRITER_H
 
-#include "storage/block.h"
-#include "storage/relfilenode.h"
-
-
 /* GUC options */
 extern int	BgWriterDelay;
-extern int	CheckPointTimeout;
-extern int	CheckPointWarning;
-extern double CheckPointCompletionTarget;
 
 extern void BackgroundWriterMain(void) pg_attribute_noreturn();
-extern void CheckpointerMain(void) pg_attribute_noreturn();
-
-extern void RequestCheckpoint(int flags);
-extern void CheckpointWriteDelay(int flags, double progress);
-
-extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
-					BlockNumber segno);
-extern void AbsorbFsyncRequests(void);
-
-extern Size CheckpointerShmemSize(void);
-extern void CheckpointerShmemInit(void);
 
-extern bool FirstCallSinceLastCheckpoint(void);
 
 #endif							/* _BGWRITER_H */
diff --git a/src/include/postmaster/checkpointer.h b/src/include/postmaster/checkpointer.h
new file mode 100644
index 0000000000..28b13c2d9c
--- /dev/null
+++ b/src/include/postmaster/checkpointer.h
@@ -0,0 +1,39 @@
+/*-------------------------------------------------------------------------
+ *
+ * checkpointer.h
+ *	  Exports from postmaster/checkpointer.c.
+ *
+ * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
+ *
+ * src/include/postmaster/checkpointer.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHECKPOINTER_H
+#define CHECKPOINTER_H
+
+#include "common/relpath.h"
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+/* GUC options */
+extern int	CheckPointTimeout;
+extern int	CheckPointWarning;
+extern double CheckPointCompletionTarget;
+
+extern void CheckpointerMain(void) pg_attribute_noreturn();
+extern bool ForwardFsyncRequest(int type, RelFileNode rnode,
+								ForkNumber forknum, BlockNumber segno);
+extern void RequestCheckpoint(int flags);
+extern void CheckpointWriteDelay(int flags, double progress);
+
+extern void AbsorbFsyncRequests(void);
+extern void AbsorbAllFsyncRequests(void);
+
+extern Size CheckpointerShmemSize(void);
+extern void CheckpointerShmemInit(void);
+
+extern bool FirstCallSinceLastCheckpoint(void);
+extern void CountBackendWrite(void);
+
+#endif
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 820d08ed4e..8b7ff665c5 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -18,6 +18,15 @@
 #include "storage/block.h"
 #include "storage/relfilenode.h"
 
+/*
+ * The type used to identify segment numbers.  Generally, segments are an
+ * internal detail of individual storage manager implementations, but since
+ * they appear in various places to allow them to be passed between processes,
+ * it seemed worthwhile to have a typename.
+ */
+typedef uint32 SegmentNumber;
+
+#define InvalidSegmentNumber ((SegmentNumber) 0xFFFFFFFF)
 
 /*
  * smgr.c maintains a table of SMgrRelation objects, which are essentially
@@ -105,10 +114,10 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
 			 BlockNumber nblocks);
-extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void smgrpreckpt(void);
-extern void smgrsync(void);
-extern void smgrpostckpt(void);
+extern bool smgrimmedsyncrel(SMgrRelation reln, ForkNumber forknum);
+extern bool smgrimmedsyncseg(SMgrRelation reln, ForkNumber forknum,
+	SegmentNumber segno);
+
 extern void AtEOXact_SMgr(void);
 
 
@@ -133,16 +142,10 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 		   BlockNumber nblocks);
-extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void mdpreckpt(void);
-extern void mdsync(void);
-extern void mdpostckpt(void);
-
-extern void SetForwardFsyncRequests(void);
-extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
-					 BlockNumber segno);
-extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
-extern void ForgetDatabaseFsyncRequests(Oid dbid);
+extern bool mdimmedsyncrel(SMgrRelation reln, ForkNumber forknum);
+extern bool mdimmedsyncseg(SMgrRelation reln, ForkNumber forknum,
+	SegmentNumber segno);
+
 extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
 
 #endif							/* SMGR_H */
diff --git a/src/include/storage/smgrsync.h b/src/include/storage/smgrsync.h
new file mode 100644
index 0000000000..212a0f8443
--- /dev/null
+++ b/src/include/storage/smgrsync.h
@@ -0,0 +1,35 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgrsync.h
+ *	  management of file synchronization
+ *
+ *
+ * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/storage/smgrpending.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef SMGRSYNC_H
+#define SMGRSYNC_H
+
+#include "storage/smgr.h"
+
+extern void smgrsync_init(void);
+extern void smgrpreckpt(void);
+extern void smgrsync(void);
+extern void smgrpostckpt(void);
+
+extern void UnlinkAfterCheckpoint(RelFileNodeBackend rnode);
+extern bool FsyncAtCheckpoint(RelFileNode rnode, ForkNumber forknum,
+							  SegmentNumber segno);
+extern void RememberFsyncRequest(int type, RelFileNode rnode,
+								 ForkNumber forknum, SegmentNumber segno);
+extern void SetForwardFsyncRequests(void);
+extern void ForgetSegmentFsyncRequests(RelFileNode rnode, ForkNumber forknum,
+									   SegmentNumber segno);
+extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
+extern void ForgetDatabaseFsyncRequests(Oid dbid);
+
+#endif
-- 
2.16.5