From 88cb3577b380275019642458ae40ae1c93c78755 Mon Sep 17 00:00:00 2001
From: Shawn Debnath <sdn@amazon.com>
Date: Sun, 17 Feb 2019 22:08:46 +0000
Subject: [PATCH] Refactor the fsync mechanism to support future SMGR
 implementations.

In anticipation of proposed block storage managers alongside md.c that
map bufmgr.c blocks to files optimised for different usage patterns:

1. Move the system for requesting and processing fsyncs out of md.c
into smgr.c.

2. Redefine smgrsync() behavior to be closer to that of
smgrimmedsysnc(), i.e., if a regular sync is required for a particular
file, enqueue it in locally or forward it to checkpointer. The
processing of fsync requests has been moved to a new ProcessFsyncRequests
function. smgrimmedsync() retains the old behavior of forcing an
immediate sync.

3. Removed the need for specific storage managers to implement pre and
post checkpoint callbacks. These are now executed at the smgr layer.

4. We now embed the fork number and the segment number as part of the
hash key for the pending ops table. This eliminates the bitmapset based
segment tracking for each relfilenode during fsync as not all storage
managers may map their segments from zero.

5. As part of processing the requests, we now re-construct the
path to the segment based on relfilenode, fork and segment numbers, and
use PathNameOpenFile to get a file descriptor to use for FileSync.

Author: Shawn Debnath, Thomas Munro
Reviewed-by:
Discussion:
https://postgr.es/m/CAEepm=2gTANm=e3ARnJT=n0h8hf88wqmaZxk0JYkxw+b21fNrw@mail.gmail.com
---
 contrib/bloom/blinsert.c              |   2 +-
 src/backend/access/heap/heapam.c      |   4 +-
 src/backend/access/nbtree/nbtree.c    |   2 +-
 src/backend/access/nbtree/nbtsort.c   |   2 +-
 src/backend/access/spgist/spginsert.c |   2 +-
 src/backend/access/transam/xlog.c     |   4 +-
 src/backend/catalog/heap.c            |   2 +-
 src/backend/commands/tablecmds.c      |   2 +-
 src/backend/postmaster/checkpointer.c |   2 +-
 src/backend/storage/buffer/bufmgr.c   |   2 +-
 src/backend/storage/smgr/md.c         | 915 +++-------------------------------
 src/backend/storage/smgr/smgr.c       | 702 ++++++++++++++++++++++++--
 src/include/postmaster/bgwriter.h     |   3 +-
 src/include/storage/smgr.h            |  68 ++-
 14 files changed, 818 insertions(+), 894 deletions(-)

diff --git a/contrib/bloom/blinsert.c b/contrib/bloom/blinsert.c
index e43fbe0005..6fa07db4f8 100644
--- a/contrib/bloom/blinsert.c
+++ b/contrib/bloom/blinsert.c
@@ -188,7 +188,7 @@ blbuildempty(Relation index)
 	 * write did not go through shared_buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
 	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	smgrimmedsync(index->rd_smgr, INIT_FORKNUM, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index dc3499349b..4cf2661387 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8980,7 +8980,7 @@ heap_sync(Relation rel)
 	/* main heap */
 	FlushRelationBuffers(rel);
 	/* FlushRelationBuffers will have opened rd_smgr */
-	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM);
+	smgrimmedsync(rel->rd_smgr, MAIN_FORKNUM, InvalidSegmentNumber);
 
 	/* FSM is not critical, don't bother syncing it */
 
@@ -8991,7 +8991,7 @@ heap_sync(Relation rel)
 
 		toastrel = table_open(rel->rd_rel->reltoastrelid, AccessShareLock);
 		FlushRelationBuffers(toastrel);
-		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsync(toastrel->rd_smgr, MAIN_FORKNUM, InvalidSegmentNumber);
 		table_close(toastrel, AccessShareLock);
 	}
 }
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 98917de2ef..b29112c133 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -178,7 +178,7 @@ btbuildempty(Relation index)
 	 * write did not go through shared_buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
 	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	smgrimmedsync(index->rd_smgr, INIT_FORKNUM, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index dc398e1186..052215bb34 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -1208,7 +1208,7 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
 	if (RelationNeedsWAL(wstate->index))
 	{
 		RelationOpenSmgr(wstate->index);
-		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM);
+		smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM, InvalidSegmentNumber);
 	}
 }
 
diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c
index f428a15138..0eb5ced43d 100644
--- a/src/backend/access/spgist/spginsert.c
+++ b/src/backend/access/spgist/spginsert.c
@@ -205,7 +205,7 @@ spgbuildempty(Relation index)
 	 * writes did not go through shared buffers and therefore a concurrent
 	 * checkpoint may have moved the redo pointer past our xlog record.
 	 */
-	smgrimmedsync(index->rd_smgr, INIT_FORKNUM);
+	smgrimmedsync(index->rd_smgr, INIT_FORKNUM, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ecd12fc53a..69eaa2f7e2 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -8616,7 +8616,7 @@ CreateCheckPoint(int flags)
 	 * the REDO pointer.  Note that smgr must not do anything that'd have to
 	 * be undone if we decide no checkpoint is needed.
 	 */
-	smgrpreckpt();
+	smgrprecheckpoint();
 
 	/* Begin filling in the checkpoint WAL record */
 	MemSet(&checkPoint, 0, sizeof(checkPoint));
@@ -8912,7 +8912,7 @@ CreateCheckPoint(int flags)
 	/*
 	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
 	 */
-	smgrpostckpt();
+	smgrpostcheckpoint();
 
 	/*
 	 * Update the average distance between checkpoints if the prior checkpoint
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index d0215a5eed..8ecf7c09a2 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -1421,7 +1421,7 @@ heap_create_init_fork(Relation rel)
 	RelationOpenSmgr(rel);
 	smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
 	log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
-	smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
+	smgrimmedsync(rel->rd_smgr, INIT_FORKNUM, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 715c6a221c..125b16c339 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -11788,7 +11788,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 	 * here, they might still not be on disk when the crash occurs.
 	 */
 	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
-		smgrimmedsync(dst, forkNum);
+		smgrimmedsync(dst, forkNum, InvalidSegmentNumber);
 }
 
 /*
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index fe96c41359..867f427028 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -1092,7 +1092,7 @@ RequestCheckpoint(int flags)
  * let the backend know by returning false.
  */
 bool
-ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
+ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum, SegmentNumber segno)
 {
 	CheckpointerRequest *request;
 	bool		too_full;
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 273e2f385f..c493c591aa 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -2584,7 +2584,7 @@ CheckPointBuffers(int flags)
 	BufferSync(flags);
 	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
-	smgrsync();
+	ProcessFsyncRequests();
 	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();
 }
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2aba2dfe91..46b54f92a3 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -29,8 +29,6 @@
 #include "access/xlogutils.h"
 #include "access/xlog.h"
 #include "pgstat.h"
-#include "portability/instr_time.h"
-#include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/bufmgr.h"
 #include "storage/relfilenode.h"
@@ -39,35 +37,6 @@
 #include "utils/memutils.h"
 #include "pg_trace.h"
 
-
-/* intervals for calling AbsorbFsyncRequests in mdsync and mdpostckpt */
-#define FSYNCS_PER_ABSORB		10
-#define UNLINKS_PER_ABSORB		10
-
-/*
- * Special values for the segno arg to RememberFsyncRequest.
- *
- * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
- * fsync request from the queue if an identical, subsequent request is found.
- * See comments there before making changes here.
- */
-#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
-#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
-#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
-
-/*
- * On Windows, we have to interpret EACCES as possibly meaning the same as
- * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
- * that's what you get.  Ugh.  This code is designed so that we don't
- * actually believe these cases are okay without further evidence (namely,
- * a pending fsync request getting canceled ... see mdsync).
- */
-#ifndef WIN32
-#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
-#else
-#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
-#endif
-
 /*
  *	The magnetic disk storage manager keeps track of open file
  *	descriptors in its own descriptor pool.  This is done to make it
@@ -114,53 +83,27 @@ typedef struct _MdfdVec
 
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
 
+/* local routines */
+static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
+			 bool isRedo);
+static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
+static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
+					   MdfdVec *seg);
+static void register_unlink(RelFileNodeBackend rnode);
+static void _fdvec_resize(SMgrRelation reln,
+			  ForkNumber forknum,
+			  int nseg);
+static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
+			  BlockNumber segno, int oflags);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
+			 BlockNumber blkno, bool skipFsync, int behavior);
+static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
+		   MdfdVec *seg);
+
 
 /*
- * In some contexts (currently, standalone backends and the checkpointer)
- * we keep track of pending fsync operations: we need to remember all relation
- * segments that have been written since the last checkpoint, so that we can
- * fsync them down to disk before completing the next checkpoint.  This hash
- * table remembers the pending operations.  We use a hash table mostly as
- * a convenient way of merging duplicate requests.
- *
- * We use a similar mechanism to remember no-longer-needed files that can
- * be deleted after the next checkpoint, but we use a linked list instead of
- * a hash table, because we don't expect there to be any duplicate requests.
- *
- * These mechanisms are only used for non-temp relations; we never fsync
- * temp rels, nor do we need to postpone their deletion (see comments in
- * mdunlink).
- *
- * (Regular backends do not track pending operations locally, but forward
- * them to the checkpointer.)
+ * Segment handling behaviors
  */
-typedef uint16 CycleCtr;		/* can be any convenient integer size */
-
-typedef struct
-{
-	RelFileNode rnode;			/* hash table key (must be first!) */
-	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr of oldest request */
-	/* requests[f] has bit n set if we need to fsync segment n of fork f */
-	Bitmapset  *requests[MAX_FORKNUM + 1];
-	/* canceled[f] is true if we canceled fsyncs for fork "recently" */
-	bool		canceled[MAX_FORKNUM + 1];
-} PendingOperationEntry;
-
-typedef struct
-{
-	RelFileNode rnode;			/* the dead relation to delete */
-	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
-} PendingUnlinkEntry;
-
-static HTAB *pendingOpsTable = NULL;
-static List *pendingUnlinks = NIL;
-static MemoryContext pendingOpsCxt; /* context for the above  */
-
-static CycleCtr mdsync_cycle_ctr = 0;
-static CycleCtr mdckpt_cycle_ctr = 0;
-
-
-/*** behavior for mdopen & _mdfd_getseg ***/
 /* ereport if segment not present */
 #define EXTENSION_FAIL				(1 << 0)
 /* return NULL if segment not present */
@@ -179,26 +122,6 @@ static CycleCtr mdckpt_cycle_ctr = 0;
 #define EXTENSION_DONT_CHECK_SIZE	(1 << 4)
 
 
-/* local routines */
-static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
-			 bool isRedo);
-static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
-static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
-					   MdfdVec *seg);
-static void register_unlink(RelFileNodeBackend rnode);
-static void _fdvec_resize(SMgrRelation reln,
-			  ForkNumber forknum,
-			  int nseg);
-static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
-			  BlockNumber segno);
-static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
-			  BlockNumber segno, int oflags);
-static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
-			 BlockNumber blkno, bool skipFsync, int behavior);
-static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
-		   MdfdVec *seg);
-
-
 /*
  *	mdinit() -- Initialize private state for magnetic disk storage manager.
  */
@@ -208,64 +131,6 @@ mdinit(void)
 	MdCxt = AllocSetContextCreate(TopMemoryContext,
 								  "MdSmgr",
 								  ALLOCSET_DEFAULT_SIZES);
-
-	/*
-	 * Create pending-operations hashtable if we need it.  Currently, we need
-	 * it if we are standalone (not under a postmaster) or if we are a startup
-	 * or checkpointer auxiliary process.
-	 */
-	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
-	{
-		HASHCTL		hash_ctl;
-
-		/*
-		 * XXX: The checkpointer needs to add entries to the pending ops table
-		 * when absorbing fsync requests.  That is done within a critical
-		 * section, which isn't usually allowed, but we make an exception. It
-		 * means that there's a theoretical possibility that you run out of
-		 * memory while absorbing fsync requests, which leads to a PANIC.
-		 * Fortunately the hash table is small so that's unlikely to happen in
-		 * practice.
-		 */
-		pendingOpsCxt = AllocSetContextCreate(MdCxt,
-											  "Pending ops context",
-											  ALLOCSET_DEFAULT_SIZES);
-		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
-
-		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
-		hash_ctl.keysize = sizeof(RelFileNode);
-		hash_ctl.entrysize = sizeof(PendingOperationEntry);
-		hash_ctl.hcxt = pendingOpsCxt;
-		pendingOpsTable = hash_create("Pending Ops Table",
-									  100L,
-									  &hash_ctl,
-									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-		pendingUnlinks = NIL;
-	}
-}
-
-/*
- * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
- * already created the pendingOpsTable during initialization of the startup
- * process.  Calling this function drops the local pendingOpsTable so that
- * subsequent requests will be forwarded to checkpointer.
- */
-void
-SetForwardFsyncRequests(void)
-{
-	/* Perform any pending fsyncs we may have queued up, then drop table */
-	if (pendingOpsTable)
-	{
-		mdsync();
-		hash_destroy(pendingOpsTable);
-	}
-	pendingOpsTable = NULL;
-
-	/*
-	 * We should not have any pending unlink requests, since mdunlink doesn't
-	 * queue unlink requests when isRedo.
-	 */
-	Assert(pendingUnlinks == NIL);
 }
 
 /*
@@ -382,10 +247,10 @@ mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
 {
 	/*
 	 * We have to clean out any pending fsync requests for the doomed
-	 * relation, else the next mdsync() will fail.  There can't be any such
-	 * requests for a temp relation, though.  We can send just one request
-	 * even when deleting multiple forks, since the fsync queuing code accepts
-	 * the "InvalidForkNumber = all forks" convention.
+	 * relation, else the next ProcessFsyncRequests will fail.  There can't be
+	 * any such requests for a temp relation, though.  We can send just one
+	 * request even when deleting multiple forks, since the fsync queuing
+	 * code accepts the "InvalidForkNumber = all forks" convention.
 	 */
 	if (!RelFileNodeBackendIsTemp(rnode))
 		ForgetRelationFsyncRequests(rnode.node, forkNum);
@@ -978,421 +843,91 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
  * nothing of dirty buffers that may exist inside the buffer manager.
  */
 void
-mdimmedsync(SMgrRelation reln, ForkNumber forknum)
+mdimmedsync(SMgrRelation reln, ForkNumber forknum, SegmentNumber segno)
 {
-	int			segno;
-
-	/*
-	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
-	 * fsync loop will get them all!
-	 */
-	mdnblocks(reln, forknum);
+	MdfdVec	   *segments = NULL;
+	size_t		nsegs = 0;
 
-	segno = reln->md_num_open_segs[forknum];
+	if (segno != InvalidSegmentNumber)
+	{
+		/* Get the specified segment, or report failure if it doesn't exist */
+		segments = _mdfd_openseg(reln, forknum, segno * RELSEG_SIZE,
+								 EXTENSION_RETURN_NULL);
+		if (!segments)
+			ereport(data_sync_elevel(ERROR),
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\" to fsync: %m",
+						mdsegpath(reln->smgr_rnode, forknum, segno))));
+		nsegs = 1;
+	}
+	else
+	{
+		/*
+		 * NOTE: mdnblocks makes sure we have opened all active segments, so that
+		 * fsync loop will get them all!
+		 */
+		mdnblocks(reln, forknum);
+		nsegs = reln->md_num_open_segs[forknum];
+		segments = &reln->md_seg_fds[forknum][0];
+	}
 
-	while (segno > 0)
+	for (segno = 0; segno < nsegs; ++segno)
 	{
-		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
+		MdfdVec    *v = &segments[segno];
 
 		if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
 			ereport(data_sync_elevel(ERROR),
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
 							FilePathName(v->mdfd_vfd))));
-		segno--;
 	}
 }
 
 /*
- *	mdsync() -- Sync previous writes to stable storage.
+ *	mdrelpath() -- Return the full path to the relation.
  */
-void
-mdsync(void)
+char *
+mdrelpath(RelFileNodeBackend rnode, ForkNumber forknum)
 {
-	static bool mdsync_in_progress = false;
-
-	HASH_SEQ_STATUS hstat;
-	PendingOperationEntry *entry;
-	int			absorb_counter;
-
-	/* Statistics on sync times */
-	int			processed = 0;
-	instr_time	sync_start,
-				sync_end,
-				sync_diff;
-	uint64		elapsed;
-	uint64		longest = 0;
-	uint64		total_elapsed = 0;
-
-	/*
-	 * This is only called during checkpoints, and checkpoints should only
-	 * occur in processes that have created a pendingOpsTable.
-	 */
-	if (!pendingOpsTable)
-		elog(ERROR, "cannot sync without a pendingOpsTable");
-
-	/*
-	 * If we are in the checkpointer, the sync had better include all fsync
-	 * requests that were queued by backends up to this point.  The tightest
-	 * race condition that could occur is that a buffer that must be written
-	 * and fsync'd for the checkpoint could have been dumped by a backend just
-	 * before it was visited by BufferSync().  We know the backend will have
-	 * queued an fsync request before clearing the buffer's dirtybit, so we
-	 * are safe as long as we do an Absorb after completing BufferSync().
-	 */
-	AbsorbFsyncRequests();
-
-	/*
-	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
-	 * checkpoint), we want to ignore fsync requests that are entered into the
-	 * hashtable after this point --- they should be processed next time,
-	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
-	 * ones: new ones will have cycle_ctr equal to the incremented value of
-	 * mdsync_cycle_ctr.
-	 *
-	 * In normal circumstances, all entries present in the table at this point
-	 * will have cycle_ctr exactly equal to the current (about to be old)
-	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
-	 * fsync'ing loop, then older values of cycle_ctr might remain when we
-	 * come back here to try again.  Repeated checkpoint failures would
-	 * eventually wrap the counter around to the point where an old entry
-	 * might appear new, causing us to skip it, possibly allowing a checkpoint
-	 * to succeed that should not have.  To forestall wraparound, any time the
-	 * previous mdsync() failed to complete, run through the table and
-	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
-	 *
-	 * Think not to merge this loop with the main loop, as the problem is
-	 * exactly that that loop may fail before having visited all the entries.
-	 * From a performance point of view it doesn't matter anyway, as this path
-	 * will never be taken in a system that's functioning normally.
-	 */
-	if (mdsync_in_progress)
-	{
-		/* prior try failed, so update any stale cycle_ctr values */
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
-		}
-	}
-
-	/* Advance counter so that new hashtable entries are distinguishable */
-	mdsync_cycle_ctr++;
-
-	/* Set flag to detect failure if we don't reach the end of the loop */
-	mdsync_in_progress = true;
-
-	/* Now scan the hashtable for fsync requests to process */
-	absorb_counter = FSYNCS_PER_ABSORB;
-	hash_seq_init(&hstat, pendingOpsTable);
-	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-	{
-		ForkNumber	forknum;
-
-		/*
-		 * If the entry is new then don't process it this time; it might
-		 * contain multiple fsync-request bits, but they are all new.  Note
-		 * "continue" bypasses the hash-remove call at the bottom of the loop.
-		 */
-		if (entry->cycle_ctr == mdsync_cycle_ctr)
-			continue;
-
-		/* Else assert we haven't missed it */
-		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
-
-		/*
-		 * Scan over the forks and segments represented by the entry.
-		 *
-		 * The bitmap manipulations are slightly tricky, because we can call
-		 * AbsorbFsyncRequests() inside the loop and that could result in
-		 * bms_add_member() modifying and even re-palloc'ing the bitmapsets.
-		 * So we detach it, but if we fail we'll merge it with any new
-		 * requests that have arrived in the meantime.
-		 */
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			Bitmapset  *requests = entry->requests[forknum];
-			int			segno;
-
-			entry->requests[forknum] = NULL;
-			entry->canceled[forknum] = false;
-
-			segno = -1;
-			while ((segno = bms_next_member(requests, segno)) >= 0)
-			{
-				int			failures;
-
-				/*
-				 * If fsync is off then we don't have to bother opening the
-				 * file at all.  (We delay checking until this point so that
-				 * changing fsync on the fly behaves sensibly.)
-				 */
-				if (!enableFsync)
-					continue;
-
-				/*
-				 * If in checkpointer, we want to absorb pending requests
-				 * every so often to prevent overflow of the fsync request
-				 * queue.  It is unspecified whether newly-added entries will
-				 * be visited by hash_seq_search, but we don't care since we
-				 * don't need to process them anyway.
-				 */
-				if (--absorb_counter <= 0)
-				{
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB;
-				}
-
-				/*
-				 * The fsync table could contain requests to fsync segments
-				 * that have been deleted (unlinked) by the time we get to
-				 * them. Rather than just hoping an ENOENT (or EACCES on
-				 * Windows) error can be ignored, what we do on error is
-				 * absorb pending requests and then retry.  Since mdunlink()
-				 * queues a "cancel" message before actually unlinking, the
-				 * fsync request is guaranteed to be marked canceled after the
-				 * absorb if it really was this case. DROP DATABASE likewise
-				 * has to tell us to forget fsync requests before it starts
-				 * deletions.
-				 */
-				for (failures = 0;; failures++) /* loop exits at "break" */
-				{
-					SMgrRelation reln;
-					MdfdVec    *seg;
-					char	   *path;
-					int			save_errno;
-
-					/*
-					 * Find or create an smgr hash entry for this relation.
-					 * This may seem a bit unclean -- md calling smgr?	But
-					 * it's really the best solution.  It ensures that the
-					 * open file reference isn't permanently leaked if we get
-					 * an error here. (You may say "but an unreferenced
-					 * SMgrRelation is still a leak!" Not really, because the
-					 * only case in which a checkpoint is done by a process
-					 * that isn't about to shut down is in the checkpointer,
-					 * and it will periodically do smgrcloseall(). This fact
-					 * justifies our not closing the reln in the success path
-					 * either, which is a good thing since in non-checkpointer
-					 * cases we couldn't safely do that.)
-					 */
-					reln = smgropen(entry->rnode, InvalidBackendId);
-
-					/* Attempt to open and fsync the target segment */
-					seg = _mdfd_getseg(reln, forknum,
-									   (BlockNumber) segno * (BlockNumber) RELSEG_SIZE,
-									   false,
-									   EXTENSION_RETURN_NULL
-									   | EXTENSION_DONT_CHECK_SIZE);
-
-					INSTR_TIME_SET_CURRENT(sync_start);
-
-					if (seg != NULL &&
-						FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
-					{
-						/* Success; update statistics about sync timing */
-						INSTR_TIME_SET_CURRENT(sync_end);
-						sync_diff = sync_end;
-						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
-						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
-						if (elapsed > longest)
-							longest = elapsed;
-						total_elapsed += elapsed;
-						processed++;
-						requests = bms_del_member(requests, segno);
-						if (log_checkpoints)
-							elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
-								 processed,
-								 FilePathName(seg->mdfd_vfd),
-								 (double) elapsed / 1000);
-
-						break;	/* out of retry loop */
-					}
-
-					/* Compute file name for use in message */
-					save_errno = errno;
-					path = _mdfd_segpath(reln, forknum, (BlockNumber) segno);
-					errno = save_errno;
-
-					/*
-					 * It is possible that the relation has been dropped or
-					 * truncated since the fsync request was entered.
-					 * Therefore, allow ENOENT, but only if we didn't fail
-					 * already on this file.  This applies both for
-					 * _mdfd_getseg() and for FileSync, since fd.c might have
-					 * closed the file behind our back.
-					 *
-					 * XXX is there any point in allowing more than one retry?
-					 * Don't see one at the moment, but easy to change the
-					 * test here if so.
-					 */
-					if (!FILE_POSSIBLY_DELETED(errno) ||
-						failures > 0)
-					{
-						Bitmapset  *new_requests;
-
-						/*
-						 * We need to merge these unsatisfied requests with
-						 * any others that have arrived since we started.
-						 */
-						new_requests = entry->requests[forknum];
-						entry->requests[forknum] =
-							bms_join(new_requests, requests);
-
-						errno = save_errno;
-						ereport(data_sync_elevel(ERROR),
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\": %m",
-										path)));
-					}
-					else
-						ereport(DEBUG1,
-								(errcode_for_file_access(),
-								 errmsg("could not fsync file \"%s\" but retrying: %m",
-										path)));
-					pfree(path);
-
-					/*
-					 * Absorb incoming requests and check to see if a cancel
-					 * arrived for this relation fork.
-					 */
-					AbsorbFsyncRequests();
-					absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
-
-					if (entry->canceled[forknum])
-						break;
-				}				/* end retry loop */
-			}
-			bms_free(requests);
-		}
-
-		/*
-		 * We've finished everything that was requested before we started to
-		 * scan the entry.  If no new requests have been inserted meanwhile,
-		 * remove the entry.  Otherwise, update its cycle counter, as all the
-		 * requests now in it must have arrived during this cycle.
-		 */
-		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-		{
-			if (entry->requests[forknum] != NULL)
-				break;
-		}
-		if (forknum <= MAX_FORKNUM)
-			entry->cycle_ctr = mdsync_cycle_ctr;
-		else
-		{
-			/* Okay to remove it */
-			if (hash_search(pendingOpsTable, &entry->rnode,
-							HASH_REMOVE, NULL) == NULL)
-				elog(ERROR, "pendingOpsTable corrupted");
-		}
-	}							/* end loop over hashtable entries */
-
-	/* Return sync performance metrics for report at checkpoint end */
-	CheckpointStats.ckpt_sync_rels = processed;
-	CheckpointStats.ckpt_longest_sync = longest;
-	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+	char	   *path;
 
-	/* Flag successful completion of mdsync */
-	mdsync_in_progress = false;
+	path = relpath(rnode, forknum);
+	return path;
 }
 
 /*
- * mdpreckpt() -- Do pre-checkpoint work
- *
- * To distinguish unlink requests that arrived before this checkpoint
- * started from those that arrived during the checkpoint, we use a cycle
- * counter similar to the one we use for fsync requests. That cycle
- * counter is incremented here.
- *
- * This must be called *before* the checkpoint REDO point is determined.
- * That ensures that we won't delete files too soon.
+ *	mdsegpath()
  *
- * Note that we can't do anything here that depends on the assumption
- * that the checkpoint will be completed.
+ * Return the filename for the specified segment of the relation. The
+ * returned string is palloc'd.
  */
-void
-mdpreckpt(void)
+char *
+mdsegpath(RelFileNodeBackend rnode, ForkNumber forknum, BlockNumber segno)
 {
-	/*
-	 * Any unlink requests arriving after this point will be assigned the next
-	 * cycle counter, and won't be unlinked until next checkpoint.
-	 */
-	mdckpt_cycle_ctr++;
-}
+	char	   *path,
+			   *fullpath;
 
-/*
- * mdpostckpt() -- Do post-checkpoint work
- *
- * Remove any lingering files that can now be safely removed.
- */
-void
-mdpostckpt(void)
-{
-	int			absorb_counter;
+	path = relpath(rnode, forknum);
 
-	absorb_counter = UNLINKS_PER_ABSORB;
-	while (pendingUnlinks != NIL)
+	if (segno > 0)
 	{
-		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
-		char	   *path;
-
-		/*
-		 * New entries are appended to the end, so if the entry is new we've
-		 * reached the end of old entries.
-		 *
-		 * Note: if just the right number of consecutive checkpoints fail, we
-		 * could be fooled here by cycle_ctr wraparound.  However, the only
-		 * consequence is that we'd delay unlinking for one more checkpoint,
-		 * which is perfectly tolerable.
-		 */
-		if (entry->cycle_ctr == mdckpt_cycle_ctr)
-			break;
-
-		/* Unlink the file */
-		path = relpathperm(entry->rnode, MAIN_FORKNUM);
-		if (unlink(path) < 0)
-		{
-			/*
-			 * There's a race condition, when the database is dropped at the
-			 * same time that we process the pending unlink requests. If the
-			 * DROP DATABASE deletes the file before we do, we will get ENOENT
-			 * here. rmtree() also has to ignore ENOENT errors, to deal with
-			 * the possibility that we delete the file first.
-			 */
-			if (errno != ENOENT)
-				ereport(WARNING,
-						(errcode_for_file_access(),
-						 errmsg("could not remove file \"%s\": %m", path)));
-		}
+		fullpath = psprintf("%s.%u", path, segno);
 		pfree(path);
-
-		/* And remove the list entry */
-		pendingUnlinks = list_delete_first(pendingUnlinks);
-		pfree(entry);
-
-		/*
-		 * As in mdsync, we don't want to stop absorbing fsync requests for a
-		 * long time when there are many deletions to be done.  We can safely
-		 * call AbsorbFsyncRequests() at this point in the loop (note it might
-		 * try to delete list entries).
-		 */
-		if (--absorb_counter <= 0)
-		{
-			AbsorbFsyncRequests();
-			absorb_counter = UNLINKS_PER_ABSORB;
-		}
 	}
+	else
+		fullpath = path;
+
+	return fullpath;
 }
 
 /*
  * register_dirty_segment() -- Mark a relation segment as needing fsync
  *
- * If there is a local pending-ops table, just make an entry in it for
- * mdsync to process later.  Otherwise, try to pass off the fsync request
- * to the checkpointer process.  If that fails, just do the fsync
- * locally before returning (we hope this will not happen often enough
- * to be a performance problem).
+ * Call smgrsync() to queue the fsync request.  If there is a local pending-ops
+ * table, just make an entry in it to be processed later. Otherwise, try to
+ * forward the fsync request to the checkpointer process. If that fails, just
+ * do the fsync locally before returning (we hope this will not happen often
+ * enough to be a performance problem).
  */
 static void
 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
@@ -1400,16 +935,9 @@ register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
 	/* Temp relations should never be fsync'd */
 	Assert(!SmgrIsTemp(reln));
 
-	if (pendingOpsTable)
-	{
-		/* push it into local pending-ops table */
-		RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
-	}
-	else
+	/* Try to push it into local pending-ops table, or forward to checkpointer */
+	if (!smgrsync(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
 	{
-		if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
-			return;				/* passed it off successfully */
-
 		ereport(DEBUG1,
 				(errmsg("could not forward fsync request because request queue is full")));
 
@@ -1436,13 +964,8 @@ register_unlink(RelFileNodeBackend rnode)
 	/* Should never be used with temp relations */
 	Assert(!RelFileNodeBackendIsTemp(rnode));
 
-	if (pendingOpsTable)
-	{
-		/* push it into local pending-ops table */
-		RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
-							 UNLINK_RELATION_REQUEST);
-	}
-	else
+	/* Try to push it into local pending-ops table, or forward to checkpointer */
+	if (!smgrsync(rnode.node, MAIN_FORKNUM, UNLINK_RELATION_REQUEST))
 	{
 		/*
 		 * Notify the checkpointer about it.  If we fail to queue the request
@@ -1458,259 +981,6 @@ register_unlink(RelFileNodeBackend rnode)
 	}
 }
 
-/*
- * RememberFsyncRequest() -- callback from checkpointer side of fsync request
- *
- * We stuff fsync requests into the local hash table for execution
- * during the checkpointer's next checkpoint.  UNLINK requests go into a
- * separate linked list, however, because they get processed separately.
- *
- * The range of possible segment numbers is way less than the range of
- * BlockNumber, so we can reserve high values of segno for special purposes.
- * We define three:
- * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
- *	 either for one fork, or all forks if forknum is InvalidForkNumber
- * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
- * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
- *	 checkpoint.
- * Note also that we're assuming real segment numbers don't exceed INT_MAX.
- *
- * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
- * table has to be searched linearly, but dropping a database is a pretty
- * heavyweight operation anyhow, so we'll live with it.)
- */
-void
-RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
-{
-	Assert(pendingOpsTable);
-
-	if (segno == FORGET_RELATION_FSYNC)
-	{
-		/* Remove any pending requests for the relation (one or all forks) */
-		PendingOperationEntry *entry;
-
-		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &rnode,
-													  HASH_FIND,
-													  NULL);
-		if (entry)
-		{
-			/*
-			 * We can't just delete the entry since mdsync could have an
-			 * active hashtable scan.  Instead we delete the bitmapsets; this
-			 * is safe because of the way mdsync is coded.  We also set the
-			 * "canceled" flags so that mdsync can tell that a cancel arrived
-			 * for the fork(s).
-			 */
-			if (forknum == InvalidForkNumber)
-			{
-				/* remove requests for all forks */
-				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-				{
-					bms_free(entry->requests[forknum]);
-					entry->requests[forknum] = NULL;
-					entry->canceled[forknum] = true;
-				}
-			}
-			else
-			{
-				/* remove requests for single fork */
-				bms_free(entry->requests[forknum]);
-				entry->requests[forknum] = NULL;
-				entry->canceled[forknum] = true;
-			}
-		}
-	}
-	else if (segno == FORGET_DATABASE_FSYNC)
-	{
-		/* Remove any pending requests for the entire database */
-		HASH_SEQ_STATUS hstat;
-		PendingOperationEntry *entry;
-		ListCell   *cell,
-				   *prev,
-				   *next;
-
-		/* Remove fsync requests */
-		hash_seq_init(&hstat, pendingOpsTable);
-		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
-		{
-			if (entry->rnode.dbNode == rnode.dbNode)
-			{
-				/* remove requests for all forks */
-				for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-				{
-					bms_free(entry->requests[forknum]);
-					entry->requests[forknum] = NULL;
-					entry->canceled[forknum] = true;
-				}
-			}
-		}
-
-		/* Remove unlink requests */
-		prev = NULL;
-		for (cell = list_head(pendingUnlinks); cell; cell = next)
-		{
-			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
-
-			next = lnext(cell);
-			if (entry->rnode.dbNode == rnode.dbNode)
-			{
-				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
-				pfree(entry);
-			}
-			else
-				prev = cell;
-		}
-	}
-	else if (segno == UNLINK_RELATION_REQUEST)
-	{
-		/* Unlink request: put it in the linked list */
-		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
-		PendingUnlinkEntry *entry;
-
-		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
-		Assert(forknum == MAIN_FORKNUM);
-
-		entry = palloc(sizeof(PendingUnlinkEntry));
-		entry->rnode = rnode;
-		entry->cycle_ctr = mdckpt_cycle_ctr;
-
-		pendingUnlinks = lappend(pendingUnlinks, entry);
-
-		MemoryContextSwitchTo(oldcxt);
-	}
-	else
-	{
-		/* Normal case: enter a request to fsync this segment */
-		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
-		PendingOperationEntry *entry;
-		bool		found;
-
-		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
-													  &rnode,
-													  HASH_ENTER,
-													  &found);
-		/* if new entry, initialize it */
-		if (!found)
-		{
-			entry->cycle_ctr = mdsync_cycle_ctr;
-			MemSet(entry->requests, 0, sizeof(entry->requests));
-			MemSet(entry->canceled, 0, sizeof(entry->canceled));
-		}
-
-		/*
-		 * NB: it's intentional that we don't change cycle_ctr if the entry
-		 * already exists.  The cycle_ctr must represent the oldest fsync
-		 * request that could be in the entry.
-		 */
-
-		entry->requests[forknum] = bms_add_member(entry->requests[forknum],
-												  (int) segno);
-
-		MemoryContextSwitchTo(oldcxt);
-	}
-}
-
-/*
- * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
- *
- * forknum == InvalidForkNumber means all forks, although this code doesn't
- * actually know that, since it's just forwarding the request elsewhere.
- */
-void
-ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
-{
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/*
-		 * Notify the checkpointer about it.  If we fail to queue the cancel
-		 * message, we have to sleep and try again ... ugly, but hopefully
-		 * won't happen often.
-		 *
-		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
-		 * error would leave the no-longer-used file still present on disk,
-		 * which would be bad, so I'm inclined to assume that the checkpointer
-		 * will always empty the queue soon.
-		 */
-		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-
-		/*
-		 * Note we don't wait for the checkpointer to actually absorb the
-		 * cancel message; see mdsync() for the implications.
-		 */
-	}
-}
-
-/*
- * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
- */
-void
-ForgetDatabaseFsyncRequests(Oid dbid)
-{
-	RelFileNode rnode;
-
-	rnode.dbNode = dbid;
-	rnode.spcNode = 0;
-	rnode.relNode = 0;
-
-	if (pendingOpsTable)
-	{
-		/* standalone backend or startup process: fsync state is local */
-		RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
-	}
-	else if (IsUnderPostmaster)
-	{
-		/* see notes in ForgetRelationFsyncRequests */
-		while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
-									FORGET_DATABASE_FSYNC))
-			pg_usleep(10000L);	/* 10 msec seems a good number */
-	}
-}
-
-/*
- * DropRelationFiles -- drop files of all given relations
- */
-void
-DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
-{
-	SMgrRelation *srels;
-	int			i;
-
-	srels = palloc(sizeof(SMgrRelation) * ndelrels);
-	for (i = 0; i < ndelrels; i++)
-	{
-		SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
-
-		if (isRedo)
-		{
-			ForkNumber	fork;
-
-			for (fork = 0; fork <= MAX_FORKNUM; fork++)
-				XLogDropRelation(delrels[i], fork);
-		}
-		srels[i] = srel;
-	}
-
-	smgrdounlinkall(srels, ndelrels, isRedo);
-
-	/*
-	 * Call smgrclose() in reverse order as when smgropen() is called.
-	 * This trick enables remove_from_unowned_list() in smgrclose()
-	 * to search the SMgrRelation from the unowned list,
-	 * with O(1) performance.
-	 */
-	for (i = ndelrels - 1; i >= 0; i--)
-		smgrclose(srels[i]);
-	pfree(srels);
-}
-
-
 /*
  *	_fdvec_resize() -- Resize the fork's open segments array
  */
@@ -1748,29 +1018,6 @@ _fdvec_resize(SMgrRelation reln,
 	reln->md_num_open_segs[forknum] = nseg;
 }
 
-/*
- * Return the filename for the specified segment of the relation. The
- * returned string is palloc'd.
- */
-static char *
-_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
-{
-	char	   *path,
-			   *fullpath;
-
-	path = relpath(reln->smgr_rnode, forknum);
-
-	if (segno > 0)
-	{
-		fullpath = psprintf("%s.%u", path, segno);
-		pfree(path);
-	}
-	else
-		fullpath = path;
-
-	return fullpath;
-}
-
 /*
  * Open the specified segment of the relation,
  * and make a MdfdVec object for it.  Returns NULL on failure.
@@ -1783,7 +1030,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
 	int			fd;
 	char	   *fullpath;
 
-	fullpath = _mdfd_segpath(reln, forknum, segno);
+	fullpath = mdsegpath(reln->smgr_rnode, forknum, segno);
 
 	/* open the file */
 	fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
@@ -1918,7 +1165,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
-							_mdfd_segpath(reln, forknum, nextsegno),
+							mdsegpath(reln->smgr_rnode, forknum, nextsegno),
 							blkno, nblocks)));
 		}
 
@@ -1932,7 +1179,7 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not open file \"%s\" (target block %u): %m",
-							_mdfd_segpath(reln, forknum, nextsegno),
+							mdsegpath(reln->smgr_rnode, forknum, nextsegno),
 							blkno)));
 		}
 	}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 0c0bba4ab3..e387c01776 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -17,13 +17,77 @@
  */
 #include "postgres.h"
 
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "access/xlogutils.h"
+#include "access/xlog.h"
 #include "commands/tablespace.h"
+#include "portability/instr_time.h"
+#include "postmaster/bgwriter.h"
 #include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/smgr.h"
 #include "utils/hsearch.h"
+#include "utils/memutils.h"
 #include "utils/inval.h"
 
+static MemoryContext pendingOpsCxt; /* context for the pending ops state  */
+
+/*
+ * In some contexts (currently, standalone backends and the checkpointer)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint.  This hash
+ * table remembers the pending operations.  We use a hash table mostly as
+ * a convenient way of merging duplicate requests.
+ *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the checkpointer.)
+ */
+typedef uint16 CycleCtr;		/* can be any convenient integer size */
+
+typedef struct FsyncTag
+{
+	RelFileNode		rnode;			/* rel file node entry */
+	ForkNumber		forknum;
+	SegmentNumber	segno;			/* segment number */
+} FsyncTag;
+
+typedef struct
+{
+	FsyncTag	tag;			/* hash table key (must be first!) */
+	CycleCtr	cycle_ctr;		/* sync_cycle_ctr of oldest request */
+	bool		canceled;		/* canceled is true if we canceled "recently" */
+} PendingFsyncEntry;
+
+typedef struct
+{
+	RelFileNode rnode;			/* the dead relation to delete */
+	CycleCtr	cycle_ctr;		/* checkpoint_cycle_ctr when request was made */
+} PendingUnlinkEntry;
+
+static HTAB *pendingOps = NULL;
+static List *pendingUnlinks = NIL;
+static MemoryContext pendingOpsCxt; /* context for the above  */
+
+static CycleCtr sync_cycle_ctr = 0;
+static CycleCtr checkpoint_cycle_ctr = 0;
+
+/* Intervals for calling AbsorbFsyncRequests */
+#define FSYNCS_PER_ABSORB		10
+#define UNLINKS_PER_ABSORB		10
 
 /*
  * This struct of function pointers defines the API between smgr.c and
@@ -58,13 +122,13 @@ typedef struct f_smgr
 	BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
 	void		(*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
 								  BlockNumber nblocks);
-	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
-	void		(*smgr_pre_ckpt) (void);	/* may be NULL */
-	void		(*smgr_sync) (void);	/* may be NULL */
-	void		(*smgr_post_ckpt) (void);	/* may be NULL */
+	void		(*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum,
+								   SegmentNumber segno);
+	char*		(*smgr_relpath) (RelFileNodeBackend rnode, ForkNumber forknum);
+	char*		(*smgr_segpath) (RelFileNodeBackend rnode, ForkNumber forknum,
+								   SegmentNumber segno);
 } f_smgr;
 
-
 static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{
@@ -82,15 +146,13 @@ static const f_smgr smgrsw[] = {
 		.smgr_nblocks = mdnblocks,
 		.smgr_truncate = mdtruncate,
 		.smgr_immedsync = mdimmedsync,
-		.smgr_pre_ckpt = mdpreckpt,
-		.smgr_sync = mdsync,
-		.smgr_post_ckpt = mdpostckpt
+		.smgr_relpath = mdrelpath,
+		.smgr_segpath = mdsegpath
 	}
 };
 
 static const int NSmgr = lengthof(smgrsw);
 
-
 /*
  * Each backend has a hashtable that stores all extant SMgrRelation objects.
  * In addition, "unowned" SMgrRelation objects are chained together in a list.
@@ -99,7 +161,7 @@ static HTAB *SMgrRelationHash = NULL;
 
 static SMgrRelation first_unowned_reln = NULL;
 
-/* local function prototypes */
+/* Local function prototypes */
 static void smgrshutdown(int code, Datum arg);
 static void add_to_unowned_list(SMgrRelation reln);
 static void remove_from_unowned_list(SMgrRelation reln);
@@ -126,6 +188,40 @@ smgrinit(void)
 
 	/* register the shutdown proc */
 	on_proc_exit(smgrshutdown, 0);
+
+	/*
+	 * Create pending-operations hashtable if we need it.  Currently, we need
+	 * it if we are standalone (not under a postmaster) or if we are a startup
+	 * or checkpointer auxiliary process.
+	 */
+	if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
+	{
+		HASHCTL		hash_ctl;
+
+		/*
+		 * XXX: The checkpointer needs to add entries to the pending ops table
+		 * when absorbing fsync requests.  That is done within a critical
+		 * section, which isn't usually allowed, but we make an exception. It
+		 * means that there's a theoretical possibility that you run out of
+		 * memory while absorbing fsync requests, which leads to a PANIC.
+		 * Fortunately the hash table is small so that's unlikely to happen in
+		 * practice.
+		 */
+		pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
+											  "Pending ops context",
+											  ALLOCSET_DEFAULT_SIZES);
+		MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
+		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+		hash_ctl.keysize = sizeof(FsyncTag);
+		hash_ctl.entrysize = sizeof(PendingFsyncEntry);
+		hash_ctl.hcxt = pendingOpsCxt;
+		pendingOps = hash_create("Pending Ops Table",
+									  100L,
+									  &hash_ctl,
+									  HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+		pendingUnlinks = NIL;
+	}
 }
 
 /*
@@ -725,8 +821,8 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 /*
  *	smgrimmedsync() -- Force the specified relation to stable storage.
  *
- *		Synchronously force all previous writes to the specified relation
- *		down to disk.
+ *		Synchronously force all previous writes to the specified relation,
+ *		or specific relation segment, down to disk.
  *
  *		This is useful for building completely new relations (eg, new
  *		indexes).  Instead of incrementally WAL-logging the index build
@@ -746,55 +842,599 @@ smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
  *		otherwise the sync is not very meaningful.
  */
 void
-smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
+smgrimmedsync(SMgrRelation reln, ForkNumber forknum, SegmentNumber segno)
+{
+	smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum, segno);
+}
+
+/*
+ * smgrsync() -- Enqueue fsync request
+ *
+ * Called by internal smgr components to queue a request either locally or
+ * to forward the request to checkpointer.
+ *
+ * Note: this api requires RelFileNode instead of SMgrRelation as callers
+ * include unlink which may not have an open SMgrRelation.
+ *
+ * Returns false if we failed to do either, which means the backend is required
+ * to do their own sync.
+ */
+bool
+smgrsync(RelFileNode rnode, ForkNumber forknum, SegmentNumber segno)
 {
-	smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
+	if (pendingOps)
+	{
+		/* Push it into local pending-ops table */
+		RememberFsyncRequest(rnode, forknum, segno);
+		return true;
+	}
+	else
+	{
+		/* Forward request to checkpointer, which can fail if queue is full */
+		return ForwardFsyncRequest(rnode, forknum, segno);
+	}
 }
 
 
 /*
- *	smgrpreckpt() -- Prepare for checkpoint.
+ * Fsync related operations
+ */
+
+
+/*
+ * smgrprecheckpoint() -- Do pre-checkpoint work
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+smgrprecheckpoint(void)
+{
+	/*
+	 * Any unlink requests arriving after this point will be assigned the next
+	 * cycle counter, and won't be unlinked until next checkpoint.
+	 */
+	checkpoint_cycle_ctr++;
+}
+
+/*
+ * smgrpostcheckpoint() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed.
  */
 void
-smgrpreckpt(void)
+smgrpostcheckpoint(void)
 {
-	int			i;
+	int			absorb_counter;
 
-	for (i = 0; i < NSmgr; i++)
+	absorb_counter = UNLINKS_PER_ABSORB;
+	while (pendingUnlinks != NIL)
 	{
-		if (smgrsw[i].smgr_pre_ckpt)
-			smgrsw[i].smgr_pre_ckpt();
+		PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
+		RelFileNodeBackend rnode = {.node = entry->rnode, .backend = InvalidBackendId};
+		char	   *path;
+
+		/*
+		 * New entries are appended to the end, so if the entry is new we've
+		 * reached the end of old entries.
+		 *
+		 * Note: if just the right number of consecutive checkpoints fail, we
+		 * could be fooled here by cycle_ctr wraparound.  However, the only
+		 * consequence is that we'd delay unlinking for one more checkpoint,
+		 * which is perfectly tolerable.
+		 */
+		if (entry->cycle_ctr == checkpoint_cycle_ctr)
+			break;
+
+		/* Unlink the file */
+		path = smgrsw[0].smgr_relpath(rnode, MAIN_FORKNUM);
+		if (unlink(path) < 0)
+		{
+			/*
+			 * There's a race condition, when the database is dropped at the
+			 * same time that we process the pending unlink requests. If the
+			 * DROP DATABASE deletes the file before we do, we will get ENOENT
+			 * here. rmtree() also has to ignore ENOENT errors, to deal with
+			 * the possibility that we delete the file first.
+			 */
+			if (errno != ENOENT)
+				ereport(WARNING,
+						(errcode_for_file_access(),
+						 errmsg("could not remove file \"%s\": %m", path)));
+		}
+		pfree(path);
+
+		/* And remove the list entry */
+		pendingUnlinks = list_delete_first(pendingUnlinks);
+		pfree(entry);
+
+		/*
+		 * As in ProcessFsyncRequests, we don't want to stop absorbing fsync
+		 * requests for along time when there are many deletions to be done.  We
+		 * can safelycall AbsorbFsyncRequests() at this point in the loop
+		 * (note it might try to delete list entries).
+		 */
+		if (--absorb_counter <= 0)
+		{
+			AbsorbFsyncRequests();
+			absorb_counter = UNLINKS_PER_ABSORB;
+		}
 	}
 }
 
 /*
- *	smgrsync() -- Sync files to disk during checkpoint.
+ *	ProcessFsyncRequests() -- Process queued fsync requests.
  */
 void
-smgrsync(void)
+ProcessFsyncRequests(void)
 {
-	int			i;
+	static bool sync_in_progress = false;
 
-	for (i = 0; i < NSmgr; i++)
+	HASH_SEQ_STATUS hstat;
+	PendingFsyncEntry *entry;
+	int			absorb_counter;
+
+	/* Statistics on sync times */
+	int			processed = 0;
+	instr_time	sync_start,
+				sync_end,
+				sync_diff;
+	uint64		elapsed;
+	uint64		longest = 0;
+	uint64		total_elapsed = 0;
+
+	/*
+	 * This is only called during checkpoints, and checkpoints should only
+	 * occur in processes that have created a pendingOps.
+	 */
+	if (!pendingOps)
+		elog(ERROR, "cannot sync without a pendingOps table");
+
+	/*
+	 * If we are in the checkpointer, the sync had better include all fsync
+	 * requests that were queued by backends up to this point.  The tightest
+	 * race condition that could occur is that a buffer that must be written
+	 * and fsync'd for the checkpoint could have been dumped by a backend just
+	 * before it was visited by BufferSync().  We know the backend will have
+	 * queued an fsync request before clearing the buffer's dirtybit, so we
+	 * are safe as long as we do an Absorb after completing BufferSync().
+	 */
+	AbsorbFsyncRequests();
+
+	/*
+	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+	 * checkpoint), we want to ignore fsync requests that are entered into the
+	 * hashtable after this point --- they should be processed next time,
+	 * instead.  We use sync_cycle_ctr to tell old entries apart from new
+	 * ones: new ones will have cycle_ctr equal to the incremented value of
+	 * sync_cycle_ctr.
+	 *
+	 * In normal circumstances, all entries present in the table at this point
+	 * will have cycle_ctr exactly equal to the current (about to be old)
+	 * value of sync_cycle_ctr.  However, if we fail partway through the
+	 * fsync'ing loop, then older values of cycle_ctr might remain when we
+	 * come back here to try again.  Repeated checkpoint failures would
+	 * eventually wrap the counter around to the point where an old entry
+	 * might appear new, causing us to skip it, possibly allowing a checkpoint
+	 * to succeed that should not have.  To forestall wraparound, any time the
+	 * previous ProcessFsyncRequests() failed to complete, run through the
+	 * table and forcibly set cycle_ctr = sync_cycle_ctr.
+	 *
+	 * Think not to merge this loop with the main loop, as the problem is
+	 * exactly that that loop may fail before having visited all the entries.
+	 * From a performance point of view it doesn't matter anyway, as this path
+	 * will never be taken in a system that's functioning normally.
+	 */
+	if (sync_in_progress)
 	{
-		if (smgrsw[i].smgr_sync)
-			smgrsw[i].smgr_sync();
+		/* prior try failed, so update any stale cycle_ctr values */
+		hash_seq_init(&hstat, pendingOps);
+		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			entry->cycle_ctr = sync_cycle_ctr;
+		}
 	}
+
+	/* Advance counter so that new hashtable entries are distinguishable */
+	sync_cycle_ctr++;
+
+	/* Set flag to detect failure if we don't reach the end of the loop */
+	sync_in_progress = true;
+
+	/* Now scan the hashtable for fsync requests to process */
+	absorb_counter = FSYNCS_PER_ABSORB;
+	hash_seq_init(&hstat, pendingOps);
+	while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+	{
+		int			failures;
+
+		/*
+		 * If fsync is off then we don't have to bother opening the
+		 * file at all.  (We delay checking until this point so that
+		 * changing fsync on the fly behaves sensibly.)
+		 */
+		if (!enableFsync)
+			continue;
+
+		/*
+		 * If the entry is new then don't process it this time; it might
+		 * contain multiple fsync-request bits, but they are all new.  Note
+		 * "continue" bypasses the hash-remove call at the bottom of the loop.
+		 */
+		if (entry->cycle_ctr == sync_cycle_ctr)
+			continue;
+
+		/* Else assert we haven't missed it */
+		Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
+
+		/*
+		 * If in checkpointer, we want to absorb pending requests
+		 * every so often to prevent overflow of the fsync request
+		 * queue.  It is unspecified whether newly-added entries will
+		 * be visited by hash_seq_search, but we don't care since we
+		 * don't need to process them anyway.
+		 */
+		if (--absorb_counter <= 0)
+		{
+			AbsorbFsyncRequests();
+			absorb_counter = FSYNCS_PER_ABSORB;
+		}
+
+		/*
+		 * The fsync table could contain requests to fsync segments
+		 * that have been deleted (unlinked) by the time we get to
+		 * them. Rather than just hoping an ENOENT (or EACCES on
+		 * Windows) error can be ignored, what we do on error is
+		 * absorb pending requests and then retry.  Since mdunlink()
+		 * queues a "cancel" message before actually unlinking, the
+		 * fsync request is guaranteed to be marked canceled after the
+		 * absorb if it really was this case. DROP DATABASE likewise
+		 * has to tell us to forget fsync requests before it starts
+		 * deletions.
+		 */
+		for (failures = 0;; failures++) /* loop exits at "break" */
+		{
+			char	   *path;
+			File		fd;
+			RelFileNodeBackend rnode = {.node = entry->tag.rnode, .backend = InvalidBackendId};
+
+			path = smgrsw[0].smgr_segpath(rnode, MAIN_FORKNUM, entry->tag.segno);
+			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+
+			INSTR_TIME_SET_CURRENT(sync_start);
+			if (fd >= 0 &&
+				   FileSync(fd, WAIT_EVENT_DATA_FILE_SYNC) >= 0)
+			{
+				/* Success; update statistics about sync timing */
+				INSTR_TIME_SET_CURRENT(sync_end);
+				sync_diff = sync_end;
+				INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+				elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+				if (elapsed > longest)
+					longest = elapsed;
+				total_elapsed += elapsed;
+				processed++;
+
+				if (log_checkpoints)
+					elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
+						 processed,
+						 path,
+						 (double) elapsed / 1000);
+
+				pfree(path);
+				break;	/* out of retry loop */
+			}
+
+			/*
+			 * It is possible that the relation has been dropped or
+			 * truncated since the fsync request was entered.
+			 * Therefore, allow ENOENT, but only if we didn't fail
+			 * already on this file.  This applies both for
+			 * smgrgetseg() and for FileSync, since fd.c might have
+			 * closed the file behind our back.
+			 *
+			 * XXX is there any point in allowing more than one retry?
+			 * Don't see one at the moment, but easy to change the
+			 * test here if so.
+			 */
+			if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
+				ereport(data_sync_elevel(ERROR),
+						(errcode_for_file_access(),
+						 errmsg("could not fsync file \"%s\": %m",
+								path)));
+			else
+				ereport(DEBUG1,
+						(errcode_for_file_access(),
+						 errmsg("could not fsync file \"%s\" but retrying: %m",
+								path)));
+
+			pfree(path);
+
+			/*
+			 * Absorb incoming requests and check to see if a cancel
+			 * arrived for this relation fork.
+			 */
+			AbsorbFsyncRequests();
+			absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
+
+			if (entry->canceled)
+				break;
+		}				/* end retry loop */
+
+		/* We are done with this entry, remove it */
+		if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
+			elog(ERROR, "pendingOps corrupted");
+	}	/* end loop over hashtable entries */
+
+	/* Return sync performance metrics for report at checkpoint end */
+	CheckpointStats.ckpt_sync_rels = processed;
+	CheckpointStats.ckpt_longest_sync = longest;
+	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
+	/* Flag successful completion of ProcessFsyncRequests */
+	sync_in_progress = false;
 }
 
 /*
- *	smgrpostckpt() -- Post-checkpoint cleanup.
+ * RememberFsyncRequest() -- callback from checkpointer side of fsync request
+ *
+ * We stuff fsync requests into the local hash table for execution
+ * during the checkpointer's next checkpoint.  UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
+ *
+ * The range of possible segment numbers is way less than the range of
+ * SegmentNumber, so we can reserve high values of segno for special purposes.
+ * We define three:
+ * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
+ *	 either for one fork, or all forks if forknum is InvalidForkNumber
+ * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
+ * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
+ *	 checkpoint.
+ * Note also that we're assuming real segment numbers don't exceed INT_MAX.
+ *
+ * (Handling FORGET_DATABASE_FSYNC requests is a tad slow because the hash
+ * table has to be searched linearly, but dropping a database is a pretty
+ * heavyweight operation anyhow, so we'll live with it.)
  */
 void
-smgrpostckpt(void)
+RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, SegmentNumber segno)
 {
+	Assert(pendingOps);
+
+	if (segno == FORGET_RELATION_FSYNC)
+	{
+		/* Remove any pending requests for the relation (one or all forks) */
+		HASH_SEQ_STATUS hstat;
+		PendingFsyncEntry *entry;
+
+		/* Remove fsync requests */
+		hash_seq_init(&hstat, pendingOps);
+		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (entry->tag.rnode.dbNode == rnode.dbNode &&
+				   entry->tag.rnode.relNode == rnode.relNode)
+			{
+				/* Check if we should remove all forks or a specific fork */
+				if (forknum == InvalidForkNumber ||
+					(forknum != InvalidForkNumber &&
+					   entry->tag.forknum == forknum))
+				{
+					entry->canceled = true;
+				}
+			}
+		}
+	}
+	else if (segno == FORGET_DATABASE_FSYNC)
+	{
+		/* Remove any pending requests for the entire database */
+		HASH_SEQ_STATUS hstat;
+		PendingFsyncEntry *entry;
+		ListCell   *cell,
+				   *prev,
+				   *next;
+
+		/* Remove fsync requests */
+		hash_seq_init(&hstat, pendingOps);
+		while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+		{
+			if (entry->tag.rnode.dbNode == rnode.dbNode)
+			{
+				entry->canceled = true;
+			}
+		}
+
+		/* Remove unlink requests */
+		prev = NULL;
+		for (cell = list_head(pendingUnlinks); cell; cell = next)
+		{
+			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
+
+			next = lnext(cell);
+			if (entry->rnode.dbNode == rnode.dbNode)
+			{
+				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
+				pfree(entry);
+			}
+			else
+				prev = cell;
+		}
+	}
+	else if (segno == UNLINK_RELATION_REQUEST)
+	{
+		/* Unlink request: put it in the linked list */
+		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+		PendingUnlinkEntry *entry;
+
+		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
+		Assert(forknum == MAIN_FORKNUM);
+
+		entry = palloc(sizeof(PendingUnlinkEntry));
+		entry->rnode = rnode;
+		entry->cycle_ctr = checkpoint_cycle_ctr;
+
+		pendingUnlinks = lappend(pendingUnlinks, entry);
+
+		MemoryContextSwitchTo(oldcxt);
+	}
+	else
+	{
+		/* Normal case: enter a request to fsync this segment */
+		MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+		PendingFsyncEntry *entry;
+		bool		found;
+		FsyncTag	tag = {.rnode = rnode, .forknum = forknum, .segno = segno};
+
+		entry = (PendingFsyncEntry *) hash_search(pendingOps,
+													  &tag,
+													  HASH_ENTER,
+													  &found);
+		/* if new entry, initialize it */
+		if (!found)
+		{
+			entry->cycle_ctr = sync_cycle_ctr;
+			entry->canceled = false;
+		}
+
+		/*
+		 * NB: it's intentional that we don't change cycle_ctr if the entry
+		 * already exists.  The cycle_ctr must represent the oldest fsync
+		 * request that could be in the entry.
+		 */
+
+		MemoryContextSwitchTo(oldcxt);
+	}
+}
+
+/*
+ * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
+ *
+ * forknum == InvalidForkNumber means all forks, although this code doesn't
+ * actually know that, since it's just forwarding the request elsewhere.
+ */
+void
+ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
+{
+	if (pendingOps)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/*
+		 * Notify the checkpointer about it.  If we fail to queue the cancel
+		 * message, we have to sleep and try again ... ugly, but hopefully
+		 * won't happen often.
+		 *
+		 * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
+		 * error would leave the no-longer-used file still present on disk,
+		 * which would be bad, so I'm inclined to assume that the checkpointer
+		 * will always empty the queue soon.
+		 */
+		while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+
+		/*
+		 * Note we don't wait for the checkpointer to actually absorb the
+		 * cancel message; see ProcessFsyncRequests() for the implications.
+		 */
+	}
+}
+
+/*
+ * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
+ */
+void
+ForgetDatabaseFsyncRequests(Oid dbid)
+{
+	RelFileNode rnode;
+
+	rnode.dbNode = dbid;
+	rnode.spcNode = 0;
+	rnode.relNode = 0;
+
+	if (pendingOps)
+	{
+		/* standalone backend or startup process: fsync state is local */
+		RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
+	}
+	else if (IsUnderPostmaster)
+	{
+		/* see notes in ForgetRelationFsyncRequests */
+		while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
+									FORGET_DATABASE_FSYNC))
+			pg_usleep(10000L);	/* 10 msec seems a good number */
+	}
+}
+
+
+/*
+ * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
+ * already created the pendingOps during initialization of the startup
+ * process.  Calling this function drops the local pendingOps so that
+ * subsequent requests will be forwarded to checkpointer.
+ */
+void
+SetForwardFsyncRequests(void)
+{
+	/* Perform any pending fsyncs we may have queued up, then drop table */
+	if (pendingOps)
+	{
+		ProcessFsyncRequests();
+		hash_destroy(pendingOps);
+	}
+	pendingOps = NULL;
+
+	/*
+	 * We should not have any pending unlink requests, since mdunlink doesn't
+	 * queue unlink requests when isRedo.
+	 */
+	Assert(pendingUnlinks == NIL);
+}
+
+/*
+ * DropRelationFiles -- drop files of all given relations
+ */
+void
+DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo)
+{
+	SMgrRelation *srels;
 	int			i;
 
-	for (i = 0; i < NSmgr; i++)
+	srels = palloc(sizeof(SMgrRelation) * ndelrels);
+	for (i = 0; i < ndelrels; i++)
 	{
-		if (smgrsw[i].smgr_post_ckpt)
-			smgrsw[i].smgr_post_ckpt();
+		SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
+
+		if (isRedo)
+		{
+			ForkNumber	fork;
+
+			for (fork = 0; fork <= MAX_FORKNUM; fork++)
+				XLogDropRelation(delrels[i], fork);
+		}
+		srels[i] = srel;
 	}
+
+	smgrdounlinkall(srels, ndelrels, isRedo);
+
+	/*
+	 * Call smgrclose() in reverse order as when smgropen() is called.
+	 * This trick enables remove_from_unowned_list() in smgrclose()
+	 * to search the SMgrRelation from the unowned list,
+	 * with O(1) performance.
+	 */
+	for (i = ndelrels - 1; i >= 0; i--)
+		smgrclose(srels[i]);
+	pfree(srels);
 }
 
 /*
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index 53b8f5fe3c..5eae12455b 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -17,6 +17,7 @@
 
 #include "storage/block.h"
 #include "storage/relfilenode.h"
+#include "storage/smgr.h"
 
 
 /* GUC options */
@@ -32,7 +33,7 @@ extern void RequestCheckpoint(int flags);
 extern void CheckpointWriteDelay(int flags, double progress);
 
 extern bool ForwardFsyncRequest(RelFileNode rnode, ForkNumber forknum,
-					BlockNumber segno);
+					SegmentNumber segno);
 extern void AbsorbFsyncRequests(void);
 
 extern Size CheckpointerShmemSize(void);
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 820d08ed4e..9d38708c20 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -18,6 +18,39 @@
 #include "storage/block.h"
 #include "storage/relfilenode.h"
 
+/*
+ * The type used to identify segment numbers.  Generally, segments are an
+ * internal detail of individual storage manager implementations, but since
+ * they appear in various places to allow them to be passed between processes,
+ * it seemed worthwhile to have a typename.
+ */
+typedef uint32 SegmentNumber;
+
+#define InvalidSegmentNumber ((SegmentNumber) 0xFFFFFFFF)
+
+/*
+ * Special values for the segno arg to RememberFsyncRequest.
+ *
+ * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
+ * fsync request from the queue if an identical, subsequent request is found.
+ * See comments there before making changes here.
+ */
+#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
+#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber-1)
+#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
+
+/*
+ * On Windows, we have to interpret EACCES as possibly meaning the same as
+ * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
+ * that's what you get.  Ugh.  This code is designed so that we don't
+ * actually believe these cases are okay without further evidence (namely,
+ * a pending fsync request getting canceled ... see mdsync).
+ */
+#ifndef WIN32
+#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
+#else
+#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
+#endif
 
 /*
  * smgr.c maintains a table of SMgrRelation objects, which are essentially
@@ -105,12 +138,21 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
 extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum,
 			 BlockNumber nblocks);
-extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void smgrpreckpt(void);
-extern void smgrsync(void);
-extern void smgrpostckpt(void);
-extern void AtEOXact_SMgr(void);
+extern void smgrimmedsync(SMgrRelation reln, ForkNumber forknum,
+						   SegmentNumber segno);
+extern bool smgrsync(RelFileNode rnode, ForkNumber forknum,
+					   SegmentNumber segno);
+extern void smgrprecheckpoint(void);
+extern void smgrpostcheckpoint(void);
+extern void ProcessFsyncRequests(void);
+extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
+					 SegmentNumber segno);
+extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
+extern void ForgetDatabaseFsyncRequests(Oid dbid);
+extern void SetForwardFsyncRequests(void);
 
+extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
+extern void AtEOXact_SMgr(void);
 
 /* internals: move me elsewhere -- ay 7/94 */
 
@@ -133,16 +175,10 @@ extern void mdwriteback(SMgrRelation reln, ForkNumber forknum,
 extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum);
 extern void mdtruncate(SMgrRelation reln, ForkNumber forknum,
 		   BlockNumber nblocks);
-extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum);
-extern void mdpreckpt(void);
-extern void mdsync(void);
-extern void mdpostckpt(void);
-
-extern void SetForwardFsyncRequests(void);
-extern void RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum,
-					 BlockNumber segno);
-extern void ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum);
-extern void ForgetDatabaseFsyncRequests(Oid dbid);
-extern void DropRelationFiles(RelFileNode *delrels, int ndelrels, bool isRedo);
+extern void mdimmedsync(SMgrRelation reln, ForkNumber forknum,
+						   SegmentNumber segno);
+extern char* mdrelpath(RelFileNodeBackend rnode, ForkNumber forknum);
+extern char* mdsegpath(RelFileNodeBackend rnode, ForkNumber forknum,
+		   SegmentNumber segno);
 
 #endif							/* SMGR_H */
-- 
2.16.5