diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e900dcc..1cec243 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2454,6 +2454,28 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-checkpoint-sort" xreflabel="checkpoint_sort">
+      <term><varname>checkpoint_sort</varname> (<type>bool</type>)
+      <indexterm>
+       <primary><varname>checkpoint_sort</> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Whether to sort buffers before writing them out to disk on checkpoint.
+        On HDD storage, this setting groups together writes to
+        neighboring pages on disk, thus improving performance by
+        reducing random write activity.
+        This sorting should have limited performance effects on SSD backends
+        as such storage has good random write performance, but it may
+        help with wear-leveling and so be worth keeping anyway.
+        The default is <literal>on</>.
+        This parameter can only be set in the <filename>postgresql.conf</>
+        file or on the server command line.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-checkpoint-warning" xreflabel="checkpoint_warning">
       <term><varname>checkpoint_warning</varname> (<type>integer</type>)
       <indexterm>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index e3941c9..f538698 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -546,6 +546,18 @@
   </para>
 
   <para>
+   When hard-disk drives (HDD) are used for terminal data storage,
+   <xref linkend="guc-checkpoint-sort"> allows pages to be sorted
+   so that neighboring pages on disk will be flushed together by
+   checkpoints, reducing the random write load and improving performance.
+   If solid-state drives (SSD) are used, sorting pages brings little benefit
+   as their random write I/O performance is good: this feature could then
+   be disabled by setting <varname>checkpoint_sort</> to <literal>off</>.
+   It is possible that sorting may help with SSD wear leveling, so it may
+   be kept on that account.
+  </para>
+
+  <para>
    The number of WAL segment files in <filename>pg_xlog</> directory depends on
    <varname>min_wal_size</>, <varname>max_wal_size</> and
    the amount of WAL generated in previous checkpoint cycles. When old log
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 68e33eb..bee38ab 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7995,11 +7995,13 @@ LogCheckpointEnd(bool restartpoint)
 				sync_secs,
 				total_secs,
 				longest_secs,
+				sort_secs,
 				average_secs;
 	int			write_usecs,
 				sync_usecs,
 				total_usecs,
 				longest_usecs,
+				sort_usecs,
 				average_usecs;
 	uint64		average_sync_time;
 
@@ -8030,6 +8032,10 @@ LogCheckpointEnd(bool restartpoint)
 						CheckpointStats.ckpt_end_t,
 						&total_secs, &total_usecs);
 
+	TimestampDifference(CheckpointStats.ckpt_sort_t,
+						CheckpointStats.ckpt_sort_end_t,
+						&sort_secs, &sort_usecs);
+
 	/*
 	 * Timing values returned from CheckpointStats are in microseconds.
 	 * Convert to the second plus microsecond form that TimestampDifference
@@ -8048,8 +8054,8 @@ LogCheckpointEnd(bool restartpoint)
 
 	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
 		 "%d transaction log file(s) added, %d removed, %d recycled; "
-		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
-		 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+		 "sort=%ld.%03d s, write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s;"
+		 " sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
 		 "distance=%d kB, estimate=%d kB",
 		 restartpoint ? "restartpoint" : "checkpoint",
 		 CheckpointStats.ckpt_bufs_written,
@@ -8057,6 +8063,7 @@ LogCheckpointEnd(bool restartpoint)
 		 CheckpointStats.ckpt_segs_added,
 		 CheckpointStats.ckpt_segs_removed,
 		 CheckpointStats.ckpt_segs_recycled,
+		 sort_secs, sort_usecs / 1000,
 		 write_secs, write_usecs / 1000,
 		 sync_secs, sync_usecs / 1000,
 		 total_secs, total_usecs / 1000,
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..ec2436f 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -65,7 +65,8 @@ void
 InitBufferPool(void)
 {
 	bool		foundBufs,
-				foundDescs;
+				foundDescs,
+				foundCpid;
 
 	/* Align descriptors to a cacheline boundary. */
 	BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
@@ -77,10 +78,14 @@ InitBufferPool(void)
 		ShmemInitStruct("Buffer Blocks",
 						NBuffers * (Size) BLCKSZ, &foundBufs);
 
-	if (foundDescs || foundBufs)
+	CheckpointBufferIds = (int *)
+		ShmemInitStruct("Checkpoint BufferIds",
+						NBuffers * sizeof(int), &foundCpid);
+
+	if (foundDescs || foundBufs || foundCpid)
 	{
-		/* both should be present or neither */
-		Assert(foundDescs && foundBufs);
+		/* all should be present or none */
+		Assert(foundDescs && foundBufs && foundCpid);
 		/* note: this path is only taken in EXEC_BACKEND case */
 	}
 	else
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e4b25587..ba5298d 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -80,6 +80,7 @@ bool		zero_damaged_pages = false;
 int			bgwriter_lru_maxpages = 100;
 double		bgwriter_lru_multiplier = 2.0;
 bool		track_io_timing = false;
+bool		checkpoint_sort = true;
 
 /*
  * How many buffers PrefetchBuffer callers should try to stay ahead of their
@@ -95,6 +96,9 @@ static bool IsForInput;
 /* local state for LockBufferForCleanup */
 static volatile BufferDesc *PinCountWaitBuf = NULL;
 
+/* Array of buffer ids of all buffers to checkpoint */
+int * CheckpointBufferIds = NULL;
+
 /*
  * Backend-Private refcount management:
  *
@@ -1561,6 +1565,136 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 	}
 }
 
+/* qsort comparator ordering buffer ids by (relNode, forkNum, blockNum).
+ * No lock is acquired, see comments below.
+ */
+static int bufcmp(const void * pa, const void * pb)
+{
+	BufferDesc
+		*a = GetBufferDescriptor(* (const int *) pa),
+		*b = GetBufferDescriptor(* (const int *) pb);
+
+	/* tag: rnode, forkNum (different files), blockNum
+	 * rnode: { spcNode (ignore: not really needed),
+	 *   dbNode (ignore: this is a directory), relNode }
+	 * spcNode: table space oid, note that there are at least two
+	 * (pg_global and pg_default).
+	 */
+	/* compare relation */
+	if (a->tag.rnode.relNode < b->tag.rnode.relNode)
+		return -1;
+	else if (a->tag.rnode.relNode > b->tag.rnode.relNode)
+		return 1;
+	/* same relation, compare fork */
+	else if (a->tag.forkNum < b->tag.forkNum)
+		return -1;
+	else if (a->tag.forkNum > b->tag.forkNum)
+		return 1;
+	/* same relation/fork, so same segmented "file": compare block numbers,
+	 * which are mapped on different segments depending on the number. */
+	else if (a->tag.blockNum < b->tag.blockNum)
+		return -1;
+	else if (a->tag.blockNum > b->tag.blockNum)
+		return 1;
+	return 0;	/* equal keys (spcNode/dbNode are ignored): must report 0 */
+}
+
+/* Per-tablespace checkpoint progress, built and consumed by BufferSync.
+ * - space: oid of the tablespace
+ * - num_to_write: buffers flagged BM_CHECKPOINT_NEEDED in this tablespace
+ * - num_written: buffers of this tablespace written by the checkpointer
+ * - index: next position to examine in CheckpointBufferIds; each
+ *   tablespace scans the whole array, skipping other tablespaces' buffers
+ */
+typedef struct TableSpaceCheckpointStatus {
+	Oid space;
+	int num_to_write;
+	int num_written;
+	int index;
+} TableSpaceCheckpointStatus;
+
+/* Entry of the hash table, keyed by tablespace oid, that counts the buffers
+ * to checkpoint in each tablespace; used internally in BufferSync.
+ */
+typedef struct TableSpaceCountEntry {
+	Oid space;
+	int count;
+} TableSpaceCountEntry;
+
+/* Return the id of the next buffer to write in the chosen tablespace, or
+ * -1 if its scan is exhausted; *pspace is the round-robin start and result.
+ */
+static int
+NextBufferToWrite(
+	TableSpaceCheckpointStatus *spcStatus, int nb_spaces,
+	int *pspace, int num_to_write, int num_written)
+{
+	int	space = *pspace, buf_id = -1, index;
+
+	/*
+	 * Select a tablespace depending on the current overall progress.
+	 *
+	 * The progress ratio of each unfinished tablespace is compared to
+	 * the overall progress ratio to find one which is not in advance
+	 * (i.e. tablespace ratio <= overall ratio).
+	 *
+	 * Existence: while every tablespace is still active, one is bound to
+	 * exist: with positive buffers to write (t1 & t2) and already written
+	 * buffers (w1 & w2), we have:
+	 *
+	 * If w1/t1 > (w1+w2)/(t1+t2)          # one table space is in advance
+	 *   => w1t1+w1t2 > w1t1+w2t1 => w1t2 > w2t1 => w1t2+w2t2 > w2t1+w2t2
+	 *   => (w1+w2) / (t1+t2) > w2 / t2    # the other one is late
+	 *
+	 * This no longer holds once a tablespace has been retired with fewer
+	 * writes than counted (other processes wrote its buffers), as the
+	 * overall ratio still includes it. So the round robin is limited to
+	 * one full turn, after which the current tablespace is kept.
+	 *
+	 * Precision: the fractions are compared by cross-multiplication
+	 * (w1 / t1 > w / t <=> w1 t > w t1) with 64-bit integers, as 32-bit
+	 * products would overflow above sqrt(2**31) ~ 46340 buffers.
+	 */
+	for (index = 0; index < nb_spaces &&	/* at most one full turn */
+		 /* tablespace written/to_write > overall written/to_write */
+		 (int64) spcStatus[space].num_written * num_to_write >
+		 (int64) num_written * spcStatus[space].num_to_write;
+		 index++)
+		space = (space + 1) % nb_spaces;	/* round robin */
+
+	/*
+	 * Find a valid buffer in the selected tablespace,
+	 * by continuing the tablespace specific buffer scan
+	 * where it was left.
+	 */
+	index = spcStatus[space].index;
+
+	while (index < num_to_write && buf_id == -1)
+	{
+		volatile BufferDesc *bufHdr;
+
+		buf_id = CheckpointBufferIds[index];
+		bufHdr = GetBufferDescriptor(buf_id);
+
+		/* Skip if in another tablespace or not in checkpoint anymore.
+		 * No lock is acquired, see comments below.
+		 */
+		if (spcStatus[space].space != bufHdr->tag.rnode.spcNode ||
+			! (bufHdr->flags & BM_CHECKPOINT_NEEDED))
+		{
+			index ++;
+			buf_id = -1;
+		}
+	}
+
+	/* Update tablespace writing status, will start over at next index */
+	spcStatus[space].index = index+1;
+
+	*pspace = space;
+
+	return buf_id;
+}
+
 /*
  * BufferSync -- Write out all dirty buffers in the pool.
  *
@@ -1574,11 +1708,13 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
 static void
 BufferSync(int flags)
 {
-	int			buf_id;
-	int			num_to_scan;
+	int			buf_id = -1;
 	int			num_to_write;
 	int			num_written;
 	int			mask = BM_DIRTY;
+	HTAB		*spcBuffers;
+	TableSpaceCheckpointStatus *spcStatus = NULL;
+	int         nb_spaces, space;
 
 	/* Make sure we can handle the pin inside SyncOneBuffer */
 	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
@@ -1609,6 +1745,18 @@ BufferSync(int flags)
 	 * certainly need to be written for the next checkpoint attempt, too.
 	 */
 	num_to_write = 0;
+
+	/* initialize oid -> int buffer count hash table */
+	{
+		HASHCTL		ctl;
+
+		MemSet(&ctl, 0, sizeof(HASHCTL));
+		ctl.keysize = sizeof(Oid);
+		ctl.entrysize = sizeof(TableSpaceCountEntry);
+		spcBuffers = hash_create("Number of buffers to write per tablespace",
+								 16, &ctl, HASH_ELEM | HASH_BLOBS);
+	}
+
 	for (buf_id = 0; buf_id < NBuffers; buf_id++)
 	{
 		volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
@@ -1621,32 +1769,107 @@ BufferSync(int flags)
 
 		if ((bufHdr->flags & mask) == mask)
 		{
+			Oid spc;
+			TableSpaceCountEntry * entry;
+			bool found;
+
 			bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+			CheckpointBufferIds[num_to_write] = buf_id;
 			num_to_write++;
+
+			/* keep track of per tablespace buffers */
+			spc = bufHdr->tag.rnode.spcNode;
+			entry = (TableSpaceCountEntry *)
+				hash_search(spcBuffers, (void *) &spc, HASH_ENTER, &found);
+
+			if (found) entry->count++;
+			else entry->count = 1;
 		}
 
 		UnlockBufHdr(bufHdr);
 	}
 
 	if (num_to_write == 0)
+	{
+		hash_destroy(spcBuffers);
 		return;					/* nothing to do */
+	}
 
 	TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
 
+	/* Build checkpoint tablespace buffer status */
+	nb_spaces = hash_get_num_entries(spcBuffers);
+	spcStatus = (TableSpaceCheckpointStatus *)
+		palloc(sizeof(TableSpaceCheckpointStatus) * nb_spaces);
+
+	{
+		int index = 0;
+		HASH_SEQ_STATUS hseq;
+		TableSpaceCountEntry * entry;
+
+		hash_seq_init(&hseq, spcBuffers);
+		while ((entry = (TableSpaceCountEntry *) hash_seq_search(&hseq)))
+		{
+			Assert(index < nb_spaces);
+			spcStatus[index].space = entry->space;
+			spcStatus[index].num_to_write = entry->count;
+			spcStatus[index].num_written = 0;
+			/* should it be randomized? chosen with some criterion? */
+			spcStatus[index].index = 0;
+
+			index ++;
+		}
+	}
+
+	hash_destroy(spcBuffers);
+	spcBuffers = NULL;
+
 	/*
-	 * Loop over all buffers again, and write the ones (still) marked with
-	 * BM_CHECKPOINT_NEEDED.  In this loop, we start at the clock sweep point
-	 * since we might as well dump soon-to-be-recycled buffers first.
+	 * Sort buffer ids to help find sequential writes.
 	 *
-	 * Note that we don't read the buffer alloc count here --- that should be
-	 * left untouched till the next BgBufferSync() call.
+	 * Note: Buffers are not locked in any way during sorting, but that's ok:
+	 * A change in the buffer header is only relevant when it changes the
+	 * buffer's identity. If the identity has changed it'll have been
+	 * written out by BufferAlloc(), so there's no need for checkpointer to
+	 * write it out anymore. The buffer might also get written out by a
+	 * backend or bgwriter, but that's equally harmless.
+	 *
+	 * Marked buffers must not be moved during the checkpoint.
+	 * Also, qsort implementation should be resilient to occasional
+	 * contradictions (cmp(a,b) != -cmp(b,a)) because of possible
+	 * concurrent changes.
 	 */
-	buf_id = StrategySyncStart(NULL, NULL);
-	num_to_scan = NBuffers;
+	CheckpointStats.ckpt_sort_t = GetCurrentTimestamp();
+
+	if (checkpoint_sort)
+	{
+		qsort(CheckpointBufferIds, num_to_write,  sizeof(int),
+				  (int(*)(const void *, const void *)) bufcmp);
+	}
+
+	CheckpointStats.ckpt_sort_end_t = GetCurrentTimestamp();
+
+	/*
+	 * Loop over buffers to write through CheckpointBufferIds,
+	 * and write the ones (still) marked with BM_CHECKPOINT_NEEDED,
+	 * with some round robin over table spaces so as to balance writes,
+	 * so that buffer writes move forward roughly proportionally for each
+	 * tablespace.
+	 *
+	 * Termination: if a tablespace is selected by the inner while loop
+	 * (see argument there), its index is incremented and will eventually
+	 * reach num_to_write, mark this table space scanning as done and
+	 * decrement the number of (active) spaces, which will thus reach 0.
+	 */
+	space = 0;
 	num_written = 0;
-	while (num_to_scan-- > 0)
+
+	while (nb_spaces != 0)
 	{
-		volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
+		volatile BufferDesc *bufHdr;
+		buf_id = NextBufferToWrite(spcStatus, nb_spaces, &space,
+								   num_to_write, num_written);
+		bufHdr = buf_id < 0 ? NULL : GetBufferDescriptor(buf_id);	/* -1: none */
 
 		/*
 		 * We don't need to acquire the lock here, because we're only looking
@@ -1660,39 +1883,45 @@ BufferSync(int flags)
 		 * write the buffer though we didn't need to.  It doesn't seem worth
 		 * guarding against this, though.
 		 */
-		if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
+		if (bufHdr != NULL && bufHdr->flags & BM_CHECKPOINT_NEEDED)
 		{
 			if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
 			{
 				TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
 				BgWriterStats.m_buf_written_checkpoints++;
+				spcStatus[space].num_written++;
 				num_written++;
 
 				/*
-				 * We know there are at most num_to_write buffers with
-				 * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
-				 * num_written reaches num_to_write.
-				 *
-				 * Note that num_written doesn't include buffers written by
-				 * other backends, or by the bgwriter cleaning scan. That
-				 * means that the estimate of how much progress we've made is
-				 * conservative, and also that this test will often fail to
-				 * trigger.  But it seems worth making anyway.
-				 */
-				if (num_written >= num_to_write)
-					break;
-
-				/*
 				 * Sleep to throttle our I/O rate.
 				 */
 				CheckpointWriteDelay(flags, (double) num_written / num_to_write);
 			}
 		}
 
-		if (++buf_id >= NBuffers)
-			buf_id = 0;
+		/*
+		 * Detect checkpoint end for a tablespace: either the scan is done
+		 * or all tablespace buffers have been written out. If so, the last
+		 * active tablespace status is moved in place of the current one,
+		 * and the next round starts on it (or at 0 if it was the last).
+		 * Note: maybe an exchange could be made instead in order to keep
+		 * information about the closed table space, but this is currently
+		 * not used afterwards.
+		 */
+		if (spcStatus[space].index >= num_to_write ||
+			spcStatus[space].num_written >= spcStatus[space].num_to_write)
+		{
+			nb_spaces--;
+			if (space != nb_spaces)
+				spcStatus[space] = spcStatus[nb_spaces];
+			else
+				space = 0;
+		}
 	}
 
+	pfree(spcStatus);
+	spcStatus = NULL;
+
 	/*
 	 * Update checkpoint statistics. As noted above, this doesn't include
 	 * buffers written by other backends or bgwriter scan.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b3dac51..ff95e61 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1013,6 +1013,17 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+
+	{
+		{"checkpoint_sort", PGC_SIGHUP, WAL_CHECKPOINTS,
+		 gettext_noop("Whether disk-page buffers are sorted on checkpoints."),
+		 NULL
+		},
+		&checkpoint_sort,
+		true,
+		NULL, NULL, NULL
+	},
+
 	{
 		{"log_connections", PGC_SU_BACKEND, LOGGING_WHAT,
 			gettext_noop("Logs each successful connection."),
@@ -1798,6 +1809,9 @@ static struct config_int ConfigureNamesInt[] =
 	/*
 	 * We sometimes multiply the number of shared buffers by two without
 	 * checking for overflow, so we mustn't allow more than INT_MAX / 2.
+	 * Also, checkpoint uses a shared-memory int array to store the ids of
+	 * shared buffers for sorting, which results in a SIZE_MAX / sizeof(int)
+	 * limit, that is UINT_MAX / 4 == INT_MAX / 2 as well on a 32-bit system.
 	 */
 	{
 		{"shared_buffers", PGC_POSTMASTER, RESOURCES_MEM,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e5d275d..e84f380 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -201,6 +201,7 @@
 #max_wal_size = 1GB
 #min_wal_size = 80MB
 #checkpoint_completion_target = 0.5	# checkpoint target duration, 0.0 - 1.0
+#checkpoint_sort = on			# sort buffers on checkpoint
 #checkpoint_warning = 30s		# 0 disables
 
 # - Archiving -
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 6dacee2..dbd4757 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -186,6 +186,8 @@ extern bool XLOG_DEBUG;
 typedef struct CheckpointStatsData
 {
 	TimestampTz ckpt_start_t;	/* start of checkpoint */
+	TimestampTz ckpt_sort_t;    /* start buffer sorting */
+	TimestampTz ckpt_sort_end_t;      /* end of sorting */
 	TimestampTz ckpt_write_t;	/* start of flushing buffers */
 	TimestampTz ckpt_sync_t;	/* start of fsyncs */
 	TimestampTz ckpt_sync_end_t;	/* end of fsyncs */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 521ee1c..4cb3a60 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -210,6 +210,8 @@ extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
 /* in localbuf.c */
 extern BufferDesc *LocalBufferDescriptors;
 
+/* in bufmgr.c */
+extern int *CheckpointBufferIds;
 
 /*
  * Internal routines: only called by bufmgr
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ec0a254..c228f39 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -54,6 +54,7 @@ extern int	bgwriter_lru_maxpages;
 extern double bgwriter_lru_multiplier;
 extern bool track_io_timing;
 extern int	target_prefetch_pages;
+extern bool checkpoint_sort;
 
 /* in buf_init.c */
 extern PGDLLIMPORT char *BufferBlocks;
