diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index e900dcc..1cec243 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2454,6 +2454,28 @@ include_dir 'conf.d'
+
+ checkpoint_sort (bool)
+
+ checkpoint_sort> configuration parameter
+
+
+
+
+ Whether to sort buffers before writing them out to disk on checkpoint.
+ On HDD storage, this setting groups together
+ neighboring pages written to disk, thus improving performance by
+ reducing random write activity.
+ This sorting should have limited performance effects on SSD backends,
+ as such storage has good random write performance, but it may
+ help with wear-leveling and so be worth keeping anyway.
+ The default is on>.
+ This parameter can only be set in the postgresql.conf>
+ file or on the server command line.
+
+
+
+
checkpoint_warning (integer)
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index e3941c9..f538698 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -546,6 +546,18 @@
+ When hard-disk drives (HDD) are used for terminal data storage,
+ the checkpoint_sort setting allows pages to be sorted
+ so that neighboring pages on disk will be flushed together by
+ checkpoints, reducing the random write load and improving performance.
+ If solid-state drives (SSD) are used, sorting pages induces no benefit
+ as their random write I/O performance is good: this feature could then
+ be disabled by setting checkpoint_sort> to off>.
+ It is possible that sorting may help with SSD wear leveling, so it may
+ be kept on that account.
+
+
+
The number of WAL segment files in pg_xlog> directory depends on
min_wal_size>, max_wal_size> and
the amount of WAL generated in previous checkpoint cycles. When old log
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 68e33eb..bee38ab 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7995,11 +7995,13 @@ LogCheckpointEnd(bool restartpoint)
sync_secs,
total_secs,
longest_secs,
+ sort_secs,
average_secs;
int write_usecs,
sync_usecs,
total_usecs,
longest_usecs,
+ sort_usecs,
average_usecs;
uint64 average_sync_time;
@@ -8030,6 +8032,10 @@ LogCheckpointEnd(bool restartpoint)
CheckpointStats.ckpt_end_t,
&total_secs, &total_usecs);
+ TimestampDifference(CheckpointStats.ckpt_sort_t,
+ CheckpointStats.ckpt_sort_end_t,
+ &sort_secs, &sort_usecs);
+
/*
* Timing values returned from CheckpointStats are in microseconds.
* Convert to the second plus microsecond form that TimestampDifference
@@ -8048,8 +8054,8 @@ LogCheckpointEnd(bool restartpoint)
elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
"%d transaction log file(s) added, %d removed, %d recycled; "
- "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
- "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
+ "sort=%ld.%03d s, write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s;"
+ " sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
"distance=%d kB, estimate=%d kB",
restartpoint ? "restartpoint" : "checkpoint",
CheckpointStats.ckpt_bufs_written,
@@ -8057,6 +8063,7 @@ LogCheckpointEnd(bool restartpoint)
CheckpointStats.ckpt_segs_added,
CheckpointStats.ckpt_segs_removed,
CheckpointStats.ckpt_segs_recycled,
+ sort_secs, sort_usecs / 1000,
write_secs, write_usecs / 1000,
sync_secs, sync_usecs / 1000,
total_secs, total_usecs / 1000,
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 3ae2848..3bd5eab 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -65,7 +65,8 @@ void
InitBufferPool(void)
{
bool foundBufs,
- foundDescs;
+ foundDescs,
+ foundCpid;
/* Align descriptors to a cacheline boundary. */
BufferDescriptors = (BufferDescPadded *) CACHELINEALIGN(
@@ -77,10 +78,14 @@ InitBufferPool(void)
ShmemInitStruct("Buffer Blocks",
NBuffers * (Size) BLCKSZ, &foundBufs);
- if (foundDescs || foundBufs)
+ CheckpointBufferIds = (CheckpointSortItem *)
+ ShmemInitStruct("Checkpoint BufferIds",
+ NBuffers * sizeof(CheckpointSortItem), &foundCpid);
+
+ if (foundDescs || foundBufs || foundCpid)
{
- /* both should be present or neither */
- Assert(foundDescs && foundBufs);
+ /* all should be present or none */
+ Assert(foundDescs && foundBufs && foundCpid);
/* note: this path is only taken in EXEC_BACKEND case */
}
else
@@ -144,5 +149,8 @@ BufferShmemSize(void)
/* size of stuff controlled by freelist.c */
size = add_size(size, StrategyShmemSize());
+ /* size of checkpoint sort array in bufmgr.c */
+ size = add_size(size, mul_size(NBuffers, sizeof(CheckpointSortItem)));
+
return size;
}
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index cd3aaad..ca295f1 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -80,6 +80,7 @@ bool zero_damaged_pages = false;
int bgwriter_lru_maxpages = 100;
double bgwriter_lru_multiplier = 2.0;
bool track_io_timing = false;
+bool checkpoint_sort = true;
/*
* How many buffers PrefetchBuffer callers should try to stay ahead of their
@@ -95,6 +96,9 @@ static bool IsForInput;
/* local state for LockBufferForCleanup */
static volatile BufferDesc *PinCountWaitBuf = NULL;
+/* array of buffer ids & sort criterion of all buffers to checkpoint */
+CheckpointSortItem *CheckpointBufferIds = NULL;
+
/*
* Backend-Private refcount management:
*
@@ -1561,6 +1565,129 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
}
}
+/* qsort comparator: order checkpoint buffers by relation, fork, block.
+ * Equal keys (e.g. the same relNode/fork/block in two tablespaces) must
+ * return 0, otherwise cmp(a,b) and cmp(b,a) disagree and qsort is undefined.
+ */
+static int bufcmp(const void * pa, const void * pb)
+{
+    const CheckpointSortItem
+        *a = (const CheckpointSortItem *) pa,
+        *b = (const CheckpointSortItem *) pb;
+
+    /* compare relation */
+    if (a->relNode < b->relNode)
+        return -1;
+    else if (a->relNode > b->relNode)
+        return 1;
+    /* same relation, compare fork */
+    else if (a->forkNum < b->forkNum)
+        return -1;
+    else if (a->forkNum > b->forkNum)
+        return 1;
+    /* same relation/fork, so same segmented "file": compare block numbers */
+    else if (a->blockNum < b->blockNum)
+        return -1;
+    /* 1 when a's block is greater, 0 when the whole key is identical */
+    return a->blockNum > b->blockNum;
+}
+
+/*
+ * Per-tablespace checkpoint progress, used internally in BufferSync
+ * and by its helper NextBufferToWrite.  One entry is built for each
+ * tablespace that had buffers marked for the current checkpoint.
+ */
+
+typedef struct TableSpaceCheckpointStatus
+{
+    Oid     space;          /* oid of the tablespace */
+    int     num_to_write;   /* checkpoint pages counted for this tablespace */
+    int     num_written;    /* pages actually written out so far */
+    int     index;          /* scanning position in CheckpointBufferIds */
+} TableSpaceCheckpointStatus;
+
+/* Entry of the tablespace -> buffer count hash table,
+ * used internally in BufferSync. */
+typedef struct TableSpaceCountEntry
+{
+    Oid     space;      /* hash key: oid of the tablespace */
+    int     count;      /* buffers to checkpoint in that tablespace */
+} TableSpaceCountEntry;
+
+/* Return the next buffer to write, or -1 if the selected tablespace's
+ * scan is exhausted.  Balances buffers over tablespaces; *pspace is the
+ * round-robin position, updated to the tablespace that was selected.
+ */
+static int
+NextBufferToWrite(
+    TableSpaceCheckpointStatus *spcStatus, int nb_spaces,
+    int *pspace, int num_to_write, int num_written)
+{
+    int space = *pspace, buf_id = -1, index, tries;
+
+    /*
+     * Select a tablespace depending on the current overall progress.
+     *
+     * The progress ratio of each unfinished tablespace is compared to
+     * the overall progress ratio to find one which is not in advance
+     * (i.e. tablespace ratio <= overall ratio).
+     *
+     * Such a tablespace need not exist: the overall counters still include
+     * tablespaces already dropped from spcStatus, and one dropped with
+     * unwritten buffers (written by others meanwhile) pulls the overall
+     * ratio below that of every remaining one.  The search is therefore
+     * bounded to one full round, after which the current tablespace is
+     * used, instead of spinning forever.
+     *
+     * The round robin ensures that each space is given some attention
+     * till it is over the current ratio, before going to the next.
+     *
+     * Precision: the cross-multiplied comparison
+     * (w1 / t1 > w / t <=> w1 t > w t1) can overflow 32-bit integers
+     * from about sqrt(2**31) ~ 46340 buffers, hence the int64 casts.
+     */
+    for (tries = 0; tries < nb_spaces; tries++)
+    {
+        /* tablespace written/to_write <= overall written/to_write? */
+        if ((int64) spcStatus[space].num_written * num_to_write <=
+            (int64) num_written * spcStatus[space].num_to_write)
+            break;
+        space = (space + 1) % nb_spaces;    /* round robin */
+    }
+
+    /*
+     * Find a valid buffer in the selected tablespace,
+     * by continuing the tablespace specific buffer scan
+     * where it was left.
+     */
+    index = spcStatus[space].index;
+
+    while (index < num_to_write && buf_id == -1)
+    {
+        volatile BufferDesc *bufHdr;
+
+        buf_id = CheckpointBufferIds[index].buf_id;
+        bufHdr = GetBufferDescriptor(buf_id);
+
+        /* Skip if in another tablespace or not in checkpoint anymore.
+         * No lock is acquired, see comments below.
+         */
+        if (spcStatus[space].space != bufHdr->tag.rnode.spcNode ||
+            ! (bufHdr->flags & BM_CHECKPOINT_NEEDED))
+        {
+            index ++;
+            buf_id = -1;
+        }
+    }
+
+    /* Update tablespace writing status, will start over at next index */
+    spcStatus[space].index = index+1;
+
+    *pspace = space;
+
+    return buf_id;
+}
+
/*
* BufferSync -- Write out all dirty buffers in the pool.
*
@@ -1574,11 +1701,13 @@ UnpinBuffer(volatile BufferDesc *buf, bool fixOwner)
static void
BufferSync(int flags)
{
- int buf_id;
- int num_to_scan;
+ int buf_id = -1;
int num_to_write;
int num_written;
int mask = BM_DIRTY;
+ HTAB *spcBuffers;
+ TableSpaceCheckpointStatus *spcStatus = NULL;
+ int nb_spaces, space;
/* Make sure we can handle the pin inside SyncOneBuffer */
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
@@ -1609,6 +1738,18 @@ BufferSync(int flags)
* certainly need to be written for the next checkpoint attempt, too.
*/
num_to_write = 0;
+
+ /* initialize oid -> int buffer count hash table */
+ {
+ HASHCTL ctl;
+
+ MemSet(&ctl, 0, sizeof(HASHCTL));
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(TableSpaceCountEntry);
+ spcBuffers = hash_create("Number of buffers to write per tablespace",
+ 16, &ctl, HASH_ELEM | HASH_BLOBS);
+ }
+
for (buf_id = 0; buf_id < NBuffers; buf_id++)
{
volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
@@ -1621,32 +1762,111 @@ BufferSync(int flags)
if ((bufHdr->flags & mask) == mask)
{
+ Oid spc;
+ TableSpaceCountEntry * entry;
+ bool found;
+
bufHdr->flags |= BM_CHECKPOINT_NEEDED;
+ CheckpointBufferIds[num_to_write].buf_id = buf_id;
+ CheckpointBufferIds[num_to_write].relNode = bufHdr->tag.rnode.relNode;
+ CheckpointBufferIds[num_to_write].forkNum = bufHdr->tag.forkNum;
+ CheckpointBufferIds[num_to_write].blockNum = bufHdr->tag.blockNum;
num_to_write++;
+
+ /* keep track of per tablespace buffers */
+ spc = bufHdr->tag.rnode.spcNode;
+ entry = (TableSpaceCountEntry *)
+ hash_search(spcBuffers, (void *) &spc, HASH_ENTER, &found);
+
+ if (found) entry->count++;
+ else entry->count = 1;
}
UnlockBufHdr(bufHdr);
}
if (num_to_write == 0)
+ {
+ hash_destroy(spcBuffers);
return; /* nothing to do */
+ }
TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write);
+ /* Build checkpoint tablespace buffer status */
+ nb_spaces = hash_get_num_entries(spcBuffers);
+ spcStatus = (TableSpaceCheckpointStatus *)
+ palloc(sizeof(TableSpaceCheckpointStatus) * nb_spaces);
+
+ {
+ int index = 0;
+ HASH_SEQ_STATUS hseq;
+ TableSpaceCountEntry * entry;
+
+ hash_seq_init(&hseq, spcBuffers);
+ while ((entry = (TableSpaceCountEntry *) hash_seq_search(&hseq)))
+ {
+ Assert(index < nb_spaces);
+ spcStatus[index].space = entry->space;
+ spcStatus[index].num_to_write = entry->count;
+ spcStatus[index].num_written = 0;
+ /* should it be randomized? chosen with some criterion? */
+ spcStatus[index].index = 0;
+
+ index ++;
+ }
+ }
+
+ hash_destroy(spcBuffers);
+ spcBuffers = NULL;
+
/*
- * Loop over all buffers again, and write the ones (still) marked with
- * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point
- * since we might as well dump soon-to-be-recycled buffers first.
+ * Sort buffer ids to help find sequential writes.
*
- * Note that we don't read the buffer alloc count here --- that should be
- * left untouched till the next BgBufferSync() call.
+ * Note: Buffers are not locked in any way during sorting, but that's ok:
+ * A change in the buffer header is only relevant when it changes the
+ * buffer's identity. If the identity has changed it'll have been
+ * written out by BufferAlloc(), so there's no need for checkpointer to
+ * write it out anymore. The buffer might also get written out by a
+ * backend or bgwriter, but that's equally harmless.
+ *
+ * Marked buffers must not be moved during the checkpoint.
+ * Also, qsort implementation should be resilient to occasional
+ * contradictions (cmp(a,b) != -cmp(b,a)) because of possible
+ * concurrent changes.
*/
- buf_id = StrategySyncStart(NULL, NULL);
- num_to_scan = NBuffers;
+ CheckpointStats.ckpt_sort_t = GetCurrentTimestamp();
+
+ if (checkpoint_sort)
+ {
+ qsort(CheckpointBufferIds, num_to_write, sizeof(CheckpointSortItem),
+ bufcmp);
+ }
+
+ CheckpointStats.ckpt_sort_end_t = GetCurrentTimestamp();
+
+ /*
+ * Loop over buffers to write through CheckpointBufferIds,
+ * and write the ones (still) marked with BM_CHECKPOINT_NEEDED,
+ * with some round robin over table spaces so as to balance writes,
+ * so that buffer writes move forward roughly proportionally for each
+ * tablespace.
+ *
+ * Termination: if a tablespace is selected by the inner while loop
+ * (see argument there), its index is incremented and will eventually
+ * reach num_to_write, mark this table space scanning as done and
+ * decrement the number of (active) spaces, which will thus reach 0.
+ */
+ space = 0;
num_written = 0;
- while (num_to_scan-- > 0)
+
+ while (nb_spaces != 0)
{
- volatile BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
+ volatile BufferDesc *bufHdr = NULL;
+ buf_id = NextBufferToWrite(spcStatus, nb_spaces, &space,
+ num_to_write, num_written);
+ if (buf_id != -1)
+ bufHdr = GetBufferDescriptor(buf_id);
/*
* We don't need to acquire the lock here, because we're only looking
@@ -1660,39 +1880,45 @@ BufferSync(int flags)
* write the buffer though we didn't need to. It doesn't seem worth
* guarding against this, though.
*/
- if (bufHdr->flags & BM_CHECKPOINT_NEEDED)
+ if (bufHdr != NULL && bufHdr->flags & BM_CHECKPOINT_NEEDED)
{
if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN)
{
TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
BgWriterStats.m_buf_written_checkpoints++;
+ spcStatus[space].num_written++;
num_written++;
/*
- * We know there are at most num_to_write buffers with
- * BM_CHECKPOINT_NEEDED set; so we can stop scanning if
- * num_written reaches num_to_write.
- *
- * Note that num_written doesn't include buffers written by
- * other backends, or by the bgwriter cleaning scan. That
- * means that the estimate of how much progress we've made is
- * conservative, and also that this test will often fail to
- * trigger. But it seems worth making anyway.
- */
- if (num_written >= num_to_write)
- break;
-
- /*
* Sleep to throttle our I/O rate.
*/
CheckpointWriteDelay(flags, (double) num_written / num_to_write);
}
}
- if (++buf_id >= NBuffers)
- buf_id = 0;
+ /*
+ * Detect checkpoint end for a tablespace: either the scan is done
+ * or all tablespace buffers have been written out. If so, the last
+ * active tablespace status is moved in place of the current one and
+ * the next round will start on this one, or wrap around to the first.
+ * Note: maybe an exchange could be made instead in order to keep
+ * information about the closed tablespace, but this is currently
+ * not used afterwards.
+ */
+ if (spcStatus[space].index >= num_to_write ||
+ spcStatus[space].num_written >= spcStatus[space].num_to_write)
+ {
+ nb_spaces--;
+ if (space != nb_spaces)
+ spcStatus[space] = spcStatus[nb_spaces];
+ else
+ space = 0;
+ }
}
+ pfree(spcStatus);
+ spcStatus = NULL;
+
/*
* Update checkpoint statistics. As noted above, this doesn't include
* buffers written by other backends or bgwriter scan.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b3dac51..cf1e505 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1013,6 +1013,17 @@ static struct config_bool ConfigureNamesBool[] =
false,
NULL, NULL, NULL
},
+
+ {
+ {"checkpoint_sort", PGC_SIGHUP, WAL_CHECKPOINTS,
+ gettext_noop("Whether disk-page buffers are sorted on checkpoints."),
+ NULL
+ },
+ &checkpoint_sort,
+ true,
+ NULL, NULL, NULL
+ },
+
{
{"log_connections", PGC_SU_BACKEND, LOGGING_WHAT,
gettext_noop("Logs each successful connection."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index e5d275d..e84f380 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -201,6 +201,7 @@
#max_wal_size = 1GB
#min_wal_size = 80MB
#checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0
+#checkpoint_sort = on # sort buffers on checkpoint
#checkpoint_warning = 30s # 0 disables
# - Archiving -
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 6dacee2..dbd4757 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -186,6 +186,8 @@ extern bool XLOG_DEBUG;
typedef struct CheckpointStatsData
{
TimestampTz ckpt_start_t; /* start of checkpoint */
+ TimestampTz ckpt_sort_t; /* start buffer sorting */
+ TimestampTz ckpt_sort_end_t; /* end of sorting */
TimestampTz ckpt_write_t; /* start of flushing buffers */
TimestampTz ckpt_sync_t; /* start of fsyncs */
TimestampTz ckpt_sync_end_t; /* end of fsyncs */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 521ee1c..7fde0dc 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -210,6 +210,22 @@ extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;
+/* in bufmgr.c */
+
+/*
+ * Sort entry for one buffer to checkpoint: the buffer id plus the
+ * relation/fork/block key used to order writes per file.
+ * XXX the key could perhaps be compacted to save memory and compare faster.
+ */
+typedef struct CheckpointSortItem
+{
+    int         buf_id;     /* id of the buffer to write */
+    Oid         relNode;    /* relation, copied from the buffer tag */
+    ForkNumber  forkNum;    /* fork; only 4 values */
+    BlockNumber blockNum;   /* block number, copied from the buffer tag */
+} CheckpointSortItem;
+
+extern CheckpointSortItem *CheckpointBufferIds;
/*
* Internal routines: only called by bufmgr
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ec0a254..c228f39 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -54,6 +54,7 @@ extern int bgwriter_lru_maxpages;
extern double bgwriter_lru_multiplier;
extern bool track_io_timing;
extern int target_prefetch_pages;
+extern bool checkpoint_sort;
/* in buf_init.c */
extern PGDLLIMPORT char *BufferBlocks;