From 47f436d7e423ece33a25adebf4265eac02e575f3 Mon Sep 17 00:00:00 2001
From: Masahiro Ikeda <ikedamsh@oss.nttdata.com>
Date: Fri, 29 Jan 2021 16:41:34 +0900
Subject: [PATCH 1/2] Add statistics related to write/sync wal records.

This patch adds following statistics to pg_stat_wal view
to track WAL I/O activity.

- the total number of times writing/syncing WAL data.
- the total amount of time spent writing/syncing WAL data.

Since to track I/O timing may leads significant overhead,
GUC parameter "track_wal_io_timing" is introduced.
Only if this is on, the I/O timing is measured.

The statistics related to sync are zero when "wal_sync_method"
is "open_datasync" or "open_sync", because it doesn't call each
sync method.

(This requires a catversion bump, as well as an update to
 PGSTAT_FILE_FORMAT_ID)

Author: Masahiro Ikeda
Reviewed-By: Japin Li, Hayato Kuroda, Masahiko Sawada, David Johnston
Discussion: https://postgr.es/m/0509ad67b585a5b86a83d445dfa75392@oss.nttdata.com
---
 doc/src/sgml/config.sgml                      | 21 +++++++
 doc/src/sgml/monitoring.sgml                  | 50 ++++++++++++++++
 doc/src/sgml/wal.sgml                         | 12 +++-
 src/backend/access/transam/xlog.c             | 57 +++++++++++++++++++
 src/backend/catalog/system_views.sql          |  4 ++
 src/backend/postmaster/checkpointer.c         |  2 +-
 src/backend/postmaster/pgstat.c               |  4 ++
 src/backend/postmaster/walwriter.c            |  3 +
 src/backend/utils/adt/pgstatfuncs.c           | 24 +++++++-
 src/backend/utils/misc/guc.c                  |  9 +++
 src/backend/utils/misc/postgresql.conf.sample |  1 +
 src/include/access/xlog.h                     |  1 +
 src/include/catalog/pg_proc.dat               |  6 +-
 src/include/pgstat.h                          | 10 ++++
 src/test/regress/expected/rules.out           |  6 +-
 15 files changed, 199 insertions(+), 11 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5ef1c7ad3c..4bdc341141 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -7418,6 +7418,27 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-track-wal-io-timing" xreflabel="track_wal_io_timing">
+      <term><varname>track_wal_io_timing</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>track_wal_io_timing</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables timing of WAL I/O calls. This parameter is off by default,
+        because it will repeatedly query the operating system for
+        the current time, which may cause significant overhead on some
+        platforms.  You can use the <xref linkend="pgtesttiming"/> tool to
+        measure the overhead of timing on your system.
+        I/O timing information is
+        displayed in <link linkend="monitoring-pg-stat-wal-view">
+        <structname>pg_stat_wal</structname></link>.  Only superusers can
+        change this setting.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-track-functions" xreflabel="track_functions">
       <term><varname>track_functions</varname> (<type>enum</type>)
       <indexterm>
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index c602ee4427..2435f401db 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -3487,6 +3487,56 @@ SELECT pid, wait_event_type, wait_event FROM pg_stat_activity WHERE wait_event i
       </para></entry>
      </row>
 
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wal_write</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of times WAL buffers were written out to disk via 
+       <function>XLogWrite</function>, which nomally called by an 
+       <function>XLogFlush</function> request(see <xref linkend="wal-configuration"/>)
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wal_write_time</structfield> <type>double precision</type>
+      </para>
+      <para>
+       Total amount of time spent writing WAL data to disk, excluding sync time unless 
+       <xref linkend="guc-wal-sync-method"/> is ether <literal>open_datasync</literal> or 
+       <literal>open_sync</literal>. Units are in milliseconds with microsecond resolution.
+       This is zero when <xref linkend="guc-track-wal-io-timing"/> is disabled.
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wal_sync</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Number of times WAL files were synced to disk via 
+       <function>issue_xlog_fsync</function>, which nomally called by an 
+       <function>XLogFlush</function> request(see <xref linkend="wal-configuration"/>),
+       while <xref linkend="guc-wal-sync-method"/> was set to one of the 
+       "sync at commit" options (i.e., <literal>fdatasync</literal>, 
+       <literal>fsync</literal>, or <literal>fsync_writethrough</literal>).
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wal_sync_time</structfield> <type>double precision</type>
+      </para>
+      <para>
+       Total amount of time spent syncing WAL files to disk, in milliseconds with microsecond 
+       resolution. This requires setting <xref linkend="guc-wal-sync-method"/> to one of 
+       the "sync at commit" options (i.e., <literal>fdatasync</literal>, <literal>fsync</literal>,
+       or <literal>fsync_writethrough</literal>).
+       This is zero when <xref linkend="guc-track-wal-io-timing"/> is disabled.
+      </para></entry>
+     </row>
+
      <row>
       <entry role="catalog_table_entry"><para role="column_definition">
        <structfield>stats_reset</structfield> <type>timestamp with time zone</type>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index 66de1ee2f8..984cb5764c 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -663,7 +663,9 @@
    the <acronym>WAL</acronym> buffers in shared memory. If there is no
    space for the new record, <function>XLogInsertRecord</function> will have
    to write (move to kernel cache) a few filled <acronym>WAL</acronym>
-   buffers. This is undesirable because <function>XLogInsertRecord</function>
+   buffers. The number of times it happend is counted as 
+   <literal>wal_buffers_full</literal> in <xref linkend="pg-stat-wal-view"/>.
+   This is undesirable because <function>XLogInsertRecord</function> 
    is used on every database low level modification (for example, row
    insertion) at a time when an exclusive lock is held on affected
    data pages, so the operation needs to be as fast as possible.  What
@@ -672,8 +674,12 @@
    time. Normally, <acronym>WAL</acronym> buffers should be written
    and flushed by an <function>XLogFlush</function> request, which is
    made, for the most part, at transaction commit time to ensure that
-   transaction records are flushed to permanent storage. On systems
-   with high log output, <function>XLogFlush</function> requests might
+   transaction records are flushed to permanent storage. 
+   <function>XLogFlush</function> calls <function>XLogWrite</function> to write 
+   and <function>issue_xlog_fsync</function> to flush them, which are counted as 
+   <literal>wal_write</literal> and <literal>wal_sync</literal> in 
+   <xref linkend="pg-stat-wal-view"/>. On systems with high log output, 
+   <function>XLogFlush</function> requests might
    not occur often enough to prevent <function>XLogInsertRecord</function>
    from having to do writes.  On such systems
    one should increase the number of <acronym>WAL</acronym> buffers by
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index eda322b910..c396ff4090 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -110,6 +110,7 @@ int			CommitDelay = 0;	/* precommit delay in microseconds */
 int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 int			wal_retrieve_retry_interval = 5000;
 int			max_slot_wal_keep_size_mb = -1;
+bool		track_wal_io_timing = false;
 
 #ifdef WAL_DEBUG
 bool		XLOG_DEBUG = false;
@@ -2536,6 +2537,7 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 			Size		nbytes;
 			Size		nleft;
 			int			written;
+			instr_time	start;
 
 			/* OK to write the page(s) */
 			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
@@ -2544,9 +2546,30 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
 			do
 			{
 				errno = 0;
+
+				/* Measure I/O timing to write WAL data */
+				if (track_wal_io_timing)
+					INSTR_TIME_SET_CURRENT(start);
+
 				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
 				written = pg_pwrite(openLogFile, from, nleft, startoffset);
 				pgstat_report_wait_end();
+
+				/*
+				 * Increment the I/O timing and the number of times WAL data
+				 * were written out to disk.
+				 */
+				if (track_wal_io_timing)
+				{
+					instr_time	duration;
+
+					INSTR_TIME_SET_CURRENT(duration);
+					INSTR_TIME_SUBTRACT(duration, start);
+					WalStats.m_wal_write_time = INSTR_TIME_GET_MICROSEC(duration);
+				}
+
+				WalStats.m_wal_write++;
+
 				if (written <= 0)
 				{
 					char		xlogfname[MAXFNAMELEN];
@@ -10545,6 +10568,21 @@ void
 issue_xlog_fsync(int fd, XLogSegNo segno)
 {
 	char	   *msg = NULL;
+	bool		issue_fsync = false;
+	instr_time	start;
+
+	/* Check whether the WAL file was synced to disk right now */
+	if (enableFsync &&
+		(sync_method == SYNC_METHOD_FSYNC ||
+		 sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH ||
+		 sync_method == SYNC_METHOD_FDATASYNC))
+	{
+		/* Measure I/O timing to sync the WAL file */
+		if (track_wal_io_timing)
+			INSTR_TIME_SET_CURRENT(start);
+
+		issue_fsync = true;
+	}
 
 	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
 	switch (sync_method)
@@ -10589,6 +10627,25 @@ issue_xlog_fsync(int fd, XLogSegNo segno)
 	}
 
 	pgstat_report_wait_end();
+
+	/*
+	 * Increment the I/O timing and the number of times WAL files were synced.
+	 *
+	 * Check whether the WAL file was synced to disk right now because
+	 * statistics must be incremented when syncing really occurred.
+	 */
+	if (issue_fsync)
+	{
+		if (track_wal_io_timing)
+		{
+			instr_time	duration;
+
+			INSTR_TIME_SET_CURRENT(duration);
+			INSTR_TIME_SUBTRACT(duration, start);
+			WalStats.m_wal_sync_time = INSTR_TIME_GET_MICROSEC(duration);
+		}
+		WalStats.m_wal_sync++;
+	}
 }
 
 /*
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index fa58afd9d7..b8ace4fc41 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1004,6 +1004,10 @@ CREATE VIEW pg_stat_wal AS
         w.wal_fpi,
         w.wal_bytes,
         w.wal_buffers_full,
+        w.wal_write,
+        w.wal_write_time,
+        w.wal_sync,
+        w.wal_sync_time,
         w.stats_reset
     FROM pg_stat_get_wal() w;
 
diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 54a818bf61..7f0996ce3c 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -504,7 +504,7 @@ CheckpointerMain(void)
 		 */
 		pgstat_send_bgwriter();
 
-		/* Send WAL statistics to the stats collector. */
+		/* Send WAL statistics to the stats collector */
 		pgstat_send_wal();
 
 		/*
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index f75b52719d..987bbd058d 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -6892,6 +6892,10 @@ pgstat_recv_wal(PgStat_MsgWal *msg, int len)
 	walStats.wal_fpi += msg->m_wal_fpi;
 	walStats.wal_bytes += msg->m_wal_bytes;
 	walStats.wal_buffers_full += msg->m_wal_buffers_full;
+	walStats.wal_write += msg->m_wal_write;
+	walStats.wal_write_time += msg->m_wal_write_time;
+	walStats.wal_sync += msg->m_wal_sync;
+	walStats.wal_sync_time += msg->m_wal_sync_time;
 }
 
 /* ----------
diff --git a/src/backend/postmaster/walwriter.c b/src/backend/postmaster/walwriter.c
index 4f1a8e356b..104cba4581 100644
--- a/src/backend/postmaster/walwriter.c
+++ b/src/backend/postmaster/walwriter.c
@@ -253,6 +253,9 @@ WalWriterMain(void)
 		else if (left_till_hibernate > 0)
 			left_till_hibernate--;
 
+		/* Send WAL statistics */
+		pgstat_send_wal();
+
 		/*
 		 * Sleep until we are signaled or WalWriterDelay has elapsed.  If we
 		 * haven't done anything useful for quite some time, lengthen the
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index 62bff52638..7296ef04ff 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1799,7 +1799,7 @@ pg_stat_get_buf_alloc(PG_FUNCTION_ARGS)
 Datum
 pg_stat_get_wal(PG_FUNCTION_ARGS)
 {
-#define PG_STAT_GET_WAL_COLS	5
+#define PG_STAT_GET_WAL_COLS	9
 	TupleDesc	tupdesc;
 	Datum		values[PG_STAT_GET_WAL_COLS];
 	bool		nulls[PG_STAT_GET_WAL_COLS];
@@ -1820,7 +1820,15 @@ pg_stat_get_wal(PG_FUNCTION_ARGS)
 					   NUMERICOID, -1, 0);
 	TupleDescInitEntry(tupdesc, (AttrNumber) 4, "wal_buffers_full",
 					   INT8OID, -1, 0);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 5, "stats_reset",
+	TupleDescInitEntry(tupdesc, (AttrNumber) 5, "wal_write",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 6, "wal_write_time",
+					   FLOAT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 7, "wal_sync",
+					   INT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 8, "wal_sync_time",
+					   FLOAT8OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 9, "stats_reset",
 					   TIMESTAMPTZOID, -1, 0);
 
 	BlessTupleDesc(tupdesc);
@@ -1840,7 +1848,17 @@ pg_stat_get_wal(PG_FUNCTION_ARGS)
 									Int32GetDatum(-1));
 
 	values[3] = Int64GetDatum(wal_stats->wal_buffers_full);
-	values[4] = TimestampTzGetDatum(wal_stats->stat_reset_timestamp);
+	values[4] = Int64GetDatum(wal_stats->wal_write);
+
+	/* convert counter from microsec to millisec for display */
+	values[5] = Float8GetDatum((double) wal_stats->wal_write_time / 1000.0);
+
+	values[6] = Int64GetDatum(wal_stats->wal_sync);
+
+	/* convert counter from microsec to millisec for display */
+	values[7] = Float8GetDatum((double) wal_stats->wal_sync_time / 1000.0);
+
+	values[8] = TimestampTzGetDatum(wal_stats->stat_reset_timestamp);
 
 	/* Returns the record as Datum */
 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8735e36174..9a58922b8f 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -1485,6 +1485,15 @@ static struct config_bool ConfigureNamesBool[] =
 		false,
 		NULL, NULL, NULL
 	},
+	{
+		{"track_wal_io_timing", PGC_SUSET, STATS_COLLECTOR,
+			gettext_noop("Collects timing statistics for WAL I/O activity."),
+			NULL
+		},
+		&track_wal_io_timing,
+		false,
+		NULL, NULL, NULL
+	},
 
 	{
 		{"update_process_title", PGC_SUSET, PROCESS_TITLE,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index bd57e917e1..a4c1d6650c 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -585,6 +585,7 @@
 #track_activities = on
 #track_counts = on
 #track_io_timing = off
+#track_wal_io_timing = off
 #track_functions = none			# none, pl, all
 #track_activity_query_size = 1024	# (change requires restart)
 #stats_temp_directory = 'pg_stat_tmp'
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 75ec1073bd..1e53d9d4ca 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -131,6 +131,7 @@ extern int	recovery_min_apply_delay;
 extern char *PrimaryConnInfo;
 extern char *PrimarySlotName;
 extern bool wal_receiver_create_temp_slot;
+extern bool track_wal_io_timing;
 
 /* indirectly set via GUC system */
 extern TransactionId recoveryTargetXid;
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index ada55e7ad5..6962ffeef2 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5543,9 +5543,9 @@
 { oid => '1136', descr => 'statistics: information about WAL activity',
   proname => 'pg_stat_get_wal', proisstrict => 'f', provolatile => 's',
   proparallel => 'r', prorettype => 'record', proargtypes => '',
-  proallargtypes => '{int8,int8,numeric,int8,timestamptz}',
-  proargmodes => '{o,o,o,o,o}',
-  proargnames => '{wal_records,wal_fpi,wal_bytes,wal_buffers_full,stats_reset}',
+  proallargtypes => '{int8,int8,numeric,int8,int8,float8,int8,float8,timestamptz}',
+  proargmodes => '{o,o,o,o,o,o,o,o,o}',
+  proargnames => '{wal_records,wal_fpi,wal_bytes,wal_buffers_full,wal_write,wal_write_time,wal_sync,wal_sync_time,stats_reset}',
   prosrc => 'pg_stat_get_wal' },
 
 { oid => '2306', descr => 'statistics: information about SLRU caches',
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 724068cf87..000bb14c0b 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -474,6 +474,12 @@ typedef struct PgStat_MsgWal
 	PgStat_Counter m_wal_fpi;
 	uint64		m_wal_bytes;
 	PgStat_Counter m_wal_buffers_full;
+	PgStat_Counter m_wal_write;
+	PgStat_Counter m_wal_write_time;	/* time spend writing wal records in
+										 * micro seconds */
+	PgStat_Counter m_wal_sync;
+	PgStat_Counter m_wal_sync_time; /* time spend syncing wal records in micro
+									 * seconds */
 } PgStat_MsgWal;
 
 /* ----------
@@ -839,6 +845,10 @@ typedef struct PgStat_WalStats
 	PgStat_Counter wal_fpi;
 	uint64		wal_bytes;
 	PgStat_Counter wal_buffers_full;
+	PgStat_Counter wal_write;
+	PgStat_Counter wal_write_time;
+	PgStat_Counter wal_sync;
+	PgStat_Counter wal_sync_time;
 	TimestampTz stat_reset_timestamp;
 } PgStat_WalStats;
 
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 6173473de9..bc3909fd17 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -2158,8 +2158,12 @@ pg_stat_wal| SELECT w.wal_records,
     w.wal_fpi,
     w.wal_bytes,
     w.wal_buffers_full,
+    w.wal_write,
+    w.wal_write_time,
+    w.wal_sync,
+    w.wal_sync_time,
     w.stats_reset
-   FROM pg_stat_get_wal() w(wal_records, wal_fpi, wal_bytes, wal_buffers_full, stats_reset);
+   FROM pg_stat_get_wal() w(wal_records, wal_fpi, wal_bytes, wal_buffers_full, wal_write, wal_write_time, wal_sync, wal_sync_time, stats_reset);
 pg_stat_wal_receiver| SELECT s.pid,
     s.status,
     s.receive_start_lsn,
-- 
2.25.1

