From 4362bfbd9972e98c2c369434f9cfa9d51dd7aca0 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Mon, 1 Sep 2025 10:54:35 +0000
Subject: [PATCH v2] Get rid of pgstat_count_backend_io_op*() functions

This commit removes the functions that are incrementing the backend IO stats.
Instead, it now uses a "global" IO stat counters and each IO stats kind (global
and per-backend) compute the diff during their dedicated flush with a previous
copy of the global counters. That's the same idea as for WAL usage.

It's done that way to avoid incrementing the "same" counters twice as it produces
increased overhead in profiles.

Reported-by: Andres Freund <andres@anarazel.de>
---
 src/backend/utils/activity/pgstat.c         |  2 +
 src/backend/utils/activity/pgstat_backend.c | 86 +++++++--------------
 src/backend/utils/activity/pgstat_io.c      | 56 +++++++++-----
 src/include/pgstat.h                        | 35 +++------
 src/include/utils/pgstat_internal.h         |  1 +
 src/tools/pgindent/typedefs.list            |  3 +-
 6 files changed, 84 insertions(+), 99 deletions(-)
  84.8% src/backend/utils/activity/
  14.4% src/include/

diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index ffb5b8cce34..d499d052c54 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -114,6 +114,7 @@
 #include "utils/pgstat_internal.h"
 #include "utils/timestamp.h"
 
+IOUsage		pgIOUsage;
 
 /* ----------
  * Timer definitions.
@@ -440,6 +441,7 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE]
 		.shared_data_off = offsetof(PgStatShared_IO, stats),
 		.shared_data_len = sizeof(((PgStatShared_IO *) 0)->stats),
 
+		.init_backend_cb = pgstat_io_init_backend_cb,
 		.flush_static_cb = pgstat_io_flush_cb,
 		.init_shmem_cb = pgstat_io_init_shmem_cb,
 		.reset_all_cb = pgstat_io_reset_all_cb,
diff --git a/src/backend/utils/activity/pgstat_backend.c b/src/backend/utils/activity/pgstat_backend.c
index 8714a85e2d9..124aee7f6ae 100644
--- a/src/backend/utils/activity/pgstat_backend.c
+++ b/src/backend/utils/activity/pgstat_backend.c
@@ -12,7 +12,7 @@
  * of pgstats.  Entries are created each time a process is spawned, and are
  * dropped when the process exits.  These are not written to the pgstats file
  * on disk.  Pending statistics are managed without direct interactions with
- * PgStat_EntryRef->pending, relying on PendingBackendStats instead so as it
+ * PgStat_EntryRef->pending, relying on global counters instead so as it
  * is possible to report data within critical sections.
  *
  * Copyright (c) 2001-2025, PostgreSQL Global Development Group
@@ -32,12 +32,13 @@
 #include "utils/pgstat_internal.h"
 
 /*
- * Backend statistics counts waiting to be flushed out. These counters may be
- * reported within critical sections so we use static memory in order to avoid
- * memory allocation.
+ * IO counters saved from pgIOUsage at the previous call to
+ * pgstat_report_stat(). This is used to calculate how much IO usage
+ * happens between pgstat_report_stat() calls, by subtracting
+ * the previous counters from the current ones.
  */
-static PgStat_BackendPending PendingBackendStats;
-static bool backend_has_iostats = false;
+static IOUsage prevBackendIOUsage;
+bool		backend_has_iostats = false;
 
 /*
  * WAL usage counters saved from pgWalUsage at the previous call to
@@ -47,44 +48,6 @@ static bool backend_has_iostats = false;
  */
 static WalUsage prevBackendWalUsage;
 
-/*
- * Utility routines to report I/O stats for backends, kept here to avoid
- * exposing PendingBackendStats to the outside world.
- */
-void
-pgstat_count_backend_io_op_time(IOObject io_object, IOContext io_context,
-								IOOp io_op, instr_time io_time)
-{
-	Assert(track_io_timing || track_wal_io_timing);
-
-	if (!pgstat_tracks_backend_bktype(MyBackendType))
-		return;
-
-	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
-
-	INSTR_TIME_ADD(PendingBackendStats.pending_io.pending_times[io_object][io_context][io_op],
-				   io_time);
-
-	backend_has_iostats = true;
-	pgstat_report_fixed = true;
-}
-
-void
-pgstat_count_backend_io_op(IOObject io_object, IOContext io_context,
-						   IOOp io_op, uint32 cnt, uint64 bytes)
-{
-	if (!pgstat_tracks_backend_bktype(MyBackendType))
-		return;
-
-	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
-
-	PendingBackendStats.pending_io.counts[io_object][io_context][io_op] += cnt;
-	PendingBackendStats.pending_io.bytes[io_object][io_context][io_op] += bytes;
-
-	backend_has_iostats = true;
-	pgstat_report_fixed = true;
-}
-
 /*
  * Returns statistics of a backend by proc number.
  */
@@ -166,7 +129,6 @@ pgstat_flush_backend_entry_io(PgStat_EntryRef *entry_ref)
 {
 	PgStatShared_Backend *shbackendent;
 	PgStat_BktypeIO *bktype_shstats;
-	PgStat_PendingIO pending_io;
 
 	/*
 	 * This function can be called even if nothing at all has happened for IO
@@ -178,7 +140,6 @@ pgstat_flush_backend_entry_io(PgStat_EntryRef *entry_ref)
 
 	shbackendent = (PgStatShared_Backend *) entry_ref->shared_stats;
 	bktype_shstats = &shbackendent->stats.io_stats;
-	pending_io = PendingBackendStats.pending_io;
 
 	for (int io_object = 0; io_object < IOOBJECT_NUM_TYPES; io_object++)
 	{
@@ -186,24 +147,31 @@ pgstat_flush_backend_entry_io(PgStat_EntryRef *entry_ref)
 		{
 			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 			{
-				instr_time	time;
+				instr_time	time_diff;
 
 				bktype_shstats->counts[io_object][io_context][io_op] +=
-					pending_io.counts[io_object][io_context][io_op];
+					pgIOUsage.counts[io_object][io_context][io_op] -
+					prevBackendIOUsage.counts[io_object][io_context][io_op];
+
 				bktype_shstats->bytes[io_object][io_context][io_op] +=
-					pending_io.bytes[io_object][io_context][io_op];
-				time = pending_io.pending_times[io_object][io_context][io_op];
+					pgIOUsage.bytes[io_object][io_context][io_op] -
+					prevBackendIOUsage.bytes[io_object][io_context][io_op];
+
+				INSTR_TIME_SET_ZERO(time_diff);
+				INSTR_TIME_ACCUM_DIFF(time_diff,
+									  pgIOUsage.pending_times[io_object][io_context][io_op],
+									  prevBackendIOUsage.pending_times[io_object][io_context][io_op]);
 
 				bktype_shstats->times[io_object][io_context][io_op] +=
-					INSTR_TIME_GET_MICROSEC(time);
+					INSTR_TIME_GET_MICROSEC(time_diff);
 			}
 		}
 	}
 
 	/*
-	 * Clear out the statistics buffer, so it can be re-used.
+	 * Save the current counters for the subsequent calculation of IO usage.
 	 */
-	MemSet(&PendingBackendStats.pending_io, 0, sizeof(PgStat_PendingIO));
+	prevBackendIOUsage = pgIOUsage;
 
 	backend_has_iostats = false;
 }
@@ -334,15 +302,21 @@ pgstat_create_backend(ProcNumber procnum)
 	memset(&shstatent->stats, 0, sizeof(shstatent->stats));
 	pgstat_unlock_entry(entry_ref);
 
-	MemSet(&PendingBackendStats, 0, sizeof(PgStat_BackendPending));
-	backend_has_iostats = false;
-
 	/*
 	 * Initialize prevBackendWalUsage with pgWalUsage so that
 	 * pgstat_backend_flush_cb() can calculate how much pgWalUsage counters
 	 * are increased by subtracting prevBackendWalUsage from pgWalUsage.
 	 */
 	prevBackendWalUsage = pgWalUsage;
+
+	/*
+	 * Initialize prevBackendIOUsage with pgIOUsage so that
+	 * pgstat_backend_flush_cb() can calculate how much IO counters are
+	 * increased by subtracting prevBackendIOUsage from pgIOUsage.
+	 */
+	prevBackendIOUsage = pgIOUsage;
+
+	backend_has_iostats = false;
 }
 
 /*
diff --git a/src/backend/utils/activity/pgstat_io.c b/src/backend/utils/activity/pgstat_io.c
index 13ae57ed649..aa20bd2590e 100644
--- a/src/backend/utils/activity/pgstat_io.c
+++ b/src/backend/utils/activity/pgstat_io.c
@@ -20,9 +20,16 @@
 #include "storage/bufmgr.h"
 #include "utils/pgstat_internal.h"
 
-static PgStat_PendingIO PendingIOStats;
 static bool have_iostats = false;
 
+/*
+ * IO counters saved from pgIOUsage at the previous call to
+ * pgstat_report_stat(). This is used to calculate how much IO usage
+ * happens between pgstat_report_stat() calls, by subtracting
+ * the previous counters from the current ones.
+ */
+static IOUsage prevIOUsage;
+
 /*
  * Check that stats have not been counted for any combination of IOObject,
  * IOContext, and IOOp which are not tracked for the passed-in BackendType. If
@@ -73,13 +80,11 @@ pgstat_count_io_op(IOObject io_object, IOContext io_context, IOOp io_op,
 	Assert(pgstat_is_ioop_tracked_in_bytes(io_op) || bytes == 0);
 	Assert(pgstat_tracks_io_op(MyBackendType, io_object, io_context, io_op));
 
-	PendingIOStats.counts[io_object][io_context][io_op] += cnt;
-	PendingIOStats.bytes[io_object][io_context][io_op] += bytes;
-
-	/* Add the per-backend counts */
-	pgstat_count_backend_io_op(io_object, io_context, io_op, cnt, bytes);
+	pgIOUsage.counts[io_object][io_context][io_op] += cnt;
+	pgIOUsage.bytes[io_object][io_context][io_op] += bytes;
 
 	have_iostats = true;
+	backend_has_iostats = true;
 	pgstat_report_fixed = true;
 }
 
@@ -149,12 +154,8 @@ pgstat_count_io_op_time(IOObject io_object, IOContext io_context, IOOp io_op,
 			}
 		}
 
-		INSTR_TIME_ADD(PendingIOStats.pending_times[io_object][io_context][io_op],
+		INSTR_TIME_ADD(pgIOUsage.pending_times[io_object][io_context][io_op],
 					   io_time);
-
-		/* Add the per-backend count */
-		pgstat_count_backend_io_op_time(io_object, io_context, io_op,
-										io_time);
 	}
 
 	pgstat_count_io_op(io_object, io_context, io_op, cnt, bytes);
@@ -209,27 +210,35 @@ pgstat_io_flush_cb(bool nowait)
 		{
 			for (int io_op = 0; io_op < IOOP_NUM_TYPES; io_op++)
 			{
-				instr_time	time;
+				instr_time	time_diff;
 
 				bktype_shstats->counts[io_object][io_context][io_op] +=
-					PendingIOStats.counts[io_object][io_context][io_op];
+					pgIOUsage.counts[io_object][io_context][io_op] -
+					prevIOUsage.counts[io_object][io_context][io_op];
 
 				bktype_shstats->bytes[io_object][io_context][io_op] +=
-					PendingIOStats.bytes[io_object][io_context][io_op];
+					pgIOUsage.bytes[io_object][io_context][io_op] -
+					prevIOUsage.bytes[io_object][io_context][io_op];
 
-				time = PendingIOStats.pending_times[io_object][io_context][io_op];
+				INSTR_TIME_SET_ZERO(time_diff);
+				INSTR_TIME_ACCUM_DIFF(time_diff,
+									  pgIOUsage.pending_times[io_object][io_context][io_op],
+									  prevIOUsage.pending_times[io_object][io_context][io_op]);
 
 				bktype_shstats->times[io_object][io_context][io_op] +=
-					INSTR_TIME_GET_MICROSEC(time);
+					INSTR_TIME_GET_MICROSEC(time_diff);
 			}
 		}
 	}
 
 	Assert(pgstat_bktype_io_stats_valid(bktype_shstats, MyBackendType));
 
-	LWLockRelease(bktype_lock);
+	/*
+	 * Save the current counters for the subsequent calculation of IO usage.
+	 */
+	prevIOUsage = pgIOUsage;
 
-	memset(&PendingIOStats, 0, sizeof(PendingIOStats));
+	LWLockRelease(bktype_lock);
 
 	have_iostats = false;
 
@@ -274,6 +283,17 @@ pgstat_get_io_object_name(IOObject io_object)
 	pg_unreachable();
 }
 
+void
+pgstat_io_init_backend_cb(void)
+{
+	/*
+	 * Initialize prevIOUsage with pgIOUsage so that pgstat_io_flush_cb() can
+	 * calculate how much pgIOUsage counters are increased by subtracting
+	 * prevIOUsage from pgIOUsage.
+	 */
+	prevIOUsage = pgIOUsage;
+}
+
 void
 pgstat_io_init_shmem_cb(void *stats)
 {
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index f402b17295c..660bb6c6876 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -326,12 +326,12 @@ typedef struct PgStat_BktypeIO
 	PgStat_Counter times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
 } PgStat_BktypeIO;
 
-typedef struct PgStat_PendingIO
+typedef struct IOUsage
 {
 	uint64		bytes[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
 	PgStat_Counter counts[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
 	instr_time	pending_times[IOOBJECT_NUM_TYPES][IOCONTEXT_NUM_TYPES][IOOP_NUM_TYPES];
-} PgStat_PendingIO;
+} IOUsage;
 
 typedef struct PgStat_IO
 {
@@ -492,18 +492,6 @@ typedef struct PgStat_Backend
 	PgStat_WalCounters wal_counters;
 } PgStat_Backend;
 
-/* ---------
- * PgStat_BackendPending	Non-flushed backend stats.
- * ---------
- */
-typedef struct PgStat_BackendPending
-{
-	/*
-	 * Backend statistics store the same amount of IO data as PGSTAT_KIND_IO.
-	 */
-	PgStat_PendingIO pending_io;
-} PgStat_BackendPending;
-
 /*
  * Functions in pgstat.c
  */
@@ -548,15 +536,6 @@ extern PgStat_ArchiverStats *pgstat_fetch_stat_archiver(void);
  * Functions in pgstat_backend.c
  */
 
-/* used by pgstat_io.c for I/O stats tracked in backends */
-extern void pgstat_count_backend_io_op_time(IOObject io_object,
-											IOContext io_context,
-											IOOp io_op,
-											instr_time io_time);
-extern void pgstat_count_backend_io_op(IOObject io_object,
-									   IOContext io_context,
-									   IOOp io_op, uint32 cnt,
-									   uint64 bytes);
 extern PgStat_Backend *pgstat_fetch_stat_backend(ProcNumber procNumber);
 extern PgStat_Backend *pgstat_fetch_stat_backend_by_pid(int pid,
 														BackendType *bktype);
@@ -800,6 +779,10 @@ extern PGDLLIMPORT bool pgstat_track_counts;
 extern PGDLLIMPORT int pgstat_track_functions;
 extern PGDLLIMPORT int pgstat_fetch_consistency;
 
+/*
+ * Variables in pgstat_backend.c
+ */
+extern PGDLLIMPORT bool backend_has_iostats;
 
 /*
  * Variables in pgstat_bgwriter.c
@@ -838,4 +821,10 @@ extern PGDLLIMPORT PgStat_Counter pgStatTransactionIdleTime;
 /* updated by the traffic cop and in errfinish() */
 extern PGDLLIMPORT SessionEndType pgStatSessionEndCause;
 
+/*
+ * Variables in pgstat_io.c
+ */
+
+extern PGDLLIMPORT IOUsage pgIOUsage;
+
 #endif							/* PGSTAT_H */
diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h
index 6cf00008f63..99a94560fcc 100644
--- a/src/include/utils/pgstat_internal.h
+++ b/src/include/utils/pgstat_internal.h
@@ -666,6 +666,7 @@ extern bool pgstat_function_flush_cb(PgStat_EntryRef *entry_ref, bool nowait);
  * Functions in pgstat_io.c
  */
 
+extern void pgstat_io_init_backend_cb(void);
 extern void pgstat_flush_io(bool nowait);
 
 extern bool pgstat_io_flush_cb(bool nowait);
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index a13e8162890..36eadeaf281 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1309,6 +1309,7 @@ InvalMessageArray
 InvalidationInfo
 InvalidationMsgsGroup
 IoMethodOps
+IOUsage
 IpcMemoryId
 IpcMemoryKey
 IpcMemoryState
@@ -2226,7 +2227,6 @@ PgStatShared_Subscription
 PgStatShared_Wal
 PgStat_ArchiverStats
 PgStat_Backend
-PgStat_BackendPending
 PgStat_BackendSubEntry
 PgStat_BgWriterStats
 PgStat_BktypeIO
@@ -2242,7 +2242,6 @@ PgStat_IO
 PgStat_KindInfo
 PgStat_LocalState
 PgStat_PendingDroppedStatsItem
-PgStat_PendingIO
 PgStat_SLRUStats
 PgStat_ShmemControl
 PgStat_Snapshot
-- 
2.34.1

