From ae68cf0a0f256ee09e2e9f779d3e673dccef0486 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 20 Jun 2025 11:24:32 +0000
Subject: [PATCH v1 2/4] Add wait events statistics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adding a new stat kind PGSTAT_KIND_WAIT_EVENT for the wait event statistics.

This new statistic kind is a fixed one because we know the maximum number of wait
events. Indeed:

 * it does not take into account custom wait events as extensions have all they need
 at their disposal to create custom stats on their own wait events should they
 want to (limited by WAIT_EVENT_CUSTOM_HASH_MAX_SIZE though).

 * it does not take into account LWLocks >= LWTRANCHE_FIRST_USER_DEFINED for the same
 reasons as above. That said, there is no maximum limitation in LWLockNewTrancheId().

 * we don't want to allocate memory in some places where the counters are
 incremented (see 4feba03d8b9). We could still use the same implementation as for
 backend statistics (i.e., make use of flush_static_cb) if we really want/need to
 switch to variable stats.

Some notes about the current design/implementation done in this patch:

For the moment only the counters are added (an array of currently 285 counters),
we’ll study/discuss adding the timings once the counters are fully done.

I think we’d have more discussion/debate around the timings (should we add them
by default, add a new GUC, enable them at compilation time?…), that’s why only
the counters are in this patch.

I think it makes sense as the counters have merit on their own.
We currently have 273 wait events but the array is 285 long: the reason is that
some wait event classes have "holes".

For some backend types the wait events stats are not flushed (walwriter for
example), so we need to find additional places to flush the wait events stats.

A few questions:

 * Do we need to serialize the stats based on their names (as for
 PGSTAT_KIND_REPLSLOT)? This question is the same as "is the ordering preserved
 if file stats format is not changed": I think the answer is yes (see
 f98dbdeb51d), which means there is no need for to_serialized_name/from_serialized_name.

 * What if a new wait event is added? We'd need to change the stats file format,
 unless: the wait event stats kind becomes a variable one or we change a bit the
 way fixed stats are written/read to/from the stat file (we could add a new
 field in the PgStat_KindInfo for example).

XXX: Bump stat file format
---
 src/backend/utils/activity/Makefile           |   1 +
 src/backend/utils/activity/meson.build        |   1 +
 src/backend/utils/activity/pgstat.c           |  18 ++
 src/backend/utils/activity/pgstat_waitevent.c | 232 ++++++++++++++++++
 src/backend/utils/activity/wait_event.c       |   3 -
 src/include/pgstat.h                          |  15 ++
 src/include/utils/pgstat_internal.h           |  20 ++
 src/include/utils/pgstat_kind.h               |   3 +-
 src/include/utils/wait_event.h                |  10 +
 src/tools/pgindent/typedefs.list              |   3 +
 10 files changed, 302 insertions(+), 4 deletions(-)
 create mode 100644 src/backend/utils/activity/pgstat_waitevent.c

diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile
index f9849bebc98..e7fc1354c1f 100644
--- a/src/backend/utils/activity/Makefile
+++ b/src/backend/utils/activity/Makefile
@@ -31,6 +31,7 @@ OBJS = \
 	pgstat_shmem.o \
 	pgstat_slru.o \
 	pgstat_subscription.o \
+	pgstat_waitevent.o \
 	pgstat_wal.o \
 	pgstat_xact.o \
 	wait_event.o \
diff --git a/src/backend/utils/activity/meson.build b/src/backend/utils/activity/meson.build
index d8e56b49c24..8b9b4b4bdb2 100644
--- a/src/backend/utils/activity/meson.build
+++ b/src/backend/utils/activity/meson.build
@@ -16,6 +16,7 @@ backend_sources += files(
   'pgstat_shmem.c',
   'pgstat_slru.c',
   'pgstat_subscription.c',
+  'pgstat_waitevent.c',
   'pgstat_wal.c',
   'pgstat_xact.c',
 )
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 8b57845e870..68c3eb3d894 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -479,6 +479,24 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE]
 		.reset_all_cb = pgstat_wal_reset_all_cb,
 		.snapshot_cb = pgstat_wal_snapshot_cb,
 	},
+
+	[PGSTAT_KIND_WAIT_EVENT] = {
+		.name = "wait_event",
+
+		.fixed_amount = true,
+		.write_to_file = true,
+
+		.snapshot_ctl_off = offsetof(PgStat_Snapshot, wait_event),
+		.shared_ctl_off = offsetof(PgStat_ShmemControl, wait_event),
+		.shared_data_off = offsetof(PgStatShared_WaitEvent, stats),
+		.shared_data_len = sizeof(((PgStatShared_WaitEvent *) 0)->stats),
+
+		.flush_static_cb = pgstat_wait_event_flush_cb,
+		.have_static_pending_cb = pgstat_wait_event_have_pending_cb,
+		.init_shmem_cb = pgstat_wait_event_init_shmem_cb,
+		.reset_all_cb = pgstat_wait_event_reset_all_cb,
+		.snapshot_cb = pgstat_wait_event_snapshot_cb,
+	},
 };
 
 /*
diff --git a/src/backend/utils/activity/pgstat_waitevent.c b/src/backend/utils/activity/pgstat_waitevent.c
new file mode 100644
index 00000000000..a884784e43d
--- /dev/null
+++ b/src/backend/utils/activity/pgstat_waitevent.c
@@ -0,0 +1,232 @@
+/* -------------------------------------------------------------------------
+ *
+ * pgstat_waitevent.c
+ *	  Implementation of wait event statistics.
+ *
+ * This file contains the implementation of wait event statistics. It is kept
+ * separate from pgstat.c to enforce the line between the statistics access /
+ * storage implementation and the details about individual types of
+ * statistics.
+ *
+ * Copyright (c) 2001-2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/backend/utils/activity/pgstat_waitevent.c
+ * -------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "utils/pgstat_internal.h"
+
+bool		have_wait_event_stats = false;	/* true while pending counts exist */
+
+static PgStat_PendingWaitevent PendingWaitEventStats;	/* not yet flushed */
+
+/*
+ * Return the wait event snapshot for the SQL-callable pgstat* functions.
+ */
+PgStat_WaitEvent *
+pgstat_fetch_stat_wait_event(void)
+{
+	PgStat_WaitEvent *snap = &pgStatLocal.snapshot.wait_event;
+
+	pgstat_snapshot_fixed(PGSTAT_KIND_WAIT_EVENT);
+	return snap;
+}
+
+/*
+ * Locate the start of classId's counters inside the flat waitEventStats
+ * array, using the class offset recorded in WaitClassTable.  classId is not
+ * range-checked here.
+ */
+static PgStat_Counter *
+waitEventGetClassCounters(int64 *waitEventStats, int classId)
+{
+	return waitEventStats + WaitClassTable[classId].offSet;
+}
+
+/*
+ * Address of the counter for event eventId of class classId within the flat
+ * waitEventStats array: the class's first counter plus eventId.  Both ids are
+ * only checked by assertions.
+ */
+static PgStat_Counter *
+waitEventGetCounter(int64 *waitEventStats, int classId, int eventId)
+{
+	/* class first: the event bound is read from that class's table entry */
+	Assert(0 <= classId && classId < NB_WAITCLASSTABLE_ENTRIES);
+	Assert(0 <= eventId && eventId < WaitClassTable[classId].numberOfEvents);
+
+	/* events of a class are laid out contiguously from the class start */
+	return waitEventGetClassCounters(waitEventStats, classId) + eventId;
+}
+
+/*
+ * Increment the pending counter of the wait event encoded in wait_event_info.
+ * Nothing is counted if both ids are zero or the event is excluded below.
+ */
+void
+waitEventIncrementCounter(uint32 wait_event_info)
+{
+	DecodedWaitInfo waitInfo;
+	PgStat_Counter *counter;
+	uint32		classId;
+	uint16		eventId;
+
+	/* filter on the value passed in, the same one that is decoded below */
+	classId = wait_event_info & WAIT_EVENT_CLASS_MASK;
+	eventId = wait_event_info & WAIT_EVENT_ID_MASK;
+
+	if (classId == 0 && eventId == 0)
+		return;
+
+	/* Don't take into account user defined LWLock in the stats */
+	if (classId == PG_WAIT_LWLOCK && eventId >= LWTRANCHE_FIRST_USER_DEFINED)
+		return;
+
+	/* Don't take into account custom wait event extension in the stats */
+	if (classId == PG_WAIT_EXTENSION && eventId >= WAIT_EVENT_CUSTOM_INITIAL_ID)
+		return;
+
+	/* Don't take into account PG_WAIT_INJECTIONPOINT */
+	if (classId == PG_WAIT_INJECTIONPOINT)
+		return;
+
+	WAIT_EVENT_INFO_DECODE(waitInfo, wait_event_info);
+	counter = waitEventGetCounter(PendingWaitEventStats.counts, waitInfo.classId,
+								  waitInfo.eventId);
+	(*counter)++;
+
+	have_wait_event_stats = true;
+}
+
+/* Name of the wait event whose counter sits at "index" in the flat array. */
+const char *
+get_wait_event_name_from_index(int index)
+{
+	int			c;
+
+	for (c = 0; c < NB_WAITCLASSTABLE_ENTRIES; c++)
+	{
+		int			first = WaitClassTable[c].offSet;
+		int			count = WaitClassTable[c].numberOfEvents;
+
+		/* classes without events own no slots */
+		if (count == 0)
+			continue;
+
+		/* not within [first, first + count): try the next class */
+		if (index < first || index >= first + count)
+			continue;
+
+		return WaitClassTable[c].eventNames[index - first];
+	}
+
+	/* no class owns this index: assertion failure, else "unknown" */
+	Assert(false);
+	return "unknown";
+}
+
+/*
+ * Add the locally pending wait event counts to the shared-memory counters and
+ * clear the pending ones.
+ *
+ * Returns true only when nowait is set and the lock could not be taken
+ * immediately, in which case nothing is flushed.  Returns false otherwise,
+ * including when there was nothing to flush.
+ */
+bool
+pgstat_wait_event_flush_cb(bool nowait)
+{
+	PgStatShared_WaitEvent *shared = &pgStatLocal.shmem->wait_event;
+	PgStat_Counter *dst = shared->stats.counts;
+	PgStat_Counter *src = PendingWaitEventStats.counts;
+
+	if (!have_wait_event_stats)
+		return false;
+
+	if (nowait)
+	{
+		if (!LWLockConditionalAcquire(&shared->lock, LW_EXCLUSIVE))
+			return true;
+	}
+	else
+		LWLockAcquire(&shared->lock, LW_EXCLUSIVE);
+
+	for (int slot = 0; slot < NB_WAITCLASSTABLE_SIZE; slot++)
+		dst[slot] += src[slot];
+
+	/* pending counts are now accounted for in shared memory */
+	MemSet(src, 0, sizeof(PendingWaitEventStats.counts));
+
+	LWLockRelease(&shared->lock);
+
+	have_wait_event_stats = false;
+
+	return false;
+}
+
+/* Initialize the lock of the PgStatShared_WaitEvent that "stats" points to. */
+void
+pgstat_wait_event_init_shmem_cb(void *stats)
+{
+	LWLockInitialize(&((PgStatShared_WaitEvent *) stats)->lock,
+					 LWTRANCHE_PGSTATS_DATA);
+}
+
+/*
+ * Reset all the shared wait event counters and record ts as the reset time.
+ *
+ * There is a single lock for all the counters, so take it once: other lock
+ * holders then never see a partially reset array, and the lock is not cycled
+ * once per counter.
+ */
+void
+pgstat_wait_event_reset_all_cb(TimestampTz ts)
+{
+	PgStatShared_WaitEvent *stats_shmem = &pgStatLocal.shmem->wait_event;
+
+	LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
+
+	/* the lock protects the reset timestamp as well as the counters */
+	stats_shmem->stats.stat_reset_timestamp = ts;
+
+	memset(stats_shmem->stats.counts, 0, sizeof(stats_shmem->stats.counts));
+
+	LWLockRelease(&stats_shmem->lock);
+}
+
+/*
+ * Copy the shared wait event stats into the local snapshot.
+ *
+ * All the counters and the reset timestamp are protected by one lock.  Take
+ * it once in shared mode and copy the whole struct under it: the snapshot is
+ * then consistent with respect to exclusive lock holders (flushes, resets),
+ * which copying one counter per lock acquisition does not guarantee, and the
+ * lock is not cycled once per counter.
+ */
+void
+pgstat_wait_event_snapshot_cb(void)
+{
+	PgStatShared_WaitEvent *stats_shmem = &pgStatLocal.shmem->wait_event;
+	PgStat_WaitEvent *snap = &pgStatLocal.snapshot.wait_event;
+
+	LWLockAcquire(&stats_shmem->lock, LW_SHARED);
+
+	/*
+	 * Struct assignment copies stat_reset_timestamp and counts[] together.
+	 */
+	*snap = stats_shmem->stats;
+
+	LWLockRelease(&stats_shmem->lock);
+}
+
+/*
+ * Check if there are any wait event stats waiting to be flushed.
+ */
+bool
+pgstat_wait_event_have_pending_cb(void)
+{
+	return have_wait_event_stats;
+}
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index eba7d338c1f..613935f22a2 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -87,9 +87,6 @@ typedef struct WaitEventCustomCounterData
 /* pointer to the shared memory */
 static WaitEventCustomCounterData *WaitEventCustomCounter;
 
-/* first event ID of custom wait events */
-#define WAIT_EVENT_CUSTOM_INITIAL_ID	1
-
 static uint32 WaitEventCustomNew(uint32 classId, const char *wait_event_name);
 static const char *GetWaitEventCustomIdentifier(uint32 wait_event_info);
 
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 378f2f2c2ba..4b249fbbb73 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -339,6 +339,17 @@ typedef struct PgStat_IO
 	PgStat_BktypeIO stats[BACKEND_NUM_TYPES];
 } PgStat_IO;
 
+typedef struct PgStat_WaitEvent
+{
+	TimestampTz stat_reset_timestamp;	/* ts given to the last reset */
+	PgStat_Counter counts[NB_WAITCLASSTABLE_SIZE];	/* class offSet + event id */
+} PgStat_WaitEvent;
+
+typedef struct PgStat_PendingWaitEvent
+{
+	PgStat_Counter counts[NB_WAITCLASSTABLE_SIZE];
+} PgStat_PendingWaitevent;	/* NOTE(review): tag "WaitEvent" vs typedef "Waitevent" */
+
 typedef struct PgStat_StatDBEntry
 {
 	PgStat_Counter xact_commit;
@@ -782,6 +793,10 @@ struct xl_xact_stats_item;
 extern int	pgstat_get_transactional_drops(bool isCommit, struct xl_xact_stats_item **items);
 extern void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_item *items, bool is_redo);
 
+/*
+ * Functions in pgstat_waitevent.c
+ */
+extern PgStat_WaitEvent *pgstat_fetch_stat_wait_event(void);
 
 /*
  * Functions in pgstat_wal.c
diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h
index d5557e6e998..33dc6e7ae05 100644
--- a/src/include/utils/pgstat_internal.h
+++ b/src/include/utils/pgstat_internal.h
@@ -403,6 +403,14 @@ typedef struct PgStatShared_IO
 	PgStat_IO	stats;
 } PgStatShared_IO;
 
+/* Shared-memory ready PgStat_WaitEvent */
+typedef struct PgStatShared_WaitEvent
+{
+	/* lock protects ->stats */
+	LWLock		lock;
+	PgStat_WaitEvent stats;
+} PgStatShared_WaitEvent;
+
 typedef struct PgStatShared_SLRU
 {
 	/* lock protects ->stats */
@@ -501,6 +509,7 @@ typedef struct PgStat_ShmemControl
 	PgStatShared_IO io;
 	PgStatShared_SLRU slru;
 	PgStatShared_Wal wal;
+	PgStatShared_WaitEvent wait_event;
 
 	/*
 	 * Custom stats data with fixed-numbered objects, indexed by (PgStat_Kind
@@ -535,6 +544,8 @@ typedef struct PgStat_Snapshot
 
 	PgStat_WalStats wal;
 
+	PgStat_WaitEvent wait_event;
+
 	/*
 	 * Data in snapshot for custom fixed-numbered statistics, indexed by
 	 * (PgStat_Kind - PGSTAT_KIND_CUSTOM_MIN).  Each entry is allocated in
@@ -780,6 +791,15 @@ extern void pgstat_create_transactional(PgStat_Kind kind, Oid dboid, uint64 obji
 
 extern PGDLLIMPORT PgStat_LocalState pgStatLocal;
 
+/*
+ * Functions in pgstat_waitevent.c
+ */
+
+extern bool pgstat_wait_event_flush_cb(bool nowait);
+extern void pgstat_wait_event_init_shmem_cb(void *stats);
+extern void pgstat_wait_event_reset_all_cb(TimestampTz ts);
+extern void pgstat_wait_event_snapshot_cb(void);
+extern bool pgstat_wait_event_have_pending_cb(void);
 
 /*
  * Implementation of inline functions declared above.
diff --git a/src/include/utils/pgstat_kind.h b/src/include/utils/pgstat_kind.h
index f44169fd5a3..848966111c6 100644
--- a/src/include/utils/pgstat_kind.h
+++ b/src/include/utils/pgstat_kind.h
@@ -38,9 +38,10 @@
 #define PGSTAT_KIND_IO	10
 #define PGSTAT_KIND_SLRU	11
 #define PGSTAT_KIND_WAL	12
+#define PGSTAT_KIND_WAIT_EVENT	13
 
 #define PGSTAT_KIND_BUILTIN_MIN PGSTAT_KIND_DATABASE
-#define PGSTAT_KIND_BUILTIN_MAX PGSTAT_KIND_WAL
+#define PGSTAT_KIND_BUILTIN_MAX PGSTAT_KIND_WAIT_EVENT
 #define PGSTAT_KIND_BUILTIN_SIZE (PGSTAT_KIND_BUILTIN_MAX + 1)
 
 /* Custom stats kinds */
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index f5815b4994a..45f2347b91b 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -10,6 +10,8 @@
 #ifndef WAIT_EVENT_H
 #define WAIT_EVENT_H
 
+#include "storage/lwlock.h"
+
 /* enums for wait events */
 #include "utils/wait_event_types.h"
 
@@ -19,9 +21,14 @@ static inline void pgstat_report_wait_start(uint32 wait_event_info);
 static inline void pgstat_report_wait_end(void);
 extern void pgstat_set_wait_event_storage(uint32 *wait_event_info);
 extern void pgstat_reset_wait_event_storage(void);
+extern void waitEventIncrementCounter(uint32 wait_event_info);
+extern const char *get_wait_event_name_from_index(int index);
 
 extern PGDLLIMPORT uint32 *my_wait_event_info;
+extern PGDLLIMPORT bool have_wait_event_stats;
 
+/* first event ID of custom wait events */
+#define WAIT_EVENT_CUSTOM_INITIAL_ID    1
 
 /*
  * Wait Events - Extension, InjectionPoint
@@ -84,6 +91,9 @@ pgstat_report_wait_start(uint32 wait_event_info)
 static inline void
 pgstat_report_wait_end(void)
 {
+	/* Increment the wait event counter */
+	waitEventIncrementCounter(*(volatile uint32 *) my_wait_event_info);
+
 	/* see pgstat_report_wait_start() */
 	*(volatile uint32 *) my_wait_event_info = 0;
 }
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 32d6e718adc..c45fd61098b 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2215,6 +2215,7 @@ PgStatShared_Relation
 PgStatShared_ReplSlot
 PgStatShared_SLRU
 PgStatShared_Subscription
+PgStatShared_WaitEvent
 PgStatShared_Wal
 PgStat_ArchiverStats
 PgStat_Backend
@@ -2235,6 +2236,7 @@ PgStat_KindInfo
 PgStat_LocalState
 PgStat_PendingDroppedStatsItem
 PgStat_PendingIO
+PgStat_PendingWaitevent
 PgStat_SLRUStats
 PgStat_ShmemControl
 PgStat_Snapshot
@@ -2250,6 +2252,7 @@ PgStat_SubXactStatus
 PgStat_TableCounts
 PgStat_TableStatus
 PgStat_TableXactStatus
+PgStat_WaitEvent
 PgStat_WalCounters
 PgStat_WalStats
 PgXmlErrorContext
-- 
2.34.1

