From 8d9d8109bc94de487a9e0cdb31b4090bd25b5393 Mon Sep 17 00:00:00 2001
From: Bertrand Drouvot <bertranddrouvot.pg@gmail.com>
Date: Fri, 27 Jun 2025 05:55:43 +0000
Subject: [PATCH v1 4/4] switching PGSTAT_KIND_WAIT_EVENT to variable sized

It might be better for PGSTAT_KIND_WAIT_EVENT to be a variable sized stats kind.
That way:

* It would be easier to add custom wait events if we want to
* It would be possible to add new wait events without having to change the
stats file format

It uses the uint32 wait_event_info as the hash key while the hash key is defined
as uint64: that should not be an issue, but this patch casts explicitly anyway.

That said, it might be better to use all 64 bits for the hash key (i.e. not
leave the upper half full of zeroes), possibly giving a better hash
distribution. We could then imagine using something like:

((uint64) wait_event_info) | (((uint64) wait_event_info) << 32)

for the hash key.
---
 .../activity/generate-wait_event_types.pl     |  17 ++-
 src/backend/utils/activity/pgstat.c           |  35 +++--
 src/backend/utils/activity/pgstat_waitevent.c | 123 ++++++++----------
 src/backend/utils/adt/pgstatfuncs.c           |   9 +-
 src/include/pgstat.h                          |  32 +++--
 src/include/utils/pgstat_internal.h           |  22 ++--
 src/include/utils/pgstat_kind.h               |  16 +--
 7 files changed, 129 insertions(+), 125 deletions(-)

diff --git a/src/backend/utils/activity/generate-wait_event_types.pl b/src/backend/utils/activity/generate-wait_event_types.pl
index c18693aa68b..623a7aa5e85 100644
--- a/src/backend/utils/activity/generate-wait_event_types.pl
+++ b/src/backend/utils/activity/generate-wait_event_types.pl
@@ -241,13 +241,22 @@ if ($gen_code)
 
 ';
 
+	my $wait_event_class_shift = 0;
+	my $temp_mask = $wait_event_class_mask;
+	while (($temp_mask & 1) == 0 && $temp_mask != 0)
+	{
+		$wait_event_class_shift++;
+		$temp_mask >>= 1;
+	}
+
 	printf $h $header_comment, 'wait_event_types.h';
 	printf $h "#ifndef WAIT_EVENT_TYPES_H\n";
 	printf $h "#define WAIT_EVENT_TYPES_H\n\n";
 	printf $h "#define WAIT_EVENT_CLASS_MASK   0x%08X\n",
 	  $wait_event_class_mask;
-	printf $h "#define WAIT_EVENT_ID_MASK      0x%08X\n\n",
-	  $wait_event_id_mask;
+	printf $h "#define WAIT_EVENT_ID_MASK      0x%08X\n", $wait_event_id_mask;
+	printf $h "#define WAIT_EVENT_CLASS_SHIFT  %d\n\n",
+	  $wait_event_class_shift;
 	printf $h "#include \"utils/wait_classes.h\"\n\n";
 
 	printf $c $header_comment, 'pgstat_wait_event.c';
@@ -363,6 +372,10 @@ typedef struct DecodedWaitInfo
     d.classId = ((i) & WAIT_EVENT_CLASS_MASK) / (WAIT_EVENT_CLASS_MASK & (-WAIT_EVENT_CLASS_MASK)), \\
     d.eventId = (i) & WAIT_EVENT_ID_MASK
 
+/* To encode a uint32 wait_event_info from classId and eventId (cast first: no signed shift) */
+#define ENCODE_WAIT_EVENT_INFO(classId, eventId) \\
+	((((uint32) (classId)) << WAIT_EVENT_CLASS_SHIFT) | (((uint32) (eventId)) & WAIT_EVENT_ID_MASK))
+
 /* To map wait event classes into the WaitClassTable */
 typedef struct
 {
diff --git a/src/backend/utils/activity/pgstat.c b/src/backend/utils/activity/pgstat.c
index 68c3eb3d894..ca7a1d1b23e 100644
--- a/src/backend/utils/activity/pgstat.c
+++ b/src/backend/utils/activity/pgstat.c
@@ -375,6 +375,23 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE]
 		.reset_timestamp_cb = pgstat_backend_reset_timestamp_cb,
 	},
 
+	[PGSTAT_KIND_WAIT_EVENT] = {
+		.name = "wait_event",
+
+		.fixed_amount = false,
+		.write_to_file = true,
+
+		.accessed_across_databases = true,
+
+		.shared_size = sizeof(PgStatShared_WaitEvent),
+		.shared_data_off = offsetof(PgStatShared_WaitEvent, stats),
+		.shared_data_len = sizeof(((PgStatShared_WaitEvent *) 0)->stats),
+
+		.have_static_pending_cb = pgstat_wait_event_have_pending_cb,
+		.flush_static_cb = pgstat_wait_event_flush_cb,
+		.reset_timestamp_cb = pgstat_wait_event_reset_timestamp_cb,
+	},
+
 	/* stats for fixed-numbered (mostly 1) objects */
 
 	[PGSTAT_KIND_ARCHIVER] = {
@@ -479,24 +496,6 @@ static const PgStat_KindInfo pgstat_kind_builtin_infos[PGSTAT_KIND_BUILTIN_SIZE]
 		.reset_all_cb = pgstat_wal_reset_all_cb,
 		.snapshot_cb = pgstat_wal_snapshot_cb,
 	},
-
-	[PGSTAT_KIND_WAIT_EVENT] = {
-		.name = "wait_event",
-
-		.fixed_amount = true,
-		.write_to_file = true,
-
-		.snapshot_ctl_off = offsetof(PgStat_Snapshot, wait_event),
-		.shared_ctl_off = offsetof(PgStat_ShmemControl, wait_event),
-		.shared_data_off = offsetof(PgStatShared_WaitEvent, stats),
-		.shared_data_len = sizeof(((PgStatShared_WaitEvent *) 0)->stats),
-
-		.flush_static_cb = pgstat_wait_event_flush_cb,
-		.have_static_pending_cb = pgstat_wait_event_have_pending_cb,
-		.init_shmem_cb = pgstat_wait_event_init_shmem_cb,
-		.reset_all_cb = pgstat_wait_event_reset_all_cb,
-		.snapshot_cb = pgstat_wait_event_snapshot_cb,
-	},
 };
 
 /*
diff --git a/src/backend/utils/activity/pgstat_waitevent.c b/src/backend/utils/activity/pgstat_waitevent.c
index a884784e43d..0655e2bdfe2 100644
--- a/src/backend/utils/activity/pgstat_waitevent.c
+++ b/src/backend/utils/activity/pgstat_waitevent.c
@@ -28,11 +28,14 @@ static PgStat_PendingWaitevent PendingWaitEventStats;
  * a pointer to the wait events statistics struct.
  */
 PgStat_WaitEvent *
-pgstat_fetch_stat_wait_event(void)
+pgstat_fetch_stat_wait_event(uint32 wait_event_info)
 {
-	pgstat_snapshot_fixed(PGSTAT_KIND_WAIT_EVENT);
+	PgStat_WaitEvent *wait_event_entry;
 
-	return &pgStatLocal.snapshot.wait_event;
+	wait_event_entry = (PgStat_WaitEvent *) pgstat_fetch_entry(PGSTAT_KIND_WAIT_EVENT,
+															   InvalidOid, (uint64) wait_event_info);
+
+	return wait_event_entry;
 }
 
 /*
@@ -131,95 +134,81 @@ get_wait_event_name_from_index(int index)
 /*
  * Flush out locally pending wait event statistics
  *
- * If no stats have been recorded, this function returns false.
- *
- * If nowait is true, this function returns true if the lock could not be
- * acquired. Otherwise, return false.
+ * Returns true if some statistics could not be flushed due to lock contention.
+ * Only counters that were flushed are cleared; the others stay pending.
  */
 bool
 pgstat_wait_event_flush_cb(bool nowait)
 {
-	PgStatShared_WaitEvent *stats_shmem = &pgStatLocal.shmem->wait_event;
-	int			i;
+	PgStat_EntryRef *entry_ref;
+	bool		could_not_be_flushed = false;
 
 	if (!have_wait_event_stats)
 		return false;
 
-	if (!nowait)
-		LWLockAcquire(&stats_shmem->lock, LW_EXCLUSIVE);
-	else if (!LWLockConditionalAcquire(&stats_shmem->lock, LW_EXCLUSIVE))
-		return true;
-
-	for (i = 0; i < NB_WAITCLASSTABLE_SIZE; i++)
+	for (int classIdx = 0; classIdx < NB_WAITCLASSTABLE_ENTRIES; classIdx++)
 	{
-		PgStat_WaitEvent *sharedent = &stats_shmem->stats;
+		WaitClassTableEntry *class;
+		int			classOffset;
+		int			classSize;
 
-		sharedent->counts[i] += PendingWaitEventStats.counts[i];
-	}
+		/* Skip empty entries */
+		if (WaitClassTable[classIdx].numberOfEvents == 0)
+			continue;
 
-	/* done, clear the pending entry */
-	MemSet(PendingWaitEventStats.counts, 0, sizeof(PendingWaitEventStats.counts));
+		class = &WaitClassTable[classIdx];
 
-	LWLockRelease(&stats_shmem->lock);
+		classOffset = class->offSet;
+		classSize = class->numberOfEvents;
 
-	have_wait_event_stats = false;
+		for (int eventId = 0; eventId < classSize; eventId++)
+		{
+			const char *name;
+			PgStatShared_WaitEvent *shwaiteventent;
+			PgStat_Counter *shstat;
+			PgStat_Counter pending_counter;
+			uint32		wait_event_info;
 
-	return false;
-}
+			name = get_wait_event_name_from_index(classOffset + eventId);
 
-void
-pgstat_wait_event_init_shmem_cb(void *stats)
-{
-	PgStatShared_WaitEvent *stat_shmem = (PgStatShared_WaitEvent *) stats;
+			if (!name)
+				continue;
 
-	LWLockInitialize(&stat_shmem->lock, LWTRANCHE_PGSTATS_DATA);
-}
+			/* Build the wait_event_info */
+			wait_event_info = ENCODE_WAIT_EVENT_INFO(classIdx, eventId);
 
-void
-pgstat_wait_event_reset_all_cb(TimestampTz ts)
-{
-	for (int i = 0; i < NB_WAITCLASSTABLE_SIZE; i++)
-	{
-		LWLock	   *stats_lock = &pgStatLocal.shmem->wait_event.lock;
-		PgStat_Counter *counters = &pgStatLocal.shmem->wait_event.stats.counts[i];
+			entry_ref = pgstat_get_entry_ref_locked(PGSTAT_KIND_WAIT_EVENT,
+													InvalidOid, (uint64) wait_event_info, nowait);
+
+			if (!entry_ref)
+			{
+				could_not_be_flushed = true;
+				continue;
+			}
 
-		LWLockAcquire(stats_lock, LW_EXCLUSIVE);
+			shwaiteventent = (PgStatShared_WaitEvent *) entry_ref->shared_stats;
+			shstat = &shwaiteventent->stats.counts;
+			pending_counter = PendingWaitEventStats.counts[classOffset + eventId];
 
-		/*
-		 * Use the lock in the first wait event to protect the reset timestamp
-		 * as well.
-		 */
-		if (i == 0)
-			pgStatLocal.shmem->wait_event.stats.stat_reset_timestamp = ts;
+			*shstat += pending_counter;
 
-		memset(counters, 0, sizeof(*counters));
-		LWLockRelease(stats_lock);
+			/* Flushed: clear only this counter, unflushed ones are kept. */
+			PendingWaitEventStats.counts[classOffset + eventId] = 0;
+
+			pgstat_unlock_entry(entry_ref);
+		}
 	}
+
+	if (!could_not_be_flushed)
+		have_wait_event_stats = false;
+
+	return could_not_be_flushed;
 }
 
 void
-pgstat_wait_event_snapshot_cb(void)
+pgstat_wait_event_reset_timestamp_cb(PgStatShared_Common *header, TimestampTz ts)
 {
-	for (int i = 0; i < NB_WAITCLASSTABLE_SIZE; i++)
-	{
-		LWLock	   *stats_lock = &pgStatLocal.shmem->wait_event.lock;
-		PgStat_Counter *sh_counters = &pgStatLocal.shmem->wait_event.stats.counts[i];
-		PgStat_Counter *counters_snap = &pgStatLocal.snapshot.wait_event.counts[i];
-
-		LWLockAcquire(stats_lock, LW_SHARED);
-
-		/*
-		 * Use the lock in the first wait event to protect the reset timestamp
-		 * as well.
-		 */
-		if (i == 0)
-			pgStatLocal.snapshot.wait_event.stat_reset_timestamp =
-				pgStatLocal.shmem->wait_event.stats.stat_reset_timestamp;
-
-		/* using struct assignment due to better type safety */
-		*counters_snap = *sh_counters;
-		LWLockRelease(stats_lock);
-	}
+	((PgStatShared_WaitEvent *) header)->stats.stat_reset_timestamp = ts;
 }
 
 /*
diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c
index cfa6aefd95e..5060090ee21 100644
--- a/src/backend/utils/adt/pgstatfuncs.c
+++ b/src/backend/utils/adt/pgstatfuncs.c
@@ -1757,9 +1757,6 @@ pg_stat_get_wait_event(PG_FUNCTION_ARGS)
 
 	InitMaterializedSRF(fcinfo, 0);
 
-	/* request wait event stats from the cumulative stats system */
-	stats = pgstat_fetch_stat_wait_event();
-
 	for (i = 0; i < NB_WAITCLASSTABLE_ENTRIES; i++)
 	{
 		/* for each row */
@@ -1773,15 +1770,19 @@ pg_stat_get_wait_event(PG_FUNCTION_ARGS)
 		for (j = 0; j < numWaitEvents; j++)
 		{
 			const char *name;
 
 			name = get_wait_event_name_from_index(class->offSet + j);
 
 			if (!name)
 				continue;
 
+			stats = pgstat_fetch_stat_wait_event(ENCODE_WAIT_EVENT_INFO(i, j));
+			if (!stats)
+				continue;		/* no stats entry for this wait event */
+
 			values[0] = PointerGetDatum(cstring_to_text(class->className));
 			values[1] = PointerGetDatum(cstring_to_text(name));
-			values[2] = Int64GetDatum(stats->counts[class->offSet + j]);
+			values[2] = Int64GetDatum(stats->counts);
 			values[3] = TimestampTzGetDatum(stats->stat_reset_timestamp);
 
 			tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 4b249fbbb73..f1883e1ec83 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -339,17 +339,6 @@ typedef struct PgStat_IO
 	PgStat_BktypeIO stats[BACKEND_NUM_TYPES];
 } PgStat_IO;
 
-typedef struct PgStat_WaitEvent
-{
-	TimestampTz stat_reset_timestamp;
-	PgStat_Counter counts[NB_WAITCLASSTABLE_SIZE];
-} PgStat_WaitEvent;
-
-typedef struct PgStat_PendingWaitEvent
-{
-	PgStat_Counter counts[NB_WAITCLASSTABLE_SIZE];
-} PgStat_PendingWaitevent;
-
 typedef struct PgStat_StatDBEntry
 {
 	PgStat_Counter xact_commit;
@@ -515,6 +504,25 @@ typedef struct PgStat_BackendPending
 	PgStat_PendingIO pending_io;
 } PgStat_BackendPending;
 
+/* -------
+ * PgStat_WaitEvent		Wait events statistics
+ * -------
+ */
+typedef struct PgStat_WaitEvent
+{
+	TimestampTz stat_reset_timestamp;
+	PgStat_Counter counts;
+} PgStat_WaitEvent;
+
+/* ---------
+ * PgStat_PendingWaitEvent	Non-flushed wait events stats.
+ * ---------
+ */
+typedef struct PgStat_PendingWaitEvent
+{
+	PgStat_Counter counts[NB_WAITCLASSTABLE_SIZE];
+} PgStat_PendingWaitevent;
+
 /*
  * Functions in pgstat.c
  */
@@ -796,7 +804,7 @@ extern void pgstat_execute_transactional_drops(int ndrops, struct xl_xact_stats_
 /*
  * Functions in pgstat_waitevent.c
  */
-extern PgStat_WaitEvent *pgstat_fetch_stat_wait_event(void);
+extern PgStat_WaitEvent *pgstat_fetch_stat_wait_event(uint32 wait_event_info);
 
 /*
  * Functions in pgstat_wal.c
diff --git a/src/include/utils/pgstat_internal.h b/src/include/utils/pgstat_internal.h
index 33dc6e7ae05..4c3ff60b8a9 100644
--- a/src/include/utils/pgstat_internal.h
+++ b/src/include/utils/pgstat_internal.h
@@ -403,14 +403,6 @@ typedef struct PgStatShared_IO
 	PgStat_IO	stats;
 } PgStatShared_IO;
 
-/* Shared-memory ready PgStat_WaitEvent */
-typedef struct PgStatShared_WaitEvent
-{
-	/* lock protects ->stats */
-	LWLock		lock;
-	PgStat_WaitEvent stats;
-} PgStatShared_WaitEvent;
-
 typedef struct PgStatShared_SLRU
 {
 	/* lock protects ->stats */
@@ -471,6 +463,12 @@ typedef struct PgStatShared_Backend
 	PgStat_Backend stats;
 } PgStatShared_Backend;
 
+typedef struct PgStatShared_WaitEvent
+{
+	PgStatShared_Common header;
+	PgStat_WaitEvent stats;
+} PgStatShared_WaitEvent;
+
 /*
  * Central shared memory entry for the cumulative stats system.
  *
@@ -509,7 +507,6 @@ typedef struct PgStat_ShmemControl
 	PgStatShared_IO io;
 	PgStatShared_SLRU slru;
 	PgStatShared_Wal wal;
-	PgStatShared_WaitEvent wait_event;
 
 	/*
 	 * Custom stats data with fixed-numbered objects, indexed by (PgStat_Kind
@@ -544,8 +541,6 @@ typedef struct PgStat_Snapshot
 
 	PgStat_WalStats wal;
 
-	PgStat_WaitEvent wait_event;
-
 	/*
 	 * Data in snapshot for custom fixed-numbered statistics, indexed by
 	 * (PgStat_Kind - PGSTAT_KIND_CUSTOM_MIN).  Each entry is allocated in
@@ -796,9 +791,8 @@ extern PGDLLIMPORT PgStat_LocalState pgStatLocal;
  */
 
 extern bool pgstat_wait_event_flush_cb(bool nowait);
-extern void pgstat_wait_event_init_shmem_cb(void *stats);
-extern void pgstat_wait_event_reset_all_cb(TimestampTz ts);
-extern void pgstat_wait_event_snapshot_cb(void);
+extern void pgstat_wait_event_reset_timestamp_cb(PgStatShared_Common *header,
+												 TimestampTz ts);
 extern bool pgstat_wait_event_have_pending_cb(void);
 
 /*
diff --git a/src/include/utils/pgstat_kind.h b/src/include/utils/pgstat_kind.h
index 848966111c6..90ae7d49325 100644
--- a/src/include/utils/pgstat_kind.h
+++ b/src/include/utils/pgstat_kind.h
@@ -30,18 +30,18 @@
 #define PGSTAT_KIND_REPLSLOT	4	/* per-slot statistics */
 #define PGSTAT_KIND_SUBSCRIPTION	5	/* per-subscription statistics */
 #define PGSTAT_KIND_BACKEND	6	/* per-backend statistics */
+#define PGSTAT_KIND_WAIT_EVENT 7	/* wait events statistics */
 
 /* stats for fixed-numbered objects */
-#define PGSTAT_KIND_ARCHIVER	7
-#define PGSTAT_KIND_BGWRITER	8
-#define PGSTAT_KIND_CHECKPOINTER	9
-#define PGSTAT_KIND_IO	10
-#define PGSTAT_KIND_SLRU	11
-#define PGSTAT_KIND_WAL	12
-#define PGSTAT_KIND_WAIT_EVENT	13
+#define PGSTAT_KIND_ARCHIVER	8
+#define PGSTAT_KIND_BGWRITER	9
+#define PGSTAT_KIND_CHECKPOINTER	10
+#define PGSTAT_KIND_IO	11
+#define PGSTAT_KIND_SLRU	12
+#define PGSTAT_KIND_WAL	13
 
 #define PGSTAT_KIND_BUILTIN_MIN PGSTAT_KIND_DATABASE
-#define PGSTAT_KIND_BUILTIN_MAX PGSTAT_KIND_WAIT_EVENT
+#define PGSTAT_KIND_BUILTIN_MAX PGSTAT_KIND_WAL
 #define PGSTAT_KIND_BUILTIN_SIZE (PGSTAT_KIND_BUILTIN_MAX + 1)
 
 /* Custom stats kinds */
-- 
2.34.1

