From 3102bd1158182058e61fe8d085d3f09b7807f9c8 Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Mon, 8 Jun 2026 20:29:18 +0000 Subject: [PATCH v2 2/7] wait_event_timing: record per-backend wait event statistics (stats level) Implement wait_event_capture = stats. When enabled, every transition through pgstat_report_wait_start()/pgstat_report_wait_end() records the wait duration and accumulates per-(backend, event) statistics -- count, total and maximum duration, and a 32-bucket log2 duration histogram -- in shared memory. Storage is a per-backend slot array in the main shared memory segment, sized at postmaster start from wait_event_timing_max_tranches. Because each slot lives for the entire life of its backend, the hot path needs no lazy attach and no teardown gating: pgstat_report_wait_start_timing() records a timestamp and the current event, and pgstat_report_wait_end_timing() computes the duration and accumulates. Each backend writes only to its own slot, so no locking is required and the SRF reader is lock-free. The inline gate in pgstat_report_wait_start()/_end() is a single load of wait_event_capture plus a branch, with the bodies kept out-of-line so the many inlined call sites stay compact; while capture is off the hot path adds only that branch. Non-LWLock wait events map to a flat array via a class table generated from wait_event_names.txt by generate-wait_event_types.pl. LWLock events, whose tranche ids are unbounded, use a per-backend open-addressing hash capped by wait_event_timing_max_tranches (PGC_POSTMASTER, default 192). SQL surface: - pg_stat_get_wait_event_timing(pid) and the pg_stat_wait_event_timing view, one row per backend per event with a non-zero count; - pg_wait_event_timing_histogram_buckets, naming the 32 histogram bins. Builds compiled without --enable-wait-event-timing keep the GUC (its check hook rejects any non-off value) and empty-result SQL stubs, so tooling sees a uniform surface. A later commit in the series exposes the per-backend overflow counters (maintained here) and adds the reset functions. --- doc/src/sgml/config.sgml | 62 ++ doc/src/sgml/monitoring.sgml | 268 +++++++ src/backend/catalog/system_views.sql | 62 ++ src/backend/storage/lmgr/proc.c | 5 + src/backend/utils/.gitignore | 1 + src/backend/utils/Makefile | 9 +- src/backend/utils/activity/Makefile | 1 + .../activity/generate-wait_event_types.pl | 179 +++++ src/backend/utils/activity/wait_event.c | 3 +- .../utils/activity/wait_event_timing.c | 740 +++++++++++++++++- src/backend/utils/misc/guc_parameters.dat | 9 + src/backend/utils/misc/postgresql.conf.sample | 1 + src/include/catalog/pg_proc.dat | 10 + src/include/storage/subsystemlist.h | 1 + src/include/utils/.gitignore | 1 + src/include/utils/meson.build | 4 +- src/include/utils/wait_classes.h | 9 + src/include/utils/wait_event.h | 49 ++ src/include/utils/wait_event_timing.h | 145 +++- src/test/regress/expected/rules.out | 16 + .../regress/expected/wait_event_timing.out | 84 ++ .../regress/expected/wait_event_timing_1.out | 85 ++ src/test/regress/parallel_schedule | 4 + src/test/regress/sql/wait_event_timing.sql | 54 ++ src/tools/pginclude/headerscheck | 2 + src/tools/pgindent/typedefs.list | 4 + 26 files changed, 1753 insertions(+), 55 deletions(-) create mode 100644 src/test/regress/expected/wait_event_timing.out create mode 100644 src/test/regress/expected/wait_event_timing_1.out create mode 100644 src/test/regress/sql/wait_event_timing.sql diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 9172a4c5c95..43c9d395cf5 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -9137,6 +9137,68 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; + + wait_event_capture (enum) + + wait_event_capture configuration parameter + + + + + Controls collection of wait event timing instrumentation. Requires + the server to be compiled with + . Possible values are + off (the default) and stats. + + + At stats, the server records per-backend wait + event statistics — counts, total and average durations, and a + log2 duration histogram — visible in the + + pg_stat_wait_event_timing view. + Two clock_gettime()-grade timestamps are taken + around every wait event transition, costing roughly + 40–100 ns each on modern hardware; while + wait_event_capture is off the + hot path adds only a single predictable branch. + + + Only superusers and users with the appropriate SET + privilege can change this setting. Read access to the resulting + statistics is controlled separately by membership in the + pg_read_all_stats + role (which the pg_monitor role inherits). To + delegate the ability to change the setting to a monitoring role, + use GRANT SET ON PARAMETER: + +GRANT SET ON PARAMETER wait_event_capture TO pg_monitor; + + + + + + + wait_event_timing_max_tranches (integer) + + wait_event_timing_max_tranches configuration parameter + + + + + Sets the maximum number of distinct LWLock tranches whose timing is + recorded individually per backend. PostgreSQL maintains a + per-backend hash table mapping each tranche the backend encounters + to its histogram; once the table fills, further tranches are not + individually timed. Sized at server start; this parameter has no + effect on builds compiled without + . The default is + 192; raise it if your installation loads many + extensions that register their own LWLock tranches. This parameter + can only be set at server start. + + + + track_functions (enum) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 12b9ee20d4a..8c4a523f971 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -561,6 +561,15 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + pg_stat_wait_event_timingpg_stat_wait_event_timing + One row per backend per wait event, showing accumulated timing + statistics. See + + pg_stat_wait_event_timing for details. + + + @@ -3830,6 +3839,265 @@ description | Waiting for a newly initialized WAL file to reach durable storage + + <structname>pg_stat_wait_event_timing</structname> + + + pg_stat_wait_event_timing + + + + The pg_stat_wait_event_timing view contains one + row for each combination of backend and wait event that has a non-zero + call count. It shows accumulated timing statistics collected when + is set to stats. Requires the server to be compiled with + . + + + + Statistics are accumulated in each backend's own shared memory and are + reported only for backends that are currently connected, much like + + pg_stat_activity; a backend's rows + disappear when it disconnects, and they are reset to zero if its process + number is later reused by a new backend. To study a short-lived + workload, query this view while that workload is still running rather + than after it finishes. + + + + <structname>pg_stat_wait_event_timing</structname> View + + + + + Column Type + + + Description + + + + + + + + pid integer + + + Process ID of the backend + + + + + + backend_type text + + + Type of the backend (e.g. client backend, + checkpointer, walwriter) + + + + + + procnumber integer + + + Internal slot number (0-based process number) of the backend. + + + + + + wait_event_type text + + + Wait event type (e.g. IO, LWLock, + Timeout) + + + + + + wait_event text + + + Wait event name (e.g. DataFileRead, + WALWrite, PgSleep) + + + + + + calls bigint + + + Number of times this wait event occurred + + + + + + total_time_ms double precision + + + Total time spent in this wait event, in milliseconds + + + + + + avg_time_us double precision + + + Average wait duration, in microseconds + + + + + + max_time_us double precision + + + Maximum single wait duration, in microseconds + + + + + + histogram bigint[] + + + Log2 histogram of wait durations with 32 buckets. Bin edges are + powers of two on the nanosecond axis: bucket 0 covers + [0, 1024) ns, bucket k covers + [2^(k+9), + 2^(k+10)) ns, and the last bucket covers + [2^40, ∞) ns. The boundaries approximate the + decimal-microsecond grid (1024 ns ≈ 1 μs, 2048 ns ≈ + 2 μs, ...); the exact edges are chosen to let the hot path skip + a division by 1000. The + + pg_wait_event_timing_histogram_buckets + view provides the numeric bin edges and human-readable labels for + each index; the canonical join pattern is: + +SELECT w.wait_event, b.label, h.count +FROM pg_stat_wait_event_timing w, + LATERAL unnest(w.histogram) WITH ORDINALITY AS h(count, idx) +JOIN pg_wait_event_timing_histogram_buckets b ON b.bucket_idx = h.idx - 1 +WHERE w.wait_event = 'PgSleep' +ORDER BY b.bucket_idx; + + + + + + +
+
+ + + <structname>pg_wait_event_timing_histogram_buckets</structname> + + + pg_wait_event_timing_histogram_buckets + + + + The pg_wait_event_timing_histogram_buckets + view describes the 32 bins used by the + histogram column of + + pg_stat_wait_event_timing. It always + contains 32 rows in ascending order of + bucket_idx, and is independent of runtime + state; a join against it attaches numeric bin edges and human + labels to any histogram array. Bins are powers of two on the + nanosecond axis: bin 0 covers [0, 1us), each + subsequent bin doubles its lower edge, and the final bin + (bucket_idx = 31) is open-ended at + approximately 1024 seconds. + + + + The 32-bin layout (rather than the more common 16-bin choice for + log-scale histograms) is deliberate: real-world wait-event + distributions have long tails routinely extending past 16 ms + into multi-second territory (slow-disk + DataFileRead, lock contention waits, replication + apply waits, vacuum waits). A 16-bin histogram would collapse all + of those into a single overflow bin, hiding the very signal that + wait-event timing exists to surface. The 32-bin layout keeps the + long tail individually addressable up to about 17 minutes + before the open-ended bin; single waits beyond that belong in + auto_explain + or pg_stat_activity, not a histogram. + + + + <structname>pg_wait_event_timing_histogram_buckets</structname> View + + + + + Column Type + + + Description + + + + + + + + bucket_idx integer + + + Zero-based bin index (0–31). Matches the offset into the + histogram array of + pg_stat_wait_event_timing. + + + + + + lower_ns bigint + + + Inclusive lower edge of this bin in nanoseconds. + + + + + + upper_ns bigint + + + Exclusive upper edge of this bin in nanoseconds, or + NULL for the final bin which extends to + infinity. + + + + + + label text + + + Short human-readable label for the bin (e.g. + <1us, 1-2us, + >=1024s), expressed on the approximate + decimal-microsecond grid the bin edges are aligned to. + + + + +
+
+ <structname>pg_stat_database</structname> diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 6c1c5545cb5..b41661eb5d2 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1568,3 +1568,65 @@ CREATE VIEW pg_aios AS SELECT * FROM pg_get_aios(); REVOKE ALL ON pg_aios FROM PUBLIC; GRANT SELECT ON pg_aios TO pg_read_all_stats; + +-- Taxonomy for the histogram column on pg_stat_wait_event_timing. The +-- histogram array has one entry per bucket, in ascending order. This +-- view names them so callers do not have to memorise the layout; join +-- against it via unnest(histogram) WITH ORDINALITY. +-- +-- WARNING: keep this list in lock-step with WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS +-- and wait_event_timing_bucket() in src/backend/utils/activity/wait_event_timing.c. +-- Bin edges are powers of two in nanoseconds; labels are the approximate +-- decimal-microsecond grid documented in src/include/utils/wait_event_timing.h. +CREATE VIEW pg_wait_event_timing_histogram_buckets AS + SELECT bucket_idx, lower_ns, upper_ns, label + FROM (VALUES + ( 0, 0::bigint, 1024::bigint, '<1us'::text), + ( 1, 1024::bigint, 2048::bigint, '1-2us'), + ( 2, 2048::bigint, 4096::bigint, '2-4us'), + ( 3, 4096::bigint, 8192::bigint, '4-8us'), + ( 4, 8192::bigint, 16384::bigint, '8-16us'), + ( 5, 16384::bigint, 32768::bigint, '16-32us'), + ( 6, 32768::bigint, 65536::bigint, '32-64us'), + ( 7, 65536::bigint, 131072::bigint, '64-128us'), + ( 8, 131072::bigint, 262144::bigint, '128-256us'), + ( 9, 262144::bigint, 524288::bigint, '256-512us'), + (10, 524288::bigint, 1048576::bigint, '512us-1ms'), + (11, 1048576::bigint, 2097152::bigint, '1-2ms'), + (12, 2097152::bigint, 4194304::bigint, '2-4ms'), + (13, 4194304::bigint, 8388608::bigint, '4-8ms'), + (14, 8388608::bigint, 16777216::bigint, '8-16ms'), + (15, 16777216::bigint, 33554432::bigint, '16-32ms'), + (16, 33554432::bigint, 67108864::bigint, '32-64ms'), + (17, 67108864::bigint, 134217728::bigint, '64-128ms'), + (18, 134217728::bigint, 268435456::bigint, '128-256ms'), + (19, 268435456::bigint, 536870912::bigint, '256-512ms'), + (20, 536870912::bigint, 1073741824::bigint, '512ms-1s'), + (21, 1073741824::bigint, 2147483648::bigint, '1-2s'), + (22, 2147483648::bigint, 4294967296::bigint, '2-4s'), + (23, 4294967296::bigint, 8589934592::bigint, '4-8s'), + (24, 8589934592::bigint, 17179869184::bigint, '8-16s'), + (25, 17179869184::bigint, 34359738368::bigint, '16-32s'), + (26, 34359738368::bigint, 68719476736::bigint, '32-64s'), + (27, 68719476736::bigint, 137438953472::bigint, '64-128s'), + (28, 137438953472::bigint, 274877906944::bigint, '128-256s'), + (29, 274877906944::bigint, 549755813888::bigint, '256-512s'), + (30, 549755813888::bigint, 1099511627776::bigint, '512s-1024s'), + (31, 1099511627776::bigint, NULL::bigint, '>=1024s') + ) AS t(bucket_idx, lower_ns, upper_ns, label); + +CREATE VIEW pg_stat_wait_event_timing AS + SELECT + t.pid, + t.backend_type, + t.procnumber, + t.wait_event_type, + t.wait_event, + t.calls, + t.total_time_ms, + t.avg_time_us, + t.max_time_us, + t.histogram + FROM pg_stat_get_wait_event_timing(NULL) t; +REVOKE ALL ON pg_stat_wait_event_timing FROM PUBLIC; +GRANT SELECT ON pg_stat_wait_event_timing TO pg_read_all_stats; diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 7d01c981a1f..3630d69ba49 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -57,6 +57,7 @@ #include "utils/timeout.h" #include "utils/timestamp.h" #include "utils/wait_event.h" +#include "utils/wait_event_timing.h" /* GUC variables */ int DeadlockTimeout = 1000; @@ -541,6 +542,7 @@ InitProcess(void) /* now that we have a proc, report wait events to shared memory */ pgstat_set_wait_event_storage(&MyProc->wait_event_info); + pgstat_set_wait_event_timing_storage(MyProcNumber); /* * We might be reusing a semaphore that belonged to a failed process. So @@ -713,6 +715,7 @@ InitAuxiliaryProcess(void) /* now that we have a proc, report wait events to shared memory */ pgstat_set_wait_event_storage(&MyProc->wait_event_info); + pgstat_set_wait_event_timing_storage(MyProcNumber); /* Check that group locking fields are in a proper initial state. */ Assert(MyProc->lockGroupLeader == NULL); @@ -1039,6 +1042,7 @@ ProcKill(int code, Datum arg) /* See comment above, close to DisownLatch() */ pgstat_reset_wait_event_storage(); + pgstat_reset_wait_event_timing_storage(); MyProc = NULL; MyProcNumber = INVALID_PROC_NUMBER; @@ -1101,6 +1105,7 @@ AuxiliaryProcKill(int code, Datum arg) /* look at the equivalent ProcKill() code for comments */ SwitchBackToLocalLatch(); pgstat_reset_wait_event_storage(); + pgstat_reset_wait_event_timing_storage(); proc = MyProc; MyProc = NULL; diff --git a/src/backend/utils/.gitignore b/src/backend/utils/.gitignore index fa9cfb39693..5051e36d1f0 100644 --- a/src/backend/utils/.gitignore +++ b/src/backend/utils/.gitignore @@ -7,4 +7,5 @@ /errcodes.h /pgstat_wait_event.c /wait_event_funcs_data.c +/wait_event_timing_data.h /wait_event_types.h diff --git a/src/backend/utils/Makefile b/src/backend/utils/Makefile index 81b4a956bda..5c11d8294f0 100644 --- a/src/backend/utils/Makefile +++ b/src/backend/utils/Makefile @@ -43,7 +43,7 @@ generated-header-symlinks: $(top_builddir)/src/include/utils/header-stamp submak submake-adt-headers: $(MAKE) -C adt jsonpath_gram.h -$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h +$(SUBDIRS:%=%-recursive): fmgr-stamp errcodes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h # fmgr-stamp records the last time we ran Gen_fmgrtab.pl. We don't rely on # the timestamps of the individual output files, because the Perl script @@ -60,6 +60,7 @@ guc_tables.inc.c: $(top_srcdir)/src/backend/utils/misc/guc_parameters.dat $(top_ pgstat_wait_event.c: wait_event_types.h wait_event_funcs_data.c: wait_event_types.h +wait_event_timing_data.h: wait_event_types.h wait_event_types.h: $(top_srcdir)/src/backend/utils/activity/wait_event_names.txt $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl $(PERL) $(top_srcdir)/src/backend/utils/activity/generate-wait_event_types.pl --code $< @@ -79,8 +80,8 @@ endif # These generated headers must be symlinked into src/include/. # We use header-stamp to record that we've done this because the symlinks # themselves may appear older than fmgr-stamp. -$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h - cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_types.h; do \ +$(top_builddir)/src/include/utils/header-stamp: fmgr-stamp errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h + cd '$(dir $@)' && for file in fmgroids.h fmgrprotos.h errcodes.h probes.h guc_tables.inc.c pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h wait_event_types.h; do \ rm -f $$file && $(LN_S) "../../../$(subdir)/$$file" . ; \ done touch $@ @@ -99,4 +100,4 @@ uninstall-data: clean: rm -f probes.h probes.h.tmp rm -f fmgroids.h fmgrprotos.h fmgrtab.c fmgr-stamp errcodes.h guc_tables.inc.c - rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c + rm -f wait_event_types.h pgstat_wait_event.c wait_event_funcs_data.c wait_event_timing_data.h diff --git a/src/backend/utils/activity/Makefile b/src/backend/utils/activity/Makefile index 1c824e9b788..cc6e63855f9 100644 --- a/src/backend/utils/activity/Makefile +++ b/src/backend/utils/activity/Makefile @@ -42,5 +42,6 @@ OBJS = \ # Force these dependencies to be known even without dependency info built: wait_event.o: wait_event.c $(top_builddir)/src/backend/utils/pgstat_wait_event.c wait_event_funcs.o: wait_event_funcs.c $(top_builddir)/src/backend/utils/wait_event_funcs_data.c +wait_event_timing.o: wait_event_timing.c $(top_builddir)/src/backend/utils/wait_event_timing_data.h include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/activity/generate-wait_event_types.pl b/src/backend/utils/activity/generate-wait_event_types.pl index d39a30d0478..f3f1f107a4c 100644 --- a/src/backend/utils/activity/generate-wait_event_types.pl +++ b/src/backend/utils/activity/generate-wait_event_types.pl @@ -5,6 +5,7 @@ # - wait_event_types.h (if --code is passed) # - pgstat_wait_event.c (if --code is passed) # - wait_event_funcs_data.c (if --code is passed) +# - wait_event_timing_data.h (if --code is passed) # - wait_event_types.sgml (if --docs is passed) # # Portions Copyright (c) 1996-2026, PostgreSQL Global Development Group @@ -269,17 +270,195 @@ if ($gen_code) } } + # ----------------------------------------------------------- + # Compute wait_event_timing class mapping data. + # + # The dense class table maps raw classId (0x00..max) to a + # dense index, with per-class slot counts rounded up to the + # next power of 2 (minimum 16). Extension and InjectionPoint + # are fixed at 128 because extensions register custom events. + # LWLock uses a hash table (dense = -1). + # ----------------------------------------------------------- + + # Map section name -> raw classId (from wait_classes.h constants) + my %class_to_raw = ( + 'Lock' => 0x03, + 'Buffer' => 0x04, + 'Activity' => 0x05, + 'Client' => 0x06, + 'Extension' => 0x07, + 'IPC' => 0x08, + 'Timeout' => 0x09, + 'IO' => 0x0A, + 'InjectionPoint' => 0x0B, + ); + + # Classes that need fixed large slot counts (dynamically extensible) + my %fixed_slot_classes = ( + 'Extension' => 128, + 'InjectionPoint' => 128, + ); + + # Count events per class from the parsed data. + # Build a list of (className, rawId, actualCount) sorted by rawId. + my @timing_classes; + foreach my $waitclass (keys %hashwe) + { + my $short = $waitclass; + $short =~ s/^WaitEvent//; + + # Skip LWLock -- uses hash table, not flat array + next unless exists $class_to_raw{$short}; + + my $raw_id = $class_to_raw{$short}; + my $count = scalar @{ $hashwe{$waitclass} }; + + push @timing_classes, { + name => $short, + raw_id => $raw_id, + actual => $count, + }; + } + + # InjectionPoint (0x0B) has no section in wait_event_names.txt + # because its events are dynamically registered at runtime. + # Add it explicitly with actual=0 and a fixed slot count. + if (!grep { $_->{name} eq 'InjectionPoint' } @timing_classes) + { + push @timing_classes, { + name => 'InjectionPoint', + raw_id => $class_to_raw{'InjectionPoint'}, + actual => 0, + }; + } + + # Sort by raw classId + @timing_classes = sort { $a->{raw_id} <=> $b->{raw_id} } @timing_classes; + + # Compute slot counts: next power of 2, minimum 16, or fixed + foreach my $cls (@timing_classes) + { + if (exists $fixed_slot_classes{$cls->{name}}) + { + $cls->{slots} = $fixed_slot_classes{$cls->{name}}; + } + else + { + my $slots = 16; # minimum + $slots *= 2 while $slots < $cls->{actual}; + $cls->{slots} = $slots; + } + } + + # Compute cumulative offsets + my $offset = 0; + foreach my $cls (@timing_classes) + { + $cls->{offset} = $offset; + $offset += $cls->{slots}; + } + my $total_events = $offset; + + # Determine max raw classId for array sizing + my $max_raw = 0; + foreach my $cls (@timing_classes) + { + $max_raw = $cls->{raw_id} if $cls->{raw_id} > $max_raw; + } + my $raw_classes = $max_raw + 1; + my $dense_classes = scalar @timing_classes; + + # Emit timing defines into wait_event_types.h + printf $h "\n/* Wait event timing flat array sizing (generated) */\n"; + printf $h "#define WAIT_EVENT_TIMING_RAW_CLASSES\t%d\n", $raw_classes; + printf $h "#define WAIT_EVENT_TIMING_DENSE_CLASSES\t%d\n", $dense_classes; + printf $h "#define WAIT_EVENT_TIMING_NUM_EVENTS\t%d\n\n", $total_events; + printf $h "#endif /* WAIT_EVENT_TYPES_H */\n"; close $h; close $c; close $wc; + # Generate wait_event_timing_data.h with the mapping arrays. + # A header (rather than a .c file) keeps the file-extension category + # straight: it is included into a single TU (wait_event_timing.c) and + # defines static const tables there. The include guard makes the + # single-owner intent explicit and prevents accidental double inclusion. + my $ttmp = "$output_path/wait_event_timing_data.h.tmp$$"; + open my $t, '>', $ttmp or die "Could not open $ttmp: $!"; + printf $t $header_comment, 'wait_event_timing_data.h'; + + printf $t "#ifndef WAIT_EVENT_TIMING_DATA_H\n"; + printf $t "#define WAIT_EVENT_TIMING_DATA_H\n\n"; + + # Emit wait_event_class_dense[] + printf $t "static const int8 wait_event_class_dense[WAIT_EVENT_TIMING_RAW_CLASSES] = {\n"; + for (my $i = 0; $i < $raw_classes; $i++) + { + my $dense = -1; + my $comment = "unused"; + for (my $d = 0; $d < $dense_classes; $d++) + { + if ($timing_classes[$d]->{raw_id} == $i) + { + $dense = $d; + $comment = $timing_classes[$d]->{name}; + last; + } + } + # classId 0x01 is LWLock + if ($i == 0x01) + { + $comment = "LWLock (uses hash)"; + } + my $comma = ($i < $raw_classes - 1) ? "," : ""; + printf $t "\t%2d$comma\t\t/* 0x%02x: %s */\n", $dense, $i, $comment; + } + printf $t "};\n\n"; + + # Emit wait_event_class_nevents[] + printf $t "static const int wait_event_class_nevents[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n"; + for (my $d = 0; $d < $dense_classes; $d++) + { + my $cls = $timing_classes[$d]; + my $comma = ($d < $dense_classes - 1) ? "," : ""; + printf $t "\t%d$comma\t\t/* %s (actual: %d) */\n", + $cls->{slots}, $cls->{name}, $cls->{actual}; + } + printf $t "};\n\n"; + + # Emit wait_event_class_offset[] + printf $t "static const int wait_event_class_offset[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n"; + for (my $d = 0; $d < $dense_classes; $d++) + { + my $cls = $timing_classes[$d]; + my $comma = ($d < $dense_classes - 1) ? "," : ""; + printf $t "\t%d$comma\t\t/* %s */\n", $cls->{offset}, $cls->{name}; + } + printf $t "};\n\n"; + + # Emit wait_event_dense_to_classid[] + printf $t "static const uint8 wait_event_dense_to_classid[WAIT_EVENT_TIMING_DENSE_CLASSES] = {\n\t"; + for (my $d = 0; $d < $dense_classes; $d++) + { + my $cls = $timing_classes[$d]; + my $comma = ($d < $dense_classes - 1) ? ", " : ""; + printf $t "0x%02x$comma", $cls->{raw_id}; + } + printf $t "\n};\n\n"; + + printf $t "#endif /* WAIT_EVENT_TIMING_DATA_H */\n"; + + close $t; + rename($htmp, "$output_path/wait_event_types.h") || die "rename: $htmp to $output_path/wait_event_types.h: $!"; rename($ctmp, "$output_path/pgstat_wait_event.c") || die "rename: $ctmp to $output_path/pgstat_wait_event.c: $!"; rename($wctmp, "$output_path/wait_event_funcs_data.c") || die "rename: $wctmp to $output_path/wait_event_funcs_data.c: $!"; + rename($ttmp, "$output_path/wait_event_timing_data.h") + || die "rename: $ttmp to $output_path/wait_event_timing_data.h: $!"; } # Generate the .sgml file. elsif ($gen_docs) diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c index 95635c7f56c..c8fab55b363 100644 --- a/src/backend/utils/activity/wait_event.c +++ b/src/backend/utils/activity/wait_event.c @@ -41,8 +41,7 @@ static const char *pgstat_get_wait_io(WaitEventIO w); static uint32 local_my_wait_event_info; uint32 *my_wait_event_info = &local_my_wait_event_info; -#define WAIT_EVENT_CLASS_MASK 0xFF000000 -#define WAIT_EVENT_ID_MASK 0x0000FFFF +/* WAIT_EVENT_CLASS_MASK / WAIT_EVENT_ID_MASK are defined in utils/wait_classes.h */ /* * Hash tables for storing custom wait event ids and their names in diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c index d11823adc57..558bac511ed 100644 --- a/src/backend/utils/activity/wait_event_timing.c +++ b/src/backend/utils/activity/wait_event_timing.c @@ -1,17 +1,24 @@ /*------------------------------------------------------------------------- * * wait_event_timing.c - * Per-backend wait event timing instrumentation. + * Per-backend wait event timing and histogram accumulation. * - * Controlled by the wait_event_capture GUC (off | stats, default off) - * and the compile-time option --enable-wait-event-timing. + * Every transition through pgstat_report_wait_start()/_end() records the + * wait duration with INSTR_TIME (a clock_gettime()-grade timestamp) and + * accumulates per-(backend, event) statistics -- count, total/max + * nanoseconds, and a log2 duration histogram -- in shared memory. Each + * backend writes only to its own slot, so the hot path needs no locking. * - * This commit provides only the GUC scaffolding: the backing variable, - * the enum-value table consumed by guc.c, and the check/assign hooks. - * No instrumentation is performed yet -- later commits in the series add - * the recording hot path and the SQL surface. The file compiles in both - * build configurations; in builds without --enable-wait-event-timing the - * check hook rejects any value other than off. + * The per-backend slot array lives in the main shared memory segment, + * sized at postmaster start from wait_event_timing_max_tranches, so it is + * valid for the entire life of every backend -- no lazy attach and no + * teardown gating are required. + * + * Controlled by the wait_event_capture GUC (off | stats, default off) and + * the compile-time option --enable-wait-event-timing. In builds without + * that option the file still compiles (the GUC backing variable, the enum + * table, the rejecting check hook, and empty-result SQL stubs), so the GUC + * and the catalog functions exist uniformly. * * Copyright (c) 2026, PostgreSQL Global Development Group * @@ -22,16 +29,17 @@ */ #include "postgres.h" +#include "storage/subsystems.h" #include "utils/guc.h" #include "utils/guc_hooks.h" #include "utils/wait_event_timing.h" /* - * GUC variable -- always defined so the GUC system has a backing variable - * even when compiled without --enable-wait-event-timing. In stub builds - * the check hook below rejects any value other than OFF. + * GUC variables -- always defined so the GUC system has backing variables + * even when compiled without --enable-wait-event-timing. */ int wait_event_capture = WAIT_EVENT_CAPTURE_OFF; +int wait_event_timing_max_tranches = 192; /* * Enum value table consumed by guc.c. Order matches the @@ -43,40 +51,29 @@ const struct config_enum_entry wait_event_capture_options[] = { {NULL, 0, false} }; -#ifdef USE_WAIT_EVENT_TIMING +#ifndef USE_WAIT_EVENT_TIMING /* - * GUC check hook for wait_event_capture (timing build). - * - * All enum values are accepted at this level. Side effects (attaching - * storage, etc.) are introduced by later commits; for now there is - * nothing to validate beyond the enum mapping that guc.c already did. + * Stub build: no instrumentation. Provide the symbols referenced by + * pg_proc.dat, the GUC machinery, and the shmem subsystem registry. */ -bool -check_wait_event_capture(int *newval, void **extra, GucSource source) -{ - return true; -} +#include "fmgr.h" +#include "funcapi.h" -/* - * GUC assign hook for wait_event_capture (timing build). - * - * No-op for now. Later commits use this hook to drop in-flight wait - * state and manage per-session resources when the capture level changes. - */ -void -assign_wait_event_capture(int newval, void *extra) +Datum pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS); + +Datum +pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS) { + InitMaterializedSRF(fcinfo, 0); + PG_RETURN_VOID(); } -#else /* !USE_WAIT_EVENT_TIMING */ - /* * GUC check hook for the stub build. Any value other than 'off' is * meaningless without --enable-wait-event-timing, so reject it -- or * downgrade to 'off' with a warning when the value comes from a - * non-interactive source (config file at startup), so a leftover setting - * does not prevent the server from starting. + * non-interactive source, so a leftover setting does not block startup. */ bool check_wait_event_capture(int *newval, void **extra, GucSource source) @@ -100,10 +97,681 @@ check_wait_event_capture(int *newval, void **extra, GucSource source) return true; } -/* Stub assign hook -- nothing to do without compile-time support. */ void assign_wait_event_capture(int newval, void *extra) { } +/* No shared memory is reserved in the stub build. */ +const ShmemCallbacks WaitEventTimingShmemCallbacks = {0}; + +/* Defined so every extern in wait_event_timing.h resolves in stub builds. */ +WaitEventTimingState *my_wait_event_timing = NULL; + +void +pgstat_set_wait_event_timing_storage(int procNumber) +{ +} + +void +pgstat_reset_wait_event_timing_storage(void) +{ +} + +#else /* USE_WAIT_EVENT_TIMING */ + +#include "catalog/pg_authid.h" +#include "catalog/pg_type_d.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procnumber.h" +#include "utils/acl.h" +#include "utils/array.h" +#include "utils/backend_status.h" +#include "utils/builtins.h" +#include "utils/tuplestore.h" +#include "utils/wait_event.h" + +#define NUM_WAIT_EVENT_TIMING_SLOTS (MaxBackends + NUM_AUXILIARY_PROCS) + +#define HAS_PGSTAT_PERMISSIONS(role) \ + (has_privs_of_role(GetUserId(), ROLE_PG_READ_ALL_STATS) || \ + has_privs_of_role(GetUserId(), role)) + +/* Pointer to this backend's timing state in shared memory. */ +WaitEventTimingState *my_wait_event_timing = NULL; + +/* + * Backend-local cached pointer to the start of the shared slot array, set + * at shmem init (postmaster) and, in EXEC_BACKEND mode, at attach. Slots + * are NOT a simple C array: each has a runtime-determined stride (header + + * variable-size hash arrays); use wet_slot() to index. + */ +static char *WaitEventTimingArray = NULL; + +/* + * Per-backend slot stride and the hash dimensions, all derived from the + * GUC wait_event_timing_max_tranches. Because the GUC is PGC_POSTMASTER, + * every backend in the cluster derives identical values, so the shared + * layout is consistent across fork() and EXEC_BACKEND. + */ +static Size wait_event_timing_per_backend_stride = 0; +static int wait_event_timing_hash_size = 0; +static int wait_event_timing_max_entries = 0; + +/* + * Mapping arrays for the flat events[] array, generated from + * wait_event_names.txt by generate-wait_event_types.pl. Defines + * wait_event_class_dense / _nevents / _offset / _dense_to_classid. + */ +#include "utils/wait_event_timing_data.h" + +/* + * Round up to the next power of two, with a minimum of 32. The hash slot + * count must be a power of two for the mask-based modulo in the lookup hot + * path; we target >= 2x the entry cap so the load factor stays <= 50%. + */ +static int +wait_event_timing_hash_size_for(int max_entries) +{ + int size = 32; + + while (size < max_entries * 2) + size <<= 1; + return size; +} + +/* + * Compute the per-backend slot size for the given max_entries. Layout: + * + * [ WaitEventTimingState header ] + * [ LWLockTimingHashEntry[hash_size] ] + * [ WaitEventTimingEntry[max_entries] <- lwlock_events[] ] + */ +static Size +wait_event_timing_slot_size(int max_entries) +{ + int hash_size = wait_event_timing_hash_size_for(max_entries); + + return add_size(sizeof(WaitEventTimingState), + add_size(mul_size(hash_size, sizeof(LWLockTimingHashEntry)), + mul_size(max_entries, sizeof(WaitEventTimingEntry)))); +} + +/* Cache the backend-local layout dimensions from the GUC (idempotent). */ +static void +wait_event_timing_init_local_dims(void) +{ + if (wait_event_timing_per_backend_stride != 0) + return; + wait_event_timing_max_entries = wait_event_timing_max_tranches; + wait_event_timing_hash_size = + wait_event_timing_hash_size_for(wait_event_timing_max_entries); + wait_event_timing_per_backend_stride = + wait_event_timing_slot_size(wait_event_timing_max_entries); +} + +/* Resolve the address of slot `idx` within WaitEventTimingArray. */ +static inline WaitEventTimingState * +wet_slot(int idx) +{ + return (WaitEventTimingState *) + (WaitEventTimingArray + (Size) idx * wait_event_timing_per_backend_stride); +} + +/* + * Address of the LWLock hash slot table for a slot (immediately follows + * the WaitEventTimingState header). + */ +static inline LWLockTimingHashEntry * +wet_lwlock_hash_entries(WaitEventTimingState *state) +{ + return (LWLockTimingHashEntry *) ((char *) state + sizeof(WaitEventTimingState)); +} + +/* + * Address of the dense LWLock events array for a slot (immediately follows + * the slot table). + */ +static inline WaitEventTimingEntry * +wet_lwlock_hash_events(WaitEventTimingState *state) +{ + return (WaitEventTimingEntry *) + ((char *) state + sizeof(WaitEventTimingState) + + (Size) state->lwlock_hash.hash_size * sizeof(LWLockTimingHashEntry)); +} + +/* + * Convert wait_event_info to a flat index for the events[] array. Returns + * WAIT_EVENT_TIMING_IDX_LWLOCK for LWLock events (which use the hash) and + * -1 for events outside the mapped classes. + */ +static int +wait_event_timing_index(uint32 wait_event_info) +{ + uint32 classId = wait_event_info & WAIT_EVENT_CLASS_MASK; + int eventId = wait_event_info & WAIT_EVENT_ID_MASK; + int class_byte; + int dense; + + if (classId == PG_WAIT_LWLOCK) + return WAIT_EVENT_TIMING_IDX_LWLOCK; + + class_byte = classId >> 24; + if (class_byte >= WAIT_EVENT_TIMING_RAW_CLASSES) + return -1; + + dense = wait_event_class_dense[class_byte]; + if (dense < 0) + return -1; + + if (eventId >= wait_event_class_nevents[dense]) + return -1; + + return wait_event_class_offset[dense] + eventId; +} + +/* + * Reset a slot's LWLockTimingHash to its empty initial state. The hash + * header's hash_size and max_entries are immutable and not reset here. + */ +static void +lwlock_timing_hash_clear(WaitEventTimingState *state) +{ + LWLockTimingHash *ht = &state->lwlock_hash; + LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state); + WaitEventTimingEntry *events = wet_lwlock_hash_events(state); + int i; + + ht->num_used = 0; + memset(events, 0, (Size) ht->max_entries * sizeof(WaitEventTimingEntry)); + for (i = 0; i < ht->hash_size; i++) + { + entries[i].tranche_id = LWLOCK_TIMING_EMPTY_SLOT; + entries[i].dense_idx = 0; + } +} + +/* + * Maximum probes attempted on the lookup hot path once the table is at + * capacity. At cap an unknown tranche cannot be inserted, so bounding the + * scan caps the per-event cost instead of walking clustered occupied slots + * on every unknown-tranche wait_end. 8 is well above the expected probe + * distance at the target load factor. + */ +#define LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT 8 + +/* + * Look up (or insert) the timing entry for an LWLock tranche id. Returns + * NULL when the table is at capacity and the tranche is not already + * present. + */ +static WaitEventTimingEntry * +lwlock_timing_lookup(WaitEventTimingState *state, uint16 tranche_id) +{ + LWLockTimingHash *ht = &state->lwlock_hash; + LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state); + WaitEventTimingEntry *events = wet_lwlock_hash_events(state); + uint32 hash = (uint32) tranche_id * 2654435761U; + int slot = hash & (ht->hash_size - 1); + int limit; + int i; + + limit = (ht->num_used >= ht->max_entries) + ? LWLOCK_TIMING_LOOKUP_AT_CAP_PROBE_LIMIT + : ht->hash_size; + + for (i = 0; i < limit; i++) + { + LWLockTimingHashEntry *e = &entries[slot]; + + if (e->tranche_id == tranche_id) + return &events[e->dense_idx]; + + if (e->tranche_id == LWLOCK_TIMING_EMPTY_SLOT) + { + if (ht->num_used >= ht->max_entries) + return NULL; + + e->tranche_id = tranche_id; + e->dense_idx = ht->num_used++; + return &events[e->dense_idx]; + } + + slot = (slot + 1) & (ht->hash_size - 1); + } + + return NULL; +} + +/* + * Compute the histogram bucket index for a duration in nanoseconds. See + * the rationale on WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS in the header. + */ +static int +wait_event_timing_bucket(int64 duration_ns) +{ + int bucket; + + /* + * Everything under ~1us (and 0, undefined for pg_leftmost_one_pos64) + * lands in bucket 0. + */ + if (duration_ns < 1024) + return 0; + + bucket = pg_leftmost_one_pos64((uint64) duration_ns) - 9; + + if (bucket >= WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS) + bucket = WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS - 1; + + return bucket; +} + +/* + * ShmemRequest: reserve the per-backend slot array. Sized from + * wait_event_timing_max_tranches; the framework stores the allocated + * address in WaitEventTimingArray before WaitEventTimingShmemInit runs. + */ +static void +WaitEventTimingShmemRequest(void *arg) +{ + Size stride; + + wait_event_timing_init_local_dims(); + stride = wait_event_timing_per_backend_stride; + + ShmemRequestStruct(.name = "WaitEventTimingArray", + .size = mul_size(NUM_WAIT_EVENT_TIMING_SLOTS, stride), + .ptr = (void **) &WaitEventTimingArray); +} + +/* ShmemInit: zero the array and initialise each slot's hash header. */ +static void +WaitEventTimingShmemInit(void *arg) +{ + int i; + + wait_event_timing_init_local_dims(); + + memset(WaitEventTimingArray, 0, + mul_size(NUM_WAIT_EVENT_TIMING_SLOTS, + wait_event_timing_per_backend_stride)); + + for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++) + { + WaitEventTimingState *slot = wet_slot(i); + LWLockTimingHashEntry *entries; + int j; + + slot->lwlock_hash.num_used = 0; + slot->lwlock_hash.hash_size = wait_event_timing_hash_size; + slot->lwlock_hash.max_entries = wait_event_timing_max_entries; + + /* The array was zeroed above, but the empty sentinel is 0xFFFF. */ + entries = wet_lwlock_hash_entries(slot); + for (j = 0; j < wait_event_timing_hash_size; j++) + entries[j].tranche_id = LWLOCK_TIMING_EMPTY_SLOT; + } +} + +const ShmemCallbacks WaitEventTimingShmemCallbacks = { + .request_fn = WaitEventTimingShmemRequest, + .init_fn = WaitEventTimingShmemInit, +}; + +/* + * Point my_wait_event_timing at this backend's slot. Called from + * InitProcess()/InitAuxiliaryProcess() once the backend has a procNumber. + * The slot is cleared here so stats do not leak across slot reuse when a + * new backend inherits a procNumber previously held by an exited one. + */ +void +pgstat_set_wait_event_timing_storage(int procNumber) +{ + WaitEventTimingState *slot; + + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS || + WaitEventTimingArray == NULL) + { + my_wait_event_timing = NULL; + return; + } + + wait_event_timing_init_local_dims(); + + slot = wet_slot(procNumber); + + memset(slot->events, 0, sizeof(slot->events)); + lwlock_timing_hash_clear(slot); + slot->lwlock_overflow_count = 0; + slot->flat_overflow_count = 0; + slot->current_event = 0; + INSTR_TIME_SET_ZERO(slot->wait_start); + + /* Publish only after the slot is fully initialised. */ + my_wait_event_timing = slot; +} + +/* + * Detach from the timing slot on backend exit. The slot itself stays in + * shared memory; clearing the pointer keeps the late-shutdown wait-event + * hot path from touching it. + */ +void +pgstat_reset_wait_event_timing_storage(void) +{ + my_wait_event_timing = NULL; +} + +/* + * GUC check hook for wait_event_capture (timing build). All enum values + * are accepted; there is nothing to validate beyond the enum mapping. + */ +bool +check_wait_event_capture(int *newval, void **extra, GucSource source) +{ + return true; +} + +/* + * GUC assign hook for wait_event_capture (timing build). + * + * Drop any in-flight wait state: after the capture level changes, the + * existing wait_start / current_event can no longer be trusted (a wait + * that started under one level and ends under another would be miscredited + * or use a stale start time). Forfeiting at most one in-flight sample per + * GUC change is negligible and eliminates all such miscredits. + */ +void +assign_wait_event_capture(int newval, void *extra) +{ + if (my_wait_event_timing != NULL) + { + INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start); + my_wait_event_timing->current_event = 0; + } +} + +/* + * Out-of-line body for pgstat_report_wait_start()'s timing path. Records + * the start timestamp and the event being waited on. Reached only when + * wait_event_capture != OFF. + */ +void +pgstat_report_wait_start_timing(uint32 wait_event_info) +{ + if (my_wait_event_timing == NULL) + return; + + INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start); + my_wait_event_timing->current_event = wait_event_info; +} + +/* + * Out-of-line body for pgstat_report_wait_end()'s timing path. Computes + * the wait duration and accumulates per-event statistics. + * + * capture_level is the value of wait_event_capture observed at the inline + * gate; in this commit only STATS exists, so it is not branched on, but it + * is threaded through to keep the gate ABI stable for the trace level + * added later in the series. + */ +void +pgstat_report_wait_end_timing(int capture_level) +{ + uint32 event; + + (void) capture_level; + + if (my_wait_event_timing == NULL) + return; + + event = my_wait_event_timing->current_event; + + if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start)) + { + instr_time now; + int64 duration_ns; + int idx; + WaitEventTimingEntry *entry = NULL; + bool warn_lwlock_overflow = false; + bool warn_flat_overflow = false; + + INSTR_TIME_SET_CURRENT(now); + duration_ns = INSTR_TIME_GET_NANOSEC(now) - + INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start); + + if (duration_ns < 0) + duration_ns = 0; + + idx = wait_event_timing_index(event); + + /* + * Single-writer hot path: each slot has exactly one writer (the + * owning backend), and the SRF reader is lock-free, so no locking is + * needed here. Overflow WARNINGs are deferred to after the + * bookkeeping so ereport() cannot recurse through a wait event while + * counters are in an intermediate state. + */ + if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK) + entry = lwlock_timing_lookup(my_wait_event_timing, event & 0xFFFF); + else if (idx >= 0) + entry = &my_wait_event_timing->events[idx]; + + if (entry != NULL) + { + entry->count++; + entry->total_ns += duration_ns; + if (duration_ns > entry->max_ns) + entry->max_ns = duration_ns; + entry->histogram[wait_event_timing_bucket(duration_ns)]++; + } + else if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK) + { + if (my_wait_event_timing->lwlock_overflow_count++ == 0) + warn_lwlock_overflow = true; + } + else if (idx == -1) + { + if (my_wait_event_timing->flat_overflow_count++ == 0) + warn_flat_overflow = true; + } + + if (warn_lwlock_overflow) + ereport(WARNING, + (errmsg("wait event timing data for some LWLock tranches will be lost"), + errdetail("This backend has used more than %d distinct LWLock tranches.", + wait_event_timing_max_entries), + errhint("Consider raising \"wait_event_timing_max_tranches\"."))); + else if (warn_flat_overflow) + ereport(WARNING, + (errmsg("some wait events will not be timed"), + errdetail("A wait event was reported with a class unknown to the timing infrastructure."))); + + INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start); + } +} + +/* + * Resolve the optional pid SRF argument to a procNumber range + * [out_start, out_end). Returns false if the SRF should emit zero rows + * (unknown pid -- silent no-op). + */ +static bool +wait_event_timing_pid_range(FunctionCallInfo fcinfo, + int *out_start, int *out_end) +{ + if (PG_ARGISNULL(0)) + { + *out_start = 0; + *out_end = NUM_WAIT_EVENT_TIMING_SLOTS; + return true; + } + else + { + int target_pid = PG_GETARG_INT32(0); + PGPROC *proc; + int procNumber; + + proc = BackendPidGetProc(target_pid); + if (proc == NULL) + proc = AuxiliaryPidGetProc(target_pid); + if (proc == NULL) + return false; + + procNumber = GetNumberFromPGProc(proc); + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) + return false; + + *out_start = procNumber; + *out_end = procNumber + 1; + return true; + } +} + +/* Emit one SRF row for a populated timing entry. */ +static void +wait_event_timing_emit_row(ReturnSetInfo *rsinfo, PgBackendStatus *beentry, + int procnumber, uint32 wait_event_info, + WaitEventTimingEntry *entry, + ArrayType *hist_array, int64 *hist_payload) +{ + Datum values[10]; + bool nulls[10]; + const char *event_type; + const char *event_name; + int bucket; + + event_type = pgstat_get_wait_event_type(wait_event_info); + event_name = pgstat_get_wait_event(wait_event_info); + if (event_type == NULL || event_name == NULL) + return; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int32GetDatum(beentry->st_procpid); + values[1] = CStringGetTextDatum(GetBackendTypeDesc(beentry->st_backendType)); + values[2] = Int32GetDatum(procnumber); + values[3] = CStringGetTextDatum(event_type); + values[4] = CStringGetTextDatum(event_name); + values[5] = Int64GetDatum(entry->count); + values[6] = Float8GetDatum((double) entry->total_ns / 1000000.0); + values[7] = Float8GetDatum(entry->count > 0 + ? (double) entry->total_ns / entry->count / 1000.0 + : 0.0); + values[8] = Float8GetDatum((double) entry->max_ns / 1000.0); + + for (bucket = 0; bucket < WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS; bucket++) + hist_payload[bucket] = entry->histogram[bucket]; + values[9] = PointerGetDatum(hist_array); + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); +} + +/* + * SQL function: pg_stat_get_wait_event_timing(pid int4, OUT ...) + * + * Returns one row per (backend, wait_event) with a non-zero count. pid is + * optional: NULL means all backends; a non-NULL value restricts the sweep + * to that backend (silently empty for unknown pids). + */ +Datum +pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int start_idx; + int end_idx; + int backend_idx; + ArrayType *hist_array; + int64 *hist_payload; + + InitMaterializedSRF(fcinfo, 0); + + if (WaitEventTimingArray == NULL) + PG_RETURN_VOID(); + + if (!wait_event_timing_pid_range(fcinfo, &start_idx, &end_idx)) + PG_RETURN_VOID(); + + /* + * Allocate the histogram ArrayType once and reuse it across every row; + * tuplestore_putvalues flattens the varlena into its stored tuple, so + * rewriting the payload cannot corrupt previously emitted rows. + */ + { + Datum zero_elems[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS]; + + memset(zero_elems, 0, sizeof(zero_elems)); + hist_array = construct_array_builtin(zero_elems, + WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS, + INT8OID); + hist_payload = (int64 *) ARR_DATA_PTR(hist_array); + } + + for (backend_idx = start_idx; backend_idx < end_idx; backend_idx++) + { + WaitEventTimingState *state = wet_slot(backend_idx); + PgBackendStatus *beentry; + int i; + + /* Skip dead backend slots and enforce stats permissions. */ + beentry = pgstat_get_beentry_by_proc_number(backend_idx); + if (beentry == NULL) + continue; + if (!HAS_PGSTAT_PERMISSIONS(beentry->st_userid)) + continue; + + /* Flat array rows (all classes except LWLock). */ + for (i = 0; i < WAIT_EVENT_TIMING_DENSE_CLASSES; i++) + { + int base = wait_event_class_offset[i]; + int nevents = wait_event_class_nevents[i]; + uint32 classId = wait_event_dense_to_classid[i]; + int j; + + for (j = 0; j < nevents; j++) + { + WaitEventTimingEntry *entry = &state->events[base + j]; + + if (entry->count == 0) + continue; + + wait_event_timing_emit_row(rsinfo, beentry, backend_idx, + ((uint32) classId << 24) | j, + entry, hist_array, hist_payload); + } + } + + /* LWLock hash rows. */ + { + LWLockTimingHashEntry *entries = wet_lwlock_hash_entries(state); + WaitEventTimingEntry *events = wet_lwlock_hash_events(state); + int hash_size = state->lwlock_hash.hash_size; + + for (i = 0; i < hash_size; i++) + { + LWLockTimingHashEntry *he = &entries[i]; + WaitEventTimingEntry *entry; + + if (he->tranche_id == LWLOCK_TIMING_EMPTY_SLOT) + continue; + + entry = &events[he->dense_idx]; + if (entry->count == 0) + continue; + + wait_event_timing_emit_row(rsinfo, beentry, backend_idx, + PG_WAIT_LWLOCK | he->tranche_id, + entry, hist_array, hist_payload); + } + } + } + + PG_RETURN_VOID(); +} + #endif /* USE_WAIT_EVENT_TIMING */ diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 2e2701cb0a2..8f299f0971e 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -3442,6 +3442,15 @@ assign_hook => 'assign_wait_event_capture', }, +{ name => 'wait_event_timing_max_tranches', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE', + short_desc => 'Sets the maximum number of distinct LWLock tranches whose timing is recorded per backend.', + long_desc => 'Each backend\'s wait-event-timing hash table can hold this many distinct LWLock tranches; subsequent tranches are not individually timed. Sized at server start; raise this if your installation loads many extensions that register their own LWLock tranches.', + variable => 'wait_event_timing_max_tranches', + boot_val => '192', + min => '16', + max => '65534', +}, + { name => 'wal_block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows the block size in the write ahead log.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index ea0391c4952..362fa48e7e6 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -705,6 +705,7 @@ #track_wal_io_timing = off #wait_event_capture = off # off, stats # (requires --enable-wait-event-timing) +#wait_event_timing_max_tranches = 192 # (change requires restart) #track_functions = none # none, pl, all #stats_fetch_consistency = cache # cache, none, snapshot diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 3cb84359adf..1b384af298b 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12718,4 +12718,14 @@ proname => 'hashoid8extended', prorettype => 'int8', proargtypes => 'oid8 int8', prosrc => 'hashoid8extended' }, +{ oid => '9956', + descr => 'statistics: per-backend wait event timing (count, duration, histogram)', + proname => 'pg_stat_get_wait_event_timing', prorows => '1000', + proisstrict => 'f', proretset => 't', provolatile => 's', proparallel => 'r', + prorettype => 'record', proargtypes => 'int4', + proallargtypes => '{int4,int4,text,int4,text,text,int8,float8,float8,float8,_int8}', + proargmodes => '{i,o,o,o,o,o,o,o,o,o,o}', + proargnames => '{pid,pid,backend_type,procnumber,wait_event_type,wait_event,calls,total_time_ms,avg_time_us,max_time_us,histogram}', + prosrc => 'pg_stat_get_wait_event_timing' }, + ] diff --git a/src/include/storage/subsystemlist.h b/src/include/storage/subsystemlist.h index 9ad619080be..331d60cf703 100644 --- a/src/include/storage/subsystemlist.h +++ b/src/include/storage/subsystemlist.h @@ -79,6 +79,7 @@ PG_SHMEM_SUBSYSTEM(SyncScanShmemCallbacks) PG_SHMEM_SUBSYSTEM(AsyncShmemCallbacks) PG_SHMEM_SUBSYSTEM(StatsShmemCallbacks) PG_SHMEM_SUBSYSTEM(WaitEventCustomShmemCallbacks) +PG_SHMEM_SUBSYSTEM(WaitEventTimingShmemCallbacks) #ifdef USE_INJECTION_POINTS PG_SHMEM_SUBSYSTEM(InjectionPointShmemCallbacks) #endif diff --git a/src/include/utils/.gitignore b/src/include/utils/.gitignore index ff6f61cd7ee..8a489b7769b 100644 --- a/src/include/utils/.gitignore +++ b/src/include/utils/.gitignore @@ -6,4 +6,5 @@ /header-stamp /pgstat_wait_event.c /wait_event_funcs_data.c +/wait_event_timing_data.h /wait_event_types.h diff --git a/src/include/utils/meson.build b/src/include/utils/meson.build index fd3a2352df5..ef8b2dc2618 100644 --- a/src/include/utils/meson.build +++ b/src/include/utils/meson.build @@ -1,6 +1,6 @@ # Copyright (c) 2022-2026, PostgreSQL Global Development Group -wait_event_output = ['wait_event_types.h', 'pgstat_wait_event.c', 'wait_event_funcs_data.c'] +wait_event_output = ['wait_event_types.h', 'pgstat_wait_event.c', 'wait_event_funcs_data.c', 'wait_event_timing_data.h'] wait_event_target = custom_target('wait_event_names', input: files('../../backend/utils/activity/wait_event_names.txt'), output: wait_event_output, @@ -11,7 +11,7 @@ wait_event_target = custom_target('wait_event_names', ], build_by_default: true, install: true, - install_dir: [dir_include_server / 'utils', false, false], + install_dir: [dir_include_server / 'utils', false, false, false], ) wait_event_types_h = wait_event_target[0] diff --git a/src/include/utils/wait_classes.h b/src/include/utils/wait_classes.h index b91690a22c6..c6c692a1e93 100644 --- a/src/include/utils/wait_classes.h +++ b/src/include/utils/wait_classes.h @@ -26,4 +26,13 @@ #define PG_WAIT_IO 0x0A000000U #define PG_WAIT_INJECTIONPOINT 0x0B000000U +/* + * Bit-layout masks for wait_event_info. The high byte encodes the + * class (one of the PG_WAIT_* constants above); the low 16 bits + * encode the per-class event id; the middle byte is currently + * reserved (see pgstat_report_wait_start in wait_event.h). + */ +#define WAIT_EVENT_CLASS_MASK 0xFF000000U +#define WAIT_EVENT_ID_MASK 0x0000FFFFU + #endif /* WAIT_CLASSES_H */ diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h index 86ee348220d..3f94cd09099 100644 --- a/src/include/utils/wait_event.h +++ b/src/include/utils/wait_event.h @@ -13,6 +13,10 @@ /* enums for wait events */ #include "utils/wait_event_types.h" +#ifdef USE_WAIT_EVENT_TIMING +#include "utils/wait_event_timing.h" +#endif + extern const char *pgstat_get_wait_event(uint32 wait_event_info); extern const char *pgstat_get_wait_event_type(uint32 wait_event_info); static inline void pgstat_report_wait_start(uint32 wait_event_info); @@ -22,6 +26,11 @@ extern void pgstat_reset_wait_event_storage(void); extern PGDLLIMPORT uint32 *my_wait_event_info; +#ifdef USE_WAIT_EVENT_TIMING +extern void pgstat_report_wait_start_timing(uint32 wait_event_info); +extern void pgstat_report_wait_end_timing(int capture_level); +#endif + /* * Wait Events - Extension, InjectionPoint @@ -61,6 +70,9 @@ extern char **GetWaitEventCustomNames(uint32 classId, int *nwaitevents); * * my_wait_event_info initially points to local memory, making it safe to * call this before MyProc has been initialized. + * + * When compiled with --enable-wait-event-timing, also records the start + * timestamp for later duration computation in pgstat_report_wait_end(). * ---------- */ static inline void @@ -71,17 +83,54 @@ pgstat_report_wait_start(uint32 wait_event_info) * four-bytes, updates are atomic. */ *(volatile uint32 *) my_wait_event_info = wait_event_info; + +#ifdef USE_WAIT_EVENT_TIMING + + /* + * Minimal inline gate: one global load and a branch. The body -- + * lazy/eager slot resolution, INSTR_TIME read, current-event write -- + * lives out-of-line in pgstat_report_wait_start_timing() so the many + * inlined call sites (LWLockAcquire, XLogInsert, ...) stay compact and + * the off-mode codegen impact is a load + test per site. + * + * No unlikely(): wait_event_capture is monomorphic for long stretches, so + * the dynamic branch predictor handles it perfectly with or without the + * hint, and a hint would point the wrong way once capture is on. + */ + if (wait_event_capture != WAIT_EVENT_CAPTURE_OFF) + pgstat_report_wait_start_timing(wait_event_info); +#endif } /* ---------- * pgstat_report_wait_end() - * * Called to report end of a wait. + * + * When compiled with --enable-wait-event-timing and the GUC is enabled, + * calls the out-of-line pgstat_report_wait_end_timing() to compute the + * wait duration and accumulate statistics. * ---------- */ static inline void pgstat_report_wait_end(void) { +#ifdef USE_WAIT_EVENT_TIMING + /* + * The load of wait_event_capture is reused as the argument to + * pgstat_report_wait_end_timing(), so the out-of-line body does not have + * to re-load it across the call boundary (CSE doesn't cross function + * calls). See pgstat_report_wait_start() for the no-unlikely() + * rationale. + */ + { + int capture_level = wait_event_capture; + + if (capture_level != WAIT_EVENT_CAPTURE_OFF) + pgstat_report_wait_end_timing(capture_level); + } +#endif + /* see pgstat_report_wait_start() */ *(volatile uint32 *) my_wait_event_info = 0; } diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h index b44d19ecf35..87b444d4763 100644 --- a/src/include/utils/wait_event_timing.h +++ b/src/include/utils/wait_event_timing.h @@ -3,14 +3,19 @@ * wait_event_timing.h * Per-backend wait event timing instrumentation. * - * This header declares the public surface of the wait-event-timing - * feature, gated by the compile-time option --enable-wait-event-timing - * (USE_WAIT_EVENT_TIMING) and the runtime GUC wait_event_capture. + * When enabled via the wait_event_capture GUC, every transition through + * pgstat_report_wait_start()/pgstat_report_wait_end() records the wait + * duration and accumulates per-(backend, event) statistics -- count, + * total/maximum duration, and a log2 duration histogram -- in shared + * memory. Each backend writes only to its own slot, so the hot path + * needs no locking; cross-backend readers (the pg_stat_wait_event_timing + * SRF) read lock-free and tolerate torn reads of 64-bit fields on 32-bit + * platforms, which is acceptable for statistics. * - * This commit introduces only the scaffolding: the capture-level enum - * and the GUC backing variable. Later commits in the series add the - * recording hot path, the SQL-visible statistics, and the per-session - * trace ring. + * The per-backend slot array lives in the main shared memory segment, + * sized at postmaster start (see WaitEventTimingShmemCallbacks). It is + * therefore valid for the entire life of every backend, including the + * proc_exit cascade -- no lazy attach and no teardown gating are needed. * * Copyright (c) 2026, PostgreSQL Global Development Group * @@ -20,7 +25,8 @@ #ifndef WAIT_EVENT_TIMING_H #define WAIT_EVENT_TIMING_H -#include "c.h" +#include "portability/instr_time.h" +#include "utils/wait_event_types.h" /* * Capture levels for the wait_event_capture GUC. Order is significant: @@ -28,7 +34,8 @@ * test for activation with "level >= WAIT_EVENT_CAPTURE_STATS". * * OFF - No instrumentation, no hot-path cost. - * STATS - Aggregated per-event statistics (added by a later commit). + * STATS - Aggregated per-event statistics (counts, durations, histogram) + * exposed via pg_stat_wait_event_timing. * * A further TRACE level is added later in the series. */ @@ -36,7 +43,7 @@ typedef enum WaitEventCaptureLevel { WAIT_EVENT_CAPTURE_OFF = 0, WAIT_EVENT_CAPTURE_STATS, -} WaitEventCaptureLevel; +} WaitEventCaptureLevel; /* * Pin the enum ordering at compile time so future code that compares with @@ -47,7 +54,123 @@ StaticAssertDecl(WAIT_EVENT_CAPTURE_OFF == 0 && WAIT_EVENT_CAPTURE_STATS == 1, "WaitEventCaptureLevel values must be 0=OFF < 1=STATS"); -/* GUC variable (defined in wait_event_timing.c, even in stub builds). */ +/* + * Number of log2 histogram buckets. Bin edges are powers of two on the + * nanosecond axis: bucket 0 covers [0, 1024) ns, bucket i covers + * [2^(i+9), 2^(i+10)) ns, and the last bucket is open-ended at + * [2^(NBUCKETS+8), inf) ns. These boundaries approximate the + * decimal-microsecond grid (1024 ~ 1 us, 2048 ~ 2 us, ...), which lets + * wait_event_timing_bucket() avoid a divide on the hot path. + * + * 32 buckets cover from <1us through the open-ended overflow at 2^40 ns + * (~18 minutes), so the long tail (lock contention, vacuum, replication + * apply, noisy-neighbour I/O spikes) lands in a real bucket rather than a + * single overflow bin -- which is exactly where tail/P99 analysis pays + * off. + */ +#define WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS 32 + +/* Sentinel returned by wait_event_timing_index() for LWLock events. */ +#define WAIT_EVENT_TIMING_IDX_LWLOCK (-2) + +/* + * Per-event accumulated statistics. One entry per distinct wait event per + * backend, written only by the owning backend. + */ +typedef struct WaitEventTimingEntry +{ + int64 count; /* number of occurrences */ + int64 total_ns; /* total wait duration in nanoseconds */ + int64 max_ns; /* longest single wait in nanoseconds */ + int64 histogram[WAIT_EVENT_TIMING_HISTOGRAM_BUCKETS]; +} WaitEventTimingEntry; + +/* + * Sentinel marking an empty LWLock-hash slot. We reserve the top of the + * uint16 range (0xFFFF) rather than 0 so that any legal tranche id -- + * including the currently-unused tranche 0 -- can be stored and matched. + */ +#define LWLOCK_TIMING_EMPTY_SLOT ((uint16) 0xFFFF) + +/* + * Open-addressing hash slot mapping an LWLock tranche id to a dense index + * into the per-backend lwlock_events[] array. Per-backend, single-writer. + */ +typedef struct LWLockTimingHashEntry +{ + uint16 tranche_id; /* LWLOCK_TIMING_EMPTY_SLOT marks empty */ + uint16 dense_idx; /* index into lwlock_events[] */ +} LWLockTimingHashEntry; + +/* + * Header for the per-backend LWLock-timing hash. The slot table and the + * dense events array follow the WaitEventTimingState in memory (their + * lengths are runtime-determined by wait_event_timing_max_tranches), so + * they are not struct members; resolve them via the helpers in + * wait_event_timing.c. + */ +typedef struct LWLockTimingHash +{ + int num_used; /* count of occupied entries */ + int hash_size; /* slot-table size (power of two); immutable */ + int max_entries; /* cap on distinct tranches; immutable */ +} LWLockTimingHash; + +/* Declaration of the GUC (see guc_parameters.dat). */ +extern PGDLLIMPORT int wait_event_timing_max_tranches; + +/* + * Per-backend wait event timing state. One slot per + * MaxBackends + NUM_AUXILIARY_PROCS, written exclusively by the owning + * backend. Shared-memory layout of one slot: + * + * [ WaitEventTimingState header ] + * [ LWLockTimingHashEntry[hash_size] ] + * [ WaitEventTimingEntry[max_entries] <- lwlock_events[] ] + * + * where hash_size and max_entries are runtime-derived from the GUC + * wait_event_timing_max_tranches and recorded in lwlock_hash. Slots are + * laid out contiguously in the main shared memory segment using a runtime + * stride rather than C array indexing, since the per-backend size is + * determined at server start. + */ +typedef struct WaitEventTimingState +{ + /* Current wait start timestamp (set by pgstat_report_wait_start). */ + instr_time wait_start; + + /* Current wait_event_info (cached for use in wait_end). */ + uint32 current_event; + + /* Per-event statistics: flat array for bounded classes. */ + WaitEventTimingEntry events[WAIT_EVENT_TIMING_NUM_EVENTS]; + + /* Per-event statistics: hash for the LWLock class (unbounded ids). */ + LWLockTimingHash lwlock_hash; + + /* + * Count of LWLock events dropped because the LWLock-timing hash reached + * its cap (wait_event_timing_max_tranches). Written here; a later commit + * in the series exposes it via SQL. + */ + int64 lwlock_overflow_count; + + /* Count of events dropped because the class index was out of range. */ + int64 flat_overflow_count; +} WaitEventTimingState; + +/* GUC variables (see guc_parameters.dat). */ extern PGDLLIMPORT int wait_event_capture; +/* Pointer to this backend's timing state in shared memory. */ +extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing; + +/* + * Called from InitProcess()/InitAuxiliaryProcess() to point + * my_wait_event_timing at this backend's slot, and from ProcKill() to + * clear it. + */ +extern void pgstat_set_wait_event_timing_storage(int procNumber); +extern void pgstat_reset_wait_event_timing_storage(void); + #endif /* WAIT_EVENT_TIMING_H */ diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index e266f501d87..7375d411c81 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -2423,6 +2423,17 @@ pg_stat_user_tables| SELECT relid, stats_reset FROM pg_stat_all_tables WHERE ((schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (schemaname !~ '^pg_toast'::text)); +pg_stat_wait_event_timing| SELECT pid, + backend_type, + procnumber, + wait_event_type, + wait_event, + calls, + total_time_ms, + avg_time_us, + max_time_us, + histogram + FROM pg_stat_get_wait_event_timing(NULL::integer) t(pid, backend_type, procnumber, wait_event_type, wait_event, calls, total_time_ms, avg_time_us, max_time_us, histogram); pg_stat_wal| SELECT wal_records, wal_fpi, wal_bytes, @@ -2899,6 +2910,11 @@ pg_views| SELECT n.nspname AS schemaname, FROM (pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = 'v'::"char"); +pg_wait_event_timing_histogram_buckets| SELECT bucket_idx, + lower_ns, + upper_ns, + label + FROM ( VALUES (0,(0)::bigint,(1024)::bigint,'<1us'::text), (1,(1024)::bigint,(2048)::bigint,'1-2us'::text), (2,(2048)::bigint,(4096)::bigint,'2-4us'::text), (3,(4096)::bigint,(8192)::bigint,'4-8us'::text), (4,(8192)::bigint,(16384)::bigint,'8-16us'::text), (5,(16384)::bigint,(32768)::bigint,'16-32us'::text), (6,(32768)::bigint,(65536)::bigint,'32-64us'::text), (7,(65536)::bigint,(131072)::bigint,'64-128us'::text), (8,(131072)::bigint,(262144)::bigint,'128-256us'::text), (9,(262144)::bigint,(524288)::bigint,'256-512us'::text), (10,(524288)::bigint,(1048576)::bigint,'512us-1ms'::text), (11,(1048576)::bigint,(2097152)::bigint,'1-2ms'::text), (12,(2097152)::bigint,(4194304)::bigint,'2-4ms'::text), (13,(4194304)::bigint,(8388608)::bigint,'4-8ms'::text), (14,(8388608)::bigint,(16777216)::bigint,'8-16ms'::text), (15,(16777216)::bigint,(33554432)::bigint,'16-32ms'::text), (16,(33554432)::bigint,(67108864)::bigint,'32-64ms'::text), (17,(67108864)::bigint,(134217728)::bigint,'64-128ms'::text), (18,(134217728)::bigint,(268435456)::bigint,'128-256ms'::text), (19,(268435456)::bigint,(536870912)::bigint,'256-512ms'::text), (20,(536870912)::bigint,(1073741824)::bigint,'512ms-1s'::text), (21,(1073741824)::bigint,'2147483648'::bigint,'1-2s'::text), (22,'2147483648'::bigint,'4294967296'::bigint,'2-4s'::text), (23,'4294967296'::bigint,'8589934592'::bigint,'4-8s'::text), (24,'8589934592'::bigint,'17179869184'::bigint,'8-16s'::text), (25,'17179869184'::bigint,'34359738368'::bigint,'16-32s'::text), (26,'34359738368'::bigint,'68719476736'::bigint,'32-64s'::text), (27,'68719476736'::bigint,'137438953472'::bigint,'64-128s'::text), (28,'137438953472'::bigint,'274877906944'::bigint,'128-256s'::text), (29,'274877906944'::bigint,'549755813888'::bigint,'256-512s'::text), (30,'549755813888'::bigint,'1099511627776'::bigint,'512s-1024s'::text), (31,'1099511627776'::bigint,NULL::bigint,'>=1024s'::text)) t(bucket_idx, lower_ns, upper_ns, label); pg_wait_events| SELECT type, name, description diff --git a/src/test/regress/expected/wait_event_timing.out b/src/test/regress/expected/wait_event_timing.out new file mode 100644 index 00000000000..925b315c4e8 --- /dev/null +++ b/src/test/regress/expected/wait_event_timing.out @@ -0,0 +1,84 @@ +-- +-- WAIT_EVENT_TIMING +-- +-- Exercises the wait_event_capture = stats instrumentation: the GUC, the +-- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view, +-- and the pg_wait_event_timing_histogram_buckets taxonomy view. +-- +-- Two expected outputs are maintained: +-- wait_event_timing.out -- --enable-wait-event-timing builds +-- wait_event_timing_1.out -- builds without the option (stub) +-- The difference is whether SET wait_event_capture = stats succeeds and +-- whether the SRF records anything; durations are never printed, so the +-- timing-build output is deterministic. +-- +-- Default is off. +SHOW wait_event_capture; + wait_event_capture +-------------------- + off +(1 row) + +-- The taxonomy view is pure SQL and identical in both build configs. +SELECT count(*) AS buckets FROM pg_wait_event_timing_histogram_buckets; + buckets +--------- + 32 +(1 row) + +SELECT bucket_idx, lower_ns, upper_ns, label +FROM pg_wait_event_timing_histogram_buckets +WHERE bucket_idx IN (0, 1, 31) +ORDER BY bucket_idx; + bucket_idx | lower_ns | upper_ns | label +------------+---------------+----------+--------- + 0 | 0 | 1024 | <1us + 1 | 1024 | 2048 | 1-2us + 31 | 1099511627776 | | >=1024s +(3 rows) + +-- Enable stats capture and generate a deterministic wait: pg_sleep emits a +-- Timeout / PgSleep wait of ~0.1s. (In a stub build the SET errors and the +-- SRF stays empty; that is the documented difference between the two +-- expected files.) +SET wait_event_capture = stats; +SELECT pg_sleep(0.1); + pg_sleep +---------- + +(1 row) + +-- PgSleep must now be recorded for this backend, with the per-event +-- invariants holding. We print only booleans so the output is stable. +SELECT calls >= 1 AS calls_ok, + calls = (SELECT sum(h) FROM unnest(histogram) AS h) AS hist_sum_eq_calls, + total_time_ms > 0 AS total_positive, + max_time_us > 0 AS max_positive, + array_length(histogram, 1) + = (SELECT count(*)::int FROM pg_wait_event_timing_histogram_buckets) + AS histogram_len_ok +FROM pg_stat_get_wait_event_timing(pg_backend_pid()) +WHERE wait_event = 'PgSleep'; + calls_ok | hist_sum_eq_calls | total_positive | max_positive | histogram_len_ok +----------+-------------------+----------------+--------------+------------------ + t | t | t | t | t +(1 row) + +-- The view surfaces the same row (type/name only; durations omitted). +SELECT wait_event_type, wait_event +FROM pg_stat_wait_event_timing +WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep'; + wait_event_type | wait_event +-----------------+------------ + Timeout | PgSleep +(1 row) + +-- A non-NULL pid that does not exist yields no rows (silent, not an error). +SELECT count(*) AS rows_for_bogus_pid +FROM pg_stat_get_wait_event_timing(-1); + rows_for_bogus_pid +-------------------- + 0 +(1 row) + +RESET wait_event_capture; diff --git a/src/test/regress/expected/wait_event_timing_1.out b/src/test/regress/expected/wait_event_timing_1.out new file mode 100644 index 00000000000..038a8565657 --- /dev/null +++ b/src/test/regress/expected/wait_event_timing_1.out @@ -0,0 +1,85 @@ +-- +-- WAIT_EVENT_TIMING +-- +-- Exercises the wait_event_capture = stats instrumentation: the GUC, the +-- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view, +-- and the pg_wait_event_timing_histogram_buckets taxonomy view. +-- +-- Two expected outputs are maintained: +-- wait_event_timing.out -- --enable-wait-event-timing builds +-- wait_event_timing_1.out -- builds without the option (stub) +-- The difference is whether SET wait_event_capture = stats succeeds and +-- whether the SRF records anything; durations are never printed, so the +-- timing-build output is deterministic. +-- +-- Default is off. +SHOW wait_event_capture; + wait_event_capture +-------------------- + off +(1 row) + +-- The taxonomy view is pure SQL and identical in both build configs. +SELECT count(*) AS buckets FROM pg_wait_event_timing_histogram_buckets; + buckets +--------- + 32 +(1 row) + +SELECT bucket_idx, lower_ns, upper_ns, label +FROM pg_wait_event_timing_histogram_buckets +WHERE bucket_idx IN (0, 1, 31) +ORDER BY bucket_idx; + bucket_idx | lower_ns | upper_ns | label +------------+---------------+----------+--------- + 0 | 0 | 1024 | <1us + 1 | 1024 | 2048 | 1-2us + 31 | 1099511627776 | | >=1024s +(3 rows) + +-- Enable stats capture and generate a deterministic wait: pg_sleep emits a +-- Timeout / PgSleep wait of ~0.1s. (In a stub build the SET errors and the +-- SRF stays empty; that is the documented difference between the two +-- expected files.) +SET wait_event_capture = stats; +ERROR: invalid value for parameter "wait_event_capture": "stats" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SELECT pg_sleep(0.1); + pg_sleep +---------- + +(1 row) + +-- PgSleep must now be recorded for this backend, with the per-event +-- invariants holding. We print only booleans so the output is stable. +SELECT calls >= 1 AS calls_ok, + calls = (SELECT sum(h) FROM unnest(histogram) AS h) AS hist_sum_eq_calls, + total_time_ms > 0 AS total_positive, + max_time_us > 0 AS max_positive, + array_length(histogram, 1) + = (SELECT count(*)::int FROM pg_wait_event_timing_histogram_buckets) + AS histogram_len_ok +FROM pg_stat_get_wait_event_timing(pg_backend_pid()) +WHERE wait_event = 'PgSleep'; + calls_ok | hist_sum_eq_calls | total_positive | max_positive | histogram_len_ok +----------+-------------------+----------------+--------------+------------------ +(0 rows) + +-- The view surfaces the same row (type/name only; durations omitted). +SELECT wait_event_type, wait_event +FROM pg_stat_wait_event_timing +WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep'; + wait_event_type | wait_event +-----------------+------------ +(0 rows) + +-- A non-NULL pid that does not exist yields no rows (silent, not an error). +SELECT count(*) AS rows_for_bogus_pid +FROM pg_stat_get_wait_event_timing(-1); + rows_for_bogus_pid +-------------------- + 0 +(1 row) + +RESET wait_event_capture; diff --git a/src/test/regress/parallel_schedule b/src/test/regress/parallel_schedule index 8fa0a6c47fb..b5e6712a1e8 100644 --- a/src/test/regress/parallel_schedule +++ b/src/test/regress/parallel_schedule @@ -130,6 +130,10 @@ test: partition_merge partition_split partition_join partition_prune reloptions # ---------- test: compression compression_lz4 compression_pglz cluster +# wait_event_timing reads only the calling backend's own stats, so it is +# safe on its own line; pg_sleep keeps its runtime small. +test: wait_event_timing + # event_trigger depends on create_am and cannot run concurrently with # any test that runs DDL # oidjoins is read-only, though, and should run late for best coverage diff --git a/src/test/regress/sql/wait_event_timing.sql b/src/test/regress/sql/wait_event_timing.sql new file mode 100644 index 00000000000..da15dbbe495 --- /dev/null +++ b/src/test/regress/sql/wait_event_timing.sql @@ -0,0 +1,54 @@ +-- +-- WAIT_EVENT_TIMING +-- +-- Exercises the wait_event_capture = stats instrumentation: the GUC, the +-- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view, +-- and the pg_wait_event_timing_histogram_buckets taxonomy view. +-- +-- Two expected outputs are maintained: +-- wait_event_timing.out -- --enable-wait-event-timing builds +-- wait_event_timing_1.out -- builds without the option (stub) +-- The difference is whether SET wait_event_capture = stats succeeds and +-- whether the SRF records anything; durations are never printed, so the +-- timing-build output is deterministic. +-- + +-- Default is off. +SHOW wait_event_capture; + +-- The taxonomy view is pure SQL and identical in both build configs. +SELECT count(*) AS buckets FROM pg_wait_event_timing_histogram_buckets; +SELECT bucket_idx, lower_ns, upper_ns, label +FROM pg_wait_event_timing_histogram_buckets +WHERE bucket_idx IN (0, 1, 31) +ORDER BY bucket_idx; + +-- Enable stats capture and generate a deterministic wait: pg_sleep emits a +-- Timeout / PgSleep wait of ~0.1s. (In a stub build the SET errors and the +-- SRF stays empty; that is the documented difference between the two +-- expected files.) +SET wait_event_capture = stats; +SELECT pg_sleep(0.1); + +-- PgSleep must now be recorded for this backend, with the per-event +-- invariants holding. We print only booleans so the output is stable. +SELECT calls >= 1 AS calls_ok, + calls = (SELECT sum(h) FROM unnest(histogram) AS h) AS hist_sum_eq_calls, + total_time_ms > 0 AS total_positive, + max_time_us > 0 AS max_positive, + array_length(histogram, 1) + = (SELECT count(*)::int FROM pg_wait_event_timing_histogram_buckets) + AS histogram_len_ok +FROM pg_stat_get_wait_event_timing(pg_backend_pid()) +WHERE wait_event = 'PgSleep'; + +-- The view surfaces the same row (type/name only; durations omitted). +SELECT wait_event_type, wait_event +FROM pg_stat_wait_event_timing +WHERE pid = pg_backend_pid() AND wait_event = 'PgSleep'; + +-- A non-NULL pid that does not exist yields no rows (silent, not an error). +SELECT count(*) AS rows_for_bogus_pid +FROM pg_stat_get_wait_event_timing(-1); + +RESET wait_event_capture; diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck index 785d6f867ad..eda4599de70 100755 --- a/src/tools/pginclude/headerscheck +++ b/src/tools/pginclude/headerscheck @@ -125,6 +125,8 @@ do test "$f" = src/include/nodes/nodetags.h && continue test "$f" = src/backend/nodes/nodetags.h && continue test "$f" = src/include/storage/checksum_block_internal.h && continue + test "$f" = src/include/utils/wait_event_timing_data.h && continue + test "$f" = src/backend/utils/wait_event_timing_data.h && continue # These files are not meant to be included standalone, because # they contain lists that might have multiple use-cases. diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 67eb7ea00a9..7902b8949b9 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1612,6 +1612,8 @@ LWLock LWLockHandle LWLockMode LWLockPadded +LWLockTimingHash +LWLockTimingHashEntry LWLockTrancheShmemData LZ4F_compressionContext_t LZ4F_decompressOptions_t @@ -3426,6 +3428,8 @@ WaitEventIO WaitEventIPC WaitEventSet WaitEventTimeout +WaitEventTimingEntry +WaitEventTimingState WaitLSNProcInfo WaitLSNResult WaitLSNState -- 2.43.0