From a0305d3242b854b3a22ded5d66bbf3dba0bf943c Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Thu, 11 Jun 2026 15:10:36 +0000 Subject: [PATCH v1 6/7] wait_event_timing: add trace level with a per-session ring buffer Add the third capture level, wait_event_capture = trace. On top of the STATS aggregates, every completed wait is pushed into a per-session ring buffer in DSA -- one record per wait -- allocated lazily on first use so only sessions that enable trace pay the (default 4 MB) per-ring cost. The ring is exposed two ways: - pg_get_backend_wait_event_trace() / the pg_backend_wait_event_trace view read the calling backend's own ring; - pg_get_wait_event_trace(procnumber) reads any backend's ring cross-process, including rings left behind by exited backends. A backend's ring is not freed when it exits: the slot transitions to ORPHANED and the ring stays in DSA so cross-backend consumers can read the dying backend's final waits (important for short-lived parallel workers). An orphan is reclaimed when a new backend reuses the same procNumber (clear-on-init) or by pg_stat_clear_orphaned_wait_event_rings() (execution revoked from PUBLIC; delegable with GRANT). Slot transitions (FREE/OWNED/ORPHANED) are serialised by the WaitEventTraceControl lock; the ring size is fixed cluster-wide at server start by wait_event_trace_ring_size (power of two, default 4 MB). The single-writer hot path writes each record under a seqlock (odd seq while writing, even when complete). Cross-backend readers use a POSITION-ENCODED IDENTITY seqlock -- a record at ring index i is valid only if its seq equals the writer's complete value for that exact position -- which rejects stale previous-cycle reads that a parity-only seqlock would accept under cross-process visibility lag. A TAP test drives an injection point between the writer's write_pos advance and its seq stamp to prove exactly that: with the ring wrapped and the writer wedged mid-record, a cross-backend read returns ring_size - 1 records, skipping the in-flight slot whose stale prior-cycle record a parity-only check would have emitted. The trace ring writer carries the same teardown discipline as the stats path: a wait_event_trace_writes_disabled gate (raised around slot transitions and at proc_exit) plus in_attach/in_release re-entrancy guards keep the DSA-internal LWLock waits that those operations emit from recursing into a ring that is being freed or orphaned. A further re-entrancy guard on the record writer itself keeps wait events emitted mid-record-write (reachable only via the injection point; the write is plain stores in production) from recursing into the writer. Privileges: both trace SRFs and the session-local view are REVOKE'd from PUBLIC and GRANT'ed to pg_read_all_stats (reading a session's trace exposes its wait sequence). Query-attribution markers and their executor/protocol hooks are added in the next commit; this commit records wait events only. --- doc/src/sgml/config.sgml | 37 +- doc/src/sgml/monitoring.sgml | 301 ++- src/backend/catalog/system_views.sql | 24 + src/backend/postmaster/auxprocess.c | 11 + .../utils/activity/wait_event_names.txt | 1 + .../utils/activity/wait_event_timing.c | 1966 ++++++++++++++++- src/backend/utils/init/postinit.c | 11 + src/backend/utils/misc/guc_parameters.dat | 11 + src/backend/utils/misc/postgresql.conf.sample | 4 +- src/include/catalog/pg_proc.dat | 26 + src/include/storage/lwlocklist.h | 1 + src/include/storage/subsystemlist.h | 1 + src/include/utils/guc_hooks.h | 1 + src/include/utils/wait_event_timing.h | 190 +- src/test/modules/test_misc/meson.build | 1 + .../t/015_wait_event_trace_seqlock.pl | 122 + src/test/regress/expected/rules.out | 7 + .../regress/expected/wait_event_timing.out | 46 +- .../regress/expected/wait_event_timing_1.out | 49 +- src/test/regress/sql/wait_event_timing.sql | 31 +- src/tools/pgindent/typedefs.list | 6 + 21 files changed, 2743 insertions(+), 104 deletions(-) create mode 100644 src/test/modules/test_misc/t/015_wait_event_trace_seqlock.pl diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 15131fd9b7f..734310935ed 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -9148,7 +9148,9 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; Controls collection of wait event timing instrumentation. Requires the server to be compiled with . Possible values are - off (the default) and stats. + off (the default), stats, and + trace; each level is a strict superset of the + previous one. At stats, the server records per-backend wait @@ -9162,6 +9164,16 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; wait_event_capture is off the hot path adds only a single predictable branch. + + At trace, the server additionally records every + individual wait event into a per-session ring buffer exposed via the + + pg_backend_wait_event_trace view. + The ring is allocated lazily from dynamic shared memory on first + use (default 4 MB per backend; see + ), so only + sessions that enable trace pay the per-ring memory cost. + Only superusers and users with the appropriate SET privilege can change this setting. Read access to the resulting @@ -9203,6 +9215,29 @@ GRANT SET ON PARAMETER wait_event_capture TO pg_monitor; + + wait_event_trace_ring_size (integer) + + wait_event_trace_ring_size configuration parameter + + + + + Sets the size of the per-backend ring buffer used at the + trace level of + . Each backend that enables + trace allocates a ring of this size from a cluster-wide dynamic + shared memory area. The value must be a power of two; larger rings + retain a longer history of wait events before wrapping. If this + parameter is specified without units, it is taken as kilobytes. The + default is 4096 (4 MB). This parameter has + no effect on builds compiled without + , and can only be set at + server start. + + + + track_functions (enum) diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 34676e43eae..2f1fc457d00 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -579,6 +579,15 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser + + pg_backend_wait_event_tracepg_backend_wait_event_trace + Individual wait event records from the current backend's trace + ring buffer. See + + pg_backend_wait_event_trace for details. + + + @@ -3859,7 +3868,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage The pg_stat_wait_event_timing view contains one row for each combination of backend and wait event that has a non-zero call count. It shows accumulated timing statistics collected when - is set to stats. Requires the server to be compiled with + is set to stats + or trace. Requires the server to be compiled with . @@ -4225,6 +4235,200 @@ ORDER BY b.bucket_idx; + + <structname>pg_backend_wait_event_trace</structname> + + + pg_backend_wait_event_trace + + + + The pg_backend_wait_event_trace view shows + individual wait event records from the current backend's + trace ring buffer. Each record captures a single wait event + (with timestamp and duration). + Requires to be set to + trace. The ring buffer is sized by + (default 4 MB = + 131072 records of 32 bytes each); older records are overwritten in + FIFO order. The view is session-local + and analogous in scope to + + pg_backend_memory_contexts; querying it + from a superuser session still returns only that session's own + records, never another backend's. + + + + The pg_backend_wait_event_trace view is + intended for session-local interactive diagnostics: + running ad-hoc SELECT queries against your own + session's trace from psql while + investigating wait-event behaviour. The view materialises up to + one ring's worth of records (default ~4 MB, controlled by + ) into a + tuplestore on each call, which is bounded and acceptable for that + use; for narrow result sets, append + ORDER BY seq DESC LIMIT N + to get the most recent records. + + + + Cross-backend monitoring tools — extensions and background + workers that read wait events losslessly from every backend's + ring — should not consume through this + view. The in-tree cross-backend reader is + pg_get_wait_event_trace + (see ); the underlying + per-session SQL function returns only the calling backend's own + ring, so a background worker invoking + SELECT * FROM pg_backend_wait_event_trace via + SPI would receive only its own (typically empty) ring, not the + target backend's data. External tools that need cross-backend + access without going through SQL use the shared-memory snapshot + pattern documented on + WaitEventTraceControl in + src/include/utils/wait_event_timing.h: + snapshot trace_slots[procNumber].generation, + acquire WaitEventTraceCtl->lock in + LW_SHARED, resolve the target slot's + ring_ptr via + dsa_get_address, snapshot the relevant slice + of the ring into local memory, release the lock, re-snapshot + generation and discard the read if it + changed, then process the snapshot off the lock. That bypasses + this view entirely and is the supported cross-backend interface + for monitoring extensions. + + + + Slot lifecycle. Per-backend trace rings are + not freed when their owner backend exits. The ring stays + allocated in shared memory in an orphaned state + so the dying backend's final waits remain readable by the + cross-backend interface — + pg_get_wait_event_trace + (see ) for in-tree + access, or external background workers that follow the + snapshot pattern documented above. + This does not change the behaviour of this view, + which always reads the calling backend's own ring and is + unaffected by orphan-state slots belonging to other + procnumbers. The lifecycle change matters for short-lived + backends that exit before any monitoring tool has read their + data: parallel workers in particular exit in milliseconds at + end-of-parallel-query, well below typical reader polling + intervals, and without orphan-persistence their final waits + would be lost. Orphaned rings are reclaimed automatically when a new + backend takes over the same procNumber + slot, and the DBA can force a sweep at any time via + pg_stat_clear_orphaned_wait_event_rings. + The worst-case orphan-memory footprint is bounded by the slot + count times ~4 MB; see + pg_stat_clear_orphaned_wait_event_rings + under for details and + the deployment patterns where the function is most useful. + + + + The ring buffer is designed as a lock-free transport mechanism for + external consumption. At high wait event rates (e.g., 220K events/sec), + the ring wraps in roughly 0.5–1 seconds. Consumers should poll + the ring buffer before it wraps. + + + + The seq column is the absolute write + position of each record; it is monotonically increasing and never + resets while the ring is alive. A consumer polling the ring + repeatedly can detect wraparound losses by tracking + max(seq) between successive scrapes: given two + consecutive polls returning N2 rows with + maximum seq values + S1 (previous poll) and + S2 (current poll), the number of records + overwritten before the second poll could read them is + max(0, (S2 - S1) - N2). No separate + trace overflow counter is exposed because this + information is exact and derivable from seq + alone. + + + + + <structname>pg_backend_wait_event_trace</structname> View + + + + + Column Type + + + Description + + + + + + + + seq bigint + + + Sequence number of this record in the ring buffer + + + + + + timestamp_ns bigint + + + Monotonic clock timestamp in nanoseconds + + + + + + wait_event_type text + + + Wait event type + + + + + + wait_event text + + + Wait event name + + + + + + duration_us double precision + + + Wait duration in microseconds + + + + + + query_id bigint + + + Reserved; always 0 for wait event records + + + + + +
+
+ <structname>pg_stat_database</structname> @@ -6332,6 +6536,101 @@ ORDER BY b.bucket_idx; + + + + pg_get_wait_event_trace + + pg_get_wait_event_trace ( procnumber integer ) + setof record + + + Returns individual wait event records from the trace ring of + the backend that currently or previously occupied the slot + identified by procnumber. Reads slots + in OWNED state (live writer) and + ORPHANED state (writer has exited but the + ring is preserved for post-mortem reading) uniformly. An + empty result indicates the slot is in FREE + state (no ring) or no records have been written. Concurrent + slot transitions cannot interrupt the read because the + function holds the cross-backend trace lock in + SHARED mode throughout the iteration; the + per-record seqlock protocol skips any record being written + by a concurrent live writer. + + + This is the canonical cross-backend reader. External + monitoring extensions that need cross-backend access without + going through SQL should follow the same snapshot pattern + documented on WaitEventTraceControl + in src/include/utils/wait_event_timing.h; + this function serves as both the reference implementation and + a DBA-facing diagnostic tool. The + procnumber argument can be obtained + from the procnumber column of + pg_stat_get_wait_event_timing or + pg_stat_get_wait_event_timing_overflow + for live backends. For post-mortem reads of short-lived + backends (parallel workers, autovacuum, walsender) the + procnumber must be captured while the + backend is still alive, or discovered by iterating slots in a + monitoring background worker. A pid-keyed lookup for live + backends only is one query away: + + +SELECT * FROM pg_get_wait_event_trace( + (SELECT procnumber FROM pg_stat_get_wait_event_timing(target_pid) + WHERE pid = target_pid LIMIT 1)); + + + + Requires membership in pg_read_all_stats + (matching the privilege model of the session-local view + pg_backend_wait_event_trace). + + + + + + + pg_stat_clear_orphaned_wait_event_rings + + pg_stat_clear_orphaned_wait_event_rings () + bigint + + + Frees every wait-event-trace ring whose owner backend has + exited. Returns the number of rings released. Requires + superuser. + + + When a backend that had wait_event_capture = + trace exits, its ~4 MB trace ring is + intentionally not freed at exit so that + cross-backend consumers + (pg_get_wait_event_trace and extensions + following the snapshot pattern) can still read the dying + backend's final waits. The + memory is reclaimed lazily: in the common case, the ring is + freed automatically when a new backend takes over the same + procNumber slot. This function is the + explicit DBA-driven sweep for the pathological case where + capture was briefly enabled, then disabled, on a cluster with + long-lived pooled connections that never recycle the + procNumber. The maximum amount of memory + this can release is bounded by the slot count times the + per-ring size (~400 MB at max_connections + = 100, ~4 GB at 1000); on most deployments the function will + report 0 because connection churn already drained orphans + naturally. + + + Safe to call when capture is currently off + and even when no orphans exist (returns 0 in both cases). + + + diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 96aba833895..ccb2fb51236 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1643,7 +1643,31 @@ CREATE VIEW pg_stat_wait_event_timing_overflow AS REVOKE ALL ON pg_stat_wait_event_timing_overflow FROM PUBLIC; GRANT SELECT ON pg_stat_wait_event_timing_overflow TO pg_read_all_stats; +-- Per-session wait event trace ring, one record per completed wait +-- (wait_event_capture = trace). +-- Reading a session's trace exposes its query_id and wait sequence, which can +-- leak across SECURITY DEFINER call chains, so the view AND both underlying +-- SRFs are locked to pg_read_all_stats. +CREATE VIEW pg_backend_wait_event_trace AS + SELECT + t.seq, + t.timestamp_ns, + t.wait_event_type, + t.wait_event, + t.duration_us, + t.query_id + FROM pg_get_backend_wait_event_trace() t; +REVOKE ALL ON pg_backend_wait_event_trace FROM PUBLIC; +GRANT SELECT ON pg_backend_wait_event_trace TO pg_read_all_stats; +-- Revoke the session-local SRF itself, not just the view, so a role that can +-- enable trace cannot read its own ring via the function and bypass the view. +REVOKE EXECUTE ON FUNCTION pg_get_backend_wait_event_trace() FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_backend_wait_event_trace() TO pg_read_all_stats; +-- Cross-backend reader, keyed by procnumber (reads OWNED and ORPHANED slots). +REVOKE EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) FROM PUBLIC; +GRANT EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) TO pg_read_all_stats; -- Cluster-scope operations: revoked from PUBLIC (administrators can -- delegate with GRANT EXECUTE); not granted to pg_read_all_stats because -- they mutate state rather than read it. REVOKE EXECUTE ON FUNCTION pg_stat_reset_wait_event_timing_all() FROM PUBLIC; +REVOKE EXECUTE ON FUNCTION pg_stat_clear_orphaned_wait_event_rings() FROM PUBLIC; diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c index ad4bf4bd2a8..6575dec4f00 100644 --- a/src/backend/postmaster/auxprocess.c +++ b/src/backend/postmaster/auxprocess.c @@ -26,6 +26,7 @@ #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/wait_event.h" +#include "utils/wait_event_timing.h" static void ShutdownAuxiliaryProcess(int code, Datum arg); @@ -113,6 +114,16 @@ AuxiliaryProcessMainCommon(void) */ CreateAuxProcessResourceOwner(); +#ifdef USE_WAIT_EVENT_TIMING + + /* + * Attach trace ring if wait_event_capture = trace was set via + * postgresql.conf + */ + if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0) + wait_event_trace_attach(my_trace_proc_number); +#endif + /* Initialize backend status information */ pgstat_beinit(); diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 0a06cd68604..fe3fac660c9 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -418,6 +418,7 @@ ParallelVacuumDSA "Waiting for parallel vacuum dynamic shared memory allocation. AioUringCompletion "Waiting for another process to complete IO via io_uring." ShmemIndex "Waiting to find or allocate space in shared memory." WaitEventTimingDSA "Waiting for wait event timing dynamic shared memory allocation." +WaitEventTraceDSA "Waiting for wait event trace dynamic shared memory allocation." # No "ABI_compatibility" region here as WaitEventLWLock has its own C code. diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c index e64919271a1..993bd303fba 100644 --- a/src/backend/utils/activity/wait_event_timing.c +++ b/src/backend/utils/activity/wait_event_timing.c @@ -15,11 +15,16 @@ * attach on their first wait event under capture; a before_shmem_exit gate * keeps the hot path away from DSA mappings that proc_exit has torn down. * - * Controlled by the wait_event_capture GUC (off | stats, default off) and - * the compile-time option --enable-wait-event-timing. In builds without - * that option the file still compiles (the GUC backing variable, the enum - * table, the rejecting check hook, and empty-result SQL stubs), so the GUC - * and the catalog functions exist uniformly. + * At the trace level, every completed wait (and a set of query-attribution + * markers) is additionally pushed into a per-session DSA ring buffer that + * survives backend exit for post-mortem reading; see the trace section + * below and the lifecycle discussion on WaitEventTraceControl. + * + * Controlled by the wait_event_capture GUC (off | stats | trace, default + * off) and the compile-time option --enable-wait-event-timing. In builds + * without that option the file still compiles (the GUC backing variables, + * the enum table, the rejecting check hook, and empty-result SQL stubs), + * so the GUC and the catalog functions exist uniformly. * * Copyright (c) 2026, PostgreSQL Global Development Group * @@ -40,17 +45,46 @@ */ int wait_event_capture = WAIT_EVENT_CAPTURE_OFF; int wait_event_timing_max_tranches = 192; +int wait_event_trace_ring_size = 4096; + +/* + * Records-per-ring derived from wait_event_trace_ring_size at server + * start. Read by the writer (via the per-ring cached mask) and by the ring + * allocator. Zero until the GUC framework commits the boot value. + */ +uint32 WaitEventTraceRingSize = 0; /* * Enum value table consumed by guc.c. Order matches the - * WaitEventCaptureLevel enum and the documented "off < stats" ordering. + * WaitEventCaptureLevel enum and the documented "off < stats < trace". */ const struct config_enum_entry wait_event_capture_options[] = { {"off", WAIT_EVENT_CAPTURE_OFF, false}, {"stats", WAIT_EVENT_CAPTURE_STATS, false}, + {"trace", WAIT_EVENT_CAPTURE_TRACE, false}, {NULL, 0, false} }; +/* + * GUC check hook for wait_event_trace_ring_size. The ring size in records + * must be a power of two for the writer's mask-indexing (pos & ring_mask). + * Each record is 32 bytes, so kb is a power of two iff the record count is. + * Defined for both build configurations so the GUC framework validates it + * uniformly. + */ +bool +check_wait_event_trace_ring_size(int *newval, void **extra, GucSource source) +{ + int v = *newval; + + if (v <= 0 || (v & (v - 1)) != 0) + { + GUC_check_errdetail("wait_event_trace_ring_size must be a positive power of two."); + return false; + } + return true; +} + #ifndef USE_WAIT_EVENT_TIMING /* @@ -64,6 +98,9 @@ Datum pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS); Datum pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS); Datum pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS); Datum pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS); +Datum pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS); +Datum pg_get_wait_event_trace(PG_FUNCTION_ARGS); +Datum pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS); Datum pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS) @@ -79,6 +116,27 @@ pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +Datum +pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS) +{ + InitMaterializedSRF(fcinfo, 0); + PG_RETURN_VOID(); +} + +Datum +pg_get_wait_event_trace(PG_FUNCTION_ARGS) +{ + InitMaterializedSRF(fcinfo, 0); + PG_RETURN_VOID(); +} + +Datum +pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS) +{ + /* No trace infrastructure in this build, so never any orphans. */ + PG_RETURN_INT64(0); +} + Datum pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS) { @@ -134,9 +192,11 @@ assign_wait_event_capture(int newval, void *extra) /* No shared memory is reserved in the stub build. */ const ShmemCallbacks WaitEventTimingShmemCallbacks = {0}; +const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {0}; /* Defined so every extern in wait_event_timing.h resolves in stub builds. */ WaitEventTimingState *my_wait_event_timing = NULL; +int my_trace_proc_number = -1; void pgstat_set_wait_event_timing_storage(int procNumber) @@ -148,6 +208,35 @@ pgstat_reset_wait_event_timing_storage(void) { } +void +wait_event_trace_attach(int procNumber) +{ +} + +/* + * Stub trace-marker entry points -- declared unconditionally in the header + * so call sites in execMain.c/postgres.c/backend_status.c need no #ifdef. + */ +void +wait_event_trace_query_start(int64 query_id) +{ +} + +void +wait_event_trace_query_end(int64 query_id) +{ +} + +void +wait_event_trace_exec_start(int64 query_id) +{ +} + +void +wait_event_trace_exec_end(int64 query_id) +{ +} + #else /* USE_WAIT_EVENT_TIMING */ #include "catalog/pg_authid.h" @@ -166,6 +255,7 @@ pgstat_reset_wait_event_timing_storage(void) #include "utils/backend_status.h" #include "utils/builtins.h" #include "utils/dsa.h" +#include "utils/injection_point.h" #include "utils/tuplestore.h" #include "utils/wait_event.h" @@ -232,6 +322,107 @@ static Size wait_event_timing_per_backend_stride = 0; static int wait_event_timing_hash_size = 0; static int wait_event_timing_max_entries = 0; +/* ---- Trace-level (wait_event_capture = trace) state ---- */ +/* Pointer to this backend's trace ring buffer */ +static WaitEventTraceState *my_wait_event_trace = NULL; + +/* DSA-based trace ring buffer control */ +static WaitEventTraceControl *WaitEventTraceCtl = NULL; +static dsa_area *trace_dsa = NULL; +int my_trace_proc_number = -1; + +/* + * Same-backend coordination between pg_get_backend_wait_event_trace (the + * own-session SRF reader) and wait_event_trace_release_slot (the GUC + * step-down path that frees this backend's ring). Both paths run in this + * same backend, single-threaded, so a plain bool is sufficient -- no + * atomics needed. + * + * srf_in_progress set true while the SRF is iterating the ring; the + * release path observes this and defers the dsa_free + * instead of yanking the chunk out from under us. + * + * release_pending set by the release path when it had to defer; the + * SRF's PG_FINALLY checks it and performs the deferred + * dsa_free after the iteration completes. + * + * Cross-backend readers (extensions, bgworkers reading another backend's + * ring) cannot use this mechanism -- they coordinate with the release + * path via WaitEventTraceCtl->lock instead. See the header for the + * recommended snapshot-under-lock pattern for those consumers. + */ +static bool wait_event_trace_srf_in_progress = false; +static bool wait_event_trace_release_pending = false; + +/* + * Per-backend gate that disables the trace-ring writer in the wait- + * event hot path while a slot-state transition is in progress. + * + * Set true around code paths that either free the local trace ring + * (wait_event_trace_release_slot's dsa_free) or transition the slot + * out of OWNED (wait_event_trace_before_shmem_exit's OWNED -> + * ORPHANED publish). In both cases an internal LWLock inside + * dsa_free / dsa_attach / dsa_pin_mapping / dsa_pin can in + * principle contend long enough to dispatch a wait event; that + * wait event's pgstat_report_wait_end_timing inline path runs in + * the SAME backend, sees capture_level == TRACE (the GUC hasn't + * been committed yet by the time the assign hook runs), and would: + * + * * during release_slot's dsa_free: write into a ring that has + * already been returned to the DSA freelist -- if another + * allocator has since reused the chunk, this is a stray write + * into someone else's allocation. + * + * * during release_slot's dsa_free, alternative timing: see + * my_wait_event_trace == NULL on a naive "clear before free" + * fix and recurse into wait_event_trace_attach, which would + * either deadlock on the WaitEventTraceCtl->lock the outer + * release_slot already holds, or (on a lock-free moment) + * allocate a fresh ring that the outer release_slot would + * then free again as part of its post-acquire DsaPointerIsValid + * check -- a different use-after-free of a freshly-allocated + * chunk. + * + * * during before_shmem_exit: write into the ring after the slot + * has been published as ORPHANED, violating the post-mortem + * read-only contract that cross-backend readers rely on. + * + * The flag is per-backend (static at file scope means per-process + * in PG's process-per-backend model), so the hot path's check is a + * single cache-warm load and a branch; no atomic, no fence. The + * trace branch is already gated by capture_level == TRACE so the + * additional check costs nothing in the common case where capture + * is off or stats-only. The flag is set on the very same backend + * that may later read it from the hot path, so there is no + * cross-process visibility concern. + * + * See the release_slot and before_shmem_exit doc comments for the + * specific transition each uses this flag around. + */ +static bool wait_event_trace_writes_disabled = false; + +/* + * Re-entrancy guard for the trace-record writer in + * pgstat_report_wait_end_timing(): wait events emitted while a trace + * record is mid-write are not themselves ring-recorded. In production the + * record write is a handful of plain stores and cannot wait, so this never + * fires; it exists for the injection point inside the writer, whose wait + * machinery emits nested wait events that would otherwise recurse into the + * writer (and, by re-running the injection point, self-deadlock on locks + * the outer invocation holds). Per-backend, single-threaded -- a plain + * bool suffices. + */ +static bool wait_event_trace_in_write = false; + +/* + * Forward declarations for trace functions referenced before their + * definitions (the storage/assign hooks call them; the bodies are in the + * trace machinery section below). + */ +static void wait_event_trace_detach(int procNumber); +static void wait_event_trace_release_slot(int procNumber); +static void wait_event_trace_clear_orphan_at_init(int procNumber); + /* * Mapping arrays for the flat events[] array, generated from * wait_event_names.txt by generate-wait_event_types.pl. Defines @@ -747,17 +938,39 @@ void pgstat_set_wait_event_timing_storage(int procNumber) { my_wait_event_timing = NULL; + my_wait_event_trace = NULL; + + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) + { + my_trace_proc_number = -1; + return; + } + + my_trace_proc_number = procNumber; + + /* + * If the previous occupant of this procNumber left an ORPHANED trace ring + * behind (we do not free trace rings at backend exit -- see + * WaitEventTraceControl), free it now so this backend starts with a clean + * FREE slot. + */ + wait_event_trace_clear_orphan_at_init(procNumber); } /* - * Detach from the timing slot on backend exit. The slot stays in DSA; - * clearing the local pointer keeps the late-shutdown hot path from touching + * Detach from the timing slot on backend exit. The slots stay in DSA; + * clearing the local pointers keeps the late-shutdown hot path from touching * already-detached memory. */ void pgstat_reset_wait_event_timing_storage(void) { + if (my_trace_proc_number >= 0) + wait_event_trace_detach(my_trace_proc_number); + my_wait_event_timing = NULL; + my_wait_event_trace = NULL; + my_trace_proc_number = -1; } /* @@ -787,118 +1000,898 @@ assign_wait_event_capture(int newval, void *extra) INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start); my_wait_event_timing->current_event = 0; } + + /* + * Stepping down from TRACE: release the ~4 MB DSA ring now rather than + * holding it pinned for the rest of the session. Only fires when a ring + * is actually attached, so OFF -> TRACE -> OFF without ever emitting a + * trace record stays a no-op. Re-enable re-allocates a fresh ring on the + * first wait event via wait_event_trace_attach. dsa_free is non-raising + * LWLock bookkeeping, so it is safe from an assign hook. + */ + if (newval != WAIT_EVENT_CAPTURE_TRACE && my_wait_event_trace != NULL) + wait_event_trace_release_slot(my_trace_proc_number); } +/* ================= Trace-level ring machinery ================= */ + + /* - * Out-of-line body for pgstat_report_wait_start()'s timing path. Records - * the start timestamp and the event being waited on. Reached only when - * wait_event_capture != OFF. + * Write a trace ring marker record. Shared helper for all marker types. */ -void -pgstat_report_wait_start_timing(uint32 wait_event_info) +static void +wait_event_trace_write_marker(uint8 record_type, int64 query_id) { + uint64 pos; + WaitEventTraceRecord *rec; + uint32 seq; + instr_time now; + /* - * Stay out of the timing path once proc_exit has begun tearing down DSA - * mappings (see the before_shmem_exit callback). + * Single capture-level gate: markers only land in the ring when + * wait_event_capture is at TRACE. This guarantees consistency with the + * wait-event hot path (also gated on the same level) -- there is no + * configuration in which one half of the trace fires and the other + * doesn't. query_id == 0 means "no query ID available" (utility command + * or compute_query_id = off), which we skip. + * + * wait_event_trace_writes_disabled is the same per-backend gate the + * wait-event hot path uses; it is raised by release_slot and + * before_shmem_exit around slot-state transitions to keep both writers + * consistent. Markers cannot fire during those transitions today + * (single-threaded execution, no nested executor), but checking here + * keeps the contract uniform across all trace-ring writers and is robust + * to future code paths that might invoke a marker from a nested context. + * + * No likely()/unlikely() annotation: this function is called at + * query/exec boundaries (a handful per query, not per wait event), so + * neither side of the branch dominates often enough for static layout to + * matter, and the meaningful production configuration (wait_event_capture + * = trace) is exactly when the body is hot -- an annotation on the + * early-return would point the wrong way. */ - if (wait_event_timing_writes_disabled) + if (wait_event_capture != WAIT_EVENT_CAPTURE_TRACE || + wait_event_trace_writes_disabled || + query_id == 0) return; - if (my_wait_event_timing == NULL) + /* + * Lazy attach on first use. Allocation lives here (not in the assign + * hook) because dsa_allocate_extended() can ereport(ERROR) on OOM, which + * is forbidden in assign-hook context but legitimate here. Idempotent: + * wait_event_trace_attach() short-circuits on subsequent calls. + */ + if (my_wait_event_trace == NULL) { - pgstat_wait_event_timing_lazy_attach(); + if (my_trace_proc_number < 0) + return; + wait_event_trace_attach(my_trace_proc_number); + if (my_wait_event_trace == NULL) + return; /* attach path unable to allocate */ + } - /* - * lazy_attach can dispatch nested wait events while it sets up DSA - * (dsa_attach takes an internal LWLock); those nested wait_end calls - * clear my_wait_event_info to 0. Re-publish so the outer wait stays - * visible in pg_stat_activity. Only needed on the first-attach path. - */ - *(volatile uint32 *) my_wait_event_info = wait_event_info; + /* + * Claim the next slot. Single-writer counter (only the owning backend + * writes its own ring), so a plain read+write is sufficient and avoids + * the LOCK XADD that pg_atomic_fetch_add_u64 would emit -- a wasted + * cache-coherence trip on an unshared cache line at this rate (one per + * wait event). Cross-backend readers use pg_atomic_read_u64, which + * compiles to a plain MOV on x86 and tolerates concurrent writes here + * (their actual safety against the records[] window is the per-record + * seqlock below). Same idiom as injection_point.c's per-entry generation + * counter (single writer + multiple lock-free readers). + */ + pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos); + pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1); + rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask]; + seq = (uint32) (pos * 2 + 1); + + rec->seq = seq; + pg_write_barrier(); /* release: payload stores must not rise above + * seq=odd */ + + INSTR_TIME_SET_CURRENT(now); + rec->record_type = record_type; + rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now); + rec->data.query.query_id = query_id; + rec->data.query.pad2 = 0; + + pg_write_barrier(); /* release: payload stores must land before + * seq=even */ + rec->seq = seq + 1; +} + +void +wait_event_trace_query_start(int64 query_id) +{ + wait_event_trace_write_marker(TRACE_QUERY_START, query_id); +} + +void +wait_event_trace_query_end(int64 query_id) +{ + wait_event_trace_write_marker(TRACE_QUERY_END, query_id); +} + +void +wait_event_trace_exec_start(int64 query_id) +{ + wait_event_trace_write_marker(TRACE_EXEC_START, query_id); +} + +void +wait_event_trace_exec_end(int64 query_id) +{ + wait_event_trace_write_marker(TRACE_EXEC_END, query_id); +} + +/* + * Report the shared memory space needed for trace ring buffer control. + * Only a small control struct is in fixed shmem; the actual ring buffers + * are allocated lazily via DSA. At ~24 bytes/slot, the slot array adds + * ~26 KB at a default MaxBackends, negligible compared to the ring + * memory itself. + */ +static Size +WaitEventTraceControlShmemSize(void) +{ + return add_size(offsetof(WaitEventTraceControl, trace_slots), + mul_size(NUM_WAIT_EVENT_TIMING_SLOTS, + sizeof(WaitEventTraceSlot))); +} + +static void +WaitEventTraceControlShmemRequest(void *arg) +{ + ShmemRequestStruct(.name = "WaitEventTraceControl", + .size = WaitEventTraceControlShmemSize(), + .ptr = (void **) &WaitEventTraceCtl); +} + +/* + * Initialize shared memory for trace ring buffer control. + */ +static void +WaitEventTraceControlShmemInit(void *arg) +{ + int i; + + WaitEventTraceCtl->trace_dsa_handle = DSA_HANDLE_INVALID; + LWLockInitialize(&WaitEventTraceCtl->lock, + LWTRANCHE_WAIT_EVENT_TRACE_DSA); + for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++) + { + WaitEventTraceSlot *s = &WaitEventTraceCtl->trace_slots[i]; + + pg_atomic_init_u64(&s->generation, 0); + pg_atomic_init_u32(&s->state, WAIT_EVENT_TRACE_SLOT_FREE); + s->pad = 0; + s->ring_ptr = InvalidDsaPointer; } +} - if (my_wait_event_timing != NULL) +const ShmemCallbacks WaitEventTraceControlShmemCallbacks = { + .request_fn = WaitEventTraceControlShmemRequest, + .init_fn = WaitEventTraceControlShmemInit, +}; + +/* + * Ensure the shared DSA for trace ring buffers exists and is attached. + * Creates it on first call (any backend), attaches on subsequent calls. + * Must be called from a backend context (not postmaster). + */ +static void +wait_event_trace_ensure_dsa(void) +{ + MemoryContext oldcontext; + + if (trace_dsa != NULL) + return; + + oldcontext = MemoryContextSwitchTo(TopMemoryContext); + + LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE); + + if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID) { - INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start); - my_wait_event_timing->current_event = wait_event_info; + trace_dsa = dsa_create(LWTRANCHE_WAIT_EVENT_TRACE_DSA); + dsa_pin(trace_dsa); + dsa_pin_mapping(trace_dsa); + WaitEventTraceCtl->trace_dsa_handle = dsa_get_handle(trace_dsa); + } + else + { + trace_dsa = dsa_attach(WaitEventTraceCtl->trace_dsa_handle); + dsa_pin_mapping(trace_dsa); } + + LWLockRelease(&WaitEventTraceCtl->lock); + + MemoryContextSwitchTo(oldcontext); } /* - * Out-of-line body for pgstat_report_wait_end()'s timing path. Computes - * the wait duration and accumulates per-event statistics. + * Transition our trace ring slot to ORPHANED on backend exit. * - * capture_level is the value of wait_event_capture observed at the inline - * gate; in this commit only STATS exists, so it is not branched on, but it - * is threaded through to keep the gate ABI stable for the trace level - * added later in the series. + * Registered as a before_shmem_exit callback. Runs BEFORE + * dsm_backend_shutdown() detaches the DSA. + * + * Crucially, we do NOT free the ring here. The ring stays allocated in + * DSA so that cross-backend consumers -- the in-tree + * pg_get_wait_event_trace SRF and any extension following the + * snapshot pattern documented on WaitEventTraceControl -- can read + * the dying backend's final waits. The original "free at exit" + * design lost data the instant a worker terminated, which was + * particularly bad for parallel workers exiting in milliseconds at + * end-of-parallel-query. See the lifecycle comment on + * WaitEventTraceControl for the full design + * rationale and the bounded-memory cost we accept in exchange. + * + * The ORPHANED slot is reclaimed in one of two ways: + * (a) a new backend at this procNumber calls + * wait_event_trace_clear_orphan_at_init() at backend init, or + * (b) the DBA calls pg_stat_clear_orphaned_wait_event_rings(). + * + * State transition order matters: bump generation BEFORE storing the + * new state, so cross-backend readers that snapshot + * (generation_before, state, ring_ptr, generation_after) under the + * lock see a consistent (state, ring_ptr) pair iff generation didn't + * change. We hold the lock for the whole transition, but readers do + * not have to (they just take it briefly to snapshot the ring + * contents); the generation check is what makes the unlocked-read + * path safe. */ -void -pgstat_report_wait_end_timing(int capture_level) +static void +wait_event_trace_before_shmem_exit(int code, Datum arg) { - uint32 event; - uint32 cur_reset_gen; + int procNumber = DatumGetInt32(arg); + WaitEventTraceSlot *slot; - (void) capture_level; + if (WaitEventTraceCtl == NULL) + return; - if (wait_event_timing_writes_disabled) + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) return; - if (my_wait_event_timing == NULL) + slot = &WaitEventTraceCtl->trace_slots[procNumber]; + + /* + * If this backend never ended up with an OWNED slot (e.g. capture was off + * the whole session, or the trace was released back to FREE via + * assign_wait_event_capture going trace -> off), there is nothing to + * transition. Read state without the lock first as a fast-path check; + * the authoritative re-check happens under the lock below. + */ + if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_OWNED) { - pgstat_wait_event_timing_lazy_attach(); - if (my_wait_event_timing == NULL) - return; + wait_event_trace_writes_disabled = true; + my_wait_event_trace = NULL; + return; } - event = my_wait_event_timing->current_event; + /* + * Disable trace-ring writes on this backend before we touch the lock. + * Writes after this point would race with the OWNED -> ORPHANED state + * publish below: a wait event whose end-timing path runs after the state + * has been published as ORPHANED would write into a ring that the patch + * contract declares read-only post-mortem. Cross-backend readers + * snapshot ORPHANED rings without expecting concurrent writes from the + * dying owner. See wait_event_trace_writes_disabled for the full UAF / + * contract-violation analysis. + * + * The flag stays true for the remainder of this backend's life (we are in + * proc_exit; there is no subsequent capture re-enable to handle), so we + * do not reset it. + */ + wait_event_trace_writes_disabled = true; + + LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE); /* - * Service a pending cross-backend reset request. A single relaxed atomic - * load; when the shared generation has advanced past the value we last - * acted on, clear our own counters on behalf of the requester and record - * the reset. wait_start is left untouched so the in-flight measurement - * still lands (in the freshly-zeroed counters), and current_event is - * zeroed so external readers do not see stale state. + * Drop the local pointer inside the lock-held region as a second line of + * defense; the writes-disabled flag above is the primary gate. */ - cur_reset_gen = pg_atomic_read_u32(&my_wait_event_timing->reset_generation); - if (cur_reset_gen != my_last_reset_generation) + my_wait_event_trace = NULL; + + if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_OWNED && + DsaPointerIsValid(slot->ring_ptr)) { - memset(my_wait_event_timing->events, 0, - sizeof(my_wait_event_timing->events)); - lwlock_timing_hash_clear(my_wait_event_timing); - my_wait_event_timing->reset_count++; - my_wait_event_timing->lwlock_overflow_count = 0; - my_wait_event_timing->flat_overflow_count = 0; - my_wait_event_timing->current_event = 0; - my_last_reset_generation = cur_reset_gen; + /* + * Bump generation first so any reader that snapped the old generation + * will detect the change on its post-read recheck and discard its + * read. Then publish the ORPHANED state. Keep ring_ptr valid -- the + * data is what we want to preserve. + */ + pg_atomic_fetch_add_u64(&slot->generation, 1); + pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_ORPHANED); } - if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start)) - { - instr_time now; - int64 duration_ns; - int idx; - WaitEventTimingEntry *entry = NULL; - bool warn_lwlock_overflow = false; - bool warn_flat_overflow = false; + LWLockRelease(&WaitEventTraceCtl->lock); +} - INSTR_TIME_SET_CURRENT(now); - duration_ns = INSTR_TIME_GET_NANOSEC(now) - - INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start); +/* + * Allocate (or re-acquire) a trace ring buffer for this backend via DSA. + * Called when wait_event_capture is set to 'trace'. + * + * Slot state at entry will be one of: + * + * FREE fresh slot (or one cleared on this backend's init by + * wait_event_trace_clear_orphan_at_init): allocate a new + * ring, transition slot to OWNED, bump generation. + * + * OWNED we already attached earlier in this same backend's life + * (e.g. user toggled capture trace->stats->trace; the + * stats step calls wait_event_trace_release_slot which + * transitions back to FREE, but our cached + * my_wait_event_trace was cleared on the way down -- so + * seeing OWNED here at attach time means a different + * backend somehow ended up with this procNumber, which + * cannot happen because procNumber is per-backend and a + * single backend can only run one attach at a time. We + * still tolerate this state defensively by re-mapping the + * existing ring rather than leaking a second allocation. + * + * ORPHANED can never be observed here: a new backend's + * pgstat_set_wait_event_timing_storage() called + * wait_event_trace_clear_orphan_at_init() before any + * wait-event capture path can run, so any prior orphan has + * already been demoted to FREE. Treated as a safety check + * (Assert in debug builds). + */ +void +wait_event_trace_attach(int procNumber) +{ + /* + * Re-entrancy guard. dsa_create / dsa_allocate_extended below can emit + * wait events internally; those reach the lazy-attach hot path which + * calls back into this function while we still hold + * WaitEventTraceCtl->lock or are mid-allocation. See the + * function-local-static-bool pattern explainer on + * wait_event_timing_attach_array. + */ + static bool in_attach = false; + static bool shmem_exit_registered = false; + WaitEventTraceSlot *slot; + dsa_pointer p; + WaitEventTraceState *ts; + uint32 state_now; - if (duration_ns < 0) - duration_ns = 0; + if (in_attach) + return; - idx = wait_event_timing_index(event); + if (WaitEventTraceCtl == NULL) + return; - /* - * Single-writer hot path: each slot has exactly one writer (the - * owning backend), and the SRF reader is lock-free, so no locking is - * needed here. Overflow WARNINGs are deferred to after the - * bookkeeping so ereport() cannot recurse through a wait event while - * counters are in an intermediate state. - */ + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) + return; + + /* + * Skip the attach if we are inside a critical section. Below this point + * we call dsa_create / dsa_attach / dsa_allocate_extended, all of which + * can allocate memory via MemoryContextAlloc and Assert-fail on + * "CritSectionCount == 0 || allowInCritSection". The very-first wait + * event after wait_event_capture = trace can land inside a critical + * section (e.g. a parallel worker scanning a heap page hits + * BufferSetHintBits16 -> XLogSaveBufferForHint -> XLogInsert -> + * LWLockAcquire, with the XLogInsert critical section open). + * + * Skipping here silently drops the in-flight wait event (it is not + * traced) but keeps the backend alive. The next wait event outside any + * critical section will hit this function again and attach successfully. + * See the matching guard in pgstat_wait_event_timing_lazy_attach. + */ + if (CritSectionCount > 0) + return; + + /* + * Skip the attach if MyProc is already on an LWLock wait queue. We are + * called from the wait-event hot path which fires AFTER LWLockQueueSelf + * has set MyProc->lwWaiting; a nested LWLockAcquire on our internal lock + * (via wait_event_trace_ensure_dsa) would PANIC at lwlock.c:1029. See the + * matching guard in pgstat_wait_event_timing_lazy_ attach for the full + * rationale. + */ + if (MyProc != NULL && MyProc->lwWaiting != LW_WS_NOT_WAITING) + return; + + slot = &WaitEventTraceCtl->trace_slots[procNumber]; + + in_attach = true; + PG_TRY(); + { + state_now = pg_atomic_read_u32(&slot->state); + + /* + * ORPHANED is normally impossible at attach time -- + * pgstat_set_wait_event_timing_storage() at backend init calls + * wait_event_trace_clear_orphan_at_init() which demotes any inherited + * orphan to FREE. But there is one case where this backend can + * legitimately observe its own slot in the ORPHANED state: after we + * have already run wait_event_trace_before_shmem_exit() + * (transitioning the slot to ORPHANED on exit), a later + * before_shmem_exit callback (e.g. pgstat_io_flush_cb during + * proc_exit shutdown) can contend on an LWLock that emits a wait + * event, which calls pgstat_report_wait_end_timing() -> + * wait_event_trace_attach() after my_wait_event_trace has been + * cleared. We must not re-attach in that case: we are dying, the + * ring is now post-mortem data for cross-backend readers, and the + * writer invariant must hold. Skip the trace for any wait events + * emitted after our own exit transition. + */ + if (state_now == WAIT_EVENT_TRACE_SLOT_ORPHANED) + { + /* PG_FINALLY below clears in_attach. */ + } + else if (state_now == WAIT_EVENT_TRACE_SLOT_OWNED && + DsaPointerIsValid(slot->ring_ptr)) + { + /* Already have a ring buffer; re-map to it. */ + wait_event_trace_ensure_dsa(); + my_wait_event_trace = dsa_get_address(trace_dsa, slot->ring_ptr); + my_trace_proc_number = procNumber; + } + else + { + Size alloc_size; + + wait_event_trace_ensure_dsa(); + + /* + * Cache the cluster-wide ring size on first allocation in this + * backend. wait_event_trace_ring_size is PGC_POSTMASTER, so by + * the time any backend reaches here, its boot value has been + * committed by the GUC framework. All rings in the postmaster + * run share the same dimensions. + */ + if (WaitEventTraceRingSize == 0) + WaitEventTraceRingSize = + (uint32) wait_event_trace_ring_size * 1024U / + (uint32) sizeof(WaitEventTraceRecord); + + alloc_size = offsetof(WaitEventTraceState, records) + + (Size) WaitEventTraceRingSize * sizeof(WaitEventTraceRecord); + + p = dsa_allocate_extended(trace_dsa, alloc_size, DSA_ALLOC_ZERO); + ts = dsa_get_address(trace_dsa, p); + pg_atomic_init_u64(&ts->write_pos, 0); + ts->ring_mask = WaitEventTraceRingSize - 1; + + LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE); + + /* + * Publish ring_ptr BEFORE transitioning state to OWNED. + * Cross-backend readers that observe state==OWNED outside the + * lock then see a valid ring_ptr. Bump generation last so any + * reader that snapped the prior generation will detect the + * change. + */ + slot->ring_ptr = p; + pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_OWNED); + pg_atomic_fetch_add_u64(&slot->generation, 1); + LWLockRelease(&WaitEventTraceCtl->lock); + + my_wait_event_trace = ts; + my_trace_proc_number = procNumber; + + /* + * Register cleanup to run BEFORE dsm_backend_shutdown() detaches + * the DSA. The before_shmem_exit callbacks run in LIFO order + * before DSM detach, so the ORPHANED transition (which does not + * actually free the ring) is safe at that point. + * + * Guarded by shmem_exit_registered because under the + * release-on-disable policy (see wait_event_trace_release_slot + * and assign_wait_event_capture) the allocate branch can run + * multiple times per backend lifetime -- once per off/stats -> + * trace re-enable cycle. The cleanup itself is idempotent (it + * short-circuits when state is not OWNED), so it is safe to + * invoke after a release-then-reattach cycle, but we still avoid + * growing the before_shmem_exit list. + */ + if (!shmem_exit_registered) + { + before_shmem_exit(wait_event_trace_before_shmem_exit, + Int32GetDatum(procNumber)); + shmem_exit_registered = true; + } + } + } + PG_FINALLY(); + { + in_attach = false; + } + PG_END_TRY(); +} + +/* + * Free trace ring buffer for this backend on exit. + */ +static void +wait_event_trace_detach(int procNumber) +{ + /* + * Only clear local pointers here. The actual DSA free happens in + * wait_event_trace_before_shmem_exit(), which runs before + * dsm_backend_shutdown() detaches the DSA segments. + */ + my_wait_event_trace = NULL; + my_trace_proc_number = -1; +} + +/* + * Release this backend's trace ring buffer back to DSA immediately. + * + * Called from assign_wait_event_capture when the user steps down from + * TRACE to STATS or OFF. Without this, a ~4 MB ring allocated by a + * brief investigation would remain pinned for the rest of the session's + * lifetime, which can leak gigabytes across large connection pools. + * + * Important contrast with wait_event_trace_before_shmem_exit: backend + * exit transitions the slot to ORPHANED (preserving data for + * cross-backend readers); release_slot fully frees and returns to FREE + * because the operator has explicitly disabled trace -- they have + * affirmatively decided not to keep the data, so we honour that and + * reclaim the memory immediately. Subsequent re-enable allocates a + * fresh ring via wait_event_trace_attach's allocate branch. + * + * The operation is LWLock-safe and does not raise -- dsa_free is pure + * bookkeeping on the DSA freelist, no allocation and no ereport paths. + * Safe to call from a GUC assign hook. + * + * If pg_get_backend_wait_event_trace is currently iterating our own ring + * (wait_event_trace_srf_in_progress), we must NOT free the chunk out + * from under it: that would be a use-after-free on the records[] the SRF + * is still reading. Set wait_event_trace_release_pending instead and + * return; the SRF's PG_FINALLY block will perform the deferred free + * after iteration completes. In practice this branch is unreachable in + * current PG (assign hooks fire only at command boundaries and the SRF + * is a single command), but it makes the invariant explicit and the + * future-proofing free. + */ +static void +wait_event_trace_release_slot(int procNumber) +{ + /* + * Re-entrancy guard. dsa_free takes a DSA-internal LWLock which can in + * principle emit a wait event; if a nested assign hook re-enters we must + * not recurse. See the function-local-static-bool pattern explainer on + * wait_event_timing_attach_array. + */ + static bool in_release = false; + WaitEventTraceSlot *slot; + + if (in_release) + return; + + if (WaitEventTraceCtl == NULL || trace_dsa == NULL) + return; + + /* + * Same-backend SRF is iterating our own ring. Defer the free until the + * SRF's PG_FINALLY runs. + */ + if (wait_event_trace_srf_in_progress) + { + wait_event_trace_release_pending = true; + return; + } + + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) + return; + + slot = &WaitEventTraceCtl->trace_slots[procNumber]; + + in_release = true; + + /* + * Disable trace-ring writes on this backend before we touch the lock or + * call dsa_free. An internal LWLock inside dsa_free can dispatch a wait + * event whose end-timing path would otherwise see capture_level == TRACE + * (the GUC assign hook is in flight; the variable has not been committed + * by the framework yet) and write into the very chunk we are returning to + * the DSA freelist. See the comment on wait_event_trace_writes_disabled + * for the full UAF analysis. + */ + wait_event_trace_writes_disabled = true; + + PG_TRY(); + { + LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE); + + /* + * Drop the local pointer BEFORE the dsa_free as a second line of + * defense (the writes-disabled flag above is the primary gate). Any + * wait event whose hot path slips past the gate check via a compiler + * or memory-ordering surprise would at least see my_wait_event_trace + * == NULL and skip the write. + */ + my_wait_event_trace = NULL; + + if (DsaPointerIsValid(slot->ring_ptr)) + { + /* + * Bump generation first to invalidate any concurrent + * cross-backend snapshot, then free, then publish the FREE state + * with a NULL ring_ptr. Order matters for unlocked readers that + * have already passed the state check. + */ + pg_atomic_fetch_add_u64(&slot->generation, 1); + dsa_free(trace_dsa, slot->ring_ptr); + slot->ring_ptr = InvalidDsaPointer; + pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE); + } + LWLockRelease(&WaitEventTraceCtl->lock); + } + PG_FINALLY(); + { + wait_event_trace_writes_disabled = false; + in_release = false; + } + PG_END_TRY(); +} + +/* + * Clear an orphaned trace ring at backend init time. + * + * Called from pgstat_set_wait_event_timing_storage() once the new + * backend has its procNumber. If the slot we're inheriting was left + * ORPHANED by a previous backend (because we deliberately do not free + * trace rings on backend exit -- see the lifecycle discussion on + * WaitEventTraceControl), free the ring now so the new backend starts + * with a clean FREE slot. Subsequent wait_event_trace_attach() calls + * (when this backend itself enables trace) will then take the + * allocate branch. + * + * No-op when the slot is already FREE or OWNED: FREE means there's + * nothing to clear; OWNED is impossible at backend init (only a + * not-yet-exited backend can leave a slot OWNED, and procNumbers are + * assigned exclusively). We assert OWNED is not observed in debug + * builds and conservatively skip the free in production. + * + * Robustness: this runs during InitProcess() (before the backend can + * accept any work), and the work it performs -- dsa_attach() and + * dsa_free() -- can raise ERROR on rare runtime failures (corrupted + * DSA segment headers, descriptor exhaustion, mmap ENOMEM, etc.). + * An uncaught ERROR here would propagate out of InitProcess() and + * abort backend startup entirely, even for sessions that never + * intended to use wait_event_capture. To prevent the trace + * feature's housekeeping from gating connection establishment, the + * body is wrapped in PG_TRY()/PG_CATCH(): any error from dsa_attach + * or dsa_free is captured, downgraded to a WARNING with a hint + * pointing at the admin sweep function, and execution continues. + * The orphan stays in place; it can be reclaimed by the next + * backend that inherits the same procNumber (if the underlying + * problem was transient), by pg_stat_clear_orphaned_wait_event_rings(), + * or at next cluster restart. + */ +static void +wait_event_trace_clear_orphan_at_init(int procNumber) +{ + WaitEventTraceSlot *slot; + uint32 state_now; + MemoryContext caller_cxt; + + if (WaitEventTraceCtl == NULL) + return; + + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) + return; + + slot = &WaitEventTraceCtl->trace_slots[procNumber]; + + state_now = pg_atomic_read_u32(&slot->state); + if (state_now != WAIT_EVENT_TRACE_SLOT_ORPHANED) + { + Assert(state_now != WAIT_EVENT_TRACE_SLOT_OWNED); + return; + } + + /* + * Save CurrentMemoryContext so the PG_CATCH path can copy the error data + * into a context that survives FlushErrorState(). FlushErrorState() calls + * MemoryContextReset(ErrorContext), so CopyErrorData() must run in a + * different context or the returned ErrorData becomes a dangling pointer. + */ + caller_cxt = CurrentMemoryContext; + + PG_TRY(); + { + /* + * The trace DSA is shared across the cluster. We must attach to it + * before calling dsa_free (which needs the dsa_area pointer). The + * DSA was created by some earlier backend that wrote a trace record + * (otherwise the slot couldn't have ended up ORPHANED), so the handle + * in WaitEventTraceCtl is valid; ensure_dsa() will attach. Both + * ensure_dsa() and dsa_free() can raise ERROR; the PG_CATCH below + * downgrades any such error to a WARNING so backend startup is not + * blocked. + */ + wait_event_trace_ensure_dsa(); + + LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE); + if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED && + DsaPointerIsValid(slot->ring_ptr)) + { + pg_atomic_fetch_add_u64(&slot->generation, 1); + dsa_free(trace_dsa, slot->ring_ptr); + slot->ring_ptr = InvalidDsaPointer; + pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE); + } + LWLockRelease(&WaitEventTraceCtl->lock); + } + PG_CATCH(); + { + ErrorData *edata; + + /* + * Release any LWLocks we (or anything we called) might still hold. + * Two paths can leave WaitEventTraceCtl->lock held when control + * reaches here: + * + * 1. The outer LWLockAcquire above succeeded and dsa_free raised + * before we reached LWLockRelease. 2. wait_event_trace_ensure_dsa() + * raised inside its own LWLockAcquire/dsa_attach/LWLockRelease + * region. + * + * We are running during InitProcess(), BEFORE any transaction or + * PostgresMain sigsetjmp has been set up, so PG's standard + * "AbortTransaction -> LWLockReleaseAll" cleanup does NOT fire on the + * longjmp into PG_CATCH. Without an explicit release here the lock + * would stay held for the lifetime of this backend, blocking every + * future LW_EXCLUSIVE acquirer (the orphan-clear sweep, release_slot, + * before_shmem_exit transitions, and subsequent backends' + * clear_orphan_at_init). That would be strictly worse than the + * original failure-startup behavior this commit set out to fix. + * + * LWLockReleaseAll() is the idiomatic catch-path lock cleanup used by + * the standalone aux-process error handlers (walwriter.c, + * checkpointer.c, pgarch.c). It is safe to call broadly here because + * pgstat_set_wait_ event_timing_storage runs at a fixed point in + * InitProcess where the caller frame holds no other LWLocks across + * our return: the earlier InitProcess steps that touch LWLocks + * (ProcArrayAdd, etc.) release them before returning, and the + * subsequent steps that acquire LWLocks have not yet run. + */ + LWLockReleaseAll(); + + /* + * Switch BACK to the caller's context before CopyErrorData so that + * edata is allocated in a context that survives FlushErrorState(). + * FlushErrorState() calls MemoryContextReset(ErrorContext); + * allocating edata in ErrorContext (the default at PG_CATCH entry on + * the error path) would make it a dangling pointer the moment we + * flush. See the matching pattern in spi.c PG_CATCH branches. + */ + MemoryContextSwitchTo(caller_cxt); + edata = CopyErrorData(); + FlushErrorState(); + + ereport(WARNING, + (errcode(edata->sqlerrcode), + errmsg("could not clear orphaned wait-event trace ring " + "at backend init: %s", edata->message), + errdetail("Backend startup proceeds with the orphan " + "still allocated for procnumber %d.", + procNumber), + errhint("Run pg_stat_clear_orphaned_wait_event_rings() " + "to release the orphan when the underlying " + "condition is resolved."))); + + FreeErrorData(edata); + } + PG_END_TRY(); +} + +/* + * Out-of-line body for pgstat_report_wait_start()'s timing path. Records + * the start timestamp and the event being waited on. Reached only when + * wait_event_capture != OFF. + */ +void +pgstat_report_wait_start_timing(uint32 wait_event_info) +{ + /* + * Stay out of the timing path once proc_exit has begun tearing down DSA + * mappings (see the before_shmem_exit callback). + */ + if (wait_event_timing_writes_disabled) + return; + + if (my_wait_event_timing == NULL) + { + pgstat_wait_event_timing_lazy_attach(); + + /* + * lazy_attach can dispatch nested wait events while it sets up DSA + * (dsa_attach takes an internal LWLock); those nested wait_end calls + * clear my_wait_event_info to 0. Re-publish so the outer wait stays + * visible in pg_stat_activity. Only needed on the first-attach path. + */ + *(volatile uint32 *) my_wait_event_info = wait_event_info; + } + + if (my_wait_event_timing != NULL) + { + INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start); + my_wait_event_timing->current_event = wait_event_info; + } +} + +/* + * Out-of-line body for pgstat_report_wait_end()'s timing path. Computes + * the wait duration, accumulates per-event statistics, and at the trace + * level pushes the completed wait into the per-session ring. + * + * capture_level is the value of wait_event_capture observed at the inline + * gate. Passing it through (rather than re-loading the global here) avoids + * a redundant load on the trace branch below -- the function-call boundary + * defeats CSE -- and means a concurrent GUC change cannot half-apply to + * this call: we run entirely at the gate's view of the level. + */ +void +pgstat_report_wait_end_timing(int capture_level) +{ + uint32 event; + uint32 cur_reset_gen; + + if (wait_event_timing_writes_disabled) + return; + + if (my_wait_event_timing == NULL) + { + pgstat_wait_event_timing_lazy_attach(); + if (my_wait_event_timing == NULL) + return; + } + + event = my_wait_event_timing->current_event; + + /* + * Service a pending cross-backend reset request. A single relaxed atomic + * load; when the shared generation has advanced past the value we last + * acted on, clear our own counters on behalf of the requester and record + * the reset. wait_start is left untouched so the in-flight measurement + * still lands (in the freshly-zeroed counters), and current_event is + * zeroed so external readers do not see stale state. + */ + cur_reset_gen = pg_atomic_read_u32(&my_wait_event_timing->reset_generation); + if (cur_reset_gen != my_last_reset_generation) + { + memset(my_wait_event_timing->events, 0, + sizeof(my_wait_event_timing->events)); + lwlock_timing_hash_clear(my_wait_event_timing); + my_wait_event_timing->reset_count++; + my_wait_event_timing->lwlock_overflow_count = 0; + my_wait_event_timing->flat_overflow_count = 0; + my_wait_event_timing->current_event = 0; + my_last_reset_generation = cur_reset_gen; + } + + if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start)) + { + instr_time now; + int64 duration_ns; + int idx; + WaitEventTimingEntry *entry = NULL; + bool warn_lwlock_overflow = false; + bool warn_flat_overflow = false; + + INSTR_TIME_SET_CURRENT(now); + duration_ns = INSTR_TIME_GET_NANOSEC(now) - + INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start); + + if (duration_ns < 0) + duration_ns = 0; + + idx = wait_event_timing_index(event); + + /* + * Single-writer hot path: each slot has exactly one writer (the + * owning backend), and the SRF reader is lock-free, so no locking is + * needed here. Overflow WARNINGs are deferred to after the + * bookkeeping so ereport() cannot recurse through a wait event while + * counters are in an intermediate state. + */ if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK) entry = lwlock_timing_lookup(my_wait_event_timing, event & 0xFFFF); else if (idx >= 0) @@ -934,6 +1927,81 @@ pgstat_report_wait_end_timing(int capture_level) (errmsg("some wait events will not be timed"), errdetail("A wait event was reported with a class unknown to the timing infrastructure."))); + /* Trace level: push the completed wait into the per-session ring. */ + if (capture_level == WAIT_EVENT_CAPTURE_TRACE && + !wait_event_trace_writes_disabled && + !wait_event_trace_in_write) + { + /* + * Lazy-attach the ring on first use here (not in the assign hook, + * which must not ereport on OOM). The writes-disabled gate also + * blocks this re-attach during slot-state transitions + * (release_slot / before_shmem_exit): without it a nested wait + * event mid-transition could recurse into a fresh attach that + * deadlocks on the lock the transition already holds. + */ + if (my_wait_event_trace == NULL && my_trace_proc_number >= 0) + wait_event_trace_attach(my_trace_proc_number); + + if (my_wait_event_trace != NULL) + { + /* + * Single-writer claim: a plain read+write avoids the LOCK + * XADD that pg_atomic_fetch_add_u64 would emit on every wait + * event. Cross-backend readers use pg_atomic_read_u64 and + * rely on the per-record seqlock below for safety. + */ + uint64 pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos); + WaitEventTraceRecord *rec; + uint32 seq; + + /* + * Wait events emitted while a trace record is being written + * must not themselves be ring-recorded (the gate above): the + * record write is plain stores and cannot wait, but the + * injection point below can, and the wait machinery it runs + * (DSM registry lookup, condition-variable sleep) emits wait + * events whose wait_end would recurse into this block -- + * re-running the injection point and self-deadlocking on + * locks the outer invocation still holds. Costs one + * process-local store each way; nested waits still + * accumulate in the stats above. + */ + wait_event_trace_in_write = true; + + pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1); + + /* + * Injection point for the regression test of the + * position-encoded identity seqlock: stalling here widens the + * window between the write_pos store and the rec->seq store, + * simulating weak-memory visibility that would otherwise be + * unreachable on x86. Compiled out unless + * --enable-injection-points. + */ + INJECTION_POINT("wait-event-trace-after-write-pos", NULL); + + rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask]; + seq = (uint32) (pos * 2 + 1); + + rec->seq = seq; + pg_write_barrier(); /* payload stores must not rise above + * seq=odd */ + + rec->record_type = TRACE_WAIT_EVENT; + rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now); + rec->data.wait.event = event; + rec->data.wait.pad2 = 0; + rec->data.wait.duration_ns = duration_ns; + + pg_write_barrier(); /* payload stores must land before + * seq=even */ + rec->seq = seq + 1; + + wait_event_trace_in_write = false; + } + } + INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start); } } @@ -1288,4 +2356,738 @@ pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +/* ================= Trace-level readers ================= */ + +/* + * SQL function: pg_get_backend_wait_event_trace() + * + * Returns trace records from the current backend's own ring buffer. This + * function is deliberately session-local; cross-backend reading goes + * through pg_get_wait_event_trace(procnumber) below, which implements the + * snapshot-under-lock protocol documented on WaitEventTraceControl in + * wait_event_timing.h. The name mirrors pg_get_backend_memory_contexts() + * to make the session-local scope explicit at the API level. + * + * Same-backend coordination with wait_event_trace_release_slot uses the + * wait_event_trace_srf_in_progress / _release_pending flags rather than + * an LWLock: same-backend serialization is implicit, so a per-backend + * bool plus a deferred-free path is sufficient and avoids any of the + * cross-backend lock-hold latency that the cross-backend reader pattern + * has to manage. PG_TRY/PG_FINALLY guarantees the flag is cleared and + * any deferred dsa_free is performed even on ereport(ERROR). + * + * Uses InitMaterializedSRF (materialize-all). The ring holds up to + * WaitEventTraceRingSize records (set at server start from the + * wait_event_trace_ring_size GUC; default 4 MB = 131072 records); + * full materialization caps the per-call cost at the ring size of + * tuplestore memory, which is acceptable for the use case this SRF + * is designed for: interactive own-session diagnostics from psql. + * + * This SRF is NOT the path for cross-backend monitoring tools: it is + * hard-coded to the calling backend's own ring, so a bgworker selecting + * from pg_backend_wait_event_trace via SPI would see only its own + * (typically empty) ring. Cross-backend consumers use + * pg_get_wait_event_trace(procnumber) for SQL access, or implement the + * snapshot-under-lock protocol on WaitEventTraceControl directly. + * + * value-per-call (deferred) SRF mode would let an interactive + * "SELECT ... FROM pg_backend_wait_event_trace LIMIT N" short-circuit + * the materialisation, but converting this function would require + * spanning the wait_event_trace_srf_in_progress flag (and its + * deferred-free coordination with assign_wait_event_capture) across + * multiple SRF callbacks plus a transaction-cleanup registration to + * handle LIMIT abandonment. The complexity is not + * justified for the diagnostic use case, especially since cross- + * backend monitoring (the consumer that would actually benefit from + * streaming) goes through the snapshot pattern above instead. + * Interactive callers who want only recent records should use + * "ORDER BY seq DESC LIMIT N" -- the LIMIT is applied after + * materialisation but the cost stays bounded by the ring size. + */ +Datum +pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + WaitEventTraceState *ts; + uint64 write_pos; + uint64 read_start; + uint64 i; + + InitMaterializedSRF(fcinfo, 0); + + if (my_wait_event_trace == NULL) + PG_RETURN_VOID(); + + ts = my_wait_event_trace; + + write_pos = pg_atomic_read_u64(&ts->write_pos); + + if (write_pos == 0) + PG_RETURN_VOID(); + + /* Read from oldest available to newest */ + { + uint64 ring_size = (uint64) ts->ring_mask + 1; + + read_start = (write_pos > ring_size) + ? write_pos - ring_size : 0; + } + + /* + * Mark the iteration in progress so wait_event_trace_release_slot defers + * any concurrent dsa_free of our own ring (see the comment on that + * function for the deferral protocol). PG_FINALLY clears the flag and + * performs any deferred free, even on ereport(ERROR). + */ + wait_event_trace_srf_in_progress = true; + PG_TRY(); + { + for (i = read_start; i < write_pos; i++) + { + WaitEventTraceRecord *rec = + &ts->records[i & ts->ring_mask]; + Datum values[6]; + bool nulls[6]; + const char *event_type; + const char *event_name; + uint32 seq_before; + uint32 seq_after; + uint8 rtype; + int64 timestamp_ns; + uint32 event_info; + int64 duration_ns; + int64 query_id; + + /* Seqlock read */ + seq_before = rec->seq; + pg_read_barrier(); /* acquire: payload loads below must not rise + * above this */ + + if (seq_before & 1) + continue; + + rtype = rec->record_type; + timestamp_ns = rec->timestamp_ns; + + if (rtype == TRACE_WAIT_EVENT) + { + event_info = rec->data.wait.event; + duration_ns = rec->data.wait.duration_ns; + query_id = 0; + } + else if (rtype == TRACE_QUERY_START || rtype == TRACE_QUERY_END || + rtype == TRACE_EXEC_START || rtype == TRACE_EXEC_END) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + } + else + { + pg_read_barrier(); /* acquire: pair with seq_before read + * above before skipping */ + continue; + } + + pg_read_barrier(); /* acquire: payload loads must have landed + * before seq_after */ + seq_after = rec->seq; + + if (seq_before != seq_after) + continue; + + /* Skip empty wait events */ + if (rtype == TRACE_WAIT_EVENT && event_info == 0) + continue; + + if (rtype == TRACE_WAIT_EVENT) + { + event_type = pgstat_get_wait_event_type(event_info); + event_name = pgstat_get_wait_event(event_info); + } + else if (rtype == TRACE_QUERY_START) + { + event_type = "Query"; + event_name = "QueryStart"; + } + else if (rtype == TRACE_EXEC_START) + { + event_type = "Query"; + event_name = "ExecStart"; + } + else if (rtype == TRACE_EXEC_END) + { + event_type = "Query"; + event_name = "ExecEnd"; + } + else + { + event_type = "Query"; + event_name = "QueryEnd"; + } + + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int64GetDatum((int64) i); + values[1] = Int64GetDatum(timestamp_ns); + values[2] = CStringGetTextDatum(event_type); + values[3] = CStringGetTextDatum(event_name); + values[4] = Float8GetDatum((double) duration_ns / 1000.0); + values[5] = Int64GetDatum(query_id); + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + } + PG_FINALLY(); + { + wait_event_trace_srf_in_progress = false; + + /* + * If a GUC step-down fired during iteration, it deferred the + * dsa_free. Process it now that we're safely past the loop. Re-check + * release_pending under the same flag to handle the + * (impossible-today, possible-tomorrow) case of a nested SRF. + */ + if (wait_event_trace_release_pending) + { + wait_event_trace_release_pending = false; + if (my_trace_proc_number >= 0) + wait_event_trace_release_slot(my_trace_proc_number); + } + } + PG_END_TRY(); + + PG_RETURN_VOID(); +} + +/* + * One element of the local result buffer. Pairs a per-record copy + * with the original ring index (used as the seq output column). + */ +typedef struct WetValidRecord +{ + uint64 ring_index; /* original index in the writer's ring */ + WaitEventTraceRecord rec; +} WetValidRecord; + +/* + * Snapshot the trace ring for a given procNumber and emit records into + * the SRF's tuplestore. Returns silently for FREE slots, out-of-range + * procnumbers, slots whose ring was never allocated, and slots whose + * write_pos is zero. + * + * Cross-backend reader protocol implemented here: + * + * 1. Read slot->state without the lock as a cheap "worth visiting" + * check; FREE -> nothing to emit. + * 2. Allocate the worst-case result buffer BEFORE taking the lock, + * so the palloc -- which can bottom out in a glibc mmap syscall + * for the worst-case (full-ring) size -- runs without holding the + * WaitEventTraceCtl lock. + * 3. Acquire WaitEventTraceCtl->lock in LW_SHARED. All slot + * transitions take LW_EXCLUSIVE, so the slot's identity, state, + * and ring_ptr are stable for the duration of the iteration. + * 4. Re-check state under the lock and resolve ring_ptr via + * dsa_get_address. Read write_pos. + * 5. Iterate every live ring index [read_start, write_pos). For + * each record do the per-record POSITION-ENCODED IDENTITY + * seqlock check ON SHARED MEMORY (see the comment on the loop + * below). + * 6. Release the lock. + * 7. Walk the local result array and emit rows into the tuplestore. + * This is the expensive part (potential disk spill); doing it + * after release minimises lock-hold time. + * + * Why per-record seqlock against shared memory, not against a local + * memcpy of the full ring: the protocol requires the two seq reads + * to go to the SAME shared-memory location at DIFFERENT TIMES, with + * the payload read between them. A bulk memcpy then seqlock-on- + * local-copy reads the same frozen byte twice, the check degenerates + * to a no-op, and torn / stale-cycle reads slip through. + * + * Why position-encoded identity, not just parity: the writer encodes + * the ring position into the seq value (mid-write = pos*2+1, complete + * = pos*2+2). After RING_SIZE writes the slot wraps and is rewritten + * with a new numerically-distinct seq. A parity-only check accepts + * any stable even seq -- including the PREVIOUS cycle's seq if cross- + * process visibility puts the new write_pos ahead of the new seq + * update. See the loop body for the four failure modes the identity + * check rejects. + * + * Holding LW_SHARED throughout the iteration also makes the + * generation-counter retry unnecessary for this caller: slot + * transitions take LW_EXCLUSIVE and therefore cannot happen while we + * hold LW_SHARED. The generation counter is still part of the + * cross-backend reader contract on WaitEventTraceControl for external + * readers that follow a different lock-release pattern (e.g. an + * extension that wants to release the lock between batches of records + * and re-acquire), but this in-tree implementation does not release + * the lock mid-iteration. + * + * Both OWNED and ORPHANED slots are read uniformly. For OWNED the + * live owner is concurrently writing; the seqlock catches torn reads. + * For ORPHANED the records are immutable post-mortem so the check is + * essentially a pass-through (it still correctly skips at most one + * trailing odd-seq record if the owner died mid-write). + * + * Lock-hold is O(write_pos - read_start) shared-memory loads, at + * roughly the same wall-clock cost as a single 4 MB memcpy of the + * full ring (~1 ms on modern hardware), with no I/O and no syscalls. + */ +static void +emit_wait_event_trace_for_procnumber(int procNumber, ReturnSetInfo *rsinfo) +{ + WaitEventTraceSlot *slot; + WaitEventTraceState *ts; + WetValidRecord *valid_records = NULL; + uint64 valid_count = 0; + uint64 write_pos; + uint64 read_start; + uint64 i; + uint32 state_now; + + if (WaitEventTraceCtl == NULL) + return; + + /* + * Range check. Negative or out-of-range procnumbers return an empty + * result rather than ERRORing because the most natural use pattern for + * cross-backend readers is to iterate every possible slot index (a + * monitoring background worker doesn't know the exact + * NUM_WAIT_EVENT_TIMING_SLOTS at SQL level), and silent- empty for + * out-of-range matches the behaviour of sister functions like + * pg_stat_get_wait_event_timing(NULL) which iterate the shared array + * internally. FREE-but-in-range slots also return empty (see the state + * check below); the caller cannot distinguish out-of-range from FREE, + * which is fine. + */ + if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS) + return; + + slot = &WaitEventTraceCtl->trace_slots[procNumber]; + + /* + * If the trace DSA was never created (no backend in the cluster has ever + * set wait_event_capture = trace), every slot is still in its initial + * FREE state. Skip without taking the lock. + */ + if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID) + return; + + /* + * Unlocked fast-path check; the authoritative check is under the lock + * below. + */ + if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_FREE) + return; + + wait_event_trace_ensure_dsa(); + if (trace_dsa == NULL) + return; + + /* + * Allocate the worst-case result buffer BEFORE taking the lock. The + * buffer is sized for the full ring (sizeof(WetValidRecord) * + * WaitEventTraceRingSize, e.g. ~5 MB at the 4 MB default ring and up to + * ~42 MB at the 32 MB maximum); on a near-empty ring most goes unused, + * but that is preferable to holding the WaitEventTraceCtl lock during a + * palloc that may bottom out in a glibc mmap() syscall (allocations above + * the malloc-mmap threshold). Glibc's arena-internal mutex around the + * syscall would serialise every concurrent reader of this lock through + * one VMA-modifying kernel operation; sizing the alloc outside the lock + * keeps the lock-hold time bounded by the per-record loop alone. + * + * After we acquire the lock we will either consume this buffer (writing + * up to (write_pos - read_start) entries) or release it unused on an + * early return. + */ + + /* + * Worst-case size = ring size. Derive it from the GUC on first use in + * this backend; subsequent calls see the cached value. The GUC is + * PGC_POSTMASTER so the value is the same across every backend in this + * postmaster run and never changes. + */ + if (WaitEventTraceRingSize == 0) + WaitEventTraceRingSize = + (uint32) wait_event_trace_ring_size * 1024U / + (uint32) sizeof(WaitEventTraceRecord); + valid_records = palloc(sizeof(WetValidRecord) * WaitEventTraceRingSize); + + LWLockAcquire(&WaitEventTraceCtl->lock, LW_SHARED); + + state_now = pg_atomic_read_u32(&slot->state); + if (state_now == WAIT_EVENT_TRACE_SLOT_FREE || + !DsaPointerIsValid(slot->ring_ptr)) + { + LWLockRelease(&WaitEventTraceCtl->lock); + pfree(valid_records); + return; + } + + ts = (WaitEventTraceState *) dsa_get_address(trace_dsa, slot->ring_ptr); + write_pos = pg_atomic_read_u64(&ts->write_pos); + + if (write_pos == 0) + { + LWLockRelease(&WaitEventTraceCtl->lock); + pfree(valid_records); + return; + } + + /* Live range: oldest available to newest. */ + { + uint64 ring_size = (uint64) ts->ring_mask + 1; + + read_start = (write_pos > ring_size) + ? write_pos - ring_size : 0; + } + + for (i = read_start; i < write_pos; i++) + { + WaitEventTraceRecord *rec_shared = + &ts->records[i & ts->ring_mask]; + WetValidRecord *out = &valid_records[valid_count]; + uint32 expected_seq; + uint32 seq_before; + uint32 seq_after; + + /* + * Position-encoded seqlock identity check (NOT just parity). + * + * The writer encodes the ring position into the seq value: mid-write + * -> (uint32)(pos * 2 + 1), complete -> + 2. After RING_SIZE writes + * the slot wraps and the same memory location gets a new seq value + * (next_pos * 2 + 2) that is numerically distinct from the previous + * cycle's seq. + * + * A parity-only check (skip on odd seq, accept on stable even) is + * INSUFFICIENT for this layout in the cross-backend case: if the + * writer just incremented write_pos to pos+1 but cross-process cache + * coherence has not yet propagated the subsequent rec->seq = + * (pos*2+1) store, this reader at i = pos would see the previous + * cycle's complete-even seq (from logical position pos - RING_SIZE). + * Both seq_before and seq_after would read that stale even value, + * parity passes, identity-against-itself passes, and a record + * belonging to the PREVIOUS cycle gets emitted with the new + * ring_index = pos. Silent data corruption (wrong attribution, not + * torn bytes). + * + * The fix is identity against EXPECTED: a record is valid for + * iterator position i if and only if its seq equals (uint32)(i * 2 + + * 2) -- the writer's encoded "complete" value for that exact ring + * position. This rejects: + * + * * Stale prior cycle (seq < expected): writer hasn't yet advanced + * rec->seq for the current cycle. * Mid-write current cycle (seq == + * expected - 1, odd): writer is in the payload write window. * Ring + * wrapped past us (seq > expected): the writer completed a later + * cycle on this slot during our read. + * + * The uint32 wraparound at 2^31 cycles is safe: we use exact + * equality, and the writer's existing wrap-safety argument + * (sizeof(seq) > worst-case in-flight window by 11 orders of + * magnitude) covers the seq value. + */ + expected_seq = (uint32) (i * 2 + 2); + + seq_before = rec_shared->seq; + pg_read_barrier(); + + if (seq_before != expected_seq) + continue; + + out->rec = *rec_shared; /* one 32-byte structure copy */ + + pg_read_barrier(); + seq_after = rec_shared->seq; + + if (seq_after != expected_seq) + continue; + + out->ring_index = i; + valid_count++; + } + + LWLockRelease(&WaitEventTraceCtl->lock); + + /* + * Walk the local result array and emit rows. No shared-memory access + * from here on, so spills to disk by the tuplestore (if the result is + * large) do not hold any wait-event-timing lock. + */ + for (i = 0; i < valid_count; i++) + { + WetValidRecord *vr = &valid_records[i]; + WaitEventTraceRecord *rec = &vr->rec; + Datum values[6]; + bool nulls[6]; + const char *event_type; + const char *event_name; + uint8 rtype = rec->record_type; + uint32 event_info; + int64 duration_ns; + int64 query_id; + + if (rtype == TRACE_WAIT_EVENT) + { + event_info = rec->data.wait.event; + duration_ns = rec->data.wait.duration_ns; + query_id = 0; + + /* Skip empty wait events. */ + if (event_info == 0) + continue; + + event_type = pgstat_get_wait_event_type(event_info); + event_name = pgstat_get_wait_event(event_info); + } + else if (rtype == TRACE_QUERY_START) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + event_type = "Query"; + event_name = "QueryStart"; + } + else if (rtype == TRACE_QUERY_END) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + event_type = "Query"; + event_name = "QueryEnd"; + } + else if (rtype == TRACE_EXEC_START) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + event_type = "Query"; + event_name = "ExecStart"; + } + else if (rtype == TRACE_EXEC_END) + { + event_info = 0; + duration_ns = 0; + query_id = rec->data.query.query_id; + event_type = "Query"; + event_name = "ExecEnd"; + } + else + { + /* Unrecognised record_type -- skip defensively. */ + continue; + } + + if (event_type == NULL || event_name == NULL) + continue; + + memset(nulls, 0, sizeof(nulls)); + + values[0] = Int64GetDatum((int64) vr->ring_index); + values[1] = Int64GetDatum(rec->timestamp_ns); + values[2] = CStringGetTextDatum(event_type); + values[3] = CStringGetTextDatum(event_name); + values[4] = Float8GetDatum((double) duration_ns / 1000.0); + values[5] = Int64GetDatum(query_id); + + tuplestore_putvalues(rsinfo->setResult, + rsinfo->setDesc, + values, nulls); + } + + pfree(valid_records); +} + +/* + * SQL function: pg_get_wait_event_trace(procnumber int4) + * + * Cross-backend trace ring reader. Returns the records from the trace + * ring belonging to the backend that currently or previously occupied + * the given procNumber slot. Reads OWNED and ORPHANED slots uniformly; + * FREE slots return an empty result. + * + * This SRF is the in-tree consumer of the orphan-preserved trace data: + * a backend that exited while wait_event_capture = trace leaves its + * ring allocated in DSA in ORPHANED state, and this function reads it + * until either a new backend takes over the same procNumber or the + * DBA calls pg_stat_clear_orphaned_wait_event_rings(). External + * extensions that need cross-backend access follow the same + * snapshot pattern documented on WaitEventTraceControl in + * wait_event_timing.h; this function serves as both the reference + * implementation and a DBA-facing diagnostic tool. + * + * Privileges: REVOKE'd from PUBLIC and GRANT'ed to pg_read_all_stats + * in system_views.sql, matching the privilege model of the session- + * local view pg_backend_wait_event_trace. + * + * The procnumber argument can be obtained from the procnumber column + * of pg_stat_get_wait_event_timing or pg_stat_get_wait_event_timing_ + * overflow. For pid-keyed access against live backends, callers can + * do: + * + * SELECT * FROM pg_get_wait_event_trace( + * (SELECT procnumber FROM pg_stat_get_wait_event_timing() + * WHERE pid = LIMIT 1)); + * + * Note that pid-keyed access cannot read ORPHANED slots because a + * dying backend's pid is removed from procArray on exit; for + * post-mortem reading of short-lived backends (parallel workers, + * autovacuum, walsender) the procNumber must be captured before the + * backend exits, or discovered by iterating procnumbers in a + * monitoring background worker. + */ +Datum +pg_get_wait_event_trace(PG_FUNCTION_ARGS) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + int32 procNumber = PG_GETARG_INT32(0); + + InitMaterializedSRF(fcinfo, 0); + + emit_wait_event_trace_for_procnumber((int) procNumber, rsinfo); + + PG_RETURN_VOID(); +} + +/* + * SQL function: pg_stat_clear_orphaned_wait_event_rings() + * + * Free every trace ring whose owner has exited (slot state ORPHANED). + * Returns the number of rings released. + * + * Why this exists. When a backend that had wait_event_capture = trace + * exits, we deliberately do NOT free its ~4 MB trace ring (see the + * lifecycle discussion on WaitEventTraceControl): the data must remain + * readable by cross-backend consumers -- the in-tree + * pg_get_wait_event_trace SRF and any extension following the + * snapshot pattern on WaitEventTraceControl -- and an exit-time + * dsa_free would defeat that. + * The reclaim instead happens lazily in two places: + * + * (a) wait_event_trace_clear_orphan_at_init(): when a new backend + * inherits the same procNumber slot at init, it frees the prior + * orphan as part of starting clean. This handles the common + * case (busy clusters with connection churn) automatically. + * + * (b) THIS FUNCTION: an explicit DBA-driven sweep that releases + * every currently orphaned ring at once. + * + * The pathological case (a) does not handle is "capture briefly + * enabled, then disabled, on a cluster with long-lived pooled + * connections that never exit". In that scenario procNumbers do not + * recycle, so prior orphans persist until cluster restart unless the + * DBA calls this function. Worst-case bound is + * NUM_WAIT_EVENT_TIMING_SLOTS * sizeof(WaitEventTraceState) which is + * ~400 MB at MaxBackends=100, ~4 GB at MaxBackends=1000 -- bounded + * but worth a kill switch. + * + * Permissions: execution is revoked from PUBLIC by default, matching the + * cluster-wide reset (pg_stat_reset_wait_event_timing_all). This is a + * cluster-scope memory-reclamation operation: it can disrupt any + * concurrent cross-backend reader on any orphaned slot. The + * disruption is bounded (readers retry via the generation counter + * and at worst skip one read) but the operation is still + * cluster-wide, so the default privilege matches the reset variant + * with the same blast radius; administrators can delegate with GRANT. + * + * The function is safe to call even when no orphans exist (returns + * 0) and even when capture is currently OFF (the slot array exists + * unconditionally; only the rings are lazy). + */ +Datum +pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS) +{ + int64 freed = 0; + int i; + + /* + * Execution is revoked from PUBLIC in system_views.sql; administrators + * can delegate with GRANT EXECUTE. + */ + if (WaitEventTraceCtl == NULL) + PG_RETURN_INT64(0); + + /* + * If no backend has ever enabled trace, the trace DSA was never created + * and there cannot be any ORPHANED slots: every slot is still in its + * initial FREE state. Nothing to do. + */ + if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID) + PG_RETURN_INT64(0); + + /* Attach to the trace DSA so dsa_free() can be called. */ + wait_event_trace_ensure_dsa(); + if (trace_dsa == NULL) + PG_RETURN_INT64(0); + + /* + * Walk every slot, taking and releasing WaitEventTraceCtl->lock per slot + * rather than holding it across the entire sweep. + * + * Rationale: at MaxBackends = 1000 with a fully-orphaned cluster the + * per-slot work (atomic state read + dsa_free + ring_ptr clear + atomic + * state write) totals a few microseconds; holding the lock across all + * slots would yield a millisecond-scale lock-hold window during which + * every concurrent backend startup (the lazy + * wait_event_trace_clear_orphan_at_init path), every cross-backend reader + * (pg_get_wait_event_trace and the external snapshot pattern), and every + * capture step-down or restore would stall. PG's general convention is + * to keep LWLock-held windows in paths that compete with regular activity + * well under 100 microseconds; per-slot release/reacquire gives us a + * worst- case lock-hold of one slot's worth of work regardless of how + * many orphans exist cluster-wide. + * + * An unlocked fast-path read of slot->state skips non-ORPHANED slots + * without an LWLockAcquire/Release pair. This is safe: if a slot races + * from non-ORPHANED to ORPHANED after we read it, we miss that orphan -- + * but the function is documented as a snapshot sweep, the missed orphan + * can be cleared by a subsequent call, and the same race exists for + * orphans that appear after the loop ends. The authoritative re-check + * under the lock prevents racing on the dsa_free direction (we never free + * a slot whose owner became OWNED again). + * + * CHECK_FOR_INTERRUPTS at the top of the loop body lets the caller cancel + * a long sweep; with the previous single-lock structure the + * InterruptHoldoffCount elevation from LWLockAcquire deferred all + * cancellation until release. + */ + for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++) + { + WaitEventTraceSlot *slot = &WaitEventTraceCtl->trace_slots[i]; + + CHECK_FOR_INTERRUPTS(); + + /* Unlocked fast-path: skip non-ORPHANED slots cheaply. */ + if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_ORPHANED) + continue; + + LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE); + + /* + * Authoritative re-check under the lock. A concurrent + * clear_orphan_at_init may have already freed this slot. + */ + if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED && + DsaPointerIsValid(slot->ring_ptr)) + { + pg_atomic_fetch_add_u64(&slot->generation, 1); + dsa_free(trace_dsa, slot->ring_ptr); + slot->ring_ptr = InvalidDsaPointer; + pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE); + freed++; + } + + LWLockRelease(&WaitEventTraceCtl->lock); + } + + PG_RETURN_INT64(freed); +} + #endif /* USE_WAIT_EVENT_TIMING */ diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 3d8c9bdebd5..b7e4caa9e3b 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -70,6 +70,7 @@ #include "utils/snapmgr.h" #include "utils/syscache.h" #include "utils/timeout.h" +#include "utils/wait_event_timing.h" /* has this backend called EmitConnectionWarnings()? */ static bool ConnectionWarningsEmitted; @@ -1250,6 +1251,16 @@ InitPostgres(const char *in_dbname, Oid dboid, /* Process pg_db_role_setting options */ process_settings(MyDatabaseId, GetSessionUserId()); +#ifdef USE_WAIT_EVENT_TIMING + + /* + * Attach trace ring if wait_event_capture = trace was set via + * config/db/role settings + */ + if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0) + wait_event_trace_attach(my_trace_proc_number); +#endif + /* Apply PostAuthDelay as soon as we've read all options */ if (PostAuthDelay > 0) pg_usleep(PostAuthDelay * 1000000L); diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat index 8f299f0971e..26f9a818ee9 100644 --- a/src/backend/utils/misc/guc_parameters.dat +++ b/src/backend/utils/misc/guc_parameters.dat @@ -3451,6 +3451,17 @@ max => '65534', }, +{ name => 'wait_event_trace_ring_size', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE', + short_desc => 'Sets the size of each backend\'s wait-event-trace ring buffer.', + long_desc => 'Each backend that enables wait_event_capture = trace allocates a ring buffer of this size from a cluster-wide DSA. The value must be a power of two and is fixed at server start. Larger rings retain longer histories before wrapping.', + flags => 'GUC_UNIT_KB', + variable => 'wait_event_trace_ring_size', + boot_val => '4096', + min => '8', + max => '32768', + check_hook => 'check_wait_event_trace_ring_size', +}, + { name => 'wal_block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS', short_desc => 'Shows the block size in the write ahead log.', flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE', diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 362fa48e7e6..23dfec1732a 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -703,9 +703,11 @@ #track_cost_delay_timing = off #track_io_timing = off #track_wal_io_timing = off -#wait_event_capture = off # off, stats +#wait_event_capture = off # off, stats, trace # (requires --enable-wait-event-timing) #wait_event_timing_max_tranches = 192 # (change requires restart) +#wait_event_trace_ring_size = 4MB # power of two, 8kB .. 32MB + # (change requires restart) #track_functions = none # none, pl, all #stats_fetch_consistency = cache # cache, none, snapshot diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 9c5dad6d925..58821d5012c 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -12751,4 +12751,30 @@ provolatile => 'v', prorettype => 'void', proargtypes => '', prosrc => 'pg_stat_reset_wait_event_timing_all' }, +{ oid => '9957', + descr => 'current backend wait event trace ring buffer', + proname => 'pg_get_backend_wait_event_trace', prorows => '1000', + proretset => 't', provolatile => 's', proparallel => 'r', + prorettype => 'record', proargtypes => '', + proallargtypes => '{int8,int8,text,text,float8,int8}', + proargmodes => '{o,o,o,o,o,o}', + proargnames => '{seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}', + prosrc => 'pg_get_backend_wait_event_trace' }, + +{ oid => '9962', + descr => 'wait event trace ring for the given procnumber slot, live or post-mortem', + proname => 'pg_get_wait_event_trace', prorows => '1000', + proretset => 't', provolatile => 'v', proparallel => 'r', + prorettype => 'record', proargtypes => 'int4', + proallargtypes => '{int4,int8,int8,text,text,float8,int8}', + proargmodes => '{i,o,o,o,o,o,o}', + proargnames => '{procnumber,seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}', + prosrc => 'pg_get_wait_event_trace' }, + +{ oid => '9961', + descr => 'statistics: free wait-event-trace rings whose owner backend has exited (superuser only); returns count freed', + proname => 'pg_stat_clear_orphaned_wait_event_rings', + provolatile => 'v', prorettype => 'int8', proargtypes => '', + prosrc => 'pg_stat_clear_orphaned_wait_event_rings' }, + ] diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h index f0a27019a11..ab842923db4 100644 --- a/src/include/storage/lwlocklist.h +++ b/src/include/storage/lwlocklist.h @@ -141,3 +141,4 @@ PG_LWLOCKTRANCHE(PARALLEL_VACUUM_DSA, ParallelVacuumDSA) PG_LWLOCKTRANCHE(AIO_URING_COMPLETION, AioUringCompletion) PG_LWLOCKTRANCHE(SHMEM_INDEX, ShmemIndex) PG_LWLOCKTRANCHE(WAIT_EVENT_TIMING_DSA, WaitEventTimingDSA) +PG_LWLOCKTRANCHE(WAIT_EVENT_TRACE_DSA, WaitEventTraceDSA) diff --git a/src/include/storage/subsystemlist.h b/src/include/storage/subsystemlist.h index 331d60cf703..90b14235464 100644 --- a/src/include/storage/subsystemlist.h +++ b/src/include/storage/subsystemlist.h @@ -80,6 +80,7 @@ PG_SHMEM_SUBSYSTEM(AsyncShmemCallbacks) PG_SHMEM_SUBSYSTEM(StatsShmemCallbacks) PG_SHMEM_SUBSYSTEM(WaitEventCustomShmemCallbacks) PG_SHMEM_SUBSYSTEM(WaitEventTimingShmemCallbacks) +PG_SHMEM_SUBSYSTEM(WaitEventTraceControlShmemCallbacks) #ifdef USE_INJECTION_POINTS PG_SHMEM_SUBSYSTEM(InjectionPointShmemCallbacks) #endif diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index 7896e9c7239..afd9017e45c 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -174,6 +174,7 @@ extern void assign_transaction_timeout(int newval, void *extra); extern const char *show_unix_socket_permissions(void); extern bool check_wait_event_capture(int *newval, void **extra, GucSource source); extern void assign_wait_event_capture(int newval, void *extra); +extern bool check_wait_event_trace_ring_size(int *newval, void **extra, GucSource source); extern bool check_wal_buffers(int *newval, void **extra, GucSource source); extern bool check_wal_consistency_checking(char **newval, void **extra, GucSource source); diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h index 88a5015fb9c..d5766934b8c 100644 --- a/src/include/utils/wait_event_timing.h +++ b/src/include/utils/wait_event_timing.h @@ -30,7 +30,9 @@ #include "port/atomics.h" #include "portability/instr_time.h" +#include "storage/lwlock.h" #include "storage/shmem.h" +#include "utils/dsa.h" #include "utils/wait_event_types.h" /* @@ -41,23 +43,26 @@ * OFF - No instrumentation, no hot-path cost. * STATS - Aggregated per-event statistics (counts, durations, histogram) * exposed via pg_stat_wait_event_timing. - * - * A further TRACE level is added later in the series. + * TRACE - Everything in STATS plus a per-session ring buffer of individual + * wait events and query-attribution markers, exposed via + * pg_backend_wait_event_trace. */ typedef enum WaitEventCaptureLevel { WAIT_EVENT_CAPTURE_OFF = 0, WAIT_EVENT_CAPTURE_STATS, + WAIT_EVENT_CAPTURE_TRACE, } WaitEventCaptureLevel; /* - * Pin the enum ordering at compile time so future code that compares with - * >= against WAIT_EVENT_CAPTURE_STATS keeps working, and so reordering is - * caught at build time rather than via mysterious runtime mode switches. + * Pin the enum ordering at compile time so code that compares with >= keeps + * working, and so reordering is caught at build time rather than via + * mysterious runtime mode switches. */ StaticAssertDecl(WAIT_EVENT_CAPTURE_OFF == 0 && - WAIT_EVENT_CAPTURE_STATS == 1, - "WaitEventCaptureLevel values must be 0=OFF < 1=STATS"); + WAIT_EVENT_CAPTURE_STATS == 1 && + WAIT_EVENT_CAPTURE_TRACE == 2, + "WaitEventCaptureLevel values must be 0=OFF < 1=STATS < 2=TRACE"); /* * Number of log2 histogram buckets. Bin edges are powers of two on the @@ -194,11 +199,176 @@ extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing; extern PGDLLIMPORT const ShmemCallbacks WaitEventTimingShmemCallbacks; /* - * Called from InitProcess()/InitAuxiliaryProcess() to point - * my_wait_event_timing at this backend's slot, and from ProcKill() to - * clear it. + * Called from InitProcess()/InitAuxiliaryProcess() to set up this backend's + * timing/trace bookkeeping, and from ProcKill() to clear it. */ extern void pgstat_set_wait_event_timing_storage(int procNumber); extern void pgstat_reset_wait_event_timing_storage(void); + +/* ---------------------------------------------------------------------- + * Trace level (wait_event_capture = trace) + * + * In addition to the STATS aggregates, every completed wait (and a set of + * query-attribution markers) is pushed into a per-session ring buffer -- + * one record per completed wait. The ring is allocated lazily in DSA on + * first use, + * so only sessions that enable trace pay the per-ring memory cost. External + * tools read a session's ring via pg_get_backend_wait_event_trace() (own + * session) or pg_get_wait_event_trace(procnumber) (cross-backend). + * + * Query attribution is by scanning the ring at read time: QUERY/EXEC + * START/END markers delimit which wait events belong to which query_id. + * + * The ring size is set cluster-wide at server start by the + * wait_event_trace_ring_size GUC (PGC_POSTMASTER, default 4 MB). It + * MUST be a power of two: the writer indexes the ring as (pos & ring_mask). + * ---------------------------------------------------------------------- + */ + +/* Trace record types */ +#define TRACE_WAIT_EVENT 0 +#define TRACE_QUERY_START 1 +#define TRACE_QUERY_END 2 +#define TRACE_EXEC_START 3 +#define TRACE_EXEC_END 4 + +typedef struct WaitEventTraceRecord +{ + /* + * Seqlock for torn-read detection. Writers set seq odd before filling + * fields, then even after; readers check seq before and after and skip + * the record if either is odd or they differ. uint32 wrap is irrelevant + * over the ~10-20 ns reader access window. + */ + uint32 seq; + uint8 record_type; /* TRACE_WAIT_EVENT / QUERY_* / EXEC_* */ + uint8 pad[3]; + int64 timestamp_ns; /* monotonic clock */ + union + { + struct /* record_type = TRACE_WAIT_EVENT */ + { + uint32 event; /* wait_event_info */ + uint32 pad2; + int64 duration_ns; + } wait; + struct /* QUERY_START/END or EXEC_START/END */ + { + int64 query_id; + int64 pad2; + } query; + } data; +} WaitEventTraceRecord; /* 32 bytes */ + +/* + * The seqlock wrap-safety argument and the mask-index math both rely on a + * fixed 32-byte record stride; make a stray field addition a build failure. + */ +StaticAssertDecl(sizeof(WaitEventTraceRecord) == 32, + "WaitEventTraceRecord must be exactly 32 bytes"); + +/* + * Per-backend trace ring header followed by the records array. records[] + * is variably sized at allocation time (wait_event_trace_ring_size + * decides the row count). write_pos and ring_mask share a cache line so + * the hot-path index calculation touches one line. + */ +typedef struct WaitEventTraceState +{ + pg_atomic_uint64 write_pos; /* monotonically increasing, wraps via mask */ + uint32 ring_mask; /* ring_size - 1; ring_size is a power of two */ + uint32 ring_size_pad; /* keep the records[] slab 16-byte aligned */ + WaitEventTraceRecord records[FLEXIBLE_ARRAY_MEMBER]; +} WaitEventTraceState; + +/* + * Per-procNumber trace-ring slot lifecycle. Decoupled from backend + * lifecycle on purpose: when a backend exits we transition its slot to + * ORPHANED and leave the ring in DSA so cross-backend consumers can still + * read the dying backend's final waits. An orphan is reclaimed when a new + * backend takes the same procNumber, or by + * pg_stat_clear_orphaned_wait_event_rings(). + * + * FREE no ring allocated (ring_ptr invalid). + * OWNED a live backend at this procNumber is writing to the ring. + * ORPHANED the owner exited; the ring is post-mortem and immutable. + */ +typedef enum WaitEventTraceSlotState +{ + WAIT_EVENT_TRACE_SLOT_FREE = 0, + WAIT_EVENT_TRACE_SLOT_OWNED, + WAIT_EVENT_TRACE_SLOT_ORPHANED, +} WaitEventTraceSlotState; + +/* + * Per-procNumber slot. generation is bumped on every owner transition; + * cross-backend readers snapshot it before+after their read and retry if it + * changed (the BackendStatusArray st_changecount idiom). state is atomic + * only for cheap unlocked "worth visiting" probes; authoritative reads of + * (state, ring_ptr) are done under WaitEventTraceCtl->lock in LW_SHARED, + * while every transition holds it LW_EXCLUSIVE. + */ +typedef struct WaitEventTraceSlot +{ + pg_atomic_uint64 generation; /* bumped on every owner transition */ + pg_atomic_uint32 state; /* WaitEventTraceSlotState */ + uint32 pad; /* keep ring_ptr 8-aligned */ + dsa_pointer ring_ptr; /* InvalidDsaPointer when FREE; else the + * WaitEventTraceState chunk */ +} WaitEventTraceSlot; + +/* + * Control struct in fixed shared memory. trace_slots[] is indexed by + * procNumber. + * + * External cross-backend reader protocol (pg_get_wait_event_trace is the + * reference implementation): + * 1. read trace_slots[procNumber].state unlocked as a "worth visiting" + * probe; FREE -> nothing to read. + * 2. acquire lock LW_SHARED (all transitions take LW_EXCLUSIVE, so the + * slot's state/ring_ptr/ring memory are stable for the iteration). + * 3. re-check state under the lock; resolve ring_ptr via dsa_get_address; + * read write_pos. + * 4. iterate [read_start, write_pos): for each record do the per-record + * POSITION-ENCODED IDENTITY seqlock check against shared memory -- + * expected_seq = (uint32)(i*2 + 2); read seq, barrier, copy record, + * barrier, re-read seq; accept only if both equal expected_seq. This + * rejects stale previous-cycle reads (parity alone would not). + * 5. release the lock; emit the buffered records afterwards. + * 6. optional: snapshot generation before/after if releasing the lock + * between batches. + */ +typedef struct WaitEventTraceControl +{ + dsa_handle trace_dsa_handle; /* DSA_HANDLE_INVALID until first use */ + LWLock lock; /* protects DSA creation and slot transitions */ + WaitEventTraceSlot trace_slots[FLEXIBLE_ARRAY_MEMBER]; /* per procNumber */ +} WaitEventTraceControl; + +/* Trace GUC and the records-per-ring value derived from it at startup. */ +extern PGDLLIMPORT int wait_event_trace_ring_size; +extern PGDLLIMPORT uint32 WaitEventTraceRingSize; + +/* This backend's procNumber for the trace ring, or -1 if not set. */ +extern PGDLLIMPORT int my_trace_proc_number; + +/* + * Trace control shmem -- registered via subsystemlist.h; a no-op stub when + * the feature is compiled out. + */ +extern PGDLLIMPORT const ShmemCallbacks WaitEventTraceControlShmemCallbacks; + +/* + * Lazy DSA-based trace ring allocation -- called on first trace write and + * at backend startup when capture = trace was set via configuration. + */ +extern void wait_event_trace_attach(int procNumber); + +/* Query-attribution markers (defined in wait_event_timing.c). */ +extern void wait_event_trace_query_start(int64 query_id); +extern void wait_event_trace_query_end(int64 query_id); +extern void wait_event_trace_exec_start(int64 query_id); +extern void wait_event_trace_exec_end(int64 query_id); + #endif /* WAIT_EVENT_TIMING_H */ diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build index ee290698b31..aba6fbdf86e 100644 --- a/src/test/modules/test_misc/meson.build +++ b/src/test/modules/test_misc/meson.build @@ -23,6 +23,7 @@ tests += { 't/012_ddlutils.pl', 't/013_temp_obj_multisession.pl', 't/014_log_statement_max_length.pl', + 't/015_wait_event_trace_seqlock.pl', ], # The injection points are cluster-wide, so disable installcheck 'runningcheck': false, diff --git a/src/test/modules/test_misc/t/015_wait_event_trace_seqlock.pl b/src/test/modules/test_misc/t/015_wait_event_trace_seqlock.pl new file mode 100644 index 00000000000..0f0e49970c8 --- /dev/null +++ b/src/test/modules/test_misc/t/015_wait_event_trace_seqlock.pl @@ -0,0 +1,122 @@ +# Copyright (c) 2026, PostgreSQL Global Development Group + +# Test the position-encoded identity seqlock that protects cross-backend +# reads of the wait-event trace ring (wait_event_capture = trace). +# +# The hazard: the trace writer advances write_pos and only then stamps the +# record's seq. A cross-backend reader that observes the new write_pos +# before the seq store has propagated sees, at the in-flight ring slot, the +# PREVIOUS cycle's record -- complete, with an even seq. A parity-only +# seqlock would accept it and emit a stale record attributed to the wrong +# ring index; the identity check (seq must equal the writer's completion +# value for that exact position) must reject it. +# +# That window is unobservable on TSO hardware without instrumentation, so +# the writer carries INJECTION_POINT("wait-event-trace-after-write-pos") +# between the write_pos advance and the seq stamp. This test: +# +# 1. fills and wraps a minimum-size ring (8kB = 256 records), so every +# slot holds a complete record from the previous cycle; +# 2. wedges the writer at the injection point, mid-record; +# 3. reads the ring cross-backend: the reader must return exactly +# ring_size - 1 records, skipping the in-flight slot whose stale +# prior-cycle record a parity-only check would have emitted; +# 4. releases the writer and verifies the ring reads full again. + +use strict; +use warnings FATAL => 'all'; + +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +plan skip_all => 'Injection points not supported by this build' + unless $ENV{enable_injection_points} eq 'yes'; + +my $ring_records = 256; # 8kB ring / 32-byte records + +my $node = PostgreSQL::Test::Cluster->new('seqlock'); +$node->init; +$node->append_conf( + 'postgresql.conf', q[ +wait_event_trace_ring_size = '8kB' +]); +$node->start; + +# Skip if the server was not built with --enable-wait-event-timing. +my ($ret, $stdout, $stderr) = + $node->psql('postgres', 'SET wait_event_capture = trace;'); +if ($ret != 0) +{ + $node->stop; + plan skip_all => 'server not built with --enable-wait-event-timing'; +} + +$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;'); + +# Writer session: enable trace and wrap the ring. 400 pg_sleep calls emit +# at least 400 wait events into a 256-record ring, so every slot holds a +# complete record from the current window. +my $writer = $node->background_psql('postgres'); +$writer->query_safe('SET wait_event_capture = trace;'); +$writer->query_safe( + 'SELECT count(pg_sleep(0.001)) FROM generate_series(1, 400);'); + +my $writer_proc = $writer->query_safe( + 'SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid())' + . ' LIMIT 1;'); +chomp $writer_proc; +like($writer_proc, qr/^\d+$/, 'writer reported its procnumber'); + +# With the ring wrapped and the writer idle, a cross-backend read returns +# exactly ring_size records: every slot is complete and identity-valid. +my $count_full = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);"); +is($count_full, $ring_records, 'wrapped ring reads full before the wedge'); + +# Wedge the writer mid-record: arm the injection point, then send a +# statement. The arrival of the statement completes the writer's +# ClientRead wait; its trace write advances write_pos and then blocks at +# the injection point, before stamping the record's seq. +$node->safe_psql('postgres', + "SELECT injection_points_attach('wait-event-trace-after-write-pos', 'wait');" +); +$writer->query_until( + qr/wedge_sent/, q[ +\echo wedge_sent +SELECT 1; +]); +$node->wait_for_event('client backend', 'wait-event-trace-after-write-pos'); + +# The decisive read: the in-flight slot still holds the previous cycle's +# complete record. A parity-only seqlock would emit it (ring_size rows, +# one misattributed); the identity check must skip exactly that slot. +my $count_wedged = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);"); +is($count_wedged, $ring_records - 1, + 'reader skips the in-flight slot instead of emitting the stale prior-cycle record' +); + +# The read is stable and repeatable while the writer is wedged. +my $count_wedged2 = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);"); +is($count_wedged2, $count_wedged, 'wedged-ring read is stable'); + +# Release the writer: detach first so the nested wakeup wait does not +# re-arm, then wake it. +$node->safe_psql('postgres', + "SELECT injection_points_detach('wait-event-trace-after-write-pos');"); +$node->safe_psql('postgres', + "SELECT injection_points_wakeup('wait-event-trace-after-write-pos');"); + +# The writer completes the wedged record (and its pending statement); the +# ring must read full again. +$writer->query_safe("SELECT 'resync';"); +my $count_after = $node->safe_psql('postgres', + "SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);"); +is($count_after, $ring_records, 'ring reads full again after release'); + +$writer->quit; +$node->stop; + +done_testing(); diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 71c95cb0746..9d64e125972 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1332,6 +1332,13 @@ pg_backend_memory_contexts| SELECT name, free_chunks, used_bytes FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, type, level, path, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes); +pg_backend_wait_event_trace| SELECT seq, + timestamp_ns, + wait_event_type, + wait_event, + duration_us, + query_id + FROM pg_get_backend_wait_event_trace() t(seq, timestamp_ns, wait_event_type, wait_event, duration_us, query_id); pg_config| SELECT name, setting FROM pg_config() pg_config(name, setting); diff --git a/src/test/regress/expected/wait_event_timing.out b/src/test/regress/expected/wait_event_timing.out index 8938ffe5fb0..9fc9b115e79 100644 --- a/src/test/regress/expected/wait_event_timing.out +++ b/src/test/regress/expected/wait_event_timing.out @@ -1,9 +1,10 @@ -- -- WAIT_EVENT_TIMING -- --- Exercises the wait_event_capture = stats instrumentation: the GUC, the --- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view, --- and the pg_wait_event_timing_histogram_buckets taxonomy view. +-- Exercises the wait_event_capture instrumentation: the GUC, the stats +-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing +-- and histogram-buckets views, overflow counters, resets), and the trace +-- surface (the per-session ring and its readers). -- -- Two expected outputs are maintained: -- wait_event_timing.out -- --enable-wait-event-timing builds @@ -133,3 +134,42 @@ SELECT pg_stat_reset_wait_event_timing(2147483647); (1 row) RESET wait_event_capture; +-- +-- Trace level: per-session ring of individual waits + query markers. +-- (In a stub build SET trace errors and the trace readers stay empty; +-- that is the documented difference between the two expected files.) +-- +SET wait_event_capture = trace; +SELECT pg_sleep(0.1); + pg_sleep +---------- + +(1 row) + +-- PgSleep is recorded in this backend's ring with a positive duration. +SELECT count(*) >= 1 AS pgsleep_in_ring, + coalesce(bool_and(duration_us > 0), false) AS durations_positive +FROM pg_get_backend_wait_event_trace() +WHERE wait_event = 'PgSleep'; + pgsleep_in_ring | durations_positive +-----------------+-------------------- + t | t +(1 row) + +-- The same records are visible through the view. +SELECT count(*) >= 1 AS view_has_pgsleep +FROM pg_backend_wait_event_trace +WHERE wait_event = 'PgSleep'; + view_has_pgsleep +------------------ + t +(1 row) + +-- Clearing orphaned rings is a no-op here (no orphans) but must succeed. +SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; + clear_orphans_ok +------------------ + t +(1 row) + +RESET wait_event_capture; diff --git a/src/test/regress/expected/wait_event_timing_1.out b/src/test/regress/expected/wait_event_timing_1.out index 3aad898d9bf..f8b3beec02f 100644 --- a/src/test/regress/expected/wait_event_timing_1.out +++ b/src/test/regress/expected/wait_event_timing_1.out @@ -1,9 +1,10 @@ -- -- WAIT_EVENT_TIMING -- --- Exercises the wait_event_capture = stats instrumentation: the GUC, the --- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view, --- and the pg_wait_event_timing_histogram_buckets taxonomy view. +-- Exercises the wait_event_capture instrumentation: the GUC, the stats +-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing +-- and histogram-buckets views, overflow counters, resets), and the trace +-- surface (the per-session ring and its readers). -- -- Two expected outputs are maintained: -- wait_event_timing.out -- --enable-wait-event-timing builds @@ -123,3 +124,45 @@ SELECT pg_stat_reset_wait_event_timing(2147483647); ERROR: wait event capture is not supported by this build HINT: Compile PostgreSQL with --enable-wait-event-timing. RESET wait_event_capture; +-- +-- Trace level: per-session ring of individual waits + query markers. +-- (In a stub build SET trace errors and the trace readers stay empty; +-- that is the documented difference between the two expected files.) +-- +SET wait_event_capture = trace; +ERROR: invalid value for parameter "wait_event_capture": "trace" +DETAIL: This build does not support wait event capture. +HINT: Compile PostgreSQL with --enable-wait-event-timing. +SELECT pg_sleep(0.1); + pg_sleep +---------- + +(1 row) + +-- PgSleep is recorded in this backend's ring with a positive duration. +SELECT count(*) >= 1 AS pgsleep_in_ring, + coalesce(bool_and(duration_us > 0), false) AS durations_positive +FROM pg_get_backend_wait_event_trace() +WHERE wait_event = 'PgSleep'; + pgsleep_in_ring | durations_positive +-----------------+-------------------- + f | f +(1 row) + +-- The same records are visible through the view. +SELECT count(*) >= 1 AS view_has_pgsleep +FROM pg_backend_wait_event_trace +WHERE wait_event = 'PgSleep'; + view_has_pgsleep +------------------ + f +(1 row) + +-- Clearing orphaned rings is a no-op here (no orphans) but must succeed. +SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; + clear_orphans_ok +------------------ + t +(1 row) + +RESET wait_event_capture; diff --git a/src/test/regress/sql/wait_event_timing.sql b/src/test/regress/sql/wait_event_timing.sql index a30fcd155fd..49206afef66 100644 --- a/src/test/regress/sql/wait_event_timing.sql +++ b/src/test/regress/sql/wait_event_timing.sql @@ -1,9 +1,10 @@ -- -- WAIT_EVENT_TIMING -- --- Exercises the wait_event_capture = stats instrumentation: the GUC, the --- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view, --- and the pg_wait_event_timing_histogram_buckets taxonomy view. +-- Exercises the wait_event_capture instrumentation: the GUC, the stats +-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing +-- and histogram-buckets views, overflow counters, resets), and the trace +-- surface (the per-session ring and its readers). -- -- Two expected outputs are maintained: -- wait_event_timing.out -- --enable-wait-event-timing builds @@ -77,3 +78,27 @@ SELECT pg_stat_reset_wait_event_timing(); SELECT pg_stat_reset_wait_event_timing(2147483647); RESET wait_event_capture; + +-- +-- Trace level: per-session ring of individual waits + query markers. +-- (In a stub build SET trace errors and the trace readers stay empty; +-- that is the documented difference between the two expected files.) +-- +SET wait_event_capture = trace; +SELECT pg_sleep(0.1); + +-- PgSleep is recorded in this backend's ring with a positive duration. +SELECT count(*) >= 1 AS pgsleep_in_ring, + coalesce(bool_and(duration_us > 0), false) AS durations_positive +FROM pg_get_backend_wait_event_trace() +WHERE wait_event = 'PgSleep'; + +-- The same records are visible through the view. +SELECT count(*) >= 1 AS view_has_pgsleep +FROM pg_backend_wait_event_trace +WHERE wait_event = 'PgSleep'; + +-- Clearing orphaned rings is a no-op here (no orphans) but must succeed. +SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; + +RESET wait_event_capture; diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 7d43fc4e5b6..63f207f3da6 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -3431,6 +3431,11 @@ WaitEventTimeout WaitEventTimingControl WaitEventTimingEntry WaitEventTimingState +WaitEventTraceControl +WaitEventTraceRecord +WaitEventTraceSlot +WaitEventTraceSlotState +WaitEventTraceState WaitLSNProcInfo WaitLSNResult WaitLSNState @@ -3461,6 +3466,7 @@ WalUsage WalWriteMethod WalWriteMethodOps Walfile +WetValidRecord WindowAgg WindowAggPath WindowAggState -- 2.43.0