From a0305d3242b854b3a22ded5d66bbf3dba0bf943c Mon Sep 17 00:00:00 2001
From: Dmitry Fomin <fomin.list@gmail.com>
Date: Thu, 11 Jun 2026 15:10:36 +0000
Subject: [PATCH v1 6/7] wait_event_timing: add trace level with a per-session
 ring buffer

Add the third capture level, wait_event_capture = trace.  On top of the
STATS aggregates, every completed wait is pushed into a per-session ring
buffer in DSA -- one record per wait -- allocated lazily on first use so
only sessions that enable trace pay the (default 4 MB) per-ring cost.

The ring is exposed two ways:

  - pg_get_backend_wait_event_trace() / the pg_backend_wait_event_trace
    view read the calling backend's own ring;
  - pg_get_wait_event_trace(procnumber) reads any backend's ring
    cross-process, including rings left behind by exited backends.

A backend's ring is not freed when it exits: the slot transitions to
ORPHANED and the ring stays in DSA so cross-backend consumers can read
the dying backend's final waits (important for short-lived parallel
workers).  An orphan is reclaimed when a new backend reuses the same
procNumber (clear-on-init) or by pg_stat_clear_orphaned_wait_event_rings()
(execution revoked from PUBLIC; delegable with GRANT).  Slot transitions
(FREE/OWNED/ORPHANED) are serialised by the WaitEventTraceControl lock;
the ring size is fixed cluster-wide at server start by
wait_event_trace_ring_size (power of two, default 4 MB).

The single-writer hot path writes each record under a seqlock (odd seq
while writing, even when complete).  Cross-backend readers use a
POSITION-ENCODED IDENTITY seqlock -- a record at ring index i is valid
only if its seq equals the writer's complete value for that exact position
-- which rejects stale previous-cycle reads that a parity-only seqlock
would accept under cross-process visibility lag.  A TAP test drives an
injection point between the writer's write_pos advance and its seq stamp
to prove exactly that: with the ring wrapped and the writer wedged
mid-record, a cross-backend read returns ring_size - 1 records, skipping
the in-flight slot whose stale prior-cycle record a parity-only check
would have emitted.

The trace ring writer carries the same teardown discipline as the stats
path: a wait_event_trace_writes_disabled gate (raised around slot
transitions and at proc_exit) plus in_attach/in_release re-entrancy guards
keep the DSA-internal LWLock waits that those operations emit from
recursing into a ring that is being freed or orphaned.  A further
re-entrancy guard on the record writer itself keeps wait events emitted
mid-record-write (reachable only via the injection point; the write is
plain stores in production) from recursing into the writer.

Privileges: both trace SRFs and the session-local view are REVOKE'd from
PUBLIC and GRANT'ed to pg_read_all_stats (reading a session's trace
exposes its wait sequence).

Query-attribution markers and their executor/protocol hooks are added in
the next commit; this commit records wait events only.
---
 doc/src/sgml/config.sgml                      |   37 +-
 doc/src/sgml/monitoring.sgml                  |  301 ++-
 src/backend/catalog/system_views.sql          |   24 +
 src/backend/postmaster/auxprocess.c           |   11 +
 .../utils/activity/wait_event_names.txt       |    1 +
 .../utils/activity/wait_event_timing.c        | 1966 ++++++++++++++++-
 src/backend/utils/init/postinit.c             |   11 +
 src/backend/utils/misc/guc_parameters.dat     |   11 +
 src/backend/utils/misc/postgresql.conf.sample |    4 +-
 src/include/catalog/pg_proc.dat               |   26 +
 src/include/storage/lwlocklist.h              |    1 +
 src/include/storage/subsystemlist.h           |    1 +
 src/include/utils/guc_hooks.h                 |    1 +
 src/include/utils/wait_event_timing.h         |  190 +-
 src/test/modules/test_misc/meson.build        |    1 +
 .../t/015_wait_event_trace_seqlock.pl         |  122 +
 src/test/regress/expected/rules.out           |    7 +
 .../regress/expected/wait_event_timing.out    |   46 +-
 .../regress/expected/wait_event_timing_1.out  |   49 +-
 src/test/regress/sql/wait_event_timing.sql    |   31 +-
 src/tools/pgindent/typedefs.list              |    6 +
 21 files changed, 2743 insertions(+), 104 deletions(-)
 create mode 100644 src/test/modules/test_misc/t/015_wait_event_trace_seqlock.pl

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 15131fd9b7f..734310935ed 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -9148,7 +9148,9 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
         Controls collection of wait event timing instrumentation.  Requires
         the server to be compiled with
         <option>--enable-wait-event-timing</option>.  Possible values are
-        <literal>off</literal> (the default) and <literal>stats</literal>.
+        <literal>off</literal> (the default), <literal>stats</literal>, and
+        <literal>trace</literal>; each level is a strict superset of the
+        previous one.
        </para>
        <para>
         At <literal>stats</literal>, the server records per-backend wait
@@ -9162,6 +9164,16 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv;
         <varname>wait_event_capture</varname> is <literal>off</literal> the
         hot path adds only a single predictable branch.
        </para>
+       <para>
+        At <literal>trace</literal>, the server additionally records every
+        individual wait event into a per-session ring buffer exposed via the
+        <link linkend="monitoring-pg-backend-wait-event-trace-view">
+        <structname>pg_backend_wait_event_trace</structname></link> view.
+        The ring is allocated lazily from dynamic shared memory on first
+        use (default 4&nbsp;MB per backend; see
+        <xref linkend="guc-wait-event-trace-ring-size"/>), so only
+        sessions that enable trace pay the per-ring memory cost.
+       </para>
        <para>
         Only superusers and users with the appropriate <literal>SET</literal>
         privilege can change this setting.  Read access to the resulting
@@ -9203,6 +9215,29 @@ GRANT SET ON PARAMETER wait_event_capture TO pg_monitor;
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-wait-event-trace-ring-size" xreflabel="wait_event_trace_ring_size">
+      <term><varname>wait_event_trace_ring_size</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>wait_event_trace_ring_size</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Sets the size of the per-backend ring buffer used at the
+        <literal>trace</literal> level of
+        <xref linkend="guc-wait-event-capture"/>.  Each backend that enables
+        trace allocates a ring of this size from a cluster-wide dynamic
+        shared memory area.  The value must be a power of two; larger rings
+        retain a longer history of wait events before wrapping.  If this
+        parameter is specified without units, it is taken as kilobytes.  The
+        default is <literal>4096</literal> (4&nbsp;MB).  This parameter has
+        no effect on builds compiled without
+        <option>--enable-wait-event-timing</option>, and can only be set at
+        server start.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-track-functions" xreflabel="track_functions">
       <term><varname>track_functions</varname> (<type>enum</type>)
       <indexterm>
diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 34676e43eae..2f1fc457d00 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -579,6 +579,15 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
       </entry>
      </row>
 
+     <row>
+      <entry><structname>pg_backend_wait_event_trace</structname><indexterm><primary>pg_backend_wait_event_trace</primary></indexterm></entry>
+      <entry>Individual wait event records from the current backend's trace
+       ring buffer.  See
+       <link linkend="monitoring-pg-backend-wait-event-trace-view">
+       <structname>pg_backend_wait_event_trace</structname></link> for details.
+      </entry>
+     </row>
+
      <!-- all "stat" for schema objects, by "importance" -->
 
      <row>
@@ -3859,7 +3868,8 @@ description | Waiting for a newly initialized WAL file to reach durable storage
    The <structname>pg_stat_wait_event_timing</structname> view contains one
    row for each combination of backend and wait event that has a non-zero
    call count.  It shows accumulated timing statistics collected when
-   <xref linkend="guc-wait-event-capture"/> is set to <literal>stats</literal>.  Requires the server to be compiled with
+   <xref linkend="guc-wait-event-capture"/> is set to <literal>stats</literal>
+   or <literal>trace</literal>.  Requires the server to be compiled with
    <option>--enable-wait-event-timing</option>.
   </para>
 
@@ -4225,6 +4235,200 @@ ORDER  BY b.bucket_idx;
   </table>
  </sect2>
 
+ <sect2 id="monitoring-pg-backend-wait-event-trace-view">
+  <title><structname>pg_backend_wait_event_trace</structname></title>
+
+  <indexterm>
+   <primary>pg_backend_wait_event_trace</primary>
+  </indexterm>
+
+  <para>
+   The <structname>pg_backend_wait_event_trace</structname> view shows
+   individual wait event records from the <emphasis>current backend's</emphasis>
+   trace ring buffer.  Each record captures a single wait event
+   (with timestamp and duration).
+   Requires <xref linkend="guc-wait-event-capture"/> to be set to
+   <literal>trace</literal>.  The ring buffer is sized by
+   <xref linkend="guc-wait-event-trace-ring-size"/> (default 4&nbsp;MB =
+   131072 records of 32 bytes each); older records are overwritten in
+   FIFO order.  The view is session-local
+   and analogous in scope to
+   <link linkend="view-pg-backend-memory-contexts">
+   <structname>pg_backend_memory_contexts</structname></link>; querying it
+   from a superuser session still returns only that session's own
+   records, never another backend's.
+  </para>
+
+  <para>
+   The <structname>pg_backend_wait_event_trace</structname> view is
+   intended for <emphasis>session-local interactive diagnostics</emphasis>:
+   running ad-hoc <literal>SELECT</literal> queries against your own
+   session's trace from <application>psql</application> while
+   investigating wait-event behaviour.  The view materialises up to
+   one ring's worth of records (default ~4&nbsp;MB, controlled by
+   <xref linkend="guc-wait-event-trace-ring-size"/>) into a
+   tuplestore on each call, which is bounded and acceptable for that
+   use; for narrow result sets, append
+   <literal>ORDER BY seq DESC LIMIT <replaceable>N</replaceable></literal>
+   to get the most recent records.
+  </para>
+
+  <para>
+   Cross-backend monitoring tools &mdash; extensions and background
+   workers that read wait events losslessly from every backend's
+   ring &mdash; should <emphasis>not</emphasis> consume through this
+   view.  The in-tree cross-backend reader is
+   <function>pg_get_wait_event_trace</function>
+   (see <xref linkend="monitoring-stats-functions"/>); the underlying
+   per-session SQL function returns only the calling backend's own
+   ring, so a background worker invoking
+   <command>SELECT * FROM pg_backend_wait_event_trace</command> via
+   SPI would receive only its own (typically empty) ring, not the
+   target backend's data.  External tools that need cross-backend
+   access without going through SQL use the shared-memory snapshot
+   pattern documented on
+   <structname>WaitEventTraceControl</structname> in
+   <filename>src/include/utils/wait_event_timing.h</filename>:
+   snapshot <structfield>trace_slots[procNumber].generation</structfield>,
+   acquire <structname>WaitEventTraceCtl->lock</structname> in
+   <literal>LW_SHARED</literal>, resolve the target slot's
+   <structfield>ring_ptr</structfield> via
+   <function>dsa_get_address</function>, snapshot the relevant slice
+   of the ring into local memory, release the lock, re-snapshot
+   <structfield>generation</structfield> and discard the read if it
+   changed, then process the snapshot off the lock.  That bypasses
+   this view entirely and is the supported cross-backend interface
+   for monitoring extensions.
+  </para>
+
+  <para>
+   <emphasis>Slot lifecycle.</emphasis>  Per-backend trace rings are
+   not freed when their owner backend exits.  The ring stays
+   allocated in shared memory in an <quote>orphaned</quote> state
+   so the dying backend's final waits remain readable by the
+   cross-backend interface &mdash;
+   <function>pg_get_wait_event_trace</function>
+   (see <xref linkend="monitoring-stats-functions"/>) for in-tree
+   access, or external background workers that follow the
+   snapshot pattern documented above.
+   <emphasis>This does not change the behaviour of this view</emphasis>,
+   which always reads the calling backend's own ring and is
+   unaffected by orphan-state slots belonging to other
+   procnumbers.  The lifecycle change matters for short-lived
+   backends that exit before any monitoring tool has read their
+   data: parallel workers in particular exit in milliseconds at
+   end-of-parallel-query, well below typical reader polling
+   intervals, and without orphan-persistence their final waits
+   would be lost.  Orphaned rings are reclaimed automatically when a new
+   backend takes over the same <literal>procNumber</literal>
+   slot, and the DBA can force a sweep at any time via
+   <function>pg_stat_clear_orphaned_wait_event_rings</function>.
+   The worst-case orphan-memory footprint is bounded by the slot
+   count times ~4&nbsp;MB; see
+   <function>pg_stat_clear_orphaned_wait_event_rings</function>
+   under <xref linkend="monitoring-stats-functions"/> for details and
+   the deployment patterns where the function is most useful.
+  </para>
+
+  <para>
+   The ring buffer is designed as a lock-free transport mechanism for
+   external consumption.  At high wait event rates (e.g., 220K events/sec),
+   the ring wraps in roughly 0.5&ndash;1 seconds.  Consumers should poll
+   the ring buffer before it wraps.
+  </para>
+
+  <para>
+   The <structfield>seq</structfield> column is the absolute write
+   position of each record; it is monotonically increasing and never
+   resets while the ring is alive.  A consumer polling the ring
+   repeatedly can detect wraparound losses by tracking
+   <function>max(seq)</function> between successive scrapes: given two
+   consecutive polls returning <replaceable>N2</replaceable> rows with
+   maximum <structfield>seq</structfield> values
+   <replaceable>S1</replaceable> (previous poll) and
+   <replaceable>S2</replaceable> (current poll), the number of records
+   overwritten before the second poll could read them is
+   <literal>max(0, (S2 - S1) - N2)</literal>.  No separate
+   <quote>trace overflow</quote> counter is exposed because this
+   information is exact and derivable from <structfield>seq</structfield>
+   alone.
+  </para>
+
+
+  <table id="pg-backend-wait-event-trace-view" xreflabel="pg_backend_wait_event_trace">
+   <title><structname>pg_backend_wait_event_trace</structname> View</title>
+   <tgroup cols="1">
+    <thead>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       Column Type
+      </para>
+      <para>
+       Description
+      </para></entry>
+     </row>
+    </thead>
+
+    <tbody>
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>seq</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Sequence number of this record in the ring buffer
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>timestamp_ns</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Monotonic clock timestamp in nanoseconds
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wait_event_type</structfield> <type>text</type>
+      </para>
+      <para>
+       Wait event type
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>wait_event</structfield> <type>text</type>
+      </para>
+      <para>
+       Wait event name
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>duration_us</structfield> <type>double precision</type>
+      </para>
+      <para>
+       Wait duration in microseconds
+      </para></entry>
+     </row>
+
+     <row>
+      <entry role="catalog_table_entry"><para role="column_definition">
+       <structfield>query_id</structfield> <type>bigint</type>
+      </para>
+      <para>
+       Reserved; always 0 for wait event records
+      </para></entry>
+     </row>
+
+    </tbody>
+   </tgroup>
+  </table>
+ </sect2>
+
  <sect2 id="monitoring-pg-stat-database-view">
   <title><structname>pg_stat_database</structname></title>
 
@@ -6332,6 +6536,101 @@ ORDER  BY b.bucket_idx;
        </para></entry>
       </row>
 
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_get_wait_event_trace</primary>
+        </indexterm>
+        <function>pg_get_wait_event_trace</function> ( <parameter>procnumber</parameter> <type>integer</type> )
+        <returnvalue>setof record</returnvalue>
+       </para>
+       <para>
+        Returns individual wait event records from the trace ring of
+        the backend that currently or previously occupied the slot
+        identified by <parameter>procnumber</parameter>.  Reads slots
+        in <literal>OWNED</literal> state (live writer) and
+        <literal>ORPHANED</literal> state (writer has exited but the
+        ring is preserved for post-mortem reading) uniformly.  An
+        empty result indicates the slot is in <literal>FREE</literal>
+        state (no ring) or no records have been written.  Concurrent
+        slot transitions cannot interrupt the read because the
+        function holds the cross-backend trace lock in
+        <literal>SHARED</literal> mode throughout the iteration; the
+        per-record seqlock protocol skips any record being written
+        by a concurrent live writer.
+       </para>
+       <para>
+        This is the canonical cross-backend reader.  External
+        monitoring extensions that need cross-backend access without
+        going through SQL should follow the same snapshot pattern
+        documented on <structname>WaitEventTraceControl</structname>
+        in <filename>src/include/utils/wait_event_timing.h</filename>;
+        this function serves as both the reference implementation and
+        a DBA-facing diagnostic tool.  The
+        <parameter>procnumber</parameter> argument can be obtained
+        from the <structfield>procnumber</structfield> column of
+        <function>pg_stat_get_wait_event_timing</function> or
+        <function>pg_stat_get_wait_event_timing_overflow</function>
+        for live backends.  For post-mortem reads of short-lived
+        backends (parallel workers, autovacuum, walsender) the
+        <parameter>procnumber</parameter> must be captured while the
+        backend is still alive, or discovered by iterating slots in a
+        monitoring background worker.  A pid-keyed lookup for live
+        backends only is one query away:
+
+<programlisting>
+SELECT * FROM pg_get_wait_event_trace(
+    (SELECT procnumber FROM pg_stat_get_wait_event_timing(<replaceable>target_pid</replaceable>)
+     WHERE pid = <replaceable>target_pid</replaceable> LIMIT 1));
+</programlisting>
+       </para>
+       <para>
+        Requires membership in <literal>pg_read_all_stats</literal>
+        (matching the privilege model of the session-local view
+        <structname>pg_backend_wait_event_trace</structname>).
+       </para></entry>
+      </row>
+
+      <row>
+       <entry role="func_table_entry"><para role="func_signature">
+        <indexterm>
+         <primary>pg_stat_clear_orphaned_wait_event_rings</primary>
+        </indexterm>
+        <function>pg_stat_clear_orphaned_wait_event_rings</function> ()
+        <returnvalue>bigint</returnvalue>
+       </para>
+       <para>
+        Frees every wait-event-trace ring whose owner backend has
+        exited.  Returns the number of rings released.  Requires
+        superuser.
+       </para>
+       <para>
+        When a backend that had <varname>wait_event_capture</varname> =
+        <literal>trace</literal> exits, its ~4 MB trace ring is
+        intentionally <emphasis>not</emphasis> freed at exit so that
+        cross-backend consumers
+        (<function>pg_get_wait_event_trace</function> and extensions
+        following the snapshot pattern) can still read the dying
+        backend's final waits.  The
+        memory is reclaimed lazily: in the common case, the ring is
+        freed automatically when a new backend takes over the same
+        <literal>procNumber</literal> slot.  This function is the
+        explicit DBA-driven sweep for the pathological case where
+        capture was briefly enabled, then disabled, on a cluster with
+        long-lived pooled connections that never recycle the
+        <literal>procNumber</literal>.  The maximum amount of memory
+        this can release is bounded by the slot count times the
+        per-ring size (~400 MB at <varname>max_connections</varname>
+        = 100, ~4 GB at 1000); on most deployments the function will
+        report 0 because connection churn already drained orphans
+        naturally.
+       </para>
+       <para>
+        Safe to call when capture is currently <literal>off</literal>
+        and even when no orphans exist (returns 0 in both cases).
+       </para></entry>
+      </row>
+
       <row>
        <entry role="func_table_entry"><para role="func_signature">
         <indexterm>
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 96aba833895..ccb2fb51236 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1643,7 +1643,31 @@ CREATE VIEW pg_stat_wait_event_timing_overflow AS
 REVOKE ALL ON pg_stat_wait_event_timing_overflow FROM PUBLIC;
 GRANT SELECT ON pg_stat_wait_event_timing_overflow TO pg_read_all_stats;
 
+-- Per-session wait event trace ring, one record per completed wait
+-- (wait_event_capture = trace).
+-- Reading a session's trace exposes its query_id and wait sequence, which can
+-- leak across SECURITY DEFINER call chains, so the view AND both underlying
+-- SRFs are locked to pg_read_all_stats.
+CREATE VIEW pg_backend_wait_event_trace AS
+    SELECT
+        t.seq,
+        t.timestamp_ns,
+        t.wait_event_type,
+        t.wait_event,
+        t.duration_us,
+        t.query_id
+    FROM pg_get_backend_wait_event_trace() t;
+REVOKE ALL ON pg_backend_wait_event_trace FROM PUBLIC;
+GRANT SELECT ON pg_backend_wait_event_trace TO pg_read_all_stats;
+-- Revoke the session-local SRF itself, not just the view, so a role that can
+-- enable trace cannot read its own ring via the function and bypass the view.
+REVOKE EXECUTE ON FUNCTION pg_get_backend_wait_event_trace() FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_backend_wait_event_trace() TO pg_read_all_stats;
+-- Cross-backend reader, keyed by procnumber (reads OWNED and ORPHANED slots).
+REVOKE EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) FROM PUBLIC;
+GRANT EXECUTE ON FUNCTION pg_get_wait_event_trace(int4) TO pg_read_all_stats;
 -- Cluster-scope operations: revoked from PUBLIC (administrators can
 -- delegate with GRANT EXECUTE); not granted to pg_read_all_stats because
 -- they mutate state rather than read it.
 REVOKE EXECUTE ON FUNCTION pg_stat_reset_wait_event_timing_all() FROM PUBLIC;
+REVOKE EXECUTE ON FUNCTION pg_stat_clear_orphaned_wait_event_rings() FROM PUBLIC;
diff --git a/src/backend/postmaster/auxprocess.c b/src/backend/postmaster/auxprocess.c
index ad4bf4bd2a8..6575dec4f00 100644
--- a/src/backend/postmaster/auxprocess.c
+++ b/src/backend/postmaster/auxprocess.c
@@ -26,6 +26,7 @@
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
 #include "utils/wait_event.h"
+#include "utils/wait_event_timing.h"
 
 
 static void ShutdownAuxiliaryProcess(int code, Datum arg);
@@ -113,6 +114,16 @@ AuxiliaryProcessMainCommon(void)
 	 */
 	CreateAuxProcessResourceOwner();
 
+#ifdef USE_WAIT_EVENT_TIMING
+
+	/*
+	 * Attach trace ring if wait_event_capture = trace was set via
+	 * postgresql.conf
+	 */
+	if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0)
+		wait_event_trace_attach(my_trace_proc_number);
+#endif
+
 
 	/* Initialize backend status information */
 	pgstat_beinit();
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 0a06cd68604..fe3fac660c9 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -418,6 +418,7 @@ ParallelVacuumDSA	"Waiting for parallel vacuum dynamic shared memory allocation.
 AioUringCompletion	"Waiting for another process to complete IO via io_uring."
 ShmemIndex	"Waiting to find or allocate space in shared memory."
 WaitEventTimingDSA	"Waiting for wait event timing dynamic shared memory allocation."
+WaitEventTraceDSA	"Waiting for wait event trace dynamic shared memory allocation."
 
 # No "ABI_compatibility" region here as WaitEventLWLock has its own C code.
 
diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c
index e64919271a1..993bd303fba 100644
--- a/src/backend/utils/activity/wait_event_timing.c
+++ b/src/backend/utils/activity/wait_event_timing.c
@@ -15,11 +15,16 @@
  * attach on their first wait event under capture; a before_shmem_exit gate
  * keeps the hot path away from DSA mappings that proc_exit has torn down.
  *
- * Controlled by the wait_event_capture GUC (off | stats, default off) and
- * the compile-time option --enable-wait-event-timing.  In builds without
- * that option the file still compiles (the GUC backing variable, the enum
- * table, the rejecting check hook, and empty-result SQL stubs), so the GUC
- * and the catalog functions exist uniformly.
+ * At the trace level, every completed wait (and a set of query-attribution
+ * markers) is additionally pushed into a per-session DSA ring buffer that
+ * survives backend exit for post-mortem reading; see the trace section
+ * below and the lifecycle discussion on WaitEventTraceControl.
+ *
+ * Controlled by the wait_event_capture GUC (off | stats | trace, default
+ * off) and the compile-time option --enable-wait-event-timing.  In builds
+ * without that option the file still compiles (the GUC backing variables,
+ * the enum table, the rejecting check hook, and empty-result SQL stubs),
+ * so the GUC and the catalog functions exist uniformly.
  *
  * Copyright (c) 2026, PostgreSQL Global Development Group
  *
@@ -40,17 +45,46 @@
  */
 int			wait_event_capture = WAIT_EVENT_CAPTURE_OFF;
 int			wait_event_timing_max_tranches = 192;
+int			wait_event_trace_ring_size = 4096;
+
+/*
+ * Records-per-ring derived from wait_event_trace_ring_size at server
+ * start.  Read by the writer (via the per-ring cached mask) and by the ring
+ * allocator.  Zero until the GUC framework commits the boot value.
+ */
+uint32		WaitEventTraceRingSize = 0;
 
 /*
  * Enum value table consumed by guc.c.  Order matches the
- * WaitEventCaptureLevel enum and the documented "off < stats" ordering.
+ * WaitEventCaptureLevel enum and the documented "off < stats < trace".
  */
 const struct config_enum_entry wait_event_capture_options[] = {
 	{"off", WAIT_EVENT_CAPTURE_OFF, false},
 	{"stats", WAIT_EVENT_CAPTURE_STATS, false},
+	{"trace", WAIT_EVENT_CAPTURE_TRACE, false},
 	{NULL, 0, false}
 };
 
+/*
+ * GUC check hook for wait_event_trace_ring_size.  The ring size in records
+ * must be a power of two for the writer's mask-indexing (pos & ring_mask).
+ * Each record is 32 bytes, so kb is a power of two iff the record count is.
+ * Defined for both build configurations so the GUC framework validates it
+ * uniformly.
+ */
+bool
+check_wait_event_trace_ring_size(int *newval, void **extra, GucSource source)
+{
+	int			v = *newval;
+
+	if (v <= 0 || (v & (v - 1)) != 0)
+	{
+		GUC_check_errdetail("wait_event_trace_ring_size must be a positive power of two.");
+		return false;
+	}
+	return true;
+}
+
 #ifndef USE_WAIT_EVENT_TIMING
 
 /*
@@ -64,6 +98,9 @@ Datum		pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS);
 Datum		pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS);
 Datum		pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS);
 Datum		pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS);
+Datum		pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS);
+Datum		pg_get_wait_event_trace(PG_FUNCTION_ARGS);
+Datum		pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS);
 
 Datum
 pg_stat_get_wait_event_timing(PG_FUNCTION_ARGS)
@@ -79,6 +116,27 @@ pg_stat_get_wait_event_timing_overflow(PG_FUNCTION_ARGS)
 	PG_RETURN_VOID();
 }
 
+Datum
+pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS)
+{
+	InitMaterializedSRF(fcinfo, 0);
+	PG_RETURN_VOID();
+}
+
+Datum
+pg_get_wait_event_trace(PG_FUNCTION_ARGS)
+{
+	InitMaterializedSRF(fcinfo, 0);
+	PG_RETURN_VOID();
+}
+
+Datum
+pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS)
+{
+	/* No trace infrastructure in this build, so never any orphans. */
+	PG_RETURN_INT64(0);
+}
+
 Datum
 pg_stat_reset_wait_event_timing(PG_FUNCTION_ARGS)
 {
@@ -134,9 +192,11 @@ assign_wait_event_capture(int newval, void *extra)
 
 /* No shared memory is reserved in the stub build. */
 const ShmemCallbacks WaitEventTimingShmemCallbacks = {0};
+const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {0};
 
 /* Defined so every extern in wait_event_timing.h resolves in stub builds. */
 WaitEventTimingState *my_wait_event_timing = NULL;
+int			my_trace_proc_number = -1;
 
 void
 pgstat_set_wait_event_timing_storage(int procNumber)
@@ -148,6 +208,35 @@ pgstat_reset_wait_event_timing_storage(void)
 {
 }
 
+void
+wait_event_trace_attach(int procNumber)
+{
+}
+
+/*
+ * Stub trace-marker entry points -- declared unconditionally in the header
+ * so call sites in execMain.c/postgres.c/backend_status.c need no #ifdef.
+ */
+void
+wait_event_trace_query_start(int64 query_id)
+{
+}
+
+void
+wait_event_trace_query_end(int64 query_id)
+{
+}
+
+void
+wait_event_trace_exec_start(int64 query_id)
+{
+}
+
+void
+wait_event_trace_exec_end(int64 query_id)
+{
+}
+
 #else							/* USE_WAIT_EVENT_TIMING */
 
 #include "catalog/pg_authid.h"
@@ -166,6 +255,7 @@ pgstat_reset_wait_event_timing_storage(void)
 #include "utils/backend_status.h"
 #include "utils/builtins.h"
 #include "utils/dsa.h"
+#include "utils/injection_point.h"
 #include "utils/tuplestore.h"
 #include "utils/wait_event.h"
 
@@ -232,6 +322,107 @@ static Size wait_event_timing_per_backend_stride = 0;
 static int	wait_event_timing_hash_size = 0;
 static int	wait_event_timing_max_entries = 0;
 
+/* ---- Trace-level (wait_event_capture = trace) state ---- */
+/* Pointer to this backend's trace ring buffer */
+static WaitEventTraceState *my_wait_event_trace = NULL;
+
+/* DSA-based trace ring buffer control */
+static WaitEventTraceControl *WaitEventTraceCtl = NULL;
+static dsa_area *trace_dsa = NULL;
+int			my_trace_proc_number = -1;
+
+/*
+ * Same-backend coordination between pg_get_backend_wait_event_trace (the
+ * own-session SRF reader) and wait_event_trace_release_slot (the GUC
+ * step-down path that frees this backend's ring).  Both paths run in this
+ * same backend, single-threaded, so a plain bool is sufficient -- no
+ * atomics needed.
+ *
+ *   srf_in_progress   set true while the SRF is iterating the ring; the
+ *                     release path observes this and defers the dsa_free
+ *                     instead of yanking the chunk out from under us.
+ *
+ *   release_pending   set by the release path when it had to defer; the
+ *                     SRF's PG_FINALLY checks it and performs the deferred
+ *                     dsa_free after the iteration completes.
+ *
+ * Cross-backend readers (extensions, bgworkers reading another backend's
+ * ring) cannot use this mechanism -- they coordinate with the release
+ * path via WaitEventTraceCtl->lock instead.  See the header for the
+ * recommended snapshot-under-lock pattern for those consumers.
+ */
+static bool wait_event_trace_srf_in_progress = false;
+static bool wait_event_trace_release_pending = false;
+
+/*
+ * Per-backend gate that disables the trace-ring writer in the wait-
+ * event hot path while a slot-state transition is in progress.
+ *
+ * Set true around code paths that either free the local trace ring
+ * (wait_event_trace_release_slot's dsa_free) or transition the slot
+ * out of OWNED (wait_event_trace_before_shmem_exit's OWNED ->
+ * ORPHANED publish).  In both cases an internal LWLock inside
+ * dsa_free / dsa_attach / dsa_pin_mapping / dsa_pin can in
+ * principle contend long enough to dispatch a wait event; that
+ * wait event's pgstat_report_wait_end_timing inline path runs in
+ * the SAME backend, sees capture_level == TRACE (the GUC hasn't
+ * been committed yet by the time the assign hook runs), and would:
+ *
+ *   * during release_slot's dsa_free: write into a ring that has
+ *     already been returned to the DSA freelist -- if another
+ *     allocator has since reused the chunk, this is a stray write
+ *     into someone else's allocation.
+ *
+ *   * during release_slot's dsa_free, alternative timing: see
+ *     my_wait_event_trace == NULL on a naive "clear before free"
+ *     fix and recurse into wait_event_trace_attach, which would
+ *     either deadlock on the WaitEventTraceCtl->lock the outer
+ *     release_slot already holds, or (on a lock-free moment)
+ *     allocate a fresh ring that the outer release_slot would
+ *     then free again as part of its post-acquire DsaPointerIsValid
+ *     check -- a different use-after-free of a freshly-allocated
+ *     chunk.
+ *
+ *   * during before_shmem_exit: write into the ring after the slot
+ *     has been published as ORPHANED, violating the post-mortem
+ *     read-only contract that cross-backend readers rely on.
+ *
+ * The flag is per-backend (static at file scope means per-process
+ * in PG's process-per-backend model), so the hot path's check is a
+ * single cache-warm load and a branch; no atomic, no fence.  The
+ * trace branch is already gated by capture_level == TRACE so the
+ * additional check costs nothing in the common case where capture
+ * is off or stats-only.  The flag is set on the very same backend
+ * that may later read it from the hot path, so there is no
+ * cross-process visibility concern.
+ *
+ * See the release_slot and before_shmem_exit doc comments for the
+ * specific transition each uses this flag around.
+ */
+static bool wait_event_trace_writes_disabled = false;
+
+/*
+ * Re-entrancy guard for the trace-record writer in
+ * pgstat_report_wait_end_timing(): wait events emitted while a trace
+ * record is mid-write are not themselves ring-recorded.  In production the
+ * record write is a handful of plain stores and cannot wait, so this never
+ * fires; it exists for the injection point inside the writer, whose wait
+ * machinery emits nested wait events that would otherwise recurse into the
+ * writer (and, by re-running the injection point, self-deadlock on locks
+ * the outer invocation holds).  Per-backend, single-threaded -- a plain
+ * bool suffices.
+ */
+static bool wait_event_trace_in_write = false;
+
+/*
+ * Forward declarations for trace functions referenced before their
+ * definitions (the storage/assign hooks call them; the bodies are in the
+ * trace machinery section below).
+ */
+static void wait_event_trace_detach(int procNumber);
+static void wait_event_trace_release_slot(int procNumber);
+static void wait_event_trace_clear_orphan_at_init(int procNumber);
+
 /*
  * Mapping arrays for the flat events[] array, generated from
  * wait_event_names.txt by generate-wait_event_types.pl.  Defines
@@ -747,17 +938,39 @@ void
 pgstat_set_wait_event_timing_storage(int procNumber)
 {
 	my_wait_event_timing = NULL;
+	my_wait_event_trace = NULL;
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+	{
+		my_trace_proc_number = -1;
+		return;
+	}
+
+	my_trace_proc_number = procNumber;
+
+	/*
+	 * If the previous occupant of this procNumber left an ORPHANED trace ring
+	 * behind (we do not free trace rings at backend exit -- see
+	 * WaitEventTraceControl), free it now so this backend starts with a clean
+	 * FREE slot.
+	 */
+	wait_event_trace_clear_orphan_at_init(procNumber);
 }
 
 /*
- * Detach from the timing slot on backend exit.  The slot stays in DSA;
- * clearing the local pointer keeps the late-shutdown hot path from touching
+ * Detach from the timing slot on backend exit.  The slots stay in DSA;
+ * clearing the local pointers keeps the late-shutdown hot path from touching
  * already-detached memory.
  */
 void
 pgstat_reset_wait_event_timing_storage(void)
 {
+	if (my_trace_proc_number >= 0)
+		wait_event_trace_detach(my_trace_proc_number);
+
 	my_wait_event_timing = NULL;
+	my_wait_event_trace = NULL;
+	my_trace_proc_number = -1;
 }
 
 /*
@@ -787,118 +1000,898 @@ assign_wait_event_capture(int newval, void *extra)
 		INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start);
 		my_wait_event_timing->current_event = 0;
 	}
+
+	/*
+	 * Stepping down from TRACE: release the ~4 MB DSA ring now rather than
+	 * holding it pinned for the rest of the session.  Only fires when a ring
+	 * is actually attached, so OFF -> TRACE -> OFF without ever emitting a
+	 * trace record stays a no-op.  Re-enable re-allocates a fresh ring on the
+	 * first wait event via wait_event_trace_attach.  dsa_free is non-raising
+	 * LWLock bookkeeping, so it is safe from an assign hook.
+	 */
+	if (newval != WAIT_EVENT_CAPTURE_TRACE && my_wait_event_trace != NULL)
+		wait_event_trace_release_slot(my_trace_proc_number);
 }
 
+/* ================= Trace-level ring machinery ================= */
+
+
 /*
- * Out-of-line body for pgstat_report_wait_start()'s timing path.  Records
- * the start timestamp and the event being waited on.  Reached only when
- * wait_event_capture != OFF.
+ * Write a trace ring marker record.  Shared helper for all marker types.
  */
-void
-pgstat_report_wait_start_timing(uint32 wait_event_info)
+static void
+wait_event_trace_write_marker(uint8 record_type, int64 query_id)
 {
+	uint64		pos;
+	WaitEventTraceRecord *rec;
+	uint32		seq;
+	instr_time	now;
+
 	/*
-	 * Stay out of the timing path once proc_exit has begun tearing down DSA
-	 * mappings (see the before_shmem_exit callback).
+	 * Single capture-level gate: markers only land in the ring when
+	 * wait_event_capture is at TRACE.  This guarantees consistency with the
+	 * wait-event hot path (also gated on the same level) -- there is no
+	 * configuration in which one half of the trace fires and the other
+	 * doesn't.  query_id == 0 means "no query ID available" (utility command
+	 * or compute_query_id = off), which we skip.
+	 *
+	 * wait_event_trace_writes_disabled is the same per-backend gate the
+	 * wait-event hot path uses; it is raised by release_slot and
+	 * before_shmem_exit around slot-state transitions to keep both writers
+	 * consistent.  Markers cannot fire during those transitions today
+	 * (single-threaded execution, no nested executor), but checking here
+	 * keeps the contract uniform across all trace-ring writers and is robust
+	 * to future code paths that might invoke a marker from a nested context.
+	 *
+	 * No likely()/unlikely() annotation: this function is called at
+	 * query/exec boundaries (a handful per query, not per wait event), so
+	 * neither side of the branch dominates often enough for static layout to
+	 * matter, and the meaningful production configuration (wait_event_capture
+	 * = trace) is exactly when the body is hot -- an annotation on the
+	 * early-return would point the wrong way.
 	 */
-	if (wait_event_timing_writes_disabled)
+	if (wait_event_capture != WAIT_EVENT_CAPTURE_TRACE ||
+		wait_event_trace_writes_disabled ||
+		query_id == 0)
 		return;
 
-	if (my_wait_event_timing == NULL)
+	/*
+	 * Lazy attach on first use.  Allocation lives here (not in the assign
+	 * hook) because dsa_allocate_extended() can ereport(ERROR) on OOM, which
+	 * is forbidden in assign-hook context but legitimate here.  Idempotent:
+	 * wait_event_trace_attach() short-circuits on subsequent calls.
+	 */
+	if (my_wait_event_trace == NULL)
 	{
-		pgstat_wait_event_timing_lazy_attach();
+		if (my_trace_proc_number < 0)
+			return;
+		wait_event_trace_attach(my_trace_proc_number);
+		if (my_wait_event_trace == NULL)
+			return;				/* attach path unable to allocate */
+	}
 
-		/*
-		 * lazy_attach can dispatch nested wait events while it sets up DSA
-		 * (dsa_attach takes an internal LWLock); those nested wait_end calls
-		 * clear my_wait_event_info to 0.  Re-publish so the outer wait stays
-		 * visible in pg_stat_activity.  Only needed on the first-attach path.
-		 */
-		*(volatile uint32 *) my_wait_event_info = wait_event_info;
+	/*
+	 * Claim the next slot.  Single-writer counter (only the owning backend
+	 * writes its own ring), so a plain read+write is sufficient and avoids
+	 * the LOCK XADD that pg_atomic_fetch_add_u64 would emit -- a wasted
+	 * cache-coherence trip on an unshared cache line at this rate (one per
+	 * wait event).  Cross-backend readers use pg_atomic_read_u64, which
+	 * compiles to a plain MOV on x86 and tolerates concurrent writes here
+	 * (their actual safety against the records[] window is the per-record
+	 * seqlock below).  Same idiom as injection_point.c's per-entry generation
+	 * counter (single writer + multiple lock-free readers).
+	 */
+	pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos);
+	pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1);
+	rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask];
+	seq = (uint32) (pos * 2 + 1);
+
+	rec->seq = seq;
+	pg_write_barrier();			/* release: payload stores must not rise above
+								 * seq=odd */
+
+	INSTR_TIME_SET_CURRENT(now);
+	rec->record_type = record_type;
+	rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now);
+	rec->data.query.query_id = query_id;
+	rec->data.query.pad2 = 0;
+
+	pg_write_barrier();			/* release: payload stores must land before
+								 * seq=even */
+	rec->seq = seq + 1;
+}
+
+void
+wait_event_trace_query_start(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_QUERY_START, query_id);
+}
+
+void
+wait_event_trace_query_end(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_QUERY_END, query_id);
+}
+
+void
+wait_event_trace_exec_start(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_EXEC_START, query_id);
+}
+
+void
+wait_event_trace_exec_end(int64 query_id)
+{
+	wait_event_trace_write_marker(TRACE_EXEC_END, query_id);
+}
+
+/*
+ * Report the shared memory space needed for trace ring buffer control.
+ * Only a small control struct is in fixed shmem; the actual ring buffers
+ * are allocated lazily via DSA.  At ~24 bytes/slot, the slot array adds
+ * ~26 KB at a default MaxBackends, negligible compared to the ring
+ * memory itself.
+ */
+static Size
+WaitEventTraceControlShmemSize(void)
+{
+	return add_size(offsetof(WaitEventTraceControl, trace_slots),
+					mul_size(NUM_WAIT_EVENT_TIMING_SLOTS,
+							 sizeof(WaitEventTraceSlot)));
+}
+
+static void
+WaitEventTraceControlShmemRequest(void *arg)
+{
+	ShmemRequestStruct(.name = "WaitEventTraceControl",
+					   .size = WaitEventTraceControlShmemSize(),
+					   .ptr = (void **) &WaitEventTraceCtl);
+}
+
+/*
+ * Initialize shared memory for trace ring buffer control.
+ */
+static void
+WaitEventTraceControlShmemInit(void *arg)
+{
+	int			i;
+
+	WaitEventTraceCtl->trace_dsa_handle = DSA_HANDLE_INVALID;
+	LWLockInitialize(&WaitEventTraceCtl->lock,
+					 LWTRANCHE_WAIT_EVENT_TRACE_DSA);
+	for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+	{
+		WaitEventTraceSlot *s = &WaitEventTraceCtl->trace_slots[i];
+
+		pg_atomic_init_u64(&s->generation, 0);
+		pg_atomic_init_u32(&s->state, WAIT_EVENT_TRACE_SLOT_FREE);
+		s->pad = 0;
+		s->ring_ptr = InvalidDsaPointer;
 	}
+}
 
-	if (my_wait_event_timing != NULL)
+const ShmemCallbacks WaitEventTraceControlShmemCallbacks = {
+	.request_fn = WaitEventTraceControlShmemRequest,
+	.init_fn = WaitEventTraceControlShmemInit,
+};
+
+/*
+ * Ensure the shared DSA for trace ring buffers exists and is attached.
+ * Creates it on first call (any backend), attaches on subsequent calls.
+ * Must be called from a backend context (not postmaster).
+ */
+static void
+wait_event_trace_ensure_dsa(void)
+{
+	MemoryContext oldcontext;
+
+	if (trace_dsa != NULL)
+		return;
+
+	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+
+	LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+	if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
 	{
-		INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start);
-		my_wait_event_timing->current_event = wait_event_info;
+		trace_dsa = dsa_create(LWTRANCHE_WAIT_EVENT_TRACE_DSA);
+		dsa_pin(trace_dsa);
+		dsa_pin_mapping(trace_dsa);
+		WaitEventTraceCtl->trace_dsa_handle = dsa_get_handle(trace_dsa);
+	}
+	else
+	{
+		trace_dsa = dsa_attach(WaitEventTraceCtl->trace_dsa_handle);
+		dsa_pin_mapping(trace_dsa);
 	}
+
+	LWLockRelease(&WaitEventTraceCtl->lock);
+
+	MemoryContextSwitchTo(oldcontext);
 }
 
 /*
- * Out-of-line body for pgstat_report_wait_end()'s timing path.  Computes
- * the wait duration and accumulates per-event statistics.
+ * Transition our trace ring slot to ORPHANED on backend exit.
  *
- * capture_level is the value of wait_event_capture observed at the inline
- * gate; in this commit only STATS exists, so it is not branched on, but it
- * is threaded through to keep the gate ABI stable for the trace level
- * added later in the series.
+ * Registered as a before_shmem_exit callback.  Runs BEFORE
+ * dsm_backend_shutdown() detaches the DSA.
+ *
+ * Crucially, we do NOT free the ring here.  The ring stays allocated in
+ * DSA so that cross-backend consumers -- the in-tree
+ * pg_get_wait_event_trace SRF and any extension following the
+ * snapshot pattern documented on WaitEventTraceControl -- can read
+ * the dying backend's final waits.  The original "free at exit"
+ * design lost data the instant a worker terminated, which was
+ * particularly bad for parallel workers exiting in milliseconds at
+ * end-of-parallel-query.  See the lifecycle comment on
+ * WaitEventTraceControl for the full design
+ * rationale and the bounded-memory cost we accept in exchange.
+ *
+ * The ORPHANED slot is reclaimed in one of two ways:
+ *   (a) a new backend at this procNumber calls
+ *       wait_event_trace_clear_orphan_at_init() at backend init, or
+ *   (b) the DBA calls pg_stat_clear_orphaned_wait_event_rings().
+ *
+ * State transition order matters: bump generation BEFORE storing the
+ * new state, so cross-backend readers that snapshot
+ * (generation_before, state, ring_ptr, generation_after) under the
+ * lock see a consistent (state, ring_ptr) pair iff generation didn't
+ * change.  We hold the lock for the whole transition, but readers do
+ * not have to (they just take it briefly to snapshot the ring
+ * contents); the generation check is what makes the unlocked-read
+ * path safe.
  */
-void
-pgstat_report_wait_end_timing(int capture_level)
+static void
+wait_event_trace_before_shmem_exit(int code, Datum arg)
 {
-	uint32		event;
-	uint32		cur_reset_gen;
+	int			procNumber = DatumGetInt32(arg);
+	WaitEventTraceSlot *slot;
 
-	(void) capture_level;
+	if (WaitEventTraceCtl == NULL)
+		return;
 
-	if (wait_event_timing_writes_disabled)
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
 		return;
 
-	if (my_wait_event_timing == NULL)
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	/*
+	 * If this backend never ended up with an OWNED slot (e.g. capture was off
+	 * the whole session, or the trace was released back to FREE via
+	 * assign_wait_event_capture going trace -> off), there is nothing to
+	 * transition.  Read state without the lock first as a fast-path check;
+	 * the authoritative re-check happens under the lock below.
+	 */
+	if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_OWNED)
 	{
-		pgstat_wait_event_timing_lazy_attach();
-		if (my_wait_event_timing == NULL)
-			return;
+		wait_event_trace_writes_disabled = true;
+		my_wait_event_trace = NULL;
+		return;
 	}
 
-	event = my_wait_event_timing->current_event;
+	/*
+	 * Disable trace-ring writes on this backend before we touch the lock.
+	 * Writes after this point would race with the OWNED -> ORPHANED state
+	 * publish below: a wait event whose end-timing path runs after the state
+	 * has been published as ORPHANED would write into a ring that the patch
+	 * contract declares read-only post-mortem.  Cross-backend readers
+	 * snapshot ORPHANED rings without expecting concurrent writes from the
+	 * dying owner.  See wait_event_trace_writes_disabled for the full UAF /
+	 * contract-violation analysis.
+	 *
+	 * The flag stays true for the remainder of this backend's life (we are in
+	 * proc_exit; there is no subsequent capture re-enable to handle), so we
+	 * do not reset it.
+	 */
+	wait_event_trace_writes_disabled = true;
+
+	LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
 
 	/*
-	 * Service a pending cross-backend reset request.  A single relaxed atomic
-	 * load; when the shared generation has advanced past the value we last
-	 * acted on, clear our own counters on behalf of the requester and record
-	 * the reset.  wait_start is left untouched so the in-flight measurement
-	 * still lands (in the freshly-zeroed counters), and current_event is
-	 * zeroed so external readers do not see stale state.
+	 * Drop the local pointer inside the lock-held region as a second line of
+	 * defense; the writes-disabled flag above is the primary gate.
 	 */
-	cur_reset_gen = pg_atomic_read_u32(&my_wait_event_timing->reset_generation);
-	if (cur_reset_gen != my_last_reset_generation)
+	my_wait_event_trace = NULL;
+
+	if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_OWNED &&
+		DsaPointerIsValid(slot->ring_ptr))
 	{
-		memset(my_wait_event_timing->events, 0,
-			   sizeof(my_wait_event_timing->events));
-		lwlock_timing_hash_clear(my_wait_event_timing);
-		my_wait_event_timing->reset_count++;
-		my_wait_event_timing->lwlock_overflow_count = 0;
-		my_wait_event_timing->flat_overflow_count = 0;
-		my_wait_event_timing->current_event = 0;
-		my_last_reset_generation = cur_reset_gen;
+		/*
+		 * Bump generation first so any reader that snapped the old generation
+		 * will detect the change on its post-read recheck and discard its
+		 * read.  Then publish the ORPHANED state. Keep ring_ptr valid -- the
+		 * data is what we want to preserve.
+		 */
+		pg_atomic_fetch_add_u64(&slot->generation, 1);
+		pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_ORPHANED);
 	}
 
-	if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start))
-	{
-		instr_time	now;
-		int64		duration_ns;
-		int			idx;
-		WaitEventTimingEntry *entry = NULL;
-		bool		warn_lwlock_overflow = false;
-		bool		warn_flat_overflow = false;
+	LWLockRelease(&WaitEventTraceCtl->lock);
+}
 
-		INSTR_TIME_SET_CURRENT(now);
-		duration_ns = INSTR_TIME_GET_NANOSEC(now) -
-			INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start);
+/*
+ * Allocate (or re-acquire) a trace ring buffer for this backend via DSA.
+ * Called when wait_event_capture is set to 'trace'.
+ *
+ * Slot state at entry will be one of:
+ *
+ *   FREE     fresh slot (or one cleared on this backend's init by
+ *            wait_event_trace_clear_orphan_at_init): allocate a new
+ *            ring, transition slot to OWNED, bump generation.
+ *
+ *   OWNED    we already attached earlier in this same backend's life
+ *            (e.g. user toggled capture trace->stats->trace; the
+ *            stats step calls wait_event_trace_release_slot which
+ *            transitions back to FREE, but our cached
+ *            my_wait_event_trace was cleared on the way down -- so
+ *            seeing OWNED here at attach time means a different
+ *            backend somehow ended up with this procNumber, which
+ *            cannot happen because procNumber is per-backend and a
+ *            single backend can only run one attach at a time.  We
+ *            still tolerate this state defensively by re-mapping the
+ *            existing ring rather than leaking a second allocation.
+ *
+ *   ORPHANED can never be observed here: a new backend's
+ *            pgstat_set_wait_event_timing_storage() called
+ *            wait_event_trace_clear_orphan_at_init() before any
+ *            wait-event capture path can run, so any prior orphan has
+ *            already been demoted to FREE.  Treated as a safety check
+ *            (Assert in debug builds).
+ */
+void
+wait_event_trace_attach(int procNumber)
+{
+	/*
+	 * Re-entrancy guard.  dsa_create / dsa_allocate_extended below can emit
+	 * wait events internally; those reach the lazy-attach hot path which
+	 * calls back into this function while we still hold
+	 * WaitEventTraceCtl->lock or are mid-allocation.  See the
+	 * function-local-static-bool pattern explainer on
+	 * wait_event_timing_attach_array.
+	 */
+	static bool in_attach = false;
+	static bool shmem_exit_registered = false;
+	WaitEventTraceSlot *slot;
+	dsa_pointer p;
+	WaitEventTraceState *ts;
+	uint32		state_now;
 
-		if (duration_ns < 0)
-			duration_ns = 0;
+	if (in_attach)
+		return;
 
-		idx = wait_event_timing_index(event);
+	if (WaitEventTraceCtl == NULL)
+		return;
 
-		/*
-		 * Single-writer hot path: each slot has exactly one writer (the
-		 * owning backend), and the SRF reader is lock-free, so no locking is
-		 * needed here.  Overflow WARNINGs are deferred to after the
-		 * bookkeeping so ereport() cannot recurse through a wait event while
-		 * counters are in an intermediate state.
-		 */
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	/*
+	 * Skip the attach if we are inside a critical section.  Below this point
+	 * we call dsa_create / dsa_attach / dsa_allocate_extended, all of which
+	 * can allocate memory via MemoryContextAlloc and Assert-fail on
+	 * "CritSectionCount == 0 || allowInCritSection". The very-first wait
+	 * event after wait_event_capture = trace can land inside a critical
+	 * section (e.g. a parallel worker scanning a heap page hits
+	 * BufferSetHintBits16 -> XLogSaveBufferForHint -> XLogInsert ->
+	 * LWLockAcquire, with the XLogInsert critical section open).
+	 *
+	 * Skipping here silently drops the in-flight wait event (it is not
+	 * traced) but keeps the backend alive.  The next wait event outside any
+	 * critical section will hit this function again and attach successfully.
+	 * See the matching guard in pgstat_wait_event_timing_lazy_attach.
+	 */
+	if (CritSectionCount > 0)
+		return;
+
+	/*
+	 * Skip the attach if MyProc is already on an LWLock wait queue.  We are
+	 * called from the wait-event hot path which fires AFTER LWLockQueueSelf
+	 * has set MyProc->lwWaiting; a nested LWLockAcquire on our internal lock
+	 * (via wait_event_trace_ensure_dsa) would PANIC at lwlock.c:1029. See the
+	 * matching guard in pgstat_wait_event_timing_lazy_ attach for the full
+	 * rationale.
+	 */
+	if (MyProc != NULL && MyProc->lwWaiting != LW_WS_NOT_WAITING)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	in_attach = true;
+	PG_TRY();
+	{
+		state_now = pg_atomic_read_u32(&slot->state);
+
+		/*
+		 * ORPHANED is normally impossible at attach time --
+		 * pgstat_set_wait_event_timing_storage() at backend init calls
+		 * wait_event_trace_clear_orphan_at_init() which demotes any inherited
+		 * orphan to FREE.  But there is one case where this backend can
+		 * legitimately observe its own slot in the ORPHANED state: after we
+		 * have already run wait_event_trace_before_shmem_exit()
+		 * (transitioning the slot to ORPHANED on exit), a later
+		 * before_shmem_exit callback (e.g. pgstat_io_flush_cb during
+		 * proc_exit shutdown) can contend on an LWLock that emits a wait
+		 * event, which calls pgstat_report_wait_end_timing() ->
+		 * wait_event_trace_attach() after my_wait_event_trace has been
+		 * cleared.  We must not re-attach in that case: we are dying, the
+		 * ring is now post-mortem data for cross-backend readers, and the
+		 * writer invariant must hold.  Skip the trace for any wait events
+		 * emitted after our own exit transition.
+		 */
+		if (state_now == WAIT_EVENT_TRACE_SLOT_ORPHANED)
+		{
+			/* PG_FINALLY below clears in_attach. */
+		}
+		else if (state_now == WAIT_EVENT_TRACE_SLOT_OWNED &&
+				 DsaPointerIsValid(slot->ring_ptr))
+		{
+			/* Already have a ring buffer; re-map to it. */
+			wait_event_trace_ensure_dsa();
+			my_wait_event_trace = dsa_get_address(trace_dsa, slot->ring_ptr);
+			my_trace_proc_number = procNumber;
+		}
+		else
+		{
+			Size		alloc_size;
+
+			wait_event_trace_ensure_dsa();
+
+			/*
+			 * Cache the cluster-wide ring size on first allocation in this
+			 * backend.  wait_event_trace_ring_size is PGC_POSTMASTER, so by
+			 * the time any backend reaches here, its boot value has been
+			 * committed by the GUC framework.  All rings in the postmaster
+			 * run share the same dimensions.
+			 */
+			if (WaitEventTraceRingSize == 0)
+				WaitEventTraceRingSize =
+					(uint32) wait_event_trace_ring_size * 1024U /
+					(uint32) sizeof(WaitEventTraceRecord);
+
+			alloc_size = offsetof(WaitEventTraceState, records) +
+				(Size) WaitEventTraceRingSize * sizeof(WaitEventTraceRecord);
+
+			p = dsa_allocate_extended(trace_dsa, alloc_size, DSA_ALLOC_ZERO);
+			ts = dsa_get_address(trace_dsa, p);
+			pg_atomic_init_u64(&ts->write_pos, 0);
+			ts->ring_mask = WaitEventTraceRingSize - 1;
+
+			LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+			/*
+			 * Publish ring_ptr BEFORE transitioning state to OWNED.
+			 * Cross-backend readers that observe state==OWNED outside the
+			 * lock then see a valid ring_ptr.  Bump generation last so any
+			 * reader that snapped the prior generation will detect the
+			 * change.
+			 */
+			slot->ring_ptr = p;
+			pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_OWNED);
+			pg_atomic_fetch_add_u64(&slot->generation, 1);
+			LWLockRelease(&WaitEventTraceCtl->lock);
+
+			my_wait_event_trace = ts;
+			my_trace_proc_number = procNumber;
+
+			/*
+			 * Register cleanup to run BEFORE dsm_backend_shutdown() detaches
+			 * the DSA.  The before_shmem_exit callbacks run in LIFO order
+			 * before DSM detach, so the ORPHANED transition (which does not
+			 * actually free the ring) is safe at that point.
+			 *
+			 * Guarded by shmem_exit_registered because under the
+			 * release-on-disable policy (see wait_event_trace_release_slot
+			 * and assign_wait_event_capture) the allocate branch can run
+			 * multiple times per backend lifetime -- once per off/stats ->
+			 * trace re-enable cycle.  The cleanup itself is idempotent (it
+			 * short-circuits when state is not OWNED), so it is safe to
+			 * invoke after a release-then-reattach cycle, but we still avoid
+			 * growing the before_shmem_exit list.
+			 */
+			if (!shmem_exit_registered)
+			{
+				before_shmem_exit(wait_event_trace_before_shmem_exit,
+								  Int32GetDatum(procNumber));
+				shmem_exit_registered = true;
+			}
+		}
+	}
+	PG_FINALLY();
+	{
+		in_attach = false;
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Free trace ring buffer for this backend on exit.
+ */
+static void
+wait_event_trace_detach(int procNumber)
+{
+	/*
+	 * Only clear local pointers here.  The actual DSA free happens in
+	 * wait_event_trace_before_shmem_exit(), which runs before
+	 * dsm_backend_shutdown() detaches the DSA segments.
+	 */
+	my_wait_event_trace = NULL;
+	my_trace_proc_number = -1;
+}
+
+/*
+ * Release this backend's trace ring buffer back to DSA immediately.
+ *
+ * Called from assign_wait_event_capture when the user steps down from
+ * TRACE to STATS or OFF.  Without this, a ~4 MB ring allocated by a
+ * brief investigation would remain pinned for the rest of the session's
+ * lifetime, which can leak gigabytes across large connection pools.
+ *
+ * Important contrast with wait_event_trace_before_shmem_exit: backend
+ * exit transitions the slot to ORPHANED (preserving data for
+ * cross-backend readers); release_slot fully frees and returns to FREE
+ * because the operator has explicitly disabled trace -- they have
+ * affirmatively decided not to keep the data, so we honour that and
+ * reclaim the memory immediately.  Subsequent re-enable allocates a
+ * fresh ring via wait_event_trace_attach's allocate branch.
+ *
+ * The operation is LWLock-safe and does not raise -- dsa_free is pure
+ * bookkeeping on the DSA freelist, no allocation and no ereport paths.
+ * Safe to call from a GUC assign hook.
+ *
+ * If pg_get_backend_wait_event_trace is currently iterating our own ring
+ * (wait_event_trace_srf_in_progress), we must NOT free the chunk out
+ * from under it: that would be a use-after-free on the records[] the SRF
+ * is still reading.  Set wait_event_trace_release_pending instead and
+ * return; the SRF's PG_FINALLY block will perform the deferred free
+ * after iteration completes.  In practice this branch is unreachable in
+ * current PG (assign hooks fire only at command boundaries and the SRF
+ * is a single command), but it makes the invariant explicit and the
+ * future-proofing free.
+ */
+static void
+wait_event_trace_release_slot(int procNumber)
+{
+	/*
+	 * Re-entrancy guard.  dsa_free takes a DSA-internal LWLock which can in
+	 * principle emit a wait event; if a nested assign hook re-enters we must
+	 * not recurse.  See the function-local-static-bool pattern explainer on
+	 * wait_event_timing_attach_array.
+	 */
+	static bool in_release = false;
+	WaitEventTraceSlot *slot;
+
+	if (in_release)
+		return;
+
+	if (WaitEventTraceCtl == NULL || trace_dsa == NULL)
+		return;
+
+	/*
+	 * Same-backend SRF is iterating our own ring.  Defer the free until the
+	 * SRF's PG_FINALLY runs.
+	 */
+	if (wait_event_trace_srf_in_progress)
+	{
+		wait_event_trace_release_pending = true;
+		return;
+	}
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	in_release = true;
+
+	/*
+	 * Disable trace-ring writes on this backend before we touch the lock or
+	 * call dsa_free.  An internal LWLock inside dsa_free can dispatch a wait
+	 * event whose end-timing path would otherwise see capture_level == TRACE
+	 * (the GUC assign hook is in flight; the variable has not been committed
+	 * by the framework yet) and write into the very chunk we are returning to
+	 * the DSA freelist.  See the comment on wait_event_trace_writes_disabled
+	 * for the full UAF analysis.
+	 */
+	wait_event_trace_writes_disabled = true;
+
+	PG_TRY();
+	{
+		LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+		/*
+		 * Drop the local pointer BEFORE the dsa_free as a second line of
+		 * defense (the writes-disabled flag above is the primary gate).  Any
+		 * wait event whose hot path slips past the gate check via a compiler
+		 * or memory-ordering surprise would at least see my_wait_event_trace
+		 * == NULL and skip the write.
+		 */
+		my_wait_event_trace = NULL;
+
+		if (DsaPointerIsValid(slot->ring_ptr))
+		{
+			/*
+			 * Bump generation first to invalidate any concurrent
+			 * cross-backend snapshot, then free, then publish the FREE state
+			 * with a NULL ring_ptr.  Order matters for unlocked readers that
+			 * have already passed the state check.
+			 */
+			pg_atomic_fetch_add_u64(&slot->generation, 1);
+			dsa_free(trace_dsa, slot->ring_ptr);
+			slot->ring_ptr = InvalidDsaPointer;
+			pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+		}
+		LWLockRelease(&WaitEventTraceCtl->lock);
+	}
+	PG_FINALLY();
+	{
+		wait_event_trace_writes_disabled = false;
+		in_release = false;
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Clear an orphaned trace ring at backend init time.
+ *
+ * Called from pgstat_set_wait_event_timing_storage() once the new
+ * backend has its procNumber.  If the slot we're inheriting was left
+ * ORPHANED by a previous backend (because we deliberately do not free
+ * trace rings on backend exit -- see the lifecycle discussion on
+ * WaitEventTraceControl), free the ring now so the new backend starts
+ * with a clean FREE slot.  Subsequent wait_event_trace_attach() calls
+ * (when this backend itself enables trace) will then take the
+ * allocate branch.
+ *
+ * No-op when the slot is already FREE or OWNED: FREE means there's
+ * nothing to clear; OWNED is impossible at backend init (only a
+ * not-yet-exited backend can leave a slot OWNED, and procNumbers are
+ * assigned exclusively).  We assert OWNED is not observed in debug
+ * builds and conservatively skip the free in production.
+ *
+ * Robustness: this runs during InitProcess() (before the backend can
+ * accept any work), and the work it performs -- dsa_attach() and
+ * dsa_free() -- can raise ERROR on rare runtime failures (corrupted
+ * DSA segment headers, descriptor exhaustion, mmap ENOMEM, etc.).
+ * An uncaught ERROR here would propagate out of InitProcess() and
+ * abort backend startup entirely, even for sessions that never
+ * intended to use wait_event_capture.  To prevent the trace
+ * feature's housekeeping from gating connection establishment, the
+ * body is wrapped in PG_TRY()/PG_CATCH(): any error from dsa_attach
+ * or dsa_free is captured, downgraded to a WARNING with a hint
+ * pointing at the admin sweep function, and execution continues.
+ * The orphan stays in place; it can be reclaimed by the next
+ * backend that inherits the same procNumber (if the underlying
+ * problem was transient), by pg_stat_clear_orphaned_wait_event_rings(),
+ * or at next cluster restart.
+ */
+static void
+wait_event_trace_clear_orphan_at_init(int procNumber)
+{
+	WaitEventTraceSlot *slot;
+	uint32		state_now;
+	MemoryContext caller_cxt;
+
+	if (WaitEventTraceCtl == NULL)
+		return;
+
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	state_now = pg_atomic_read_u32(&slot->state);
+	if (state_now != WAIT_EVENT_TRACE_SLOT_ORPHANED)
+	{
+		Assert(state_now != WAIT_EVENT_TRACE_SLOT_OWNED);
+		return;
+	}
+
+	/*
+	 * Save CurrentMemoryContext so the PG_CATCH path can copy the error data
+	 * into a context that survives FlushErrorState(). FlushErrorState() calls
+	 * MemoryContextReset(ErrorContext), so CopyErrorData() must run in a
+	 * different context or the returned ErrorData becomes a dangling pointer.
+	 */
+	caller_cxt = CurrentMemoryContext;
+
+	PG_TRY();
+	{
+		/*
+		 * The trace DSA is shared across the cluster.  We must attach to it
+		 * before calling dsa_free (which needs the dsa_area pointer).  The
+		 * DSA was created by some earlier backend that wrote a trace record
+		 * (otherwise the slot couldn't have ended up ORPHANED), so the handle
+		 * in WaitEventTraceCtl is valid; ensure_dsa() will attach.  Both
+		 * ensure_dsa() and dsa_free() can raise ERROR; the PG_CATCH below
+		 * downgrades any such error to a WARNING so backend startup is not
+		 * blocked.
+		 */
+		wait_event_trace_ensure_dsa();
+
+		LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+		if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED &&
+			DsaPointerIsValid(slot->ring_ptr))
+		{
+			pg_atomic_fetch_add_u64(&slot->generation, 1);
+			dsa_free(trace_dsa, slot->ring_ptr);
+			slot->ring_ptr = InvalidDsaPointer;
+			pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+		}
+		LWLockRelease(&WaitEventTraceCtl->lock);
+	}
+	PG_CATCH();
+	{
+		ErrorData  *edata;
+
+		/*
+		 * Release any LWLocks we (or anything we called) might still hold.
+		 * Two paths can leave WaitEventTraceCtl->lock held when control
+		 * reaches here:
+		 *
+		 * 1. The outer LWLockAcquire above succeeded and dsa_free raised
+		 * before we reached LWLockRelease. 2. wait_event_trace_ensure_dsa()
+		 * raised inside its own LWLockAcquire/dsa_attach/LWLockRelease
+		 * region.
+		 *
+		 * We are running during InitProcess(), BEFORE any transaction or
+		 * PostgresMain sigsetjmp has been set up, so PG's standard
+		 * "AbortTransaction -> LWLockReleaseAll" cleanup does NOT fire on the
+		 * longjmp into PG_CATCH. Without an explicit release here the lock
+		 * would stay held for the lifetime of this backend, blocking every
+		 * future LW_EXCLUSIVE acquirer (the orphan-clear sweep, release_slot,
+		 * before_shmem_exit transitions, and subsequent backends'
+		 * clear_orphan_at_init).  That would be strictly worse than the
+		 * original failure-startup behavior this commit set out to fix.
+		 *
+		 * LWLockReleaseAll() is the idiomatic catch-path lock cleanup used by
+		 * the standalone aux-process error handlers (walwriter.c,
+		 * checkpointer.c, pgarch.c).  It is safe to call broadly here because
+		 * pgstat_set_wait_ event_timing_storage runs at a fixed point in
+		 * InitProcess where the caller frame holds no other LWLocks across
+		 * our return: the earlier InitProcess steps that touch LWLocks
+		 * (ProcArrayAdd, etc.) release them before returning, and the
+		 * subsequent steps that acquire LWLocks have not yet run.
+		 */
+		LWLockReleaseAll();
+
+		/*
+		 * Switch BACK to the caller's context before CopyErrorData so that
+		 * edata is allocated in a context that survives FlushErrorState().
+		 * FlushErrorState() calls MemoryContextReset(ErrorContext);
+		 * allocating edata in ErrorContext (the default at PG_CATCH entry on
+		 * the error path) would make it a dangling pointer the moment we
+		 * flush.  See the matching pattern in spi.c PG_CATCH branches.
+		 */
+		MemoryContextSwitchTo(caller_cxt);
+		edata = CopyErrorData();
+		FlushErrorState();
+
+		ereport(WARNING,
+				(errcode(edata->sqlerrcode),
+				 errmsg("could not clear orphaned wait-event trace ring "
+						"at backend init: %s", edata->message),
+				 errdetail("Backend startup proceeds with the orphan "
+						   "still allocated for procnumber %d.",
+						   procNumber),
+				 errhint("Run pg_stat_clear_orphaned_wait_event_rings() "
+						 "to release the orphan when the underlying "
+						 "condition is resolved.")));
+
+		FreeErrorData(edata);
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Out-of-line body for pgstat_report_wait_start()'s timing path.  Records
+ * the start timestamp and the event being waited on.  Reached only when
+ * wait_event_capture != OFF.
+ */
+void
+pgstat_report_wait_start_timing(uint32 wait_event_info)
+{
+	/*
+	 * Stay out of the timing path once proc_exit has begun tearing down DSA
+	 * mappings (see the before_shmem_exit callback).
+	 */
+	if (wait_event_timing_writes_disabled)
+		return;
+
+	if (my_wait_event_timing == NULL)
+	{
+		pgstat_wait_event_timing_lazy_attach();
+
+		/*
+		 * lazy_attach can dispatch nested wait events while it sets up DSA
+		 * (dsa_attach takes an internal LWLock); those nested wait_end calls
+		 * clear my_wait_event_info to 0.  Re-publish so the outer wait stays
+		 * visible in pg_stat_activity.  Only needed on the first-attach path.
+		 */
+		*(volatile uint32 *) my_wait_event_info = wait_event_info;
+	}
+
+	if (my_wait_event_timing != NULL)
+	{
+		INSTR_TIME_SET_CURRENT(my_wait_event_timing->wait_start);
+		my_wait_event_timing->current_event = wait_event_info;
+	}
+}
+
+/*
+ * Out-of-line body for pgstat_report_wait_end()'s timing path.  Computes
+ * the wait duration, accumulates per-event statistics, and at the trace
+ * level pushes the completed wait into the per-session ring.
+ *
+ * capture_level is the value of wait_event_capture observed at the inline
+ * gate.  Passing it through (rather than re-loading the global here) avoids
+ * a redundant load on the trace branch below -- the function-call boundary
+ * defeats CSE -- and means a concurrent GUC change cannot half-apply to
+ * this call: we run entirely at the gate's view of the level.
+ */
+void
+pgstat_report_wait_end_timing(int capture_level)
+{
+	uint32		event;
+	uint32		cur_reset_gen;
+
+	if (wait_event_timing_writes_disabled)
+		return;
+
+	if (my_wait_event_timing == NULL)
+	{
+		pgstat_wait_event_timing_lazy_attach();
+		if (my_wait_event_timing == NULL)
+			return;
+	}
+
+	event = my_wait_event_timing->current_event;
+
+	/*
+	 * Service a pending cross-backend reset request.  A single relaxed atomic
+	 * load; when the shared generation has advanced past the value we last
+	 * acted on, clear our own counters on behalf of the requester and record
+	 * the reset.  wait_start is left untouched so the in-flight measurement
+	 * still lands (in the freshly-zeroed counters), and current_event is
+	 * zeroed so external readers do not see stale state.
+	 */
+	cur_reset_gen = pg_atomic_read_u32(&my_wait_event_timing->reset_generation);
+	if (cur_reset_gen != my_last_reset_generation)
+	{
+		memset(my_wait_event_timing->events, 0,
+			   sizeof(my_wait_event_timing->events));
+		lwlock_timing_hash_clear(my_wait_event_timing);
+		my_wait_event_timing->reset_count++;
+		my_wait_event_timing->lwlock_overflow_count = 0;
+		my_wait_event_timing->flat_overflow_count = 0;
+		my_wait_event_timing->current_event = 0;
+		my_last_reset_generation = cur_reset_gen;
+	}
+
+	if (event != 0 && !INSTR_TIME_IS_ZERO(my_wait_event_timing->wait_start))
+	{
+		instr_time	now;
+		int64		duration_ns;
+		int			idx;
+		WaitEventTimingEntry *entry = NULL;
+		bool		warn_lwlock_overflow = false;
+		bool		warn_flat_overflow = false;
+
+		INSTR_TIME_SET_CURRENT(now);
+		duration_ns = INSTR_TIME_GET_NANOSEC(now) -
+			INSTR_TIME_GET_NANOSEC(my_wait_event_timing->wait_start);
+
+		if (duration_ns < 0)
+			duration_ns = 0;
+
+		idx = wait_event_timing_index(event);
+
+		/*
+		 * Single-writer hot path: each slot has exactly one writer (the
+		 * owning backend), and the SRF reader is lock-free, so no locking is
+		 * needed here.  Overflow WARNINGs are deferred to after the
+		 * bookkeeping so ereport() cannot recurse through a wait event while
+		 * counters are in an intermediate state.
+		 */
 		if (idx == WAIT_EVENT_TIMING_IDX_LWLOCK)
 			entry = lwlock_timing_lookup(my_wait_event_timing, event & 0xFFFF);
 		else if (idx >= 0)
@@ -934,6 +1927,81 @@ pgstat_report_wait_end_timing(int capture_level)
 					(errmsg("some wait events will not be timed"),
 					 errdetail("A wait event was reported with a class unknown to the timing infrastructure.")));
 
+		/* Trace level: push the completed wait into the per-session ring. */
+		if (capture_level == WAIT_EVENT_CAPTURE_TRACE &&
+			!wait_event_trace_writes_disabled &&
+			!wait_event_trace_in_write)
+		{
+			/*
+			 * Lazy-attach the ring on first use here (not in the assign hook,
+			 * which must not ereport on OOM).  The writes-disabled gate also
+			 * blocks this re-attach during slot-state transitions
+			 * (release_slot / before_shmem_exit): without it a nested wait
+			 * event mid-transition could recurse into a fresh attach that
+			 * deadlocks on the lock the transition already holds.
+			 */
+			if (my_wait_event_trace == NULL && my_trace_proc_number >= 0)
+				wait_event_trace_attach(my_trace_proc_number);
+
+			if (my_wait_event_trace != NULL)
+			{
+				/*
+				 * Single-writer claim: a plain read+write avoids the LOCK
+				 * XADD that pg_atomic_fetch_add_u64 would emit on every wait
+				 * event.  Cross-backend readers use pg_atomic_read_u64 and
+				 * rely on the per-record seqlock below for safety.
+				 */
+				uint64		pos = pg_atomic_read_u64(&my_wait_event_trace->write_pos);
+				WaitEventTraceRecord *rec;
+				uint32		seq;
+
+				/*
+				 * Wait events emitted while a trace record is being written
+				 * must not themselves be ring-recorded (the gate above): the
+				 * record write is plain stores and cannot wait, but the
+				 * injection point below can, and the wait machinery it runs
+				 * (DSM registry lookup, condition-variable sleep) emits wait
+				 * events whose wait_end would recurse into this block --
+				 * re-running the injection point and self-deadlocking on
+				 * locks the outer invocation still holds.  Costs one
+				 * process-local store each way; nested waits still
+				 * accumulate in the stats above.
+				 */
+				wait_event_trace_in_write = true;
+
+				pg_atomic_write_u64(&my_wait_event_trace->write_pos, pos + 1);
+
+				/*
+				 * Injection point for the regression test of the
+				 * position-encoded identity seqlock: stalling here widens the
+				 * window between the write_pos store and the rec->seq store,
+				 * simulating weak-memory visibility that would otherwise be
+				 * unreachable on x86.  Compiled out unless
+				 * --enable-injection-points.
+				 */
+				INJECTION_POINT("wait-event-trace-after-write-pos", NULL);
+
+				rec = &my_wait_event_trace->records[pos & my_wait_event_trace->ring_mask];
+				seq = (uint32) (pos * 2 + 1);
+
+				rec->seq = seq;
+				pg_write_barrier(); /* payload stores must not rise above
+									 * seq=odd */
+
+				rec->record_type = TRACE_WAIT_EVENT;
+				rec->timestamp_ns = INSTR_TIME_GET_NANOSEC(now);
+				rec->data.wait.event = event;
+				rec->data.wait.pad2 = 0;
+				rec->data.wait.duration_ns = duration_ns;
+
+				pg_write_barrier(); /* payload stores must land before
+									 * seq=even */
+				rec->seq = seq + 1;
+
+				wait_event_trace_in_write = false;
+			}
+		}
+
 		INSTR_TIME_SET_ZERO(my_wait_event_timing->wait_start);
 	}
 }
@@ -1288,4 +2356,738 @@ pg_stat_reset_wait_event_timing_all(PG_FUNCTION_ARGS)
 	PG_RETURN_VOID();
 }
 
+/* ================= Trace-level readers ================= */
+
+/*
+ * SQL function: pg_get_backend_wait_event_trace()
+ *
+ * Returns trace records from the current backend's own ring buffer.  This
+ * function is deliberately session-local; cross-backend reading goes
+ * through pg_get_wait_event_trace(procnumber) below, which implements the
+ * snapshot-under-lock protocol documented on WaitEventTraceControl in
+ * wait_event_timing.h.  The name mirrors pg_get_backend_memory_contexts()
+ * to make the session-local scope explicit at the API level.
+ *
+ * Same-backend coordination with wait_event_trace_release_slot uses the
+ * wait_event_trace_srf_in_progress / _release_pending flags rather than
+ * an LWLock: same-backend serialization is implicit, so a per-backend
+ * bool plus a deferred-free path is sufficient and avoids any of the
+ * cross-backend lock-hold latency that the cross-backend reader pattern
+ * has to manage.  PG_TRY/PG_FINALLY guarantees the flag is cleared and
+ * any deferred dsa_free is performed even on ereport(ERROR).
+ *
+ * Uses InitMaterializedSRF (materialize-all).  The ring holds up to
+ * WaitEventTraceRingSize records (set at server start from the
+ * wait_event_trace_ring_size GUC; default 4 MB = 131072 records);
+ * full materialization caps the per-call cost at the ring size of
+ * tuplestore memory, which is acceptable for the use case this SRF
+ * is designed for: interactive own-session diagnostics from psql.
+ *
+ * This SRF is NOT the path for cross-backend monitoring tools: it is
+ * hard-coded to the calling backend's own ring, so a bgworker selecting
+ * from pg_backend_wait_event_trace via SPI would see only its own
+ * (typically empty) ring.  Cross-backend consumers use
+ * pg_get_wait_event_trace(procnumber) for SQL access, or implement the
+ * snapshot-under-lock protocol on WaitEventTraceControl directly.
+ *
+ * value-per-call (deferred) SRF mode would let an interactive
+ * "SELECT ... FROM pg_backend_wait_event_trace LIMIT N" short-circuit
+ * the materialisation, but converting this function would require
+ * spanning the wait_event_trace_srf_in_progress flag (and its
+ * deferred-free coordination with assign_wait_event_capture) across
+ * multiple SRF callbacks plus a transaction-cleanup registration to
+ * handle LIMIT abandonment.  The complexity is not
+ * justified for the diagnostic use case, especially since cross-
+ * backend monitoring (the consumer that would actually benefit from
+ * streaming) goes through the snapshot pattern above instead.
+ * Interactive callers who want only recent records should use
+ * "ORDER BY seq DESC LIMIT N" -- the LIMIT is applied after
+ * materialisation but the cost stays bounded by the ring size.
+ */
+Datum
+pg_get_backend_wait_event_trace(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	WaitEventTraceState *ts;
+	uint64		write_pos;
+	uint64		read_start;
+	uint64		i;
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	if (my_wait_event_trace == NULL)
+		PG_RETURN_VOID();
+
+	ts = my_wait_event_trace;
+
+	write_pos = pg_atomic_read_u64(&ts->write_pos);
+
+	if (write_pos == 0)
+		PG_RETURN_VOID();
+
+	/* Read from oldest available to newest */
+	{
+		uint64		ring_size = (uint64) ts->ring_mask + 1;
+
+		read_start = (write_pos > ring_size)
+			? write_pos - ring_size : 0;
+	}
+
+	/*
+	 * Mark the iteration in progress so wait_event_trace_release_slot defers
+	 * any concurrent dsa_free of our own ring (see the comment on that
+	 * function for the deferral protocol).  PG_FINALLY clears the flag and
+	 * performs any deferred free, even on ereport(ERROR).
+	 */
+	wait_event_trace_srf_in_progress = true;
+	PG_TRY();
+	{
+		for (i = read_start; i < write_pos; i++)
+		{
+			WaitEventTraceRecord *rec =
+				&ts->records[i & ts->ring_mask];
+			Datum		values[6];
+			bool		nulls[6];
+			const char *event_type;
+			const char *event_name;
+			uint32		seq_before;
+			uint32		seq_after;
+			uint8		rtype;
+			int64		timestamp_ns;
+			uint32		event_info;
+			int64		duration_ns;
+			int64		query_id;
+
+			/* Seqlock read */
+			seq_before = rec->seq;
+			pg_read_barrier();	/* acquire: payload loads below must not rise
+								 * above this */
+
+			if (seq_before & 1)
+				continue;
+
+			rtype = rec->record_type;
+			timestamp_ns = rec->timestamp_ns;
+
+			if (rtype == TRACE_WAIT_EVENT)
+			{
+				event_info = rec->data.wait.event;
+				duration_ns = rec->data.wait.duration_ns;
+				query_id = 0;
+			}
+			else if (rtype == TRACE_QUERY_START || rtype == TRACE_QUERY_END ||
+					 rtype == TRACE_EXEC_START || rtype == TRACE_EXEC_END)
+			{
+				event_info = 0;
+				duration_ns = 0;
+				query_id = rec->data.query.query_id;
+			}
+			else
+			{
+				pg_read_barrier();	/* acquire: pair with seq_before read
+									 * above before skipping */
+				continue;
+			}
+
+			pg_read_barrier();	/* acquire: payload loads must have landed
+								 * before seq_after */
+			seq_after = rec->seq;
+
+			if (seq_before != seq_after)
+				continue;
+
+			/* Skip empty wait events */
+			if (rtype == TRACE_WAIT_EVENT && event_info == 0)
+				continue;
+
+			if (rtype == TRACE_WAIT_EVENT)
+			{
+				event_type = pgstat_get_wait_event_type(event_info);
+				event_name = pgstat_get_wait_event(event_info);
+			}
+			else if (rtype == TRACE_QUERY_START)
+			{
+				event_type = "Query";
+				event_name = "QueryStart";
+			}
+			else if (rtype == TRACE_EXEC_START)
+			{
+				event_type = "Query";
+				event_name = "ExecStart";
+			}
+			else if (rtype == TRACE_EXEC_END)
+			{
+				event_type = "Query";
+				event_name = "ExecEnd";
+			}
+			else
+			{
+				event_type = "Query";
+				event_name = "QueryEnd";
+			}
+
+			if (event_type == NULL || event_name == NULL)
+				continue;
+
+			memset(nulls, 0, sizeof(nulls));
+
+			values[0] = Int64GetDatum((int64) i);
+			values[1] = Int64GetDatum(timestamp_ns);
+			values[2] = CStringGetTextDatum(event_type);
+			values[3] = CStringGetTextDatum(event_name);
+			values[4] = Float8GetDatum((double) duration_ns / 1000.0);
+			values[5] = Int64GetDatum(query_id);
+
+			tuplestore_putvalues(rsinfo->setResult,
+								 rsinfo->setDesc,
+								 values, nulls);
+		}
+	}
+	PG_FINALLY();
+	{
+		wait_event_trace_srf_in_progress = false;
+
+		/*
+		 * If a GUC step-down fired during iteration, it deferred the
+		 * dsa_free.  Process it now that we're safely past the loop. Re-check
+		 * release_pending under the same flag to handle the
+		 * (impossible-today, possible-tomorrow) case of a nested SRF.
+		 */
+		if (wait_event_trace_release_pending)
+		{
+			wait_event_trace_release_pending = false;
+			if (my_trace_proc_number >= 0)
+				wait_event_trace_release_slot(my_trace_proc_number);
+		}
+	}
+	PG_END_TRY();
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * One element of the local result buffer.  Pairs a per-record copy
+ * with the original ring index (used as the seq output column).
+ */
+typedef struct WetValidRecord
+{
+	uint64		ring_index;		/* original index in the writer's ring */
+	WaitEventTraceRecord rec;
+} WetValidRecord;
+
+/*
+ * Snapshot the trace ring for a given procNumber and emit records into
+ * the SRF's tuplestore.  Returns silently for FREE slots, out-of-range
+ * procnumbers, slots whose ring was never allocated, and slots whose
+ * write_pos is zero.
+ *
+ * Cross-backend reader protocol implemented here:
+ *
+ *   1. Read slot->state without the lock as a cheap "worth visiting"
+ *      check; FREE -> nothing to emit.
+ *   2. Allocate the worst-case result buffer BEFORE taking the lock,
+ *      so the palloc -- which can bottom out in a glibc mmap syscall
+ *      for the worst-case (full-ring) size -- runs without holding the
+ *      WaitEventTraceCtl lock.
+ *   3. Acquire WaitEventTraceCtl->lock in LW_SHARED.  All slot
+ *      transitions take LW_EXCLUSIVE, so the slot's identity, state,
+ *      and ring_ptr are stable for the duration of the iteration.
+ *   4. Re-check state under the lock and resolve ring_ptr via
+ *      dsa_get_address.  Read write_pos.
+ *   5. Iterate every live ring index [read_start, write_pos).  For
+ *      each record do the per-record POSITION-ENCODED IDENTITY
+ *      seqlock check ON SHARED MEMORY (see the comment on the loop
+ *      below).
+ *   6. Release the lock.
+ *   7. Walk the local result array and emit rows into the tuplestore.
+ *      This is the expensive part (potential disk spill); doing it
+ *      after release minimises lock-hold time.
+ *
+ * Why per-record seqlock against shared memory, not against a local
+ * memcpy of the full ring: the protocol requires the two seq reads
+ * to go to the SAME shared-memory location at DIFFERENT TIMES, with
+ * the payload read between them.  A bulk memcpy then seqlock-on-
+ * local-copy reads the same frozen byte twice, the check degenerates
+ * to a no-op, and torn / stale-cycle reads slip through.
+ *
+ * Why position-encoded identity, not just parity: the writer encodes
+ * the ring position into the seq value (mid-write = pos*2+1, complete
+ * = pos*2+2).  After RING_SIZE writes the slot wraps and is rewritten
+ * with a new numerically-distinct seq.  A parity-only check accepts
+ * any stable even seq -- including the PREVIOUS cycle's seq if cross-
+ * process visibility puts the new write_pos ahead of the new seq
+ * update.  See the loop body for the four failure modes the identity
+ * check rejects.
+ *
+ * Holding LW_SHARED throughout the iteration also makes the
+ * generation-counter retry unnecessary for this caller: slot
+ * transitions take LW_EXCLUSIVE and therefore cannot happen while we
+ * hold LW_SHARED.  The generation counter is still part of the
+ * cross-backend reader contract on WaitEventTraceControl for external
+ * readers that follow a different lock-release pattern (e.g. an
+ * extension that wants to release the lock between batches of records
+ * and re-acquire), but this in-tree implementation does not release
+ * the lock mid-iteration.
+ *
+ * Both OWNED and ORPHANED slots are read uniformly.  For OWNED the
+ * live owner is concurrently writing; the seqlock catches torn reads.
+ * For ORPHANED the records are immutable post-mortem so the check is
+ * essentially a pass-through (it still correctly skips at most one
+ * trailing odd-seq record if the owner died mid-write).
+ *
+ * Lock-hold is O(write_pos - read_start) shared-memory loads, at
+ * roughly the same wall-clock cost as a single 4 MB memcpy of the
+ * full ring (~1 ms on modern hardware), with no I/O and no syscalls.
+ */
+static void
+emit_wait_event_trace_for_procnumber(int procNumber, ReturnSetInfo *rsinfo)
+{
+	WaitEventTraceSlot *slot;
+	WaitEventTraceState *ts;
+	WetValidRecord *valid_records = NULL;
+	uint64		valid_count = 0;
+	uint64		write_pos;
+	uint64		read_start;
+	uint64		i;
+	uint32		state_now;
+
+	if (WaitEventTraceCtl == NULL)
+		return;
+
+	/*
+	 * Range check.  Negative or out-of-range procnumbers return an empty
+	 * result rather than ERRORing because the most natural use pattern for
+	 * cross-backend readers is to iterate every possible slot index (a
+	 * monitoring background worker doesn't know the exact
+	 * NUM_WAIT_EVENT_TIMING_SLOTS at SQL level), and silent- empty for
+	 * out-of-range matches the behaviour of sister functions like
+	 * pg_stat_get_wait_event_timing(NULL) which iterate the shared array
+	 * internally.  FREE-but-in-range slots also return empty (see the state
+	 * check below); the caller cannot distinguish out-of-range from FREE,
+	 * which is fine.
+	 */
+	if (procNumber < 0 || procNumber >= NUM_WAIT_EVENT_TIMING_SLOTS)
+		return;
+
+	slot = &WaitEventTraceCtl->trace_slots[procNumber];
+
+	/*
+	 * If the trace DSA was never created (no backend in the cluster has ever
+	 * set wait_event_capture = trace), every slot is still in its initial
+	 * FREE state.  Skip without taking the lock.
+	 */
+	if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
+		return;
+
+	/*
+	 * Unlocked fast-path check; the authoritative check is under the lock
+	 * below.
+	 */
+	if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_FREE)
+		return;
+
+	wait_event_trace_ensure_dsa();
+	if (trace_dsa == NULL)
+		return;
+
+	/*
+	 * Allocate the worst-case result buffer BEFORE taking the lock. The
+	 * buffer is sized for the full ring (sizeof(WetValidRecord) *
+	 * WaitEventTraceRingSize, e.g. ~5 MB at the 4 MB default ring and up to
+	 * ~42 MB at the 32 MB maximum); on a near-empty ring most goes unused,
+	 * but that is preferable to holding the WaitEventTraceCtl lock during a
+	 * palloc that may bottom out in a glibc mmap() syscall (allocations above
+	 * the malloc-mmap threshold).  Glibc's arena-internal mutex around the
+	 * syscall would serialise every concurrent reader of this lock through
+	 * one VMA-modifying kernel operation; sizing the alloc outside the lock
+	 * keeps the lock-hold time bounded by the per-record loop alone.
+	 *
+	 * After we acquire the lock we will either consume this buffer (writing
+	 * up to (write_pos - read_start) entries) or release it unused on an
+	 * early return.
+	 */
+
+	/*
+	 * Worst-case size = ring size.  Derive it from the GUC on first use in
+	 * this backend; subsequent calls see the cached value. The GUC is
+	 * PGC_POSTMASTER so the value is the same across every backend in this
+	 * postmaster run and never changes.
+	 */
+	if (WaitEventTraceRingSize == 0)
+		WaitEventTraceRingSize =
+			(uint32) wait_event_trace_ring_size * 1024U /
+			(uint32) sizeof(WaitEventTraceRecord);
+	valid_records = palloc(sizeof(WetValidRecord) * WaitEventTraceRingSize);
+
+	LWLockAcquire(&WaitEventTraceCtl->lock, LW_SHARED);
+
+	state_now = pg_atomic_read_u32(&slot->state);
+	if (state_now == WAIT_EVENT_TRACE_SLOT_FREE ||
+		!DsaPointerIsValid(slot->ring_ptr))
+	{
+		LWLockRelease(&WaitEventTraceCtl->lock);
+		pfree(valid_records);
+		return;
+	}
+
+	ts = (WaitEventTraceState *) dsa_get_address(trace_dsa, slot->ring_ptr);
+	write_pos = pg_atomic_read_u64(&ts->write_pos);
+
+	if (write_pos == 0)
+	{
+		LWLockRelease(&WaitEventTraceCtl->lock);
+		pfree(valid_records);
+		return;
+	}
+
+	/* Live range: oldest available to newest. */
+	{
+		uint64		ring_size = (uint64) ts->ring_mask + 1;
+
+		read_start = (write_pos > ring_size)
+			? write_pos - ring_size : 0;
+	}
+
+	for (i = read_start; i < write_pos; i++)
+	{
+		WaitEventTraceRecord *rec_shared =
+			&ts->records[i & ts->ring_mask];
+		WetValidRecord *out = &valid_records[valid_count];
+		uint32		expected_seq;
+		uint32		seq_before;
+		uint32		seq_after;
+
+		/*
+		 * Position-encoded seqlock identity check (NOT just parity).
+		 *
+		 * The writer encodes the ring position into the seq value: mid-write
+		 * -> (uint32)(pos * 2 + 1), complete -> + 2.  After RING_SIZE writes
+		 * the slot wraps and the same memory location gets a new seq value
+		 * (next_pos * 2 + 2) that is numerically distinct from the previous
+		 * cycle's seq.
+		 *
+		 * A parity-only check (skip on odd seq, accept on stable even) is
+		 * INSUFFICIENT for this layout in the cross-backend case: if the
+		 * writer just incremented write_pos to pos+1 but cross-process cache
+		 * coherence has not yet propagated the subsequent rec->seq =
+		 * (pos*2+1) store, this reader at i = pos would see the previous
+		 * cycle's complete-even seq (from logical position pos - RING_SIZE).
+		 * Both seq_before and seq_after would read that stale even value,
+		 * parity passes, identity-against-itself passes, and a record
+		 * belonging to the PREVIOUS cycle gets emitted with the new
+		 * ring_index = pos.  Silent data corruption (wrong attribution, not
+		 * torn bytes).
+		 *
+		 * The fix is identity against EXPECTED: a record is valid for
+		 * iterator position i if and only if its seq equals (uint32)(i * 2 +
+		 * 2) -- the writer's encoded "complete" value for that exact ring
+		 * position.  This rejects:
+		 *
+		 * * Stale prior cycle (seq <  expected): writer hasn't yet advanced
+		 * rec->seq for the current cycle. * Mid-write current cycle (seq ==
+		 * expected - 1, odd): writer is in the payload write window. * Ring
+		 * wrapped past us (seq >  expected): the writer completed a later
+		 * cycle on this slot during our read.
+		 *
+		 * The uint32 wraparound at 2^31 cycles is safe: we use exact
+		 * equality, and the writer's existing wrap-safety argument
+		 * (sizeof(seq) > worst-case in-flight window by 11 orders of
+		 * magnitude) covers the seq value.
+		 */
+		expected_seq = (uint32) (i * 2 + 2);
+
+		seq_before = rec_shared->seq;
+		pg_read_barrier();
+
+		if (seq_before != expected_seq)
+			continue;
+
+		out->rec = *rec_shared; /* one 32-byte structure copy */
+
+		pg_read_barrier();
+		seq_after = rec_shared->seq;
+
+		if (seq_after != expected_seq)
+			continue;
+
+		out->ring_index = i;
+		valid_count++;
+	}
+
+	LWLockRelease(&WaitEventTraceCtl->lock);
+
+	/*
+	 * Walk the local result array and emit rows.  No shared-memory access
+	 * from here on, so spills to disk by the tuplestore (if the result is
+	 * large) do not hold any wait-event-timing lock.
+	 */
+	for (i = 0; i < valid_count; i++)
+	{
+		WetValidRecord *vr = &valid_records[i];
+		WaitEventTraceRecord *rec = &vr->rec;
+		Datum		values[6];
+		bool		nulls[6];
+		const char *event_type;
+		const char *event_name;
+		uint8		rtype = rec->record_type;
+		uint32		event_info;
+		int64		duration_ns;
+		int64		query_id;
+
+		if (rtype == TRACE_WAIT_EVENT)
+		{
+			event_info = rec->data.wait.event;
+			duration_ns = rec->data.wait.duration_ns;
+			query_id = 0;
+
+			/* Skip empty wait events. */
+			if (event_info == 0)
+				continue;
+
+			event_type = pgstat_get_wait_event_type(event_info);
+			event_name = pgstat_get_wait_event(event_info);
+		}
+		else if (rtype == TRACE_QUERY_START)
+		{
+			event_info = 0;
+			duration_ns = 0;
+			query_id = rec->data.query.query_id;
+			event_type = "Query";
+			event_name = "QueryStart";
+		}
+		else if (rtype == TRACE_QUERY_END)
+		{
+			event_info = 0;
+			duration_ns = 0;
+			query_id = rec->data.query.query_id;
+			event_type = "Query";
+			event_name = "QueryEnd";
+		}
+		else if (rtype == TRACE_EXEC_START)
+		{
+			event_info = 0;
+			duration_ns = 0;
+			query_id = rec->data.query.query_id;
+			event_type = "Query";
+			event_name = "ExecStart";
+		}
+		else if (rtype == TRACE_EXEC_END)
+		{
+			event_info = 0;
+			duration_ns = 0;
+			query_id = rec->data.query.query_id;
+			event_type = "Query";
+			event_name = "ExecEnd";
+		}
+		else
+		{
+			/* Unrecognised record_type -- skip defensively. */
+			continue;
+		}
+
+		if (event_type == NULL || event_name == NULL)
+			continue;
+
+		memset(nulls, 0, sizeof(nulls));
+
+		values[0] = Int64GetDatum((int64) vr->ring_index);
+		values[1] = Int64GetDatum(rec->timestamp_ns);
+		values[2] = CStringGetTextDatum(event_type);
+		values[3] = CStringGetTextDatum(event_name);
+		values[4] = Float8GetDatum((double) duration_ns / 1000.0);
+		values[5] = Int64GetDatum(query_id);
+
+		tuplestore_putvalues(rsinfo->setResult,
+							 rsinfo->setDesc,
+							 values, nulls);
+	}
+
+	pfree(valid_records);
+}
+
+/*
+ * SQL function: pg_get_wait_event_trace(procnumber int4)
+ *
+ * Cross-backend trace ring reader.  Returns the records from the trace
+ * ring belonging to the backend that currently or previously occupied
+ * the given procNumber slot.  Reads OWNED and ORPHANED slots uniformly;
+ * FREE slots return an empty result.
+ *
+ * This SRF is the in-tree consumer of the orphan-preserved trace data:
+ * a backend that exited while wait_event_capture = trace leaves its
+ * ring allocated in DSA in ORPHANED state, and this function reads it
+ * until either a new backend takes over the same procNumber or the
+ * DBA calls pg_stat_clear_orphaned_wait_event_rings().  External
+ * extensions that need cross-backend access follow the same
+ * snapshot pattern documented on WaitEventTraceControl in
+ * wait_event_timing.h; this function serves as both the reference
+ * implementation and a DBA-facing diagnostic tool.
+ *
+ * Privileges: REVOKE'd from PUBLIC and GRANT'ed to pg_read_all_stats
+ * in system_views.sql, matching the privilege model of the session-
+ * local view pg_backend_wait_event_trace.
+ *
+ * The procnumber argument can be obtained from the procnumber column
+ * of pg_stat_get_wait_event_timing or pg_stat_get_wait_event_timing_
+ * overflow.  For pid-keyed access against live backends, callers can
+ * do:
+ *
+ *   SELECT * FROM pg_get_wait_event_trace(
+ *       (SELECT procnumber FROM pg_stat_get_wait_event_timing(<pid>)
+ *        WHERE pid = <pid> LIMIT 1));
+ *
+ * Note that pid-keyed access cannot read ORPHANED slots because a
+ * dying backend's pid is removed from procArray on exit; for
+ * post-mortem reading of short-lived backends (parallel workers,
+ * autovacuum, walsender) the procNumber must be captured before the
+ * backend exits, or discovered by iterating procnumbers in a
+ * monitoring background worker.
+ */
+Datum
+pg_get_wait_event_trace(PG_FUNCTION_ARGS)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	int32		procNumber = PG_GETARG_INT32(0);
+
+	InitMaterializedSRF(fcinfo, 0);
+
+	emit_wait_event_trace_for_procnumber((int) procNumber, rsinfo);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * SQL function: pg_stat_clear_orphaned_wait_event_rings()
+ *
+ * Free every trace ring whose owner has exited (slot state ORPHANED).
+ * Returns the number of rings released.
+ *
+ * Why this exists.  When a backend that had wait_event_capture = trace
+ * exits, we deliberately do NOT free its ~4 MB trace ring (see the
+ * lifecycle discussion on WaitEventTraceControl): the data must remain
+ * readable by cross-backend consumers -- the in-tree
+ * pg_get_wait_event_trace SRF and any extension following the
+ * snapshot pattern on WaitEventTraceControl -- and an exit-time
+ * dsa_free would defeat that.
+ * The reclaim instead happens lazily in two places:
+ *
+ *   (a) wait_event_trace_clear_orphan_at_init(): when a new backend
+ *       inherits the same procNumber slot at init, it frees the prior
+ *       orphan as part of starting clean.  This handles the common
+ *       case (busy clusters with connection churn) automatically.
+ *
+ *   (b) THIS FUNCTION: an explicit DBA-driven sweep that releases
+ *       every currently orphaned ring at once.
+ *
+ * The pathological case (a) does not handle is "capture briefly
+ * enabled, then disabled, on a cluster with long-lived pooled
+ * connections that never exit".  In that scenario procNumbers do not
+ * recycle, so prior orphans persist until cluster restart unless the
+ * DBA calls this function.  Worst-case bound is
+ * NUM_WAIT_EVENT_TIMING_SLOTS * sizeof(WaitEventTraceState) which is
+ * ~400 MB at MaxBackends=100, ~4 GB at MaxBackends=1000 -- bounded
+ * but worth a kill switch.
+ *
+ * Permissions: execution is revoked from PUBLIC by default, matching the
+ * cluster-wide reset (pg_stat_reset_wait_event_timing_all).  This is a
+ * cluster-scope memory-reclamation operation: it can disrupt any
+ * concurrent cross-backend reader on any orphaned slot.  The
+ * disruption is bounded (readers retry via the generation counter
+ * and at worst skip one read) but the operation is still
+ * cluster-wide, so the default privilege matches the reset variant
+ * with the same blast radius; administrators can delegate with GRANT.
+ *
+ * The function is safe to call even when no orphans exist (returns
+ * 0) and even when capture is currently OFF (the slot array exists
+ * unconditionally; only the rings are lazy).
+ */
+Datum
+pg_stat_clear_orphaned_wait_event_rings(PG_FUNCTION_ARGS)
+{
+	int64		freed = 0;
+	int			i;
+
+	/*
+	 * Execution is revoked from PUBLIC in system_views.sql; administrators
+	 * can delegate with GRANT EXECUTE.
+	 */
+	if (WaitEventTraceCtl == NULL)
+		PG_RETURN_INT64(0);
+
+	/*
+	 * If no backend has ever enabled trace, the trace DSA was never created
+	 * and there cannot be any ORPHANED slots: every slot is still in its
+	 * initial FREE state.  Nothing to do.
+	 */
+	if (WaitEventTraceCtl->trace_dsa_handle == DSA_HANDLE_INVALID)
+		PG_RETURN_INT64(0);
+
+	/* Attach to the trace DSA so dsa_free() can be called. */
+	wait_event_trace_ensure_dsa();
+	if (trace_dsa == NULL)
+		PG_RETURN_INT64(0);
+
+	/*
+	 * Walk every slot, taking and releasing WaitEventTraceCtl->lock per slot
+	 * rather than holding it across the entire sweep.
+	 *
+	 * Rationale: at MaxBackends = 1000 with a fully-orphaned cluster the
+	 * per-slot work (atomic state read + dsa_free + ring_ptr clear + atomic
+	 * state write) totals a few microseconds; holding the lock across all
+	 * slots would yield a millisecond-scale lock-hold window during which
+	 * every concurrent backend startup (the lazy
+	 * wait_event_trace_clear_orphan_at_init path), every cross-backend reader
+	 * (pg_get_wait_event_trace and the external snapshot pattern), and every
+	 * capture step-down or restore would stall.  PG's general convention is
+	 * to keep LWLock-held windows in paths that compete with regular activity
+	 * well under 100 microseconds; per-slot release/reacquire gives us a
+	 * worst- case lock-hold of one slot's worth of work regardless of how
+	 * many orphans exist cluster-wide.
+	 *
+	 * An unlocked fast-path read of slot->state skips non-ORPHANED slots
+	 * without an LWLockAcquire/Release pair.  This is safe: if a slot races
+	 * from non-ORPHANED to ORPHANED after we read it, we miss that orphan --
+	 * but the function is documented as a snapshot sweep, the missed orphan
+	 * can be cleared by a subsequent call, and the same race exists for
+	 * orphans that appear after the loop ends.  The authoritative re-check
+	 * under the lock prevents racing on the dsa_free direction (we never free
+	 * a slot whose owner became OWNED again).
+	 *
+	 * CHECK_FOR_INTERRUPTS at the top of the loop body lets the caller cancel
+	 * a long sweep; with the previous single-lock structure the
+	 * InterruptHoldoffCount elevation from LWLockAcquire deferred all
+	 * cancellation until release.
+	 */
+	for (i = 0; i < NUM_WAIT_EVENT_TIMING_SLOTS; i++)
+	{
+		WaitEventTraceSlot *slot = &WaitEventTraceCtl->trace_slots[i];
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* Unlocked fast-path: skip non-ORPHANED slots cheaply. */
+		if (pg_atomic_read_u32(&slot->state) != WAIT_EVENT_TRACE_SLOT_ORPHANED)
+			continue;
+
+		LWLockAcquire(&WaitEventTraceCtl->lock, LW_EXCLUSIVE);
+
+		/*
+		 * Authoritative re-check under the lock.  A concurrent
+		 * clear_orphan_at_init may have already freed this slot.
+		 */
+		if (pg_atomic_read_u32(&slot->state) == WAIT_EVENT_TRACE_SLOT_ORPHANED &&
+			DsaPointerIsValid(slot->ring_ptr))
+		{
+			pg_atomic_fetch_add_u64(&slot->generation, 1);
+			dsa_free(trace_dsa, slot->ring_ptr);
+			slot->ring_ptr = InvalidDsaPointer;
+			pg_atomic_write_u32(&slot->state, WAIT_EVENT_TRACE_SLOT_FREE);
+			freed++;
+		}
+
+		LWLockRelease(&WaitEventTraceCtl->lock);
+	}
+
+	PG_RETURN_INT64(freed);
+}
+
 #endif							/* USE_WAIT_EVENT_TIMING */
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 3d8c9bdebd5..b7e4caa9e3b 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -70,6 +70,7 @@
 #include "utils/snapmgr.h"
 #include "utils/syscache.h"
 #include "utils/timeout.h"
+#include "utils/wait_event_timing.h"
 
 /* has this backend called EmitConnectionWarnings()? */
 static bool ConnectionWarningsEmitted;
@@ -1250,6 +1251,16 @@ InitPostgres(const char *in_dbname, Oid dboid,
 	/* Process pg_db_role_setting options */
 	process_settings(MyDatabaseId, GetSessionUserId());
 
+#ifdef USE_WAIT_EVENT_TIMING
+
+	/*
+	 * Attach trace ring if wait_event_capture = trace was set via
+	 * config/db/role settings
+	 */
+	if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && my_trace_proc_number >= 0)
+		wait_event_trace_attach(my_trace_proc_number);
+#endif
+
 	/* Apply PostAuthDelay as soon as we've read all options */
 	if (PostAuthDelay > 0)
 		pg_usleep(PostAuthDelay * 1000000L);
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 8f299f0971e..26f9a818ee9 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -3451,6 +3451,17 @@
   max => '65534',
 },
 
+{ name => 'wait_event_trace_ring_size', type => 'int', context => 'PGC_POSTMASTER', group => 'STATS_CUMULATIVE',
+  short_desc => 'Sets the size of each backend\'s wait-event-trace ring buffer.',
+  long_desc => 'Each backend that enables wait_event_capture = trace allocates a ring buffer of this size from a cluster-wide DSA.  The value must be a power of two and is fixed at server start.  Larger rings retain longer histories before wrapping.',
+  flags => 'GUC_UNIT_KB',
+  variable => 'wait_event_trace_ring_size',
+  boot_val => '4096',
+  min => '8',
+  max => '32768',
+  check_hook => 'check_wait_event_trace_ring_size',
+},
+
 { name => 'wal_block_size', type => 'int', context => 'PGC_INTERNAL', group => 'PRESET_OPTIONS',
   short_desc => 'Shows the block size in the write ahead log.',
   flags => 'GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE',
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 362fa48e7e6..23dfec1732a 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -703,9 +703,11 @@
 #track_cost_delay_timing = off
 #track_io_timing = off
 #track_wal_io_timing = off
-#wait_event_capture = off               # off, stats
+#wait_event_capture = off               # off, stats, trace
                                         # (requires --enable-wait-event-timing)
 #wait_event_timing_max_tranches = 192   # (change requires restart)
+#wait_event_trace_ring_size = 4MB      # power of two, 8kB .. 32MB
+                                        # (change requires restart)
 #track_functions = none                 # none, pl, all
 #stats_fetch_consistency = cache        # cache, none, snapshot
 
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 9c5dad6d925..58821d5012c 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -12751,4 +12751,30 @@
   provolatile => 'v', prorettype => 'void', proargtypes => '',
   prosrc => 'pg_stat_reset_wait_event_timing_all' },
 
+{ oid => '9957',
+  descr => 'current backend wait event trace ring buffer',
+  proname => 'pg_get_backend_wait_event_trace', prorows => '1000',
+  proretset => 't', provolatile => 's', proparallel => 'r',
+  prorettype => 'record', proargtypes => '',
+  proallargtypes => '{int8,int8,text,text,float8,int8}',
+  proargmodes => '{o,o,o,o,o,o}',
+  proargnames => '{seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}',
+  prosrc => 'pg_get_backend_wait_event_trace' },
+
+{ oid => '9962',
+  descr => 'wait event trace ring for the given procnumber slot, live or post-mortem',
+  proname => 'pg_get_wait_event_trace', prorows => '1000',
+  proretset => 't', provolatile => 'v', proparallel => 'r',
+  prorettype => 'record', proargtypes => 'int4',
+  proallargtypes => '{int4,int8,int8,text,text,float8,int8}',
+  proargmodes => '{i,o,o,o,o,o,o}',
+  proargnames => '{procnumber,seq,timestamp_ns,wait_event_type,wait_event,duration_us,query_id}',
+  prosrc => 'pg_get_wait_event_trace' },
+
+{ oid => '9961',
+  descr => 'statistics: free wait-event-trace rings whose owner backend has exited (superuser only); returns count freed',
+  proname => 'pg_stat_clear_orphaned_wait_event_rings',
+  provolatile => 'v', prorettype => 'int8', proargtypes => '',
+  prosrc => 'pg_stat_clear_orphaned_wait_event_rings' },
+
 ]
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index f0a27019a11..ab842923db4 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -141,3 +141,4 @@ PG_LWLOCKTRANCHE(PARALLEL_VACUUM_DSA, ParallelVacuumDSA)
 PG_LWLOCKTRANCHE(AIO_URING_COMPLETION, AioUringCompletion)
 PG_LWLOCKTRANCHE(SHMEM_INDEX, ShmemIndex)
 PG_LWLOCKTRANCHE(WAIT_EVENT_TIMING_DSA, WaitEventTimingDSA)
+PG_LWLOCKTRANCHE(WAIT_EVENT_TRACE_DSA, WaitEventTraceDSA)
diff --git a/src/include/storage/subsystemlist.h b/src/include/storage/subsystemlist.h
index 331d60cf703..90b14235464 100644
--- a/src/include/storage/subsystemlist.h
+++ b/src/include/storage/subsystemlist.h
@@ -80,6 +80,7 @@ PG_SHMEM_SUBSYSTEM(AsyncShmemCallbacks)
 PG_SHMEM_SUBSYSTEM(StatsShmemCallbacks)
 PG_SHMEM_SUBSYSTEM(WaitEventCustomShmemCallbacks)
 PG_SHMEM_SUBSYSTEM(WaitEventTimingShmemCallbacks)
+PG_SHMEM_SUBSYSTEM(WaitEventTraceControlShmemCallbacks)
 #ifdef USE_INJECTION_POINTS
 PG_SHMEM_SUBSYSTEM(InjectionPointShmemCallbacks)
 #endif
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index 7896e9c7239..afd9017e45c 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -174,6 +174,7 @@ extern void assign_transaction_timeout(int newval, void *extra);
 extern const char *show_unix_socket_permissions(void);
 extern bool check_wait_event_capture(int *newval, void **extra, GucSource source);
 extern void assign_wait_event_capture(int newval, void *extra);
+extern bool check_wait_event_trace_ring_size(int *newval, void **extra, GucSource source);
 extern bool check_wal_buffers(int *newval, void **extra, GucSource source);
 extern bool check_wal_consistency_checking(char **newval, void **extra,
 										   GucSource source);
diff --git a/src/include/utils/wait_event_timing.h b/src/include/utils/wait_event_timing.h
index 88a5015fb9c..d5766934b8c 100644
--- a/src/include/utils/wait_event_timing.h
+++ b/src/include/utils/wait_event_timing.h
@@ -30,7 +30,9 @@
 
 #include "port/atomics.h"
 #include "portability/instr_time.h"
+#include "storage/lwlock.h"
 #include "storage/shmem.h"
+#include "utils/dsa.h"
 #include "utils/wait_event_types.h"
 
 /*
@@ -41,23 +43,26 @@
  *   OFF   - No instrumentation, no hot-path cost.
  *   STATS - Aggregated per-event statistics (counts, durations, histogram)
  *           exposed via pg_stat_wait_event_timing.
- *
- * A further TRACE level is added later in the series.
+ *   TRACE - Everything in STATS plus a per-session ring buffer of individual
+ *           wait events and query-attribution markers, exposed via
+ *           pg_backend_wait_event_trace.
  */
 typedef enum WaitEventCaptureLevel
 {
 	WAIT_EVENT_CAPTURE_OFF = 0,
 	WAIT_EVENT_CAPTURE_STATS,
+	WAIT_EVENT_CAPTURE_TRACE,
 } WaitEventCaptureLevel;
 
 /*
- * Pin the enum ordering at compile time so future code that compares with
- * >= against WAIT_EVENT_CAPTURE_STATS keeps working, and so reordering is
- * caught at build time rather than via mysterious runtime mode switches.
+ * Pin the enum ordering at compile time so code that compares with >= keeps
+ * working, and so reordering is caught at build time rather than via
+ * mysterious runtime mode switches.
  */
 StaticAssertDecl(WAIT_EVENT_CAPTURE_OFF == 0 &&
-				 WAIT_EVENT_CAPTURE_STATS == 1,
-				 "WaitEventCaptureLevel values must be 0=OFF < 1=STATS");
+				 WAIT_EVENT_CAPTURE_STATS == 1 &&
+				 WAIT_EVENT_CAPTURE_TRACE == 2,
+				 "WaitEventCaptureLevel values must be 0=OFF < 1=STATS < 2=TRACE");
 
 /*
  * Number of log2 histogram buckets.  Bin edges are powers of two on the
@@ -194,11 +199,176 @@ extern PGDLLIMPORT WaitEventTimingState *my_wait_event_timing;
 extern PGDLLIMPORT const ShmemCallbacks WaitEventTimingShmemCallbacks;
 
 /*
- * Called from InitProcess()/InitAuxiliaryProcess() to point
- * my_wait_event_timing at this backend's slot, and from ProcKill() to
- * clear it.
+ * Called from InitProcess()/InitAuxiliaryProcess() to set up this backend's
+ * timing/trace bookkeeping, and from ProcKill() to clear it.
  */
 extern void pgstat_set_wait_event_timing_storage(int procNumber);
 extern void pgstat_reset_wait_event_timing_storage(void);
 
+
+/* ----------------------------------------------------------------------
+ * Trace level (wait_event_capture = trace)
+ *
+ * In addition to the STATS aggregates, every completed wait (and a set of
+ * query-attribution markers) is pushed into a per-session ring buffer --
+ * one record per completed wait.  The ring is allocated lazily in DSA on
+ * first use,
+ * so only sessions that enable trace pay the per-ring memory cost.  External
+ * tools read a session's ring via pg_get_backend_wait_event_trace() (own
+ * session) or pg_get_wait_event_trace(procnumber) (cross-backend).
+ *
+ * Query attribution is by scanning the ring at read time: QUERY/EXEC
+ * START/END markers delimit which wait events belong to which query_id.
+ *
+ * The ring size is set cluster-wide at server start by the
+ * wait_event_trace_ring_size GUC (PGC_POSTMASTER, default 4 MB).  It
+ * MUST be a power of two: the writer indexes the ring as (pos & ring_mask).
+ * ----------------------------------------------------------------------
+ */
+
+/* Trace record types */
+#define TRACE_WAIT_EVENT	0
+#define TRACE_QUERY_START	1
+#define TRACE_QUERY_END		2
+#define TRACE_EXEC_START	3
+#define TRACE_EXEC_END		4
+
+typedef struct WaitEventTraceRecord
+{
+	/*
+	 * Seqlock for torn-read detection.  Writers set seq odd before filling
+	 * fields, then even after; readers check seq before and after and skip
+	 * the record if either is odd or they differ.  uint32 wrap is irrelevant
+	 * over the ~10-20 ns reader access window.
+	 */
+	uint32		seq;
+	uint8		record_type;	/* TRACE_WAIT_EVENT / QUERY_* / EXEC_* */
+	uint8		pad[3];
+	int64		timestamp_ns;	/* monotonic clock */
+	union
+	{
+		struct					/* record_type = TRACE_WAIT_EVENT */
+		{
+			uint32		event;	/* wait_event_info */
+			uint32		pad2;
+			int64		duration_ns;
+		}			wait;
+		struct					/* QUERY_START/END or EXEC_START/END */
+		{
+			int64		query_id;
+			int64		pad2;
+		}			query;
+	}			data;
+} WaitEventTraceRecord;			/* 32 bytes */
+
+/*
+ * The seqlock wrap-safety argument and the mask-index math both rely on a
+ * fixed 32-byte record stride; make a stray field addition a build failure.
+ */
+StaticAssertDecl(sizeof(WaitEventTraceRecord) == 32,
+				 "WaitEventTraceRecord must be exactly 32 bytes");
+
+/*
+ * Per-backend trace ring header followed by the records array.  records[]
+ * is variably sized at allocation time (wait_event_trace_ring_size
+ * decides the row count).  write_pos and ring_mask share a cache line so
+ * the hot-path index calculation touches one line.
+ */
+typedef struct WaitEventTraceState
+{
+	pg_atomic_uint64 write_pos; /* monotonically increasing, wraps via mask */
+	uint32		ring_mask;		/* ring_size - 1; ring_size is a power of two */
+	uint32		ring_size_pad;	/* keep the records[] slab 16-byte aligned */
+	WaitEventTraceRecord records[FLEXIBLE_ARRAY_MEMBER];
+} WaitEventTraceState;
+
+/*
+ * Per-procNumber trace-ring slot lifecycle.  Decoupled from backend
+ * lifecycle on purpose: when a backend exits we transition its slot to
+ * ORPHANED and leave the ring in DSA so cross-backend consumers can still
+ * read the dying backend's final waits.  An orphan is reclaimed when a new
+ * backend takes the same procNumber, or by
+ * pg_stat_clear_orphaned_wait_event_rings().
+ *
+ *   FREE      no ring allocated (ring_ptr invalid).
+ *   OWNED     a live backend at this procNumber is writing to the ring.
+ *   ORPHANED  the owner exited; the ring is post-mortem and immutable.
+ */
+typedef enum WaitEventTraceSlotState
+{
+	WAIT_EVENT_TRACE_SLOT_FREE = 0,
+	WAIT_EVENT_TRACE_SLOT_OWNED,
+	WAIT_EVENT_TRACE_SLOT_ORPHANED,
+} WaitEventTraceSlotState;
+
+/*
+ * Per-procNumber slot.  generation is bumped on every owner transition;
+ * cross-backend readers snapshot it before+after their read and retry if it
+ * changed (the BackendStatusArray st_changecount idiom).  state is atomic
+ * only for cheap unlocked "worth visiting" probes; authoritative reads of
+ * (state, ring_ptr) are done under WaitEventTraceCtl->lock in LW_SHARED,
+ * while every transition holds it LW_EXCLUSIVE.
+ */
+typedef struct WaitEventTraceSlot
+{
+	pg_atomic_uint64 generation;	/* bumped on every owner transition */
+	pg_atomic_uint32 state;		/* WaitEventTraceSlotState */
+	uint32		pad;			/* keep ring_ptr 8-aligned */
+	dsa_pointer ring_ptr;		/* InvalidDsaPointer when FREE; else the
+								 * WaitEventTraceState chunk */
+} WaitEventTraceSlot;
+
+/*
+ * Control struct in fixed shared memory.  trace_slots[] is indexed by
+ * procNumber.
+ *
+ * External cross-backend reader protocol (pg_get_wait_event_trace is the
+ * reference implementation):
+ *   1. read trace_slots[procNumber].state unlocked as a "worth visiting"
+ *      probe; FREE -> nothing to read.
+ *   2. acquire lock LW_SHARED (all transitions take LW_EXCLUSIVE, so the
+ *      slot's state/ring_ptr/ring memory are stable for the iteration).
+ *   3. re-check state under the lock; resolve ring_ptr via dsa_get_address;
+ *      read write_pos.
+ *   4. iterate [read_start, write_pos): for each record do the per-record
+ *      POSITION-ENCODED IDENTITY seqlock check against shared memory --
+ *      expected_seq = (uint32)(i*2 + 2); read seq, barrier, copy record,
+ *      barrier, re-read seq; accept only if both equal expected_seq.  This
+ *      rejects stale previous-cycle reads (parity alone would not).
+ *   5. release the lock; emit the buffered records afterwards.
+ *   6. optional: snapshot generation before/after if releasing the lock
+ *      between batches.
+ */
+typedef struct WaitEventTraceControl
+{
+	dsa_handle	trace_dsa_handle;	/* DSA_HANDLE_INVALID until first use */
+	LWLock		lock;			/* protects DSA creation and slot transitions */
+	WaitEventTraceSlot trace_slots[FLEXIBLE_ARRAY_MEMBER];	/* per procNumber */
+} WaitEventTraceControl;
+
+/* Trace GUC and the records-per-ring value derived from it at startup. */
+extern PGDLLIMPORT int wait_event_trace_ring_size;
+extern PGDLLIMPORT uint32 WaitEventTraceRingSize;
+
+/* This backend's procNumber for the trace ring, or -1 if not set. */
+extern PGDLLIMPORT int my_trace_proc_number;
+
+/*
+ * Trace control shmem -- registered via subsystemlist.h; a no-op stub when
+ * the feature is compiled out.
+ */
+extern PGDLLIMPORT const ShmemCallbacks WaitEventTraceControlShmemCallbacks;
+
+/*
+ * Lazy DSA-based trace ring allocation -- called on first trace write and
+ * at backend startup when capture = trace was set via configuration.
+ */
+extern void wait_event_trace_attach(int procNumber);
+
+/* Query-attribution markers (defined in wait_event_timing.c). */
+extern void wait_event_trace_query_start(int64 query_id);
+extern void wait_event_trace_query_end(int64 query_id);
+extern void wait_event_trace_exec_start(int64 query_id);
+extern void wait_event_trace_exec_end(int64 query_id);
+
 #endif							/* WAIT_EVENT_TIMING_H */
diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build
index ee290698b31..aba6fbdf86e 100644
--- a/src/test/modules/test_misc/meson.build
+++ b/src/test/modules/test_misc/meson.build
@@ -23,6 +23,7 @@ tests += {
       't/012_ddlutils.pl',
       't/013_temp_obj_multisession.pl',
       't/014_log_statement_max_length.pl',
+      't/015_wait_event_trace_seqlock.pl',
     ],
     # The injection points are cluster-wide, so disable installcheck
     'runningcheck': false,
diff --git a/src/test/modules/test_misc/t/015_wait_event_trace_seqlock.pl b/src/test/modules/test_misc/t/015_wait_event_trace_seqlock.pl
new file mode 100644
index 00000000000..0f0e49970c8
--- /dev/null
+++ b/src/test/modules/test_misc/t/015_wait_event_trace_seqlock.pl
@@ -0,0 +1,122 @@
+# Copyright (c) 2026, PostgreSQL Global Development Group
+
+# Test the position-encoded identity seqlock that protects cross-backend
+# reads of the wait-event trace ring (wait_event_capture = trace).
+#
+# The hazard: the trace writer advances write_pos and only then stamps the
+# record's seq.  A cross-backend reader that observes the new write_pos
+# before the seq store has propagated sees, at the in-flight ring slot, the
+# PREVIOUS cycle's record -- complete, with an even seq.  A parity-only
+# seqlock would accept it and emit a stale record attributed to the wrong
+# ring index; the identity check (seq must equal the writer's completion
+# value for that exact position) must reject it.
+#
+# That window is unobservable on TSO hardware without instrumentation, so
+# the writer carries INJECTION_POINT("wait-event-trace-after-write-pos")
+# between the write_pos advance and the seq stamp.  This test:
+#
+#   1. fills and wraps a minimum-size ring (8kB = 256 records), so every
+#      slot holds a complete record from the previous cycle;
+#   2. wedges the writer at the injection point, mid-record;
+#   3. reads the ring cross-backend: the reader must return exactly
+#      ring_size - 1 records, skipping the in-flight slot whose stale
+#      prior-cycle record a parity-only check would have emitted;
+#   4. releases the writer and verifies the ring reads full again.
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+plan skip_all => 'Injection points not supported by this build'
+  unless $ENV{enable_injection_points} eq 'yes';
+
+my $ring_records = 256;			# 8kB ring / 32-byte records
+
+my $node = PostgreSQL::Test::Cluster->new('seqlock');
+$node->init;
+$node->append_conf(
+	'postgresql.conf', q[
+wait_event_trace_ring_size = '8kB'
+]);
+$node->start;
+
+# Skip if the server was not built with --enable-wait-event-timing.
+my ($ret, $stdout, $stderr) =
+  $node->psql('postgres', 'SET wait_event_capture = trace;');
+if ($ret != 0)
+{
+	$node->stop;
+	plan skip_all => 'server not built with --enable-wait-event-timing';
+}
+
+$node->safe_psql('postgres', 'CREATE EXTENSION injection_points;');
+
+# Writer session: enable trace and wrap the ring.  400 pg_sleep calls emit
+# at least 400 wait events into a 256-record ring, so every slot holds a
+# complete record from the current window.
+my $writer = $node->background_psql('postgres');
+$writer->query_safe('SET wait_event_capture = trace;');
+$writer->query_safe(
+	'SELECT count(pg_sleep(0.001)) FROM generate_series(1, 400);');
+
+my $writer_proc = $writer->query_safe(
+	'SELECT procnumber FROM pg_stat_get_wait_event_timing(pg_backend_pid())'
+	  . ' LIMIT 1;');
+chomp $writer_proc;
+like($writer_proc, qr/^\d+$/, 'writer reported its procnumber');
+
+# With the ring wrapped and the writer idle, a cross-backend read returns
+# exactly ring_size records: every slot is complete and identity-valid.
+my $count_full = $node->safe_psql('postgres',
+	"SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+is($count_full, $ring_records, 'wrapped ring reads full before the wedge');
+
+# Wedge the writer mid-record: arm the injection point, then send a
+# statement.  The arrival of the statement completes the writer's
+# ClientRead wait; its trace write advances write_pos and then blocks at
+# the injection point, before stamping the record's seq.
+$node->safe_psql('postgres',
+	"SELECT injection_points_attach('wait-event-trace-after-write-pos', 'wait');"
+);
+$writer->query_until(
+	qr/wedge_sent/, q[
+\echo wedge_sent
+SELECT 1;
+]);
+$node->wait_for_event('client backend', 'wait-event-trace-after-write-pos');
+
+# The decisive read: the in-flight slot still holds the previous cycle's
+# complete record.  A parity-only seqlock would emit it (ring_size rows,
+# one misattributed); the identity check must skip exactly that slot.
+my $count_wedged = $node->safe_psql('postgres',
+	"SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+is($count_wedged, $ring_records - 1,
+	'reader skips the in-flight slot instead of emitting the stale prior-cycle record'
+);
+
+# The read is stable and repeatable while the writer is wedged.
+my $count_wedged2 = $node->safe_psql('postgres',
+	"SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+is($count_wedged2, $count_wedged, 'wedged-ring read is stable');
+
+# Release the writer: detach first so the nested wakeup wait does not
+# re-arm, then wake it.
+$node->safe_psql('postgres',
+	"SELECT injection_points_detach('wait-event-trace-after-write-pos');");
+$node->safe_psql('postgres',
+	"SELECT injection_points_wakeup('wait-event-trace-after-write-pos');");
+
+# The writer completes the wedged record (and its pending statement); the
+# ring must read full again.
+$writer->query_safe("SELECT 'resync';");
+my $count_after = $node->safe_psql('postgres',
+	"SELECT count(*) FROM pg_get_wait_event_trace($writer_proc);");
+is($count_after, $ring_records, 'ring reads full again after release');
+
+$writer->quit;
+$node->stop;
+
+done_testing();
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 71c95cb0746..9d64e125972 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1332,6 +1332,13 @@ pg_backend_memory_contexts| SELECT name,
     free_chunks,
     used_bytes
    FROM pg_get_backend_memory_contexts() pg_get_backend_memory_contexts(name, ident, type, level, path, total_bytes, total_nblocks, free_bytes, free_chunks, used_bytes);
+pg_backend_wait_event_trace| SELECT seq,
+    timestamp_ns,
+    wait_event_type,
+    wait_event,
+    duration_us,
+    query_id
+   FROM pg_get_backend_wait_event_trace() t(seq, timestamp_ns, wait_event_type, wait_event, duration_us, query_id);
 pg_config| SELECT name,
     setting
    FROM pg_config() pg_config(name, setting);
diff --git a/src/test/regress/expected/wait_event_timing.out b/src/test/regress/expected/wait_event_timing.out
index 8938ffe5fb0..9fc9b115e79 100644
--- a/src/test/regress/expected/wait_event_timing.out
+++ b/src/test/regress/expected/wait_event_timing.out
@@ -1,9 +1,10 @@
 --
 -- WAIT_EVENT_TIMING
 --
--- Exercises the wait_event_capture = stats instrumentation: the GUC, the
--- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view,
--- and the pg_wait_event_timing_histogram_buckets taxonomy view.
+-- Exercises the wait_event_capture instrumentation: the GUC, the stats
+-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing
+-- and histogram-buckets views, overflow counters, resets), and the trace
+-- surface (the per-session ring and its readers).
 --
 -- Two expected outputs are maintained:
 --   wait_event_timing.out    -- --enable-wait-event-timing builds
@@ -133,3 +134,42 @@ SELECT pg_stat_reset_wait_event_timing(2147483647);
 (1 row)
 
 RESET wait_event_capture;
+--
+-- Trace level: per-session ring of individual waits + query markers.
+-- (In a stub build SET trace errors and the trace readers stay empty;
+-- that is the documented difference between the two expected files.)
+--
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.1);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+-- PgSleep is recorded in this backend's ring with a positive duration.
+SELECT count(*) >= 1 AS pgsleep_in_ring,
+       coalesce(bool_and(duration_us > 0), false) AS durations_positive
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event = 'PgSleep';
+ pgsleep_in_ring | durations_positive 
+-----------------+--------------------
+ t               | t
+(1 row)
+
+-- The same records are visible through the view.
+SELECT count(*) >= 1 AS view_has_pgsleep
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+ view_has_pgsleep 
+------------------
+ t
+(1 row)
+
+-- Clearing orphaned rings is a no-op here (no orphans) but must succeed.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+ clear_orphans_ok 
+------------------
+ t
+(1 row)
+
+RESET wait_event_capture;
diff --git a/src/test/regress/expected/wait_event_timing_1.out b/src/test/regress/expected/wait_event_timing_1.out
index 3aad898d9bf..f8b3beec02f 100644
--- a/src/test/regress/expected/wait_event_timing_1.out
+++ b/src/test/regress/expected/wait_event_timing_1.out
@@ -1,9 +1,10 @@
 --
 -- WAIT_EVENT_TIMING
 --
--- Exercises the wait_event_capture = stats instrumentation: the GUC, the
--- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view,
--- and the pg_wait_event_timing_histogram_buckets taxonomy view.
+-- Exercises the wait_event_capture instrumentation: the GUC, the stats
+-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing
+-- and histogram-buckets views, overflow counters, resets), and the trace
+-- surface (the per-session ring and its readers).
 --
 -- Two expected outputs are maintained:
 --   wait_event_timing.out    -- --enable-wait-event-timing builds
@@ -123,3 +124,45 @@ SELECT pg_stat_reset_wait_event_timing(2147483647);
 ERROR:  wait event capture is not supported by this build
 HINT:  Compile PostgreSQL with --enable-wait-event-timing.
 RESET wait_event_capture;
+--
+-- Trace level: per-session ring of individual waits + query markers.
+-- (In a stub build SET trace errors and the trace readers stay empty;
+-- that is the documented difference between the two expected files.)
+--
+SET wait_event_capture = trace;
+ERROR:  invalid value for parameter "wait_event_capture": "trace"
+DETAIL:  This build does not support wait event capture.
+HINT:  Compile PostgreSQL with --enable-wait-event-timing.
+SELECT pg_sleep(0.1);
+ pg_sleep 
+----------
+ 
+(1 row)
+
+-- PgSleep is recorded in this backend's ring with a positive duration.
+SELECT count(*) >= 1 AS pgsleep_in_ring,
+       coalesce(bool_and(duration_us > 0), false) AS durations_positive
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event = 'PgSleep';
+ pgsleep_in_ring | durations_positive 
+-----------------+--------------------
+ f               | f
+(1 row)
+
+-- The same records are visible through the view.
+SELECT count(*) >= 1 AS view_has_pgsleep
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+ view_has_pgsleep 
+------------------
+ f
+(1 row)
+
+-- Clearing orphaned rings is a no-op here (no orphans) but must succeed.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+ clear_orphans_ok 
+------------------
+ t
+(1 row)
+
+RESET wait_event_capture;
diff --git a/src/test/regress/sql/wait_event_timing.sql b/src/test/regress/sql/wait_event_timing.sql
index a30fcd155fd..49206afef66 100644
--- a/src/test/regress/sql/wait_event_timing.sql
+++ b/src/test/regress/sql/wait_event_timing.sql
@@ -1,9 +1,10 @@
 --
 -- WAIT_EVENT_TIMING
 --
--- Exercises the wait_event_capture = stats instrumentation: the GUC, the
--- pg_stat_get_wait_event_timing() SRF, the pg_stat_wait_event_timing view,
--- and the pg_wait_event_timing_histogram_buckets taxonomy view.
+-- Exercises the wait_event_capture instrumentation: the GUC, the stats
+-- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing
+-- and histogram-buckets views, overflow counters, resets), and the trace
+-- surface (the per-session ring and its readers).
 --
 -- Two expected outputs are maintained:
 --   wait_event_timing.out    -- --enable-wait-event-timing builds
@@ -77,3 +78,27 @@ SELECT pg_stat_reset_wait_event_timing();
 SELECT pg_stat_reset_wait_event_timing(2147483647);
 
 RESET wait_event_capture;
+
+--
+-- Trace level: per-session ring of individual waits + query markers.
+-- (In a stub build SET trace errors and the trace readers stay empty;
+-- that is the documented difference between the two expected files.)
+--
+SET wait_event_capture = trace;
+SELECT pg_sleep(0.1);
+
+-- PgSleep is recorded in this backend's ring with a positive duration.
+SELECT count(*) >= 1 AS pgsleep_in_ring,
+       coalesce(bool_and(duration_us > 0), false) AS durations_positive
+FROM pg_get_backend_wait_event_trace()
+WHERE wait_event = 'PgSleep';
+
+-- The same records are visible through the view.
+SELECT count(*) >= 1 AS view_has_pgsleep
+FROM pg_backend_wait_event_trace
+WHERE wait_event = 'PgSleep';
+
+-- Clearing orphaned rings is a no-op here (no orphans) but must succeed.
+SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok;
+
+RESET wait_event_capture;
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 7d43fc4e5b6..63f207f3da6 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3431,6 +3431,11 @@ WaitEventTimeout
 WaitEventTimingControl
 WaitEventTimingEntry
 WaitEventTimingState
+WaitEventTraceControl
+WaitEventTraceRecord
+WaitEventTraceSlot
+WaitEventTraceSlotState
+WaitEventTraceState
 WaitLSNProcInfo
 WaitLSNResult
 WaitLSNState
@@ -3461,6 +3466,7 @@ WalUsage
 WalWriteMethod
 WalWriteMethodOps
 Walfile
+WetValidRecord
 WindowAgg
 WindowAggPath
 WindowAggState
-- 
2.43.0