From 72553668f1b03b6619e8c7097e7c53bfc0a955d2 Mon Sep 17 00:00:00 2001 From: Dmitry Fomin Date: Thu, 11 Jun 2026 15:11:09 +0000 Subject: [PATCH v1 7/7] wait_event_timing: add query-attribution markers to the trace ring At the trace level, interleave query-boundary markers with the wait events in the per-session ring so a reader can tell which query each wait belongs to. Two marker families are emitted: - ExecStart/ExecEnd bracket every executor run (ExecutorStart/ExecutorEnd), the primary attribution signal -- every executable statement, including those in parallel workers and pipelined extended-protocol messages, is bracketed; - QueryStart/QueryEnd fire at top-level query_id transitions (pgstat_report_query_id) and at the transition to idle (send_ready_for_query), providing the inter-statement boundaries the executor markers cannot -- e.g. the ClientRead wait between statements. In the pipelined extended protocol a Parse/Bind/Execute can arrive while the previous query's id is still set and the session is still RUNNING (no Sync->idle in between), so the prior id is flushed with force=true at those message boundaries to fire its QUERY_END before the new query starts. That flush is gated on wait_event_capture = trace, so when trace is off it is a no-op and pg_stat_activity.query_id behaves exactly as before. Marker emission requires a non-zero query_id (compute_query_id) and track_activities; a WARNING is logged when trace is enabled without them. The markers themselves are no-ops unless capture is at trace, so the only cost when the feature is off is the inline gate already present. --- doc/src/sgml/config.sgml | 9 +++- doc/src/sgml/monitoring.sgml | 54 ++++++++++++++++--- src/backend/executor/execMain.c | 5 ++ src/backend/tcop/postgres.c | 39 ++++++++++++++ src/backend/utils/activity/backend_status.c | 13 +++++ .../utils/activity/wait_event_timing.c | 18 +++++++ .../regress/expected/wait_event_timing.out | 15 +++++- .../regress/expected/wait_event_timing_1.out | 15 +++++- src/test/regress/sql/wait_event_timing.sql | 11 +++- 9 files changed, 166 insertions(+), 13 deletions(-) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 734310935ed..293a9095da7 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -9166,13 +9166,18 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; At trace, the server additionally records every - individual wait event into a per-session ring buffer exposed via the + individual wait event, together with query-attribution markers, + into a per-session ring buffer exposed via the pg_backend_wait_event_trace view. The ring is allocated lazily from dynamic shared memory on first use (default 4 MB per backend; see ), so only - sessions that enable trace pay the per-ring memory cost. + sessions that enable trace pay the per-ring memory cost. Query + attribution — matching each wait to the query that incurred + it — requires both and + ; a warning is logged if + either is missing when trace is enabled. Only superusers and users with the appropriate SET diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 2f1fc457d00..3f99b022fda 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -4245,8 +4245,14 @@ ORDER BY b.bucket_idx; The pg_backend_wait_event_trace view shows individual wait event records from the current backend's - trace ring buffer. Each record captures a single wait event - (with timestamp and duration). + trace ring buffer. Each record captures either a single wait event + (with timestamp and duration) or a query-attribution marker. Two + marker families exist: ExecStart/ExecEnd + bracket every executor invocation, and + QueryStart/QueryEnd mark + top-level query-id transitions and the transition to idle. See + for the gating rules of + each marker family. Requires to be set to trace. The ring buffer is sized by (default 4 MB = @@ -4333,8 +4339,16 @@ ORDER BY b.bucket_idx; The ring buffer is designed as a lock-free transport mechanism for external consumption. At high wait event rates (e.g., 220K events/sec), - the ring wraps in roughly 0.5–1 seconds. Consumers should poll - the ring buffer before it wraps. + the ring wraps in roughly 0.5–1 seconds. External consumers + (background workers, extensions) can attribute events to queries by + scanning for ExecStart markers (or, when the + executor markers are unavailable, QueryStart); if + both have been overwritten, events before the next visible marker are + unattributed. Consumers should poll the ring buffer before it wraps + and can use the query_id column of + + pg_stat_activity as a fallback for the + current query context. @@ -4354,6 +4368,28 @@ ORDER BY b.bucket_idx; alone. + + QueryStart/QueryEnd markers are + emitted as matched pairs around each protocol phase that touches a + query_id. In simple protocol that is one + pair per statement. In extended protocol there is one pair around + each of Parse, Bind, and + Execute for the same + query_id — so a single parameterized + statement produces three nested pairs, plus the surrounding + ExecStart/ExecEnd pair from the + executor. This per-phase pairing lets consumers measure how much + time a query spent in each protocol phase (parse vs. bind vs. + execute) by computing the duration between each pair, and lets a + total-time-per-query rollup be expressed as the sum of pair + durations rather than a single subtraction. Consumers that just want + "how long did this query take in the executor" should use the + ExecStart/ExecEnd pair, which + brackets each executor invocation regardless of protocol. (Statements + that invoke the executor recursively — for example via + PL/pgSQL or other server-side code — + produce one pair per invocation.) + <structname>pg_backend_wait_event_trace</structname> View @@ -4393,7 +4429,7 @@ ORDER BY b.bucket_idx; wait_event_typetext - Wait event type + Wait event type, or Query for query markers @@ -4402,7 +4438,9 @@ ORDER BY b.bucket_idx; wait_eventtext - Wait event name + Wait event name, or one of ExecStart, + ExecEnd, QueryStart, + QueryEnd for query-attribution markers. @@ -4411,7 +4449,7 @@ ORDER BY b.bucket_idx; duration_usdouble precision - Wait duration in microseconds + Wait duration in microseconds (0 for query markers) @@ -4420,7 +4458,7 @@ ORDER BY b.bucket_idx; query_idbigint - Reserved; always 0 for wait event records + Query identifier for query markers (0 for wait events) diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index fde502efd38..6aa1b9eefad 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -64,6 +64,7 @@ #include "utils/partcache.h" #include "utils/rls.h" #include "utils/snapmgr.h" +#include "utils/wait_event_timing.h" /* Hooks for plugins to get control in ExecutorStart/Run/Finish/End */ @@ -133,6 +134,8 @@ ExecutorStart(QueryDesc *queryDesc, int eflags) */ pgstat_report_query_id(queryDesc->plannedstmt->queryId, false); + wait_event_trace_exec_start(queryDesc->plannedstmt->queryId); + if (ExecutorStart_hook) (*ExecutorStart_hook) (queryDesc, eflags); else @@ -476,6 +479,8 @@ standard_ExecutorFinish(QueryDesc *queryDesc) void ExecutorEnd(QueryDesc *queryDesc) { + wait_event_trace_exec_end(queryDesc->plannedstmt->queryId); + if (ExecutorEnd_hook) (*ExecutorEnd_hook) (queryDesc); else diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index ce18df820cd..83fa0c1f598 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -86,6 +86,7 @@ #include "utils/timeout.h" #include "utils/timestamp.h" #include "utils/varlena.h" +#include "utils/wait_event_timing.h" /* ---------------- * global variables @@ -1444,6 +1445,17 @@ exec_parse_message(const char *query_string, /* string to execute */ */ debug_query_string = query_string; + /* + * In pipelined extended protocol, a Parse can arrive while the previous + * query's st_query_id is still set and st_state is still RUNNING (no + * Sync->idle between queries). Flush the prior id with force=true so the + * QUERY_END marker fires before pgstat_report_activity below zeros + * st_query_id. Skip when st_state != RUNNING: coming from idle means + * send_ready_for_query already emitted the QUERY_END. + */ + if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && + MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING) + pgstat_report_query_id(0, true); pgstat_report_activity(STATE_RUNNING, query_string); set_ps_display("PARSE"); @@ -1723,6 +1735,14 @@ exec_bind_message(StringInfo input_message) */ debug_query_string = psrc->query_string; + /* + * See exec_parse_message: flush the prior query_id's QUERY_END before + * pgstat_report_activity zeros it; the state gate avoids a duplicate + * QUERY_END right after a Sync->idle transition. + */ + if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && + MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING) + pgstat_report_query_id(0, true); pgstat_report_activity(STATE_RUNNING, psrc->query_string); foreach(lc, psrc->query_list) @@ -2224,6 +2244,14 @@ exec_execute_message(const char *portal_name, long max_rows) */ debug_query_string = sourceText; + /* + * See exec_parse_message: flush the prior query_id's QUERY_END before + * pgstat_report_activity zeros it; the state gate avoids a duplicate + * QUERY_END right after a Sync->idle transition. + */ + if (wait_event_capture == WAIT_EVENT_CAPTURE_TRACE && + MyBEEntry != NULL && MyBEEntry->st_state == STATE_RUNNING) + pgstat_report_query_id(0, true); pgstat_report_activity(STATE_RUNNING, sourceText); foreach(lc, portal->stmts) @@ -4749,6 +4777,17 @@ PostgresMain(const char *dbname, const char *username) */ if (send_ready_for_query) { + /* + * Emit QUERY_END before going idle so idle waits (ClientRead + * etc.) are not attributed to the finished query. + */ + { + volatile PgBackendStatus *beentry = MyBEEntry; + + if (beentry != NULL && beentry->st_query_id != 0) + wait_event_trace_query_end(beentry->st_query_id); + } + if (IsAbortedTransactionBlockState()) { set_ps_display("idle in transaction (aborted)"); diff --git a/src/backend/utils/activity/backend_status.c b/src/backend/utils/activity/backend_status.c index d685fc5cd87..19a6dd48aef 100644 --- a/src/backend/utils/activity/backend_status.c +++ b/src/backend/utils/activity/backend_status.c @@ -24,6 +24,7 @@ #include "utils/ascii.h" #include "utils/guc.h" /* for application_name */ #include "utils/memutils.h" +#include "utils/wait_event_timing.h" /* ---------- @@ -670,6 +671,18 @@ pgstat_report_query_id(int64 query_id, bool force) if (beentry->st_query_id != INT64CONST(0) && !force) return; + /* + * Emit trace markers for query-to-query transitions. QUERY_END fires + * here when st_query_id transitions from one non-zero value to another + * (multi-statement simple protocol, pipelined extended protocol). The + * last-query-to-idle QUERY_END is emitted in PostgresMain() at + * send_ready_for_query. + */ + if (beentry->st_query_id != 0 && beentry->st_query_id != query_id) + wait_event_trace_query_end(beentry->st_query_id); + if (query_id != 0 && query_id != beentry->st_query_id) + wait_event_trace_query_start(query_id); + /* * Update my status entry, following the protocol of bumping * st_changecount before and after. We use a volatile pointer here to diff --git a/src/backend/utils/activity/wait_event_timing.c b/src/backend/utils/activity/wait_event_timing.c index 993bd303fba..26a13fe8d9f 100644 --- a/src/backend/utils/activity/wait_event_timing.c +++ b/src/backend/utils/activity/wait_event_timing.c @@ -243,6 +243,7 @@ wait_event_trace_exec_end(int64 query_id) #include "catalog/pg_type_d.h" #include "funcapi.h" #include "miscadmin.h" +#include "nodes/queryjumble.h" #include "port/pg_bitutils.h" #include "storage/ipc.h" #include "storage/latch.h" @@ -1011,6 +1012,23 @@ assign_wait_event_capture(int newval, void *extra) */ if (newval != WAIT_EVENT_CAPTURE_TRACE && my_wait_event_trace != NULL) wait_event_trace_release_slot(my_trace_proc_number); + + /* + * Trace-level query attribution needs a non-zero query_id (from + * compute_query_id) and the activity reporting that drives the markers + * (track_activities). Warn -- but never error -- if either is missing; + * assign hooks must not ereport(ERROR). Trace still records wait events; + * only the query markers are affected. + */ + if (newval == WAIT_EVENT_CAPTURE_TRACE && !pgstat_track_activities) + ereport(WARNING, + (errmsg("query attribution at \"wait_event_capture\" = \"trace\" requires \"track_activities\" to be enabled"))); + + if (newval == WAIT_EVENT_CAPTURE_TRACE && + compute_query_id == COMPUTE_QUERY_ID_OFF) + ereport(WARNING, + (errmsg("query attribution at \"wait_event_capture\" = \"trace\" requires \"compute_query_id\" to be enabled"), + errhint("Set \"compute_query_id\" to \"on\" or \"auto\", or load a module that enables it."))); } /* ================= Trace-level ring machinery ================= */ diff --git a/src/test/regress/expected/wait_event_timing.out b/src/test/regress/expected/wait_event_timing.out index 9fc9b115e79..8e94a4ce6c5 100644 --- a/src/test/regress/expected/wait_event_timing.out +++ b/src/test/regress/expected/wait_event_timing.out @@ -4,7 +4,7 @@ -- Exercises the wait_event_capture instrumentation: the GUC, the stats -- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing -- and histogram-buckets views, overflow counters, resets), and the trace --- surface (the per-session ring and its readers). +-- surface (the per-session ring, its readers, and the query markers). -- -- Two expected outputs are maintained: -- wait_event_timing.out -- --enable-wait-event-timing builds @@ -139,6 +139,7 @@ RESET wait_event_capture; -- (In a stub build SET trace errors and the trace readers stay empty; -- that is the documented difference between the two expected files.) -- +SET compute_query_id = on; SET wait_event_capture = trace; SELECT pg_sleep(0.1); pg_sleep @@ -165,6 +166,17 @@ WHERE wait_event = 'PgSleep'; t (1 row) +-- Executor markers bracket the work and carry a non-zero query_id. +SELECT count(*) >= 1 AS has_exec_markers, + coalesce(bool_and(query_id <> 0), true) AS markers_have_query_id +FROM pg_get_backend_wait_event_trace() +WHERE wait_event_type = 'Query' + AND wait_event IN ('ExecStart', 'ExecEnd'); + has_exec_markers | markers_have_query_id +------------------+----------------------- + t | t +(1 row) + -- Clearing orphaned rings is a no-op here (no orphans) but must succeed. SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; clear_orphans_ok @@ -172,4 +184,5 @@ SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; t (1 row) +RESET compute_query_id; RESET wait_event_capture; diff --git a/src/test/regress/expected/wait_event_timing_1.out b/src/test/regress/expected/wait_event_timing_1.out index f8b3beec02f..31ccd005ff7 100644 --- a/src/test/regress/expected/wait_event_timing_1.out +++ b/src/test/regress/expected/wait_event_timing_1.out @@ -4,7 +4,7 @@ -- Exercises the wait_event_capture instrumentation: the GUC, the stats -- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing -- and histogram-buckets views, overflow counters, resets), and the trace --- surface (the per-session ring and its readers). +-- surface (the per-session ring, its readers, and the query markers). -- -- Two expected outputs are maintained: -- wait_event_timing.out -- --enable-wait-event-timing builds @@ -129,6 +129,7 @@ RESET wait_event_capture; -- (In a stub build SET trace errors and the trace readers stay empty; -- that is the documented difference between the two expected files.) -- +SET compute_query_id = on; SET wait_event_capture = trace; ERROR: invalid value for parameter "wait_event_capture": "trace" DETAIL: This build does not support wait event capture. @@ -158,6 +159,17 @@ WHERE wait_event = 'PgSleep'; f (1 row) +-- Executor markers bracket the work and carry a non-zero query_id. +SELECT count(*) >= 1 AS has_exec_markers, + coalesce(bool_and(query_id <> 0), true) AS markers_have_query_id +FROM pg_get_backend_wait_event_trace() +WHERE wait_event_type = 'Query' + AND wait_event IN ('ExecStart', 'ExecEnd'); + has_exec_markers | markers_have_query_id +------------------+----------------------- + f | t +(1 row) + -- Clearing orphaned rings is a no-op here (no orphans) but must succeed. SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; clear_orphans_ok @@ -165,4 +177,5 @@ SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; t (1 row) +RESET compute_query_id; RESET wait_event_capture; diff --git a/src/test/regress/sql/wait_event_timing.sql b/src/test/regress/sql/wait_event_timing.sql index 49206afef66..259ffb5f2d9 100644 --- a/src/test/regress/sql/wait_event_timing.sql +++ b/src/test/regress/sql/wait_event_timing.sql @@ -4,7 +4,7 @@ -- Exercises the wait_event_capture instrumentation: the GUC, the stats -- surface (pg_stat_get_wait_event_timing(), the pg_stat_wait_event_timing -- and histogram-buckets views, overflow counters, resets), and the trace --- surface (the per-session ring and its readers). +-- surface (the per-session ring, its readers, and the query markers). -- -- Two expected outputs are maintained: -- wait_event_timing.out -- --enable-wait-event-timing builds @@ -84,6 +84,7 @@ RESET wait_event_capture; -- (In a stub build SET trace errors and the trace readers stay empty; -- that is the documented difference between the two expected files.) -- +SET compute_query_id = on; SET wait_event_capture = trace; SELECT pg_sleep(0.1); @@ -98,7 +99,15 @@ SELECT count(*) >= 1 AS view_has_pgsleep FROM pg_backend_wait_event_trace WHERE wait_event = 'PgSleep'; +-- Executor markers bracket the work and carry a non-zero query_id. +SELECT count(*) >= 1 AS has_exec_markers, + coalesce(bool_and(query_id <> 0), true) AS markers_have_query_id +FROM pg_get_backend_wait_event_trace() +WHERE wait_event_type = 'Query' + AND wait_event IN ('ExecStart', 'ExecEnd'); + -- Clearing orphaned rings is a no-op here (no orphans) but must succeed. SELECT pg_stat_clear_orphaned_wait_event_rings() >= 0 AS clear_orphans_ok; +RESET compute_query_id; RESET wait_event_capture; -- 2.43.0