| From: | Ilmar Yunusov <tanswis42(at)gmail(dot)com> |
|---|---|
| To: | pgsql-hackers(at)postgresql(dot)org |
| Cc: | Ilmar Yunusov <tanswis42(at)gmail(dot)com> |
| Subject: | [RFC PATCH v0 3/7] Attribute EXPLAIN WAITS to plan nodes |
| Date: | 2026-05-08 23:22:33 |
| Message-ID: | cb26aa486d42f227209c27df4aef4b07f13e334a.1778280923.git.tanswis42@gmail.com |
| Views: | Whole Thread | Raw Message | Download mbox | Resend email |
| Thread: | |
| Lists: | pgsql-hackers |
---
src/backend/commands/explain.c | 6 +-
src/backend/executor/execAsync.c | 22 ++++
src/backend/executor/execParallel.c | 131 ++++++++++++++++++---
src/backend/executor/execProcnode.c | 31 ++++-
src/backend/executor/instrument.c | 7 ++
src/backend/executor/nodeBitmapAnd.c | 7 ++
src/backend/executor/nodeBitmapIndexscan.c | 7 ++
src/backend/executor/nodeBitmapOr.c | 7 ++
src/backend/executor/nodeHash.c | 7 ++
src/backend/utils/activity/wait_event.c | 49 +++++++-
src/include/executor/instrument.h | 1 +
src/include/nodes/execnodes.h | 1 +
src/include/utils/wait_event.h | 4 +
src/test/regress/expected/explain.out | 52 +++++++-
src/test/regress/sql/explain.sql | 29 +++++
15 files changed, 333 insertions(+), 28 deletions(-)
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 9d7372f5415..0575ae8368b 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -530,6 +530,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
instrument_option |= INSTRUMENT_WAL;
if (es->io)
instrument_option |= INSTRUMENT_IO;
+ if (es->waits)
+ instrument_option |= INSTRUMENT_WAITS;
/*
* We always collect timing for the entire statement, even when node-level
@@ -2332,6 +2334,8 @@ ExplainNode(PlanState *planstate, List *ancestors,
show_buffer_usage(es, &planstate->instrument->instr.bufusage);
if (es->wal && planstate->instrument)
show_wal_usage(es, &planstate->instrument->instr.walusage);
+ if (es->waits)
+ show_wait_event_usage(es, planstate->wait_event_usage);
/* Prepare per-worker buffer/WAL usage */
if (es->workers_state && (es->buffers || es->wal) && es->verbose)
@@ -4559,7 +4563,7 @@ show_wait_event_usage(ExplainState *es, const WaitEventUsage *usage)
if (usage == NULL)
return;
- if (es->format == EXPLAIN_FORMAT_TEXT && usage->nentries == 0)
+ if (usage->nentries == 0)
return;
if (usage->nentries > 0)
diff --git a/src/backend/executor/execAsync.c b/src/backend/executor/execAsync.c
index cf7ddbb01f4..e73ea55cf92 100644
--- a/src/backend/executor/execAsync.c
+++ b/src/backend/executor/execAsync.c
@@ -19,6 +19,7 @@
#include "executor/instrument.h"
#include "executor/nodeAppend.h"
#include "executor/nodeForeignscan.h"
+#include "utils/wait_event.h"
/*
* Asynchronously request a tuple from a designed async-capable node.
@@ -26,12 +27,17 @@
void
ExecAsyncRequest(AsyncRequest *areq)
{
+ WaitEventUsage *previous_wait_event_usage = NULL;
+
if (areq->requestee->chgParam != NULL) /* something changed? */
ExecReScan(areq->requestee); /* let ReScan handle this */
/* must provide our own instrumentation support */
if (areq->requestee->instrument)
InstrStartNode(areq->requestee->instrument);
+ if (areq->requestee->wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(areq->requestee->wait_event_usage);
switch (nodeTag(areq->requestee))
{
@@ -47,6 +53,8 @@ ExecAsyncRequest(AsyncRequest *areq)
ExecAsyncResponse(areq);
/* must provide our own instrumentation support */
+ if (areq->requestee->wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
if (areq->requestee->instrument)
InstrStopNode(areq->requestee->instrument,
TupIsNull(areq->result) ? 0.0 : 1.0);
@@ -62,9 +70,14 @@ ExecAsyncRequest(AsyncRequest *areq)
void
ExecAsyncConfigureWait(AsyncRequest *areq)
{
+ WaitEventUsage *previous_wait_event_usage = NULL;
+
/* must provide our own instrumentation support */
if (areq->requestee->instrument)
InstrStartNode(areq->requestee->instrument);
+ if (areq->requestee->wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(areq->requestee->wait_event_usage);
switch (nodeTag(areq->requestee))
{
@@ -78,6 +91,8 @@ ExecAsyncConfigureWait(AsyncRequest *areq)
}
/* must provide our own instrumentation support */
+ if (areq->requestee->wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
if (areq->requestee->instrument)
InstrStopNode(areq->requestee->instrument, 0.0);
}
@@ -88,9 +103,14 @@ ExecAsyncConfigureWait(AsyncRequest *areq)
void
ExecAsyncNotify(AsyncRequest *areq)
{
+ WaitEventUsage *previous_wait_event_usage = NULL;
+
/* must provide our own instrumentation support */
if (areq->requestee->instrument)
InstrStartNode(areq->requestee->instrument);
+ if (areq->requestee->wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(areq->requestee->wait_event_usage);
switch (nodeTag(areq->requestee))
{
@@ -106,6 +126,8 @@ ExecAsyncNotify(AsyncRequest *areq)
ExecAsyncResponse(areq);
/* must provide our own instrumentation support */
+ if (areq->requestee->wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
if (areq->requestee->instrument)
InstrStopNode(areq->requestee->instrument,
TupIsNull(areq->result) ? 0.0 : 1.0);
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 8213565a708..f77f539dd27 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -83,6 +83,12 @@ typedef struct FixedParallelExecutorState
int jit_flags;
} FixedParallelExecutorState;
+typedef struct SharedWaitEventUsageWorker
+{
+ int nentries;
+ dsa_pointer entries;
+} SharedWaitEventUsageWorker;
+
/*
* DSM structure for accumulating per-PlanState instrumentation.
*
@@ -92,6 +98,10 @@ typedef struct FixedParallelExecutorState
* of the first NodeInstrumentation object. This will depend on the length of
* the plan_node_id array.
*
+ * wait_event_usage_offset: Offset, relative to the start of this structure,
+ * of the first SharedWaitEventUsageWorker object, or 0 if wait event usage is
+ * not being collected.
+ *
* num_workers: Number of workers.
*
* num_plan_nodes: Number of plan nodes.
@@ -103,6 +113,7 @@ struct SharedExecutorInstrumentation
{
int instrument_options;
int instrument_offset;
+ int wait_event_usage_offset;
int num_workers;
int num_plan_nodes;
int plan_node_id[FLEXIBLE_ARRAY_MEMBER];
@@ -110,17 +121,17 @@ struct SharedExecutorInstrumentation
/*
* Array of num_plan_nodes * num_workers NodeInstrumentation objects
* follows.
+ *
+ * If wait_event_usage_offset is non-zero, an array of num_plan_nodes *
+ * num_workers SharedWaitEventUsageWorker objects follows.
*/
};
#define GetInstrumentationArray(sei) \
(StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
(NodeInstrumentation *) (((char *) sei) + sei->instrument_offset))
-
-typedef struct SharedWaitEventUsageWorker
-{
- int nentries;
- dsa_pointer entries;
-} SharedWaitEventUsageWorker;
+#define GetInstrumentationWaitEventUsageArray(sei) \
+ (StaticAssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
+ (SharedWaitEventUsageWorker *) (((char *) sei) + sei->wait_event_usage_offset))
struct SharedWaitEventUsage
{
@@ -143,6 +154,12 @@ typedef struct ExecParallelInitializeDSMContext
int nnodes;
} ExecParallelInitializeDSMContext;
+typedef struct ExecParallelRetrieveInstrumentationContext
+{
+ SharedExecutorInstrumentation *instrumentation;
+ dsa_area *area;
+} ExecParallelRetrieveInstrumentationContext;
+
/* Helper functions that run in the parallel leader. */
static char *ExecSerializePlan(Plan *plan, EState *estate);
static bool ExecParallelEstimate(PlanState *planstate,
@@ -154,8 +171,11 @@ static shm_mq_handle **ExecParallelSetupTupleQueues(ParallelContext *pcxt,
static bool ExecParallelReInitializeDSM(PlanState *planstate,
ParallelContext *pcxt);
static bool ExecParallelRetrieveInstrumentation(PlanState *planstate,
- SharedExecutorInstrumentation *instrumentation);
+ ExecParallelRetrieveInstrumentationContext *r);
static void ExecParallelRetrieveWaitEventUsage(ParallelExecutorInfo *pei);
+static void ExecParallelReportWaitEventUsageWorker(SharedWaitEventUsageWorker *worker,
+ dsa_area *area,
+ const WaitEventUsage *usage);
static void ExecParallelReportWaitEventUsage(SharedWaitEventUsage *shared,
dsa_area *area,
const WaitEventUsage *usage);
@@ -691,6 +711,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
int instrumentation_len = 0;
int jit_instrumentation_len = 0;
int instrument_offset = 0;
+ int wait_event_usage_offset = 0;
Size dsa_minsize = dsa_minimum_size();
char *query_string;
int query_len;
@@ -798,6 +819,14 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation_len +=
mul_size(sizeof(NodeInstrumentation),
mul_size(e.nnodes, nworkers));
+ if (estate->es_instrument & INSTRUMENT_WAITS)
+ {
+ instrumentation_len = MAXALIGN(instrumentation_len);
+ wait_event_usage_offset = instrumentation_len;
+ instrumentation_len +=
+ mul_size(sizeof(SharedWaitEventUsageWorker),
+ mul_size(e.nnodes, nworkers));
+ }
shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
shm_toc_estimate_keys(&pcxt->estimator, 1);
@@ -903,11 +932,23 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate,
instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
instrumentation->instrument_options = estate->es_instrument;
instrumentation->instrument_offset = instrument_offset;
+ instrumentation->wait_event_usage_offset = wait_event_usage_offset;
instrumentation->num_workers = nworkers;
instrumentation->num_plan_nodes = e.nnodes;
instrument = GetInstrumentationArray(instrumentation);
for (i = 0; i < nworkers * e.nnodes; ++i)
InstrInitNode(&instrument[i], estate->es_instrument, false);
+ if (wait_event_usage_offset != 0)
+ {
+ SharedWaitEventUsageWorker *worker_usage;
+
+ worker_usage = GetInstrumentationWaitEventUsageArray(instrumentation);
+ for (i = 0; i < nworkers * e.nnodes; ++i)
+ {
+ worker_usage[i].nentries = 0;
+ worker_usage[i].entries = InvalidDsaPointer;
+ }
+ }
shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
instrumentation);
pei->instrumentation = instrumentation;
@@ -1137,9 +1178,11 @@ ExecParallelReInitializeDSM(PlanState *planstate,
*/
static bool
ExecParallelRetrieveInstrumentation(PlanState *planstate,
- SharedExecutorInstrumentation *instrumentation)
+ ExecParallelRetrieveInstrumentationContext *r)
{
+ SharedExecutorInstrumentation *instrumentation = r->instrumentation;
NodeInstrumentation *instrument;
+ SharedWaitEventUsageWorker *wait_event_usage = NULL;
int i;
int n;
int ibytes;
@@ -1159,6 +1202,30 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
for (n = 0; n < instrumentation->num_workers; ++n)
InstrAggNode(planstate->instrument, &instrument[n]);
+ /* Accumulate the wait event usage from all workers. */
+ if (instrumentation->wait_event_usage_offset != 0 &&
+ planstate->wait_event_usage != NULL)
+ {
+ wait_event_usage = GetInstrumentationWaitEventUsageArray(instrumentation);
+ wait_event_usage += i * instrumentation->num_workers;
+ for (n = 0; n < instrumentation->num_workers; ++n)
+ {
+ SharedWaitEventUsageWorker *worker = &wait_event_usage[n];
+ WaitEventUsageEntry *entries;
+
+ if (worker->nentries <= 0 || !DsaPointerIsValid(worker->entries))
+ continue;
+
+ entries = dsa_get_address(r->area, worker->entries);
+ pgstat_accumulate_wait_event_usage(planstate->wait_event_usage,
+ entries,
+ worker->nentries);
+ dsa_free(r->area, worker->entries);
+ worker->nentries = 0;
+ worker->entries = InvalidDsaPointer;
+ }
+ }
+
/*
* Also store the per-worker detail.
*
@@ -1216,7 +1283,7 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
}
return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation,
- instrumentation);
+ r);
}
/*
@@ -1290,25 +1357,21 @@ ExecParallelRetrieveWaitEventUsage(ParallelExecutorInfo *pei)
}
static void
-ExecParallelReportWaitEventUsage(SharedWaitEventUsage *shared,
- dsa_area *area,
- const WaitEventUsage *usage)
+ExecParallelReportWaitEventUsageWorker(SharedWaitEventUsageWorker *worker,
+ dsa_area *area,
+ const WaitEventUsage *usage)
{
- SharedWaitEventUsageWorker *worker;
WaitEventUsageEntry *entries;
dsa_pointer entries_dsa;
Size entries_size;
- Assert(shared != NULL);
+ Assert(worker != NULL);
Assert(area != NULL);
Assert(usage != NULL);
- Assert(IsParallelWorker());
- Assert(ParallelWorkerNumber < shared->num_workers);
if (usage->nentries <= 0)
return;
- worker = &shared->worker_usage[ParallelWorkerNumber];
entries_size = mul_size(sizeof(WaitEventUsageEntry), usage->nentries);
entries_dsa = dsa_allocate(area, entries_size);
entries = dsa_get_address(area, entries_dsa);
@@ -1320,6 +1383,20 @@ ExecParallelReportWaitEventUsage(SharedWaitEventUsage *shared,
worker->entries = entries_dsa;
}
+static void
+ExecParallelReportWaitEventUsage(SharedWaitEventUsage *shared,
+ dsa_area *area,
+ const WaitEventUsage *usage)
+{
+ Assert(shared != NULL);
+ Assert(IsParallelWorker());
+ Assert(ParallelWorkerNumber < shared->num_workers);
+
+ ExecParallelReportWaitEventUsageWorker(&shared->worker_usage[ParallelWorkerNumber],
+ area,
+ usage);
+}
+
/*
* Finish parallel execution. We wait for parallel workers to finish, and
* accumulate their buffer/WAL usage.
@@ -1385,8 +1462,13 @@ ExecParallelCleanup(ParallelExecutorInfo *pei)
{
/* Accumulate instrumentation, if any. */
if (pei->instrumentation)
- ExecParallelRetrieveInstrumentation(pei->planstate,
- pei->instrumentation);
+ {
+ ExecParallelRetrieveInstrumentationContext r;
+
+ r.instrumentation = pei->instrumentation;
+ r.area = pei->area;
+ ExecParallelRetrieveInstrumentation(pei->planstate, &r);
+ }
/* Accumulate JIT instrumentation, if any. */
if (pei->jit_instrumentation)
@@ -1495,6 +1577,17 @@ ExecParallelReportInstrumentation(PlanState *planstate,
Assert(IsParallelWorker());
Assert(ParallelWorkerNumber < instrumentation->num_workers);
InstrAggNode(&instrument[ParallelWorkerNumber], planstate->instrument);
+ if (instrumentation->wait_event_usage_offset != 0 &&
+ planstate->wait_event_usage != NULL)
+ {
+ SharedWaitEventUsageWorker *wait_event_usage;
+
+ wait_event_usage = GetInstrumentationWaitEventUsageArray(instrumentation);
+ wait_event_usage += i * instrumentation->num_workers;
+ ExecParallelReportWaitEventUsageWorker(&wait_event_usage[ParallelWorkerNumber],
+ planstate->state->es_query_dsa,
+ planstate->wait_event_usage);
+ }
return planstate_tree_walker(planstate, ExecParallelReportInstrumentation,
instrumentation);
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 7c4c66e323f..081855b3fed 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -118,6 +118,7 @@
#include "executor/nodeWindowAgg.h"
#include "executor/nodeWorktablescan.h"
#include "miscadmin.h"
+#include "utils/wait_event.h"
#include "nodes/nodeFuncs.h"
static TupleTableSlot *ExecProcNodeFirst(PlanState *node);
@@ -415,6 +416,16 @@ ExecInitNode(Plan *node, EState *estate, int eflags)
if (estate->es_instrument)
result->instrument = InstrAllocNode(estate->es_instrument,
result->async_capable);
+ if (estate->es_instrument & INSTRUMENT_WAITS)
+ {
+ MemoryContext oldcontext;
+
+ oldcontext = MemoryContextSwitchTo(estate->es_query_cxt);
+ result->wait_event_usage = palloc_object(WaitEventUsage);
+ pgstat_init_wait_event_usage(result->wait_event_usage,
+ estate->es_query_cxt);
+ MemoryContextSwitchTo(oldcontext);
+ }
return result;
}
@@ -481,7 +492,8 @@ ExecProcNodeFirst(PlanState *node)
* This has essentially the same responsibilities as ExecProcNode,
* but it does not do InstrStartNode/InstrStopNode (mainly because
* it can't tell how many returned tuples to count). Each per-node
- * function must provide its own instrumentation support.
+ * function must provide its own instrumentation support, including
+ * wait event attribution when enabled.
* ----------------------------------------------------------------
*/
Node *
@@ -758,6 +770,9 @@ ExecShutdownNode(PlanState *node)
static bool
ExecShutdownNode_walker(PlanState *node, void *context)
{
+ bool node_running;
+ WaitEventUsage *previous_wait_event_usage = NULL;
+
if (node == NULL)
return false;
@@ -773,8 +788,14 @@ ExecShutdownNode_walker(PlanState *node, void *context)
* has never been executed, so as to avoid incorrectly making it appear
* that it has.
*/
- if (node->instrument && node->instrument->running)
+ node_running = node->instrument && node->instrument->running;
+ if (node_running)
+ {
InstrStartNode(node->instrument);
+ if (node->wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(node->wait_event_usage);
+ }
planstate_tree_walker(node, ExecShutdownNode_walker, context);
@@ -803,8 +824,12 @@ ExecShutdownNode_walker(PlanState *node, void *context)
}
/* Stop the node if we started it above, reporting 0 tuples. */
- if (node->instrument && node->instrument->running)
+ if (node_running)
+ {
+ if (node->wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
InstrStopNode(node->instrument, 0);
+ }
return false;
}
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index ffbcd572133..a1f22180f82 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -21,6 +21,7 @@
#include "nodes/execnodes.h"
#include "portability/instr_time.h"
#include "utils/guc_hooks.h"
+#include "utils/wait_event.h"
BufferUsage pgBufferUsage;
static BufferUsage save_pgBufferUsage;
@@ -181,11 +182,17 @@ TupleTableSlot *
ExecProcNodeInstr(PlanState *node)
{
TupleTableSlot *result;
+ WaitEventUsage *previous_wait_event_usage = NULL;
InstrStartNode(node->instrument);
+ if (node->wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(node->wait_event_usage);
result = node->ExecProcNodeReal(node);
+ if (node->wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
return result;
diff --git a/src/backend/executor/nodeBitmapAnd.c b/src/backend/executor/nodeBitmapAnd.c
index 9007dda3802..e2b442bd378 100644
--- a/src/backend/executor/nodeBitmapAnd.c
+++ b/src/backend/executor/nodeBitmapAnd.c
@@ -32,6 +32,7 @@
#include "executor/instrument.h"
#include "executor/nodeBitmapAnd.h"
#include "nodes/tidbitmap.h"
+#include "utils/wait_event.h"
/* ----------------------------------------------------------------
@@ -115,10 +116,14 @@ MultiExecBitmapAnd(BitmapAndState *node)
int nplans;
int i;
TIDBitmap *result = NULL;
+ WaitEventUsage *previous_wait_event_usage = NULL;
/* must provide our own instrumentation support */
if (node->ps.instrument)
InstrStartNode(node->ps.instrument);
+ if (node->ps.wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(node->ps.wait_event_usage);
/*
* get information from the node
@@ -162,6 +167,8 @@ MultiExecBitmapAnd(BitmapAndState *node)
elog(ERROR, "BitmapAnd doesn't support zero inputs");
/* must provide our own instrumentation support */
+ if (node->ps.wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
if (node->ps.instrument)
InstrStopNode(node->ps.instrument, 0 /* XXX */ );
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 7978514e1bc..c2798a81e97 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -28,6 +28,7 @@
#include "executor/nodeIndexscan.h"
#include "miscadmin.h"
#include "nodes/tidbitmap.h"
+#include "utils/wait_event.h"
/* ----------------------------------------------------------------
@@ -54,10 +55,14 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
IndexScanDesc scandesc;
double nTuples = 0;
bool doscan;
+ WaitEventUsage *previous_wait_event_usage = NULL;
/* must provide our own instrumentation support */
if (node->ss.ps.instrument)
InstrStartNode(node->ss.ps.instrument);
+ if (node->ss.ps.wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(node->ss.ps.wait_event_usage);
/*
* extract necessary information from index scan node
@@ -116,6 +121,8 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
}
/* must provide our own instrumentation support */
+ if (node->ss.ps.wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
if (node->ss.ps.instrument)
InstrStopNode(node->ss.ps.instrument, nTuples);
diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c
index 148c80fdae6..e9ee934a6f6 100644
--- a/src/backend/executor/nodeBitmapOr.c
+++ b/src/backend/executor/nodeBitmapOr.c
@@ -33,6 +33,7 @@
#include "executor/nodeBitmapOr.h"
#include "nodes/tidbitmap.h"
#include "miscadmin.h"
+#include "utils/wait_event.h"
/* ----------------------------------------------------------------
@@ -116,10 +117,14 @@ MultiExecBitmapOr(BitmapOrState *node)
int nplans;
int i;
TIDBitmap *result = NULL;
+ WaitEventUsage *previous_wait_event_usage = NULL;
/* must provide our own instrumentation support */
if (node->ps.instrument)
InstrStartNode(node->ps.instrument);
+ if (node->ps.wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(node->ps.wait_event_usage);
/*
* get information from the node
@@ -180,6 +185,8 @@ MultiExecBitmapOr(BitmapOrState *node)
elog(ERROR, "BitmapOr doesn't support zero inputs");
/* must provide our own instrumentation support */
+ if (node->ps.wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
if (node->ps.instrument)
InstrStopNode(node->ps.instrument, 0 /* XXX */ );
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 8825bb6fa23..6e9e4b4218f 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -105,9 +105,14 @@ ExecHash(PlanState *pstate)
Node *
MultiExecHash(HashState *node)
{
+ WaitEventUsage *previous_wait_event_usage = NULL;
+
/* must provide our own instrumentation support */
if (node->ps.instrument)
InstrStartNode(node->ps.instrument);
+ if (node->ps.wait_event_usage)
+ previous_wait_event_usage =
+ pgstat_enter_wait_event_usage(node->ps.wait_event_usage);
if (node->parallel_state != NULL)
MultiExecParallelHash(node);
@@ -115,6 +120,8 @@ MultiExecHash(HashState *node)
MultiExecPrivateHash(node);
/* must provide our own instrumentation support */
+ if (node->ps.wait_event_usage)
+ pgstat_restore_wait_event_usage(previous_wait_event_usage);
if (node->ps.instrument)
InstrStopNode(node->ps.instrument, node->hashtable->reportTuples);
diff --git a/src/backend/utils/activity/wait_event.c b/src/backend/utils/activity/wait_event.c
index eb01bc3d88c..3d5a10b44c4 100644
--- a/src/backend/utils/activity/wait_event.c
+++ b/src/backend/utils/activity/wait_event.c
@@ -27,6 +27,7 @@
#include "storage/shmem.h"
#include "storage/subsystems.h"
#include "storage/spin.h"
+#include "utils/memutils.h"
#include "utils/wait_event.h"
@@ -47,6 +48,7 @@ uint32 *my_wait_event_info = &local_my_wait_event_info;
int pgstat_wait_event_usage_depth = 0;
static WaitEventUsage *pgstat_wait_event_usage = NULL;
+static WaitEventUsage *pgstat_wait_event_node_usage = NULL;
static uint32 pgstat_wait_event_usage_current = 0;
static instr_time pgstat_wait_event_usage_start;
@@ -358,6 +360,28 @@ pgstat_reset_wait_event_storage(void)
my_wait_event_info = &local_my_wait_event_info;
}
+/*
+ * Initialize a wait event usage accumulator.
+ */
+void
+pgstat_init_wait_event_usage(WaitEventUsage *usage, MemoryContext memcontext)
+{
+ Assert(usage != NULL);
+ Assert(memcontext != NULL);
+
+ memset(usage, 0, sizeof(WaitEventUsage));
+
+ /*
+ * Wait events may end inside critical sections, for example while
+ * performing synchronous I/O. Keep usage entries in a dedicated context
+ * where the memory manager permits that accounting path to grow.
+ */
+ usage->memcontext = AllocSetContextCreate(memcontext,
+ "Wait Event Usage",
+ ALLOCSET_SMALL_SIZES);
+ MemoryContextAllowInCriticalSection(usage->memcontext, true);
+}
+
/*
* Start collecting exact wait event timings in this backend.
*
@@ -375,9 +399,9 @@ pgstat_begin_wait_event_usage(WaitEventUsage *usage, MemoryContext memcontext)
if (pgstat_wait_event_usage_depth++ == 0)
{
- memset(usage, 0, sizeof(WaitEventUsage));
- usage->memcontext = memcontext;
+ pgstat_init_wait_event_usage(usage, memcontext);
pgstat_wait_event_usage = usage;
+ pgstat_wait_event_node_usage = NULL;
pgstat_wait_event_usage_current = 0;
INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
}
@@ -398,11 +422,27 @@ pgstat_end_wait_event_usage(WaitEventUsage *usage)
pgstat_count_wait_event_end();
pgstat_wait_event_usage = NULL;
+ pgstat_wait_event_node_usage = NULL;
pgstat_wait_event_usage_current = 0;
INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
}
}
+WaitEventUsage *
+pgstat_enter_wait_event_usage(WaitEventUsage *usage)
+{
+ WaitEventUsage *previous = pgstat_wait_event_node_usage;
+
+ pgstat_wait_event_node_usage = usage;
+ return previous;
+}
+
+void
+pgstat_restore_wait_event_usage(WaitEventUsage *usage)
+{
+ pgstat_wait_event_node_usage = usage;
+}
+
/*
* Record the beginning of a wait event for exact EXPLAIN-style accounting.
*/
@@ -444,6 +484,11 @@ pgstat_count_wait_event_end(void)
pgstat_wait_event_usage_current,
1,
&elapsed);
+ if (pgstat_wait_event_node_usage != NULL)
+ WaitEventUsageAdd(pgstat_wait_event_node_usage,
+ pgstat_wait_event_usage_current,
+ 1,
+ &elapsed);
pgstat_wait_event_usage_current = 0;
INSTR_TIME_SET_ZERO(pgstat_wait_event_usage_start);
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index f093a52aae0..f46ee03d2c1 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -65,6 +65,7 @@ typedef enum InstrumentOption
INSTRUMENT_ROWS = 1 << 2, /* needs row count */
INSTRUMENT_WAL = 1 << 3, /* needs WAL usage */
INSTRUMENT_IO = 1 << 4, /* needs IO usage */
+ INSTRUMENT_WAITS = 1 << 5, /* needs wait event usage */
INSTRUMENT_ALL = PG_INT32_MAX
} InstrumentOption;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 781c8c79132..56f9dcbef88 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1212,6 +1212,7 @@ typedef struct PlanState
NodeInstrumentation *instrument; /* Optional runtime stats for this
* node */
+ WaitEventUsage *wait_event_usage; /* Optional wait event usage */
WorkerNodeInstrumentation *worker_instrument; /* per-worker
* instrumentation */
diff --git a/src/include/utils/wait_event.h b/src/include/utils/wait_event.h
index 63992137ee7..64411288403 100644
--- a/src/include/utils/wait_event.h
+++ b/src/include/utils/wait_event.h
@@ -36,12 +36,16 @@ static inline void pgstat_report_wait_start(uint32 wait_event_info);
static inline void pgstat_report_wait_end(void);
extern void pgstat_set_wait_event_storage(uint32 *wait_event_info);
extern void pgstat_reset_wait_event_storage(void);
+extern void pgstat_init_wait_event_usage(WaitEventUsage *usage,
+ MemoryContext memcontext);
extern void pgstat_begin_wait_event_usage(WaitEventUsage *usage,
MemoryContext memcontext);
extern void pgstat_end_wait_event_usage(WaitEventUsage *usage);
extern void pgstat_accumulate_wait_event_usage(WaitEventUsage *usage,
const WaitEventUsageEntry *entries,
int nentries);
+extern WaitEventUsage *pgstat_enter_wait_event_usage(WaitEventUsage *usage);
+extern void pgstat_restore_wait_event_usage(WaitEventUsage *usage);
extern void pgstat_count_wait_event_start(uint32 wait_event_info);
extern void pgstat_count_wait_event_end(void);
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index e3847e222be..c086fa48d80 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -101,12 +101,14 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8'
-- WAITS option
select explain_filter('explain (analyze, waits, costs off, summary off, timing off, buffers off) select pg_sleep(0.01)');
- explain_filter
----------------------------------------
+ explain_filter
+-----------------------------------------
Result (actual rows=N.N loops=N)
+ Wait Events:
+ Timeout:PgSleep calls=N time=N.N ms
Wait Events:
Timeout:PgSleep calls=N time=N.N ms
-(3 rows)
+(5 rows)
select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Wait Events,0}';
?column?
@@ -114,6 +116,12 @@ select explain_filter_to_json('explain (analyze, waits, costs off, summary off,
{"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
(1 row)
+select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Plan,Wait Events,0}';
+ ?column?
+----------------------------------------------------------------------------------
+ {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
+(1 row)
+
begin;
create function pg_temp.parallel_pg_sleep(float8) returns void
language internal volatile parallel safe as 'pg_sleep';
@@ -130,6 +138,44 @@ select jsonb_path_query_first(
{"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
(1 row)
+select jsonb_path_query_first(
+ explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json)
+ select pg_temp.parallel_pg_sleep(0.01)
+ from tenk1 where unique1 = 1') #> '{0,Plan}',
+ '$.**."Wait Events"[*] ? (@."Wait Event" == "PgSleep")'
+);
+ jsonb_path_query_first
+----------------------------------------------------------------------------------
+ {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
+(1 row)
+
+rollback;
+begin;
+-- This test deliberately creates a Bitmap Index Scan runtime-key wait.
+-- The STABLE PL/pgSQL wrapper is test scaffolding: STABLE lets the
+-- expression be used as an index runtime key, while PL/pgSQL prevents SQL
+-- inlining from moving pg_sleep() out of the Bitmap Index Scan boundary.
+-- The planner GUCs below are likewise test-only scaffolding to make the
+-- node shape deterministic.
+create function pg_temp.explain_waits_sleep_int(int) returns int
+ language plpgsql stable as $$begin perform pg_sleep(0.01); return $1; end$$;
+create temp table explain_waits_bitmap (a int);
+insert into explain_waits_bitmap select g from generate_series(1, 10) g;
+create index explain_waits_bitmap_a_idx on explain_waits_bitmap(a);
+analyze explain_waits_bitmap;
+set local enable_seqscan = off;
+set local enable_indexscan = off;
+select jsonb_path_query_first(
+ explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json)
+ select * from explain_waits_bitmap
+ where a = pg_temp.explain_waits_sleep_int(1)') #> '{0,Plan}',
+ '$.** ? (@."Node Type" == "Bitmap Index Scan")."Wait Events"[*] ? (@."Wait Event" == "PgSleep")'
+);
+ jsonb_path_query_first
+----------------------------------------------------------------------------------
+ {"Time": 0.0, "Calls": 0, "Wait Event": "PgSleep", "Wait Event Type": "Timeout"}
+(1 row)
+
rollback;
explain (waits) select 1;
ERROR: EXPLAIN option WAITS requires ANALYZE
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index 8821250bcef..bd196f248db 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -71,6 +71,7 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8'
-- WAITS option
select explain_filter('explain (analyze, waits, costs off, summary off, timing off, buffers off) select pg_sleep(0.01)');
select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Wait Events,0}';
+select explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json) select pg_sleep(0.01)') #> '{0,Plan,Wait Events,0}';
begin;
create function pg_temp.parallel_pg_sleep(float8) returns void
language internal volatile parallel safe as 'pg_sleep';
@@ -82,6 +83,34 @@ select jsonb_path_query_first(
from tenk1 where unique1 = 1') #> '{0,Wait Events}',
'$[*] ? (@."Wait Event" == "PgSleep")'
);
+select jsonb_path_query_first(
+ explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json)
+ select pg_temp.parallel_pg_sleep(0.01)
+ from tenk1 where unique1 = 1') #> '{0,Plan}',
+ '$.**."Wait Events"[*] ? (@."Wait Event" == "PgSleep")'
+);
+rollback;
+begin;
+-- This test deliberately creates a Bitmap Index Scan runtime-key wait.
+-- The STABLE PL/pgSQL wrapper is test scaffolding: STABLE lets the
+-- expression be used as an index runtime key, while PL/pgSQL prevents SQL
+-- inlining from moving pg_sleep() out of the Bitmap Index Scan boundary.
+-- The planner GUCs below are likewise test-only scaffolding to make the
+-- node shape deterministic.
+create function pg_temp.explain_waits_sleep_int(int) returns int
+ language plpgsql stable as $$begin perform pg_sleep(0.01); return $1; end$$;
+create temp table explain_waits_bitmap (a int);
+insert into explain_waits_bitmap select g from generate_series(1, 10) g;
+create index explain_waits_bitmap_a_idx on explain_waits_bitmap(a);
+analyze explain_waits_bitmap;
+set local enable_seqscan = off;
+set local enable_indexscan = off;
+select jsonb_path_query_first(
+ explain_filter_to_json('explain (analyze, waits, costs off, summary off, timing off, buffers off, format json)
+ select * from explain_waits_bitmap
+ where a = pg_temp.explain_waits_sleep_int(1)') #> '{0,Plan}',
+ '$.** ? (@."Node Type" == "Bitmap Index Scan")."Wait Events"[*] ? (@."Wait Event" == "PgSleep")'
+);
rollback;
explain (waits) select 1;
--
2.52.0
| From | Date | Subject | |
|---|---|---|---|
| Next Message | Ilmar Yunusov | 2026-05-08 23:22:34 | [RFC PATCH v0 4/7] Refine EXPLAIN WAITS attribution semantics |
| Previous Message | Ilmar Yunusov | 2026-05-08 23:22:32 | [RFC PATCH v0 2/7] Aggregate EXPLAIN WAITS from parallel workers |