From 1d0c0bca34a646e0910e30ea281c97cc4878167d Mon Sep 17 00:00:00 2001 From: suzuki hironobu Date: Sat, 20 Dec 2025 12:52:47 +0900 Subject: [PATCH v1] Introduce ExecProcNodeInstrRowCount for reduced ANALYZE overhead This patch introduces ExecProcNodeInstrRowCount() to significantly reduce the overhead of EXPLAIN ANALYZE when only actual row counting is active (TIMING, BUFFERUSAGE, and WALUSAGE are all FALSE). It simplifies the instrumentation path by skipping InstrStartNode() and minimizing InstrStopNode() to just tuple counting and setting the 'running' flag. This non-intrusive approach reduces the original overhead from 26.60% to 8.13% without causing performance degradation for normal queries --- src/backend/executor/execProcnode.c | 39 +++++++++- src/backend/executor/instrument.c | 101 ------------------------- src/include/executor/instrument.h | 113 +++++++++++++++++++++++++++- 3 files changed, 147 insertions(+), 106 deletions(-) diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index f5f9cfbeead..85ff922fd1f 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -121,6 +121,7 @@ static TupleTableSlot *ExecProcNodeFirst(PlanState *node); static TupleTableSlot *ExecProcNodeInstr(PlanState *node); +static TupleTableSlot *ExecProcNodeInstrRowCount(PlanState *node); static bool ExecShutdownNode_walker(PlanState *node, void *context); @@ -462,7 +463,17 @@ ExecProcNodeFirst(PlanState *node) * have ExecProcNode() directly call the relevant function from now on. */ if (node->instrument) - node->ExecProcNode = ExecProcNodeInstr; + { + /* + * Check if we can use the specialized row-counting path + * (ExecProcNodeInstrRowCount) to minimize overhead. + */ + if (node->instrument->need_timer || node->instrument->need_bufusage + || node->instrument->need_walusage) + node->ExecProcNode = ExecProcNodeInstr; + else + node->ExecProcNode = ExecProcNodeInstrRowCount; + } else node->ExecProcNode = node->ExecProcNodeReal; @@ -490,6 +501,32 @@ ExecProcNodeInstr(PlanState *node) } +/* + * ExecProcNodeInstrRowCount + * + * This function is used when ANALYZE is active but other instrumentation + * metrics (TIMING, BUFFERUSAGE, WALUSAGE) are disabled. + * + * It reduces overhead by skipping InstrStartNode() and only performing the + * essential row counting and 'running' flag updates normally found in InstrStopNode(). + */ +static TupleTableSlot * +ExecProcNodeInstrRowCount(PlanState *node) +{ + TupleTableSlot *result; + + result = node->ExecProcNodeReal(node); + + /* Perform the essential steps from InstrStopNode() */ + if (!TupIsNull(result)) + /* count the returned tuples */ + node->instrument->tuplecount += 1.0; + node->instrument->running = true; /* Always true */ + + return result; +} + + /* ---------------------------------------------------------------- * MultiExecProcNode * diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c index 9e11c662a7c..98b9690723e 100644 --- a/src/backend/executor/instrument.c +++ b/src/backend/executor/instrument.c @@ -63,107 +63,6 @@ InstrInit(Instrumentation *instr, int instrument_options) instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0; } -/* Entry to a plan node */ -void -InstrStartNode(Instrumentation *instr) -{ - if (instr->need_timer && - !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) - elog(ERROR, "InstrStartNode called twice in a row"); - - /* save buffer usage totals at node entry, if needed */ - if (instr->need_bufusage) - instr->bufusage_start = pgBufferUsage; - - if (instr->need_walusage) - instr->walusage_start = pgWalUsage; -} - -/* Exit from a plan node */ -void -InstrStopNode(Instrumentation *instr, double nTuples) -{ - double save_tuplecount = instr->tuplecount; - instr_time endtime; - - /* count the returned tuples */ - instr->tuplecount += nTuples; - - /* let's update the time only if the timer was requested */ - if (instr->need_timer) - { - if (INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrStopNode called without start"); - - INSTR_TIME_SET_CURRENT(endtime); - INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); - - INSTR_TIME_SET_ZERO(instr->starttime); - } - - /* Add delta of buffer usage since entry to node's totals */ - if (instr->need_bufusage) - BufferUsageAccumDiff(&instr->bufusage, - &pgBufferUsage, &instr->bufusage_start); - - if (instr->need_walusage) - WalUsageAccumDiff(&instr->walusage, - &pgWalUsage, &instr->walusage_start); - - /* Is this the first tuple of this cycle? */ - if (!instr->running) - { - instr->running = true; - instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); - } - else - { - /* - * In async mode, if the plan node hadn't emitted any tuples before, - * this might be the first tuple - */ - if (instr->async_mode && save_tuplecount < 1.0) - instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); - } -} - -/* Update tuple count */ -void -InstrUpdateTupleCount(Instrumentation *instr, double nTuples) -{ - /* count the returned tuples */ - instr->tuplecount += nTuples; -} - -/* Finish a run cycle for a plan node */ -void -InstrEndLoop(Instrumentation *instr) -{ - double totaltime; - - /* Skip if nothing has happened, or already shut down */ - if (!instr->running) - return; - - if (!INSTR_TIME_IS_ZERO(instr->starttime)) - elog(ERROR, "InstrEndLoop called on running node"); - - /* Accumulate per-cycle statistics into totals */ - totaltime = INSTR_TIME_GET_DOUBLE(instr->counter); - - instr->startup += instr->firsttuple; - instr->total += totaltime; - instr->ntuples += instr->tuplecount; - instr->nloops += 1; - - /* Reset for next cycle (if any) */ - instr->running = false; - INSTR_TIME_SET_ZERO(instr->starttime); - INSTR_TIME_SET_ZERO(instr->counter); - instr->firsttuple = 0; - instr->tuplecount = 0; -} - /* aggregate instrumentation information */ void InstrAggNode(Instrumentation *dst, Instrumentation *add) diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h index ffe470f2b84..298255e9ec5 100644 --- a/src/include/executor/instrument.h +++ b/src/include/executor/instrument.h @@ -106,10 +106,6 @@ extern PGDLLIMPORT WalUsage pgWalUsage; extern Instrumentation *InstrAlloc(int n, int instrument_options, bool async_mode); extern void InstrInit(Instrumentation *instr, int instrument_options); -extern void InstrStartNode(Instrumentation *instr); -extern void InstrStopNode(Instrumentation *instr, double nTuples); -extern void InstrUpdateTupleCount(Instrumentation *instr, double nTuples); -extern void InstrEndLoop(Instrumentation *instr); extern void InstrAggNode(Instrumentation *dst, Instrumentation *add); extern void InstrStartParallelQuery(void); extern void InstrEndParallelQuery(BufferUsage *bufusage, WalUsage *walusage); @@ -119,4 +115,113 @@ extern void BufferUsageAccumDiff(BufferUsage *dst, extern void WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub); +/* Entry to a plan node */ +static pg_attribute_always_inline void +InstrStartNode(Instrumentation *instr) +{ + if (instr->need_timer && + !INSTR_TIME_SET_CURRENT_LAZY(instr->starttime)) + elog(ERROR, "InstrStartNode called twice in a row"); + + /* save buffer usage totals at node entry, if needed */ + if (instr->need_bufusage) + instr->bufusage_start = pgBufferUsage; + + if (instr->need_walusage) + instr->walusage_start = pgWalUsage; +} + +/* Exit from a plan node */ +static pg_attribute_always_inline void +InstrStopNode(Instrumentation *instr, double nTuples) +{ + double save_tuplecount = instr->tuplecount; + instr_time endtime; + + /* count the returned tuples */ + instr->tuplecount += nTuples; + + /* let's update the time only if the timer was requested */ + if (instr->need_timer) + { + if (INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrStopNode called without start"); + + INSTR_TIME_SET_CURRENT(endtime); + INSTR_TIME_ACCUM_DIFF(instr->counter, endtime, instr->starttime); + + INSTR_TIME_SET_ZERO(instr->starttime); + } + + /* Add delta of buffer usage since entry to node's totals */ + if (instr->need_bufusage) + BufferUsageAccumDiff(&instr->bufusage, + &pgBufferUsage, &instr->bufusage_start); + + if (instr->need_walusage) + WalUsageAccumDiff(&instr->walusage, + &pgWalUsage, &instr->walusage_start); + + /* Is this the first tuple of this cycle? */ + if (!instr->running) + { + instr->running = true; + if (instr->need_timer) + instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); + } + else + { + /* + * In async mode, if the plan node hadn't emitted any tuples before, + * this might be the first tuple + */ + if (instr->need_timer && instr->async_mode && save_tuplecount < 1.0) + instr->firsttuple = INSTR_TIME_GET_DOUBLE(instr->counter); + } +} + +/* Update tuple count */ +static pg_attribute_always_inline void +InstrUpdateTupleCount(Instrumentation *instr, double nTuples) +{ + /* count the returned tuples */ + instr->tuplecount += nTuples; +} + + +/* Finish a run cycle for a plan node */ +static pg_attribute_always_inline void +InstrEndLoop(Instrumentation *instr) +{ + double totaltime; + + /* Skip if nothing has happened, or already shut down */ + if (!instr->running) + return; + + if (!INSTR_TIME_IS_ZERO(instr->starttime)) + elog(ERROR, "InstrEndLoop called on running node"); + + /* Accumulate per-cycle statistics into totals */ + if (instr->need_timer) + { + totaltime = INSTR_TIME_GET_DOUBLE(instr->counter); + + instr->startup += instr->firsttuple; + instr->total += totaltime; + } + instr->ntuples += instr->tuplecount; + instr->nloops += 1; + + /* Reset for next cycle (if any) */ + instr->running = false; + if (instr->need_timer) + { + INSTR_TIME_SET_ZERO(instr->starttime); + INSTR_TIME_SET_ZERO(instr->counter); + instr->firsttuple = 0; + } + instr->tuplecount = 0; +} + #endif /* INSTRUMENT_H */ -- 2.39.5 (Apple Git-154)