From f2f6ef266bdaa2670f08e061bf6a94f121be76d5 Mon Sep 17 00:00:00 2001
From: Lukas Fittl <lukas@fittl.com>
Date: Tue, 7 Apr 2026 12:32:36 -0700
Subject: [PATCH v17a] instrumentation: Move ExecProcNodeInstr to allow
 inlining

This moves the implementation of ExecProcNodeInstr, the ExecProcNode variant
that gets used when instrumentation is on, to be defined in instrument.c
instead of execProcnode.c, and marks the instrumentation functions it uses as
inline.

This allows compilers to generate an optimized implementation, and shows a 2
to 5% reduction in instrumentation overhead for queries that move lots of
rows.

Author: Lukas Fittl <lukas@fittl.com>
Suggested-by: Andres Freund <andres@anarazel.de>
Reviewed-by:
Discussion: https://www.postgresql.org/message-id/flat/CAP53PkzdBK8VJ1fS4AZ481LgMN8f9mJiC39ZRHqkFUSYq6KWmg@mail.gmail.com
---
 src/include/executor/executor.h     |  7 ++++++
 src/backend/executor/execProcnode.c | 20 ----------------
 src/backend/executor/instrument.c   | 37 ++++++++++++++++++++++++-----
 3 files changed, 38 insertions(+), 26 deletions(-)

diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index 491c4886506..6980c6dceda 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -303,6 +303,13 @@ extern void ExecEndNode(PlanState *node);
 extern void ExecShutdownNode(PlanState *node);
 extern void ExecSetTupleBound(int64 tuples_needed, PlanState *child_node);
 
+/*
+ * ExecProcNodeInstr() is implemented in instrument.c, as that allows for
+ * inlining of the instrumentation functions, but thematically it ought to be
+ * in execProcnode.c.
+ */
+extern TupleTableSlot *ExecProcNodeInstr(PlanState *node);
+
 
 /* ----------------------------------------------------------------
  *		ExecProcNode
diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index 132fe37ef60..7c4c66e323f 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -121,7 +121,6 @@
 #include "nodes/nodeFuncs.h"
 
 static TupleTableSlot *ExecProcNodeFirst(PlanState *node);
-static TupleTableSlot *ExecProcNodeInstr(PlanState *node);
 static bool ExecShutdownNode_walker(PlanState *node, void *context);
 
 
@@ -471,25 +470,6 @@ ExecProcNodeFirst(PlanState *node)
 }
 
 
-/*
- * ExecProcNode wrapper that performs instrumentation calls.  By keeping
- * this a separate function, we avoid overhead in the normal case where
- * no instrumentation is wanted.
- */
-static TupleTableSlot *
-ExecProcNodeInstr(PlanState *node)
-{
-	TupleTableSlot *result;
-
-	InstrStartNode(node->instrument);
-
-	result = node->ExecProcNodeReal(node);
-
-	InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
-
-	return result;
-}
-
 
 /* ----------------------------------------------------------------
  *		MultiExecProcNode
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 4c3aec7fdee..ffbcd572133 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -15,7 +15,10 @@
 
 #include <unistd.h>
 
+#include "executor/executor.h"
 #include "executor/instrument.h"
+#include "executor/tuptable.h"
+#include "nodes/execnodes.h"
 #include "portability/instr_time.h"
 #include "utils/guc_hooks.h"
 
@@ -46,7 +49,7 @@ InstrInitOptions(Instrumentation *instr, int instrument_options)
 	instr->need_timer = (instrument_options & INSTRUMENT_TIMER) != 0;
 }
 
-void
+inline void
 InstrStart(Instrumentation *instr)
 {
 	if (instr->need_timer)
@@ -125,14 +128,14 @@ InstrInitNode(NodeInstrumentation *instr, int instrument_options, bool async_mod
 }
 
 /* Entry to a plan node */
-void
+inline void
 InstrStartNode(NodeInstrumentation *instr)
 {
 	InstrStart(&instr->instr);
 }
 
 /* Exit from a plan node */
-void
+inline void
 InstrStopNode(NodeInstrumentation *instr, double nTuples)
 {
 	double		save_tuplecount = instr->tuplecount;
@@ -166,6 +169,28 @@ InstrStopNode(NodeInstrumentation *instr, double nTuples)
 	}
 }
 
+/*
+ * ExecProcNode wrapper that performs instrumentation calls around the node's
+ * ExecProcNodeReal callback, reporting one tuple unless TupIsNull(result).
+ * Being a separate function, it adds no overhead when instrumentation is off.
+ *
+ * This is implemented in instrument.c because the instrumentation functions
+ * it calls are defined here, so they can be inlined even when not using LTO.
+ */
+TupleTableSlot *
+ExecProcNodeInstr(PlanState *node)
+{
+	TupleTableSlot *result;
+
+	InstrStartNode(node->instrument);
+
+	result = node->ExecProcNodeReal(node);
+
+	InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);
+
+	return result;
+}
+
 /* Update tuple count */
 void
 InstrUpdateTupleCount(NodeInstrumentation *instr, double nTuples)
@@ -298,7 +323,7 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
 }
 
 /* dst += add - sub */
-void
+inline void
 BufferUsageAccumDiff(BufferUsage *dst,
 					 const BufferUsage *add,
 					 const BufferUsage *sub)
@@ -328,7 +353,7 @@ BufferUsageAccumDiff(BufferUsage *dst,
 }
 
 /* helper functions for WAL usage accumulation */
-static void
+static inline void
 WalUsageAdd(WalUsage *dst, WalUsage *add)
 {
 	dst->wal_bytes += add->wal_bytes;
@@ -338,7 +363,7 @@ WalUsageAdd(WalUsage *dst, WalUsage *add)
 	dst->wal_buffers_full += add->wal_buffers_full;
 }
 
-void
+inline void
 WalUsageAccumDiff(WalUsage *dst, const WalUsage *add, const WalUsage *sub)
 {
 	dst->wal_bytes += add->wal_bytes - sub->wal_bytes;
-- 
2.53.0.1.gb2826b52eb

