From 63ae878a433c5e14ce0c94822832d657ec404796 Mon Sep 17 00:00:00 2001
From: test <test>
Date: Tue, 7 Apr 2026 01:38:53 +0200
Subject: [PATCH v14 4/5] Add EXPLAIN (IO) instrumentation for SeqScan

Adds support for EXPLAIN (IO) instrumentation for sequential scans. This
requires adding shared instrumentation for parallel scans, using the
separate DSM approach introduced by dd78e69cfc33.

Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Lukas Fittl <lukas@fittl.com>
Reviewed-by: Andres Freund <andres@anarazel.de>
Discussion: https://postgr.es/m/flat/a177a6dd-240b-455a-8f25-aca0b1c08c6e%40vondra.me
---
 src/backend/commands/explain.c         |  25 +++++
 src/backend/executor/execParallel.c    |  11 ++
 src/backend/executor/nodeSeqscan.c     | 133 +++++++++++++++++++++++--
 src/include/executor/instrument_node.h |  19 ++++
 src/include/executor/nodeSeqscan.h     |   9 ++
 src/include/nodes/execnodes.h          |   1 +
 src/test/regress/expected/explain.out  |  18 +++-
 src/test/regress/sql/explain.sql       |   4 +-
 src/tools/pgindent/typedefs.list       |   2 +
 9 files changed, 209 insertions(+), 13 deletions(-)

diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index ca3071f4ca2..c1928713b52 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -2032,6 +2032,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 										   planstate, es);
 			if (IsA(plan, CteScan))
 				show_ctescan_info(castNode(CteScanState, planstate), es);
+			show_scan_io_usage((ScanState *) planstate, es);
 			break;
 		case T_Gather:
 			{
@@ -4102,6 +4103,30 @@ show_scan_io_usage(ScanState *planstate, ExplainState *es)
 					}
 				}
 
+				break;
+			}
+		case T_SeqScan:
+			{
+				SharedSeqScanInstrumentation *sinstrument
+				= ((SeqScanState *) planstate)->sinstrument;
+
+				if (sinstrument)
+				{
+					for (int i = 0; i < sinstrument->num_workers; ++i)
+					{
+						SeqScanInstrumentation *winstrument = &sinstrument->sinstrument[i];
+
+						AccumulateIOStats(&stats, &winstrument->stats.io);
+
+						if (!es->workers_state)
+							continue;
+
+						ExplainOpenWorker(i, es);
+						print_io_usage(es, &winstrument->stats.io);
+						ExplainCloseWorker(i, es);
+					}
+				}
+
 				break;
 			}
 		default:
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 1a5ec0c305f..9690f0938ae 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -257,6 +257,9 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			if (planstate->plan->parallel_aware)
 				ExecSeqScanEstimate((SeqScanState *) planstate,
 									e->pcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecSeqScanInstrumentEstimate((SeqScanState *) planstate,
+										  e->pcxt);
 			break;
 		case T_IndexScanState:
 			if (planstate->plan->parallel_aware)
@@ -500,6 +503,9 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			if (planstate->plan->parallel_aware)
 				ExecSeqScanInitializeDSM((SeqScanState *) planstate,
 										 d->pcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecSeqScanInstrumentInitDSM((SeqScanState *) planstate,
+										 d->pcxt);
 			break;
 		case T_IndexScanState:
 			if (planstate->plan->parallel_aware)
@@ -1148,6 +1154,9 @@ ExecParallelRetrieveInstrumentation(PlanState *planstate,
 		case T_BitmapHeapScanState:
 			ExecBitmapHeapRetrieveInstrumentation((BitmapHeapScanState *) planstate);
 			break;
+		case T_SeqScanState:
+			ExecSeqScanRetrieveInstrumentation((SeqScanState *) planstate);
+			break;
 		default:
 			break;
 	}
@@ -1388,6 +1397,8 @@ ExecParallelInitializeWorker(PlanState *planstate, ParallelWorkerContext *pwcxt)
 		case T_SeqScanState:
 			if (planstate->plan->parallel_aware)
 				ExecSeqScanInitializeWorker((SeqScanState *) planstate, pwcxt);
+			/* even when not parallel-aware, for EXPLAIN ANALYZE */
+			ExecSeqScanInstrumentInitWorker((SeqScanState *) planstate, pwcxt);
 			break;
 		case T_IndexScanState:
 			if (planstate->plan->parallel_aware)
diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c
index 04803b0e37d..5bcb0a861d7 100644
--- a/src/backend/executor/nodeSeqscan.c
+++ b/src/backend/executor/nodeSeqscan.c
@@ -29,6 +29,7 @@
 
 #include "access/relscan.h"
 #include "access/tableam.h"
+#include "executor/execParallel.h"
 #include "executor/execScan.h"
 #include "executor/executor.h"
 #include "executor/nodeSeqscan.h"
@@ -65,15 +66,21 @@ SeqNext(SeqScanState *node)
 
 	if (scandesc == NULL)
 	{
+		uint32		flags = SO_NONE;
+
+		if (ScanRelIsReadOnly(&node->ss))
+			flags |= SO_HINT_REL_READ_ONLY;
+
+		if (estate->es_instrument & INSTRUMENT_IO)
+			flags |= SO_SCAN_INSTRUMENT;
+
 		/*
 		 * We reach here if the scan is not parallel, or if we're serially
 		 * executing a scan that was planned to be parallel.
 		 */
 		scandesc = table_beginscan(node->ss.ss_currentRelation,
 								   estate->es_snapshot,
-								   0, NULL,
-								   ScanRelIsReadOnly(&node->ss) ?
-								   SO_HINT_REL_READ_ONLY : SO_NONE);
+								   0, NULL, flags);
 		node->ss.ss_currentScanDesc = scandesc;
 	}
 
@@ -302,6 +309,22 @@ ExecEndSeqScan(SeqScanState *node)
 	 */
 	scanDesc = node->ss.ss_currentScanDesc;
 
+	/*
+	 * Collect I/O stats for this process into shared instrumentation.
+	 */
+	if (node->sinstrument != NULL && IsParallelWorker())
+	{
+		SeqScanInstrumentation *si;
+
+		Assert(ParallelWorkerNumber < node->sinstrument->num_workers);
+		si = &node->sinstrument->sinstrument[ParallelWorkerNumber];
+
+		if (scanDesc && scanDesc->rs_instrument)
+		{
+			AccumulateIOStats(&si->stats.io, &scanDesc->rs_instrument->io);
+		}
+	}
+
 	/*
 	 * close heap scan
 	 */
@@ -370,6 +393,13 @@ ExecSeqScanInitializeDSM(SeqScanState *node,
 {
 	EState	   *estate = node->ss.ps.state;
 	ParallelTableScanDesc pscan;
+	uint32		flags = SO_NONE;
+
+	if (ScanRelIsReadOnly(&node->ss))
+		flags |= SO_HINT_REL_READ_ONLY;
+
+	if (estate->es_instrument & INSTRUMENT_IO)
+		flags |= SO_SCAN_INSTRUMENT;
 
 	pscan = shm_toc_allocate(pcxt->toc, node->pscan_len);
 	table_parallelscan_initialize(node->ss.ss_currentRelation,
@@ -378,9 +408,7 @@ ExecSeqScanInitializeDSM(SeqScanState *node,
 	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pscan);
 
 	node->ss.ss_currentScanDesc =
-		table_beginscan_parallel(node->ss.ss_currentRelation, pscan,
-								 ScanRelIsReadOnly(&node->ss) ?
-								 SO_HINT_REL_READ_ONLY : SO_NONE);
+		table_beginscan_parallel(node->ss.ss_currentRelation, pscan, flags);
 }
 
 /* ----------------------------------------------------------------
@@ -410,10 +438,97 @@ ExecSeqScanInitializeWorker(SeqScanState *node,
 							ParallelWorkerContext *pwcxt)
 {
 	ParallelTableScanDesc pscan;
+	uint32		flags = SO_NONE;
+
+	if (ScanRelIsReadOnly(&node->ss))
+		flags |= SO_HINT_REL_READ_ONLY;
+
+	if (node->ss.ps.state->es_instrument & INSTRUMENT_IO)
+		flags |= SO_SCAN_INSTRUMENT;
 
 	pscan = shm_toc_lookup(pwcxt->toc, node->ss.ps.plan->plan_node_id, false);
 	node->ss.ss_currentScanDesc =
-		table_beginscan_parallel(node->ss.ss_currentRelation, pscan,
-								 ScanRelIsReadOnly(&node->ss) ?
-								 SO_HINT_REL_READ_ONLY : SO_NONE);
+		table_beginscan_parallel(node->ss.ss_currentRelation, pscan, flags);
+}
+
+/*
+ * Compute the amount of space we'll need for the shared instrumentation and
+ * inform pcxt->estimator.
+ */
+void
+ExecSeqScanInstrumentEstimate(SeqScanState *node, ParallelContext *pcxt)
+{
+	EState	   *estate = node->ss.ps.state;
+	Size		size;
+
+	if ((estate->es_instrument & INSTRUMENT_IO) == 0 || pcxt->nworkers == 0)
+		return;
+
+	size = add_size(offsetof(SharedSeqScanInstrumentation, sinstrument),
+					mul_size(pcxt->nworkers, sizeof(SeqScanInstrumentation)));
+
+	shm_toc_estimate_chunk(&pcxt->estimator, size);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/*
+ * Set up parallel sequential scan instrumentation.
+ */
+void
+ExecSeqScanInstrumentInitDSM(SeqScanState *node, ParallelContext *pcxt)
+{
+	EState	   *estate = node->ss.ps.state;
+	SharedSeqScanInstrumentation *sinstrument;
+	Size		size;
+
+	if ((estate->es_instrument & INSTRUMENT_IO) == 0 || pcxt->nworkers == 0)
+		return;
+
+	size = add_size(offsetof(SharedSeqScanInstrumentation, sinstrument),
+					mul_size(pcxt->nworkers, sizeof(SeqScanInstrumentation)));
+	sinstrument = shm_toc_allocate(pcxt->toc, size);
+	memset(sinstrument, 0, size);
+	sinstrument->num_workers = pcxt->nworkers;
+	shm_toc_insert(pcxt->toc,
+				   node->ss.ps.plan->plan_node_id +
+				   PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
+				   sinstrument);
+	node->sinstrument = sinstrument;
+}
+
+/*
+ * Look up and save the location of the shared instrumentation.
+ */
+void
+ExecSeqScanInstrumentInitWorker(SeqScanState *node,
+								ParallelWorkerContext *pwcxt)
+{
+	EState	   *estate = node->ss.ps.state;
+
+	if ((estate->es_instrument & INSTRUMENT_IO) == 0)
+		return;
+
+	node->sinstrument = shm_toc_lookup(pwcxt->toc,
+									   node->ss.ps.plan->plan_node_id +
+									   PARALLEL_KEY_SCAN_INSTRUMENT_OFFSET,
+									   false);
+}
+
+/*
+ * Transfer sequential scan instrumentation from DSM to private memory.
+ */
+void
+ExecSeqScanRetrieveInstrumentation(SeqScanState *node)
+{
+	SharedSeqScanInstrumentation *sinstrument = node->sinstrument;
+	Size		size;
+
+	if (sinstrument == NULL)
+		return;
+
+	size = offsetof(SharedSeqScanInstrumentation, sinstrument)
+		+ sinstrument->num_workers * sizeof(SeqScanInstrumentation);
+
+	node->sinstrument = palloc(size);
+	memcpy(node->sinstrument, sinstrument, size);
 }
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index 22a75ccd863..003dc262b5d 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -266,4 +266,23 @@ typedef struct SharedIncrementalSortInfo
 	IncrementalSortInfo sinfo[FLEXIBLE_ARRAY_MEMBER];
 } SharedIncrementalSortInfo;
 
+
+/* ---------------------
+ *	Instrumentation information for sequential scans
+ * ---------------------
+ */
+typedef struct SeqScanInstrumentation
+{
+	TableScanInstrumentation stats;
+} SeqScanInstrumentation;
+
+/*
+ * Shared memory container for per-worker information
+ */
+typedef struct SharedSeqScanInstrumentation
+{
+	int			num_workers;
+	SeqScanInstrumentation sinstrument[FLEXIBLE_ARRAY_MEMBER];
+} SharedSeqScanInstrumentation;
+
 #endif							/* INSTRUMENT_NODE_H */
diff --git a/src/include/executor/nodeSeqscan.h b/src/include/executor/nodeSeqscan.h
index 7a1490596fb..9c0ad4879d7 100644
--- a/src/include/executor/nodeSeqscan.h
+++ b/src/include/executor/nodeSeqscan.h
@@ -28,4 +28,13 @@ extern void ExecSeqScanReInitializeDSM(SeqScanState *node, ParallelContext *pcxt
 extern void ExecSeqScanInitializeWorker(SeqScanState *node,
 										ParallelWorkerContext *pwcxt);
 
+/* instrument support */
+extern void ExecSeqScanInstrumentEstimate(SeqScanState *node,
+										  ParallelContext *pcxt);
+extern void ExecSeqScanInstrumentInitDSM(SeqScanState *node,
+										 ParallelContext *pcxt);
+extern void ExecSeqScanInstrumentInitWorker(SeqScanState *node,
+											ParallelWorkerContext *pwcxt);
+extern void ExecSeqScanRetrieveInstrumentation(SeqScanState *node);
+
 #endif							/* NODESEQSCAN_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 3ecae7552fc..56febb3204c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1670,6 +1670,7 @@ typedef struct SeqScanState
 {
 	ScanState	ss;				/* its first field is NodeTag */
 	Size		pscan_len;		/* size of parallel heap scan descriptor */
+	struct SharedSeqScanInstrumentation *sinstrument;
 } SeqScanState;
 
 /* ----------------
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index dc31c7ce9f9..74a4d87801e 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -100,7 +100,7 @@ select explain_filter('explain (buffers, format text) select * from int8_tbl i8'
 (1 row)
 
 \a
-select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8');
 explain_filter
 <explain xmlns="http://www.postgresql.org/N/explain">
   <Query>
@@ -119,6 +119,13 @@ explain_filter
       <Actual-Rows>N.N</Actual-Rows>
       <Actual-Loops>N</Actual-Loops>
       <Disabled>false</Disabled>
+      <Average-Prefetch-Distance>N.N</Average-Prefetch-Distance>
+      <Max-Prefetch-Distance>N</Max-Prefetch-Distance>
+      <Prefetch-Capacity>N</Prefetch-Capacity>
+      <I-O-Count>N</I-O-Count>
+      <I-O-Waits>N</I-O-Waits>
+      <Average-I-O-Size>N.N</Average-I-O-Size>
+      <Average-I-Os-In-Progress>N.N</Average-I-Os-In-Progress>
       <Shared-Hit-Blocks>N</Shared-Hit-Blocks>
       <Shared-Read-Blocks>N</Shared-Read-Blocks>
       <Shared-Dirtied-Blocks>N</Shared-Dirtied-Blocks>
@@ -149,7 +156,7 @@ explain_filter
   </Query>
 </explain>
 (1 row)
-select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8');
 explain_filter
 - Plan: 
     Node Type: "Seq Scan"
@@ -166,6 +173,13 @@ explain_filter
     Actual Rows: N.N
     Actual Loops: N
     Disabled: false
+    Average Prefetch Distance: N.N
+    Max Prefetch Distance: N
+    Prefetch Capacity: N
+    I/O Count: N
+    I/O Waits: N
+    Average I/O Size: N.N
+    Average I/Os In Progress: N.N
     Shared Hit Blocks: N
     Shared Read Blocks: N
     Shared Dirtied Blocks: N
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index 8f10e1aff55..2f163c64bf6 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -69,8 +69,8 @@ select explain_filter('explain (analyze, buffers, format text) select * from int
 select explain_filter('explain (buffers, format text) select * from int8_tbl i8');
 
 \a
-select explain_filter('explain (analyze, buffers, format xml) select * from int8_tbl i8');
-select explain_filter('explain (analyze, serialize, buffers, format yaml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, buffers, io, format xml) select * from int8_tbl i8');
+select explain_filter('explain (analyze, serialize, buffers, io, format yaml) select * from int8_tbl i8');
 select explain_filter('explain (buffers, format json) select * from int8_tbl i8');
 \a
 
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 98b8d78e693..c0436a13ac3 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -2800,6 +2800,7 @@ SelfJoinCandidate
 SemTPadded
 SemiAntiJoinFactors
 SeqScan
+SeqScanInstrumentation
 SeqScanState
 SeqTable
 SeqTableData
@@ -2864,6 +2865,7 @@ SharedMemoizeInfo
 SharedRecordTableEntry
 SharedRecordTableKey
 SharedRecordTypmodRegistry
+SharedSeqScanInstrumentation
 SharedSortInfo
 SharedTuplestore
 SharedTuplestoreAccessor
-- 
2.53.0

