From b584342347342bc0db8a96ea99094dbe8f48c7d3 Mon Sep 17 00:00:00 2001
From: Melanie Plageman <melanieplageman@gmail.com>
Date: Sun, 5 Apr 2026 17:17:22 -0400
Subject: [PATCH v12 4/7] Add EXPLAIN (IO) infrastructure and BitmapHeapScan IO
 instrumentation

---
 doc/src/sgml/ref/explain.sgml             |  12 +++
 src/backend/access/heap/heapam.c          |  10 ++
 src/backend/commands/explain.c            | 112 ++++++++++++++++++++++
 src/backend/commands/explain_state.c      |   8 ++
 src/backend/executor/nodeBitmapHeapscan.c |  19 +++-
 src/backend/storage/aio/read_stream.c     |  87 +++++++++++++++++
 src/include/access/relscan.h              |   6 ++
 src/include/access/tableam.h              |   3 +
 src/include/commands/explain_state.h      |   1 +
 src/include/executor/instrument.h         |   1 +
 src/include/executor/instrument_node.h    |  50 ++++++++++
 src/include/storage/read_stream.h         |   2 +
 src/tools/pgindent/typedefs.list          |   2 +
 13 files changed, 311 insertions(+), 2 deletions(-)

diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 5b8b521802e..a854c41e963 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -46,6 +46,7 @@ EXPLAIN [ ( <replaceable class="parameter">option</replaceable> [, ...] ) ] <rep
     TIMING [ <replaceable class="parameter">boolean</replaceable> ]
     SUMMARY [ <replaceable class="parameter">boolean</replaceable> ]
     MEMORY [ <replaceable class="parameter">boolean</replaceable> ]
+    IO [ <replaceable class="parameter">boolean</replaceable> ]
     FORMAT { TEXT | XML | JSON | YAML }
 </synopsis>
  </refsynopsisdiv>
@@ -298,6 +299,17 @@ ROLLBACK;
     </listitem>
    </varlistentry>
 
+   <varlistentry>
+    <term><literal>IO</literal></term>
+    <listitem>
+     <para>
+      Include information on I/O performed by each node.
+      This parameter may only be used when <literal>ANALYZE</literal> is also
+      enabled.  It defaults to <literal>FALSE</literal>.
+     </para>
+    </listitem>
+   </varlistentry>
+
    <varlistentry>
     <term><literal>FORMAT</literal></term>
     <listitem>
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index f6ac5a0897c..89ab9742aa5 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -43,6 +43,7 @@
 #include "catalog/pg_database.h"
 #include "catalog/pg_database_d.h"
 #include "commands/vacuum.h"
+#include "executor/instrument_node.h"
 #include "pgstat.h"
 #include "port/pg_bitutils.h"
 #include "storage/lmgr.h"
@@ -1200,6 +1201,7 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 	scan->rs_base.rs_nkeys = nkeys;
 	scan->rs_base.rs_flags = flags;
 	scan->rs_base.rs_parallel = parallel_scan;
+	scan->rs_base.rs_instrument = NULL;
 	scan->rs_strategy = NULL;	/* set in initscan */
 	scan->rs_cbuf = InvalidBuffer;
 
@@ -1312,6 +1314,14 @@ heap_beginscan(Relation relation, Snapshot snapshot,
 														  sizeof(TBMIterateResult));
 	}
 
+	/* enable read stream instrumentation */
+	if (flags & SO_SCAN_INSTRUMENT)
+	{
+		scan->rs_base.rs_instrument = palloc0_object(TableScanInstrumentation);
+		read_stream_enable_stats(scan->rs_read_stream,
+								 &scan->rs_base.rs_instrument->io);
+	}
+
 	scan->rs_vmbuffer = InvalidBuffer;
 
 	return (TableScanDesc) scan;
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index f151f21f9b3..863a9dd0f0d 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -13,6 +13,7 @@
  */
 #include "postgres.h"
 
+#include "access/relscan.h"
 #include "access/xact.h"
 #include "catalog/pg_type.h"
 #include "commands/createas.h"
@@ -139,6 +140,8 @@ static void show_hashagg_info(AggState *aggstate, ExplainState *es);
 static void show_indexsearches_info(PlanState *planstate, ExplainState *es);
 static void show_tidbitmap_info(BitmapHeapScanState *planstate,
 								ExplainState *es);
+static void show_scan_io_usage(ScanState *planstate,
+							   ExplainState *es);
 static void show_instrumentation_count(const char *qlabel, int which,
 									   PlanState *planstate, ExplainState *es);
 static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
@@ -519,6 +522,8 @@ ExplainOnePlan(PlannedStmt *plannedstmt, IntoClause *into, ExplainState *es,
 		instrument_option |= INSTRUMENT_BUFFERS;
 	if (es->wal)
 		instrument_option |= INSTRUMENT_WAL;
+	else if (es->io)
+		instrument_option |= INSTRUMENT_IO;
 
 	/*
 	 * We always collect timing for the entire statement, even when node-level
@@ -2008,6 +2013,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 				show_instrumentation_count("Rows Removed by Filter", 1,
 										   planstate, es);
 			show_tidbitmap_info((BitmapHeapScanState *) planstate, es);
+			show_scan_io_usage((ScanState *) planstate, es);
 			break;
 		case T_SampleScan:
 			show_tablesample(((SampleScan *) plan)->tablesample,
@@ -3984,6 +3990,112 @@ show_tidbitmap_info(BitmapHeapScanState *planstate, ExplainState *es)
 	}
 }
 
+static void
+print_io_usage(ExplainState *es, IOStats *stats)
+{
+	/* don't print stats if there's nothing to report */
+	if (stats->prefetch_count > 0)
+	{
+		if (es->format == EXPLAIN_FORMAT_TEXT)
+		{
+			/* prefetch distance info */
+			ExplainIndentText(es);
+			appendStringInfo(es->str, "Prefetch: avg=%.2f max=%d capacity=%d\n",
+							 (stats->distance_sum * 1.0 / stats->prefetch_count),
+							 stats->distance_max,
+							 stats->distance_capacity);
+
+			/* prefetch I/O info (only if there were actual I/Os) */
+			if (stats->io_count > 0)
+			{
+				ExplainIndentText(es);
+				appendStringInfo(es->str, "I/O: count=%" PRIu64 " waits=%" PRIu64
+								 " size=%.2f inprogress=%.2f\n",
+								 stats->io_count, stats->wait_count,
+								 (stats->io_nblocks * 1.0 / stats->io_count),
+								 (stats->io_in_progress * 1.0 / stats->io_count));
+			}
+		}
+		else
+		{
+			ExplainPropertyFloat("Average Prefetch Distance", NULL,
+								 (stats->distance_sum * 1.0 / stats->prefetch_count), 3, es);
+			ExplainPropertyInteger("Max Prefetch Distance", NULL,
+								   stats->distance_max, es);
+			ExplainPropertyInteger("Prefetch Capacity", NULL,
+								   stats->distance_capacity, es);
+
+			ExplainPropertyUInteger("I/O Count", NULL,
+									stats->io_count, es);
+			ExplainPropertyUInteger("I/O Waits", NULL,
+									stats->wait_count, es);
+			ExplainPropertyFloat("Average I/O Size", NULL,
+								 (stats->io_nblocks * 1.0 / Max(1, stats->io_count)), 3, es);
+			ExplainPropertyFloat("Average I/Os In Progress", NULL,
+								 (stats->io_in_progress * 1.0 / Max(1, stats->io_count)), 3, es);
+		}
+	}
+}
+
+static void
+show_scan_io_usage(ScanState *planstate, ExplainState *es)
+{
+	Plan	   *plan = planstate->ps.plan;
+	IOStats		stats = {0};
+
+	if (!es->io)
+		return;
+
+	/*
+	 * Initialize counters with stats from the local process first.
+	 *
+	 * The scan descriptor may not exist, e.g. if the scan did not start, or
+	 * because of debug_parallel_query=regress. We still want to collect data
+	 * from workers.
+	 */
+	if (planstate->ss_currentScanDesc &&
+		planstate->ss_currentScanDesc->rs_instrument)
+	{
+		stats = planstate->ss_currentScanDesc->rs_instrument->io;
+	}
+
+	/*
+	 * Accumulate data from parallel workers (if any).
+	 */
+	switch (nodeTag(plan))
+	{
+		case T_BitmapHeapScan:
+			{
+				SharedBitmapHeapInstrumentation *sinstrument
+				= ((BitmapHeapScanState *) planstate)->sinstrument;
+
+				if (sinstrument)
+				{
+					for (int i = 0; i < sinstrument->num_workers; ++i)
+					{
+						BitmapHeapScanInstrumentation *winstrument = &sinstrument->sinstrument[i];
+
+						AccumulateIOStats(&stats, &winstrument->stats.io);
+
+						if (!es->workers_state)
+							continue;
+
+						ExplainOpenWorker(i, es);
+						print_io_usage(es, &winstrument->stats.io);
+						ExplainCloseWorker(i, es);
+					}
+				}
+
+				break;
+			}
+		default:
+			/* ignore other plans */
+			return;
+	}
+
+	print_io_usage(es, &stats);
+}
+
 /*
  * If it's EXPLAIN ANALYZE, show instrumentation information for a plan node
  *
diff --git a/src/backend/commands/explain_state.c b/src/backend/commands/explain_state.c
index 65dd4111459..0e07a63fca6 100644
--- a/src/backend/commands/explain_state.c
+++ b/src/backend/commands/explain_state.c
@@ -162,6 +162,8 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
 								"EXPLAIN", opt->defname, p),
 						 parser_errposition(pstate, opt->location)));
 		}
+		else if (strcmp(opt->defname, "io") == 0)
+			es->io = defGetBoolean(opt);
 		else if (!ApplyExtensionExplainOption(es, opt, pstate))
 			ereport(ERROR,
 					(errcode(ERRCODE_SYNTAX_ERROR),
@@ -188,6 +190,12 @@ ParseExplainOptionList(ExplainState *es, List *options, ParseState *pstate)
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
 				 errmsg("EXPLAIN option %s requires ANALYZE", "TIMING")));
 
+	/* check that IO is used with EXPLAIN ANALYZE */
+	if (es->io && !es->analyze)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("EXPLAIN option %s requires ANALYZE", "IO")));
+
 	/* check that serialize is used with EXPLAIN ANALYZE */
 	if (es->serialize != EXPLAIN_SERIALIZE_NONE && !es->analyze)
 		ereport(ERROR,
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index d65e2a87b42..83d6478bc2b 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -144,13 +144,20 @@ BitmapTableScanSetup(BitmapHeapScanState *node)
 	 */
 	if (!node->ss.ss_currentScanDesc)
 	{
+		uint32		flags = SO_NONE;
+
+		if (ScanRelIsReadOnly(&node->ss))
+			flags |= SO_HINT_REL_READ_ONLY;
+
+		if (node->ss.ps.state->es_instrument & INSTRUMENT_IO)
+			flags |= SO_SCAN_INSTRUMENT;
+
 		node->ss.ss_currentScanDesc =
 			table_beginscan_bm(node->ss.ss_currentRelation,
 							   node->ss.ps.state->es_snapshot,
 							   0,
 							   NULL,
-							   ScanRelIsReadOnly(&node->ss) ?
-							   SO_HINT_REL_READ_ONLY : SO_NONE);
+							   flags);
 	}
 
 	node->ss.ss_currentScanDesc->st.rs_tbmiterator = tbmiterator;
@@ -330,6 +337,14 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
 		 */
 		si->exact_pages += node->stats.exact_pages;
 		si->lossy_pages += node->stats.lossy_pages;
+
+		/* collect I/O instrumentation for this process */
+		if (node->ss.ss_currentScanDesc &&
+			node->ss.ss_currentScanDesc->rs_instrument)
+		{
+			AccumulateIOStats(&si->stats.io,
+							  &node->ss.ss_currentScanDesc->rs_instrument->io);
+		}
 	}
 
 	/*
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index 0b6cdf7c873..936e08ea450 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -74,6 +74,7 @@
 #include "postgres.h"
 
 #include "miscadmin.h"
+#include "executor/instrument_node.h"
 #include "storage/aio.h"
 #include "storage/fd.h"
 #include "storage/smgr.h"
@@ -123,6 +124,9 @@ struct ReadStream
 	bool		advice_enabled;
 	bool		temporary;
 
+	/* scan stats counters */
+	IOStats    *stats;
+
 	/*
 	 * One-block buffer to support 'ungetting' a block number, to resolve flow
 	 * control problems when I/Os are split.
@@ -188,6 +192,73 @@ block_range_read_stream_cb(ReadStream *stream,
 	return InvalidBlockNumber;
 }
 
+/*
+ * Update stream stats with current pinned buffer depth.
+ *
+ * Called once per buffer returned to the consumer in read_stream_next_buffer().
+ * Records the number of pinned buffers at that moment, so we can compute the
+ * average look-ahead depth.
+ */
+static inline void
+read_stream_count_prefetch(ReadStream *stream)
+{
+	IOStats    *stats = stream->stats;
+
+	if (stats == NULL)
+		return;
+
+	stats->prefetch_count++;
+	stats->distance_sum += stream->pinned_buffers;
+	if (stream->pinned_buffers > stats->distance_max)
+		stats->distance_max = stream->pinned_buffers;
+}
+
+/*
+ * Update stream stats about size of I/O requests.
+ *
+ * We count the number of I/O requests, size of requests (counted in blocks)
+ * and number of in-progress I/Os.
+ */
+static inline void
+read_stream_count_io(ReadStream *stream, int nblocks, int in_progress)
+{
+	IOStats    *stats = stream->stats;
+
+	if (stats == NULL)
+		return;
+
+	stats->io_count++;
+	stats->io_nblocks += nblocks;
+	stats->io_in_progress += in_progress;
+}
+
+/*
+ * Update stream stats about waits for I/O when consuming buffers.
+ *
+ * We count the number of I/O waits while pulling buffers out of a stream.
+ */
+static inline void
+read_stream_count_wait(ReadStream *stream)
+{
+	IOStats    *stats = stream->stats;
+
+	if (stats == NULL)
+		return;
+
+	stats->wait_count++;
+}
+
+/*
+ * Enable collection of stats into the provided IOStats.
+ */
+void
+read_stream_enable_stats(ReadStream *stream, IOStats *stats)
+{
+	stream->stats = stats;
+	if (stream->stats)
+		stream->stats->distance_capacity = stream->max_pinned_buffers;
+}
+
 /*
  * Ask the callback which block it would like us to read next, with a one block
  * buffer in front to allow read_stream_unget_block() to work.
@@ -426,6 +497,9 @@ read_stream_start_pending_read(ReadStream *stream)
 		Assert(stream->ios_in_progress < stream->max_ios);
 		stream->ios_in_progress++;
 		stream->seq_blocknum = stream->pending_read_blocknum + nblocks;
+
+		/* update I/O stats */
+		read_stream_count_io(stream, nblocks, stream->ios_in_progress);
 	}
 
 	/*
@@ -1021,6 +1095,7 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 										flags)))
 			{
 				/* Fast return. */
+				read_stream_count_prefetch(stream);
 				return buffer;
 			}
 
@@ -1036,6 +1111,12 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 			 * to avoid having to effectively do another synchronous IO for
 			 * the next block (if it were also a miss).
 			 */
+
+			/* update I/O stats */
+			read_stream_count_io(stream, 1, stream->ios_in_progress);
+
+			/* update prefetch distance */
+			read_stream_count_prefetch(stream);
 		}
 		else
 		{
@@ -1100,6 +1181,10 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 
 		needed_wait = WaitReadBuffers(&stream->ios[io_index].op);
 
+		/* Count it as a wait if we need to wait for IO */
+		if (needed_wait)
+			read_stream_count_wait(stream);
+
 		Assert(stream->ios_in_progress > 0);
 		stream->ios_in_progress--;
 		if (++stream->oldest_io_index == stream->max_ios)
@@ -1228,6 +1313,8 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 	}
 #endif
 
+	read_stream_count_prefetch(stream);
+
 	/* Pin transferred to caller. */
 	Assert(stream->pinned_buffers > 0);
 	stream->pinned_buffers--;
diff --git a/src/include/access/relscan.h b/src/include/access/relscan.h
index fd2076c582a..2ea06a67a63 100644
--- a/src/include/access/relscan.h
+++ b/src/include/access/relscan.h
@@ -24,6 +24,7 @@
 
 
 struct ParallelTableScanDescData;
+struct TableScanInstrumentation;
 
 /*
  * Generic descriptor for table scans. This is the base-class for table scans,
@@ -64,6 +65,11 @@ typedef struct TableScanDescData
 
 	struct ParallelTableScanDescData *rs_parallel;	/* parallel scan
 													 * information */
+
+	/*
+	 * Instrumentation counters maintained by all table AMs.
+	 */
+	struct TableScanInstrumentation *rs_instrument;
 } TableScanDescData;
 typedef struct TableScanDescData *TableScanDesc;
 
diff --git a/src/include/access/tableam.h b/src/include/access/tableam.h
index a21c7db5439..c13f05d39db 100644
--- a/src/include/access/tableam.h
+++ b/src/include/access/tableam.h
@@ -69,6 +69,9 @@ typedef enum ScanOptions
 
 	/* set if the query doesn't modify the relation */
 	SO_HINT_REL_READ_ONLY = 1 << 10,
+
+	/* collect scan instrumentation */
+	SO_SCAN_INSTRUMENT = 1 << 11,
 }			ScanOptions;
 
 /*
diff --git a/src/include/commands/explain_state.h b/src/include/commands/explain_state.h
index 6252fe11f15..97bc7ed49f6 100644
--- a/src/include/commands/explain_state.h
+++ b/src/include/commands/explain_state.h
@@ -55,6 +55,7 @@ typedef struct ExplainState
 	bool		summary;		/* print total planning and execution timing */
 	bool		memory;			/* print planner's memory usage information */
 	bool		settings;		/* print modified settings */
+	bool		io;				/* print info about IO (prefetch, ...) */
 	bool		generic;		/* generate a generic plan */
 	ExplainSerializeOption serialize;	/* serialize the query's output? */
 	ExplainFormat format;		/* output format */
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index cc9fbb0e2f0..f093a52aae0 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -64,6 +64,7 @@ typedef enum InstrumentOption
 	INSTRUMENT_BUFFERS = 1 << 1,	/* needs buffer usage */
 	INSTRUMENT_ROWS = 1 << 2,	/* needs row count */
 	INSTRUMENT_WAL = 1 << 3,	/* needs WAL usage */
+	INSTRUMENT_IO = 1 << 4,		/* needs IO usage */
 	INSTRUMENT_ALL = PG_INT32_MAX
 } InstrumentOption;
 
diff --git a/src/include/executor/instrument_node.h b/src/include/executor/instrument_node.h
index e6a3f9f1941..22a75ccd863 100644
--- a/src/include/executor/instrument_node.h
+++ b/src/include/executor/instrument_node.h
@@ -48,6 +48,55 @@ typedef struct SharedAggInfo
 } SharedAggInfo;
 
 
+/* ---------------------
+ *	Instrumentation information about read streams and I/O
+ * ---------------------
+ */
+typedef struct IOStats
+{
+	/* number of buffers returned to consumer (for averaging distance) */
+	uint64		prefetch_count;
+
+	/* sum of pinned_buffers sampled at each buffer return */
+	uint64		distance_sum;
+
+	/* maximum actual pinned_buffers observed during the scan */
+	int16		distance_max;
+
+	/* maximum possible look-ahead distance (max_pinned_buffers) */
+	int16		distance_capacity;
+
+	/* number of waits for a read (for the I/O) */
+	uint64		wait_count;
+
+	/* I/O stats */
+	uint64		io_count;		/* number of I/Os */
+	uint64		io_nblocks;		/* sum of blocks for all I/Os */
+	uint64		io_in_progress; /* sum of in-progress I/Os */
+} IOStats;
+
+typedef struct TableScanInstrumentation
+{
+	IOStats		io;
+} TableScanInstrumentation;
+
+/* merge IO statistics from 'src' into 'dst' */
+static inline void
+AccumulateIOStats(IOStats *dst, IOStats *src)
+{
+	dst->prefetch_count += src->prefetch_count;
+	dst->distance_sum += src->distance_sum;
+	if (src->distance_max > dst->distance_max)
+		dst->distance_max = src->distance_max;
+	if (src->distance_capacity > dst->distance_capacity)
+		dst->distance_capacity = src->distance_capacity;
+	dst->wait_count += src->wait_count;
+	dst->io_count += src->io_count;
+	dst->io_nblocks += src->io_nblocks;
+	dst->io_in_progress += src->io_in_progress;
+}
+
+
 /* ---------------------
  *	Instrumentation information for indexscans (amgettuple and amgetbitmap)
  * ---------------------
@@ -79,6 +128,7 @@ typedef struct BitmapHeapScanInstrumentation
 {
 	uint64		exact_pages;
 	uint64		lossy_pages;
+	TableScanInstrumentation stats;
 } BitmapHeapScanInstrumentation;
 
 /*
diff --git a/src/include/storage/read_stream.h b/src/include/storage/read_stream.h
index c9359b29b0f..aebb1fafb31 100644
--- a/src/include/storage/read_stream.h
+++ b/src/include/storage/read_stream.h
@@ -65,6 +65,7 @@
 
 struct ReadStream;
 typedef struct ReadStream ReadStream;
+typedef struct IOStats IOStats;
 
 /* for block_range_read_stream_cb */
 typedef struct BlockRangeReadStreamPrivate
@@ -103,5 +104,6 @@ extern BlockNumber read_stream_pause(ReadStream *stream);
 extern void read_stream_resume(ReadStream *stream);
 extern void read_stream_reset(ReadStream *stream);
 extern void read_stream_end(ReadStream *stream);
+extern void read_stream_enable_stats(ReadStream *stream, IOStats *stats);
 
 #endif							/* READ_STREAM_H */
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 9e6a39f5608..98b8d78e693 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1279,6 +1279,7 @@ IOContext
 IOFuncSelector
 IOObject
 IOOp
+IOStats
 IO_STATUS_BLOCK
 IPCompareMethod
 ITEM
@@ -3127,6 +3128,7 @@ TableLikeClause
 TableSampleClause
 TableScanDesc
 TableScanDescData
+TableScanInstrumentation
 TableSpaceCacheEntry
 TableSpaceOpts
 TableToProcess
-- 
2.53.0

