From 4e1df86fbc1c4e3014ef96cc6d1992c3f93e64e0 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 3 Mar 2026 18:00:53 -0500
Subject: [PATCH v5 4/5] read_stream: Only increase read-ahead distance when
 waiting for IO

This avoids increasing the distance to the maximum in cases where the IO
subsystem is already keeping up. This turns out to be important for
performance for two reasons:

- Pinning a lot of buffers is not cheap. If additional pins allow us to avoid
  IO waits, it's definitely worth it, but if we can already do all the
  necessary readahead at a distance of 16, reading ahead 512 buffers can
  increase the CPU overhead substantially.  This is particularly noticeable
  when the to-be-read blocks are already in the kernel page cache.

- If the read stream is read to completion, reading in data earlier than
  needed is of limited consequence, leaving aside the CPU costs mentioned
  above. But if the read stream will not be fully consumed, e.g. because it is
  on the inner side of a nested loop join, the additional IO can be a serious
  performance issue. This is not that commonly a problem for current read
  stream users, but the upcoming work, to use a read stream to fetch table
  pages as part of an index scan, frequently encounters this.

Note that this commit would have substantial performance downsides without
earlier commits:

- Commit 6e36930f9aa, which avoids decreasing the readahead distance when
  there was recent IO, is crucial, as otherwise we very often would end up not
  reading ahead aggressively enough anymore with this commit, due to
  increasing the distance less often.

- "read stream: Split decision about look ahead for AIO and combining" is
  important as we would otherwise not perform IO combining when the IO
  subsystem can keep up.

- "aio: io_uring: Trigger async processing for large IOs" is important to
  continue to benefit from memory copy parallelism when using fewer IOs.

Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Tested-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/f3xxfrkafjxpyqxywcxricxgyizjirfceychyxsgn7bwjp5eda@kwbduhy7tfmu
Discussion: https://postgr.es/m/CA+hUKGL2PhFyDoqrHefqasOnaXhSg48t1phs3VM8BAdrZqKZkw@mail.gmail.com
---
 src/backend/storage/aio/read_stream.c | 89 ++++++++++++++++++++-------
 1 file changed, 68 insertions(+), 21 deletions(-)

diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index a63e66988e1..2adc04601b6 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -18,11 +18,13 @@
  * to StartReadBuffers() so that a new one can begin to form.
  *
  * The algorithm for controlling the look-ahead distance is based on recent
- * cache hit and miss history.  When no I/O is necessary, there is no benefit
- * in looking ahead more than one block.  This is the default initial
- * assumption, but when blocks needing I/O are streamed, the distance is
- * increased rapidly to try to benefit from I/O combining and concurrency.  It
- * is reduced gradually when cached blocks are streamed.
+ * cache hit / miss history, as well as whether we need to wait for I/O completion
+ * after a miss.  When no I/O is necessary, there is no benefit in looking
+ * ahead more than one block.  This is the default initial assumption.  When
+ * blocks needing I/O are streamed, the combine distance is increased to
+ * benefit from I/O combining and the read-ahead distance is increased
+ * whenever we need to wait for IO to try to benefit from increased I/O
+ * concurrency. Both are reduced gradually when cached blocks are streamed.
  *
  * The main data structure is a circular queue of buffers of size
  * max_pinned_buffers plus some extra space for technical reasons, ready to be
@@ -1069,16 +1071,13 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		stream->ios[stream->oldest_io_index].buffer_index == oldest_buffer_index)
 	{
 		int16		io_index = stream->oldest_io_index;
-
-		/* wider temporary values, clamped below */
-		int32		readahead_distance;
-		int32		combine_distance;
+		bool		needed_wait;
 
 		/* Sanity check that we still agree on the buffers. */
 		Assert(stream->ios[io_index].op.buffers ==
 			   &stream->buffers[oldest_buffer_index]);
 
-		WaitReadBuffers(&stream->ios[io_index].op);
+		needed_wait = WaitReadBuffers(&stream->ios[io_index].op);
 
 		Assert(stream->ios_in_progress > 0);
 		stream->ios_in_progress--;
@@ -1086,21 +1085,45 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 			stream->oldest_io_index = 0;
 
 		/*
-		 * Read-ahead and IO combining distances ramp up rapidly after we do
-		 * I/O.
+		 * If the IO was executed synchronously, we will never see
+		 * WaitReadBuffers() block. Treat it as if it did block. This is
+		 * particularly crucial when effective_io_concurrency=0 is used, as
+		 * all IO will be synchronous.  Without treating synchronous IO as
+		 * having waited, we'd never allow the distance to get large enough to
+		 * allow for IO combining, resulting in bad performance.
 		 */
-		readahead_distance = stream->readahead_distance * 2;
-		readahead_distance = Min(readahead_distance, stream->max_pinned_buffers);
-		stream->readahead_distance = readahead_distance;
+		if (stream->ios[io_index].op.flags & READ_BUFFERS_SYNCHRONOUSLY)
+			needed_wait = true;
 
-		combine_distance = stream->combine_distance * 2;
-		combine_distance = Min(combine_distance, stream->io_combine_limit);
-		combine_distance = Min(combine_distance, stream->max_pinned_buffers);
-		stream->combine_distance = combine_distance;
+		/*
+		 * Have the read-ahead distance ramp up rapidly after we needed to
+		 * wait for IO. We only increase the read-ahead-distance when we
+		 * needed to wait, to avoid increasing the distance further than
+		 * necessary, as looking ahead too far can be costly, both due to the
+		 * cost of unnecessarily pinning many buffers and due to doing IOs
+		 * that may never be consumed if the stream is ended/reset before
+		 * completion.
+		 *
+		 * If we did not need to wait, the current distance was evidently
+		 * sufficient.
+		 *
+		 * NB: May not increase the distance if we already reached the end of
+		 * the stream, as stream->readahead_distance == 0 is used to keep
+		 * track of having reached the end.
+		 */
+		if (stream->readahead_distance > 0 && needed_wait)
+		{
+			/* wider temporary value, due to overflow risk */
+			int32		readahead_distance;
+
+			readahead_distance = stream->readahead_distance * 2;
+			readahead_distance = Min(readahead_distance, stream->max_pinned_buffers);
+			stream->readahead_distance = readahead_distance;
+		}
 
 		/*
-		 * As we needed IO, prevent distance from being reduced within our
-		 * maximum look-ahead window. This avoids having distance collapse too
+		 * As we needed IO, prevent distances from being reduced within our
+		 * maximum look-ahead window. This avoids collapsing distances too
 		 * quickly in workloads where most of the required blocks are cached,
 		 * but where the remaining IOs are a sufficient enough factor to cause
 		 * a substantial slowdown if executed synchronously.
@@ -1112,6 +1135,30 @@ read_stream_next_buffer(ReadStream *stream, void **per_buffer_data)
 		 */
 		stream->distance_decay_holdoff = stream->max_pinned_buffers;
 
+		/*
+		 * Whether we needed to wait or not, allow for more IO combining if we
+		 * needed to do IO. The reason to do so independent of needing to wait
+		 * is that when the data is resident in the kernel page cache, IO
+		 * combining reduces the syscall / dispatch overhead, making it
+		 * worthwhile regardless of needing to wait.
+		 *
+		 * It is also important with io_uring as it will never signal the need
+		 * to wait for reads if all the data is in the page cache. There are
+		 * heuristics to deal with that in method_io_uring.c, but they only
+		 * work when the IO gets large enough.
+		 */
+		if (stream->combine_distance > 0 &&
+			stream->combine_distance < stream->io_combine_limit)
+		{
+			/* wider temporary value, due to overflow risk */
+			int32		combine_distance;
+
+			combine_distance = stream->combine_distance * 2;
+			combine_distance = Min(combine_distance, stream->io_combine_limit);
+			combine_distance = Min(combine_distance, stream->max_pinned_buffers);
+			stream->combine_distance = combine_distance;
+		}
+
 		/*
 		 * If we've reached the first block of a sequential region we're
 		 * issuing advice for, cancel that until the next jump.  The kernel
-- 
2.53.0.1.gb2826b52eb

