From 46880777f409db35dd83d8acfabe56ef02e9c51c Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Tue, 18 Mar 2025 14:40:06 -0400
Subject: [PATCH v2.14 18/29] aio: Add test_aio module

To make the tests possible, a few functions from bufmgr.c/localbuf.c had to be
exported, via buf_internals.h.

Reviewed-by: Noah Misch <noah@leadboat.com>
Co-authored-by: Andres Freund <andres@anarazel.de>
Co-authored-by: Nazir Bilal Yavuz <byavuz81@gmail.com>
Discussion: https://postgr.es/m/uvrtrknj4kdytuboidbhwclo4gxhswwcpgadptsjvjqcluzmah%40brqs62irg4dt
---
 src/include/storage/buf_internals.h           |    7 +
 src/backend/storage/buffer/bufmgr.c           |    8 +-
 src/backend/storage/buffer/localbuf.c         |    3 +-
 src/test/modules/Makefile                     |    1 +
 src/test/modules/meson.build                  |    1 +
 src/test/modules/test_aio/.gitignore          |    2 +
 src/test/modules/test_aio/Makefile            |   26 +
 src/test/modules/test_aio/meson.build         |   37 +
 src/test/modules/test_aio/t/001_aio.pl        | 1439 +++++++++++++++++
 src/test/modules/test_aio/t/002_io_workers.pl |  125 ++
 src/test/modules/test_aio/test_aio--1.0.sql   |  108 ++
 src/test/modules/test_aio/test_aio.c          |  802 +++++++++
 src/test/modules/test_aio/test_aio.control    |    3 +
 13 files changed, 2554 insertions(+), 8 deletions(-)
 create mode 100644 src/test/modules/test_aio/.gitignore
 create mode 100644 src/test/modules/test_aio/Makefile
 create mode 100644 src/test/modules/test_aio/meson.build
 create mode 100644 src/test/modules/test_aio/t/001_aio.pl
 create mode 100644 src/test/modules/test_aio/t/002_io_workers.pl
 create mode 100644 src/test/modules/test_aio/test_aio--1.0.sql
 create mode 100644 src/test/modules/test_aio/test_aio.c
 create mode 100644 src/test/modules/test_aio/test_aio.control

diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 72b36a4af26..0dec7d93b3b 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -434,6 +434,12 @@ extern void IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_co
 extern void ScheduleBufferTagForWriteback(WritebackContext *wb_context,
 										  IOContext io_context, BufferTag *tag);
 
+/* solely to make it easier to write tests */
+extern bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
+extern void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
+							  bool forget_owner, bool release_aio);
+
+
 /* freelist.c */
 extern IOContext IOContextForStrategy(BufferAccessStrategy strategy);
 extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
@@ -478,6 +484,7 @@ extern void TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty,
 								   uint32 set_flag_bits, bool release_aio);
 extern bool StartLocalBufferIO(BufferDesc *bufHdr, bool forInput, bool nowait);
 extern void FlushLocalBuffer(BufferDesc *bufHdr, SMgrRelation reln);
+extern void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced);
 extern void DropRelationLocalBuffers(RelFileLocator rlocator,
 									 ForkNumber forkNum,
 									 BlockNumber firstDelBlock);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 39cc6489145..c65d3916211 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -518,10 +518,6 @@ static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
 static int	SyncOneBuffer(int buf_id, bool skip_recently_used,
 						  WritebackContext *wb_context);
 static void WaitIO(BufferDesc *buf);
-static bool StartBufferIO(BufferDesc *buf, bool forInput, bool nowait);
-static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
-							  uint32 set_flag_bits, bool forget_owner,
-							  bool release_aio);
 static void AbortBufferIO(Buffer buffer);
 static void shared_buffer_write_error_callback(void *arg);
 static void local_buffer_write_error_callback(void *arg);
@@ -5961,7 +5957,7 @@ WaitIO(BufferDesc *buf)
  * find out if they can perform the I/O as part of a larger operation, without
  * waiting for the answer or distinguishing the reasons why not.
  */
-static bool
+bool
 StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
 {
 	uint32		buf_state;
@@ -6018,7 +6014,7 @@ StartBufferIO(BufferDesc *buf, bool forInput, bool nowait)
  * resource owner. (forget_owner=false is used when the resource owner itself
  * is being released)
  */
-static void
+void
 TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits,
 				  bool forget_owner, bool release_aio)
 {
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index bf89076bb10..ed56202af14 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -57,7 +57,6 @@ static int	NLocalPinnedBuffers = 0;
 static void InitLocalBuffers(void);
 static Block GetLocalBufferStorage(void);
 static Buffer GetLocalVictimBuffer(void);
-static void InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced);
 
 
 /*
@@ -597,7 +596,7 @@ TerminateLocalBufferIO(BufferDesc *bufHdr, bool clear_dirty, uint32 set_flag_bit
  *
  * See also InvalidateBuffer().
  */
-static void
+void
 InvalidateLocalBuffer(BufferDesc *bufHdr, bool check_unreferenced)
 {
 	Buffer		buffer = BufferDescriptorGetBuffer(bufHdr);
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index 4e4be3fa511..aa1d27bbed3 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -14,6 +14,7 @@ SUBDIRS = \
 		  oauth_validator \
 		  plsample \
 		  spgist_name_ops \
+		  test_aio \
 		  test_bloomfilter \
 		  test_copy_callbacks \
 		  test_custom_rmgrs \
diff --git a/src/test/modules/meson.build b/src/test/modules/meson.build
index 2b057451473..9de0057bd1d 100644
--- a/src/test/modules/meson.build
+++ b/src/test/modules/meson.build
@@ -13,6 +13,7 @@ subdir('oauth_validator')
 subdir('plsample')
 subdir('spgist_name_ops')
 subdir('ssl_passphrase_callback')
+subdir('test_aio')
 subdir('test_bloomfilter')
 subdir('test_copy_callbacks')
 subdir('test_custom_rmgrs')
diff --git a/src/test/modules/test_aio/.gitignore b/src/test/modules/test_aio/.gitignore
new file mode 100644
index 00000000000..716e17f5a2a
--- /dev/null
+++ b/src/test/modules/test_aio/.gitignore
@@ -0,0 +1,2 @@
+# Generated subdirectories
+/tmp_check/
diff --git a/src/test/modules/test_aio/Makefile b/src/test/modules/test_aio/Makefile
new file mode 100644
index 00000000000..f53cc64671a
--- /dev/null
+++ b/src/test/modules/test_aio/Makefile
@@ -0,0 +1,26 @@
+# src/test/modules/test_aio/Makefile
+
+PGFILEDESC = "test_aio - test code for AIO"
+
+MODULE_big = test_aio
+OBJS = \
+	$(WIN32RES) \
+	test_aio.o
+
+EXTENSION = test_aio
+DATA = test_aio--1.0.sql
+
+TAP_TESTS = 1
+
+export enable_injection_points
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/test_aio
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/test_aio/meson.build b/src/test/modules/test_aio/meson.build
new file mode 100644
index 00000000000..73d2fd68eaa
--- /dev/null
+++ b/src/test/modules/test_aio/meson.build
@@ -0,0 +1,37 @@
+# Copyright (c) 2024-2025, PostgreSQL Global Development Group
+
+test_aio_sources = files(
+  'test_aio.c',
+)
+
+if host_system == 'windows'
+  test_aio_sources += rc_lib_gen.process(win32ver_rc, extra_args: [
+    '--NAME', 'test_aio',
+    '--FILEDESC', 'test_aio - test code for AIO',])
+endif
+
+test_aio = shared_module('test_aio',
+  test_aio_sources,
+  kwargs: pg_test_mod_args,
+)
+test_install_libs += test_aio
+
+test_install_data += files(
+  'test_aio.control',
+  'test_aio--1.0.sql',
+)
+
+tests += {
+  'name': 'test_aio',
+  'sd': meson.current_source_dir(),
+  'bd': meson.current_build_dir(),
+  'tap': {
+    'env': {
+       'enable_injection_points': get_option('injection_points') ? 'yes' : 'no',
+    },
+    'tests': [
+      't/001_aio.pl',
+      't/002_io_workers.pl',
+    ],
+  },
+}
diff --git a/src/test/modules/test_aio/t/001_aio.pl b/src/test/modules/test_aio/t/001_aio.pl
new file mode 100644
index 00000000000..4f08116a3ff
--- /dev/null
+++ b/src/test/modules/test_aio/t/001_aio.pl
@@ -0,0 +1,1439 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+
+###
+# Test io_method=worker
+###
+my $node_worker = create_node('worker');
+$node_worker->start();
+
+test_generic('worker', $node_worker);
+SKIP:
+{
+	skip 'Injection points not supported by this build', 1
+	  unless $ENV{enable_injection_points} eq 'yes';
+	test_inject_worker('worker', $node_worker);
+}
+
+$node_worker->stop();
+
+
+###
+# Test io_method=io_uring
+###
+
+if (have_io_uring())
+{
+	my $node_uring = create_node('io_uring');
+	$node_uring->start();
+	test_generic('io_uring', $node_uring);
+	$node_uring->stop();
+}
+
+
+###
+# Test io_method=sync
+###
+
+my $node_sync = create_node('sync');
+
+# just to have one test not use the default auto-tuning
+
+$node_sync->append_conf(
+	'postgresql.conf', qq(
+io_max_concurrency=4
+));
+
+$node_sync->start();
+test_generic('sync', $node_sync);
+$node_sync->stop();
+
+done_testing();
+
+
+###
+# Test Helpers
+###
+
+sub create_node
+{
+	local $Test::Builder::Level = $Test::Builder::Level + 1;
+
+	my $io_method = shift;
+
+	my $node = PostgreSQL::Test::Cluster->new($io_method);
+
+	# Want to test initdb for each IO method, otherwise we could just reuse
+	# the cluster.
+	#
+	# Unfortunately Cluster::init() puts PG_TEST_INITDB_EXTRA_OPTS after the
+	# options specified by ->extra, if somebody puts -c io_method=xyz in
+	# PG_TEST_INITDB_EXTRA_OPTS it would break this test. Fix that up if we
+	# detect it.
+	local $ENV{PG_TEST_INITDB_EXTRA_OPTS} = $ENV{PG_TEST_INITDB_EXTRA_OPTS};
+	if (defined $ENV{PG_TEST_INITDB_EXTRA_OPTS}
+		&& $ENV{PG_TEST_INITDB_EXTRA_OPTS} =~ m/io_method=/)
+	{
+		$ENV{PG_TEST_INITDB_EXTRA_OPTS} .= " -c io_method=$io_method";
+	}
+
+	$node->init(extra => [ '-c', "io_method=$io_method" ]);
+
+	$node->append_conf(
+		'postgresql.conf', qq(
+shared_preload_libraries=test_aio
+log_min_messages = 'DEBUG3'
+log_statement=all
+log_error_verbosity=default
+restart_after_crash=false
+temp_buffers=100
+));
+
+	ok(1, "$io_method: initdb");
+
+	return $node;
+}
+
+sub have_io_uring
+{
+	# To detect if io_uring is supported, we look at the error message for
+	# assigning an invalid value to an enum GUC, which lists all the valid
+	# options. We need to use -C to deal with running as administrator on
+	# windows, the superuser check is omitted if -C is used.
+	my ($stdout, $stderr) =
+	  run_command [qw(postgres -C invalid -c io_method=invalid)];
+	die "can't determine supported io_method values"
+	  unless $stderr =~ m/Available values: ([^\.]+)\./;
+	my $methods = $1;
+	note "supported io_method values are: $methods";
+
+	return ($methods =~ m/io_uring/) ? 1 : 0;
+}
+
+sub psql_like
+{
+	my $io_method = shift;
+	my $psql = shift;
+	my $name = shift;
+	my $sql = shift;
+	my $expected_stdout = shift;
+	my $expected_stderr = shift;
+	my ($cmdret, $output);
+
+	($output, $cmdret) = $psql->query($sql);
+
+	like($output, $expected_stdout, "$io_method: $name: expected stdout");
+	like($psql->{stderr}, $expected_stderr,
+		"$io_method: $name: expected stderr");
+	$psql->{stderr} = '';
+
+	return $output;
+}
+
+sub query_wait_block
+{
+	my $io_method = shift;
+	my $node = shift;
+	my $psql = shift;
+	my $name = shift;
+	my $sql = shift;
+	my $waitfor = shift;
+
+	my $pid = $psql->query_safe('SELECT pg_backend_pid()');
+
+	$psql->{stdin} .= qq($sql;\n);
+	$psql->{run}->pump_nb();
+	ok(1, "$io_method: $name: issued sql");
+
+	$node->poll_query_until('postgres',
+		qq(SELECT wait_event FROM pg_stat_activity WHERE pid = $pid),
+		$waitfor);
+	ok(1, "$io_method: $name: observed $waitfor wait event");
+}
+
+# Returns count of checksum failures for the specified database or for shared
+# relations, if $datname is undefined.
+sub checksum_failures
+{
+	my $psql = shift;
+	my $datname = shift;
+	my $checksum_count;
+	my $checksum_last_failure;
+
+	if (defined $datname)
+	{
+		$checksum_count = $psql->query_safe(
+			qq(
+SELECT checksum_failures FROM pg_stat_database WHERE datname = '$datname';
+));
+		$checksum_last_failure = $psql->query_safe(
+			qq(
+SELECT checksum_last_failure FROM pg_stat_database WHERE datname = '$datname';
+));
+	}
+	else
+	{
+		$checksum_count = $psql->query_safe(
+			qq(
+SELECT checksum_failures FROM pg_stat_database WHERE datname IS NULL;
+));
+		$checksum_last_failure = $psql->query_safe(
+			qq(
+SELECT checksum_last_failure FROM pg_stat_database WHERE datname IS NULL;
+));
+	}
+
+	return $checksum_count, $checksum_last_failure;
+}
+
+###
+# Sub-tests
+###
+
+# Sanity checks for the IO handle API
+sub test_handle
+{
+	my $io_method = shift;
+	my $node = shift;
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	# leak warning: implicit xact
+	psql_like(
+		$io_method,
+		$psql,
+		"handle_get() leak in implicit xact",
+		qq(SELECT handle_get()),
+		qr/^$/,
+		qr/leaked AIO handle/,
+		"$io_method: leaky handle_get() warns");
+
+	# leak warning: explicit xact
+	psql_like(
+		$io_method, $psql,
+		"handle_get() leak in explicit xact",
+		qq(BEGIN; SELECT handle_get(); COMMIT),
+		qr/^$/, qr/leaked AIO handle/);
+
+
+	# leak warning: explicit xact, rollback
+	psql_like(
+		$io_method,
+		$psql,
+		"handle_get() leak in explicit xact, rollback",
+		qq(BEGIN; SELECT handle_get(); ROLLBACK;),
+		qr/^$/,
+		qr/leaked AIO handle/);
+
+	# leak warning: subtrans
+	psql_like(
+		$io_method,
+		$psql,
+		"handle_get() leak in subxact",
+		qq(BEGIN; SAVEPOINT foo; SELECT handle_get(); COMMIT;),
+		qr/^$/,
+		qr/leaked AIO handle/);
+
+	# leak warning + error: released in different command (thus resowner)
+	psql_like(
+		$io_method,
+		$psql,
+		"handle_release() in different command",
+		qq(BEGIN; SELECT handle_get(); SELECT handle_release_last(); COMMIT;),
+		qr/^$/,
+		qr/leaked AIO handle.*release in unexpected state/ms);
+
+	# no leak, release in same command
+	psql_like(
+		$io_method,
+		$psql,
+		"handle_release() in same command",
+		qq(BEGIN; SELECT handle_get() UNION ALL SELECT handle_release_last(); COMMIT;),
+		qr/^$/,
+		qr/^$/);
+
+	# normal handle use
+	psql_like($io_method, $psql, "handle_get_release()",
+		qq(SELECT handle_get_release()),
+		qr/^$/, qr/^$/);
+
+	# should error out, API violation
+	psql_like(
+		$io_method,
+		$psql,
+		"handle_get_twice()",
+		qq(SELECT handle_get_twice()),
+		qr/^$/,
+		qr/ERROR:  API violation: Only one IO can be handed out$/);
+
+	# recover after error in implicit xact
+	psql_like(
+		$io_method,
+		$psql,
+		"handle error recovery in implicit xact",
+		qq(SELECT handle_get_and_error(); SELECT 'ok', handle_get_release()),
+		qr/^|ok$/,
+		qr/ERROR.*as you command/);
+
+	# recover after error in implicit xact
+	psql_like(
+		$io_method,
+		$psql,
+		"handle error recovery in explicit xact",
+		qq(BEGIN; SELECT handle_get_and_error(); SELECT handle_get_release(), 'ok'; COMMIT;),
+		qr/^|ok$/,
+		qr/ERROR.*as you command/);
+
+	# recover after error in subtrans
+	psql_like(
+		$io_method,
+		$psql,
+		"handle error recovery in explicit subxact",
+		qq(BEGIN; SAVEPOINT foo; SELECT handle_get_and_error(); ROLLBACK TO SAVEPOINT foo; SELECT handle_get_release(); ROLLBACK;),
+		qr/^|ok$/,
+		qr/ERROR.*as you command/);
+
+	$psql->quit();
+}
+
+# Sanity checks for the batchmode API
+sub test_batchmode
+{
+	my $io_method = shift;
+	my $node = shift;
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	# leak warning & recovery: implicit xact
+	psql_like(
+		$io_method,
+		$psql,
+		"batch_start() leak & cleanup in implicit xact",
+		qq(SELECT batch_start()),
+		qr/^$/,
+		qr/open AIO batch at end/,
+		"$io_method: leaky batch_start() warns");
+
+	# leak warning & recovery: explicit xact
+	psql_like(
+		$io_method,
+		$psql,
+		"batch_start() leak & cleanup in explicit xact",
+		qq(BEGIN; SELECT batch_start(); COMMIT;),
+		qr/^$/,
+		qr/open AIO batch at end/,
+		"$io_method: leaky batch_start() warns");
+
+
+	# leak warning & recovery: explicit xact, rollback
+	#
+	# XXX: This doesn't fail right now, due to not getting a chance to do
+	# something at transaction command commit. That's not a correctness issue,
+	# it just means it's a bit harder to find buggy code.
+	#psql_like($io_method, $psql,
+	#		  "batch_start() leak & cleanup after abort",
+	#		  qq(BEGIN; SELECT batch_start(); ROLLBACK;),
+	#		  qr/^$/,
+	#		  qr/open AIO batch at end/, "$io_method: leaky batch_start() warns");
+
+	# no warning, batch closed in same command
+	psql_like(
+		$io_method,
+		$psql,
+		"batch_start(), batch_end() works",
+		qq(SELECT batch_start() UNION ALL SELECT batch_end()),
+		qr/^$/,
+		qr/^$/,
+		"$io_method: batch_start(), batch_end()");
+
+	$psql->quit();
+}
+
+# Test that simple cases of invalid pages are reported
+sub test_io_error
+{
+	my $io_method = shift;
+	my $node = shift;
+	my ($ret, $output);
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	$psql->query_safe(
+		qq(
+CREATE TEMPORARY TABLE tmp_corr(data int not null);
+INSERT INTO tmp_corr SELECT generate_series(1, 10000);
+SELECT modify_rel_block('tmp_corr', 1, corrupt_header=>true);
+));
+
+	foreach my $tblname (qw(tbl_corr tmp_corr))
+	{
+		my $invalid_page_re =
+		  $tblname eq 'tbl_corr'
+		  ? qr/invalid page in block 1 of relation base\/\d+\/\d+/
+		  : qr/invalid page in block 1 of relation base\/\d+\/t\d+_\d+/;
+
+		# verify the error is reported in custom C code
+		psql_like(
+			$io_method,
+			$psql,
+			"read_rel_block_ll() of $tblname page",
+			qq(SELECT read_rel_block_ll('$tblname', 1)),
+			qr/^$/,
+			$invalid_page_re);
+
+		# verify the error is reported for bufmgr reads, seq scan
+		psql_like(
+			$io_method, $psql,
+			"sequential scan of $tblname block fails",
+			qq(SELECT count(*) FROM $tblname),
+			qr/^$/, $invalid_page_re);
+
+		# verify the error is reported for bufmgr reads, tid scan
+		psql_like(
+			$io_method,
+			$psql,
+			"tid scan of $tblname block fails",
+			qq(SELECT count(*) FROM $tblname WHERE ctid = '(1, 1)'),
+			qr/^$/,
+			$invalid_page_re);
+	}
+
+	$psql->quit();
+}
+
+# Test interplay between StartBufferIO and TerminateBufferIO
+sub test_startwait_io
+{
+	my $io_method = shift;
+	my $node = shift;
+	my ($ret, $output);
+
+	my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+	my $psql_b = $node->background_psql('postgres', on_error_stop => 0);
+
+
+	### Verify behavior for normal tables
+
+	# create a buffer we can play around with
+	my $buf_id = psql_like(
+		$io_method, $psql_a,
+		"creation of toy buffer succeeds",
+		qq(SELECT buffer_create_toy('tbl_ok', 1)),
+		qr/^\d+$/, qr/^$/);
+
+	# check that one backend can perform StartBufferIO
+	psql_like(
+		$io_method,
+		$psql_a,
+		"first StartBufferIO",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+		qr/^t$/,
+		qr/^$/);
+
+	# but not twice on the same buffer (non-waiting)
+	psql_like(
+		$io_method,
+		$psql_a,
+		"second StartBufferIO fails, same session",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+		qr/^f$/,
+		qr/^$/);
+	psql_like(
+		$io_method,
+		$psql_b,
+		"second StartBufferIO fails, other session",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+		qr/^f$/,
+		qr/^$/);
+
+	# start io in a different session, will block
+	query_wait_block(
+		$io_method,
+		$node,
+		$psql_b,
+		"blocking start buffer io",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+		"BufferIo");
+
+	# Terminate the IO, without marking it as success, this should trigger the
+	# waiting session to be able to start the io
+	psql_like(
+		$io_method,
+		$psql_a,
+		"blocking start buffer io, terminating io, not valid",
+		qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false)),
+		qr/^$/,
+		qr/^$/);
+
+
+	# Because the IO was terminated, but not marked as valid, second session should get the right to start io
+	pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/t/);
+	ok(1, "$io_method: blocking start buffer io, can start io");
+
+	# terminate the IO again
+	$psql_b->query_safe(
+		qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);)
+	);
+
+
+	# same as the above scenario, but mark IO as having succeeded
+	psql_like(
+		$io_method,
+		$psql_a,
+		"blocking buffer io w/ success: first start buffer io",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+		qr/^t$/,
+		qr/^$/);
+
+	# start io in a different session, will block
+	query_wait_block(
+		$io_method,
+		$node,
+		$psql_b,
+		"blocking start buffer io",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+		"BufferIo");
+
+	# Terminate the IO, marking it as success
+	psql_like(
+		$io_method,
+		$psql_a,
+		"blocking start buffer io, terminating io, valid",
+		qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false)),
+		qr/^$/,
+		qr/^$/);
+
+	# Because the IO was terminated, and marked as valid, second session should complete but not need io
+	pump_until($psql_b->{run}, $psql_b->{timeout}, \$psql_b->{stdout}, qr/f/);
+	ok(1, "$io_method: blocking start buffer io, no need to start io");
+
+	# buffer is valid now, make it invalid again
+	$psql_a->query_safe(qq(SELECT buffer_create_toy('tbl_ok', 1);));
+
+
+	### Verify behavior for temporary tables
+
+	# Can't unfortunately share the code with the normal table case, there are
+	# too many behavioral differences.
+
+	# create a buffer we can play around with
+	$psql_a->query_safe(
+		qq(
+CREATE TEMPORARY TABLE tmp_ok(data int not null);
+INSERT INTO tmp_ok SELECT generate_series(1, 10000);
+));
+	$buf_id = $psql_a->query_safe(qq(SELECT buffer_create_toy('tmp_ok', 3);));
+
+	# check that one backend can perform StartLocalBufferIO
+	psql_like(
+		$io_method,
+		$psql_a,
+		"first StartLocalBufferIO",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+		qr/^t$/,
+		qr/^$/);
+
+	# Because local buffers don't use IO_IN_PROGRESS, a second StartLocalBufer
+	# succeeds as well. This test mostly serves as a documentation of that
+	# fact. If we had actually started IO, it'd be different.
+	psql_like(
+		$io_method,
+		$psql_a,
+		"second StartLocalBufferIO succeeds, same session",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+		qr/^t$/,
+		qr/^$/);
+
+	# Terminate the IO again, without marking it as a success
+	$psql_a->query_safe(
+		qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>false, io_error=>false, release_aio=>false);)
+	);
+	psql_like(
+		$io_method,
+		$psql_a,
+		"StartLocalBufferIO after not marking valid succeeds, same session",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>true);),
+		qr/^t$/,
+		qr/^$/);
+
+	# Terminate the IO again, marking it as a success
+	$psql_a->query_safe(
+		qq(SELECT buffer_call_terminate_io($buf_id, for_input=>true, succeed=>true, io_error=>false, release_aio=>false);)
+	);
+
+	# Now another StartLocalBufferIO should fail, this time because the buffer
+	# is already valid.
+	psql_like(
+		$io_method,
+		$psql_a,
+		"StartLocalBufferIO after marking valid fails",
+		qq(SELECT buffer_call_start_io($buf_id, for_input=>true, nowait=>false);),
+		qr/^f$/,
+		qr/^$/);
+
+	$psql_a->quit();
+	$psql_b->quit();
+}
+
+# Test that if the backend issuing a read doesn't wait for the IO's
+# completion, another backend can complete the IO
+sub test_complete_foreign
+{
+	my $io_method = shift;
+	my $node = shift;
+	my ($ret, $output);
+
+	my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+	my $psql_b = $node->background_psql('postgres', on_error_stop => 0);
+
+	# Issue IO without waiting for completion, then sleep
+	$psql_a->query_safe(
+		qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);));
+
+	# Check that another backend can read the relevant block
+	psql_like(
+		$io_method,
+		$psql_b,
+		"completing read started by sleeping backend",
+		qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1),
+		qr/^1$/,
+		qr/^$/);
+
+	# Issue IO without waiting for completion, then exit.
+	$psql_a->query_safe(
+		qq(SELECT read_rel_block_ll('tbl_ok', 1, wait_complete=>false);));
+	$psql_a->reconnect_and_clear();
+
+	# Check that another backend can read the relevant block. This verifies
+	# that the exiting backend left the AIO in a sane state.
+	psql_like(
+		$io_method,
+		$psql_b,
+		"read buffer started by exited backend",
+		qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1),
+		qr/^1$/,
+		qr/^$/);
+
+	# Read a tbl_corr block, then sleep. The other session will retry the IO
+	# and also fail. The easiest thing to verify that seems to be to check
+	# that both are in the log.
+	my $log_location = -s $node->logfile;
+	$psql_a->query_safe(
+		qq(SELECT read_rel_block_ll('tbl_corr', 1, wait_complete=>false);));
+
+	psql_like(
+		$io_method,
+		$psql_b,
+		"completing read of tbl_corr block started by other backend",
+		qq(SELECT count(*) FROM tbl_corr WHERE ctid = '(1,1)' LIMIT 1),
+		qr/^$/,
+		qr/invalid page in block/);
+
+	# The log message issued for the read_rel_block_ll() should be logged as a LOG
+	$node->wait_for_log(qr/LOG[^\n]+invalid page in/, $log_location);
+	ok(1,
+		"$io_method: completing read of tbl_corr block started by other backend: LOG message for background read"
+	);
+
+	# But for the SELECT, it should be an ERROR
+	$log_location =
+	  $node->wait_for_log(qr/ERROR[^\n]+invalid page in/, $log_location);
+	ok(1,
+		"$io_method: completing read of tbl_corr block started by other backend: ERROR message for foreground read"
+	);
+
+	$psql_a->quit();
+	$psql_b->quit();
+}
+
+# Test that we deal correctly with FDs being closed while IO is in progress
+sub test_close_fd
+{
+	my $io_method = shift;
+	my $node = shift;
+	my ($ret, $output);
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	psql_like(
+		$io_method,
+		$psql,
+		"close all FDs after read, waiting for results",
+		qq(
+			SELECT read_rel_block_ll('tbl_ok', 1,
+				wait_complete=>true,
+				batchmode_enter=>true,
+				smgrreleaseall=>true,
+				batchmode_exit=>true
+			);),
+		qr/^$/,
+		qr/^$/);
+
+	psql_like(
+		$io_method,
+		$psql,
+		"close all FDs after read, no waiting",
+		qq(
+			SELECT read_rel_block_ll('tbl_ok', 1,
+				wait_complete=>false,
+				batchmode_enter=>true,
+				smgrreleaseall=>true,
+				batchmode_exit=>true
+			);),
+		qr/^$/,
+		qr/^$/);
+
+	# Check that another backend can read the relevant block
+	psql_like(
+		$io_method,
+		$psql,
+		"close all FDs after read, no waiting, query works",
+		qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(1,1)' LIMIT 1),
+		qr/^1$/,
+		qr/^$/);
+
+	$psql->quit();
+}
+
+# Tests using injection points. Mostly to exercise had IO errors that are
+# hard to trigger without using injection points.
+sub test_inject
+{
+	my $io_method = shift;
+	my $node = shift;
+	my ($ret, $output);
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	# injected what we'd expect
+	$psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);));
+	$psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);));
+	psql_like(
+		$io_method, $psql,
+		"injection point not triggering failure",
+		qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'),
+		qr/^1$/, qr/^$/);
+
+	# injected a read shorter than a single block, expecting error
+	$psql->query_safe(qq(SELECT inj_io_short_read_attach(17);));
+	$psql->query_safe(qq(SELECT invalidate_rel_block('tbl_ok', 2);));
+	psql_like(
+		$io_method,
+		$psql,
+		"single block short read fails",
+		qq(SELECT count(*) FROM tbl_ok WHERE ctid = '(2, 1)'),
+		qr/^$/,
+		qr/ERROR:.*could not read blocks 2\.\.2 in file "base\/.*": read only 0 of 8192 bytes/
+	);
+
+	# shorten multi-block read to a single block, should retry
+	my $inval_query = qq(SELECT invalidate_rel_block('tbl_ok', 0);
+SELECT invalidate_rel_block('tbl_ok', 1);
+SELECT invalidate_rel_block('tbl_ok', 2);
+SELECT invalidate_rel_block('tbl_ok', 3);
+/* gap */
+SELECT invalidate_rel_block('tbl_ok', 5);
+SELECT invalidate_rel_block('tbl_ok', 6);
+SELECT invalidate_rel_block('tbl_ok', 7);
+SELECT invalidate_rel_block('tbl_ok', 8););
+
+	$psql->query_safe($inval_query);
+	$psql->query_safe(qq(SELECT inj_io_short_read_attach(8192);));
+	psql_like(
+		$io_method, $psql,
+		"multi block short read (1 block) is retried",
+		qq(SELECT count(*) FROM tbl_ok),
+		qr/^10000$/, qr/^$/);
+
+	# shorten multi-block read to two blocks, should retry
+	$psql->query_safe($inval_query);
+	$psql->query_safe(qq(SELECT inj_io_short_read_attach(8192*2);));
+
+	psql_like(
+		$io_method, $psql,
+		"multi block short read (2 blocks) is retried",
+		qq(SELECT count(*) FROM tbl_ok),
+		qr/^10000$/, qr/^$/);
+
+	# verify that page verification errors are detected even as part of a
+	# shortened multi-block read (tbl_corr, block 1 is corrupted)
+	$psql->query_safe(
+		qq(
+SELECT invalidate_rel_block('tbl_corr', 0);
+SELECT invalidate_rel_block('tbl_corr', 1);
+SELECT invalidate_rel_block('tbl_corr', 2);
+SELECT inj_io_short_read_attach(8192);
+	));
+
+	psql_like(
+		$io_method,
+		$psql,
+		"shortened multi-block read detects invalid page",
+		qq(SELECT count(*) FROM tbl_corr WHERE ctid < '(2, 1)'),
+		qr/^$/,
+		qr/ERROR:.*invalid page in block 1 of relation base\/.*/);
+
+	# trigger a hard error, should error out
+	$psql->query_safe(
+		qq(
+SELECT inj_io_short_read_attach(-errno_from_string('EIO'));
+SELECT invalidate_rel_block('tbl_ok', 2);
+	));
+
+	psql_like(
+		$io_method,
+		$psql,
+		"first hard IO error is reported",
+		qq(SELECT count(*) FROM tbl_ok),
+		qr/^$/,
+		qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/
+	);
+
+	psql_like(
+		$io_method,
+		$psql,
+		"second hard IO error is reported",
+		qq(SELECT count(*) FROM tbl_ok),
+		qr/^$/,
+		qr/ERROR:.*could not read blocks 2\.\.2 in file \"base\/.*\": Input\/output error/
+	);
+
+	$psql->query_safe(qq(SELECT inj_io_short_read_detach()));
+
+	# now the IO should be ok.
+	psql_like(
+		$io_method, $psql,
+		"recovers after hard error",
+		qq(SELECT count(*) FROM tbl_ok),
+		qr/^10000$/, qr/^$/);
+
+	$psql->quit();
+}
+
+# Tests using injection points, only for io_method=worker.
+#
+# io_method=worker has the special case of needing to reopen files. That can
+# in theory fail, because the file could be gone. That's a hard path to test
+# for real, so we use an injection point to trigger it.
+sub test_inject_worker
+{
+	my $io_method = shift;
+	my $node = shift;
+	my ($ret, $output);
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	# trigger a failure to reopen, should error out, but should recover
+	$psql->query_safe(
+		qq(
+SELECT inj_io_reopen_attach();
+SELECT invalidate_rel_block('tbl_ok', 1);
+	));
+
+	psql_like(
+		$io_method,
+		$psql,
+		"failure to open: detected",
+		qq(SELECT count(*) FROM tbl_ok),
+		qr/^$/,
+		qr/ERROR:.*could not read blocks 1\.\.1 in file "base\/.*": No such file or directory/
+	);
+
+	$psql->query_safe(qq(SELECT inj_io_reopen_detach();));
+
+	# check that we indeed recover
+	psql_like(
+		$io_method, $psql,
+		"failure to open: recovers",
+		qq(SELECT count(*) FROM tbl_ok),
+		qr/^10000$/, qr/^$/);
+
+	$psql->quit();
+}
+
+# Verify that we handle a relation getting removed (due to a rollback or a
+# DROP TABLE) while IO is ongoing for that table.
+sub test_invalidate
+{
+	my $io_method = shift;
+	my $node = shift;
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	foreach my $persistency (qw(normal unlogged temporary))
+	{
+		my $sql_persistency = $persistency eq 'normal' ? '' : $persistency;
+		my $tblname = $persistency . '_transactional';
+
+		my $create_sql = qq(
+CREATE $sql_persistency TABLE $tblname (id int not null, data text not null) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO $tblname(id, data) SELECT generate_series(1, 10000) as id, repeat('a', 200);
+);
+
+		# Verify that outstanding read IO does not cause problems with
+		# AbortTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ...
+		# -> Invalidate[Local]Buffer.
+		$psql->query_safe("BEGIN; $create_sql;");
+		$psql->query_safe(
+			qq(
+SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false);
+));
+		psql_like(
+			$io_method,
+			$psql,
+			"rollback of newly created $persistency table with outstanding IO",
+			qq(ROLLBACK),
+			qr/^$/,
+			qr/^$/);
+
+		# Verify that outstanding read IO does not cause problems with
+		# CommitTransaction -> smgrDoPendingDeletes -> smgrdounlinkall -> ...
+		# -> Invalidate[Local]Buffer.
+		$psql->query_safe("BEGIN; $create_sql; COMMIT;");
+		$psql->query_safe(
+			qq(
+BEGIN;
+SELECT read_rel_block_ll('$tblname', 1, wait_complete=>false);
+));
+
+		psql_like(
+			$io_method, $psql,
+			"drop $persistency table with outstanding IO",
+			qq(DROP TABLE $tblname),
+			qr/^$/, qr/^$/);
+
+		psql_like($io_method, $psql,
+			"commit after drop $persistency table with outstanding IO",
+			qq(COMMIT), qr/^$/, qr/^$/);
+	}
+
+	$psql->quit();
+}
+
+# Test behavior related to ZERO_ON_ERROR and zero_damaged_pages
+sub test_zero
+{
+	my $io_method = shift;
+	my $node = shift;
+
+	my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+
+	foreach my $persistency (qw(normal temporary))
+	{
+		my $sql_persistency = $persistency eq 'normal' ? '' : $persistency;
+
+		$psql_a->query_safe(
+			qq(
+CREATE $sql_persistency TABLE tbl_zero(id int) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_zero SELECT generate_series(1, 10000);
+));
+
+		$psql_a->query_safe(
+			qq(
+SELECT modify_rel_block('tbl_zero', 0, corrupt_header=>true);
+));
+
+		# Check that page validity errors are detected
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: test reading of invalid block 0",
+			qq(
+SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)),
+			qr/^$/,
+			qr/^psql:<stdin>:\d+: ERROR:  invalid page in block 0 of relation base\/.*\/.*$/
+		);
+
+		# Check that page validity errors are zeroed
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: test zeroing of invalid block 0",
+			qq(
+SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>true)),
+			qr/^$/,
+			qr/^psql:<stdin>:\d+: WARNING:  invalid page in block 0 of relation base\/.*\/.*; zeroing out page$/
+		);
+
+		# And that once the corruption is fixed, we can read again
+		$psql_a->query(
+			qq(
+SELECT modify_rel_block('tbl_zero', 0, zero=>true);
+));
+		$psql_a->{stderr} = '';
+
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: test re-read of block 0",
+			qq(
+SELECT read_rel_block_ll('tbl_zero', 0, zero_on_error=>false)),
+			qr/^$/,
+			qr/^$/);
+
+		# Check a page validity error in another block, to ensure we report
+		# the correct block number
+		$psql_a->query_safe(
+			qq(
+SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true);
+));
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: test zeroing of invalid block 3",
+			qq(SELECT read_rel_block_ll('tbl_zero', 3, zero_on_error=>true);),
+			qr/^$/,
+			qr/^psql:<stdin>:\d+: WARNING:  invalid page in block 3 of relation base\/.*\/.*; zeroing out page$/
+		);
+
+
+		# Check a page validity error in another block, to ensure we report
+		# the correct block number
+		$psql_a->query_safe(
+			qq(
+SELECT modify_rel_block('tbl_zero', 2, corrupt_header=>true);
+SELECT modify_rel_block('tbl_zero', 3, corrupt_header=>true);
+));
+		# First test error
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: test reading of invalid block 2,3 in larger read",
+			qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false)),
+			qr/^$/,
+			qr/^psql:<stdin>:\d+: ERROR:  2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL:  Block 2 held first invalid page\.\nHINT:[^\n]+$/
+		);
+
+		# Then test zeroing via ZERO_ON_ERROR flag
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: test zeroing of invalid block 2,3 in larger read, ZERO_ON_ERROR",
+			qq(SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>true)),
+			qr/^$/,
+			qr/^psql:<stdin>:\d+: WARNING:  zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL:  Block 2 held first zeroed page\.\nHINT:[^\n]+$/
+		);
+
+		# Then test zeroing vio zero_damaged_pages
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: test zeroing of invalid block 2,3 in larger read, zero_damaged_pages",
+			qq(
+BEGIN;
+SET LOCAL zero_damaged_pages = true;
+SELECT read_rel_block_ll('tbl_zero', 1, nblocks=>4, zero_on_error=>false)
+COMMIT;
+),
+			qr/^$/,
+			qr/^psql:<stdin>:\d+: WARNING:  zeroing out 2 invalid pages among blocks 1..4 of relation base\/.*\/.*\nDETAIL:  Block 2 held first zeroed page\.\nHINT:[^\n]+$/
+		);
+
+		$psql_a->query_safe(qq(COMMIT));
+
+
+		# Verify that bufmgr.c IO detects page validity errors
+		$psql_a->query(
+			qq(
+SELECT invalidate_rel_block('tbl_zero', g.i)
+FROM generate_series(0, 15) g(i);
+SELECT modify_rel_block('tbl_zero', 3, zero=>true);
+));
+		$psql_a->{stderr} = '';
+
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: verify reading zero_damaged_pages=off",
+			qq(
+SELECT count(*) FROM tbl_zero),
+			qr/^$/,
+			qr/^psql:<stdin>:\d+: ERROR:  invalid page in block 2 of relation base\/.*\/.*$/
+		);
+
+		# Verify that bufmgr.c IO zeroes out pages with page validity errors
+		psql_like(
+			$io_method,
+			$psql_a,
+			"$persistency: verify zero_damaged_pages=on",
+			qq(
+BEGIN;
+SET LOCAL zero_damaged_pages = true;
+SELECT count(*) FROM tbl_zero;
+COMMIT;
+),
+			qr/^\d+$/,
+			qr/^psql:<stdin>:\d+: WARNING:  invalid page in block 2 of relation base\/.*\/.*$/
+		);
+
+
+		# FIXME: Test that if one session triggers a read but doesn't complete
+		# it, the completing session doesn't see the WARNING.
+
+		# Clean up
+		$psql_a->query_safe(
+			qq(
+DROP TABLE tbl_zero;
+));
+	}
+
+	$psql_a->{stderr} = '';
+
+	$psql_a->quit();
+}
+
+# Test that we detect checksum failures and report them
+sub test_checksum
+{
+	my $io_method = shift;
+	my $node = shift;
+
+	my $psql_a = $node->background_psql('postgres', on_error_stop => 0);
+
+	$psql_a->query_safe(
+		qq(
+CREATE TABLE tbl_normal(id int) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_normal SELECT generate_series(1, 5000);
+SELECT modify_rel_block('tbl_normal', 3, corrupt_checksum=>true);
+
+CREATE TEMPORARY TABLE tbl_temp(id int) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_temp SELECT generate_series(1, 5000);
+SELECT modify_rel_block('tbl_temp', 3, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_temp', 4, corrupt_checksum=>true);
+));
+
+	# To be able to test checksum failures on shared rels we need a shared rel
+	# with invalid pages - which is a bit scary. pg_shseclabel seems like a
+	# good bet, as it's not accessed in a default configuration.
+	$psql_a->query_safe(
+		qq(
+SELECT grow_rel('pg_shseclabel', 4);
+SELECT modify_rel_block('pg_shseclabel', 2, corrupt_checksum=>true);
+SELECT modify_rel_block('pg_shseclabel', 3, corrupt_checksum=>true);
+));
+
+
+	# Check that page validity errors are detected, checksums stats increase, normal rel
+	my ($cs_count_before, $cs_ts_before) =
+	  checksum_failures($psql_a, 'postgres');
+	psql_like(
+		$io_method,
+		$psql_a,
+		"normal rel: test reading of invalid block 3",
+		qq(
+SELECT read_rel_block_ll('tbl_normal', 3, nblocks=>1, zero_on_error=>false);),
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: ERROR:  invalid page in block 3 of relation base\/\d+\/\d+$/
+	);
+
+	my ($cs_count_after, $cs_ts_after) =
+	  checksum_failures($psql_a, 'postgres');
+
+	cmp_ok($cs_count_before + 1,
+		'<=', $cs_count_after,
+		"$io_method: normal rel: checksum count increased");
+	cmp_ok($cs_ts_after, 'ne', '',
+		"$io_method: normal rel: checksum timestamp is not null");
+
+
+	# Check that page validity errors are detected, checksums stats increase, temp rel
+	($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres');
+	psql_like(
+		$io_method,
+		$psql_a,
+		"temp rel: test reading of invalid block 4, valid block 5",
+		qq(
+SELECT read_rel_block_ll('tbl_temp', 4, nblocks=>2, zero_on_error=>false);),
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: ERROR:  invalid page in block 4 of relation base\/\d+\/t\d+_\d+$/
+	);
+
+	($cs_count_after, $cs_ts_after) = checksum_failures($psql_a, 'postgres');
+
+	cmp_ok($cs_count_before + 1,
+		'<=', $cs_count_after,
+		"$io_method: temp rel: checksum count increased");
+	cmp_ok($cs_ts_after, 'ne', '',
+		"$io_method: temp rel: checksum timestamp is not null");
+
+
+	# Check that page validity errors are detected, checksums stats increase, shared rel
+	($cs_count_before, $cs_ts_after) = checksum_failures($psql_a);
+	psql_like(
+		$io_method,
+		$psql_a,
+		"shared rel: reading of invalid blocks 2+3",
+		qq(
+SELECT read_rel_block_ll('pg_shseclabel', 2, nblocks=>2, zero_on_error=>false);),
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: ERROR:  2 invalid pages among blocks 2..3 of relation global\/\d+\nDETAIL:  Block 2 held first invalid page\.\nHINT:[^\n]+$/
+	);
+
+	($cs_count_after, $cs_ts_after) = checksum_failures($psql_a);
+
+	cmp_ok($cs_count_before + 1,
+		'<=', $cs_count_after,
+		"$io_method: shared rel: checksum count increased");
+	cmp_ok($cs_ts_after, 'ne', '',
+		"$io_method: shared rel: checksum timestamp is not null");
+
+
+	# and restore sanity
+	$psql_a->query(
+		qq(
+SELECT modify_rel_block('pg_shseclabel', 1, zero=>true);
+DROP TABLE tbl_normal;
+));
+	$psql_a->{stderr} = '';
+
+	$psql_a->quit();
+}
+
+# Verify checksum handling when creating database from an invalid database.
+# This also serves as a minimal check that cross-database IO is handled
+# reasonably.
+sub test_checksum_createdb
+{
+	my $io_method = shift;
+	my $node = shift;
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	$node->safe_psql('postgres',
+		'CREATE DATABASE regression_createdb_source');
+
+	$node->safe_psql(
+		'regression_createdb_source', qq(
+CREATE EXTENSION test_aio;
+CREATE TABLE tbl_cs_fail(data int not null) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_cs_fail SELECT generate_series(1, 1000);
+SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true);
+));
+
+	my $createdb_sql = qq(
+CREATE DATABASE regression_createdb_target
+TEMPLATE regression_createdb_source
+STRATEGY wal_log;
+);
+
+	# Verify that CREATE DATABASE of an invalid database fails and is
+	# accounted for accurately.
+	my ($cs_count_before, $cs_ts_before) =
+	  checksum_failures($psql, 'regression_createdb_source');
+	psql_like(
+		$io_method,
+		$psql,
+		"create database w/ wal strategy, invalid source",
+		$createdb_sql,
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: ERROR:  invalid page in block 1 of relation base\/\d+\/\d+$/
+	);
+	my ($cs_count_after, $cs_ts_after) =
+	  checksum_failures($psql, 'regression_createdb_source');
+	cmp_ok($cs_count_before + 1, '<=', $cs_count_after,
+		"$io_method: create database w/ wal strategy, invalid source: checksum count increased"
+	);
+
+	# Verify that CREATE DATABASE of the fixed database succeeds.
+	$node->safe_psql(
+		'regression_createdb_source', qq(
+SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true);
+));
+	psql_like($io_method, $psql,
+		"create database w/ wal strategy, valid source",
+		$createdb_sql, qr/^$/, qr/^$/);
+
+	$psql->quit();
+}
+
+# Test that we detect checksum failures and report them
+sub test_ignore_checksum
+{
+	my $io_method = shift;
+	my $node = shift;
+
+	my $psql = $node->background_psql('postgres', on_error_stop => 0);
+
+	$psql->query_safe(
+		qq(
+CREATE TABLE tbl_cs_fail(id int) WITH (AUTOVACUUM_ENABLED = false);
+INSERT INTO tbl_cs_fail SELECT generate_series(1, 10000);
+));
+
+	my $count_sql = "SELECT count(*) FROM tbl_cs_fail";
+	my $invalidate_sql = qq(
+SELECT invalidate_rel_block('tbl_cs_fail', g.i)
+FROM generate_series(0, 6) g(i);
+);
+
+	my $expect = $psql->query_safe($count_sql);
+
+	$psql->query_safe(
+		qq(
+SELECT modify_rel_block('tbl_cs_fail', 1, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 6, corrupt_checksum=>true);
+));
+
+	$psql->query_safe($invalidate_sql);
+	psql_like($io_method, $psql,
+		"ignore_checksum_failure=off fails",
+		$count_sql, qr/^$/, qr/ERROR:  invalid page in block/);
+
+	$psql->query_safe("SET ignore_checksum_failure=on");
+
+	$psql->query_safe($invalidate_sql);
+	psql_like($io_method, $psql,
+			  "ignore_checksum_failure=on succeeds",
+			  $count_sql,
+			  qr/^$expect$/,
+			  qr/WARNING:  ignoring \d checksum failures among blocks/);
+
+
+	$psql->query_safe(
+		qq(
+SELECT modify_rel_block('tbl_cs_fail', 2, zero=>true);
+SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true);
+));
+
+	my $log_location = -s $node->logfile;
+	psql_like(
+		$io_method,
+		$psql,
+		"test reading of checksum failed block 3, with ignore",
+		qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);),
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: WARNING:  ignoring checksum failure in block 3/
+	);
+
+	$log_location =
+	  $node->wait_for_log(qr/LOG:  ignoring checksum failure/, $log_location);
+
+	psql_like(
+		$io_method,
+		$psql,
+		"test reading of valid block 2, checksum failed 3, invalid 4, zero=false",
+		qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 2, nblocks=>3, zero_on_error=>false);),
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: ERROR:  invalid page in block 4 of relation base\/\d+\/\d+$/
+	);
+
+	$psql->query(
+		qq(
+SELECT modify_rel_block('tbl_cs_fail', 1, zero=>true);
+SELECT modify_rel_block('tbl_cs_fail', 2, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true);
+SELECT modify_rel_block('tbl_cs_fail', 4, corrupt_header=>true);
+SELECT modify_rel_block('tbl_cs_fail', 5, corrupt_header=>true);
+SELECT modify_rel_block('tbl_cs_fail', 6, corrupt_header=>true);
+));
+	$psql->{stderr} = '';
+
+	$log_location = -s $node->logfile;
+	psql_like(
+		$io_method,
+		$psql,
+		"test reading of valid block 1, checksum failed 2-3, invalid 4-6, zero=true",
+		qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 1, nblocks=>6, zero_on_error=>true);),
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: WARNING:  zeroing 3 page\(s\) and ignoring 2 checksum failure\(s\) among blocks 1..6 of relation/
+	);
+
+	# FIXME: "register" as tests, add more similar tests and explain why we want this.
+	$node->wait_for_log(qr/LOG:  ignoring checksum failure in block 2/,
+						$log_location);
+	$node->wait_for_log(qr/LOG:  ignoring checksum failure in block 3/,
+						$log_location);
+	$node->wait_for_log(qr/LOG:  invalid page in block 4 of relation base.*; zeroing out page/,
+						$log_location);
+	$node->wait_for_log(qr/LOG:  invalid page in block 5 of relation base.*; zeroing out page/,
+						$log_location);
+	$node->wait_for_log(qr/LOG:  invalid page in block 6 of relation base.*; zeroing out page/,
+						$log_location);
+
+	$psql->query(
+		qq(
+SELECT modify_rel_block('tbl_cs_fail', 3, corrupt_checksum=>true, corrupt_header=>true);
+));
+	$psql->{stderr} = '';
+
+	# Reading a page with both an invalid header and an invalid checksum
+	psql_like(
+		$io_method,
+		$psql,
+		"test reading of block with both invalid header and invalid checksum, zero=false",
+		qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>false);),
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: ERROR:  invalid page in block 3 of relation/
+	);
+
+	psql_like(
+		$io_method,
+		$psql,
+		"test reading of block 3 with both invalid header and invalid checksum, zero=true",
+		qq(
+SELECT read_rel_block_ll('tbl_cs_fail', 3, nblocks=>1, zero_on_error=>true);),
+		qr/^$/,
+		qr/^psql:<stdin>:\d+: WARNING:  invalid page in block 3 of relation base\/.*; zeroing out page/
+	);
+
+
+	$psql->quit();
+}
+
+
+# Run all tests that are supported for all io_methods
+sub test_generic
+{
+	my $io_method = shift;
+	my $node = shift;
+
+	is($node->safe_psql('postgres', 'SHOW io_method'),
+		$io_method, "$io_method: io_method set correctly");
+
+	$node->safe_psql(
+		'postgres', qq(
+CREATE EXTENSION test_aio;
+CREATE TABLE tbl_corr(data int not null) WITH (AUTOVACUUM_ENABLED = false);
+CREATE TABLE tbl_ok(data int not null) WITH (AUTOVACUUM_ENABLED = false);
+
+INSERT INTO tbl_corr SELECT generate_series(1, 10000);
+INSERT INTO tbl_ok SELECT generate_series(1, 10000);
+SELECT grow_rel('tbl_corr', 16);
+SELECT grow_rel('tbl_ok', 16);
+
+SELECT modify_rel_block('tbl_corr', 1, corrupt_header=>true);
+CHECKPOINT;
+));
+
+	test_handle($io_method, $node);
+	test_io_error($io_method, $node);
+	test_batchmode($io_method, $node);
+	test_startwait_io($io_method, $node);
+	test_complete_foreign($io_method, $node);
+	test_close_fd($io_method, $node);
+	test_invalidate($io_method, $node);
+	test_zero($io_method, $node);
+	test_checksum($io_method, $node);
+	test_ignore_checksum($io_method, $node);
+	test_checksum_createdb($io_method, $node);
+
+  SKIP:
+	{
+		skip 'Injection points not supported by this build', 1
+		  unless $ENV{enable_injection_points} eq 'yes';
+		test_inject($io_method, $node);
+	}
+}
diff --git a/src/test/modules/test_aio/t/002_io_workers.pl b/src/test/modules/test_aio/t/002_io_workers.pl
new file mode 100644
index 00000000000..af5fae15ea7
--- /dev/null
+++ b/src/test/modules/test_aio/t/002_io_workers.pl
@@ -0,0 +1,125 @@
+# Copyright (c) 2025, PostgreSQL Global Development Group
+
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+use List::Util qw(shuffle);
+
+
+my $node = PostgreSQL::Test::Cluster->new('worker');
+$node->init();
+$node->append_conf(
+	'postgresql.conf', qq(
+io_method=worker
+));
+
+$node->start();
+
+# Test changing the number of I/O worker processes while also evaluating the
+# handling of their termination.
+test_number_of_io_workers_dynamic($node);
+
+$node->stop();
+
+done_testing();
+
+
+sub test_number_of_io_workers_dynamic
+{
+	my $node = shift;
+
+	my $prev_worker_count = $node->safe_psql('postgres', 'SHOW io_workers');
+
+	# Verify that worker count can't be set to 0
+	change_number_of_io_workers($node, 0, $prev_worker_count, 1);
+
+	# Verify that worker count can't be set to 33 (above the max)
+	change_number_of_io_workers($node, 33, $prev_worker_count, 1);
+
+	# Try changing IO workers to a random value and verify that the worker
+	# count ends up as expected. Always test the min/max of workers.
+	#
+	# Valid range for io_workers is [1, 32]. 8 tests in total seems
+	# reasonable.
+	my @io_workers_range = shuffle(1 ... 32);
+	foreach my $worker_count (1, 32, @io_workers_range[ 0, 6 ])
+	{
+		$prev_worker_count =
+		  change_number_of_io_workers($node, $worker_count,
+			$prev_worker_count, 0);
+	}
+}
+
+sub change_number_of_io_workers
+{
+	my $node = shift;
+	my $worker_count = shift;
+	my $prev_worker_count = shift;
+	my $expect_failure = shift;
+	my ($result, $stdout, $stderr);
+
+	($result, $stdout, $stderr) =
+	  $node->psql('postgres', "ALTER SYSTEM SET io_workers = $worker_count");
+	$node->safe_psql('postgres', 'SELECT pg_reload_conf()');
+
+	if ($expect_failure)
+	{
+		ok( $stderr =~
+			  /$worker_count is outside the valid range for parameter "io_workers"/,
+			"updating number of io_workers to $worker_count failed, as expected"
+		);
+
+		return $prev_worker_count;
+	}
+	else
+	{
+		is( $node->safe_psql('postgres', 'SHOW io_workers'),
+			$worker_count,
+			"updating number of io_workers from $prev_worker_count to $worker_count"
+		);
+
+		check_io_worker_count($node, $worker_count);
+		terminate_io_worker($node, $worker_count);
+		check_io_worker_count($node, $worker_count);
+
+		return $worker_count;
+	}
+}
+
+sub terminate_io_worker
+{
+	my $node = shift;
+	my $worker_count = shift;
+	my ($pid, $ret);
+
+	# Select a random io worker
+	$pid = $node->safe_psql(
+		'postgres',
+		qq(SELECT pid FROM pg_stat_activity WHERE
+			backend_type = 'io worker' ORDER BY RANDOM() LIMIT 1));
+
+	# terminate IO worker with SIGINT
+	is(PostgreSQL::Test::Utils::system_log('pg_ctl', 'kill', 'INT', $pid),
+		0, "random io worker process signalled with INT");
+
+	# Check that worker exits
+	ok( $node->poll_query_until(
+			'postgres',
+			qq(SELECT COUNT(*) FROM pg_stat_activity WHERE pid = $pid), '0'),
+		"random io worker process exited after signal");
+}
+
+sub check_io_worker_count
+{
+	my $node = shift;
+	my $worker_count = shift;
+
+	ok( $node->poll_query_until(
+			'postgres',
+			qq(SELECT COUNT(*) FROM pg_stat_activity WHERE backend_type = 'io worker'),
+			$worker_count),
+		"io worker count is $worker_count");
+}
diff --git a/src/test/modules/test_aio/test_aio--1.0.sql b/src/test/modules/test_aio/test_aio--1.0.sql
new file mode 100644
index 00000000000..e495481c41e
--- /dev/null
+++ b/src/test/modules/test_aio/test_aio--1.0.sql
@@ -0,0 +1,108 @@
+/* src/test/modules/test_aio/test_aio--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION test_aio" to load this file. \quit
+
+
+CREATE FUNCTION errno_from_string(sym text)
+RETURNS pg_catalog.int4 STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+
+CREATE FUNCTION grow_rel(rel regclass, nblocks int)
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+
+CREATE FUNCTION modify_rel_block(rel regclass, blockno int,
+  zero bool DEFAULT false,
+  corrupt_header bool DEFAULT false,
+  corrupt_checksum bool DEFAULT false)
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION read_rel_block_ll(
+    rel regclass,
+    blockno int,
+    nblocks int DEFAULT 1,
+    wait_complete bool DEFAULT true,
+    batchmode_enter bool DEFAULT false,
+    smgrreleaseall bool DEFAULT false,
+    batchmode_exit bool DEFAULT false,
+    zero_on_error bool DEFAULT false)
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION invalidate_rel_block(rel regclass, blockno int)
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION buffer_create_toy(rel regclass, blockno int4)
+RETURNS pg_catalog.int4 STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION buffer_call_start_io(buffer int, for_input bool, nowait bool)
+RETURNS pg_catalog.bool STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION buffer_call_terminate_io(buffer int, for_input bool, succeed bool, io_error bool, release_aio bool)
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+
+
+/*
+ * Handle related functions
+ */
+CREATE FUNCTION handle_get_and_error()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION handle_get_twice()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION handle_get()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION handle_get_release()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION handle_release_last()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+
+/*
+ * Batchmode related functions
+ */
+CREATE FUNCTION batch_start()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION batch_end()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+
+
+/*
+ * Injection point related functions
+ */
+CREATE FUNCTION inj_io_short_read_attach(result int)
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION inj_io_short_read_detach()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION inj_io_reopen_attach()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
+
+CREATE FUNCTION inj_io_reopen_detach()
+RETURNS pg_catalog.void STRICT
+AS 'MODULE_PATHNAME' LANGUAGE C;
diff --git a/src/test/modules/test_aio/test_aio.c b/src/test/modules/test_aio/test_aio.c
new file mode 100644
index 00000000000..35c6b331923
--- /dev/null
+++ b/src/test/modules/test_aio/test_aio.c
@@ -0,0 +1,802 @@
+/*-------------------------------------------------------------------------
+ *
+ * test_aio.c
+ *		Helpers to write tests for AIO
+ *
+ * This module provides interface functions for C functionality to SQL, to
+ * make it possible to test AIO related behavior in a targeted way from SQL.
+ * It'd not generally be safe to export these functions to SQL, but for a test
+ * that's fine.
+ *
+ * Copyright (c) 2020-2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  src/test/modules/test_aio/test_aio.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/relation.h"
+#include "fmgr.h"
+#include "storage/aio.h"
+#include "storage/aio_internal.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/checksum.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "utils/builtins.h"
+#include "utils/injection_point.h"
+#include "utils/rel.h"
+
+
+PG_MODULE_MAGIC;
+
+
+typedef struct InjIoErrorState
+{
+	bool		enabled_short_read;
+	bool		enabled_reopen;
+
+	bool		short_read_result_set;
+	int			short_read_result;
+}			InjIoErrorState;
+
+static InjIoErrorState * inj_io_error_state;
+
+/* Shared memory init callbacks */
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
+static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
+
+
+static PgAioHandle *last_handle;
+
+
+
+static void
+test_aio_shmem_request(void)
+{
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+
+	RequestAddinShmemSpace(sizeof(InjIoErrorState));
+}
+
+static void
+test_aio_shmem_startup(void)
+{
+	bool		found;
+
+	if (prev_shmem_startup_hook)
+		prev_shmem_startup_hook();
+
+	/* Create or attach to the shared memory state */
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+	inj_io_error_state = ShmemInitStruct("injection_points",
+										 sizeof(InjIoErrorState),
+										 &found);
+
+	if (!found)
+	{
+		/*
+		 * First time through, so initialize.  This is shared with the dynamic
+		 * initialization using a DSM.
+		 */
+		inj_io_error_state->enabled_short_read = false;
+		inj_io_error_state->enabled_reopen = false;
+
+#ifdef USE_INJECTION_POINTS
+		InjectionPointAttach("AIO_PROCESS_COMPLETION_BEFORE_SHARED",
+							 "test_aio",
+							 "inj_io_short_read",
+							 NULL,
+							 0);
+		InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED");
+
+		InjectionPointAttach("AIO_WORKER_AFTER_REOPEN",
+							 "test_aio",
+							 "inj_io_reopen",
+							 NULL,
+							 0);
+		InjectionPointLoad("AIO_WORKER_AFTER_REOPEN");
+
+#endif
+	}
+	else
+	{
+#ifdef USE_INJECTION_POINTS
+		InjectionPointLoad("AIO_PROCESS_COMPLETION_BEFORE_SHARED");
+		InjectionPointLoad("AIO_WORKER_AFTER_REOPEN");
+		elog(LOG, "injection point loaded");
+#endif
+	}
+
+	LWLockRelease(AddinShmemInitLock);
+}
+
+void
+_PG_init(void)
+{
+	if (!process_shared_preload_libraries_in_progress)
+		return;
+
+	/* Shared memory initialization */
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = test_aio_shmem_request;
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = test_aio_shmem_startup;
+}
+
+
+PG_FUNCTION_INFO_V1(errno_from_string);
+Datum
+errno_from_string(PG_FUNCTION_ARGS)
+{
+	const char *sym = text_to_cstring(PG_GETARG_TEXT_PP(0));
+
+	if (strcmp(sym, "EIO") == 0)
+		PG_RETURN_INT32(EIO);
+	else if (strcmp(sym, "EAGAIN") == 0)
+		PG_RETURN_INT32(EAGAIN);
+	else if (strcmp(sym, "EINTR") == 0)
+		PG_RETURN_INT32(EINTR);
+	else if (strcmp(sym, "ENOSPC") == 0)
+		PG_RETURN_INT32(ENOSPC);
+	else if (strcmp(sym, "EROFS") == 0)
+		PG_RETURN_INT32(EROFS);
+
+	ereport(ERROR,
+			errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+			errmsg_internal("%s is not a supported errno value", sym));
+	PG_RETURN_INT32(0);
+}
+
+
+PG_FUNCTION_INFO_V1(grow_rel);
+Datum
+grow_rel(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	uint32		nblocks = PG_GETARG_UINT32(1);
+	Relation	rel;
+#define MAX_BUFFERS_TO_EXTEND_BY 64
+	Buffer		victim_buffers[MAX_BUFFERS_TO_EXTEND_BY];
+
+	rel = relation_open(relid, AccessExclusiveLock);
+
+	while (nblocks > 0)
+	{
+		uint32		extend_by_pages;
+
+		extend_by_pages = Min(nblocks, MAX_BUFFERS_TO_EXTEND_BY);
+
+		ExtendBufferedRelBy(BMR_REL(rel),
+							MAIN_FORKNUM,
+							NULL,
+							0,
+							extend_by_pages,
+							victim_buffers,
+							&extend_by_pages);
+
+		nblocks -= extend_by_pages;
+
+		for (uint32 i = 0; i < extend_by_pages; i++)
+		{
+			ReleaseBuffer(victim_buffers[i]);
+		}
+	}
+
+	relation_close(rel, NoLock);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(modify_rel_block);
+Datum
+modify_rel_block(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	BlockNumber blkno = PG_GETARG_UINT32(1);
+	bool		zero = PG_GETARG_BOOL(2);
+	bool		corrupt_header = PG_GETARG_BOOL(3);
+	bool		corrupt_checksum = PG_GETARG_BOOL(4);
+	Page		page = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, 0);
+	Relation	rel;
+	Buffer		buf;
+	PageHeader	ph;
+
+	rel = relation_open(relid, AccessExclusiveLock);
+
+	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+							 RBM_ZERO_ON_ERROR, NULL);
+
+	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+	/*
+	 * copy the page to local memory, seems nicer than to directly modify in
+	 * the buffer pool.
+	 */
+	memcpy(page, BufferGetPage(buf), BLCKSZ);
+
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+	ReleaseBuffer(buf);
+
+	/*
+	 * Don't want to have a buffer in-memory that's marked valid where the
+	 * on-disk contents are invalid. Particularly not if the in-memory buffer
+	 * could be dirty...
+	 *
+	 * While we hold an AEL on the relation nobody else should be able to read
+	 * the buffer in.
+	 *
+	 * NB: This is probably racy, better don't copy this to non-test code.
+	 */
+	if (BufferIsLocal(buf))
+		InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true);
+	else
+		EvictUnpinnedBuffer(buf);
+
+	/*
+	 * Now modify the page as asked for by the caller.
+	 */
+	if (zero)
+		memset(page, 0, BufferGetPageSize(buf));
+
+	if (PageIsEmpty(page) && (corrupt_header || corrupt_checksum))
+		PageInit(page, BufferGetPageSize(buf), 0);
+
+	ph = (PageHeader) page;
+
+	if (corrupt_header)
+		ph->pd_special = BLCKSZ + 1;
+
+	if (corrupt_checksum)
+	{
+		bool		successfully_corrupted = 0;
+
+		/*
+		 * Any single modification of the checksum could just end up being
+		 * valid again. To be sure
+		 */
+		for (int i = 0; i < 100; i++)
+		{
+			uint16		verify_checksum;
+			uint16		old_checksum;
+
+			old_checksum = ph->pd_checksum;
+			ph->pd_checksum = old_checksum + 3;
+
+			elog(LOG, "corrupting checksum of blk %u from %u to %u",
+				 blkno, old_checksum, ph->pd_checksum);
+
+			verify_checksum = pg_checksum_page(page, blkno);
+			if (verify_checksum != ph->pd_checksum)
+			{
+				successfully_corrupted = true;
+				break;
+			}
+		}
+
+		if (!successfully_corrupted)
+			elog(ERROR, "could not corrupt checksum, what's going on?");
+	}
+	else
+	{
+		PageSetChecksumInplace(page, blkno);
+	}
+
+	smgrwrite(RelationGetSmgr(rel),
+			  MAIN_FORKNUM, blkno, page, true);
+
+	relation_close(rel, NoLock);
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * Ensures a buffer for rel & blkno is in shared buffers, without actually
+ * caring about the buffer contents. Used to set up test scenarios.
+ */
+static Buffer
+create_toy_buffer(Relation rel, BlockNumber blkno)
+{
+	Buffer		buf;
+	BufferDesc *buf_hdr;
+	uint32		buf_state;
+	bool		was_pinned = false;
+
+	/* place buffer in shared buffers without erroring out */
+	buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_ZERO_AND_LOCK, NULL);
+	LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+
+	if (RelationUsesLocalBuffers(rel))
+	{
+		buf_hdr = GetLocalBufferDescriptor(-buf - 1);
+		buf_state = pg_atomic_read_u32(&buf_hdr->state);
+	}
+	else
+	{
+		buf_hdr = GetBufferDescriptor(buf - 1);
+		buf_state = LockBufHdr(buf_hdr);
+	}
+
+	/*
+	 * We should be the only backend accessing this buffer. This is just a
+	 * small bit of belt-and-suspenders defense, none of this code should ever
+	 * run in a cluster with real data.
+	 */
+	if (BUF_STATE_GET_REFCOUNT(buf_state) > 1)
+		was_pinned = true;
+	else
+		buf_state &= ~(BM_VALID | BM_DIRTY);
+
+	if (RelationUsesLocalBuffers(rel))
+		pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
+	else
+		UnlockBufHdr(buf_hdr, buf_state);
+
+	if (was_pinned)
+		elog(ERROR, "toy buffer %d was already pinned",
+			 buf);
+
+	return buf;
+}
+
+/*
+ * A "low level" read. This does similar things to what
+ * StartReadBuffers()/WaitReadBuffers() do, but provides more control (and
+ * less sanity).
+ */
+PG_FUNCTION_INFO_V1(read_rel_block_ll);
+Datum
+read_rel_block_ll(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	BlockNumber blkno = PG_GETARG_UINT32(1);
+	int			nblocks = PG_GETARG_INT32(2);
+	bool		wait_complete = PG_GETARG_BOOL(3);
+	bool		batchmode_enter = PG_GETARG_BOOL(4);
+	bool		call_smgrreleaseall = PG_GETARG_BOOL(5);
+	bool		batchmode_exit = PG_GETARG_BOOL(6);
+	bool		zero_on_error = PG_GETARG_BOOL(7);
+	Relation	rel;
+	Buffer		bufs[PG_IOV_MAX];
+	BufferDesc *buf_hdrs[PG_IOV_MAX];
+	Page		pages[PG_IOV_MAX];
+	uint8		srb_flags = 0;
+	PgAioReturn ior;
+	PgAioHandle *ioh;
+	PgAioWaitRef iow;
+	SMgrRelation smgr;
+
+	if (nblocks <= 0 || nblocks > PG_IOV_MAX)
+		elog(ERROR, "nblocks is out of range");
+
+	rel = relation_open(relid, AccessExclusiveLock);
+
+	for (int i = 0; i < nblocks; i++)
+	{
+		bufs[i] = create_toy_buffer(rel, blkno + i);
+		pages[i] = BufferGetBlock(bufs[i]);
+		buf_hdrs[i] = BufferIsLocal(bufs[i]) ?
+			GetLocalBufferDescriptor(-bufs[i] - 1) :
+			GetBufferDescriptor(bufs[i] - 1);
+	}
+
+	smgr = RelationGetSmgr(rel);
+
+	pgstat_prepare_report_checksum_failure(smgr->smgr_rlocator.locator.dbOid);
+
+	ioh = pgaio_io_acquire(CurrentResourceOwner, &ior);
+	pgaio_io_get_wref(ioh, &iow);
+
+	if (RelationUsesLocalBuffers(rel))
+	{
+		for (int i = 0; i < nblocks; i++)
+			StartLocalBufferIO(buf_hdrs[i], true, false);
+		pgaio_io_set_flag(ioh, PGAIO_HF_REFERENCES_LOCAL);
+	}
+	else
+	{
+		for (int i = 0; i < nblocks; i++)
+			StartBufferIO(buf_hdrs[i], true, false);
+	}
+
+	pgaio_io_set_handle_data_32(ioh, (uint32 *) bufs, nblocks);
+
+	if (zero_on_error | zero_damaged_pages)
+		srb_flags |= READ_BUFFERS_ZERO_ON_ERROR;
+	if (ignore_checksum_failure)
+		srb_flags |= READ_BUFFERS_IGNORE_CHECKSUM_FAILURES;
+
+	pgaio_io_register_callbacks(ioh,
+								RelationUsesLocalBuffers(rel) ?
+								PGAIO_HCB_LOCAL_BUFFER_READV :
+								PGAIO_HCB_SHARED_BUFFER_READV,
+								srb_flags);
+
+	if (batchmode_enter)
+		pgaio_enter_batchmode();
+
+	elog(LOG, "about to smgrstartreadv");
+	smgrstartreadv(ioh, smgr, MAIN_FORKNUM, blkno,
+				   (void *) pages, nblocks);
+
+	if (call_smgrreleaseall)
+		smgrreleaseall();
+
+	if (batchmode_exit)
+		pgaio_exit_batchmode();
+
+	for (int i = 0; i < nblocks; i++)
+		ReleaseBuffer(bufs[i]);
+
+	if (wait_complete)
+	{
+		pgaio_wref_wait(&iow);
+
+		if (ior.result.status != PGAIO_RS_OK)
+			pgaio_result_report(ior.result,
+								&ior.target_data,
+								ior.result.status == PGAIO_RS_ERROR ?
+								ERROR : WARNING);
+	}
+
+	relation_close(rel, NoLock);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(invalidate_rel_block);
+Datum
+invalidate_rel_block(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	BlockNumber blkno = PG_GETARG_UINT32(1);
+	Relation	rel;
+	PrefetchBufferResult pr;
+	Buffer		buf;
+
+	rel = relation_open(relid, AccessExclusiveLock);
+
+	/*
+	 * This is a gross hack, but there's no other API exposed that allows to
+	 * get a buffer ID without actually reading the block in.
+	 */
+	pr = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
+	buf = pr.recent_buffer;
+
+	if (BufferIsValid(buf))
+	{
+		/* if the buffer contents aren't valid, this'll return false */
+		if (ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno, buf))
+		{
+			BufferDesc *buf_hdr = BufferIsLocal(buf) ?
+				GetLocalBufferDescriptor(-buf - 1)
+				: GetBufferDescriptor(buf - 1);
+
+			LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+			if (pg_atomic_read_u32(&buf_hdr->state) & BM_DIRTY)
+			{
+				if (BufferIsLocal(buf))
+					FlushLocalBuffer(buf_hdr, NULL);
+				else
+					FlushOneBuffer(buf);
+			}
+			LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+			ReleaseBuffer(buf);
+
+			if (BufferIsLocal(buf))
+				InvalidateLocalBuffer(GetLocalBufferDescriptor(-buf - 1), true);
+			else if (!EvictUnpinnedBuffer(buf))
+				elog(ERROR, "couldn't evict");
+		}
+	}
+
+	relation_close(rel, AccessExclusiveLock);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(buffer_create_toy);
+Datum
+buffer_create_toy(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	BlockNumber blkno = PG_GETARG_UINT32(1);
+	Relation	rel;
+	Buffer		buf;
+
+	rel = relation_open(relid, AccessExclusiveLock);
+
+	buf = create_toy_buffer(rel, blkno);
+	ReleaseBuffer(buf);
+
+	relation_close(rel, NoLock);
+
+	PG_RETURN_INT32(buf);
+}
+
+PG_FUNCTION_INFO_V1(buffer_call_start_io);
+Datum
+buffer_call_start_io(PG_FUNCTION_ARGS)
+{
+	Buffer		buf = PG_GETARG_INT32(0);
+	bool		for_input = PG_GETARG_BOOL(1);
+	bool		nowait = PG_GETARG_BOOL(2);
+	bool		can_start;
+
+	if (BufferIsLocal(buf))
+		can_start = StartLocalBufferIO(GetLocalBufferDescriptor(-buf - 1),
+									   for_input, nowait);
+	else
+		can_start = StartBufferIO(GetBufferDescriptor(buf - 1),
+								  for_input, nowait);
+
+	/*
+	 * For tests we don't want the resowner release preventing us from
+	 * orchestrating odd scenarios.
+	 */
+	if (can_start && !BufferIsLocal(buf))
+		ResourceOwnerForgetBufferIO(CurrentResourceOwner,
+									buf);
+
+	ereport(LOG,
+			errmsg("buffer %d after StartBufferIO: %s",
+				   buf, DebugPrintBufferRefcount(buf)),
+			errhidestmt(true), errhidecontext(true));
+
+	PG_RETURN_BOOL(can_start);
+}
+
+PG_FUNCTION_INFO_V1(buffer_call_terminate_io);
+Datum
+buffer_call_terminate_io(PG_FUNCTION_ARGS)
+{
+	Buffer		buf = PG_GETARG_INT32(0);
+	bool		for_input = PG_GETARG_BOOL(1);
+	bool		succeed = PG_GETARG_BOOL(2);
+	bool		io_error = PG_GETARG_BOOL(3);
+	bool		release_aio = PG_GETARG_BOOL(4);
+	bool		clear_dirty = false;
+	uint32		set_flag_bits = 0;
+
+	if (io_error)
+		set_flag_bits |= BM_IO_ERROR;
+
+	if (for_input)
+	{
+		clear_dirty = false;
+
+		if (succeed)
+			set_flag_bits |= BM_VALID;
+	}
+	else
+	{
+		if (succeed)
+			clear_dirty = true;
+	}
+
+	ereport(LOG,
+			errmsg("buffer %d before Terminate[Local]BufferIO: %s",
+				   buf, DebugPrintBufferRefcount(buf)),
+			errhidestmt(true), errhidecontext(true));
+
+	if (BufferIsLocal(buf))
+		TerminateLocalBufferIO(GetLocalBufferDescriptor(-buf - 1),
+							   clear_dirty, set_flag_bits, release_aio);
+	else
+		TerminateBufferIO(GetBufferDescriptor(buf - 1),
+						  clear_dirty, set_flag_bits, false, release_aio);
+
+	ereport(LOG,
+			errmsg("buffer %d after Terminate[Local]BufferIO: %s",
+				   buf, DebugPrintBufferRefcount(buf)),
+			errhidestmt(true), errhidecontext(true));
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get);
+Datum
+handle_get(PG_FUNCTION_ARGS)
+{
+	last_handle = pgaio_io_acquire(CurrentResourceOwner, NULL);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_release_last);
+Datum
+handle_release_last(PG_FUNCTION_ARGS)
+{
+	if (!last_handle)
+		elog(ERROR, "no handle");
+
+	pgaio_io_release(last_handle);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get_and_error);
+Datum
+handle_get_and_error(PG_FUNCTION_ARGS)
+{
+	pgaio_io_acquire(CurrentResourceOwner, NULL);
+
+	elog(ERROR, "as you command");
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get_twice);
+Datum
+handle_get_twice(PG_FUNCTION_ARGS)
+{
+	pgaio_io_acquire(CurrentResourceOwner, NULL);
+	pgaio_io_acquire(CurrentResourceOwner, NULL);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(handle_get_release);
+Datum
+handle_get_release(PG_FUNCTION_ARGS)
+{
+	PgAioHandle *handle;
+
+	handle = pgaio_io_acquire(CurrentResourceOwner, NULL);
+	pgaio_io_release(handle);
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(batch_start);
+Datum
+batch_start(PG_FUNCTION_ARGS)
+{
+	pgaio_enter_batchmode();
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(batch_end);
+Datum
+batch_end(PG_FUNCTION_ARGS)
+{
+	pgaio_exit_batchmode();
+	PG_RETURN_VOID();
+}
+
+#ifdef USE_INJECTION_POINTS
+extern PGDLLEXPORT void inj_io_short_read(const char *name, const void *private_data);
+extern PGDLLEXPORT void inj_io_reopen(const char *name, const void *private_data);
+
+void
+inj_io_short_read(const char *name, const void *private_data)
+{
+	PgAioHandle *ioh;
+
+	elog(LOG, "short read called: %d", inj_io_error_state->enabled_short_read);
+
+	if (inj_io_error_state->enabled_short_read)
+	{
+		ioh = pgaio_inj_io_get();
+
+		/*
+		 * Only shorten reads that are actually longer than the target size,
+		 * otherwise we can trigger over-reads.
+		 */
+		if (inj_io_error_state->short_read_result_set
+			&& ioh->op == PGAIO_OP_READV
+			&& inj_io_error_state->short_read_result <= ioh->result)
+		{
+			struct iovec *iov = &pgaio_ctl->iovecs[ioh->iovec_off];
+			int32		old_result = ioh->result;
+			int32		new_result = inj_io_error_state->short_read_result;
+			int32		processed = 0;
+
+			ereport(LOG,
+					errmsg("short read, changing result from %d to %d",
+						   old_result, new_result),
+					errhidestmt(true), errhidecontext(true));
+
+			/*
+			 * The underlying IO actually completed OK, and thus the "invalid"
+			 * portion of the IOV actually contains valid data. That can hide
+			 * a lot of problems, e.g. if we were to wrongly mark a buffer,
+			 * that wasn't read according to the shortened-read, IO as valid,
+			 * the contents would look valid and we might miss a bug.
+			 *
+			 * To avoid that, iterate through the IOV and zero out the
+			 * "failed" portion of the IO.
+			 */
+			for (int i = 0; i < ioh->op_data.read.iov_length; i++)
+			{
+				if (processed + iov[i].iov_len <= new_result)
+					processed += iov[i].iov_len;
+				else if (processed <= new_result)
+				{
+					uint32		ok_part = new_result - processed;
+
+					memset((char *) iov[i].iov_base + ok_part, 0, iov[i].iov_len - ok_part);
+					processed += iov[i].iov_len;
+				}
+				else
+				{
+					memset((char *) iov[i].iov_base, 0, iov[i].iov_len);
+				}
+			}
+
+			ioh->result = new_result;
+		}
+	}
+}
+
+void
+inj_io_reopen(const char *name, const void *private_data)
+{
+	elog(LOG, "reopen called: %d", inj_io_error_state->enabled_reopen);
+
+	if (inj_io_error_state->enabled_reopen)
+	{
+		elog(ERROR, "injection point triggering failure to reopen ");
+	}
+}
+#endif
+
+PG_FUNCTION_INFO_V1(inj_io_short_read_attach);
+Datum
+inj_io_short_read_attach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	inj_io_error_state->enabled_short_read = true;
+	inj_io_error_state->short_read_result_set = !PG_ARGISNULL(0);
+	if (inj_io_error_state->short_read_result_set)
+		inj_io_error_state->short_read_result = PG_GETARG_INT32(0);
+#else
+	elog(ERROR, "injection points not supported");
+#endif
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_short_read_detach);
+Datum
+inj_io_short_read_detach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	inj_io_error_state->enabled_short_read = false;
+#else
+	elog(ERROR, "injection points not supported");
+#endif
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_reopen_attach);
+Datum
+inj_io_reopen_attach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	inj_io_error_state->enabled_reopen = true;
+#else
+	elog(ERROR, "injection points not supported");
+#endif
+
+	PG_RETURN_VOID();
+}
+
+PG_FUNCTION_INFO_V1(inj_io_reopen_detach);
+Datum
+inj_io_reopen_detach(PG_FUNCTION_ARGS)
+{
+#ifdef USE_INJECTION_POINTS
+	inj_io_error_state->enabled_reopen = false;
+#else
+	elog(ERROR, "injection points not supported");
+#endif
+	PG_RETURN_VOID();
+}
diff --git a/src/test/modules/test_aio/test_aio.control b/src/test/modules/test_aio/test_aio.control
new file mode 100644
index 00000000000..cd91c3ed16b
--- /dev/null
+++ b/src/test/modules/test_aio/test_aio.control
@@ -0,0 +1,3 @@
+comment = 'Test code for AIO'
+default_version = '1.0'
+module_pathname = '$libdir/test_aio'
-- 
2.48.1.76.g4e746b1a31.dirty

