From 650c48ffc42f554f86b8485a6c6b98c36c35fb81 Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 8 May 2025 21:03:52 -0400
Subject: [PATCH v1] WIP: aio: Fix possible state confusion due to interrupt
 processing

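elog()/ereport() process interrupts whenever the log level is below
ERROR, and interrupt processing can itself wait for IOs and reclaim IO
handles. It is therefore not safe to emit debug messages, or to do
anything else that might process interrupts, while an IO handle is in
an intermediate state: interrupt processing could end up waiting for
the IO, or could already have reclaimed the handle, leading to state
confusion.

Fix this by:

- holding interrupts while a handle's state is manipulated, i.e. in
  pgaio_io_acquire_nb(), pgaio_io_release_resowner(), pgaio_io_stage()
  and pgaio_io_reclaim(), and asserting in pgaio_io_update_state()
  that interrupts cannot be processed

- reading ioh->generation before anything that could process
  interrupts (e.g. the elog() in pgaio_debug_io()) and passing that
  snapshot to pgaio_io_wait(), in pgaio_io_wait_for_free(),
  pgaio_closing_fd() and pgaio_shutdown()

- re-checking for idle IOs after pgaio_submit_staged() in
  pgaio_io_wait_for_free(), as IOs can complete during submission

While at it, fix two messages that printed dclist_is_empty() where a
count was intended, and make the PANIC in pgaio_io_wait() report the
IO's id and state name instead of a bare state number.

To illustrate the holdoff pattern the fix relies on, here is a minimal
standalone sketch (illustrative only, not PostgreSQL code; the names
merely mirror the real macros in miscadmin.h). Interrupt processing is
deferred while the holdoff count is nonzero, so nothing can observe or
reclaim a handle in an intermediate state between HOLD_INTERRUPTS()
and RESUME_INTERRUPTS():

    /* Illustrative sketch only; real definitions live in miscadmin.h */
    #include <stdbool.h>
    #include <stdio.h>

    static volatile int holdoff_count = 0; /* cf. InterruptHoldoffCount */
    static volatile bool interrupt_pending = false;

    #define HOLD_INTERRUPTS()   (holdoff_count++)
    #define RESUME_INTERRUPTS() (holdoff_count--)
    #define INTERRUPTS_CAN_BE_PROCESSED() (holdoff_count == 0)

    /* stands in for the CHECK_FOR_INTERRUPTS() run by elog()/ereport() */
    static void
    maybe_process_interrupts(void)
    {
        if (interrupt_pending && INTERRUPTS_CAN_BE_PROCESSED())
        {
            interrupt_pending = false;
            printf("interrupt processed\n");
        }
    }

    int
    main(void)
    {
        interrupt_pending = true;

        HOLD_INTERRUPTS();
        /* handle state transitions happen here; a debug elog() in this
         * window cannot process the pending interrupt */
        maybe_process_interrupts(); /* no-op: holdoff count > 0 */
        RESUME_INTERRUPTS();

        maybe_process_interrupts(); /* runs now */
        return 0;
    }
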
Author:
Reviewed-by:
Discussion: https://postgr.es/m/
Backpatch:
---
 src/backend/storage/aio/aio.c | 113 +++++++++++++++++++++++++++++-----
 1 file changed, 99 insertions(+), 14 deletions(-)

diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c
index 57b9cf3dcab..10535066762 100644
--- a/src/backend/storage/aio/aio.c
+++ b/src/backend/storage/aio/aio.c
@@ -198,6 +198,8 @@ pgaio_io_acquire(struct ResourceOwnerData *resowner, PgAioReturn *ret)
 PgAioHandle *
 pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
 {
+	PgAioHandle *ioh = NULL;
+
 	if (pgaio_my_backend->num_staged_ios >= PGAIO_SUBMIT_BATCH_SIZE)
 	{
 		Assert(pgaio_my_backend->num_staged_ios == PGAIO_SUBMIT_BATCH_SIZE);
@@ -207,10 +209,17 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
 	if (pgaio_my_backend->handed_out_io)
 		elog(ERROR, "API violation: Only one IO can be handed out");
 
+	/*
+	 * Probably not needed today, as interrupts should not process this IO,
+	 * but hold them anyway to be future-proof.
+	 */
+	HOLD_INTERRUPTS();
+
 	if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
 	{
 		dlist_node *ion = dclist_pop_head_node(&pgaio_my_backend->idle_ios);
-		PgAioHandle *ioh = dclist_container(PgAioHandle, node, ion);
+
+		ioh = dclist_container(PgAioHandle, node, ion);
 
 		Assert(ioh->state == PGAIO_HS_IDLE);
 		Assert(ioh->owner_procno == MyProcNumber);
@@ -226,11 +235,11 @@ pgaio_io_acquire_nb(struct ResourceOwnerData *resowner, PgAioReturn *ret)
 			ioh->report_return = ret;
 			ret->result.status = PGAIO_RS_UNKNOWN;
 		}
-
-		return ioh;
 	}
 
-	return NULL;
+	RESUME_INTERRUPTS();
+
+	return ioh;
 }
 
 /*
@@ -247,6 +256,12 @@ pgaio_io_release(PgAioHandle *ioh)
 		Assert(ioh->resowner);
 
 		pgaio_my_backend->handed_out_io = NULL;
+
+		/*
+		 * Note that no interrupts are processed between the handed_out_io
+		 * check and the call to reclaim - that's important as otherwise an
+		 * interrupt could have already reclaimed the handle.
+		 */
 		pgaio_io_reclaim(ioh);
 	}
 	else
@@ -265,6 +280,12 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
 
 	Assert(ioh->resowner);
 
+	/*
+	 * Hold interrupts: otherwise an interrupt arriving while we release
+	 * the IO could try to wait for it, leading to state confusion.
+	 */
+	HOLD_INTERRUPTS();
+
 	ResourceOwnerForgetAioHandle(ioh->resowner, &ioh->resowner_node);
 	ioh->resowner = NULL;
 
@@ -305,6 +326,8 @@ pgaio_io_release_resowner(dlist_node *ioh_node, bool on_error)
 	 */
 	if (ioh->report_return)
 		ioh->report_return = NULL;
+
+	RESUME_INTERRUPTS();
 }
 
 /*
@@ -373,6 +396,13 @@ pgaio_io_get_wref(PgAioHandle *ioh, PgAioWaitRef *iow)
 static inline void
 pgaio_io_update_state(PgAioHandle *ioh, PgAioHandleState new_state)
 {
+	/*
+	 * All callers need to have interrupts held, as otherwise interrupt
+	 * processing could wait for the IO to complete while the handle is in
+	 * an intermediate state.
+	 */
+	Assert(!INTERRUPTS_CAN_BE_PROCESSED());
+
 	pgaio_debug_io(DEBUG5, ioh,
 				   "updating state to %s",
 				   pgaio_io_state_get_name(new_state));
@@ -410,6 +440,13 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
 	Assert(pgaio_my_backend->handed_out_io == ioh);
 	Assert(pgaio_io_has_target(ioh));
 
+	/*
+	 * Hold interrupts: otherwise an interrupt arriving in the middle of
+	 * staging, and possibly executing, the IO could try to wait for it,
+	 * leading to state confusion.
+	 */
+	HOLD_INTERRUPTS();
+
 	ioh->op = op;
 	ioh->result = 0;
 
@@ -449,6 +486,8 @@ pgaio_io_stage(PgAioHandle *ioh, PgAioOp op)
 		pgaio_io_prepare_submit(ioh);
 		pgaio_io_perform_synchronously(ioh);
 	}
+
+	RESUME_INTERRUPTS();
 }
 
 bool
@@ -558,8 +597,8 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
 			&& state != PGAIO_HS_COMPLETED_SHARED
 			&& state != PGAIO_HS_COMPLETED_LOCAL)
 		{
-			elog(PANIC, "waiting for own IO in wrong state: %d",
-				 state);
+			elog(PANIC, "waiting for own IO %d in wrong state: %s",
+				 pgaio_io_get_id(ioh), pgaio_io_get_state_name(ioh));
 		}
 	}
 
@@ -613,7 +652,13 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
 
 			case PGAIO_HS_COMPLETED_SHARED:
 			case PGAIO_HS_COMPLETED_LOCAL:
-				/* see above */
+
+				/*
+				 * Note that no interrupts are processed between
+				 * pgaio_io_was_recycled() and this check - that's important
+				 * as otherwise an interrupt could have already reclaimed the
+				 * handle.
+				 */
 				if (am_owner)
 					pgaio_io_reclaim(ioh);
 				return;
@@ -624,6 +669,11 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation)
 /*
  * Make IO handle ready to be reused after IO has completed or after the
  * handle has been released without being used.
+ *
+ * Note that callers need to be careful to only call this in the right state
+ * and to ensure that no interrupts can be processed between the state check
+ * and the call to pgaio_io_reclaim(). Otherwise interrupt processing could
+ * already have reclaimed the handle.
  */
 static void
 pgaio_io_reclaim(PgAioHandle *ioh)
@@ -632,6 +682,9 @@ pgaio_io_reclaim(PgAioHandle *ioh)
 	Assert(ioh->owner_procno == MyProcNumber);
 	Assert(ioh->state != PGAIO_HS_IDLE);
 
+	/* see comment in function header */
+	HOLD_INTERRUPTS();
+
 	/*
 	 * It's a bit ugly, but right now the easiest place to put the execution
 	 * of local completion callbacks is this function, as we need to execute
@@ -699,6 +752,8 @@ pgaio_io_reclaim(PgAioHandle *ioh)
 	 * efficient in cases where only a few IOs are used.
 	 */
 	dclist_push_head(&pgaio_my_backend->idle_ios, &ioh->node);
+
+	RESUME_INTERRUPTS();
 }
 
 /*
@@ -714,7 +769,7 @@ pgaio_io_wait_for_free(void)
 	pgaio_debug(DEBUG2, "waiting for free IO with %d pending, %d in-flight, %d idle IOs",
 				pgaio_my_backend->num_staged_ios,
 				dclist_count(&pgaio_my_backend->in_flight_ios),
-				dclist_is_empty(&pgaio_my_backend->idle_ios));
+				dclist_count(&pgaio_my_backend->idle_ios));
 
 	/*
 	 * First check if any of our IOs actually have completed - when using
@@ -728,6 +783,11 @@ pgaio_io_wait_for_free(void)
 
 		if (ioh->state == PGAIO_HS_COMPLETED_SHARED)
 		{
+			/*
+			 * Note that no interrupts are processed between the state check
+			 * and the call to reclaim - that's important as otherwise an
+			 * interrupt could have already reclaimed the handle.
+			 */
 			pgaio_io_reclaim(ioh);
 			reclaimed++;
 		}
@@ -744,13 +804,17 @@ pgaio_io_wait_for_free(void)
 	if (pgaio_my_backend->num_staged_ios > 0)
 		pgaio_submit_staged();
 
+	/* possibly some IOs finished during submission */
+	if (!dclist_is_empty(&pgaio_my_backend->idle_ios))
+		return;
+
 	if (dclist_count(&pgaio_my_backend->in_flight_ios) == 0)
 		ereport(ERROR,
 				errmsg_internal("no free IOs despite no in-flight IOs"),
 				errdetail_internal("%d pending, %d in-flight, %d idle IOs",
 								   pgaio_my_backend->num_staged_ios,
 								   dclist_count(&pgaio_my_backend->in_flight_ios),
-								   dclist_is_empty(&pgaio_my_backend->idle_ios)));
+								   dclist_count(&pgaio_my_backend->idle_ios)));
 
 	/*
 	 * Wait for the oldest in-flight IO to complete.
@@ -761,6 +825,7 @@ pgaio_io_wait_for_free(void)
 	{
 		PgAioHandle *ioh = dclist_head_element(PgAioHandle, node,
 											   &pgaio_my_backend->in_flight_ios);
+		uint64		generation = ioh->generation;
 
 		switch (ioh->state)
 		{
@@ -784,13 +849,24 @@ pgaio_io_wait_for_free(void)
 				 * In a more general case this would be racy, because the
 				 * generation could increase after we read ioh->state above.
 				 * But we are only looking at IOs by the current backend and
-				 * the IO can only be recycled by this backend.
+				 * the IO can only be recycled by this backend.  Even so, this
+				 * is only safe because we read the handle's generation before
+				 * anything that could process interrupts, e.g. the elog() in
+				 * pgaio_debug_io().
 				 */
-				pgaio_io_wait(ioh, ioh->generation);
+				pgaio_io_wait(ioh, generation);
 				break;
 
 			case PGAIO_HS_COMPLETED_SHARED:
-				/* it's possible that another backend just finished this IO */
+
+				/*
+				 * It's possible that another backend just finished this IO.
+				 *
+				 * Note that no interrupts are processed between the state
+				 * check and the call to reclaim - that's important as
+				 * otherwise an interrupt could have already reclaimed the
+				 * handle.
+				 */
 				pgaio_io_reclaim(ioh);
 				break;
 		}
@@ -940,6 +1016,11 @@ pgaio_wref_check_done(PgAioWaitRef *iow)
 	if (state == PGAIO_HS_COMPLETED_SHARED ||
 		state == PGAIO_HS_COMPLETED_LOCAL)
 	{
+		/*
+		 * Note that no interrupts are processed between
+		 * pgaio_io_was_recycled() and this check - that's important as
+		 * otherwise an interrupt could have already reclaimed the handle.
+		 */
 		if (am_owner)
 			pgaio_io_reclaim(ioh);
 		return true;
@@ -1167,11 +1248,14 @@ pgaio_closing_fd(int fd)
 		{
 			dlist_iter	iter;
 			PgAioHandle *ioh = NULL;
+			uint64		generation;
 
 			dclist_foreach(iter, &pgaio_my_backend->in_flight_ios)
 			{
 				ioh = dclist_container(PgAioHandle, node, iter.cur);
 
+				generation = ioh->generation;
+
 				if (pgaio_io_uses_fd(ioh, fd))
 					break;
 				else
@@ -1186,7 +1270,7 @@ pgaio_closing_fd(int fd)
 						   fd, dclist_count(&pgaio_my_backend->in_flight_ios));
 
 			/* see comment in pgaio_io_wait_for_free() about raciness */
-			pgaio_io_wait(ioh, ioh->generation);
+			pgaio_io_wait(ioh, generation);
 		}
 	}
 }
@@ -1215,13 +1299,14 @@ pgaio_shutdown(int code, Datum arg)
 	while (!dclist_is_empty(&pgaio_my_backend->in_flight_ios))
 	{
 		PgAioHandle *ioh = dclist_head_element(PgAioHandle, node, &pgaio_my_backend->in_flight_ios);
+		uint64		generation = ioh->generation;
 
 		pgaio_debug_io(DEBUG2, ioh,
 					   "waiting for IO to complete during shutdown, %d in-flight IOs",
 					   dclist_count(&pgaio_my_backend->in_flight_ios));
 
 		/* see comment in pgaio_io_wait_for_free() about raciness */
-		pgaio_io_wait(ioh, ioh->generation);
+		pgaio_io_wait(ioh, generation);
 	}
 
 	pgaio_my_backend = NULL;
-- 
2.48.1.76.g4e746b1a31.dirty

