From 667061650eab7fa95fd07b8c892102cf7a8a47f4 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Sun, 22 Mar 2026 15:12:41 -0400 Subject: [PATCH v25 7/8] aio: Fix pgaio_io_wait() for staged IOs (B). Previously, pgaio_io_wait()'s cases for PGAIO_HS_DEFINED and PGAIO_HS_STAGED fell through to waiting for completion. The owner only promises to advance it to PGAIO_HS_SUBMITTED. The waiter needs to be prepared to call ->wait_one() itself once the IO is submitted in order to guarantee progress and avoid deadlocks on IO methods that provide ->wait_one(). Introduce a new per-backend condition variable submit_cv, woken by by pgaio_submit_stage(), and use it to wait for the state to advance. The new broadcast doesn't seem to cause any measurable slowdown, so ideas for optimizing the common no-waiters case were abandoned for now. It may not be possible to reach any real deadlock with existing AIO users, but that situation could change. There's also no reason the waiter shouldn't begin to wait via the IO method as soon as possible even without a deadlock. Picked up by testing a proposed IO method that has ->wait_one(), like io_method=io_uring, and code review. Backpatch-through: 18 Reviewed-by: Andres Freund Discussion: https://postgr.es/m/CA%2BhUKG%2BmZYrSdnhk-XrBYO18H829K77S9gMKUsykOiTJtqB43g%40mail.gmail.com --- src/include/storage/aio_internal.h | 7 +++ src/backend/storage/aio/aio.c | 50 ++++++++++++++++--- src/backend/storage/aio/aio_init.c | 1 + .../utils/activity/wait_event_names.txt | 1 + 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/src/include/storage/aio_internal.h b/src/include/storage/aio_internal.h index 9ca4087aa..c3afde4d5 100644 --- a/src/include/storage/aio_internal.h +++ b/src/include/storage/aio_internal.h @@ -216,6 +216,13 @@ typedef struct PgAioBackend uint16 num_staged_ios; PgAioHandle *staged_ios[PGAIO_SUBMIT_BATCH_SIZE]; + /* + * Other backends sometimes need to wait for the owning backend to submit. + * The per-IO CV would work for that purpose, but a per-backend CV allows + * for just one broadcast per submitted batch. + */ + ConditionVariable submit_cv; + /* * List of in-flight IOs. Also contains IOs that aren't strictly speaking * in-flight anymore, but have been waited-for and completed by another diff --git a/src/backend/storage/aio/aio.c b/src/backend/storage/aio/aio.c index 7009fb7f6..ce2f73bff 100644 --- a/src/backend/storage/aio/aio.c +++ b/src/backend/storage/aio/aio.c @@ -593,6 +593,16 @@ pgaio_io_was_recycled(PgAioHandle *ioh, uint64 ref_generation, PgAioHandleState return ioh->generation != ref_generation; } +/* + * Whether we need to wait via the IO method. Don't check via the IO method if + * the issuing backend is executing the IO synchronously. + */ +static bool +pgaio_io_needs_wait_one(PgAioHandle *ioh) +{ + return pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS); +} + /* * Wait for IO to complete. External code should never use this, outside of * the AIO subsystem waits are only allowed via pgaio_wref_wait(). @@ -632,23 +642,38 @@ pgaio_io_wait(PgAioHandle *ioh, uint64 ref_generation) elog(ERROR, "IO in wrong state: %d", state); break; - case PGAIO_HS_SUBMITTED: + case PGAIO_HS_DEFINED: + case PGAIO_HS_STAGED: /* - * If we need to wait via the IO method, do so now. Don't - * check via the IO method if the issuing backend is executing - * the IO synchronously. + * The owner hasn't submitted the IO yet. If we need to wait + * via the IO method, wait for submission, giving this backend + * the chance to call ->wait_one(). */ - if (pgaio_method_ops->wait_one && !(ioh->flags & PGAIO_HF_SYNCHRONOUS)) + if (pgaio_io_needs_wait_one(ioh)) + { + PgAioBackend *backend = &pgaio_ctl->backend_state[ioh->owner_procno]; + + ConditionVariablePrepareToSleep(&backend->submit_cv); + while (!pgaio_io_was_recycled(ioh, ref_generation, &state) && + (state == PGAIO_HS_DEFINED || + state == PGAIO_HS_STAGED)) + ConditionVariableSleep(&backend->submit_cv, WAIT_EVENT_AIO_IO_SUBMIT); + ConditionVariableCancelSleep(); + continue; + } + pg_fallthrough; + + case PGAIO_HS_SUBMITTED: + + /* If we need to wait via the IO method, do so now. */ + if (pgaio_io_needs_wait_one(ioh)) { pgaio_method_ops->wait_one(ioh, ref_generation); continue; } pg_fallthrough; - /* waiting for owner to submit */ - case PGAIO_HS_DEFINED: - case PGAIO_HS_STAGED: /* waiting for reaper to complete */ /* fallthrough */ case PGAIO_HS_COMPLETED_IO: @@ -1226,6 +1251,15 @@ pgaio_submit_staged(void) pgaio_my_backend->num_staged_ios = 0; + /* + * Wake any backend that started waiting for any of these IOs before + * submission, if it is necessary to call ->wait_one() to guarantee + * progress with the configured IO method. On its side, pgaio_io_wait() + * only waits for submit_cv on IO methods needing that. + */ + if (pgaio_method_ops->wait_one) + ConditionVariableBroadcast(&pgaio_my_backend->submit_cv); + pgaio_debug(DEBUG4, "aio: submitted %d IOs", total_submitted); diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c index da30d792a..6fc00a917 100644 --- a/src/backend/storage/aio/aio_init.c +++ b/src/backend/storage/aio/aio_init.c @@ -199,6 +199,7 @@ AioShmemInit(void *arg) dclist_init(&bs->idle_ios); memset(bs->staged_ios, 0, sizeof(PgAioHandle *) * PGAIO_SUBMIT_BATCH_SIZE); + ConditionVariableInit(&bs->submit_cv); dclist_init(&bs->in_flight_ios); /* initialize per-backend IOs */ diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt index 560659f95..7c3326348 100644 --- a/src/backend/utils/activity/wait_event_names.txt +++ b/src/backend/utils/activity/wait_event_names.txt @@ -200,6 +200,7 @@ ABI_compatibility: Section: ClassName - WaitEventIO AIO_IO_COMPLETION "Waiting for another process to complete IO." +AIO_IO_SUBMIT "Waiting for another process to submit IO." AIO_IO_URING_SUBMIT "Waiting for IO submission via io_uring." AIO_IO_URING_EXECUTION "Waiting for IO execution via io_uring." BASEBACKUP_READ "Waiting for base backup to read from a file." -- 2.53.0