From b7ace6f7eab4a2c181427ac4a7719da62faf78ee Mon Sep 17 00:00:00 2001 From: Baji Shaik Date: Sun, 17 May 2026 23:32:42 +0000 Subject: [PATCH] Fix REPACK decoding worker not cleaned up on FATAL exit When the launching backend of REPACK (CONCURRENTLY) is terminated via pg_terminate_backend(), ProcDiePending causes ereport(FATAL) which bypasses PG_FINALLY blocks. As a result, stop_repack_decoding_worker() is never called, leaving the decoding worker running indefinitely and holding its temporary replication slot. Fix by using PG_ENSURE_ERROR_CLEANUP, which handles both ERROR and FATAL exits. Unlike before_shmem_exit, this does not leak callback slots when the same backend runs multiple REPACK (CONCURRENTLY) commands, and unlike on_proc_exit, it runs before memory contexts are destroyed so the worker handle is still valid. --- src/backend/commands/repack.c | 42 +++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/src/backend/commands/repack.c b/src/backend/commands/repack.c index fae88d6bb83..17eb9eb029c 100644 --- a/src/backend/commands/repack.c +++ b/src/backend/commands/repack.c @@ -64,6 +64,7 @@ #include "pgstat.h" #include "replication/logicalrelation.h" #include "storage/bufmgr.h" +#include "storage/ipc.h" #include "storage/lmgr.h" #include "storage/predicate.h" #include "storage/proc.h" @@ -211,6 +212,7 @@ static Oid determine_clustered_index(Relation rel, bool usingindex, static void start_repack_decoding_worker(Oid relid); static void stop_repack_decoding_worker(void); +static void repack_decoding_worker_cleanup_cb(int code, Datum arg); static Snapshot get_initial_snapshot(DecodingWorker *worker); static void ProcessRepackMessage(StringInfo msg); @@ -660,24 +662,25 @@ cluster_rel(RepackCommand cmd, Relation OldHeap, Oid indexOid, TransferPredicateLocksToHeapRelation(OldHeap); /* rebuild_relation does all the dirty work */ - PG_TRY(); - { - rebuild_relation(OldHeap, index, verbose, ident_idx); - } - PG_FINALLY(); + if (concurrent) { - if (concurrent) + /* + * Use PG_ENSURE_ERROR_CLEANUP so that the decoding worker is stopped + * on both ERROR and FATAL exits. PG_FINALLY only handles ERROR; + * FATAL (e.g. from pg_terminate_backend) would bypass it, leaving + * the worker running and holding its replication slot indefinitely. + */ + PG_ENSURE_ERROR_CLEANUP(repack_decoding_worker_cleanup_cb, (Datum) 0); { - /* - * Since during normal operation the worker was already asked to - * exit, stopping it explicitly is especially important on ERROR. - * However it still seems a good practice to make sure that the - * worker never survives the REPACK command. - */ - stop_repack_decoding_worker(); + rebuild_relation(OldHeap, index, verbose, ident_idx); } + PG_END_ENSURE_ERROR_CLEANUP(repack_decoding_worker_cleanup_cb, (Datum) 0); + stop_repack_decoding_worker(); + } + else + { + rebuild_relation(OldHeap, index, verbose, ident_idx); } - PG_END_TRY(); /* rebuild_relation closes OldHeap, and index if valid */ @@ -3482,6 +3485,17 @@ start_repack_decoding_worker(Oid relid) ConditionVariableCancelSleep(); } +/* + * PG_ENSURE_ERROR_CLEANUP callback to stop the decoding worker. + * This ensures the worker is terminated on both ERROR and FATAL exits, + * unlike PG_FINALLY which only handles ERROR. + */ +static void +repack_decoding_worker_cleanup_cb(int code, Datum arg) +{ + stop_repack_decoding_worker(); +} + /* * Stop the decoding worker and cleanup the related resources. * -- 2.50.1