From 998b743efad532054ac9cae5df7b9ffc0dde8781 Mon Sep 17 00:00:00 2001 From: Nisha Moond Date: Wed, 25 Mar 2026 18:04:12 +0530 Subject: [PATCH v9] Fix slotsync worker blocking promotion when stuck in wait Previously, on standby promotion, the startup process sent SIGUSR1 to the slotsync worker (or a backend performing slot synchronization) and waited for it to exit. This worked in most cases, but if the process was blocked waiting for a response from the primary (e.g., due to a network failure), SIGUSR1 would not interrupt the wait. As a result, the process could remain stuck, causing the startup process to wait for a long time and delaying promotion. This commit fixes the issue by introducing a new procsignal reason, PROCSIG_SLOTSYNC_MESSAGE. On promotion, the startup process sends this signal, and the handler sets interrupt flags so the process exits (or errors out) promptly at CHECK_FOR_INTERRUPTS(), allowing promotion to complete without delay. Backpatch to v17, where slotsync was introduced. Author: Nisha Moond Reviewed-by: shveta malik Reviewed-by: Amit Kapila Reviewed-by: Zhijie Hou Reviewed-by: Fujii Masao Discussion: https://postgr.es/m/CAHGQGwFzNYroAxSoyJhqTU-pH=t4Ej6RyvhVmBZ91Exj_TPMMQ@mail.gmail.com Backpatch-through: 17 --- src/backend/replication/logical/slotsync.c | 138 ++++++++++++++------- src/backend/storage/ipc/procsignal.c | 4 + src/backend/tcop/postgres.c | 4 + src/include/replication/slotsync.h | 7 ++ src/include/storage/procsignal.h | 1 + 5 files changed, 111 insertions(+), 43 deletions(-) diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c index 9ce6160dcfa..0b48f55d057 100644 --- a/src/backend/replication/logical/slotsync.c +++ b/src/backend/replication/logical/slotsync.c @@ -75,18 +75,19 @@ * Struct for sharing information to control slot synchronization. * * The 'pid' is either the slot sync worker's pid or the backend's pid running - * the SQL function pg_sync_replication_slots(). When the startup process sets - * 'stopSignaled' during promotion, it uses this 'pid' to wake up the currently - * synchronizing process so that the process can immediately stop its - * synchronizing work on seeing 'stopSignaled' set. - * Setting 'stopSignaled' is also used to handle the race condition when the - * postmaster has not noticed the promotion yet and thus may end up restarting - * the slot sync worker. If 'stopSignaled' is set, the worker will exit in such a - * case. The SQL function pg_sync_replication_slots() will also error out if - * this flag is set. Note that we don't need to reset this variable as after - * promotion the slot sync worker won't be restarted because the pmState - * changes to PM_RUN from PM_HOT_STANDBY and we don't support demoting - * primary without restarting the server. See MaybeStartSlotSyncWorker. + * the SQL function pg_sync_replication_slots(). On promotion, the startup + * process sets 'stopSignaled' and uses this 'pid' to signal the synchronizing + * process with PROCSIG_SLOTSYNC_MESSAGE and also to wake it up so that the + * process can immediately stop its synchronizing work. + * Setting 'stopSignaled' on the other hand is used to handle the race + * condition when the postmaster has not noticed the promotion yet and thus may + * end up restarting the slot sync worker. If 'stopSignaled' is set, the worker + * will exit in such a case. The SQL function pg_sync_replication_slots() will + * also error out if this flag is set. Note that we don't need to reset this + * variable as after promotion the slot sync worker won't be restarted because + * the pmState changes to PM_RUN from PM_HOT_STANDBY and we don't support + * demoting primary without restarting the server. + * See MaybeStartSlotSyncWorker. * * The 'syncing' flag is needed to prevent concurrent slot syncs to avoid slot * overwrites. @@ -131,6 +132,13 @@ static long sleep_ms = MIN_SLOTSYNC_WORKER_NAPTIME_MS; */ static bool syncing_slots = false; +/* + * Interrupt flag set when PROCSIG_SLOTSYNC_MESSAGE is received, asking the + * slotsync worker or pg_sync_replication_slots() to stop because + * standby promotion has been triggered. + */ +volatile sig_atomic_t SlotSyncShutdownPending = false; + /* * Structure to hold information fetched from the primary server about a logical * replication slot. @@ -1186,36 +1194,52 @@ slotsync_reread_config(void) } /* - * Interrupt handler for process performing slot synchronization. + * Handle receipt of an interrupt indicating a slotsync shutdown message. + * + * This is called within the SIGUSR1 handler. All we do here is set a flag + * that will cause the next CHECK_FOR_INTERRUPTS() to invoke + * ProcessSlotSyncMessage(). */ -static void -ProcessSlotSyncInterrupts(WalReceiverConn *wrconn) +void +HandleSlotSyncMessageInterrupt(void) { - CHECK_FOR_INTERRUPTS(); + InterruptPending = true; + SlotSyncShutdownPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} - if (SlotSyncCtx->stopSignaled) - { - if (AmLogicalSlotSyncWorkerProcess()) - { - ereport(LOG, - errmsg("replication slot synchronization worker will stop because promotion is triggered")); +/* + * Handle a PROCSIG_SLOTSYNC_MESSAGE signal, called from ProcessInterrupts(). + * + * If the current process is the slotsync background worker, log a message + * and exit cleanly. If it is a backend executing pg_sync_replication_slots(), + * raise an error, unless the sync has already finished, in which case there + * is no need to interrupt the caller. + */ +void +ProcessSlotSyncMessage(void) +{ + SlotSyncShutdownPending = false; - proc_exit(0); - } - else - { - /* - * For the backend executing SQL function - * pg_sync_replication_slots(). - */ - ereport(ERROR, - errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), - errmsg("replication slot synchronization will stop because promotion is triggered")); - } + if (AmLogicalSlotSyncWorkerProcess()) + { + ereport(LOG, + errmsg("replication slot synchronization worker will stop because promotion is triggered")); + proc_exit(0); } + else + { + /* + * If sync has already completed, there is no need to interrupt the + * caller with an error. + */ + if (!IsSyncingReplicationSlots()) + return; - if (ConfigReloadPending) - slotsync_reread_config(); + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot synchronization will stop because promotion is triggered")); + } } /* @@ -1322,6 +1346,34 @@ check_and_set_sync_info(pid_t sync_process_pid) { SpinLockAcquire(&SlotSyncCtx->mutex); + /* + * Exit immediately if promotion has been triggered. This guards against + * a new worker (or a call to pg_sync_replication_slots()) that starts + * after the old worker was stopped by ShutDownSlotSync(). + */ + if (SlotSyncCtx->stopSignaled) + { + SpinLockRelease(&SlotSyncCtx->mutex); + + if (AmLogicalSlotSyncWorkerProcess()) + { + ereport(DEBUG1, + errmsg("replication slot synchronization worker will not start because promotion was triggered")); + + proc_exit(0); + } + else + { + /* + * For the backend executing SQL function + * pg_sync_replication_slots(). + */ + ereport(ERROR, + errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("replication slot synchronization will not start because promotion was triggered")); + } + } + if (SlotSyncCtx->syncing) { SpinLockRelease(&SlotSyncCtx->mutex); @@ -1525,7 +1577,10 @@ ReplSlotSyncWorkerMain(char *startup_data, size_t startup_data_len) { bool some_slot_updated = false; - ProcessSlotSyncInterrupts(wrconn); + CHECK_FOR_INTERRUPTS(); + + if (ConfigReloadPending) + slotsync_reread_config(); some_slot_updated = synchronize_slots(wrconn); @@ -1625,11 +1680,11 @@ ShutDownSlotSync(void) SpinLockRelease(&SlotSyncCtx->mutex); /* - * Signal process doing slotsync, if any. The process will stop upon - * detecting that the stopSignaled flag is set to true. + * Signal process doing slotsync, if any, asking it to stop. */ if (sync_process_pid != InvalidPid) - kill(sync_process_pid, SIGUSR1); + SendProcSignal(sync_process_pid, PROCSIG_SLOTSYNC_MESSAGE, + INVALID_PROC_NUMBER); /* Wait for slot sync to end */ for (;;) @@ -1774,9 +1829,6 @@ SyncReplicationSlots(WalReceiverConn *wrconn) { check_and_set_sync_info(MyProcPid); - /* Check for interrupts and config changes */ - ProcessSlotSyncInterrupts(); - validate_remote_info(wrconn); synchronize_slots(wrconn); diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c index 4ed9cedcdd4..d6857f5a8bb 100644 --- a/src/backend/storage/ipc/procsignal.c +++ b/src/backend/storage/ipc/procsignal.c @@ -23,6 +23,7 @@ #include "pgstat.h" #include "port/pg_bitutils.h" #include "replication/logicalworker.h" +#include "replication/slotsync.h" #include "replication/walsender.h" #include "storage/condition_variable.h" #include "storage/ipc.h" @@ -655,6 +656,9 @@ procsignal_sigusr1_handler(SIGNAL_ARGS) if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE)) HandleParallelApplyMessageInterrupt(); + if (CheckProcSignal(PROCSIG_SLOTSYNC_MESSAGE)) + HandleSlotSyncMessageInterrupt(); + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE)) HandleRecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE); diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 9cd1d0abe35..00d4073c089 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -58,6 +58,7 @@ #include "postmaster/postmaster.h" #include "replication/logicallauncher.h" #include "replication/logicalworker.h" +#include "replication/slotsync.h" #include "replication/slot.h" #include "replication/walsender.h" #include "rewrite/rewriteHandler.h" @@ -3497,6 +3498,9 @@ ProcessInterrupts(void) if (ParallelApplyMessagePending) HandleParallelApplyMessages(); + + if (SlotSyncShutdownPending) + ProcessSlotSyncMessage(); } /* diff --git a/src/include/replication/slotsync.h b/src/include/replication/slotsync.h index e03c2a005a4..c7a93791104 100644 --- a/src/include/replication/slotsync.h +++ b/src/include/replication/slotsync.h @@ -12,10 +12,15 @@ #ifndef SLOTSYNC_H #define SLOTSYNC_H +#include + #include "replication/walreceiver.h" extern PGDLLIMPORT bool sync_replication_slots; +/* Interrupt flag set by HandleSlotSyncMessageInterrupt() */ +extern PGDLLIMPORT volatile sig_atomic_t SlotSyncShutdownPending; + /* * GUCs needed by slot sync worker to connect to the primary * server and carry on with slots synchronization. @@ -34,5 +39,7 @@ extern bool IsSyncingReplicationSlots(void); extern Size SlotSyncShmemSize(void); extern void SlotSyncShmemInit(void); extern void SyncReplicationSlots(WalReceiverConn *wrconn); +extern void HandleSlotSyncMessageInterrupt(void); +extern void ProcessSlotSyncMessage(void); #endif /* SLOTSYNC_H */ diff --git a/src/include/storage/procsignal.h b/src/include/storage/procsignal.h index 7d290ea7d05..126c44bcf1d 100644 --- a/src/include/storage/procsignal.h +++ b/src/include/storage/procsignal.h @@ -36,6 +36,7 @@ typedef enum PROCSIG_BARRIER, /* global barrier interrupt */ PROCSIG_LOG_MEMORY_CONTEXT, /* ask backend to log the memory contexts */ PROCSIG_PARALLEL_APPLY_MESSAGE, /* Message from parallel apply workers */ + PROCSIG_SLOTSYNC_MESSAGE, /* ask slot synchronization to stop */ /* Recovery conflict reasons */ PROCSIG_RECOVERY_CONFLICT_FIRST, -- 2.51.2