From b961438264bc6d279655b8852110c4751060964a Mon Sep 17 00:00:00 2001 From: Alexander Korotkov Date: Wed, 6 May 2026 13:10:07 +0300 Subject: [PATCH v1] Skip pg_control re-read in EXEC_BACKEND sub-processes In EXEC_BACKEND mode every newly exec'd sub-process re-reads pg_control from disk in SubPostmasterMain() via LocalProcessControlFile(). That read is not interlocked against update_controlfile() in the postmaster's checkpointer, which writes the file non-atomically. On filesystems that do not enforce POSIX read/write atomicity for concurrent access to the same region (notably NTFS and ext4) the sub-process can observe a torn image and abort with "incorrect checksum in control file". The sub-process did not actually need to re-read the file: pg_control contents that matter before shared memory is attached are either initdb-time invariants (already validated by the postmaster) or values that the postmaster has long since installed in shared memory. The only piece of state pulled out of LocalControlFile in the child that is not already propagated through the regular non-default-GUC mechanism is data_checksum_version, because the data_checksums GUC is updated by direct variable assignment (see SetLocalDataChecksumState) rather than through SetConfigOption, so its source stays at PGC_S_DEFAULT and it never lands on guc_nondef_list. Fix by removing the LocalProcessControlFile(false) call from SubPostmasterMain() and instead: - Pass data_checksum_version through BackendParameters; the child applies it via SetLocalDataChecksumState() in restore_backend_variables(), as part of its normal forkable-state restore. - wal_segment_size already propagates correctly: the postmaster sets it via SetConfigOption(..., PGC_S_DYNAMIC_DEFAULT) in ReadControlFile(), which puts the GUC on guc_nondef_list, so write_nondefault_variables/read_nondefault_variables carries it to the child. Settings derived from wal_segment_size that are not GUCs (UsableBytesInSegment, CheckPointSegments) are recomputed by a new helper, ApplyWalSegSizeSettings(), that is also factored out of ReadControlFile() in this commit. After the change the EXEC_BACKEND child never opens pg_control, so the torn-read window is closed. The global ControlFile pointer remains NULL in the child until the standard shmem attach hooks rebind it to the shared-memory copy. This brings EXEC_BACKEND startup behaviorally in line with fork(): both observe shared-memory pg_control as soon as they're permitted to, and neither carries a private snapshot. Discussion: https://postgr.es/m/f59335a4-83ff-438a-a30e-7cf2200276b6%40postgrespro.ru Discussion: https://postgr.es/m/70BF24D6-DC51-443F-B55A-95735803842A%40amazon.com Author: Alexander Korotkov --- src/backend/access/transam/xlog.c | 22 +++++++++-- src/backend/postmaster/launch_backend.c | 50 +++++++++++++++++++++++-- src/include/access/xlog.h | 1 + 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f0434da40c9..9f26b72e0d9 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4594,6 +4594,24 @@ ReadControlFile(void) wal_segment_size = ControlFile->xlog_seg_size; + snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size); + SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL, + PGC_S_DYNAMIC_DEFAULT); + + ApplyWalSegSizeSettings(); +} + +/* + * Validate wal_segment_size and (re)compute derived settings that aren't + * propagated through GUC variables. + * + * This is called by ReadControlFile() in the postmaster after reading + * pg_control, and by EXEC_BACKEND child startup after wal_segment_size + * has been restored via the GUC propagation mechanism. + */ +void +ApplyWalSegSizeSettings(void) +{ if (!IsValidWalSegSize(wal_segment_size)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg_plural("invalid WAL segment size in control file (%d byte)", @@ -4602,10 +4620,6 @@ ReadControlFile(void) wal_segment_size), errdetail("The WAL segment size must be a power of two between 1 MB and 1 GB."))); - snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size); - SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL, - PGC_S_DYNAMIC_DEFAULT); - /* check and update variables dependent on wal_segment_size */ if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), diff --git a/src/backend/postmaster/launch_backend.c b/src/backend/postmaster/launch_backend.c index 8f3cfea880c..5a04160c214 100644 --- a/src/backend/postmaster/launch_backend.c +++ b/src/backend/postmaster/launch_backend.c @@ -33,6 +33,7 @@ #include +#include "access/xlog.h" #include "libpq/libpq-be.h" #include "miscadmin.h" #include "postmaster/autovacuum.h" @@ -132,6 +133,14 @@ typedef struct int32 timing_tsc_frequency_khz; + /* + * data_checksum_version from pg_control. Propagated explicitly because + * the data_checksums GUC is bound directly to its variable (no + * SetConfigOption call), so it doesn't ride along the regular + * non-default-GUCs propagation path used for, e.g., wal_segment_size. + */ + uint32 data_checksum_version; + /* * These are only used by backend processes, but are here because passing * a socket needs some special handling on Windows. 'client_sock' is an @@ -662,10 +671,28 @@ SubPostmasterMain(int argc, char *argv[]) checkDataDir(); /* - * (re-)read control file, as it contains config. The postmaster will - * already have read this, but this process doesn't know about that. + * The postmaster already validated pg_control at startup. We don't + * re-read it here: doing so would race against concurrent writes by + * the checkpointer and could see a torn image (NTFS and ext4 can + * return mashed-together data for a concurrent read/write of the same + * region). Instead, we rely on inherited state: + * + * - wal_segment_size and other initdb-time GUCs are restored above by + * read_nondefault_variables() since SetConfigOption() in the + * postmaster placed them on guc_nondef_list. + * + * - data_checksum_version was applied in restore_backend_variables() + * from the BackendParameters snapshot. + * + * - The shared-memory ControlFile is re-attached below by + * InitShmemAllocator() / ShmemCallRequestCallbacks(), at which + * point the global ControlFile pointer is again valid for any + * code that needs the live (volatile) fields. + * + * Recompute settings derived from wal_segment_size that aren't + * propagated through GUCs (UsableBytesInSegment, CheckPointSegments). */ - LocalProcessControlFile(false); + ApplyWalSegSizeSettings(); RegisterBuiltinShmemCallbacks(); @@ -755,6 +782,15 @@ save_backend_variables(BackendParameters *param, param->timing_tsc_frequency_khz = timing_tsc_frequency_khz; + /* + * Propagate the data checksum version. In the postmaster the GUC-bound + * 'data_checksums' variable was set from pg_control by + * SetLocalDataChecksumState(); 'data_checksums' is not propagated via + * the regular non-default-GUC mechanism (its value is assigned directly, + * so its source stays at PGC_S_DEFAULT), so we ship it explicitly here. + */ + param->data_checksum_version = data_checksums; + #ifdef WIN32 param->PostmasterHandle = PostmasterHandle; if (!write_duplicated_handle(¶m->initial_signal_pipe, @@ -1011,6 +1047,14 @@ restore_backend_variables(BackendParameters *param) timing_tsc_frequency_khz = param->timing_tsc_frequency_khz; + /* + * Restore data checksum state from the inherited snapshot of + * pg_control. Doing this here, before any code can need it, avoids + * having to re-read pg_control from disk in this child (which used to + * race against concurrent writes by the postmaster's checkpointer). + */ + SetLocalDataChecksumState(param->data_checksum_version); + /* Re-run logic usually done by assign_timing_clock_source */ pg_initialize_timing(); pg_set_timing_clock_source(timing_clock_source); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 4dd98624204..b9867702bf9 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -264,6 +264,7 @@ extern XLogRecPtr GetFakeLSNForUnloggedRel(void); extern void BootStrapXLOG(uint32 data_checksum_version); extern void InitializeWalConsistencyChecking(void); extern void LocalProcessControlFile(bool reset); +extern void ApplyWalSegSizeSettings(void); extern WalLevel GetActiveWalLevelOnStandby(void); extern void StartupXLOG(void); extern void ShutdownXLOG(int code, Datum arg); -- 2.39.5 (Apple Git-154)