| From: | Mingwei Jia <i(at)nayishan(dot)top> |
|---|---|
| To: | pgsql-hackers(at)lists(dot)postgresql(dot)org |
| Subject: | [RFC PATCH v2 RESEND 09/10] umbra: add patch 8 checkpoint/mapwriter writeback and physical preallocation |
| Date: | 2026-06-01 23:33:39 |
| Message-ID: | 20260601233340.67949-8-i@nayishan.top |
| Views: | Whole Thread | Raw Message | Download mbox | Resend email |
| Thread: | |
| Lists: | pgsql-hackers |
---
src/backend/common.mk | 2 +-
src/backend/postmaster/Makefile | 5 +
src/backend/postmaster/bgworker.c | 10 +
src/backend/postmaster/mapwriter.c | 184 ++++++++++
src/backend/postmaster/meson.build | 6 +
src/backend/postmaster/postmaster.c | 7 +
src/backend/storage/map/Makefile | 1 +
src/backend/storage/map/map.c | 54 +++
src/backend/storage/map/mapbgproc.c | 323 ++++++++++++++++++
src/backend/storage/map/mapclock.c | 5 +
src/backend/storage/map/mapflush.c | 6 +-
src/backend/storage/map/mapinit.c | 10 +
src/backend/storage/map/mapsuper.c | 17 +-
src/backend/storage/map/meson.build | 1 +
src/backend/storage/smgr/umbra.c | 9 +-
src/backend/storage/smgr/umfile.c | 242 +++++++++++++
.../utils/activity/wait_event_names.txt | 2 +
src/backend/utils/init/postinit.c | 16 +-
src/backend/utils/misc/guc_parameters.dat | 127 +++++++
src/backend/utils/misc/guc_tables.c | 2 +
src/backend/utils/misc/postgresql.conf.sample | 2 +
src/include/postmaster/mapwriter.h | 24 ++
src/include/storage/map.h | 14 +
src/include/storage/map_internal.h | 7 +
src/include/storage/mapsuper_internal.h | 4 +
src/include/storage/umfile.h | 3 +
src/test/recovery/meson.build | 2 +
.../t/055_umbra_mapwriter_activity.pl | 56 +++
.../recovery/t/073_umbra_preallocate_guc.pl | 74 ++++
29 files changed, 1204 insertions(+), 11 deletions(-)
create mode 100644 src/backend/postmaster/mapwriter.c
create mode 100644 src/backend/storage/map/mapbgproc.c
create mode 100644 src/include/postmaster/mapwriter.h
create mode 100644 src/test/recovery/t/055_umbra_mapwriter_activity.pl
create mode 100644 src/test/recovery/t/073_umbra_preallocate_guc.pl
diff --git a/src/backend/common.mk b/src/backend/common.mk
index 61861f5c7e..aacdf0c702 100644
--- a/src/backend/common.mk
+++ b/src/backend/common.mk
@@ -17,7 +17,7 @@ ifneq ($(subdir), src/backend)
all: $(subsysfilename)
endif
-objfiles.txt: Makefile $(SUBDIROBJS) $(OBJS)
+objfiles.txt: Makefile $(top_builddir)/src/Makefile.global $(SUBDIROBJS) $(OBJS)
# Don't rebuild the list if only the OBJS have changed.
$(if $(filter-out $(OBJS),$?),( $(if $(SUBDIROBJS),cat $(SUBDIROBJS); )echo $(addprefix $(subdir)/,$(OBJS)) ) >$@,touch $@)
diff --git a/src/backend/postmaster/Makefile b/src/backend/postmaster/Makefile
index 55044b2bc6..05cb330024 100644
--- a/src/backend/postmaster/Makefile
+++ b/src/backend/postmaster/Makefile
@@ -30,4 +30,9 @@ OBJS = \
walsummarizer.o \
walwriter.o
+ifeq ($(with_umbra), yes)
+OBJS += \
+ mapwriter.o
+endif
+
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/postmaster/bgworker.c b/src/backend/postmaster/bgworker.c
index 3914d22a51..45f0abf94a 100644
--- a/src/backend/postmaster/bgworker.c
+++ b/src/backend/postmaster/bgworker.c
@@ -20,6 +20,9 @@
#include "port/atomics.h"
#include "postmaster/bgworker_internals.h"
#include "postmaster/datachecksum_state.h"
+#ifdef USE_UMBRA
+#include "postmaster/mapwriter.h"
+#endif
#include "postmaster/postmaster.h"
#include "replication/logicallauncher.h"
#include "replication/logicalworker.h"
@@ -167,6 +170,13 @@ static const struct
.fn_name = "DataChecksumsWorkerMain",
.fn_addr = DataChecksumsWorkerMain
}
+#ifdef USE_UMBRA
+ ,
+ {
+ .fn_name = "MapWriterMain",
+ .fn_addr = MapWriterMain
+ }
+#endif
};
/* Private functions. */
diff --git a/src/backend/postmaster/mapwriter.c b/src/backend/postmaster/mapwriter.c
new file mode 100644
index 0000000000..e659b6be94
--- /dev/null
+++ b/src/backend/postmaster/mapwriter.c
@@ -0,0 +1,184 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapwriter.c
+ * Umbra map writer background worker.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/backend/postmaster/mapwriter.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/interrupt.h"
+#include "postmaster/mapwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/condition_variable.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/map.h"
+#include "storage/procnumber.h"
+#include "storage/procsignal.h"
+#include "storage/smgr.h"
+#include "utils/memutils.h"
+#include "utils/wait_event.h"
+
+#define MAPWRITER_HIBERNATE_FACTOR 50
+
+int MapWriterDelay = 200;
+int MapWriterMaxPages = 100;
+int MapWriterPreallocMaxRelations = 32;
+double MapWriterLRUMultiplier = 2.0;
+
+/*
+ * MapWriterExitCallback
+ *		Exit hook registered by MapWriterMain() via before_shmem_exit().
+ *
+ * Publishes INVALID_PROC_NUMBER through MapStrategyNotifyWriter() so the
+ * shared mapwriter slot no longer names this process.  Both callback
+ * arguments are unused.
+ */
+static void
+MapWriterExitCallback(int code, Datum arg)
+{
+ (void) code;
+ (void) arg;
+ MapStrategyNotifyWriter(INVALID_PROC_NUMBER);
+}
+
+/*
+ * MapBackgroundWorkersRegister
+ *		Register the Umbra map writer as a background worker.
+ *
+ * The worker runs MapWriterMain from the core "postgres" library, is started
+ * once recovery has finished, and is restarted 5 seconds after an exit.  It
+ * requests shared memory access and a database connection, and takes no
+ * argument.
+ */
+void
+MapBackgroundWorkersRegister(void)
+{
+    BackgroundWorker worker;
+
+    memset(&worker, 0, sizeof(worker));
+
+    /* Identity: where the entry point lives and how the worker is shown. */
+    snprintf(worker.bgw_name, BGW_MAXLEN, "Umbra mapwriter");
+    snprintf(worker.bgw_type, BGW_MAXLEN, "map writer");
+    snprintf(worker.bgw_library_name, BGW_MAXLEN, "postgres");
+    snprintf(worker.bgw_function_name, BGW_MAXLEN, "MapWriterMain");
+
+    /* Capabilities and lifecycle. */
+    worker.bgw_flags = BGWORKER_SHMEM_ACCESS |
+        BGWORKER_BACKEND_DATABASE_CONNECTION;
+    worker.bgw_start_time = BgWorkerStart_RecoveryFinished;
+    worker.bgw_restart_time = 5;
+
+    /* Nobody is notified about start/stop, and no argument is passed. */
+    worker.bgw_notify_pid = 0;
+    worker.bgw_main_arg = (Datum) 0;
+
+    RegisterBackgroundWorker(&worker);
+}
+
+/*
+ * MapWriterMain
+ *		Entry point of the Umbra map writer background worker.
+ *
+ * Installs its signal handlers, initializes a connection with InvalidOid
+ * for both database and user, creates a private memory context, sets up
+ * sigsetjmp-based error recovery, and then loops forever.  Each cycle
+ * optionally runs a preallocation step, flushes up to a computed number of
+ * MAP pages, and sleeps on the process latch.
+ *
+ * "arg" is unused.
+ */
+void
+MapWriterMain(Datum arg)
+{
+ sigjmp_buf local_sigjmp_buf;
+ MemoryContext mapwriter_context;
+ /* whether the previous cycle found nothing to do */
+ bool prev_hibernate = false;
+
+ (void) arg;
+ /* Make sure the shared mapwriter slot is cleared when we exit. */
+ before_shmem_exit(MapWriterExitCallback, 0);
+
+ pqsignal(SIGHUP, SignalHandlerForConfigReload);
+ pqsignal(SIGINT, SIG_IGN);
+ pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
+ pqsignal(SIGQUIT, SignalHandlerForCrashExit);
+ pqsignal(SIGALRM, SIG_IGN);
+ pqsignal(SIGPIPE, SIG_IGN);
+ pqsignal(SIGUSR1, procsignal_sigusr1_handler);
+ pqsignal(SIGUSR2, SIG_IGN);
+ pqsignal(SIGCHLD, SIG_DFL);
+
+ BackgroundWorkerUnblockSignals();
+ BackgroundWorkerInitializeConnectionByOid(InvalidOid, InvalidOid, 0);
+
+ /*
+ * All work runs in this context; it is reset wholesale during error
+ * recovery below.
+ */
+ mapwriter_context = AllocSetContextCreate(TopMemoryContext,
+ "Map Writer",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextSwitchTo(mapwriter_context);
+
+ /*
+ * Error recovery: control arrives here via siglongjmp once
+ * PG_exception_stack points at local_sigjmp_buf (set just below).
+ */
+ if (sigsetjmp(local_sigjmp_buf, 1) != 0)
+ {
+ error_context_stack = NULL;
+ HOLD_INTERRUPTS();
+ EmitErrorReport();
+
+ /* Release whatever the failed cycle may still be holding. */
+ LWLockReleaseAll();
+ ConditionVariableCancelSleep();
+ pgstat_report_wait_end();
+ MapAbortBufferIO();
+ /* Stop advertising ourselves as the process to wake. */
+ MapStrategyNotifyWriter(INVALID_PROC_NUMBER);
+ MapBackendExitCleanup();
+ AtEOXact_Buffers(false);
+ AtEOXact_SMgr();
+ AtEOXact_Files(false);
+ AtEOXact_HashTables(false);
+
+ /* Return to our own context and discard everything allocated in it. */
+ MemoryContextSwitchTo(mapwriter_context);
+ FlushErrorState();
+ MemoryContextReset(mapwriter_context);
+ RESUME_INTERRUPTS();
+
+ /* Sleep one second before re-entering the main loop. */
+ pg_usleep(1000000L);
+ smgrreleaseall();
+ }
+
+ PG_exception_stack = &local_sigjmp_buf;
+
+ for (;;)
+ {
+ uint32 recent_alloc = 0;
+ int target_pages = 0;
+ int cleaned = 0;
+ int prealloc_ops = 0;
+ bool can_hibernate = false;
+
+ ResetLatch(MyLatch);
+ ProcessMainLoopInterrupts();
+
+ /*
+ * NOTE(review): recent_alloc is presumably the number of MAP buffer
+ * allocations since the previous call; confirm against MapSyncStart().
+ */
+ (void) MapSyncStart(NULL, &recent_alloc);
+ /* Preallocation only runs when there was recent allocation activity. */
+ if (recent_alloc > 0 && MapWriterPreallocMaxRelations > 0)
+ prealloc_ops = MapPreallocStep(MapWriterPreallocMaxRelations);
+
+ if (MapWriterMaxPages > 0)
+ {
+ int idle_pages;
+ double target_f;
+
+ /*
+ * Flush target: recent allocations scaled by the multiplier and
+ * rounded to nearest, or one eighth of the maximum (at least 1)
+ * when idle.  The result is clamped to [1, MapWriterMaxPages].
+ */
+ idle_pages = Max(1, MapWriterMaxPages / 8);
+ if (recent_alloc > 0)
+ {
+ target_f = recent_alloc * MapWriterLRUMultiplier;
+ target_pages = (int) (target_f + 0.5);
+ }
+ else
+ target_pages = idle_pages;
+
+ target_pages = Min(MapWriterMaxPages, Max(1, target_pages));
+ cleaned = MapBgWriterFlush(target_pages);
+ }
+
+ /* A cycle is idle when nothing was allocated, flushed or preallocated. */
+ can_hibernate = (recent_alloc == 0 &&
+ cleaned == 0 &&
+ prealloc_ops == 0);
+
+ if (FirstCallSinceLastCheckpoint())
+ smgrreleaseall();
+
+ /*
+ * Advertise our proc number so MapWakeWriter() can set our latch
+ * while we sleep.  If the normal sleep times out and both this cycle
+ * and the previous one were idle, sleep again for
+ * MAPWRITER_HIBERNATE_FACTOR times longer.
+ */
+ MapStrategyNotifyWriter(MyProcNumber);
+ if (WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ MapWriterDelay,
+ WAIT_EVENT_MAPWRITER_MAIN) == WL_TIMEOUT &&
+ can_hibernate &&
+ prev_hibernate)
+ {
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ MapWriterDelay * MAPWRITER_HIBERNATE_FACTOR,
+ WAIT_EVENT_MAPWRITER_HIBERNATE);
+ }
+ /* Awake again: withdraw the wakeup request. */
+ MapStrategyNotifyWriter(INVALID_PROC_NUMBER);
+ prev_hibernate = can_hibernate;
+ }
+}
diff --git a/src/backend/postmaster/meson.build b/src/backend/postmaster/meson.build
index 6cba23bbee..0a30057703 100644
--- a/src/backend/postmaster/meson.build
+++ b/src/backend/postmaster/meson.build
@@ -18,3 +18,9 @@ backend_sources += files(
'walsummarizer.c',
'walwriter.c',
)
+
+if get_option('umbra').enabled()
+ backend_sources += files(
+ 'mapwriter.c',
+ )
+endif
diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index ae82974700..b940fca13b 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -102,6 +102,9 @@
#include "port/pg_getopt_ctx.h"
#include "postmaster/autovacuum.h"
#include "postmaster/bgworker_internals.h"
+#ifdef USE_UMBRA
+#include "postmaster/mapwriter.h"
+#endif
#include "postmaster/pgarch.h"
#include "postmaster/postmaster.h"
#include "postmaster/syslogger.h"
@@ -922,6 +925,10 @@ PostmasterMain(int argc, char *argv[])
*/
ApplyLauncherRegister();
+#ifdef USE_UMBRA
+ MapBackgroundWorkersRegister();
+#endif
+
/*
* Register the shared memory needs of all core subsystems.
*/
diff --git a/src/backend/storage/map/Makefile b/src/backend/storage/map/Makefile
index 94ae1c1b72..6fffc43e59 100644
--- a/src/backend/storage/map/Makefile
+++ b/src/backend/storage/map/Makefile
@@ -16,6 +16,7 @@ OBJS = \
map.o \
mapinit.o \
mapbuf.o \
+ mapbgproc.o \
mapflush.o \
mapclock.o \
mapinflight.o \
diff --git a/src/backend/storage/map/map.c b/src/backend/storage/map/map.c
index 0dad150b2b..6793db8671 100644
--- a/src/backend/storage/map/map.c
+++ b/src/backend/storage/map/map.c
@@ -105,6 +105,9 @@ static bool MapMapPageWithinLogicalRange(UmbraFileContext *map_ctx,
RelFileLocator rnode,
ForkNumber forknum,
BlockNumber map_blkno);
+bool MapForkPreallocSettings(ForkNumber forknum, BlockNumber *soft_low,
+ BlockNumber *hard_low,
+ BlockNumber *batch_blocks);
static MapCachedLookupResult MapTryLookupCachedEntry(RelFileLocator rnode,
ForkNumber forknum,
BlockNumber map_blkno,
@@ -142,6 +145,47 @@ MapResetAllTruncatePreloads(void)
}
}
+/*
+ * MapForkPreallocSettings
+ *		Look up the preallocation settings configured for one fork.
+ *
+ * On success the soft watermark, hard watermark and batch size are stored
+ * through the output pointers and true is returned.  False is returned, with
+ * the outputs left untouched, for any fork other than main/FSM/VM and when
+ * the soft watermark or the batch size is not positive.  The hard watermark
+ * is clamped into the range [1, soft watermark].
+ */
+bool
+MapForkPreallocSettings(ForkNumber forknum, BlockNumber *soft_low,
+                        BlockNumber *hard_low, BlockNumber *batch_blocks)
+{
+    int         soft_setting;
+    int         hard_setting;
+    int         batch_setting;
+
+    if (forknum == MAIN_FORKNUM)
+    {
+        soft_setting = map_prealloc_main_low;
+        hard_setting = map_prealloc_main_hard;
+        batch_setting = map_prealloc_main_batch;
+    }
+    else if (forknum == FSM_FORKNUM)
+    {
+        soft_setting = map_prealloc_fsm_low;
+        hard_setting = map_prealloc_fsm_hard;
+        batch_setting = map_prealloc_fsm_batch;
+    }
+    else if (forknum == VISIBILITYMAP_FORKNUM)
+    {
+        soft_setting = map_prealloc_vm_low;
+        hard_setting = map_prealloc_vm_hard;
+        batch_setting = map_prealloc_vm_batch;
+    }
+    else
+        return false;
+
+    /* A non-positive soft watermark or batch size disables preallocation. */
+    if (soft_setting <= 0 || batch_setting <= 0)
+        return false;
+
+    /* Keep the hard watermark at least 1 and no larger than the soft one. */
+    hard_setting = Min(Max(hard_setting, 1), soft_setting);
+
+    *soft_low = (BlockNumber) soft_setting;
+    *hard_low = (BlockNumber) hard_setting;
+    *batch_blocks = (BlockNumber) batch_setting;
+    return true;
+}
static bool
MapTruncateEntryRange(ForkNumber forknum, BlockNumber n_lblknos,
@@ -728,6 +772,8 @@ MapReserveFreshPblkno(UmbraFileContext *map_ctx, RelFileLocator rnode,
if (MapReserveNextPblkno(map_ctx, rnode, forknum, lblkno,
new_pblkno, false))
{
+ if (!InRecovery)
+ (void) MapMaybePreallocateFork(map_ctx, rnode, forknum, false);
return true;
}
@@ -1418,6 +1464,14 @@ void MapGetNewPbkno(UmbraFileContext *map_ctx, RelFileLocator rnode, ForkNumber
"failed to reserve physical block for relation %u/%u/%u fork %d blk %u",
rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum, lblkno);
}
+
+ /*
+ * Foreground/background coordination:
+ * - low-but-not-critical watermark: wake mapwriter
+ * - critical watermark: foreground performs one-shot preallocation
+ */
+ if (!InRecovery)
+ (void) MapMaybePreallocateFork(map_ctx, rnode, forknum, false);
}
/*
diff --git a/src/backend/storage/map/mapbgproc.c b/src/backend/storage/map/mapbgproc.c
new file mode 100644
index 0000000000..3bb167bae9
--- /dev/null
+++ b/src/backend/storage/map/mapbgproc.c
@@ -0,0 +1,323 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapbgproc.c
+ * MAP background maintenance and coordination.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/latch.h"
+#include "storage/map.h"
+#include "storage/map_internal.h"
+#include "storage/mapsuper_internal.h"
+#include "storage/proc.h"
+
+static bool MapForkNeedsPrealloc(const MapSuperEntry *entry, ForkNumber forknum,
+ bool background_mode);
+
+/*
+ * MapAllocPressurePeek
+ *		Return the current value of the shared num_allocs counter.
+ *
+ * This is a plain atomic read; the counter is not modified.
+ */
+uint32
+MapAllocPressurePeek(void)
+{
+ return pg_atomic_read_u32(&MapShared->num_allocs);
+}
+
+/*
+ * MapStrategyNotifyWriter
+ *		Record which process MapWakeWriter() should wake.
+ *
+ * Stores mapwriter_procno in shared memory under clock_lock.  The map
+ * writer passes its own proc number before sleeping and
+ * INVALID_PROC_NUMBER to withdraw the request.
+ */
+void
+MapStrategyNotifyWriter(int mapwriter_procno)
+{
+ SpinLockAcquire(&MapShared->clock_lock);
+ MapShared->mapwriter_procno = mapwriter_procno;
+ SpinLockRelease(&MapShared->clock_lock);
+}
+
+/*
+ * MapWakeWriter
+ *		Wake the map writer if it has asked to be notified.
+ *
+ * The advertised proc number is consumed under clock_lock, so each request
+ * published by MapStrategyNotifyWriter() leads to at most one SetLatch()
+ * call.  The latch itself is set after the spinlock has been released.
+ */
+void
+MapWakeWriter(void)
+{
+    int         waiter;
+
+    SpinLockAcquire(&MapShared->clock_lock);
+    waiter = MapShared->mapwriter_procno;
+    if (waiter == -1)
+    {
+        /* Nobody is waiting for a wakeup. */
+        SpinLockRelease(&MapShared->clock_lock);
+        return;
+    }
+    MapShared->mapwriter_procno = -1;
+    SpinLockRelease(&MapShared->clock_lock);
+
+    SetLatch(&ProcGlobal->allProcs[waiter].procLatch);
+}
+
+/*
+ * MapMaybePreallocateFork
+ *		Grow the physical capacity of one fork when its free headroom has
+ *		dropped to the configured watermarks.
+ *
+ * background_mode is true when called from MapPreallocStep() and false on
+ * the foreground allocation paths.  In foreground mode the work is left to
+ * the map writer (which is woken) unless the headroom is at or below the
+ * hard watermark.
+ *
+ * Returns true only when umfile_ctx_preallocate_blocks() reported success.
+ * Raises ERROR if the computed target would exceed InvalidBlockNumber - 1,
+ * and re-throws any error raised during the file operation after clearing
+ * the per-fork "preallocation in progress" runtime flag.
+ */
+bool
+MapMaybePreallocateFork(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, bool background_mode)
+{
+ MapSuperEntry *entry;
+ BlockNumber soft_low;
+ BlockNumber hard_low;
+ BlockNumber batch_blocks;
+ BlockNumber next;
+ BlockNumber capacity;
+ BlockNumber remaining;
+ BlockNumber target_nblocks;
+ uint32 prealloc_flag;
+ bool prealloc_ok = false;
+ bool started = false;
+ uint64 target64;
+
+ if (!MapForkHasMappedState(forknum))
+ return false;
+
+ if (!MapForkPreallocSettings(forknum, &soft_low, &hard_low, &batch_blocks))
+ return false;
+
+ if (!MapSBlockEnsureLoaded(map_ctx, rnode))
+ return false;
+
+ prealloc_flag = MapSuperPreallocFlag(forknum);
+ Assert(prealloc_flag != 0);
+
+ /* On success the entry's lock is held exclusively until released below. */
+ if (!MapSuperFindEntryLocked(rnode, LW_EXCLUSIVE, &entry))
+ return false;
+
+ if (!entry->in_use || (entry->flags & MAPSUPER_FLAG_VALID) == 0)
+ {
+ LWLockRelease(&entry->lock);
+ return false;
+ }
+
+ /* The CRC is only checked while the cached superblock is not dirty. */
+ if ((entry->flags & MAPSUPER_FLAG_CORRUPT) ||
+ !MapSuperblockHasValidIdentity(&entry->super) ||
+ ((entry->flags & MAPSUPER_FLAG_DIRTY) == 0 &&
+ !MapSuperblockCheckCRC(&entry->super)))
+ {
+ LWLockRelease(&entry->lock);
+ if (!InRecovery)
+ MapSBlockReportCorrupt(rnode, "invalid identity or CRC");
+ return false;
+ }
+
+ Assert(MapNormalizeForkBlockCount(forknum,
+ MapSuperblockGetNextFreePhysBlock(&entry->super,
+ forknum)) <=
+ MapSuperGetReservedNextFree(entry, forknum));
+ next = MapSuperGetReservedNextFree(entry, forknum);
+ capacity = MapSuperblockGetPhysCapacity(&entry->super, forknum);
+ capacity = MapNormalizeForkBlockCount(forknum, capacity);
+
+ /* Forks whose next-free block is still below the soft watermark are skipped. */
+ if (next < soft_low)
+ {
+ LWLockRelease(&entry->lock);
+ return false;
+ }
+
+ /* More headroom than the soft watermark: nothing to do yet. */
+ remaining = (capacity > next) ? (capacity - next) : 0;
+ if (remaining > soft_low)
+ {
+ LWLockRelease(&entry->lock);
+ return false;
+ }
+
+ /* Foreground, low but above the hard watermark: hand off to the map writer. */
+ if (!background_mode && remaining > hard_low)
+ {
+ LWLockRelease(&entry->lock);
+ MapWakeWriter();
+ return false;
+ }
+
+ /* The runtime flag is already set: a preallocation is in flight. */
+ if ((entry->runtime_flags & prealloc_flag) != 0)
+ {
+ LWLockRelease(&entry->lock);
+ if (!background_mode)
+ MapWakeWriter();
+ return false;
+ }
+
+ /*
+ * New capacity is max(capacity, next) + batch_blocks, computed in 64
+ * bits so an overflow of BlockNumber is detected rather than wrapped.
+ */
+ target64 = Max((uint64) capacity + (uint64) batch_blocks,
+ (uint64) next + (uint64) batch_blocks);
+ if (target64 > (uint64) (InvalidBlockNumber - 1))
+ {
+ LWLockRelease(&entry->lock);
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot preallocate physical blocks beyond %u for relation %u/%u/%u fork %d",
+ InvalidBlockNumber - 1,
+ rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum)));
+ }
+ target_nblocks = (BlockNumber) target64;
+
+ if (target_nblocks <= capacity)
+ {
+ LWLockRelease(&entry->lock);
+ return false;
+ }
+
+ /*
+ * Set the runtime flag, then drop the lock so the file operation runs
+ * without holding it.
+ */
+ entry->runtime_flags |= prealloc_flag;
+ LWLockRelease(&entry->lock);
+ started = true;
+
+ PG_TRY();
+ {
+ if (umfile_ctx_fork_exists(map_ctx, forknum, UMFILE_EXISTS_SPARSE))
+ prealloc_ok = umfile_ctx_preallocate_blocks(map_ctx, forknum,
+ UMFILE_NBLOCKS_SPARSE,
+ target_nblocks);
+ }
+ PG_CATCH();
+ {
+ /* Clear the runtime flag before propagating the error. */
+ if (started && MapSuperFindEntryLocked(rnode, LW_EXCLUSIVE, &entry))
+ {
+ entry->runtime_flags &= ~prealloc_flag;
+ LWLockRelease(&entry->lock);
+ }
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /*
+ * Look the entry up again: the lock was released during the file
+ * operation.  The new capacity is published only if the entry is still
+ * valid and its recorded capacity is still below the target.
+ */
+ if (MapSuperFindEntryLocked(rnode, LW_EXCLUSIVE, &entry))
+ {
+ if (prealloc_ok &&
+ entry->in_use &&
+ (entry->flags & MAPSUPER_FLAG_VALID) != 0 &&
+ MapNormalizeForkBlockCount(forknum,
+ MapSuperblockGetPhysCapacity(&entry->super,
+ forknum)) < target_nblocks)
+ {
+ XLogRecPtr map_lsn = GetXLogWriteRecPtr();
+
+ MapSuperblockSetPhysCapacity(&entry->super, forknum, target_nblocks);
+ MapSuperblockSetLastUpdatedLSN(&entry->super, map_lsn);
+ entry->page_lsn = map_lsn;
+ entry->flags |= MAPSUPER_FLAG_DIRTY;
+ }
+ entry->runtime_flags &= ~prealloc_flag;
+ LWLockRelease(&entry->lock);
+ }
+
+ /* A failed foreground attempt falls back to waking the map writer. */
+ if (!background_mode && !prealloc_ok)
+ MapWakeWriter();
+
+ return prealloc_ok;
+}
+
+/*
+ * MapForkNeedsPrealloc
+ *		Read-only test of whether a fork is due for preallocation.
+ *
+ * Applies the same watermark rules as MapMaybePreallocateFork() to an entry
+ * the caller has already locked, without modifying it.  Returns false when
+ * the fork has no mapped state, no usable settings, does not exist in the
+ * superblock, still has enough headroom, or already has its preallocation
+ * runtime flag set.
+ */
+static bool
+MapForkNeedsPrealloc(const MapSuperEntry *entry, ForkNumber forknum,
+                     bool background_mode)
+{
+    BlockNumber low_mark;
+    BlockNumber critical_mark;
+    BlockNumber batch_unused;
+    BlockNumber reserved_next;
+    BlockNumber phys_capacity;
+    BlockNumber headroom;
+    uint32      busy_flag;
+
+    if (!MapForkHasMappedState(forknum) ||
+        !MapForkPreallocSettings(forknum, &low_mark, &critical_mark,
+                                 &batch_unused) ||
+        !MapSuperForkExists(&entry->super, forknum))
+        return false;
+
+    busy_flag = MapSuperPreallocFlag(forknum);
+    Assert(busy_flag != 0);
+
+    Assert(MapNormalizeForkBlockCount(forknum,
+                                      MapSuperblockGetNextFreePhysBlock(&entry->super,
+                                                                        forknum)) <=
+           MapSuperGetReservedNextFree(entry, forknum));
+    reserved_next = MapSuperGetReservedNextFree(entry, forknum);
+    phys_capacity = MapNormalizeForkBlockCount(forknum,
+                                               MapSuperblockGetPhysCapacity(&entry->super,
+                                                                            forknum));
+
+    /* Blocks still available between the reservation point and capacity. */
+    headroom = (phys_capacity > reserved_next) ?
+        (phys_capacity - reserved_next) : 0;
+
+    /* Too small to bother with, or still comfortably above the soft mark. */
+    if (reserved_next < low_mark || headroom > low_mark)
+        return false;
+
+    /* Foreground callers only act at or below the hard watermark. */
+    if (!background_mode && headroom > critical_mark)
+        return false;
+
+    /* Due, unless a preallocation is already flagged as running. */
+    return (entry->runtime_flags & busy_flag) == 0;
+}
+
+/*
+ * MapPreallocStep
+ *		Scan a bounded slice of the superblock cache and preallocate forks
+ *		that have run low on physical capacity.
+ *
+ * max_relations bounds how many valid entries are examined per call.  The
+ * scan position is kept in a function-local static, so successive calls
+ * continue where the previous one stopped and wrap around at
+ * MapSuperCapacity.
+ *
+ * Returns the number of fork preallocations that succeeded; always 0 during
+ * recovery or when max_relations or MapSuperCapacity is not positive.
+ */
+int
+MapPreallocStep(int max_relations)
+{
+ /* round-robin cursor, preserved across calls within this process */
+ static int scan_slot = 0;
+ int max_scan;
+ int scanned = 0;
+ int visited = 0;
+ int prealloc_ops = 0;
+
+ if (InRecovery || max_relations <= 0 || MapSuperCapacity <= 0)
+ return 0;
+
+ /* Look at no more than max(64, 8 * max_relations) slots, capped at capacity. */
+ max_scan = Min(MapSuperCapacity, Max(64, max_relations * 8));
+
+ while (scanned < max_scan && visited < max_relations)
+ {
+ MapSuperEntry *entry;
+ RelFileLocator rnode;
+ RelFileLocatorBackend rlocator;
+ UmbraFileContext *ctx;
+ bool prealloc_main;
+ bool prealloc_fsm;
+ bool prealloc_vm;
+
+ entry = MapSuperEntryBySlot(scan_slot);
+ scan_slot = (scan_slot + 1) % MapSuperCapacity;
+ scanned++;
+
+ /* Inspect the entry under a shared lock; skip unused or unusable slots. */
+ LWLockAcquire(&entry->lock, LW_SHARED);
+ if (!entry->in_use)
+ {
+ LWLockRelease(&entry->lock);
+ continue;
+ }
+ rnode = entry->key.rnode;
+ if ((entry->flags & MAPSUPER_FLAG_VALID) == 0 ||
+ (entry->flags & MAPSUPER_FLAG_CORRUPT) != 0 ||
+ !MapSuperblockHasValidIdentity(&entry->super) ||
+ ((entry->flags & MAPSUPER_FLAG_DIRTY) == 0 &&
+ !MapSuperblockCheckCRC(&entry->super)))
+ {
+ LWLockRelease(&entry->lock);
+ continue;
+ }
+ /* Decide per fork while locked; the actual work happens after release. */
+ prealloc_main = MapForkNeedsPrealloc(entry, MAIN_FORKNUM, true);
+ prealloc_fsm = MapForkNeedsPrealloc(entry, FSM_FORKNUM, true);
+ prealloc_vm = MapForkNeedsPrealloc(entry, VISIBILITYMAP_FORKNUM, true);
+ LWLockRelease(&entry->lock);
+ /* Counts every valid entry examined, whether or not it needs work. */
+ visited++;
+
+ if (!prealloc_main && !prealloc_fsm && !prealloc_vm)
+ continue;
+
+ rlocator.locator = rnode;
+ rlocator.backend = INVALID_PROC_NUMBER;
+ ctx = umfile_ctx_acquire(rlocator);
+ if (ctx == NULL)
+ continue;
+
+ /*
+ * MapMaybePreallocateFork() re-validates the entry under its own
+ * exclusive lock, since the shared lock was dropped above.
+ */
+ if (prealloc_main &&
+ MapMaybePreallocateFork(ctx, rnode, MAIN_FORKNUM, true))
+ prealloc_ops++;
+ if (prealloc_fsm &&
+ MapMaybePreallocateFork(ctx, rnode, FSM_FORKNUM, true))
+ prealloc_ops++;
+ if (prealloc_vm &&
+ MapMaybePreallocateFork(ctx, rnode, VISIBILITYMAP_FORKNUM, true))
+ prealloc_ops++;
+ }
+
+ return prealloc_ops;
+}
diff --git a/src/backend/storage/map/mapclock.c b/src/backend/storage/map/mapclock.c
index 3ccdbb2310..5e4eb5dac7 100644
--- a/src/backend/storage/map/mapclock.c
+++ b/src/backend/storage/map/mapclock.c
@@ -273,6 +273,11 @@ MapClockGetBuffer(void)
uint32 local_buf_state;
int num_slots = MapShared->num_slots;
+ /*
+ * If mapwriter asked for allocation notification, wake it up.
+ */
+ MapWakeWriter();
+
/*
* First, check if there's a buffer on the free list.
*/
diff --git a/src/backend/storage/map/mapflush.c b/src/backend/storage/map/mapflush.c
index def1943dee..b13991e27a 100644
--- a/src/backend/storage/map/mapflush.c
+++ b/src/backend/storage/map/mapflush.c
@@ -226,7 +226,7 @@ map_flush_buffer_target_comparator(const MapFlushBufferTarget *a,
void
MapPreCheckpoint(void)
{
- /* no-op: checkpoint work is handled by MapCheckpoint(). */
+ /* no-op: reclaim is handled by sync request queues. */
}
/*
@@ -346,7 +346,7 @@ MapCheckpointDatabaseTablespaces(Oid dbid, int ntablespaces,
void
MapPostCheckpoint(void)
{
- /* no-op: checkpoint work is handled by MapCheckpoint(). */
+ /* no-op: reclaim is handled by sync request queues. */
}
int
@@ -355,7 +355,7 @@ MapBgWriterFlush(int max_pages)
if (max_pages <= 0)
return 0;
- /* Non-checkpoint flushes regular MAP pages only; superblock is checkpoint-owned. */
+ /* mapwriter flushes regular MAP pages only; superblock is checkpoint-owned. */
return MapFlushDirtyBuffers(max_pages, false);
}
diff --git a/src/backend/storage/map/mapinit.c b/src/backend/storage/map/mapinit.c
index c9ddd12ff0..c30057cf04 100644
--- a/src/backend/storage/map/mapinit.c
+++ b/src/backend/storage/map/mapinit.c
@@ -26,6 +26,15 @@ int map_buffers = 1024; /* Number of map buffer slots */
* relations do not churn through repeated ensure/load cycles.
*/
int map_superblocks = 262144;
+int map_prealloc_main_low = 512; /* 4MB in 8k blocks */
+int map_prealloc_main_hard = 128; /* 1MB in 8k blocks */
+int map_prealloc_main_batch = 1024; /* 8MB in 8k blocks */
+int map_prealloc_fsm_low = 64; /* 512kB in 8k blocks */
+int map_prealloc_fsm_hard = 16; /* 128kB in 8k blocks */
+int map_prealloc_fsm_batch = 128; /* 1MB in 8k blocks */
+int map_prealloc_vm_low = 64; /* 512kB in 8k blocks */
+int map_prealloc_vm_hard = 16; /* 128kB in 8k blocks */
+int map_prealloc_vm_batch = 128; /* 1MB in 8k blocks */
/* Shared memory pointer */
MapSharedData *MapShared = NULL;
@@ -105,6 +114,7 @@ MapShmemInit(void *arg)
MapShared->num_slots = map_buffers;
MapShared->first_free_buffer = 0;
+ MapShared->mapwriter_procno = -1;
pg_atomic_init_u32(&MapShared->next_victim_buffer, 0);
pg_atomic_init_u32(&MapShared->num_allocs, 0);
MapShared->complete_passes = 0;
diff --git a/src/backend/storage/map/mapsuper.c b/src/backend/storage/map/mapsuper.c
index 07ac7b39c6..e3e9421566 100644
--- a/src/backend/storage/map/mapsuper.c
+++ b/src/backend/storage/map/mapsuper.c
@@ -838,6 +838,21 @@ MapSuperForkExists(const MapSuperblock *super, ForkNumber forknum)
return MapSuperblockGetLogicalNblocks(super, forknum) != InvalidBlockNumber;
}
+/*
+ * MapSuperPreallocFlag
+ *		Map a fork number to its "preallocation in progress" runtime flag.
+ *
+ * Returns 0 for forks that have no such flag.
+ */
+uint32
+MapSuperPreallocFlag(ForkNumber forknum)
+{
+    if (forknum == MAIN_FORKNUM)
+        return MAPSUPER_RUNTIME_FLAG_PREALLOC_MAIN;
+    if (forknum == FSM_FORKNUM)
+        return MAPSUPER_RUNTIME_FLAG_PREALLOC_FSM;
+    if (forknum == VISIBILITYMAP_FORKNUM)
+        return MAPSUPER_RUNTIME_FLAG_PREALLOC_VM;
+
+    return 0;
+}
static uint32
MapSuperExtendingFlag(ForkNumber forknum)
@@ -1274,7 +1289,7 @@ MapSBlockInit(UmbraFileContext *map_ctx, RelFileLocator rnode, XLogRecPtr map_ls
/*
* Persist superblock immediately so later backends in bootstrap/initdb can
- * read block 0 even before checkpoint gets a chance to flush.
+ * read block 0 even before checkpoint/mapwriter gets a chance to flush.
* This keeps create-time O(1): only one 512-byte sector is written.
*/
write_super = entry->super;
diff --git a/src/backend/storage/map/meson.build b/src/backend/storage/map/meson.build
index bdaa0dd14a..5a9f685e53 100644
--- a/src/backend/storage/map/meson.build
+++ b/src/backend/storage/map/meson.build
@@ -4,6 +4,7 @@ backend_sources += files(
'map.c',
'mapinit.c',
'mapbuf.c',
+ 'mapbgproc.c',
'mapflush.c',
'mapclock.c',
'mapinflight.c',
diff --git a/src/backend/storage/smgr/umbra.c b/src/backend/storage/smgr/umbra.c
index f382d56c34..61c74a2378 100644
--- a/src/backend/storage/smgr/umbra.c
+++ b/src/backend/storage/smgr/umbra.c
@@ -309,7 +309,7 @@ UmMetadataImmediateSync(SMgrRelation reln)
void
UmMetadataRegisterSync(SMgrRelation reln)
{
- umimmedsync(reln, UMBRA_METADATA_FORKNUM);
+ umfile_registersync(um_ctx_acquire(reln), UMBRA_METADATA_FORKNUM);
}
void
@@ -2527,12 +2527,17 @@ umimmedsync(SMgrRelation reln, ForkNumber forknum)
void
umregistersync(SMgrRelation reln, ForkNumber forknum)
{
- umimmedsync(reln, forknum);
+ umfile_registersync(um_ctx_acquire(reln), forknum);
}
bool
umpreparependingsync(SMgrRelation reln)
{
+ /*
+ * Skip-WAL relations write data directly first and publish Umbra MAP
+ * metadata only at the durable transition boundary. Rebuild MAP and
+ * superblock before the relation enters the fsync path.
+ */
if (RelFileLocatorSkippingWAL(reln->smgr_rlocator.locator))
UmRebuildMapAndSuperblockForSkipWAL(reln);
diff --git a/src/backend/storage/smgr/umfile.c b/src/backend/storage/smgr/umfile.c
index 63afc8546c..a88e54c46b 100644
--- a/src/backend/storage/smgr/umfile.c
+++ b/src/backend/storage/smgr/umfile.c
@@ -14,6 +14,9 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/uio.h>
+#if defined(__linux__)
+#include <sys/syscall.h>
+#endif
#include "access/xlogutils.h"
#include "catalog/pg_tablespace_d.h"
@@ -86,6 +89,8 @@ static BlockNumber umfile_nblocks_dense(UmbraFileContext *ctx,
RelFileLocatorBackend rlocator,
ForkNumber forknum);
static BlockNumber umfile_nblocks_in_seg(File vfd);
+static bool umfile_preallocate_fd(File fd, off_t target_bytes);
+static bool umfile_preallocate_errno_is_unsupported(int err);
static bool umfile_collect_existing_segnos_by_path(const char *seg0path,
BlockNumber **segnos_out,
int *nsegnos_out);
@@ -314,6 +319,69 @@ umfile_ctx_extend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno,
umfile_extend(ctx, forknum, blkno, buffer, true /* skipFsync */ );
}
+/*
+ * umfile_ctx_preallocate_blocks
+ *		Ensure the fork's segment files can address target_nblocks blocks,
+ *		without writing any page contents.
+ *
+ * "mode" selects both the existence check and the block-count method
+ * (sparse or dense).  Returns true if the fork already has at least
+ * target_nblocks blocks or every affected segment was preallocated.
+ * Returns false if ctx is NULL, the fork does not exist, or
+ * umfile_preallocate_fd() could not reserve space for a segment.
+ */
+bool
+umfile_ctx_preallocate_blocks(UmbraFileContext *ctx, ForkNumber forknum,
+ UmFileNblocksMode mode,
+ BlockNumber target_nblocks)
+{
+ BlockNumber nblocks;
+ BlockNumber cur_nblocks;
+
+ if (ctx == NULL)
+ return false;
+
+ if (!umfile_exists(ctx, forknum,
+ mode == UMFILE_NBLOCKS_SPARSE ?
+ UMFILE_EXISTS_SPARSE :
+ UMFILE_EXISTS_DENSE))
+ return false;
+
+ nblocks = umfile_nblocks(ctx, forknum, mode);
+ if (target_nblocks <= nblocks)
+ return true;
+
+ /*
+ * NOTE(review): target_nblocks is a BlockNumber; if that is a 32-bit
+ * type this comparison against MaxBlockNumber + 1 can never be true.
+ * Confirm the intended upper bound.
+ */
+ if (target_nblocks > (uint64) MaxBlockNumber + 1)
+ return false;
+
+ /*
+ * Keep preallocation as a capacity operation: make the target segment
+ * BLCKSZ-addressable up to target_nblocks, but do not write page content.
+ * Later page writes may fill holes created by this step.
+ */
+ cur_nblocks = nblocks;
+ /* Each iteration handles one segment, starting with the one holding cur_nblocks. */
+ while (cur_nblocks < target_nblocks)
+ {
+ BlockNumber target_blkno;
+ BlockNumber targetseg;
+ BlockNumber targetseg_nblocks;
+ uint64 seg_start;
+ uint64 seg_end;
+ off_t target_bytes;
+ UmfdVec *v;
+
+ /* Block range [seg_start, seg_end) of this segment, clipped to the target. */
+ targetseg = cur_nblocks / ((BlockNumber) RELSEG_SIZE);
+ seg_start = (uint64) targetseg * (uint64) RELSEG_SIZE;
+ seg_end = seg_start + (uint64) RELSEG_SIZE;
+ if (seg_end > (uint64) target_nblocks)
+ seg_end = (uint64) target_nblocks;
+
+ /*
+ * target_blkno is the last block wanted in this segment;
+ * target_bytes is the segment file's desired size, i.e. relative to
+ * the start of the segment rather than the start of the fork.
+ */
+ targetseg_nblocks = (BlockNumber) (seg_end - seg_start);
+ target_blkno = (BlockNumber) (seg_start + targetseg_nblocks - 1);
+ target_bytes = (off_t) targetseg_nblocks * BLCKSZ;
+
+ v = umfile_getseg(ctx, ctx->rlocator, forknum, target_blkno,
+ true /* skipFsync */,
+ UM_EXTENSION_CREATE,
+ RelFileLocatorBackendIsTemp(ctx->rlocator));
+ if (!umfile_preallocate_fd(v->umfd_vfd, target_bytes))
+ return false;
+
+ cur_nblocks = (BlockNumber) seg_end;
+ }
+
+ return true;
+}
+
void
umfile_ctx_prefetch(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno)
{
@@ -1566,6 +1634,180 @@ umfile_extend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
umfile_register_dirty_seg(rlocator, false, forknum, v);
}
+/*
+ * umfile_preallocate_fd
+ *		Reserve bytes for a relation segment without writing page contents.
+ *
+ * Returns true when the file already covers target_bytes, or when the
+ * platform primitive reserved the missing range and the file size has been
+ * brought up to target_bytes.  Returns false when no preallocation primitive
+ * is compiled in, when the primitive fails with an errno that
+ * umfile_preallocate_errno_is_unsupported() classifies as "unsupported", or
+ * when macOS allocated fewer bytes than requested.  In those cases callers
+ * must not publish capacity that would only be a logical EOF extension.
+ * Any other failure raises ERROR.
+ *
+ * Linux calls fallocate(2) through the C library wrapper.  Unlike
+ * posix_fallocate(), that wrapper has no userspace zero-fill fallback; and
+ * unlike a raw syscall(SYS_fallocate, ...), it passes the 64-bit offset and
+ * length correctly on 32-bit ABIs that need register-pair alignment.  macOS
+ * uses F_PREALLOCATE, and other platforms may use posix_fallocate().
+ */
+static bool
+umfile_preallocate_fd(File fd, off_t target_bytes)
+{
+    off_t       current_bytes;
+    off_t       delta_bytes;
+    int         rawfd;
+
+    current_bytes = FileSize(fd);
+    if (current_bytes < 0)
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not determine file size for \"%s\": %m",
+                        FilePathName(fd))));
+
+    if (current_bytes >= target_bytes)
+        return true;
+
+    delta_bytes = target_bytes - current_bytes;
+
+    /* FileGetRawDesc() reports failure to (re)open the file as a negative value. */
+    rawfd = FileGetRawDesc(fd);
+    if (rawfd < 0)
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not open file \"%s\": %m",
+                        FilePathName(fd))));
+
+#if defined(__linux__)
+    {
+        int         rc;
+        int         save_errno;
+
+        /* Retry for as long as the call is interrupted by a signal. */
+        do
+        {
+            errno = 0;
+            pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_EXTEND);
+            rc = fallocate(rawfd, 0, current_bytes, delta_bytes);
+            save_errno = errno;
+            pgstat_report_wait_end();
+        } while (rc < 0 && save_errno == EINTR);
+
+        if (rc < 0)
+        {
+            if (umfile_preallocate_errno_is_unsupported(save_errno))
+                return false;
+
+            errno = save_errno;
+            ereport(ERROR,
+                    (errcode_for_file_access(),
+                     errmsg("could not preallocate file \"%s\" to %llu bytes: %m",
+                            FilePathName(fd),
+                            (unsigned long long) target_bytes)));
+        }
+    }
+#elif defined(__APPLE__) && defined(F_PREALLOCATE)
+    {
+        fstore_t    fst;
+        int         rc;
+        int         save_errno;
+
+        /*
+         * F_PEOFPOSMODE interprets fst_length as newly allocated bytes
+         * beyond current EOF, not the final file size.  Passing target_bytes
+         * here would over-reserve on repeated top-ups of the same segment.
+         */
+        MemSet(&fst, 0, sizeof(fst));
+        fst.fst_flags = F_ALLOCATECONTIG | F_ALLOCATEALL;
+        fst.fst_posmode = F_PEOFPOSMODE;
+        fst.fst_offset = 0;
+        fst.fst_length = delta_bytes;
+
+        /* First try for a contiguous allocation. */
+        do
+        {
+            errno = 0;
+            pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_EXTEND);
+            rc = fcntl(rawfd, F_PREALLOCATE, &fst);
+            save_errno = errno;
+            pgstat_report_wait_end();
+        } while (rc < 0 && save_errno == EINTR);
+
+        if (rc < 0)
+        {
+            /* Fall back to an all-or-nothing, possibly fragmented allocation. */
+            fst.fst_flags = F_ALLOCATEALL;
+            fst.fst_bytesalloc = 0;
+
+            do
+            {
+                errno = 0;
+                pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_EXTEND);
+                rc = fcntl(rawfd, F_PREALLOCATE, &fst);
+                save_errno = errno;
+                pgstat_report_wait_end();
+            } while (rc < 0 && save_errno == EINTR);
+
+            if (rc < 0)
+            {
+                if (umfile_preallocate_errno_is_unsupported(save_errno))
+                    return false;
+
+                errno = save_errno;
+                ereport(ERROR,
+                        (errcode_for_file_access(),
+                         errmsg("could not preallocate file \"%s\" to %llu bytes: %m",
+                                FilePathName(fd),
+                                (unsigned long long) target_bytes)));
+            }
+        }
+
+        /* A short allocation does not back the whole requested range. */
+        if (fst.fst_bytesalloc < delta_bytes)
+            return false;
+    }
+#elif defined(HAVE_POSIX_FALLOCATE)
+    {
+        int         rc;
+
+        /* posix_fallocate() returns the error number instead of setting errno. */
+        do
+        {
+            pgstat_report_wait_start(WAIT_EVENT_DATA_FILE_EXTEND);
+            rc = posix_fallocate(rawfd, current_bytes, delta_bytes);
+            pgstat_report_wait_end();
+        } while (rc == EINTR);
+
+        if (rc != 0)
+        {
+            if (umfile_preallocate_errno_is_unsupported(rc))
+                return false;
+
+            errno = rc;
+            ereport(ERROR,
+                    (errcode_for_file_access(),
+                     errmsg("could not preallocate file \"%s\" to %llu bytes: %m",
+                            FilePathName(fd),
+                            (unsigned long long) target_bytes)));
+        }
+    }
+#else
+    return false;
+#endif
+
+    /*
+     * Some primitives reserve space without moving EOF.  If the file is
+     * still shorter than the target, extend its size with FileTruncate().
+     */
+    current_bytes = FileSize(fd);
+    if (current_bytes < 0)
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not determine file size for \"%s\": %m",
+                        FilePathName(fd))));
+    if (current_bytes < target_bytes &&
+        FileTruncate(fd, target_bytes, WAIT_EVENT_DATA_FILE_EXTEND) < 0)
+        ereport(ERROR,
+                (errcode_for_file_access(),
+                 errmsg("could not extend preallocated file \"%s\" to %llu bytes: %m",
+                        FilePathName(fd),
+                        (unsigned long long) target_bytes)));
+
+    return true;
+}
+
+/*
+ * umfile_preallocate_errno_is_unsupported
+ *		Report whether an errno value means "preallocation is not available
+ *		here" rather than a genuine I/O failure.
+ *
+ * EINVAL and EOPNOTSUPP always count; ENOSYS and ENOTSUP count where the
+ * platform defines them.
+ */
+static bool
+umfile_preallocate_errno_is_unsupported(int err)
+{
+    bool        unsupported = (err == EINVAL || err == EOPNOTSUPP);
+
+#ifdef ENOSYS
+    unsupported = unsupported || (err == ENOSYS);
+#endif
+#ifdef ENOTSUP
+    unsupported = unsupported || (err == ENOTSUP);
+#endif
+
+    return unsupported;
+}
+
void
umfile_zeroextend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
int nblocks, bool skipFsync)
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index a1de5a08d4..ec5e2eabf4 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -61,6 +61,8 @@ IO_WORKER_MAIN "Waiting in main loop of IO Worker process."
LOGICAL_APPLY_MAIN "Waiting in main loop of logical replication apply process."
LOGICAL_LAUNCHER_MAIN "Waiting in main loop of logical replication launcher process."
LOGICAL_PARALLEL_APPLY_MAIN "Waiting in main loop of logical replication parallel apply process."
+MAPWRITER_HIBERNATE "Waiting in Umbra map writer process, hibernating."
+MAPWRITER_MAIN "Waiting in main loop of Umbra map writer process."
RECOVERY_WAL_STREAM "Waiting in main loop of startup process for WAL to arrive, during streaming recovery."
REPLICATION_SLOTSYNC_MAIN "Waiting in main loop of slot synchronization."
REPLICATION_SLOTSYNC_SHUTDOWN "Waiting for slot sync worker to shut down."
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 62476de48e..8f1d36de0a 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -823,10 +823,18 @@ InitPostgres(const char *in_dbname, Oid dboid,
before_shmem_exit(ShutdownXLOG, 0);
}
- /*
- * Initialize the relation cache and the system catalog caches. Note that
- * no catalog access happens here; we only set up the hashtable structure.
- * We must do this before starting a transaction because transaction abort
+ /*
+ * Let the active storage manager register backend-local shutdown cleanup
+ * after ShutdownXLOG. That way, standalone shutdown runs this cleanup
+ * before the shutdown checkpoint, without exposing storage-manager-specific
+ * details here.
+ */
+ smgrregistershutdowncleanup();
+
+ /*
+ * Initialize the relation cache and the system catalog caches. Note that
+ * no catalog access happens here; we only set up the hashtable structure.
+ * We must do this before starting a transaction because transaction abort
* would try to touch these hashtables.
*/
RelationCacheInitialize();
diff --git a/src/backend/utils/misc/guc_parameters.dat b/src/backend/utils/misc/guc_parameters.dat
index 86c1eba5da..f3726be78d 100644
--- a/src/backend/utils/misc/guc_parameters.dat
+++ b/src/backend/utils/misc/guc_parameters.dat
@@ -1932,6 +1932,133 @@
max => 'MAX_KILOBYTES',
},
+{ name => 'map_prealloc_fsm_batch', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Preallocation batch size in blocks for Umbra FSM fork.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_fsm_batch',
+ boot_val => '128',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'map_prealloc_fsm_hard', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Hard low-water mark in blocks before foreground Umbra FSM writes preallocate directly.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_fsm_hard',
+ boot_val => '16',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'map_prealloc_fsm_low', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Soft low-water mark in blocks before Umbra FSM preallocation is considered.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_fsm_low',
+ boot_val => '64',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'map_prealloc_main_batch', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Preallocation batch size in blocks for Umbra main fork.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_main_batch',
+ boot_val => '1024',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'map_prealloc_main_hard', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Hard low-water mark in blocks before foreground Umbra main-fork writes preallocate directly.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_main_hard',
+ boot_val => '128',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'map_prealloc_main_low', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Soft low-water mark in blocks before Umbra main-fork preallocation is considered.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_main_low',
+ boot_val => '512',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'map_prealloc_vm_batch', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Preallocation batch size in blocks for Umbra VM fork.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_vm_batch',
+ boot_val => '128',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'map_prealloc_vm_hard', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Hard low-water mark in blocks before foreground Umbra VM writes preallocate directly.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_vm_hard',
+ boot_val => '16',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'map_prealloc_vm_low', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Soft low-water mark in blocks before Umbra VM preallocation is considered.',
+ flags => 'GUC_UNIT_BLOCKS',
+ variable => 'map_prealloc_vm_low',
+ boot_val => '64',
+ min => '1',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'mapwriter_delay', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Umbra map writer sleep time between rounds.',
+ flags => 'GUC_UNIT_MS',
+ variable => 'MapWriterDelay',
+ boot_val => '200',
+ min => '1',
+ max => '10000',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'mapwriter_lru_maxpages', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Umbra map writer maximum number of MAP pages to flush per round.',
+ long_desc => '0 disables Umbra map writer cleaning.',
+ variable => 'MapWriterMaxPages',
+ boot_val => '100',
+ min => '0',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'mapwriter_lru_multiplier', type => 'real', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Multiple of recent MAP allocation pressure to clean per Umbra map writer round.',
+ variable => 'MapWriterLRUMultiplier',
+ boot_val => '2.0',
+ min => '0.0',
+ max => '10.0',
+ ifdef => 'USE_UMBRA',
+},
+
+{ name => 'mapwriter_prealloc_max_relations', type => 'int', context => 'PGC_SIGHUP', group => 'RESOURCES_BGWRITER',
+ short_desc => 'Maximum number of relations preallocated by Umbra map writer per round.',
+ variable => 'MapWriterPreallocMaxRelations',
+ boot_val => '32',
+ min => '0',
+ max => 'INT_MAX / 2',
+ ifdef => 'USE_UMBRA',
+},
{ name => 'max_active_replication_origins', type => 'int', context => 'PGC_POSTMASTER', group => 'REPLICATION_SUBSCRIBERS',
short_desc => 'Sets the maximum number of active replication origins.',
variable => 'max_active_replication_origins',
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 290ccbc543..8d87958923 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -67,6 +67,7 @@
#include "postmaster/autovacuum.h"
#include "postmaster/bgworker_internals.h"
#include "postmaster/bgwriter.h"
+#include "postmaster/mapwriter.h"
#include "postmaster/postmaster.h"
#include "postmaster/startup.h"
#include "postmaster/syslogger.h"
@@ -83,6 +84,7 @@
#include "storage/fd.h"
#include "storage/io_worker.h"
#include "storage/large_object.h"
+#include "storage/map.h"
#include "storage/pg_shmem.h"
#include "storage/predicate.h"
#include "storage/proc.h"
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 4f2bbf0529..477d75f7e9 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -135,6 +135,8 @@
#shared_buffers = 128MB # min 128kB
# (change requires restart)
+#map_superblocks = 262144 # dedicated Umbra MAP superblock slots
+ # (change requires restart)
#huge_pages = try # on, off, or try
# (change requires restart)
#huge_page_size = 0 # zero for system default
diff --git a/src/include/postmaster/mapwriter.h b/src/include/postmaster/mapwriter.h
new file mode 100644
index 0000000000..6c984922b0
--- /dev/null
+++ b/src/include/postmaster/mapwriter.h
@@ -0,0 +1,24 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapwriter.h
+ * Exports for Umbra map background workers.
+ *
+ * Portions Copyright (c) 1996-2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ * src/include/postmaster/mapwriter.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MAPWRITER_H
+#define MAPWRITER_H
+
+extern PGDLLIMPORT int MapWriterDelay;
+extern PGDLLIMPORT int MapWriterMaxPages;
+extern PGDLLIMPORT int MapWriterPreallocMaxRelations;
+extern PGDLLIMPORT double MapWriterLRUMultiplier;
+
+extern void MapBackgroundWorkersRegister(void);
+extern void MapWriterMain(Datum arg);
+
+#endif /* MAPWRITER_H */
diff --git a/src/include/storage/map.h b/src/include/storage/map.h
index ccbc392835..c61414fd16 100644
--- a/src/include/storage/map.h
+++ b/src/include/storage/map.h
@@ -88,6 +88,7 @@ typedef struct MapSharedData
pg_atomic_uint32 next_victim_buffer;
slock_t clock_lock;
int first_free_buffer; /* head of free list, -1 if empty */
+ int mapwriter_procno; /* procno to wake, -1 if none */
/* statistics */
pg_atomic_uint32 num_allocs;
@@ -258,6 +259,10 @@ extern BlockNumber MapGetPhysicalBlockCount(UmbraFileContext *map_ctx,
extern int MapClockGetBuffer(void);
extern void MapClockFreeBuffer(int slot_id);
extern int MapSyncStart(uint32 *complete_passes, uint32 *num_allocs);
+extern uint32 MapAllocPressurePeek(void);
+extern void MapStrategyNotifyWriter(int mapwriter_procno);
+extern void MapWakeWriter(void);
+extern int MapPreallocStep(int max_relations);
/* Map cache hash table (in mapclock.c) */
extern int MapCacheLookup(RelFileLocator rnode, ForkNumber forknum,
@@ -277,6 +282,15 @@ extern void MapInvalidateBuffer(int slot_id, RelFileLocator expected_rnode,
/* GUCs */
extern int map_buffers;
extern int map_superblocks;
+extern int map_prealloc_main_low;
+extern int map_prealloc_main_hard;
+extern int map_prealloc_main_batch;
+extern int map_prealloc_fsm_low;
+extern int map_prealloc_fsm_hard;
+extern int map_prealloc_fsm_batch;
+extern int map_prealloc_vm_low;
+extern int map_prealloc_vm_hard;
+extern int map_prealloc_vm_batch;
/* Global data (defined in map.c) */
extern MapSharedData *MapShared;
diff --git a/src/include/storage/map_internal.h b/src/include/storage/map_internal.h
index acac29b018..368b3da15a 100644
--- a/src/include/storage/map_internal.h
+++ b/src/include/storage/map_internal.h
@@ -27,6 +27,9 @@ extern void MapResetAllTruncatePreloads(void);
extern BlockNumber MapForkPageIndexToMapBlkno(ForkNumber forknum,
BlockNumber fork_page_idx);
extern BlockNumber MapLblknoToMapBlkno(ForkNumber forknum, BlockNumber lblkno);
+extern bool MapForkPreallocSettings(ForkNumber forknum, BlockNumber *soft_low,
+ BlockNumber *hard_low,
+ BlockNumber *batch_blocks);
extern bool MapReserveNextPblkno(UmbraFileContext *map_ctx, RelFileLocator rnode,
ForkNumber forknum, BlockNumber lblkno,
BlockNumber *new_pblkno, bool nowait);
@@ -36,6 +39,10 @@ extern bool MapTryReserveFreshPblkno(UmbraFileContext *map_ctx,
BlockNumber lblkno,
BlockNumber *new_pblkno,
bool nowait);
+extern bool MapMaybePreallocateFork(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ bool background_mode);
extern bool MapInflightTryClaim(UmbraFileContext *map_ctx,
RelFileLocator rnode,
ForkNumber forknum,
diff --git a/src/include/storage/mapsuper_internal.h b/src/include/storage/mapsuper_internal.h
index 960469538f..5d64ddec87 100644
--- a/src/include/storage/mapsuper_internal.h
+++ b/src/include/storage/mapsuper_internal.h
@@ -17,6 +17,9 @@
#define MAPSUPER_FLAG_DIRTY 0x02
#define MAPSUPER_FLAG_CORRUPT 0x04
+#define MAPSUPER_RUNTIME_FLAG_PREALLOC_MAIN 0x01
+#define MAPSUPER_RUNTIME_FLAG_PREALLOC_FSM 0x02
+#define MAPSUPER_RUNTIME_FLAG_PREALLOC_VM 0x04
#define MAPSUPER_RUNTIME_FLAG_EXTENDING_MAIN 0x08
#define MAPSUPER_RUNTIME_FLAG_EXTENDING_FSM 0x10
#define MAPSUPER_RUNTIME_FLAG_EXTENDING_VM 0x20
@@ -143,6 +146,7 @@ extern MapSuperEntry *MapSuperEnsureEntryLocked(RelFileLocator rnode);
extern void MapSuperDeleteEntry(RelFileLocator rnode);
extern bool MapSuperForkExists(const MapSuperblock *super,
ForkNumber forknum);
+extern uint32 MapSuperPreallocFlag(ForkNumber forknum);
extern void MapSBlockBumpPhysicalState(UmbraFileContext *map_ctx,
RelFileLocator rnode,
ForkNumber forknum,
diff --git a/src/include/storage/umfile.h b/src/include/storage/umfile.h
index 8b7400140d..b965867572 100644
--- a/src/include/storage/umfile.h
+++ b/src/include/storage/umfile.h
@@ -61,6 +61,9 @@ extern void umfile_ctx_write(UmbraFileContext *ctx, ForkNumber forknum, BlockNum
const char *buffer, int nbytes, bool skipFsync);
extern void umfile_ctx_extend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno,
const char *buffer);
+extern bool umfile_ctx_preallocate_blocks(UmbraFileContext *ctx, ForkNumber forknum,
+ UmFileNblocksMode mode,
+ BlockNumber target_nblocks);
extern void umfile_ctx_prefetch(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno);
extern bool umfile_ctx_block_exists(UmbraFileContext *ctx, ForkNumber forknum,
BlockNumber blkno);
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 55a2de4df7..da020abc31 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -63,6 +63,7 @@ tests += {
't/052_checkpoint_segment_missing.pl',
't/053_umbra_map_superblock_watermark.pl',
't/054_umbra_map_fork_policy.pl',
+ 't/055_umbra_mapwriter_activity.pl',
't/056_umbra_truncate_superblock.pl',
't/057_umbra_remap_crash_consistency.pl',
't/058_umbra_2pc_remap_recovery.pl',
@@ -76,6 +77,7 @@ tests += {
't/070_umbra_hash_birth_block_remap.pl',
't/071_umbra_skip_wal_dense_map.pl',
't/072_umbra_ordinary_slim_block_remap.pl',
+ 't/073_umbra_preallocate_guc.pl',
't/074_umbra_torn_page_remap.pl',
],
},
diff --git a/src/test/recovery/t/055_umbra_mapwriter_activity.pl b/src/test/recovery/t/055_umbra_mapwriter_activity.pl
new file mode 100644
index 0000000000..6ebae116d8
--- /dev/null
+++ b/src/test/recovery/t/055_umbra_mapwriter_activity.pl
@@ -0,0 +1,56 @@
+# Verify mapwriter process visibility and basic activity metadata.
+#
+# In UMBRA mode:
+# - map writer backend should exist in pg_stat_activity
+# - wait event should be MapwriterMain/MapwriterHibernate or NULL transiently
+#
+# In md mode, skip this test.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+plan skip_all => 'requires --with-umbra MAP fork'
+ unless check_pg_config('^#define USE_UMBRA 1$');
+
+my $node = PostgreSQL::Test::Cluster->new('master');
+$node->init();
+$node->append_conf(
+ 'postgresql.conf', qq{
+autovacuum = off
+});
+$node->start();
+
+$node->safe_psql('postgres', q{CREATE TABLE umb_mapwriter_t(a int, b text);});
+
+# Poll: the map writer may not be listed yet immediately after startup.
+ok( $node->poll_query_until('postgres',
+		q{SELECT count(*) = 1 FROM pg_stat_activity WHERE backend_type = 'map writer';}),
+	'map writer backend exists');
+
+# Likewise poll for an idle-loop wait event (or NULL); a single sample could
+# catch the worker in some other, transient wait.
+ok( $node->poll_query_until('postgres', q{
+SELECT count(*) > 0
+FROM pg_stat_activity
+WHERE backend_type = 'map writer'
+ AND (wait_event IN ('MapwriterMain', 'MapwriterHibernate')
+ OR wait_event IS NULL);
+}), 'map writer wait event is expected');
+
+# Create allocation pressure and ensure map writer remains visible.
+$node->safe_psql(
+ 'postgres', q{
+INSERT INTO umb_mapwriter_t
+SELECT g, repeat('w', 300) FROM generate_series(1, 30000) g;
+CHECKPOINT;
+});
+
+ok($node->poll_query_until('postgres',
+ q{SELECT count(*) = 1 FROM pg_stat_activity WHERE backend_type = 'map writer';},
+ 't'),
+ 'map writer remains alive under allocation pressure');
+
+done_testing();
diff --git a/src/test/recovery/t/073_umbra_preallocate_guc.pl b/src/test/recovery/t/073_umbra_preallocate_guc.pl
new file mode 100644
index 0000000000..f5089d50bb
--- /dev/null
+++ b/src/test/recovery/t/073_umbra_preallocate_guc.pl
@@ -0,0 +1,74 @@
+# Verify Umbra MAIN-fork preallocation publishes capacity only after the
+# underlying file has been extended to cover it.
+#
+# In md mode, skip this test.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+plan skip_all => 'requires --with-umbra MAP fork'
+ unless check_pg_config('^#define USE_UMBRA 1$');
+
+sub u32le_from_hex
+{
+	# Decode the little-endian uint32 stored at byte $offset of hex dump $hex.
+	# Dies on a short or non-hex field instead of decoding missing digits as 0.
+	my ($hex, $offset) = @_;
+	my $chunk = substr($hex // '', $offset * 2, 8) // '';
+
+	die "u32le_from_hex: need 8 hex digits at byte offset $offset\n"
+	  unless $chunk =~ /\A[0-9A-Fa-f]{8}\z/;
+	return unpack('V', pack('H8', $chunk));
+}
+
+my $node = PostgreSQL::Test::Cluster->new('umbra_preallocate_guc');
+$node->init();
+$node->append_conf(
+ 'postgresql.conf', qq{
+autovacuum = off
+map_prealloc_main_low = 64
+map_prealloc_main_hard = 64
+map_prealloc_main_batch = 128
+});
+$node->start();
+
+$node->safe_psql(
+ 'postgres', q{
+CREATE TABLE umbra_prealloc_t(id int, payload text);
+ALTER TABLE umbra_prealloc_t ALTER COLUMN payload SET STORAGE PLAIN;
+INSERT INTO umbra_prealloc_t
+SELECT g, repeat('x', 7000) FROM generate_series(1, 2000) g;
+CHECKPOINT;
+});
+
+my $main_path = $node->safe_psql('postgres',
+	q{SELECT pg_relation_filepath('umbra_prealloc_t');});
+
+# Read the first 64 bytes of the MAP file as hex.  missing_ok is false so a
+# missing file is reported by the server rather than yielding an empty image.
+my $map_super_hex = $node->safe_psql('postgres',
+	q{SELECT encode(pg_read_binary_file(pg_relation_filepath('umbra_prealloc_t') || '_map', 0, 64, false), 'hex');});
+is(length($map_super_hex), 128, 'read a full 64-byte MAP superblock image');
+
+# Decode the uint32 counters found at byte offsets 16, 20 and 40 of the image.
+my $next_free_main = u32le_from_hex($map_super_hex, 16);
+my $phys_capacity_main = u32le_from_hex($map_super_hex, 20);
+my $logical_main = u32le_from_hex($map_super_hex, 40);
+my $main_file_blocks = $node->safe_psql(
+ 'postgres',
+ "SELECT ((pg_stat_file('$main_path')).size / current_setting('block_size')::int)::bigint;");
+
+cmp_ok($logical_main, '>', 0, 'table has non-zero logical size');
+cmp_ok($next_free_main, '>=', $logical_main,
+ 'next_free_phys_block_main covers logical blocks');
+cmp_ok($phys_capacity_main, '>', $next_free_main,
+ 'GUC-driven preallocation keeps capacity ahead of next_free');
+cmp_ok($main_file_blocks, '>=', $phys_capacity_main,
+ 'MAIN fork file size covers published physical capacity');
+
+$node->stop;
+
+done_testing();
--
2.50.1 (Apple Git-155)
| From | Date | Subject | |
|---|---|---|---|
| Next Message | Mingwei Jia | 2026-06-01 23:33:40 | [RFC PATCH v2 RESEND 10/10] umbra: add patch 9 compactor framework and non-interference policy |
| Previous Message | Mingwei Jia | 2026-06-01 23:33:38 | [RFC PATCH v2 RESEND 08/10] umbra: add patch 7 checkpoint-boundary FPW replacement and block-reference remap |