| From: | Mingwei Jia <i(at)nayishan(dot)top> |
|---|---|
| To: | pgsql-hackers(at)lists(dot)postgresql(dot)org |
| Subject: | [RFC PATCH v2 RESEND 06/10] umbra: add patch 5 MAP access policy, translation, and materialization |
| Date: | 2026-06-01 23:33:36 |
| Message-ID: | 20260601233340.67949-5-i@nayishan.top |
| Views: | Whole Thread | Raw Message | Download mbox | Resend email |
| Thread: | |
| Lists: | pgsql-hackers |
---
src/backend/catalog/storage.c | 201 +-
src/backend/storage/buffer/bufmgr.c | 12 +-
src/backend/storage/map/Makefile | 1 +
src/backend/storage/map/map.c | 335 ++-
src/backend/storage/map/mapbuf.c | 14 +
src/backend/storage/map/mapclock.c | 6 +-
src/backend/storage/map/mapinflight.c | 402 +++
src/backend/storage/map/mapinit.c | 7 +-
src/backend/storage/map/mapsuper.c | 26 +-
src/backend/storage/map/meson.build | 1 +
src/backend/storage/smgr/md.c | 1 +
src/backend/storage/smgr/smgr.c | 295 ++-
src/backend/storage/smgr/umbra.c | 2337 ++++++++++++++---
src/backend/storage/smgr/umfile.c | 19 +-
src/backend/utils/cache/relcache.c | 12 +-
src/include/storage/aio_types.h | 3 +-
src/include/storage/map.h | 37 +-
src/include/storage/map_internal.h | 23 +
src/include/storage/smgr.h | 30 +-
src/include/storage/um_defs.h | 18 +-
src/include/storage/umbra.h | 112 +-
src/test/recovery/meson.build | 1 +
.../t/061_umbra_fsm_vm_map_translation.pl | 117 +
23 files changed, 3557 insertions(+), 453 deletions(-)
create mode 100644 src/backend/storage/map/mapinflight.c
create mode 100644 src/test/recovery/t/061_umbra_fsm_vm_map_translation.pl
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index 6b69329a52..be58c35191 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -74,8 +74,55 @@ typedef struct PendingRelSync
bool is_truncated; /* Has the file experienced truncation? */
} PendingRelSync;
+/*
+ * One relfilenode for which the current transaction has recorded a truncate
+ * request.  Entries are chained through "next" into the pendingTruncates
+ * list.
+ */
+typedef struct PendingRelTruncate
+{
+ RelFileLocator rlocator; /* relfilenode the request refers to */
+ int nestLevel; /* xact nesting level recorded for the request */
+ struct PendingRelTruncate *next; /* next list entry, or NULL */
+} PendingRelTruncate;
+
static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
static HTAB *pendingSyncHash = NULL;
+static HTAB *pendingSkipWalStateHash = NULL;
+static PendingRelTruncate *pendingTruncates = NULL;
+
+/*
+ * AddPendingTruncate
+ *		Record a truncate request for the given relfilenode.
+ *
+ * If the relfilenode is already on the pendingTruncates list, its entry is
+ * reused and its nesting level is only ever lowered.  Otherwise a new entry
+ * is allocated in TopMemoryContext and pushed onto the head of the list.
+ */
+static void
+AddPendingTruncate(const RelFileLocator *rlocator)
+{
+	int			curLevel = GetCurrentTransactionNestLevel();
+	PendingRelTruncate *item = pendingTruncates;
+
+	/* Look for an entry that already tracks this relfilenode. */
+	while (item != NULL && !RelFileLocatorEquals(item->rlocator, *rlocator))
+		item = item->next;
+
+	if (item != NULL)
+	{
+		/* Keep the smaller of the recorded and the current nesting level. */
+		if (curLevel < item->nestLevel)
+			item->nestLevel = curLevel;
+		return;
+	}
+
+	/* Not tracked yet: link a fresh entry in at the list head. */
+	item = (PendingRelTruncate *)
+		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelTruncate));
+	item->rlocator = *rlocator;
+	item->nestLevel = curLevel;
+	item->next = pendingTruncates;
+	pendingTruncates = item;
+}
+
+/*
+ * ClearPendingTruncates
+ *		Free every entry on the pendingTruncates list and leave it empty.
+ */
+static void
+ClearPendingTruncates(void)
+{
+	while (pendingTruncates != NULL)
+	{
+		PendingRelTruncate *victim = pendingTruncates;
+
+		/* Unlink before freeing so the head never points at freed memory. */
+		pendingTruncates = victim->next;
+		pfree(victim);
+	}
+}
/*
@@ -103,6 +150,21 @@ AddPendingSync(const RelFileLocator *rlocator)
pending = hash_search(pendingSyncHash, rlocator, HASH_ENTER, &found);
Assert(!found);
pending->is_truncated = false;
+
+ if (!pendingSkipWalStateHash)
+ {
+ HASHCTL ctl;
+
+ ctl.keysize = sizeof(RelFileLocator);
+ ctl.entrysize = sizeof(RelFileLocator);
+ ctl.hcxt = TopTransactionContext;
+ pendingSkipWalStateHash = hash_create("pending skip-WAL state hash",
+ 16, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ }
+
+ (void) hash_search(pendingSkipWalStateHash, rlocator, HASH_ENTER, &found);
+ smgrmarkskipwalpending(*rlocator);
}
/*
@@ -149,9 +211,7 @@ RelationCreateStorage(RelFileLocator rlocator, char relpersistence,
srel = smgropen(rlocator, procNumber);
smgrcreate(srel, MAIN_FORKNUM, false);
-
- if (needs_wal)
- smgrcreaterelationmetadata(srel);
+ smgrinitnewrelation(srel, needs_wal);
if (needs_wal)
log_smgrcreate(&srel->smgr_rlocator.locator, MAIN_FORKNUM);
@@ -297,6 +357,7 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
BlockNumber old_blocks[MAX_FORKNUM];
BlockNumber blocks[MAX_FORKNUM];
int nforks = 0;
+ XLogRecPtr truncate_lsn = InvalidXLogRecPtr;
SMgrRelation reln;
/*
@@ -385,14 +446,11 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
*
* (See also visibilitymap.c if changing this code.)
*/
- START_CRIT_SECTION();
-
if (RelationNeedsWAL(rel))
{
/*
* Make an XLOG entry reporting the file truncation.
*/
- XLogRecPtr lsn;
xl_smgr_truncate xlrec;
xlrec.blkno = nblocks;
@@ -402,8 +460,8 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
XLogBeginInsert();
XLogRegisterData(&xlrec, sizeof(xlrec));
- lsn = XLogInsert(RM_SMGR_ID,
- XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+ truncate_lsn = XLogInsert(RM_SMGR_ID,
+ XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
/*
* Flush, because otherwise the truncation of the main relation might
@@ -413,14 +471,23 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
* contain entries for the non-existent heap pages, and standbys would
* also never replay the truncation.
*/
- XLogFlush(lsn);
+ XLogFlush(truncate_lsn);
}
+ /*
+ * Apply storage-manager-specific truncate metadata updates before entering
+ * the critical section. Umbra uses this to rewrite MAP metadata with the
+ * truncate WAL LSN while MAP buffer reads and flushes are still legal.
+ */
+ smgrpretruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks,
+ truncate_lsn);
+
/*
* This will first remove any buffers from the buffer pool that should no
* longer exist after truncation is complete, and then truncate the
* corresponding files on disk.
*/
+ START_CRIT_SECTION();
smgrtruncate(RelationGetSmgr(rel), forks, nforks, old_blocks, blocks);
END_CRIT_SECTION();
@@ -453,6 +520,8 @@ RelationPreTruncate(Relation rel)
{
PendingRelSync *pending;
+ AddPendingTruncate(&(RelationGetSmgr(rel)->smgr_rlocator.locator));
+
if (!pendingSyncHash)
return;
@@ -581,6 +650,30 @@ RelFileLocatorSkippingWAL(RelFileLocator rlocator)
return true;
}
+/*
+ * RelFileLocatorWasTruncated
+ *		Report whether the pendingTruncates list of this transaction holds an
+ *		entry for the given relfilenode.
+ *
+ * A relfilenode can emit fresh page images for block 0 later in the same
+ * transaction after XLOG_SMGR_TRUNCATE has already been inserted.  Recovery
+ * replays the truncate first and therefore sees no surviving old mapping, so
+ * producer-side first-born mapping logic must treat any still-visible local
+ * pre-truncate mapping as stale in that case.
+ */
+bool
+RelFileLocatorWasTruncated(RelFileLocator rlocator)
+{
+	PendingRelTruncate *item = pendingTruncates;
+
+	/* Walk the list until a matching entry is found or the list ends. */
+	while (item != NULL)
+	{
+		if (RelFileLocatorEquals(item->rlocator, rlocator))
+			break;
+		item = item->next;
+	}
+
+	return item != NULL;
+}
+
/*
* EstimatePendingSyncsSpace
* Estimate space needed to pass syncs to parallel workers.
@@ -752,12 +845,17 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
Assert(GetCurrentTransactionNestLevel() == 1);
if (!pendingSyncHash)
+ {
+ ClearPendingTruncates();
return; /* no relation needs sync */
+ }
/* Abort -- just throw away all pending syncs */
if (!isCommit)
{
pendingSyncHash = NULL;
+ pendingSkipWalStateHash = NULL;
+ ClearPendingTruncates();
return;
}
@@ -767,6 +865,8 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
if (isParallelWorker)
{
pendingSyncHash = NULL;
+ pendingSkipWalStateHash = NULL;
+ ClearPendingTruncates();
return;
}
@@ -783,6 +883,7 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
BlockNumber nblocks[MAX_FORKNUM + 1];
uint64 total_blocks = 0;
SMgrRelation srel;
+ bool require_storage_sync;
srel = smgropen(pendingsync->rlocator, INVALID_PROC_NUMBER);
@@ -798,6 +899,12 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
{
for (fork = 0; fork <= MAX_FORKNUM; fork++)
{
+ if (smgrisinternalfork(fork))
+ {
+ nblocks[fork] = InvalidBlockNumber;
+ continue;
+ }
+
if (smgrexists(srel, fork))
{
BlockNumber n = smgrnblocks(srel, fork);
@@ -812,6 +919,8 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
}
}
+ require_storage_sync = smgrpreparependingsync(srel);
+
/*
* Sync file or emit WAL records for its contents.
*
@@ -825,6 +934,18 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
* main fork is longer than ever but FSM fork gets shorter.
*/
if (pendingsync->is_truncated ||
+ /*
+ * New relfilenumbers that are still in PostgreSQL's mandatory
+ * WAL-skipping state must reach disk via flush+sync, not via
+ * log_newpage_range(). See "Skipping WAL for New RelFileLocator"
+ * in src/backend/access/transam/README.
+ */
+ RelFileLocatorSkippingWAL(pendingsync->rlocator) ||
+ /*
+ * Some storage managers require flush+sync for their own durable
+ * transition protocol even when the relation is small.
+ */
+ require_storage_sync ||
total_blocks >= wal_skip_threshold * (uint64) 1024 / BLCKSZ)
{
/* allocate the initial array, or extend it, if needed */
@@ -866,12 +987,26 @@ smgrDoPendingSyncs(bool isCommit, bool isParallelWorker)
}
pendingSyncHash = NULL;
+ ClearPendingTruncates();
if (nrels > 0)
{
smgrdosyncall(srels, nrels);
+
+ if (pendingSkipWalStateHash)
+ {
+ HASH_SEQ_STATUS clear_scan;
+ RelFileLocator *rlocator;
+
+ hash_seq_init(&clear_scan, pendingSkipWalStateHash);
+ while ((rlocator = (RelFileLocator *) hash_seq_search(&clear_scan)) != NULL)
+ smgrclearskipwalpending(*rlocator);
+ }
+
pfree(srels);
}
+
+ pendingSkipWalStateHash = NULL;
}
/*
@@ -945,6 +1080,8 @@ PostPrepare_smgr(void)
/* must explicitly free the list entry */
pfree(pending);
}
+
+ ClearPendingTruncates();
}
@@ -958,12 +1095,21 @@ AtSubCommit_smgr(void)
{
int nestLevel = GetCurrentTransactionNestLevel();
PendingRelDelete *pending;
+ PendingRelTruncate *pending_truncate;
for (pending = pendingDeletes; pending != NULL; pending = pending->next)
{
if (pending->nestLevel >= nestLevel)
pending->nestLevel = nestLevel - 1;
}
+
+ for (pending_truncate = pendingTruncates;
+ pending_truncate != NULL;
+ pending_truncate = pending_truncate->next)
+ {
+ if (pending_truncate->nestLevel >= nestLevel)
+ pending_truncate->nestLevel = nestLevel - 1;
+ }
}
/*
@@ -976,7 +1122,28 @@ AtSubCommit_smgr(void)
void
AtSubAbort_smgr(void)
{
+ PendingRelTruncate *pending;
+ PendingRelTruncate *prev = NULL;
+ PendingRelTruncate *next;
+ int nestLevel = GetCurrentTransactionNestLevel();
+
smgrDoPendingDeletes(false);
+
+ for (pending = pendingTruncates; pending != NULL; pending = next)
+ {
+ next = pending->next;
+
+ if (pending->nestLevel >= nestLevel)
+ {
+ if (prev)
+ prev->next = next;
+ else
+ pendingTruncates = next;
+ pfree(pending);
+ }
+ else
+ prev = pending;
+ }
}
void
@@ -993,11 +1160,12 @@ smgr_redo(XLogReaderState *record)
xl_smgr_create *xlrec = (xl_smgr_create *) XLogRecGetData(record);
SMgrRelation reln;
- reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
- smgrcreate(reln, xlrec->forkNum, true);
- }
- else if (info == XLOG_SMGR_TRUNCATE)
- {
+ reln = smgropen(xlrec->rlocator, INVALID_PROC_NUMBER);
+ smgrcreate(reln, xlrec->forkNum, true);
+ smgrredocreatefork(reln, xlrec->forkNum, lsn);
+ }
+ else if (info == XLOG_SMGR_TRUNCATE)
+ {
xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
SMgrRelation reln;
Relation rel;
@@ -1016,7 +1184,7 @@ smgr_redo(XLogReaderState *record)
* log as best we can until the drop is seen.
*/
smgrcreate(reln, MAIN_FORKNUM, true);
- smgrcreaterelationmetadata(reln);
+ smgrredocreatefork(reln, MAIN_FORKNUM, lsn);
/*
* Before we perform the truncation, update minimum recovery point to
@@ -1077,6 +1245,7 @@ smgr_redo(XLogReaderState *record)
/* Do the real work to truncate relation forks */
if (nforks > 0)
{
+ smgrpretruncate(reln, forks, nforks, old_blocks, blocks, lsn);
START_CRIT_SECTION();
smgrtruncate(reln, forks, nforks, old_blocks, blocks);
END_CRIT_SECTION();
@@ -1087,7 +1256,7 @@ smgr_redo(XLogReaderState *record)
* important because the just-truncated pages were likely marked as
* all-free, and would be preferentially selected.
*/
- if (need_fsm_vacuum)
+ if (need_fsm_vacuum && smgrneedsrecoveryfsmvacuum(reln))
FreeSpaceMapVacuumRange(rel, xlrec->blkno,
InvalidBlockNumber);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 540f346d53..3bf2db8fdf 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -5491,7 +5491,8 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator,
{
if (smgrexists(src_rel, forkNum))
{
- smgrcreate(dst_rel, forkNum, false);
+ if (!smgrisinternalfork(forkNum))
+ smgrcreate(dst_rel, forkNum, false);
/*
* WAL log creation if the relation is persistent, or this is the
@@ -5500,9 +5501,12 @@ CreateAndCopyRelationData(RelFileLocator src_rlocator,
if (permanent || forkNum == INIT_FORKNUM)
log_smgrcreate(&dst_rlocator, forkNum);
- /* Copy a fork's data, block by block. */
- RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
- permanent);
+ if (!smgrisinternalfork(forkNum))
+ {
+ /* Copy a fork's data, block by block. */
+ RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
+ permanent);
+ }
}
}
diff --git a/src/backend/storage/map/Makefile b/src/backend/storage/map/Makefile
index 08c3b69679..94ae1c1b72 100644
--- a/src/backend/storage/map/Makefile
+++ b/src/backend/storage/map/Makefile
@@ -18,6 +18,7 @@ OBJS = \
mapbuf.o \
mapflush.o \
mapclock.o \
+ mapinflight.o \
mapsuper.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/map/map.c b/src/backend/storage/map/map.c
index 1c74aa94ef..bd839a3e9f 100644
--- a/src/backend/storage/map/map.c
+++ b/src/backend/storage/map/map.c
@@ -43,6 +43,8 @@ typedef struct MapTruncatePreloadState
static MapTruncatePreloadState MapTruncatePreload[MAX_FORKNUM + 1];
+#define MAP_PENDING_WAIT_RETRIES 10000
+#define MAP_PENDING_WAIT_USEC 1000
typedef enum MapCachedLookupResult
{
@@ -114,6 +116,20 @@ static MapCachedLookupResult MapTryLookupCachedPblknoInternal(RelFileLocator rno
BlockNumber lblkno,
bool adjust_usage,
BlockNumber *pblkno);
+static bool MapTryReserveFreshPblknoInternal(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ BlockNumber *new_pblkno,
+ bool nowait);
+static bool MapWaitForForeignInflightToClear(RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno);
+bool MapReserveNextPblkno(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber *new_pblkno,
+ bool nowait);
+
void
MapResetAllTruncatePreloads(void)
{
@@ -531,13 +547,207 @@ done:
return run_blocks;
}
+/*
+ * Reserve a brand-new in-flight physical target for lblkno.
+ *
+ * Public wrapper: forwards every argument unchanged to
+ * MapTryReserveFreshPblknoInternal() and returns its result.
+ *
+ * This helper does not consult existing in-flight state before consuming the
+ * next frontier block. If another backend already published an in-flight
+ * target for the same lblkno, publication will fail and the freshly reserved
+ * frontier slot becomes a hole. Callers that want a stable winner should
+ * fall back to a lookup after a false return.
+ *
+ * NOTE(review): the internal helper calls MapInflightTryClaim() and returns
+ * false before it advances the reserved frontier, so a refused claim does not
+ * appear to consume a frontier slot -- confirm whether the "hole" description
+ * above still matches the implementation.
+ */
+bool
+MapTryReserveFreshPblkno(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber *new_pblkno, bool nowait)
+{
+ return MapTryReserveFreshPblknoInternal(map_ctx, rnode, forknum, lblkno,
+ new_pblkno, nowait);
+}
+
+/*
+ * Worker behind MapTryReserveFreshPblkno() and MapReserveNextPblkno().
+ *
+ * Claims the in-flight slot for (rnode, forknum, lblkno), takes the current
+ * reserved-next-free physical block from the superblock entry while holding
+ * the entry lock exclusively, advances that frontier by one, and hands the
+ * chosen block to MapInflightFinishClaim().
+ *
+ * Returns true and sets *new_pblkno on success.  Returns false when the fork
+ * has no mapped state, the superblock cannot be loaded, the in-flight claim
+ * is refused, MapSuperFindEntry[Try]Locked() fails (the Try variant is used
+ * when nowait is true), or the entry fails the corrupt/identity/CRC checks.
+ * Raises ERROR when the frontier cannot be advanced any further.  A claim
+ * taken here is released again on every false return and on ERROR.
+ */
+static bool
+MapTryReserveFreshPblknoInternal(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber *new_pblkno, bool nowait)
+{
+ MapSuperEntry *entry;
+ BlockNumber next;
+ uint32 flags;
+ bool reserved = false;
+
+ Assert(new_pblkno != NULL);
+
+ if (!MapForkHasMappedState(forknum))
+ return false;
+
+ if (!MapSBlockEnsureLoaded(map_ctx, rnode))
+ return false;
+
+ /* No claim is held yet, so the early returns above need no cleanup. */
+ if (!MapInflightTryClaim(map_ctx, rnode, forknum, lblkno))
+ return false;
+
+ /* From here on the claim is ours; make sure an ERROR releases it. */
+ PG_TRY();
+ {
+ if (nowait)
+ {
+ if (!MapSuperFindEntryTryLocked(rnode, LW_EXCLUSIVE, &entry))
+ goto reserve_done;
+ }
+ else
+ {
+ if (!MapSuperFindEntryLocked(rnode, LW_EXCLUSIVE, &entry))
+ goto reserve_done;
+ }
+
+ /*
+ * Refuse to allocate from an entry that is flagged corrupt, lacks a
+ * valid identity, or is not dirty and fails its CRC check.
+ */
+ flags = entry->flags;
+ if ((flags & MAPSUPER_FLAG_CORRUPT) ||
+ !MapSuperblockHasValidIdentity(&entry->super) ||
+ ((flags & MAPSUPER_FLAG_DIRTY) == 0 &&
+ !MapSuperblockCheckCRC(&entry->super)))
+ {
+ LWLockRelease(&entry->lock);
+ if (!InRecovery)
+ MapSBlockReportCorrupt(rnode, "invalid identity or CRC");
+ goto reserve_done;
+ }
+
+ Assert(MapNormalizeForkBlockCount(forknum,
+ MapSuperblockGetNextFreePhysBlock(&entry->super,
+ forknum)) <=
+ MapSuperGetReservedNextFree(entry, forknum));
+ next = MapSuperGetReservedNextFree(entry, forknum);
+ /* Advancing from this value would make the frontier InvalidBlockNumber. */
+ if (next == InvalidBlockNumber - 1)
+ {
+ LWLockRelease(&entry->lock);
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot allocate more physical blocks for relation %u/%u/%u fork %d",
+ rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum)));
+ }
+
+ MapSuperSetReservedNextFree(entry, forknum, next + 1);
+ Assert(MapNormalizeForkBlockCount(forknum,
+ MapSuperblockGetNextFreePhysBlock(&entry->super,
+ forknum)) <=
+ MapSuperGetReservedNextFree(entry, forknum));
+
+ /* Hand out the block, drop the entry lock, then record it on the claim. */
+ *new_pblkno = next;
+ LWLockRelease(&entry->lock);
+ MapInflightFinishClaim(rnode, forknum, lblkno, next);
+ reserved = true;
+
+ /* The label needs a statement to attach to, hence the empty one. */
+reserve_done:
+ ;
+ }
+ PG_CATCH();
+ {
+ MapInflightRelease(rnode, forknum, lblkno);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /* Failed without an ERROR: give the claim back before reporting false. */
+ if (!reserved)
+ {
+ MapInflightRelease(rnode, forknum, lblkno);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Poll until the in-flight bit for (rnode, forknum, lblkno) is no longer set.
+ *
+ * Sleeps MAP_PENDING_WAIT_USEC between checks and raises ERROR once
+ * MAP_PENDING_WAIT_RETRIES sleeps have been spent.  Returns true if at least
+ * one sleep was needed (a LOG line reports the wait), false if the bit was
+ * already clear on the very first check.
+ */
+static bool
+MapWaitForForeignInflightToClear(RelFileLocator rnode,
+								 ForkNumber forknum,
+								 BlockNumber lblkno)
+{
+	int			sleeps = 0;
+
+	while (MapInflightBitIsSet(rnode, forknum, lblkno))
+	{
+		CHECK_FOR_INTERRUPTS();
+		pg_usleep(MAP_PENDING_WAIT_USEC);
+
+		if (++sleeps >= MAP_PENDING_WAIT_RETRIES)
+			ereport(ERROR,
+					(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+					 errmsg("timed out waiting for in-flight remap of relation %u/%u/%u fork %d block %u",
+							rnode.spcOid, rnode.dbOid, rnode.relNumber,
+							forknum, lblkno)));
+	}
+
+	/* Nothing was in flight when we looked: tell the caller we never waited. */
+	if (sleeps == 0)
+		return false;
+
+	elog(LOG,
+		 "foreground remap waited for in-flight remap on relation %u/%u/%u fork %d block %u (%d retries, %d usec)",
+		 rnode.spcOid, rnode.dbOid, rnode.relNumber,
+		 forknum, lblkno,
+		 sleeps,
+		 sleeps * MAP_PENDING_WAIT_USEC);
+	return true;
+}
+
+/*
+ * MapReserveNextPblkno - return a locally owned in-flight pblk for lblkno,
+ * creating it if needed.
+ *
+ * The shared MAP bit remains visible to other backends so they can observe
+ * contention, but the selected pblk stays owner-local and must not be
+ * borrowed by foreign callers.  Foreign collisions are surfaced as false so
+ * callers can fail or fall back according to their ownership rules.
+ *
+ * Returns true with *new_pblkno set when this backend owns a target for
+ * lblkno, false otherwise.
+ */
+bool
+MapReserveNextPblkno(UmbraFileContext *map_ctx, RelFileLocator rnode,
+					 ForkNumber forknum, BlockNumber lblkno,
+					 BlockNumber *new_pblkno, bool nowait)
+{
+	bool		owned;
+
+	/* A target this backend already owns is simply handed back. */
+	owned = MapInflightLookupOwnedPblk(rnode, forknum, lblkno, new_pblkno);
+
+	/* Otherwise try to claim the entry and reserve a new block. */
+	if (!owned)
+		owned = MapTryReserveFreshPblknoInternal(map_ctx, rnode, forknum,
+												 lblkno, new_pblkno, nowait);
+
+	/* On failure, look once more for a locally owned target. */
+	if (!owned)
+		owned = MapInflightLookupOwnedPblk(rnode, forknum, lblkno,
+										   new_pblkno);
+
+	return owned;
+}
+
+/*
+ * Reserve a fresh physical target without consulting or updating any current
+ * mapping entry.
+ *
+ * Callers use this when they already own first publication for lblkno and
+ * must not reuse a locally visible old mapping.
+ *
+ * This is MapReserveNextPblkno() with nowait fixed to false.  "Fresh" callers
+ * are first-born owners: a false result means a foreign in-flight owner was
+ * seen, i.e. the caller's ownership assumption was wrong.  The conflict is
+ * surfaced immediately so the caller can decide whether it is an invariant
+ * violation or a fast-path fallback.
+ */
+bool
+MapReserveFreshPblkno(UmbraFileContext *map_ctx, RelFileLocator rnode,
+					  ForkNumber forknum, BlockNumber lblkno,
+					  BlockNumber *new_pblkno)
+{
+	return MapReserveNextPblkno(map_ctx, rnode, forknum, lblkno,
+								new_pblkno, false);
+}
+
/*
* MapReadBuffer - read a map page into buffer
*
* Returns the slot_id of the buffer, with the buffer pinned.
*
- * The caller owns the returned buffer pin.
+ * This function is extern because direct mapping publication needs to load
+ * and update MAP pages from outside map.c.
*/
int
MapReadBuffer(UmbraFileContext *map_ctx, RelFileLocator rnode,
@@ -629,9 +839,19 @@ MapReadBuffer(UmbraFileContext *map_ctx, RelFileLocator rnode,
continue;
}
}
+
+ if (buf->pending_count != 0)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ MapUnpinBuffer(slot_id);
+ continue;
+ }
+
old_page_number = buf->page_number;
old_forknum = buf->forknum;
old_rnode = buf->rnode;
+ MemSet(buf->pending_bits, 0, sizeof(buf->pending_bits));
+
existing_slot_id = MapCacheInsert(rnode, forknum, map_blkno, slot_id);
if (existing_slot_id >= 0 && existing_slot_id != slot_id)
retry = true;
@@ -1043,7 +1263,7 @@ MapInvalidateDatabase(Oid dbid)
}
/*
- * MapGetLogicalBlockCount - return the persisted logical block count.
+ * MapGetLogicalBlockCount - get logical block count from the MAP superblock
*/
BlockNumber
MapGetLogicalBlockCount(UmbraFileContext *map_ctx, RelFileLocator rnode, ForkNumber forknum)
@@ -1137,3 +1357,114 @@ MapGetPhysicalBlockCount(UmbraFileContext *map_ctx, RelFileLocator rnode,
return max_pblkno + 1;
}
+
+/*
+ * MapGetNewPbkno - allocate new physical block and return both old and new
+ *
+ * This queries the current mapping, reserves a new physical block, and returns
+ * both the old and new physical block numbers to the caller that owns the
+ * mapping transition.
+ *
+ * Parameters:
+ *   map_ctx: MAP fork context for file I/O operations
+ *   rnode: relation identifier
+ *   forknum: fork number
+ *   lblkno: logical block number
+ *   new_pblkno: output parameter for the new physical block number
+ *   old_pblkno: output parameter for the old physical block number
+ *               (InvalidBlockNumber when no current mapping is found)
+ *
+ * Raises ERROR if no block can be reserved, or if waiting for a foreign
+ * in-flight owner times out.
+ *
+ * NOTE(review): the function is spelled "Pbkno" while its siblings use
+ * "Pblkno"; the name is kept as-is here so existing callers keep linking.
+ */
+void
+MapGetNewPbkno(UmbraFileContext *map_ctx, RelFileLocator rnode, ForkNumber forknum,
+			   BlockNumber lblkno, BlockNumber *new_pblkno,
+			   BlockNumber *old_pblkno)
+{
+	/* Reservation failures tolerated when there was nothing to wait for. */
+	static const int max_unwaited_failures = 3;
+	int			unwaited_failures = 0;
+
+	Assert(new_pblkno != NULL);
+	Assert(old_pblkno != NULL);
+
+	for (;;)
+	{
+		BlockNumber cur_pblkno;
+
+		/* Re-read the current mapping on every attempt. */
+		if (MapTryLookup(map_ctx, rnode, forknum, lblkno, &cur_pblkno))
+			*old_pblkno = cur_pblkno;
+		else
+			*old_pblkno = InvalidBlockNumber;
+
+		if (MapReserveNextPblkno(map_ctx, rnode, forknum, lblkno,
+								 new_pblkno, false))
+			break;
+
+		/*
+		 * A foreign in-flight owner controls the current mapping publication
+		 * decision.  Wait for it to publish, then retry against committed MAP
+		 * state instead of guessing from buffers or page LSNs.
+		 */
+		if (MapWaitForForeignInflightToClear(rnode, forknum, lblkno))
+			continue;
+
+		/*
+		 * The in-flight bit was already clear when we looked.  That happens
+		 * when the reservation failed for a reason other than contention, but
+		 * also when the foreign owner finished between our failed reservation
+		 * and the check.  Retry a few times so that benign race does not
+		 * surface as an ERROR; give up if the failure persists.
+		 */
+		if (++unwaited_failures >= max_unwaited_failures)
+			elog(ERROR,
+				 "failed to reserve physical block for relation %u/%u/%u fork %d blk %u",
+				 rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum, lblkno);
+	}
+}
+
+/*
+ * MapSetMapping - set a mapping entry directly
+ *
+ * Stores new_pblkno into the MAP page entry for (forknum, lblkno) and marks
+ * the MAP buffer dirty.  Callers must already own the mapping publication
+ * decision.
+ *
+ * Parameters:
+ *   map_ctx: MAP fork context for file I/O operations
+ *   rnode: relation identifier
+ *   forknum: fork number
+ *   lblkno: logical block number
+ *   new_pblkno: physical block number to set
+ *   map_lsn: LSN to stamp on the buffer; when InvalidXLogRecPtr, the replay
+ *            pointer (in recovery) or the WAL write pointer is used instead
+ *
+ * This function does NOT allocate a new physical block and does NOT advance
+ * superblock frontier/watermark state.  Callers that own a fresh physical
+ * allocation must publish frontier changes explicitly.
+ */
+void
+MapSetMapping(UmbraFileContext *map_ctx, RelFileLocator rnode, ForkNumber forknum,
+			  BlockNumber lblkno, BlockNumber new_pblkno, XLogRecPtr map_lsn)
+{
+	BlockNumber entry_no = MapLblknoToMapBlkno(forknum, lblkno);
+	BlockNumber page_no = entry_no / MAP_ENTRIES_PER_PAGE;
+	int			idx_in_page = entry_no % MAP_ENTRIES_PER_PAGE;
+	XLogRecPtr	dirty_lsn = map_lsn;
+	int			buf_id;
+	MapBufferDesc *desc;
+	MapPage    *mappage;
+
+	/* Bring the MAP page into a buffer; the returned slot is pinned. */
+	buf_id = MapReadBuffer(map_ctx, rnode, forknum, page_no);
+	desc = &MapBuffers[buf_id];
+	mappage = MapGetPage(buf_id);
+
+	/* Content changes require the buffer lock in exclusive mode. */
+	LWLockAcquire(&desc->buffer_lock, LW_EXCLUSIVE);
+
+	mappage->pblknos[idx_in_page] = new_pblkno;
+
+	/* Pick the LSN while still locked to avoid content/LSN reordering. */
+	if (dirty_lsn == InvalidXLogRecPtr)
+		dirty_lsn = InRecovery ? GetXLogReplayRecPtr(NULL) : GetXLogWriteRecPtr();
+	MapMarkBufferDirty(map_ctx, desc, dirty_lsn);
+
+	LWLockRelease(&desc->buffer_lock);
+
+	/* Drop the pin taken by MapReadBuffer(). */
+	MapUnpinBuffer(buf_id);
+}
diff --git a/src/backend/storage/map/mapbuf.c b/src/backend/storage/map/mapbuf.c
index cb8b59dfbc..4e36a9b79e 100644
--- a/src/backend/storage/map/mapbuf.c
+++ b/src/backend/storage/map/mapbuf.c
@@ -258,6 +258,8 @@ MapBackendExitCleanup(void)
* progress even if current backend is leaving via ERROR/abort.
*/
MapAbortBufferIO();
+ MapInflightCleanupOwned();
+
if (MapPrivateRefCount == NULL)
return;
@@ -401,6 +403,18 @@ retry:
MapWaitIO(buf);
goto retry;
}
+
+ if (buf->pending_count != 0)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ LWLockRelease(&buf->io_in_progress_lock);
+ CHECK_FOR_INTERRUPTS();
+ pg_usleep(1000L);
+ goto retry;
+ }
+
+ MemSet(buf->pending_bits, 0, sizeof(buf->pending_bits));
+
buf->page_number = -1;
buf->forknum = InvalidForkNumber;
memset(&buf->rnode, 0, sizeof(RelFileLocator));
diff --git a/src/backend/storage/map/mapclock.c b/src/backend/storage/map/mapclock.c
index 6fa62e1c1a..3ccdbb2310 100644
--- a/src/backend/storage/map/mapclock.c
+++ b/src/backend/storage/map/mapclock.c
@@ -308,7 +308,8 @@ MapClockGetBuffer(void)
local_buf_state = pg_atomic_read_u32(&buf->state);
if (MAPBUF_GET_REFCOUNT(local_buf_state) == 0 &&
- MAPBUF_GET_USAGECOUNT(local_buf_state) == 0)
+ MAPBUF_GET_USAGECOUNT(local_buf_state) == 0 &&
+ buf->pending_count == 0)
{
/* Found a usable buffer */
pg_atomic_fetch_add_u32(&MapShared->num_allocs, 1);
@@ -344,7 +345,8 @@ MapClockGetBuffer(void)
* If the buffer is pinned, we cannot use it.
* If it has a non-zero usage_count, decrement it and continue.
*/
- if (MAPBUF_GET_REFCOUNT(local_buf_state) == 0)
+ if (MAPBUF_GET_REFCOUNT(local_buf_state) == 0 &&
+ buf->pending_count == 0)
{
if (MAPBUF_GET_USAGECOUNT(local_buf_state) != 0)
{
diff --git a/src/backend/storage/map/mapinflight.c b/src/backend/storage/map/mapinflight.c
new file mode 100644
index 0000000000..c204a1fc03
--- /dev/null
+++ b/src/backend/storage/map/mapinflight.c
@@ -0,0 +1,402 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapinflight.c
+ * MAP in-flight remap ownership tracking.
+ *
+ * Shared state is deliberately only a per-MAP-buffer bitmap. The bitmap
+ * serializes remaps of the same logical MAP entry across backends; the pblk
+ * reserved by the owner is backend-local, so other backends cannot borrow an
+ * uncommitted physical target.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "storage/map.h"
+#include "storage/map_internal.h"
+#include "utils/memutils.h"
+
+/*
+ * Backend-local record of one in-flight remap claimed by this backend.
+ *
+ * (rnode, forknum, lblkno) identify the logical block and act as the lookup
+ * key.  pblkno is InvalidBlockNumber from MapInflightTryClaim() until
+ * MapInflightFinishClaim() stores the physical target.  slot_id and entry_idx
+ * name the MAP buffer slot and pending bit that MapInflightRelease() passes
+ * to MapInflightBufferRelease().
+ */
+typedef struct MapInflightLocalEntry
+{
+ RelFileLocator rnode;
+ ForkNumber forknum;
+ BlockNumber lblkno;
+ BlockNumber pblkno;
+ int slot_id;
+ int entry_idx;
+} MapInflightLocalEntry;
+
+/*
+ * Backend-local table of owned claims: an unordered array holding
+ * MapInflightLocalCount live entries with room for MapInflightLocalCapacity,
+ * allocated in MapInflightLocalCxt.
+ */
+static MemoryContext MapInflightLocalCxt = NULL;
+static MapInflightLocalEntry *MapInflightLocalEntries = NULL;
+static int MapInflightLocalCount = 0;
+static int MapInflightLocalCapacity = 0;
+
+static void MapInflightLocalEnsureContext(void);
+static void MapInflightLocalEnsureCapacity(int needed);
+static int MapInflightLocalFind(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno);
+static void MapInflightLocalForget(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno);
+static void MapInflightLocalRememberPrepared(RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ BlockNumber pblkno,
+ int slot_id, int entry_idx);
+static void MapInflightDecode(ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber *map_blkno, int *entry_idx);
+static inline uint64 MapInflightEntryMask(int entry_idx);
+static bool MapInflightBufferTryClaim(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber map_blkno, int slot_id,
+ int entry_idx);
+static void MapInflightBufferRelease(int slot_id, int entry_idx);
+
+/*
+ * MapInflightLocalEnsureContext
+ *
+ * Create the backend-local memory context on first use, as a child of
+ * TopMemoryContext.  Once the context exists this is a no-op.
+ */
+static void
+MapInflightLocalEnsureContext(void)
+{
+ if (MapInflightLocalCxt == NULL)
+ {
+ MapInflightLocalCxt = AllocSetContextCreate(TopMemoryContext,
+ "MapInflightLocal",
+ ALLOCSET_DEFAULT_SIZES);
+ /* Permit allocations from this context inside critical sections. */
+ MemoryContextAllowInCriticalSection(MapInflightLocalCxt, true);
+ }
+}
+
+/*
+ * MapInflightBackendInit
+ *
+ * Per-backend initialization: make sure the local memory context exists.
+ * The entry array itself is allocated lazily by
+ * MapInflightLocalEnsureCapacity().
+ */
+void
+MapInflightBackendInit(void)
+{
+ MapInflightLocalEnsureContext();
+}
+
+/*
+ * MapInflightLocalEnsureCapacity
+ *
+ * Make sure the local entry array can hold at least "needed" entries.
+ * Capacity starts at 16 and doubles until it is large enough; the array is
+ * never shrunk.  Existing entries are preserved across growth by repalloc().
+ */
+static void
+MapInflightLocalEnsureCapacity(int needed)
+{
+ int new_capacity;
+
+ MapInflightLocalEnsureContext();
+
+ if (MapInflightLocalCapacity >= needed)
+ return;
+
+ new_capacity = (MapInflightLocalCapacity == 0) ? 16 : MapInflightLocalCapacity;
+ while (new_capacity < needed)
+ new_capacity *= 2;
+
+ /* First allocation goes into the dedicated context; later ones grow it. */
+ if (MapInflightLocalEntries == NULL)
+ MapInflightLocalEntries =
+ MemoryContextAlloc(MapInflightLocalCxt,
+ sizeof(MapInflightLocalEntry) * new_capacity);
+ else
+ MapInflightLocalEntries =
+ repalloc(MapInflightLocalEntries,
+ sizeof(MapInflightLocalEntry) * new_capacity);
+
+ MapInflightLocalCapacity = new_capacity;
+}
+
+/*
+ * MapInflightLocalFind
+ *
+ * Linear search of the backend-local table for (rnode, forknum, lblkno).
+ * Returns the index of the first matching entry, or -1 if this backend owns
+ * no claim for that block.
+ */
+static int
+MapInflightLocalFind(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno)
+{
+ int i;
+
+ for (i = 0; i < MapInflightLocalCount; i++)
+ {
+ MapInflightLocalEntry *entry = &MapInflightLocalEntries[i];
+
+ if (!RelFileLocatorEquals(entry->rnode, rnode))
+ continue;
+ if (entry->forknum != forknum)
+ continue;
+ if (entry->lblkno != lblkno)
+ continue;
+ return i;
+ }
+
+ return -1;
+}
+
+/*
+ * MapInflightLocalRememberPrepared
+ *
+ * Append a new entry to the backend-local table.
+ *
+ * The caller must already have reserved room with
+ * MapInflightLocalEnsureCapacity() and must not own an entry for the same
+ * block; both conditions are checked only by Assert.  This function does not
+ * allocate.
+ */
+static void
+MapInflightLocalRememberPrepared(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno, BlockNumber pblkno,
+ int slot_id, int entry_idx)
+{
+ Assert(MapInflightLocalFind(rnode, forknum, lblkno) < 0);
+ Assert(MapInflightLocalCount < MapInflightLocalCapacity);
+
+ MapInflightLocalEntries[MapInflightLocalCount].rnode = rnode;
+ MapInflightLocalEntries[MapInflightLocalCount].forknum = forknum;
+ MapInflightLocalEntries[MapInflightLocalCount].lblkno = lblkno;
+ MapInflightLocalEntries[MapInflightLocalCount].pblkno = pblkno;
+ MapInflightLocalEntries[MapInflightLocalCount].slot_id = slot_id;
+ MapInflightLocalEntries[MapInflightLocalCount].entry_idx = entry_idx;
+ MapInflightLocalCount++;
+}
+
+/*
+ * MapInflightDecode
+ *
+ * Split the value returned by MapLblknoToMapBlkno(forknum, lblkno) into a MAP
+ * page number (*map_blkno, the quotient by MAP_ENTRIES_PER_PAGE) and the
+ * entry index within that page (*entry_idx, the remainder).
+ *
+ * NOTE(review): despite its name, MapLblknoToMapBlkno()'s result is treated
+ * here as a global entry number rather than a page number -- confirm against
+ * its definition.
+ */
+static void
+MapInflightDecode(ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber *map_blkno, int *entry_idx)
+{
+ BlockNumber map_entry_no;
+
+ Assert(map_blkno != NULL);
+ Assert(entry_idx != NULL);
+
+ map_entry_no = MapLblknoToMapBlkno(forknum, lblkno);
+ *entry_idx = map_entry_no % MAP_ENTRIES_PER_PAGE;
+ *map_blkno = map_entry_no / MAP_ENTRIES_PER_PAGE;
+ Assert(*entry_idx >= 0 && *entry_idx < MAP_ENTRIES_PER_PAGE);
+}
+
+/*
+ * MapInflightEntryMask
+ *
+ * Return the single-bit mask for entry_idx within its pending_bits word.
+ * Callers select the word separately as
+ * entry_idx / MAP_PENDING_BITS_PER_WORD.
+ */
+static inline uint64
+MapInflightEntryMask(int entry_idx)
+{
+ Assert(entry_idx >= 0 && entry_idx < MAP_ENTRIES_PER_PAGE);
+ return UINT64CONST(1) << (entry_idx % MAP_PENDING_BITS_PER_WORD);
+}
+
+/*
+ * MapInflightBufferTryClaim
+ *
+ * Try to set the pending bit for entry_idx in MAP buffer slot_id.
+ *
+ * Under the buffer's exclusive buffer_lock, check that the slot still holds
+ * page map_blkno of (rnode, forknum).  If it does and the bit is clear, set
+ * the bit and increment pending_count.
+ *
+ * Returns true only if this call set the bit; returns false if the buffer
+ * identity does not match or the bit was already set.  The buffer pin is not
+ * touched here.
+ */
+static bool
+MapInflightBufferTryClaim(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber map_blkno, int slot_id, int entry_idx)
+{
+ MapBufferDesc *buf = &MapBuffers[slot_id];
+ int word_idx = entry_idx / MAP_PENDING_BITS_PER_WORD;
+ uint64 mask = MapInflightEntryMask(entry_idx);
+ bool claimed = false;
+
+ LWLockAcquire(&buf->buffer_lock, LW_EXCLUSIVE);
+ if (buf->page_number == map_blkno &&
+ buf->forknum == forknum &&
+ RelFileLocatorEquals(buf->rnode, rnode))
+ {
+ if ((buf->pending_bits[word_idx] & mask) == 0)
+ {
+ buf->pending_bits[word_idx] |= mask;
+ buf->pending_count++;
+ claimed = true;
+ }
+ }
+ LWLockRelease(&buf->buffer_lock);
+
+ return claimed;
+}
+
+/*
+ * MapInflightBufferRelease
+ *
+ * Clear the pending bit for entry_idx in MAP buffer slot_id, decrement
+ * pending_count, and drop one pin on the slot.
+ *
+ * A bit that is already clear trips an Assert; without assertions it is
+ * skipped and the counters are left alone.  The unpin happens either way.
+ */
+static void
+MapInflightBufferRelease(int slot_id, int entry_idx)
+{
+ MapBufferDesc *buf = &MapBuffers[slot_id];
+ int word_idx = entry_idx / MAP_PENDING_BITS_PER_WORD;
+ uint64 mask = MapInflightEntryMask(entry_idx);
+
+ LWLockAcquire(&buf->buffer_lock, LW_EXCLUSIVE);
+ Assert(buf->pending_bits[word_idx] & mask);
+ if ((buf->pending_bits[word_idx] & mask) != 0)
+ {
+ buf->pending_bits[word_idx] &= ~mask;
+ Assert(buf->pending_count > 0);
+ buf->pending_count--;
+ }
+ LWLockRelease(&buf->buffer_lock);
+
+ MapUnpinBuffer(slot_id);
+}
+
+/*
+ * MapInflightLocalForget
+ *
+ * Remove the backend-local entry for (rnode, forknum, lblkno), if any.  The
+ * hole is filled with the last entry, so array order is not preserved.
+ * Shared state is not touched.
+ */
+static void
+MapInflightLocalForget(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno)
+{
+ int idx;
+
+ idx = MapInflightLocalFind(rnode, forknum, lblkno);
+ if (idx < 0)
+ return;
+
+ MapInflightLocalCount--;
+ /* Move the former last entry into the vacated slot unless it was last. */
+ if (idx != MapInflightLocalCount)
+ MapInflightLocalEntries[idx] = MapInflightLocalEntries[MapInflightLocalCount];
+}
+
+/*
+ * MapInflightLookupOwnedPblk
+ *
+ * Report the physical block recorded for an in-flight remap owned by this
+ * backend.
+ *
+ * Returns true and sets *pblkno only when a local entry exists and its pblkno
+ * has been recorded (is not InvalidBlockNumber).  Otherwise returns false and
+ * leaves *pblkno untouched.  Only backend-local state is consulted.
+ */
+bool
+MapInflightLookupOwnedPblk(RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ BlockNumber *pblkno)
+{
+ int idx;
+
+ Assert(pblkno != NULL);
+
+ idx = MapInflightLocalFind(rnode, forknum, lblkno);
+ if (idx < 0)
+ return false;
+ if (MapInflightLocalEntries[idx].pblkno == InvalidBlockNumber)
+ return false;
+
+ *pblkno = MapInflightLocalEntries[idx].pblkno;
+ return true;
+}
+
+/*
+ * MapInflightTryClaimBarrier
+ *
+ * Claim the pending bit for (rnode, forknum, lblkno) without creating a
+ * backend-local entry; ownership is recorded only in *barrier.
+ *
+ * Raises ERROR if this backend already owns an in-flight remap for the same
+ * block.  Returns false, with the MAP buffer unpinned again, if the bit could
+ * not be claimed.  On success fills *barrier and keeps the buffer pinned until
+ * MapInflightReleaseBarrier().
+ *
+ * *barrier must be non-NULL and not currently valid (Assert only).
+ *
+ * NOTE(review): no local entry is made, so MapInflightCleanupOwned() does not
+ * release barriers; callers presumably have to release them on error paths
+ * themselves -- confirm.
+ */
+bool
+MapInflightTryClaimBarrier(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ MapInflightBarrier *barrier)
+{
+ BlockNumber map_blkno;
+ int entry_idx;
+ int slot_id;
+
+ Assert(barrier != NULL);
+ Assert(!barrier->valid);
+
+ if (MapInflightLocalFind(rnode, forknum, lblkno) >= 0)
+ elog(ERROR,
+ "cannot claim write barrier while owning in-flight remap for relation %u/%u/%u fork %d block %u",
+ rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum, lblkno);
+
+ MapInflightDecode(forknum, lblkno, &map_blkno, &entry_idx);
+
+ Assert(map_ctx != NULL);
+ slot_id = MapReadBuffer(map_ctx, rnode, forknum, map_blkno);
+
+ if (!MapInflightBufferTryClaim(rnode, forknum, map_blkno, slot_id,
+ entry_idx))
+ {
+ MapUnpinBuffer(slot_id);
+ return false;
+ }
+
+ barrier->valid = true;
+ barrier->slot_id = slot_id;
+ barrier->entry_idx = entry_idx;
+
+ return true;
+}
+
+/*
+ * MapInflightReleaseBarrier
+ *
+ * Release a barrier obtained from MapInflightTryClaimBarrier(): clear its
+ * pending bit, drop the buffer pin, and mark the barrier invalid.  A NULL or
+ * already-invalid barrier is a no-op, so calling this twice is harmless.
+ */
+void
+MapInflightReleaseBarrier(MapInflightBarrier *barrier)
+{
+ if (barrier == NULL || !barrier->valid)
+ return;
+
+ MapInflightBufferRelease(barrier->slot_id, barrier->entry_idx);
+ barrier->valid = false;
+ barrier->slot_id = -1;
+ barrier->entry_idx = -1;
+}
+
+/*
+ * MapInflightTryClaim
+ *
+ * Try to become the owner of the in-flight remap of (rnode, forknum, lblkno).
+ *
+ * Returns false if this backend already owns that block, or if the shared
+ * pending bit could not be claimed (in which case the MAP buffer is unpinned
+ * again).  On success the pending bit is set, the MAP buffer stays pinned, and
+ * a local entry is recorded with pblkno = InvalidBlockNumber until
+ * MapInflightFinishClaim() fills it in.  Undo with MapInflightRelease().
+ */
+bool
+MapInflightTryClaim(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno)
+{
+ BlockNumber map_blkno;
+ int entry_idx;
+ int slot_id;
+
+ if (MapInflightLocalFind(rnode, forknum, lblkno) >= 0)
+ return false;
+
+ /*
+ * Ensure backend-local storage before publishing any shared in-flight
+ * state, so the post-claim path cannot throw due to allocation.
+ */
+ MapInflightLocalEnsureCapacity(MapInflightLocalCount + 1);
+ MapInflightDecode(forknum, lblkno, &map_blkno, &entry_idx);
+
+ Assert(map_ctx != NULL);
+ slot_id = MapReadBuffer(map_ctx, rnode, forknum, map_blkno);
+
+ if (!MapInflightBufferTryClaim(rnode, forknum, map_blkno, slot_id,
+ entry_idx))
+ {
+ MapUnpinBuffer(slot_id);
+ return false;
+ }
+
+ MapInflightLocalRememberPrepared(rnode, forknum, lblkno,
+ InvalidBlockNumber,
+ slot_id, entry_idx);
+
+ return true;
+}
+
+/*
+ * MapInflightFinishClaim
+ *
+ * Record the physical block chosen for a claim this backend already owns, so
+ * that MapInflightLookupOwnedPblk() starts reporting it.  pblkno must be
+ * valid (Assert only).  Raises ERROR if no local entry exists for the block.
+ * Shared state is not touched.
+ */
+void
+MapInflightFinishClaim(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno, BlockNumber pblkno)
+{
+ int local_idx;
+
+ Assert(pblkno != InvalidBlockNumber);
+
+ local_idx = MapInflightLocalFind(rnode, forknum, lblkno);
+ if (local_idx < 0)
+ elog(ERROR,
+ "in-flight remap claim disappeared for relation %u/%u/%u fork %d block %u",
+ rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum, lblkno);
+
+ MapInflightLocalEntries[local_idx].pblkno = pblkno;
+}
+
+/*
+ * MapInflightRelease
+ *
+ * Give up this backend's claim on (rnode, forknum, lblkno): drop the local
+ * entry, clear the shared pending bit, and unpin the MAP buffer.  Does
+ * nothing if this backend owns no such claim.
+ */
+void
+MapInflightRelease(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno)
+{
+ int idx;
+ int slot_id;
+ int entry_idx;
+
+ idx = MapInflightLocalFind(rnode, forknum, lblkno);
+ if (idx < 0)
+ return;
+
+ /* Copy what we need first: forgetting may overwrite this array slot. */
+ slot_id = MapInflightLocalEntries[idx].slot_id;
+ entry_idx = MapInflightLocalEntries[idx].entry_idx;
+ MapInflightLocalForget(rnode, forknum, lblkno);
+ MapInflightBufferRelease(slot_id, entry_idx);
+}
+
+/*
+ * MapInflightBitIsSet
+ *
+ * Report whether the shared pending bit for (rnode, forknum, lblkno) is
+ * currently set, regardless of which backend set it.
+ *
+ * Only the MAP buffer cache is consulted: if MapCacheLookup() finds no slot
+ * for the page, the answer is false.  The slot is pinned and share-locked for
+ * the check, and its identity is re-verified under the lock; a mismatch also
+ * yields false.
+ */
+bool
+MapInflightBitIsSet(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno)
+{
+ BlockNumber map_blkno;
+ int entry_idx;
+ int slot_id;
+ MapBufferDesc *buf;
+ int word_idx;
+ uint64 mask;
+ bool exists = false;
+
+ MapInflightDecode(forknum, lblkno, &map_blkno, &entry_idx);
+ slot_id = MapCacheLookup(rnode, forknum, map_blkno);
+ if (slot_id < 0)
+ return false;
+
+ buf = &MapBuffers[slot_id];
+ word_idx = entry_idx / MAP_PENDING_BITS_PER_WORD;
+ mask = MapInflightEntryMask(entry_idx);
+
+ MapPinBuffer(slot_id, false);
+ LWLockAcquire(&buf->buffer_lock, LW_SHARED);
+ if (buf->page_number == map_blkno &&
+ buf->forknum == forknum &&
+ RelFileLocatorEquals(buf->rnode, rnode))
+ exists = (buf->pending_bits[word_idx] & mask) != 0;
+ LWLockRelease(&buf->buffer_lock);
+ MapUnpinBuffer(slot_id);
+
+ return exists;
+}
+
+/*
+ * MapInflightCleanupOwned
+ *
+ * Release every in-flight claim still recorded in this backend's local
+ * table.  Barriers from MapInflightTryClaimBarrier() are not in that table
+ * and are therefore not handled here.
+ */
+void
+MapInflightCleanupOwned(void)
+{
+ /*
+ * Always release entry 0: MapInflightRelease() removes it and moves the
+ * last entry into its place, so the count shrinks by one per iteration.
+ */
+ while (MapInflightLocalCount > 0)
+ {
+ MapInflightLocalEntry *entry = &MapInflightLocalEntries[0];
+
+ MapInflightRelease(entry->rnode, entry->forknum, entry->lblkno);
+ }
+}
diff --git a/src/backend/storage/map/mapinit.c b/src/backend/storage/map/mapinit.c
index a0880113ed..c9ddd12ff0 100644
--- a/src/backend/storage/map/mapinit.c
+++ b/src/backend/storage/map/mapinit.c
@@ -66,7 +66,9 @@ MapBackendInit(void)
return;
MapRefreshBufferSlots();
- MapEnsurePrivateRefCount(); initialized = true;
+ MapEnsurePrivateRefCount();
+ MapInflightBackendInit();
+ initialized = true;
}
static void
@@ -121,6 +123,9 @@ MapShmemInit(void *arg)
buf->forknum = InvalidForkNumber;
buf->page_number = -1;
buf->page_lsn = 0;
+ buf->pending_count = 0;
+ MemSet(buf->pending_bits, 0, sizeof(buf->pending_bits));
+
LWLockInitialize(&buf->buffer_lock, LWTRANCHE_MAP_BUFFER_CONTENT);
LWLockInitialize(&buf->io_in_progress_lock, LWTRANCHE_MAP_BUFFER_CONTENT);
}
diff --git a/src/backend/storage/map/mapsuper.c b/src/backend/storage/map/mapsuper.c
index cf8bde182e..ad4a6f6bdb 100644
--- a/src/backend/storage/map/mapsuper.c
+++ b/src/backend/storage/map/mapsuper.c
@@ -838,7 +838,6 @@ MapSuperSetExtendingTarget(MapSuperEntry *entry, ForkNumber forknum,
-
static bool
MapSuperPrepareEntryForUpdate(UmbraFileContext *map_ctx, RelFileLocator rnode,
XLogRecPtr map_lsn, const char *missing_errmsg,
@@ -871,12 +870,14 @@ MapSuperPrepareEntryForUpdate(UmbraFileContext *map_ctx, RelFileLocator rnode,
entry->super = disk_super;
entry->page_lsn = MapSuperblockGetLastUpdatedLSN(&disk_super);
entry->flags = MAPSUPER_FLAG_VALID;
+ MapSuperResetReservedNextFrees(entry);
}
else
{
MapSuperblockInit(&entry->super, 0);
entry->page_lsn = InvalidXLogRecPtr;
entry->flags = MAPSUPER_FLAG_VALID | MAPSUPER_FLAG_CORRUPT;
+ MapSuperResetReservedNextFrees(entry);
}
}
}
@@ -897,8 +898,21 @@ MapSuperPrepareEntryForUpdate(UmbraFileContext *map_ctx, RelFileLocator rnode,
*/
MapSuperblockInit(&entry->super, 0);
entry->flags = MAPSUPER_FLAG_VALID;
+ MapSuperResetReservedNextFrees(entry);
}
+ Assert(MapNormalizeForkBlockCount(MAIN_FORKNUM,
+ MapSuperblockGetNextFreePhysBlock(&entry->super,
+ MAIN_FORKNUM)) <=
+ MapSuperGetReservedNextFree(entry, MAIN_FORKNUM));
+ Assert(MapNormalizeForkBlockCount(FSM_FORKNUM,
+ MapSuperblockGetNextFreePhysBlock(&entry->super,
+ FSM_FORKNUM)) <=
+ MapSuperGetReservedNextFree(entry, FSM_FORKNUM));
+ Assert(MapNormalizeForkBlockCount(VISIBILITYMAP_FORKNUM,
+ MapSuperblockGetNextFreePhysBlock(&entry->super,
+ VISIBILITYMAP_FORKNUM)) <=
+ MapSuperGetReservedNextFree(entry, VISIBILITYMAP_FORKNUM));
*entry_p = entry;
return true;
}
@@ -1001,11 +1015,15 @@ MapSBlockBumpPhysicalState(UmbraFileContext *map_ctx, RelFileLocator rnode,
current_capacity = MapSuperblockGetPhysCapacity(&entry->super, forknum);
current_next = MapNormalizeForkBlockCount(forknum, current_next);
current_capacity = MapNormalizeForkBlockCount(forknum, current_capacity);
+ Assert(MapNormalizeForkBlockCount(forknum,
+ MapSuperblockGetNextFreePhysBlock(&entry->super,
+ forknum)) <=
+ MapSuperGetReservedNextFree(entry, forknum));
if (bump_next_free && current_next < nblocks)
{
MapSuperblockSetNextFreePhysBlock(&entry->super, forknum, nblocks);
- if (InRecovery)
+ MapSuperMaybeBumpReservedNextFree(entry, forknum, nblocks);
changed = true;
}
if (bump_capacity && current_capacity < nblocks)
@@ -1028,6 +1046,10 @@ MapSBlockBumpPhysicalState(UmbraFileContext *map_ctx, RelFileLocator rnode,
entry->flags |= MAPSUPER_FLAG_DIRTY;
}
+ Assert(MapNormalizeForkBlockCount(forknum,
+ MapSuperblockGetNextFreePhysBlock(&entry->super,
+ forknum)) <=
+ MapSuperGetReservedNextFree(entry, forknum));
LWLockRelease(&entry->lock);
}
diff --git a/src/backend/storage/map/meson.build b/src/backend/storage/map/meson.build
index 8747f0b714..bdaa0dd14a 100644
--- a/src/backend/storage/map/meson.build
+++ b/src/backend/storage/map/meson.build
@@ -6,5 +6,6 @@ backend_sources += files(
'mapbuf.c',
'mapflush.c',
'mapclock.c',
+ 'mapinflight.c',
'mapsuper.c',
)
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index dee29037b1..ffc4bb83e9 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -1034,6 +1034,7 @@ mdstartreadv(PgAioHandle *ioh,
reln,
forknum,
blocknum,
+ blocknum,
nblocks,
false);
pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 631d09d4b4..1e3e0b08f8 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -4,9 +4,8 @@
* public interface routines to storage manager switch.
*
* All file system operations on relations dispatch through these routines.
- * An SMgrRelation represents storage-manager state for a relation. The
- * selected storage manager implementation owns any implementation-specific
- * state needed to service those operations.
+ * An SMgrRelation represents physical on-disk relation files that are open
+ * for reading and writing.
*
* When a relation is first accessed through the relation cache, the
* corresponding SMgrRelation entry is opened by calling smgropen(), and the
@@ -93,6 +92,7 @@ typedef struct f_smgr
{
void (*smgr_init) (void); /* may be NULL */
void (*smgr_shutdown) (void); /* may be NULL */
+ void (*smgr_before_shmem_exit_cleanup) (void); /* may be NULL */
void (*smgr_open) (SMgrRelation reln);
void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_destroy) (SMgrRelation reln); /* may be NULL */
@@ -123,10 +123,15 @@ typedef struct f_smgr
void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_pretruncate) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber old_blocks, BlockNumber nblocks,
+ XLogRecPtr truncate_lsn);
void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
BlockNumber old_blocks, BlockNumber nblocks);
void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
void (*smgr_registersync) (SMgrRelation reln, ForkNumber forknum);
+ int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
+ bool (*smgr_is_internal_fork) (ForkNumber forknum);
void (*smgr_create_relation_metadata) (SMgrRelation reln);
void (*smgr_copy_relation_metadata) (SMgrRelation src,
SMgrRelation dst,
@@ -134,61 +139,30 @@ typedef struct f_smgr
void (*smgr_sync_relation_metadata) (SMgrRelation reln);
void (*smgr_unlink_relation_metadata) (RelFileLocatorBackend rlocator,
bool isRedo);
+ void (*smgr_setmapstate) (SMgrRelation reln, uint8 map_state);
bool (*smgr_createdb_allows_wal_log) (void);
+ void (*smgr_init_new_relation) (SMgrRelation reln, bool needs_wal);
+ void (*smgr_redo_create_fork) (SMgrRelation reln, ForkNumber forknum,
+ XLogRecPtr lsn);
void (*smgr_checkpoint_database_tablespaces) (Oid dbid,
int ntablespaces,
const Oid *tablespace_ids);
void (*smgr_invalidate_database_tablespaces) (Oid dbid,
int ntablespaces,
const Oid *tablespace_ids);
- int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
+ void (*smgr_mark_skip_wal_pending) (SMgrRelation reln);
+ void (*smgr_clear_skip_wal_pending) (SMgrRelation reln);
+ bool (*smgr_prepare_pendingsync) (SMgrRelation reln);
+ bool (*smgr_needs_recovery_fsm_vacuum) (SMgrRelation reln);
} f_smgr;
-#define SMGR_MD 0
-#ifdef USE_UMBRA
-#define SMGR_UMBRA 1
-#define SMGR_DEFAULT SMGR_UMBRA
-#else
-#define SMGR_DEFAULT SMGR_MD
-#endif
-
static const f_smgr smgrsw[] = {
- /* magnetic disk */
- {
- .smgr_init = mdinit,
- .smgr_shutdown = NULL,
- .smgr_open = mdopen,
- .smgr_close = mdclose,
- .smgr_destroy = NULL,
- .smgr_create = mdcreate,
- .smgr_exists = mdexists,
- .smgr_unlink = mdunlink,
- .smgr_extend = mdextend,
- .smgr_zeroextend = mdzeroextend,
- .smgr_prefetch = mdprefetch,
- .smgr_maxcombine = mdmaxcombine,
- .smgr_readv = mdreadv,
- .smgr_startreadv = mdstartreadv,
- .smgr_writev = mdwritev,
- .smgr_writeback = mdwriteback,
- .smgr_nblocks = mdnblocks,
- .smgr_truncate = mdtruncate,
- .smgr_immedsync = mdimmedsync,
- .smgr_registersync = mdregistersync,
- .smgr_create_relation_metadata = NULL,
- .smgr_copy_relation_metadata = NULL,
- .smgr_sync_relation_metadata = NULL,
- .smgr_unlink_relation_metadata = NULL,
- .smgr_createdb_allows_wal_log = NULL,
- .smgr_checkpoint_database_tablespaces = NULL,
- .smgr_invalidate_database_tablespaces = NULL,
- .smgr_fd = mdfd,
- },
#ifdef USE_UMBRA
/* Umbra storage manager */
{
.smgr_init = uminit,
.smgr_shutdown = NULL,
+ .smgr_before_shmem_exit_cleanup = umbeforeshmemexitcleanup,
.smgr_open = umopen,
.smgr_close = umclose,
.smgr_destroy = umdestroy,
@@ -204,18 +178,64 @@ static const f_smgr smgrsw[] = {
.smgr_writev = umwritev,
.smgr_writeback = umwriteback,
.smgr_nblocks = umnblocks,
+ .smgr_pretruncate = umpretruncate,
.smgr_truncate = umtruncate,
.smgr_immedsync = umimmedsync,
.smgr_registersync = umregistersync,
+ .smgr_fd = umfd,
+ .smgr_is_internal_fork = umisinternalfork,
.smgr_create_relation_metadata = umcreaterelationmetadata,
.smgr_copy_relation_metadata = umcopyrelationmetadata,
.smgr_sync_relation_metadata = umsyncrelationmetadata,
.smgr_unlink_relation_metadata = umunlinkrelationmetadata,
+ .smgr_setmapstate = umsetmapstate,
.smgr_createdb_allows_wal_log = umcreatedballowswallog,
+ .smgr_init_new_relation = uminitnewrelation,
+ .smgr_redo_create_fork = umredocreatefork,
.smgr_checkpoint_database_tablespaces = umcheckpointdatabasetablespaces,
.smgr_invalidate_database_tablespaces = uminvalidatedatabasetablespaces,
- .smgr_fd = umfd,
+ .smgr_mark_skip_wal_pending = ummarkskipwalpending,
+ .smgr_clear_skip_wal_pending = umclearskipwalpending,
+ .smgr_prepare_pendingsync = umpreparependingsync,
+ .smgr_needs_recovery_fsm_vacuum = umneedsrecoveryfsmvacuum,
},
+#else
+ /* magnetic disk */
+ {
+ .smgr_init = mdinit,
+ .smgr_shutdown = NULL,
+ .smgr_before_shmem_exit_cleanup = NULL,
+ .smgr_open = mdopen,
+ .smgr_close = mdclose,
+ .smgr_destroy = NULL,
+ .smgr_create = mdcreate,
+ .smgr_exists = mdexists,
+ .smgr_unlink = mdunlink,
+ .smgr_extend = mdextend,
+ .smgr_zeroextend = mdzeroextend,
+ .smgr_prefetch = mdprefetch,
+ .smgr_maxcombine = mdmaxcombine,
+ .smgr_readv = mdreadv,
+ .smgr_startreadv = mdstartreadv,
+ .smgr_writev = mdwritev,
+ .smgr_writeback = mdwriteback,
+ .smgr_nblocks = mdnblocks,
+ .smgr_pretruncate = NULL,
+ .smgr_truncate = mdtruncate,
+ .smgr_immedsync = mdimmedsync,
+ .smgr_registersync = mdregistersync,
+ .smgr_fd = mdfd,
+ .smgr_is_internal_fork = NULL,
+ .smgr_create_relation_metadata = NULL,
+ .smgr_copy_relation_metadata = NULL,
+ .smgr_sync_relation_metadata = NULL,
+ .smgr_unlink_relation_metadata = NULL,
+ .smgr_setmapstate = NULL,
+ .smgr_mark_skip_wal_pending = NULL,
+ .smgr_clear_skip_wal_pending = NULL,
+ .smgr_prepare_pendingsync = NULL,
+ .smgr_needs_recovery_fsm_vacuum = NULL,
+ }
#endif
};
@@ -231,6 +251,7 @@ static dlist_head unpinned_relns;
/* local function prototypes */
static void smgrshutdown(int code, Datum arg);
+static void smgrbeforeshmemexit(int code, Datum arg);
static void smgrdestroy(SMgrRelation reln);
static void smgr_aio_reopen(PgAioHandle *ioh);
@@ -243,6 +264,15 @@ const PgAioTargetInfo aio_smgr_target_info = {
.describe_identity = smgr_aio_describe_identity,
};
+/*
+ * smgrisinternalfork() -- Is forknum private to the storage manager?
+ *
+ * Asks the storage manager in smgrsw[0]; returns false when it provides no
+ * smgr_is_internal_fork callback.
+ */
+bool
+smgrisinternalfork(ForkNumber forknum)
+{
+ if (smgrsw[0].smgr_is_internal_fork == NULL)
+ return false;
+
+ return smgrsw[0].smgr_is_internal_fork(forknum);
+}
+
/*
* smgrinit(), smgrshutdown() -- Initialize or shut down storage
@@ -271,6 +301,21 @@ smgrinit(void)
on_proc_exit(smgrshutdown, 0);
}
+/*
+ * smgrregistershutdowncleanup() -- Arrange for storage-manager cleanup to
+ * run before shared memory is detached.
+ *
+ * Registers smgrbeforeshmemexit() as a before_shmem_exit callback at most
+ * once per process.  Nothing is registered when smgrsw[0] provides no
+ * smgr_before_shmem_exit_cleanup callback.
+ */
+void
+smgrregistershutdowncleanup(void)
+{
+ static bool registered = false;
+
+ if (registered)
+ return;
+
+ if (smgrsw[0].smgr_before_shmem_exit_cleanup == NULL)
+ return;
+
+ before_shmem_exit(smgrbeforeshmemexit, 0);
+ registered = true;
+}
+
/*
* on_proc_exit hook for smgr cleanup during backend shutdown
*/
@@ -290,6 +335,15 @@ smgrshutdown(int code, Datum arg)
RESUME_INTERRUPTS();
}
+/*
+ * before_shmem_exit hook: run the cleanup callback of smgrsw[0].
+ *
+ * Registered only by smgrregistershutdowncleanup(), which has already checked
+ * that the callback is non-NULL.  Both arguments are unused.
+ */
+static void
+smgrbeforeshmemexit(int code, Datum arg)
+{
+ (void) code;
+ (void) arg;
+
+ smgrsw[0].smgr_before_shmem_exit_cleanup();
+}
+
/*
* smgropen() -- Return an SMgrRelation object, creating it if need be.
*
@@ -341,7 +395,7 @@ smgropen(RelFileLocator rlocator, ProcNumber backend)
reln->smgr_targblock = InvalidBlockNumber;
for (int i = 0; i <= MAX_FORKNUM; ++i)
reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
- reln->smgr_which = SMGR_DEFAULT;
+ reln->smgr_which = 0; /* smgrsw[] has one entry: Umbra if USE_UMBRA, else md.c */
reln->smgr_private = NULL;
/* it is not pinned yet */
@@ -582,37 +636,56 @@ smgrsyncrelationmetadata(SMgrRelation reln)
void
smgrunlinkrelationmetadata(RelFileLocatorBackend rlocator, bool isRedo)
{
- if (smgrsw[SMGR_DEFAULT].smgr_unlink_relation_metadata)
- smgrsw[SMGR_DEFAULT].smgr_unlink_relation_metadata(rlocator, isRedo);
+ if (smgrsw[0].smgr_unlink_relation_metadata)
+ smgrsw[0].smgr_unlink_relation_metadata(rlocator, isRedo);
+}
+
+/*
+ * smgrsetmapstate() -- Pass map_state to the relation's storage manager.
+ *
+ * No-op when the storage manager has no smgr_setmapstate callback.
+ */
+void
+smgrsetmapstate(SMgrRelation reln, uint8 map_state)
+{
+ if (smgrsw[reln->smgr_which].smgr_setmapstate)
+ smgrsw[reln->smgr_which].smgr_setmapstate(reln, map_state);
}
bool
smgrcreatedballowswallog(void)
{
- if (smgrsw[SMGR_DEFAULT].smgr_createdb_allows_wal_log)
- return smgrsw[SMGR_DEFAULT].smgr_createdb_allows_wal_log();
+ if (smgrsw[0].smgr_createdb_allows_wal_log)
+ return smgrsw[0].smgr_createdb_allows_wal_log();
return true;
}
+/*
+ * smgrinitnewrelation() -- Forward reln and needs_wal to the storage
+ * manager's smgr_init_new_relation callback.
+ *
+ * No-op when the storage manager has no such callback.
+ */
+void
+smgrinitnewrelation(SMgrRelation reln, bool needs_wal)
+{
+ if (smgrsw[reln->smgr_which].smgr_init_new_relation)
+ smgrsw[reln->smgr_which].smgr_init_new_relation(reln, needs_wal);
+}
+
+/*
+ * smgrredocreatefork() -- Forward reln, forknum and lsn to the storage
+ * manager's smgr_redo_create_fork callback.
+ *
+ * No-op when the storage manager has no such callback.
+ */
+void
+smgrredocreatefork(SMgrRelation reln, ForkNumber forknum, XLogRecPtr lsn)
+{
+ if (smgrsw[reln->smgr_which].smgr_redo_create_fork)
+ smgrsw[reln->smgr_which].smgr_redo_create_fork(reln, forknum, lsn);
+}
+
void
smgrcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
const Oid *tablespace_ids)
{
- if (smgrsw[SMGR_DEFAULT].smgr_checkpoint_database_tablespaces)
- smgrsw[SMGR_DEFAULT].smgr_checkpoint_database_tablespaces(dbid,
- ntablespaces,
- tablespace_ids);
+ if (smgrsw[0].smgr_checkpoint_database_tablespaces)
+ smgrsw[0].smgr_checkpoint_database_tablespaces(dbid, ntablespaces,
+ tablespace_ids);
}
void
smgrinvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
const Oid *tablespace_ids)
{
- if (smgrsw[SMGR_DEFAULT].smgr_invalidate_database_tablespaces)
- smgrsw[SMGR_DEFAULT].smgr_invalidate_database_tablespaces(dbid,
- ntablespaces,
- tablespace_ids);
+ if (smgrsw[0].smgr_invalidate_database_tablespaces)
+ smgrsw[0].smgr_invalidate_database_tablespaces(dbid, ntablespaces,
+ tablespace_ids);
}
void
@@ -620,6 +693,50 @@ smgrinvalidatedatabase(Oid dbid)
{
smgrinvalidatedatabasetablespaces(dbid, 0, NULL);
}
+
+
+
+
+
+
+/*
+ * smgrmarkskipwalpending() -- Invoke the storage manager's
+ * smgr_mark_skip_wal_pending callback for rlocator.
+ *
+ * The relation is opened with INVALID_PROC_NUMBER as its backend.  Apart from
+ * that open, this is a no-op when the storage manager has no such callback.
+ */
+void
+smgrmarkskipwalpending(RelFileLocator rlocator)
+{
+ SMgrRelation reln;
+
+ reln = smgropen(rlocator, INVALID_PROC_NUMBER);
+ if (smgrsw[reln->smgr_which].smgr_mark_skip_wal_pending)
+ smgrsw[reln->smgr_which].smgr_mark_skip_wal_pending(reln);
+}
+
+/*
+ * smgrclearskipwalpending() -- Invoke the storage manager's
+ * smgr_clear_skip_wal_pending callback for rlocator.
+ *
+ * The relation is opened with INVALID_PROC_NUMBER as its backend.  Apart from
+ * that open, this is a no-op when the storage manager has no such callback.
+ */
+void
+smgrclearskipwalpending(RelFileLocator rlocator)
+{
+ SMgrRelation reln;
+
+ reln = smgropen(rlocator, INVALID_PROC_NUMBER);
+ if (smgrsw[reln->smgr_which].smgr_clear_skip_wal_pending)
+ smgrsw[reln->smgr_which].smgr_clear_skip_wal_pending(reln);
+}
+
+/*
+ * smgrpreparependingsync() -- Return the result of the storage manager's
+ * smgr_prepare_pendingsync callback for reln.
+ *
+ * Returns false when the storage manager has no such callback.
+ */
+bool
+smgrpreparependingsync(SMgrRelation reln)
+{
+ if (smgrsw[reln->smgr_which].smgr_prepare_pendingsync)
+ return smgrsw[reln->smgr_which].smgr_prepare_pendingsync(reln);
+
+ return false;
+}
+
+/*
+ * smgrneedsrecoveryfsmvacuum() -- Return the result of the storage manager's
+ * smgr_needs_recovery_fsm_vacuum callback for reln.
+ *
+ * Returns true when the storage manager has no such callback.
+ */
+bool
+smgrneedsrecoveryfsmvacuum(SMgrRelation reln)
+{
+ if (smgrsw[reln->smgr_which].smgr_needs_recovery_fsm_vacuum)
+ return smgrsw[reln->smgr_which].smgr_needs_recovery_fsm_vacuum(reln);
+
+ return true;
+}
+
/*
* smgrdosyncall() -- Immediately sync all forks of all given relations
*
@@ -651,6 +768,9 @@ smgrdosyncall(SMgrRelation *rels, int nrels)
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
{
+ if (smgrisinternalfork(forknum))
+ continue;
+
if (smgrsw[which].smgr_exists(rels[i], forknum))
smgrsw[which].smgr_immedsync(rels[i], forknum);
}
@@ -708,7 +828,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
/* Close the forks at smgr level */
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ {
+ if (smgrisinternalfork(forknum))
+ continue;
+
smgrsw[which].smgr_close(rels[i], forknum);
+ }
}
/*
@@ -735,7 +860,12 @@ smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
int which = rels[i]->smgr_which;
for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ {
+ if (smgrisinternalfork(forknum))
+ continue;
+
smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo);
+ }
smgrunlinkrelationmetadata(rlocators[i], isRedo);
}
@@ -996,6 +1126,45 @@ smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
return InvalidBlockNumber;
}
+/*
+ * smgrbumpcachednblocks() -- Raise the cached size of a fork to at least
+ * nblocks.
+ *
+ * The cached value is only ever set or increased here, never lowered.  In
+ * recovery, when nothing is cached yet, the storage manager is first asked
+ * for the fork's current size (with interrupts held) and the larger of the
+ * two values is cached.
+ */
+void
+smgrbumpcachednblocks(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+{
+ if (InRecovery &&
+ reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber)
+ {
+ BlockNumber authoritative;
+
+ HOLD_INTERRUPTS();
+ authoritative = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
+ RESUME_INTERRUPTS();
+
+ if (authoritative > nblocks)
+ nblocks = authoritative;
+ }
+
+ if (reln->smgr_cached_nblocks[forknum] == InvalidBlockNumber ||
+ reln->smgr_cached_nblocks[forknum] < nblocks)
+ reln->smgr_cached_nblocks[forknum] = nblocks;
+}
+
+/*
+ * smgrpretruncate() -- Call the storage manager's smgr_pretruncate callback
+ * once per fork.
+ *
+ * forknum, old_nblocks and nblocks are parallel arrays of length nforks;
+ * truncate_lsn is passed unchanged for every fork.  No-op when the storage
+ * manager has no smgr_pretruncate callback.
+ */
+void
+smgrpretruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
+ BlockNumber *old_nblocks, BlockNumber *nblocks,
+ XLogRecPtr truncate_lsn)
+{
+ int i;
+
+ if (smgrsw[reln->smgr_which].smgr_pretruncate == NULL)
+ return;
+
+ for (i = 0; i < nforks; i++)
+ {
+ smgrsw[reln->smgr_which].smgr_pretruncate(reln, forknum[i],
+ old_nblocks[i], nblocks[i],
+ truncate_lsn);
+ }
+}
+
/*
* smgrtruncate() -- Truncate the given forks of supplied relation to
* each specified numbers of blocks
@@ -1177,7 +1346,8 @@ void
pgaio_io_set_target_smgr(PgAioHandle *ioh,
SMgrRelationData *smgr,
ForkNumber forknum,
- BlockNumber blocknum,
+ BlockNumber logical_blocknum,
+ BlockNumber physical_blocknum,
int nblocks,
bool skip_fsync)
{
@@ -1188,7 +1358,8 @@ pgaio_io_set_target_smgr(PgAioHandle *ioh,
/* backend is implied via IO owner */
sd->smgr.rlocator = smgr->smgr_rlocator.locator;
sd->smgr.forkNum = forknum;
- sd->smgr.blockNum = blocknum;
+ sd->smgr.blockNum = logical_blocknum;
+ sd->smgr.physBlockNum = physical_blocknum;
sd->smgr.nblocks = nblocks;
sd->smgr.is_temp = SmgrIsTemp(smgr);
/* Temp relations should never be fsync'd */
@@ -1226,11 +1397,13 @@ smgr_aio_reopen(PgAioHandle *ioh)
pg_unreachable();
break;
case PGAIO_OP_READV:
- od->read.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
+ od->read.fd = smgrfd(reln, sd->smgr.forkNum,
+ sd->smgr.physBlockNum, &off);
Assert(off == od->read.offset);
break;
case PGAIO_OP_WRITEV:
- od->write.fd = smgrfd(reln, sd->smgr.forkNum, sd->smgr.blockNum, &off);
+ od->write.fd = smgrfd(reln, sd->smgr.forkNum,
+ sd->smgr.physBlockNum, &off);
Assert(off == od->write.offset);
break;
}
diff --git a/src/backend/storage/smgr/umbra.c b/src/backend/storage/smgr/umbra.c
index bbb870ab8e..2baf64defe 100644
--- a/src/backend/storage/smgr/umbra.c
+++ b/src/backend/storage/smgr/umbra.c
@@ -1,83 +1,270 @@
/*-------------------------------------------------------------------------
*
* umbra.c
- * Umbra storage manager skeleton.
+ * Umbra storage manager: MAP translation + physical segment manager.
*
- * This file establishes Umbra as a separate smgr implementation from md.c. It
- * maintains relation-local metadata and MAP checkpoint/cache state while using
- * md.c for data-fork I/O and umfile for metadata-file I/O.
+ * Umbra implements a separate smgr that translates logical block numbers to
+ * physical block numbers for mapped data forks.
*
- * src/backend/storage/smgr/umbra.c
+ * The mapping is stored in Umbra's internal metadata fork and cached in
+ * shared memory by the MAP subsystem (src/backend/storage/map/).
+ *
+ * Layering in this file is intentionally split:
+ * 1. access semantics: classify relation/fork access state
+ * 2. mapping facts: consume MAP lookups/logical EOF/frontier facts
+ * 3. execution: issue physical file I/O
+ *
+ * map.c owns facts and metadata storage actions. umbra.c owns runtime
+ * interpretation of those facts for reads, writes, and publication.
+ *
+ * For correctness create/open establishes a steady-state base policy once:
+ * - permanent mapped relations: REQUIRE_MAP
+ * - temp/unlogged/direct relations: BYPASS_MAP
+ *
+ * INIT and MAP forks always use direct physical addressing.
*
*-------------------------------------------------------------------------
*/
+
#include "postgres.h"
+#include "access/xlog.h"
+#include "access/xlogrecovery.h"
#include "access/xlogutils.h"
#include "catalog/pg_class.h"
+#include "catalog/storage.h"
#include "common/relpath.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/aio.h"
+#include "storage/aio_internal.h"
#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
#include "storage/map.h"
#include "storage/md.h"
#include "storage/smgr.h"
-#include "storage/umfile.h"
#include "storage/umbra.h"
+#include "storage/umfile.h"
+#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/wait_event.h"
-typedef struct UmbraSmgrRelationState
+/*
+ * Per-access classification produced by um_classify_access().
+ *
+ * policy is the MAP policy that applies to the (relation, fork) being
+ * accessed.  map_available is true only when that policy is
+ * UMBRA_MAP_POLICY_REQUIRE_MAP; helpers that see it false fall back to
+ * direct addressing (lblkno == pblkno) or return early.
+ */
+typedef struct UmbraAccessState
+{
+ UmbraMapPolicy policy;
+ bool map_available;
+} UmbraAccessState;
+
+/*
+ * Per-SMgrRelation Umbra state, stored in reln->smgr_private and allocated
+ * in TopMemoryContext by um_state_acquire().
+ *
+ * map_state holds an UmbraMapPolicy value narrowed to uint8; it starts out
+ * as UMBRA_MAP_POLICY_UNKNOWN until a policy is cached.
+ */
+typedef struct UmbraRelationState
+{
+ UmbraFileContext *file_ctx; /* cached borrow from umfile registry */
+ uint8 map_state;
+} UmbraRelationState;
+
+/*
+ * Caller-owned scratch cache for a fork's logical block count, so that
+ * repeated EOF checks within one operation fetch it at most once.
+ * logical_nblocks is meaningful only while have_logical_nblocks is true.
+ */
+typedef struct UmbraAccessLookupState
+{
+ bool have_logical_nblocks;
+ BlockNumber logical_nblocks;
+} UmbraAccessLookupState;
+
+/*
+ * Kind of access being resolved; passed as the "mode" argument of
+ * um_resolve_lblk_for_access().
+ */
+typedef enum UmbraAccessResolveMode
+{
+ UMBRA_ACCESS_RESOLVE_READ,
+ UMBRA_ACCESS_RESOLVE_WRITE,
+ UMBRA_ACCESS_RESOLVE_WRITEBACK
+} UmbraAccessResolveMode;
+
+/*
+ * Outcome reported by um_resolve_lblk_for_access().
+ *
+ * NOTE(review): judging by the names only, PBLK means a physical block was
+ * resolved, ZERO means the page should be treated as all-zeros, and SKIP
+ * means no I/O is needed -- confirm against the resolver body.
+ */
+typedef enum UmbraAccessResolveResult
+{
+ UMBRA_ACCESS_RESOLVED_PBLK,
+ UMBRA_ACCESS_RESOLVED_ZERO,
+ UMBRA_ACCESS_RESOLVED_SKIP
+} UmbraAccessResolveResult;
+
+/*
+ * Return value of um_publish_mapped_birth().
+ *
+ * NOTE(review): presumably pblkno is the physical block chosen for the
+ * logical block and mapping_published tells whether the mapping was made
+ * visible by that call -- confirm against um_publish_mapped_birth().
+ */
+typedef struct UmbraMappedBirthResult
+{
+ BlockNumber pblkno;
+ bool mapping_published;
+} UmbraMappedBirthResult;
+
+#define UMBRA_WRITE_BARRIER_WAIT_RETRIES 10000
+#define UMBRA_WRITE_BARRIER_WAIT_USEC 1000
+
+/*
+ * Return the Umbra state attached to reln, or NULL if none has been
+ * attached.  Never allocates.
+ */
+static inline UmbraRelationState *
+um_state_lookup(SMgrRelation reln)
+{
+ return (UmbraRelationState *) reln->smgr_private;
+}
+
+/*
+ * Return reln's Umbra state, creating it on first use.
+ *
+ * A freshly allocated state is attached to reln->smgr_private *before* the
+ * umfile context is acquired.  Should umfile_ctx_acquire() error out, the
+ * TopMemoryContext allocation therefore stays reachable (with file_ctx still
+ * NULL and map_state UNKNOWN) and the acquire is simply retried on the next
+ * call, instead of the allocation being leaked.
+ */
+static inline UmbraRelationState *
+um_state_acquire(SMgrRelation reln)
+{
+ UmbraRelationState *state = (UmbraRelationState *) reln->smgr_private;
+
+ if (state == NULL)
+ {
+ state = MemoryContextAllocZero(TopMemoryContext, sizeof(*state));
+ state->map_state = UMBRA_MAP_POLICY_UNKNOWN;
+ reln->smgr_private = state;
+ }
+
+ /* Covers a brand-new state as well as one whose context is still unset. */
+ if (state->file_ctx == NULL)
+ state->file_ctx = umfile_ctx_acquire(reln->smgr_rlocator);
+
+ return state;
+}
+
+/*
+ * Return reln's umfile context, creating the per-relation Umbra state (and
+ * acquiring the context) if that has not happened yet.
+ */
+static inline UmbraFileContext *
+um_ctx_acquire(SMgrRelation reln)
{
- UmbraFileContext *filectx;
-} UmbraSmgrRelationState;
+ return um_state_acquire(reln)->file_ctx;
+}
+
+/*
+ * Non-creating counterpart of um_ctx_acquire(): hand back the cached umfile
+ * context, or NULL when reln has no Umbra state attached.
+ */
+static inline UmbraFileContext *
+um_ctx_lookup(SMgrRelation reln)
+{
+ UmbraRelationState *rel_state = um_state_lookup(reln);
+
+ if (rel_state == NULL)
+ return NULL;
+
+ return rel_state->file_ctx;
+}
+
+/*
+ * Peek at the MAP policy cached for reln without creating any state.
+ * UMBRA_MAP_POLICY_UNKNOWN is reported when no state is attached.
+ */
+static inline UmbraMapPolicy
+um_map_state_cached(SMgrRelation reln)
+{
+ UmbraRelationState *rel_state = um_state_lookup(reln);
+
+ return (rel_state != NULL)
+ ? (UmbraMapPolicy) rel_state->map_state
+ : UMBRA_MAP_POLICY_UNKNOWN;
+}
+
+/*
+ * Record map_state as reln's cached MAP policy, creating the Umbra state on
+ * demand.  UMBRA_MAP_POLICY_UNKNOWN is not a legal value to store.
+ */
+static inline void
+um_set_cached_map_state(SMgrRelation reln, UmbraMapPolicy map_state)
+{
+ Assert(map_state != UMBRA_MAP_POLICY_UNKNOWN);
+
+ um_state_acquire(reln)->map_state = (uint8) map_state;
+}
+
+/*
+ * Detach and free reln's Umbra state, if any.
+ *
+ * Only the state struct itself is freed here; the file_ctx it points to is
+ * not released by this function.
+ */
+static inline void
+um_state_destroy(SMgrRelation reln)
+{
+ UmbraRelationState *state = um_state_lookup(reln);
+
+ if (state == NULL)
+ return;
-static bool um_tracks_identity_metadata(ForkNumber forknum);
-static UmbraFileContext *um_relation_filectx(SMgrRelation reln);
-static void um_ensure_redo_metadata(SMgrRelation reln, ForkNumber forknum);
-static void um_identity_update_metadata(SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks, bool fork_exists);
-static void um_refresh_identity_metadata(SMgrRelation reln);
+ /* Clear the back-pointer before freeing so reln never dangles. */
+ reln->smgr_private = NULL;
+ pfree(state);
+}
+
+/* Runtime access semantics. */
+static UmbraMapPolicy um_map_policy_for_access(SMgrRelation reln,
+ ForkNumber forknum);
+static UmbraAccessState um_classify_access(SMgrRelation reln,
+ ForkNumber forknum);
+static void um_report_unmapped_map_entry(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber lblkno);
+static BlockNumber umnblocks_for_access(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access);
+static bool um_lblk_precedes_logical_eof_for_access(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber lblkno);
+static bool um_is_logical_unmaterialized_for_access(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber lblkno);
+static UmbraAccessResolveResult um_resolve_lblk_for_access(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber lblkno,
+ UmbraAccessResolveMode mode,
+ BlockNumber *pblkno);
+static BlockNumber um_resolve_mapped_read_run(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber blocknum,
+ BlockNumber maxblocks,
+ BlockNumber *start_pblk);
+static void um_complete_zero_readv(PgAioHandle *ioh, SMgrRelation reln,
+ ForkNumber forknum, BlockNumber blocknum,
+ void *buffer);
+static bool um_is_stale_post_truncate_lblk_for_access(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber lblkno);
+static bool um_is_stale_post_truncate_lblk_with_eof(ForkNumber forknum,
+ BlockNumber logical_nblocks,
+ BlockNumber lblkno);
+
+/* MAP facts and storage actions consumed by Umbra semantics. */
+static void um_ensure_datafork_batch_ready_for_access(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber pblkno,
+ bool skipFsync);
+static void um_reserve_fresh_pblkno_for_access(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber lblkno,
+ BlockNumber *new_pblkno);
+static bool um_fork_uses_map_translation(ForkNumber forknum);
+static bool um_mapped_exists_from_super(SMgrRelation reln, ForkNumber forknum);
+static UmbraMapPolicy um_open_map_state(SMgrRelation reln);
+static bool um_state_uses_map(UmbraMapPolicy state);
+static bool um_state_requires_durable_sync(UmbraMapPolicy state);
+static bool um_relation_requires_durable_sync(SMgrRelation reln);
+static UmbraMappedBirthResult um_publish_mapped_birth(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber lblkno,
+ bool allow_wal_owned_firstborn);
static void um_filetag_path(const FileTag *ftag, char *path);
+/*
+ * Report whether reln's Umbra metadata fork exists, as answered by
+ * umfile_exists() in UMFILE_EXISTS_DENSE mode.
+ */
bool
UmMetadataExists(SMgrRelation reln)
{
- return umfile_exists(um_relation_filectx(reln),
- UMBRA_METADATA_FORKNUM,
+ return umfile_exists(um_ctx_acquire(reln), UMBRA_METADATA_FORKNUM,
UMFILE_EXISTS_DENSE);
}
+/*
+ * Open reln's Umbra metadata fork, creating it if needed.
+ *
+ * isRedo and created are forwarded unchanged to umfile_open_or_create(),
+ * whose result is returned as-is.
+ */
bool
UmMetadataOpenOrCreate(SMgrRelation reln, bool isRedo, bool *created)
{
- return umfile_open_or_create(um_relation_filectx(reln),
- UMBRA_METADATA_FORKNUM,
- isRedo,
- created);
+ return umfile_open_or_create(um_ctx_acquire(reln), UMBRA_METADATA_FORKNUM,
+ isRedo, created);
}
+/*
+ * Return the block count of reln's Umbra metadata fork, as reported by
+ * umfile_nblocks() in UMFILE_NBLOCKS_DENSE mode.
+ */
BlockNumber
UmMetadataNblocks(SMgrRelation reln)
{
- return umfile_nblocks(um_relation_filectx(reln),
- UMBRA_METADATA_FORKNUM,
+ return umfile_nblocks(um_ctx_acquire(reln), UMBRA_METADATA_FORKNUM,
UMFILE_NBLOCKS_DENSE);
}
+/*
+ * Read a single metadata-fork block, blkno, into buffer.
+ *
+ * umfile_readv() takes an array of buffer pointers; the address of the
+ * "buffer" parameter serves as a one-element array.
+ */
void
UmMetadataRead(SMgrRelation reln, BlockNumber blkno, void *buffer)
{
- void *buffers[1];
-
- buffers[0] = buffer;
- umfile_readv(um_relation_filectx(reln), UMBRA_METADATA_FORKNUM, blkno,
- buffers, 1);
+ umfile_readv(um_ctx_acquire(reln), UMBRA_METADATA_FORKNUM, blkno,
+ &buffer, 1);
}
void
UmMetadataWrite(SMgrRelation reln, BlockNumber blkno, const void *buffer,
bool skipFsync)
{
- UmbraFileContext *ctx = um_relation_filectx(reln);
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
umfile_ctx_write(ctx, UMBRA_METADATA_FORKNUM, blkno,
buffer, BLCKSZ, skipFsync);
@@ -94,7 +281,8 @@ UmMetadataWriteSuperblock(RelFileLocatorBackend rlocator, const void *sector,
/*
* Superblock checkpoint flush can run while holding MapSuperEntry->lock,
- * so it must not recurse through smgr/umopen.
+ * so it must not reopen the relation via smgr/umopen and recurse into MAP
+ * state lookup.
*/
umfile_ctx_write(ctx, UMBRA_METADATA_FORKNUM, MAP_BLOCK_SUPER,
sector, MAP_SUPERBLOCK_SIZE, skipFsync);
@@ -107,15 +295,20 @@ void
UmMetadataExtend(SMgrRelation reln, BlockNumber blkno, const void *buffer,
bool skipFsync)
{
- umfile_extend(um_relation_filectx(reln), UMBRA_METADATA_FORKNUM, blkno,
+ umfile_extend(um_ctx_acquire(reln), UMBRA_METADATA_FORKNUM, blkno,
buffer, skipFsync);
}
+/*
+ * Synchronously flush reln's Umbra metadata fork by delegating to
+ * umimmedsync() on UMBRA_METADATA_FORKNUM.
+ */
void
UmMetadataImmediateSync(SMgrRelation reln)
{
- MapCheckpointRelation(reln->smgr_rlocator.locator);
- umfile_immedsync(um_relation_filectx(reln), UMBRA_METADATA_FORKNUM);
+ umimmedsync(reln, UMBRA_METADATA_FORKNUM);
+}
+
+/*
+ * Request durability for reln's Umbra metadata fork.
+ *
+ * NOTE(review): the body is currently identical to
+ * UmMetadataImmediateSync() -- it performs an immediate sync rather than
+ * registering one.  The caller in ummapcreate() documents that
+ * "checkpoint/restartpoint owns syncing" the new file, which suggests a
+ * deferred registration was intended here; confirm the intended callee.
+ */
+void
+UmMetadataRegisterSync(SMgrRelation reln)
+{
+ umimmedsync(reln, UMBRA_METADATA_FORKNUM);
}
void
@@ -124,66 +317,139 @@ UmMetadataUnlink(RelFileLocatorBackend rlocator, bool isRedo)
umfile_unlink(rlocator, UMBRA_METADATA_FORKNUM, isRedo);
}
-void
-UmInvalidateDatabase(Oid dbid)
+/*
+ * Make sure the physical data fork is large enough to hold block pblkno.
+ *
+ * No-op when the access does not use MAP or when pblkno is
+ * InvalidBlockNumber; otherwise asks the MAP superblock layer to ensure at
+ * least pblkno + 1 physical blocks, forwarding skipFsync.
+ */
+static void
+um_ensure_datafork_batch_ready_for_access(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber pblkno,
+ bool skipFsync)
{
- FileTag tag;
- RelFileLocator rlocator;
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
- MapInvalidateDatabase(dbid);
+ if (!access->map_available)
+ return;
- rlocator.spcOid = 0;
- rlocator.dbOid = dbid;
- rlocator.relNumber = 0;
+ if (pblkno == InvalidBlockNumber)
+ return;
- memset(&tag, 0, sizeof(tag));
- tag.handler = SYNC_HANDLER_UMBRA;
- tag.rlocator = rlocator;
- tag.forknum = InvalidForkNumber;
- tag.segno = InvalidBlockNumber;
+ /*
+ * NOTE(review): assuming BlockNumber is a 32-bit unsigned type and
+ * InvalidBlockNumber is its all-ones value, pblkno + 1 can only wrap to 0
+ * for InvalidBlockNumber, which already returned above -- so this check
+ * looks unreachable.  Confirm whether the early return or the ERROR is
+ * the intended behavior.
+ */
+ if (pblkno + 1 == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("physical block overflow for relation %u/%u/%u fork %d",
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ forknum)));
- RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true);
+ (void) MapSBlockEnsurePhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, pblkno + 1, skipFsync);
}
+/*
+ * Install logical->physical mappings for the contiguous logical range
+ * [firstblock, firstblock + nblocks), where pblknos[i] is the physical
+ * block for logical block firstblock + i.
+ *
+ * Steps, in order:
+ *  1. reject a logical range that wraps around BlockNumber;
+ *  2. find the highest physical block referenced and make sure the physical
+ *     fork covers it;
+ *  3. set every mapping, stamped with lsn;
+ *  4. bump the recorded physical block count and release the in-flight
+ *     entries for the range;
+ *  5. bump the recorded logical block count to cover the range.
+ *
+ * InvalidBlockNumber doubles as the "no physical block seen yet" sentinel
+ * for max_pblkno, so entries equal to InvalidBlockNumber are skipped while
+ * computing the maximum.  Without that, such an entry would overwrite the
+ * running maximum and the following entry would reset it, making the result
+ * depend on the order of pblknos[].
+ */
void
-uminit(void)
+UmApplyReservedRangeRemap(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber firstblock, BlockNumber nblocks,
+ const BlockNumber *pblknos,
+ XLogRecPtr lsn, bool skipFsync)
{
- umfile_init();
- MapBackendInit();
-}
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ BlockNumber max_lblkno;
+ BlockNumber max_pblkno = InvalidBlockNumber;
-void
-umopen(SMgrRelation reln)
-{
- UmbraSmgrRelationState *state;
+ Assert(nblocks > 0);
+ Assert(pblknos != NULL);
+
+ max_lblkno = firstblock + nblocks - 1;
+ if (max_lblkno < firstblock)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("logical range overflow for relation %u/%u/%u fork %d",
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ forknum)));
+
+ for (BlockNumber i = 0; i < nblocks; i++)
+ {
+ BlockNumber pblk = pblknos[i];
+
+ /* The sentinel value must never take part in the max computation. */
+ if (pblk == InvalidBlockNumber)
+ continue;
+
+ if (max_pblkno == InvalidBlockNumber || pblk > max_pblkno)
+ max_pblkno = pblk;
+ }
+
+ if (max_pblkno != InvalidBlockNumber)
+ {
+ (void) MapSBlockEnsurePhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, max_pblkno + 1, skipFsync);
+ }
- Assert(reln->smgr_private == NULL);
+ for (BlockNumber i = 0; i < nblocks; i++)
+ {
+ BlockNumber lblk = firstblock + i;
+ BlockNumber pblk = pblknos[i];
- state = MemoryContextAllocZero(TopMemoryContext,
- sizeof(UmbraSmgrRelationState));
- state->filectx = umfile_ctx_acquire(reln->smgr_rlocator);
- reln->smgr_private = state;
+ MapSetMapping(ctx, reln->smgr_rlocator.locator, forknum, lblk, pblk, lsn);
+ }
- mdopen(reln);
+ if (max_pblkno != InvalidBlockNumber)
+ {
+ MapSBlockBumpPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, max_pblkno + 1, lsn);
+ for (BlockNumber i = 0; i < nblocks; i++)
+ MapInflightRelease(reln->smgr_rlocator.locator, forknum,
+ firstblock + i);
+ }
+ MapSBlockBumpLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, max_lblkno + 1, lsn);
}
-void
-umclose(SMgrRelation reln, ForkNumber forknum)
+/*
+ * Thin wrapper around UmApplyReservedRangeRemap() that forwards all
+ * arguments and always returns true.
+ */
+bool
+umapplyreservedrange(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber firstblock, BlockNumber nblocks,
+ const BlockNumber *pblknos,
+ XLogRecPtr lsn, bool skipFsync)
{
- mdclose(reln, forknum);
+ UmApplyReservedRangeRemap(reln, forknum, firstblock, nblocks,
+ pblknos, lsn, skipFsync);
+ return true;
}
-void
-umdestroy(SMgrRelation reln)
+/*
+ * Create and initialize MAP fork for a relation.
+ *
+ * Keep creation O(1): create/open the MAP fork and write only the superblock
+ * sector. Regular MAP pages are synthesized on first access and written
+ * lazily by the MAP layer.
+ *
+ * Raises ERROR if the metadata fork can be neither opened nor created.  An
+ * already-existing fork is left untouched.
+ */
+
+static void
+ummapcreate(SMgrRelation reln)
{
- UmbraSmgrRelationState *state = reln->smgr_private;
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ bool newly_created = false;
- if (state != NULL)
- {
- umfile_ctx_forget(reln->smgr_rlocator);
- pfree(state);
- reln->smgr_private = NULL;
- }
+ /*
+ * Open existing MAP fork or create new one. During redo, EEXIST is
+ * acceptable and we reuse the existing file.
+ *
+ * The result must be checked: on failure newly_created carries no
+ * information, and continuing would either skip superblock initialization
+ * silently or initialize a fork that was never opened.
+ */
+ if (!UmMetadataOpenOrCreate(reln, true /* isRedo */, &newly_created))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create Umbra metadata fork for relation %u/%u/%u",
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber)));
+
+ /* Existing MAP fork does not need re-initialization. */
+ if (!newly_created)
+ return;
+
+ /*
+ * Superblock identity is established locally here and reconstructed during
+ * redo from relation creation, so no separate Umbra rmgr record is needed.
+ */
+ Assert(um_map_state_cached(reln) != UMBRA_MAP_POLICY_UNKNOWN);
+ MapSBlockInit(ctx, reln->smgr_rlocator.locator, InvalidXLogRecPtr);
+
+ /*
+ * Keep metadata fork durability aligned with main-fork create semantics:
+ * the file is created now, while checkpoint/restartpoint owns syncing it.
+ */
+ if (!SmgrIsTemp(reln))
+ UmMetadataRegisterSync(reln);
}
bool
@@ -199,49 +465,60 @@ umcreatedballowswallog(void)
}
+/*
+ * Establish the MAP policy for a newly created relation.
+ *
+ * Relations that need WAL get UMBRA_MAP_POLICY_REQUIRE_MAP and have their
+ * metadata fork created; all others get UMBRA_MAP_POLICY_BYPASS_MAP and no
+ * metadata fork.
+ */
void
-umcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
- const Oid *tablespace_ids)
+uminitnewrelation(SMgrRelation reln, bool needs_wal)
{
- MapCheckpointDatabaseTablespaces(dbid, ntablespaces, tablespace_ids);
+ umsetmapstate(reln, needs_wal ?
+ UMBRA_MAP_POLICY_REQUIRE_MAP :
+ UMBRA_MAP_POLICY_BYPASS_MAP);
+ if (needs_wal)
+ umcreaterelationmetadata(reln);
}
+/*
+ * Public entry point for creating reln's Umbra metadata (MAP) fork; simply
+ * delegates to ummapcreate().
+ */
void
-uminvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
- const Oid *tablespace_ids)
+umcreaterelationmetadata(SMgrRelation reln)
{
- MapInvalidateDatabaseTablespaces(dbid, ntablespaces, tablespace_ids);
+ ummapcreate(reln);
}
+/*
+ * Redo-side handling of fork creation.
+ *
+ *  - MAIN fork: cache REQUIRE_MAP and create the metadata fork.
+ *  - Forks that do not use MAP translation: cache BYPASS_MAP and stop.
+ *  - Other mapped forks: cache REQUIRE_MAP, create the metadata fork if it
+ *    is missing, and for auxiliary mapped forks reset the fork's logical
+ *    block count to 0 at lsn.
+ *
+ * NOTE(review): the cached policy is per relation, not per fork, so the
+ * BYPASS_MAP branch overwrites whatever policy was cached for the relation.
+ * Confirm this cannot downgrade a relation whose MAIN fork is mapped.
+ */
void
-umcreaterelationmetadata(SMgrRelation reln)
+umredocreatefork(SMgrRelation reln, ForkNumber forknum, XLogRecPtr lsn)
{
- UmbraFileContext *ctx = um_relation_filectx(reln);
- bool created = false;
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
- /*
- * smgrcreaterelationmetadata() is used both in normal create and redo
- * paths, so tolerate an already-existing metadata fork here.
- */
- if (!UmMetadataOpenOrCreate(reln, true, &created))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not create Umbra metadata fork for relation %u/%u/%u",
- reln->smgr_rlocator.locator.spcOid,
- reln->smgr_rlocator.locator.dbOid,
- reln->smgr_rlocator.locator.relNumber)));
+ if (forknum == MAIN_FORKNUM)
+ {
+ umsetmapstate(reln, UMBRA_MAP_POLICY_REQUIRE_MAP);
+ umcreaterelationmetadata(reln);
+ return;
+ }
- elog(DEBUG1, "umbra metadata open/create %u/%u/%u created=%s",
- reln->smgr_rlocator.locator.spcOid,
- reln->smgr_rlocator.locator.dbOid,
- reln->smgr_rlocator.locator.relNumber,
- created ? "true" : "false");
+ if (!UmbraForkUsesMapTranslation(forknum))
+ {
+ umsetmapstate(reln, UMBRA_MAP_POLICY_BYPASS_MAP);
+ return;
+ }
- if (created)
- MapSBlockInit(ctx, reln->smgr_rlocator.locator, InvalidXLogRecPtr);
- else
- (void) MapSBlockEnsureLoaded(ctx, reln->smgr_rlocator.locator);
+ umsetmapstate(reln, UMBRA_MAP_POLICY_REQUIRE_MAP);
+ if (!UmMetadataExists(reln))
+ umcreaterelationmetadata(reln);
+
+ if (UmbraForkIsAuxiliaryMapped(forknum))
+ MapSBlockSetLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, 0, lsn);
+}
+
+/*
+ * smgr-level entry point that forwards to
+ * MapCheckpointDatabaseTablespaces() with the same arguments.
+ */
+void
+umcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids)
+{
+ MapCheckpointDatabaseTablespaces(dbid, ntablespaces, tablespace_ids);
+}
- um_refresh_identity_metadata(reln);
+/*
+ * smgr-level entry point that forwards to
+ * MapInvalidateDatabaseTablespaces() with the same arguments.
+ */
+void
+uminvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids)
+{
+ MapInvalidateDatabaseTablespaces(dbid, ntablespaces, tablespace_ids);
}
void
@@ -257,7 +534,7 @@ umcopyrelationmetadata(SMgrRelation src, SMgrRelation dst, char relpersistence)
if (!UmMetadataExists(src))
return;
- umcreaterelationmetadata(dst);
+ ummapcreate(dst);
src_nblocks = UmMetadataNblocks(src);
dst_nblocks = UmMetadataNblocks(dst);
@@ -291,333 +568,795 @@ umunlinkrelationmetadata(RelFileLocatorBackend rlocator, bool isRedo)
}
+/*
+ * One-time Umbra initialization: set up the umfile layer first, then the
+ * MAP subsystem's backend state.
+ */
void
-umcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+uminit(void)
{
- mdcreate(reln, forknum, isRedo);
-
- /*
- * Redo for permanent relation creation reaches smgrcreate() directly, so
- * make sure the metadata fork exists before later recovery steps touch the
- * relation again.
- */
- if (isRedo &&
- forknum == MAIN_FORKNUM &&
- !UmMetadataExists(reln))
- umcreaterelationmetadata(reln);
-
- if (forknum != MAIN_FORKNUM &&
- um_tracks_identity_metadata(forknum) &&
- UmMetadataExists(reln))
- um_identity_update_metadata(reln, forknum, 0, true);
+ umfile_init();
+ MapBackendInit();
}
-bool
-umexists(SMgrRelation reln, ForkNumber forknum)
+/*
+ * Cleanup hook that forwards to MapBackendExitCleanup().
+ *
+ * NOTE(review): the name suggests this runs as a before_shmem_exit
+ * callback; the registration is not visible here -- confirm.
+ */
+void
+umbeforeshmemexitcleanup(void)
{
- if (forknum == UMBRA_METADATA_FORKNUM)
- return UmMetadataExists(reln);
-
- return mdexists(reln, forknum);
+ MapBackendExitCleanup();
}
+/*
+ * Open-time setup for an SMgrRelation: make sure the Umbra state and umfile
+ * context exist, then cache the relation's MAP policy unless one is already
+ * cached (for example by umsetmapstate()).
+ */
void
-umunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
+umopen(SMgrRelation reln)
{
- if (forknum == UMBRA_METADATA_FORKNUM ||
- forknum == MAIN_FORKNUM ||
- forknum == InvalidForkNumber)
- {
- MapInvalidateRelation(rlocator.locator);
- }
+ (void) um_ctx_acquire(reln);
- if (forknum == UMBRA_METADATA_FORKNUM)
- {
- UmMetadataUnlink(rlocator, isRedo);
+ if (um_map_state_cached(reln) != UMBRA_MAP_POLICY_UNKNOWN)
return;
- }
-
- if (forknum == MAIN_FORKNUM || forknum == InvalidForkNumber)
- UmMetadataUnlink(rlocator, isRedo);
- mdunlink(rlocator, forknum, isRedo);
+ um_set_cached_map_state(reln, um_open_map_state(reln));
}
+/*
+ * Close one fork of reln.  Uses the non-creating context lookup, so closing
+ * a relation that never acquired Umbra state is a no-op.
+ */
void
-umextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- const void *buffer, bool skipFsync)
+umclose(SMgrRelation reln, ForkNumber forknum)
{
- um_ensure_redo_metadata(reln, forknum);
- mdextend(reln, forknum, blocknum, buffer, skipFsync);
+ UmbraFileContext *ctx = um_ctx_lookup(reln);
- if (um_tracks_identity_metadata(forknum) && UmMetadataExists(reln))
- um_identity_update_metadata(reln, forknum, blocknum + 1, true);
+ if (ctx != NULL)
+ umfile_ctx_close_fork(ctx, forknum);
}
+/*
+ * Tear down Umbra's per-relation resources: tell the umfile registry to
+ * forget the relation's context, then free the state hung off reln.
+ */
void
-umzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- int nblocks, bool skipFsync)
+umdestroy(SMgrRelation reln)
{
- BlockNumber target_nblocks;
-
- um_ensure_redo_metadata(reln, forknum);
- mdzeroextend(reln, forknum, blocknum, nblocks, skipFsync);
-
- if (!um_tracks_identity_metadata(forknum) || !UmMetadataExists(reln))
- return;
-
- target_nblocks = blocknum + (BlockNumber) nblocks;
- if (target_nblocks < blocknum)
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("Umbra identity mapping block count overflow")));
-
- um_identity_update_metadata(reln, forknum, target_nblocks, true);
+ umfile_ctx_forget(reln->smgr_rlocator);
+ um_state_destroy(reln);
}
-bool
-umprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- int nblocks)
+/*
+ * Externally set the cached MAP policy for reln.  map_state is an
+ * UmbraMapPolicy value passed as uint8.
+ *
+ * NOTE(review): the second Assert rejects anything numerically above
+ * UMBRA_MAP_POLICY_REQUIRE_MAP.  Whether that admits
+ * UMBRA_MAP_POLICY_SKIP_WAL_PENDING_MAP depends on the enum's declaration
+ * order, which is not visible here -- confirm.
+ */
+void
+umsetmapstate(SMgrRelation reln, uint8 map_state)
{
- return mdprefetch(reln, forknum, blocknum, nblocks);
+ Assert(map_state != UMBRA_MAP_POLICY_UNKNOWN);
+ Assert(map_state <= UMBRA_MAP_POLICY_REQUIRE_MAP);
+ um_set_cached_map_state(reln, (UmbraMapPolicy) map_state);
}
-uint32
-ummaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+/*
+ * File-local alias for UmbraForkUsesMapTranslation(): whether blocks of
+ * forknum are addressed through MAP translation.
+ */
+static bool
+um_fork_uses_map_translation(ForkNumber forknum)
{
- return mdmaxcombine(reln, forknum, blocknum);
+ return UmbraForkUsesMapTranslation(forknum);
}
-void
-umreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- void **buffers, BlockNumber nblocks)
+/*
+ * NOTE(review): the paragraph below describes first-born remap ownership and
+ * does not describe um_mapped_exists_from_super(); it appears to belong with
+ * the mapped-birth publication logic.
+ *
+ * MAIN fork is the only one that uses page-WAL-owned first-born remap.
+ *
+ * FSM/VM growth is much more structured: their extend/truncate producers are
+ * concentrated in a few helper paths, so we keep them on explicit mapping
+ * publication rather than tying first-born ownership to arbitrary page WAL.
+ */
+
+/*
+ * um_mapped_exists_from_super() -- does a mapped fork exist, according to
+ * the MAP superblock?
+ *
+ * Returns false when the relation has no metadata fork at all; otherwise
+ * returns what MapSBlockForkExists() reports.  Only valid for forks that use
+ * MAP translation.
+ */
+static bool
+um_mapped_exists_from_super(SMgrRelation reln, ForkNumber forknum)
{
- um_ensure_redo_metadata(reln, forknum);
- mdreadv(reln, forknum, blocknum, buffers, nblocks);
-}
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
-void
-umstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, void **buffers, BlockNumber nblocks)
-{
- um_ensure_redo_metadata(reln, forknum);
- mdstartreadv(ioh, reln, forknum, blocknum, buffers, nblocks);
+ Assert(um_fork_uses_map_translation(forknum));
+
+ if (!UmMetadataExists(reln))
+ return false;
+
+ return MapSBlockForkExists(ctx, reln->smgr_rlocator.locator, forknum);
}
-void
-umwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
- const void **buffers, BlockNumber nblocks, bool skipFsync)
+/*
+ * Derive the MAP policy for a relation being opened without a cached one:
+ *  - temp relation, or no metadata fork on disk: BYPASS_MAP
+ *  - superblock flagged skip-WAL-pending:        SKIP_WAL_PENDING_MAP
+ *  - otherwise:                                  REQUIRE_MAP
+ */
+static UmbraMapPolicy
+um_open_map_state(SMgrRelation reln)
{
- um_ensure_redo_metadata(reln, forknum);
- mdwritev(reln, forknum, blocknum, buffers, nblocks, skipFsync);
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
- if (InRecovery &&
- um_tracks_identity_metadata(forknum) &&
- UmMetadataExists(reln))
- um_identity_update_metadata(reln, forknum, mdnblocks(reln, forknum),
- true);
+ if (RelFileLocatorBackendIsTemp(reln->smgr_rlocator) ||
+ !UmMetadataExists(reln))
+ return UMBRA_MAP_POLICY_BYPASS_MAP;
+
+ if (MapSBlockIsSkipWalPending(ctx,
+ reln->smgr_rlocator.locator))
+ return UMBRA_MAP_POLICY_SKIP_WAL_PENDING_MAP;
+
+ return UMBRA_MAP_POLICY_REQUIRE_MAP;
}
-void
-umwriteback(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, BlockNumber nblocks)
+/*
+ * Only REQUIRE_MAP counts as "uses MAP" for ordinary access; in particular
+ * SKIP_WAL_PENDING_MAP does not.
+ */
+static bool
+um_state_uses_map(UmbraMapPolicy state)
{
- mdwriteback(reln, forknum, blocknum, nblocks);
+ return state == UMBRA_MAP_POLICY_REQUIRE_MAP;
}
-BlockNumber
-umnblocks(SMgrRelation reln, ForkNumber forknum)
+/*
+ * Policies that require the durable flush+sync path: REQUIRE_MAP and
+ * SKIP_WAL_PENDING_MAP.
+ */
+static bool
+um_state_requires_durable_sync(UmbraMapPolicy state)
{
- /*
- * Keep md.c responsible for physical fork size queries. mdtruncate()
- * relies on a preceding mdnblocks() call to have opened active segments.
- */
- return mdnblocks(reln, forknum);
+ return state == UMBRA_MAP_POLICY_REQUIRE_MAP ||
+ state == UMBRA_MAP_POLICY_SKIP_WAL_PENDING_MAP;
}
-void
-umtruncate(SMgrRelation reln, ForkNumber forknum,
- BlockNumber old_blocks, BlockNumber nblocks)
+/*
+ * Commit-time durability is a separate question from ordinary access state.
+ *
+ * log_newpage_range() can describe only physical page images. Once a relation
+ * either actively uses MAP translation or already owns a MAP fork on disk, its
+ * durable transition must go through the Umbra-aware flush+sync path instead
+ * of plain FPI-range logging.
+ *
+ * Returns true when the cached policy is REQUIRE_MAP or SKIP_WAL_PENDING_MAP,
+ * or when a metadata fork exists regardless of the cached policy.  The
+ * policy must already be cached.
+ */
+static bool
+um_relation_requires_durable_sync(SMgrRelation reln)
{
- mdtruncate(reln, forknum, old_blocks, nblocks);
+ UmbraMapPolicy state;
+
+ state = um_map_state_cached(reln);
+ Assert(state != UMBRA_MAP_POLICY_UNKNOWN);
- if (um_tracks_identity_metadata(forknum) && UmMetadataExists(reln))
- um_identity_update_metadata(reln, forknum, nblocks, true);
+ return um_state_requires_durable_sync(state) || UmMetadataExists(reln);
}
-void
-umimmedsync(SMgrRelation reln, ForkNumber forknum)
+/*
+ * Policy that applies to an access of (reln, forknum): forks that do not use
+ * MAP translation are always BYPASS_MAP; every other fork inherits the
+ * relation's cached policy, which must already be known.
+ */
+static UmbraMapPolicy
+um_map_policy_for_access(SMgrRelation reln, ForkNumber forknum)
{
- mdimmedsync(reln, forknum);
+ if (!um_fork_uses_map_translation(forknum))
+ return UMBRA_MAP_POLICY_BYPASS_MAP;
- if (um_tracks_identity_metadata(forknum) && UmMetadataExists(reln))
- UmMetadataImmediateSync(reln);
+ Assert(um_map_state_cached(reln) != UMBRA_MAP_POLICY_UNKNOWN);
+ return um_map_state_cached(reln);
}
-void
-umregistersync(SMgrRelation reln, ForkNumber forknum)
+/*
+ * um_classify_access() -- classify one access to (reln, forknum).
+ *
+ * Fills in the policy from um_map_policy_for_access() and derives
+ * map_available from it (true only for REQUIRE_MAP).  This function does
+ * not probe the MAP fork on disk and never raises an error itself; callers
+ * decide how to handle an access whose map_available is false.
+ */
+static UmbraAccessState
+um_classify_access(SMgrRelation reln, ForkNumber forknum)
{
- mdregistersync(reln, forknum);
-}
+ UmbraAccessState state;
-int
-umfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
-{
- return mdfd(reln, forknum, blocknum, off);
+ state.policy = um_map_policy_for_access(reln, forknum);
+ state.map_available = um_state_uses_map(state.policy);
+
+ return state;
}
-int
-umsyncfiletag(const FileTag *ftag, char *path)
+/*
+ * Raise ERRCODE_DATA_CORRUPTED for a logical block that has no MAP entry.
+ * Does not return.
+ *
+ * For diagnostics the message carries the fork's logical block count (left
+ * as InvalidBlockNumber if it cannot be fetched), plus the MAP page number
+ * and in-page entry index computed here for lblkno.
+ */
+static void
+um_report_unmapped_map_entry(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber lblkno)
{
- File fd;
- int ret;
- int save_errno;
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ BlockNumber logical_nblocks = InvalidBlockNumber;
+ BlockNumber map_blkno;
+ BlockNumber fork_page_idx;
+ int entry_idx;
+ uint64 blkno64;
- um_filetag_path(ftag, path);
+ Assert(access->map_available);
- fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
- if (fd < 0)
- return -1;
+ /* Best effort: the result is ignored, the value is only for the message. */
+ (void) MapSBlockTryGetLogicalNblocks(ctx,
+ reln->smgr_rlocator.locator,
+ forknum, &logical_nblocks);
- ret = FileSync(fd, WAIT_EVENT_DATA_FILE_SYNC);
- save_errno = errno;
+ /* Index of the MAP page within this fork's entries, and slot within it. */
+ fork_page_idx = lblkno / MAP_ENTRIES_PER_PAGE;
+ entry_idx = lblkno % MAP_ENTRIES_PER_PAGE;
- FileClose(fd);
- errno = save_errno;
- return ret;
+ /*
+ * Translate the per-fork page index into a block number of the MAP fork.
+ * Computed in 64 bits so intermediate products cannot wrap.
+ */
+ switch (forknum)
+ {
+ case FSM_FORKNUM:
+ blkno64 = (uint64) MAP_BLOCK_FIRST_GROUP +
+ (uint64) fork_page_idx * (uint64) MAP_GROUP_TOTAL_PAGES;
+ break;
+
+ case VISIBILITYMAP_FORKNUM:
+ blkno64 = (uint64) MAP_BLOCK_FIRST_GROUP +
+ (uint64) fork_page_idx * (uint64) MAP_GROUP_TOTAL_PAGES +
+ (uint64) MAP_GROUP_FSM_PAGES;
+ break;
+
+ case MAIN_FORKNUM:
+ {
+ uint64 group_page_idx = (uint64) fork_page_idx;
+ uint64 group_no = group_page_idx / (uint64) MAP_GROUP_MAIN_PAGES;
+
+ blkno64 = (uint64) MAP_BLOCK_FIRST_GROUP +
+ group_no * (uint64) MAP_GROUP_TOTAL_PAGES +
+ (uint64) MAP_GROUP_FSM_PAGES +
+ (uint64) MAP_GROUP_VM_PAGES +
+ (group_page_idx % (uint64) MAP_GROUP_MAIN_PAGES);
+ break;
+ }
+
+ default:
+ elog(ERROR, "unsupported fork number %d in map miss report",
+ (int) forknum);
+ pg_unreachable();
+ }
+
+ /* Narrowed back to BlockNumber for reporting only. */
+ map_blkno = (BlockNumber) blkno64;
+
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("MAP entry is unmapped: rel=%u/%u/%u fork=%d lblk=%u logical_nblocks=%u map_page=%u entry_idx=%d",
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ forknum,
+ lblkno,
+ logical_nblocks,
+ map_blkno,
+ entry_idx)));
}
-int
-umunlinkfiletag(const FileTag *ftag, char *path)
+/*
+ * Build identity MAP metadata for relations that stayed on direct lblk==pblk
+ * access during a skip-WAL window and now need durable mapped state.
+ *
+ * For every fork that uses MAP translation: a missing fork gets logical
+ * size 0; an existing fork gets an identity mapping for each of its blocks,
+ * and its next-free/physical/logical block counts are set from the file's
+ * current size.  All updates are stamped with InvalidXLogRecPtr.
+ */
+void
+UmRebuildMapAndSuperblockForSkipWAL(SMgrRelation reln)
{
- um_filetag_path(ftag, path);
- return unlink(path);
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+
+ /*
+ * Rebuild assumes the relation stayed on direct lblk==pblk access during
+ * the skip-WAL window. No running path may consume MAP state before this
+ * durable-transition rebuild runs.
+ */
+ Assert(RelFileLocatorSkippingWAL(reln->smgr_rlocator.locator));
+
+ /* Drop any cached MAP state before rebuilding it from the data files. */
+ MapInvalidateRelation(reln->smgr_rlocator.locator);
+ Assert(UmMetadataExists(reln));
+
+ for (ForkNumber forknum = MAIN_FORKNUM; forknum <= VISIBILITYMAP_FORKNUM; forknum++)
+ {
+ BlockNumber nblocks;
+
+ if (!UmbraForkUsesMapTranslation(forknum))
+ continue;
+
+ if (!umfile_exists(ctx, forknum, UMFILE_EXISTS_DENSE))
+ {
+ MapSBlockSetLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, 0, InvalidXLogRecPtr);
+ continue;
+ }
+
+ nblocks = umfile_nblocks(ctx, forknum, UMFILE_NBLOCKS_DENSE);
+
+ /* Identity mapping: one MAP entry per existing block. */
+ for (BlockNumber lblk = 0; lblk < nblocks; lblk++)
+ MapSetMapping(ctx, reln->smgr_rlocator.locator, forknum,
+ lblk, lblk, InvalidXLogRecPtr);
+
+ if (nblocks > 0)
+ {
+ MapSBlockBumpNextFreePhysBlock(ctx, reln->smgr_rlocator.locator,
+ forknum, nblocks, InvalidXLogRecPtr);
+ MapSBlockBumpPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, nblocks, InvalidXLogRecPtr);
+ }
+ MapSBlockSetLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, nblocks, InvalidXLogRecPtr);
+ }
}
-bool
-umfiletagmatches(const FileTag *ftag, const FileTag *candidate)
+/*
+ * Enter the skip-WAL-pending state for reln: create the metadata fork if it
+ * is missing, set the flag in the MAP superblock, and cache
+ * SKIP_WAL_PENDING_MAP as the relation's policy.
+ */
+void
+ummarkskipwalpending(SMgrRelation reln)
{
- if (ftag->forknum == InvalidForkNumber &&
- ftag->segno == InvalidBlockNumber &&
- ftag->rlocator.spcOid == 0 &&
- ftag->rlocator.relNumber == 0)
- return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
- if (ftag->forknum == InvalidForkNumber &&
- ftag->segno == InvalidBlockNumber)
- return RelFileLocatorEquals(ftag->rlocator, candidate->rlocator);
+ if (!UmMetadataExists(reln))
+ ummapcreate(reln);
- if (ftag->segno == InvalidBlockNumber)
- return RelFileLocatorEquals(ftag->rlocator, candidate->rlocator) &&
- ftag->forknum == candidate->forknum;
+ MapSBlockSetSkipWalPending(ctx, reln->smgr_rlocator.locator,
+ true, InvalidXLogRecPtr);
+ um_set_cached_map_state(reln, UMBRA_MAP_POLICY_SKIP_WAL_PENDING_MAP);
+}
- return RelFileLocatorEquals(ftag->rlocator, candidate->rlocator) &&
- ftag->forknum == candidate->forknum &&
- ftag->segno == candidate->segno;
+/*
+ * Leave the skip-WAL-pending state for reln: clear the flag in the MAP
+ * superblock, cache REQUIRE_MAP, and sync the metadata fork immediately.
+ * Does nothing when the relation has no metadata fork.
+ */
+void
+umclearskipwalpending(SMgrRelation reln)
+{
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+
+ if (!UmMetadataExists(reln))
+ return;
+
+ MapSBlockSetSkipWalPending(ctx, reln->smgr_rlocator.locator,
+ false, InvalidXLogRecPtr);
+ um_set_cached_map_state(reln, UMBRA_MAP_POLICY_REQUIRE_MAP);
+ /*
+ * Clearing the durable-transition flag must itself be durable before the
+ * transaction can be considered to have left the skip-WAL state. If we
+ * only dirty the shared superblock copy here, a crash before checkpoint
+ * would resurrect SKIP_WAL_PENDING from disk on restart.
+ */
+ umimmedsync(reln, UMBRA_METADATA_FORKNUM);
}
-static UmbraFileContext *
-um_relation_filectx(SMgrRelation reln)
+/*
+ * Is lblkno below the fork's logical EOF?
+ *
+ * Always false for accesses that do not use MAP or for forks without MAP
+ * translation.  lookup_state may be NULL; when supplied it is used as a
+ * cache, so the logical block count is fetched only on the first call and
+ * reused afterwards.
+ */
+static bool
+um_lblk_precedes_logical_eof_for_access(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber lblkno)
{
- UmbraSmgrRelationState *state = reln->smgr_private;
+ BlockNumber logical_nblocks;
- if (state == NULL)
- return umfile_ctx_acquire(reln->smgr_rlocator);
+ if (!access->map_available)
+ return false;
- if (state->filectx == NULL)
- state->filectx = umfile_ctx_acquire(reln->smgr_rlocator);
+ if (!um_fork_uses_map_translation(forknum))
+ return false;
+
+ if (lookup_state != NULL && lookup_state->have_logical_nblocks)
+ logical_nblocks = lookup_state->logical_nblocks;
+ else
+ {
+ logical_nblocks = umnblocks_for_access(reln, forknum, access);
+ if (lookup_state != NULL)
+ {
+ /* Remember the answer for later calls sharing this lookup_state. */
+ lookup_state->logical_nblocks = logical_nblocks;
+ lookup_state->have_logical_nblocks = true;
+ }
+ }
- return state->filectx;
+ return lblkno < logical_nblocks;
}
+/*
+ * True when lblkno lies below the fork's logical EOF but MapTryLookup()
+ * finds no physical block for it.  The looked-up physical block number is
+ * discarded; the logical block count is not cached across calls.
+ */
static bool
-um_tracks_identity_metadata(ForkNumber forknum)
+um_is_logical_unmaterialized_for_access(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber lblkno)
{
- return forknum == MAIN_FORKNUM ||
- forknum == FSM_FORKNUM ||
- forknum == VISIBILITYMAP_FORKNUM;
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ BlockNumber pblkno;
+
+ if (!um_lblk_precedes_logical_eof_for_access(reln, forknum, access, NULL,
+ lblkno))
+ return false;
+
+ return !MapTryLookup(ctx, reln->smgr_rlocator.locator,
+ forknum, lblkno, &pblkno);
}
+/*
+ * Reserve a physical block for lblkno and return it in *new_pblkno.
+ *
+ * When MAP is unavailable for this access the logical block number is used
+ * as the physical block number and nothing is reserved.  Otherwise the
+ * request is forwarded to MapReserveFreshPblkno(); a failure there is raised
+ * as an ERROR, so on return *new_pblkno is always set.
+ */
static void
-um_ensure_redo_metadata(SMgrRelation reln, ForkNumber forknum)
+um_reserve_fresh_pblkno_for_access(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber lblkno,
+ BlockNumber *new_pblkno)
{
- Assert(reln != NULL);
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
- if (!InRecovery ||
- RelFileLocatorBackendIsTemp(reln->smgr_rlocator) ||
- !um_tracks_identity_metadata(forknum) ||
- UmMetadataExists(reln))
+ Assert(new_pblkno != NULL);
+
+ /* Identity translation: physical block equals logical block. */
+ if (!access->map_available)
+ {
+ *new_pblkno = lblkno;
return;
+ }
- /*
- * Redo can materialize a new data fork via mdwritev()/mdextend() without a
- * preceding smgrcreate() callback, for example during CREATE DATABASE
- * WAL-log replay. Ensure metadata exists before MAP state is consulted or
- * checkpointed for that relation.
- */
- elog(DEBUG1, "umbra redo ensure metadata %u/%u/%u fork=%d",
- reln->smgr_rlocator.locator.spcOid,
- reln->smgr_rlocator.locator.dbOid,
- reln->smgr_rlocator.locator.relNumber,
- forknum);
- umcreaterelationmetadata(reln);
+ if (!MapReserveFreshPblkno(ctx, reln->smgr_rlocator.locator,
+ forknum, lblkno, new_pblkno))
+ elog(ERROR,
+ "failed to reserve fresh physical block for relation %u/%u/%u fork %d blk %u",
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ forknum, lblkno);
}
-static void
-um_identity_update_metadata(SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks, bool fork_exists)
+/*
+ * Miss handler for read-mode resolution: decide what a read of a logical
+ * block without a MAP entry should observe.
+ *
+ * The logical size is fetched once through umnblocks_for_access() and cached
+ * in lookup_state, which must not be NULL.  Returns
+ * UMBRA_ACCESS_RESOLVED_ZERO when the block should read as zeros; any other
+ * miss is handed to um_report_unmapped_map_entry(), after which control is
+ * not expected to return.
+ */
+static UmbraAccessResolveResult
+um_resolve_lblk_for_read(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber lblkno)
{
- UmbraFileContext *ctx = um_relation_filectx(reln);
- BlockNumber logical_nblocks;
+ Assert(lookup_state != NULL);
- Assert(reln != NULL);
- Assert(um_tracks_identity_metadata(forknum));
- Assert(UmMetadataExists(reln));
+ if (!lookup_state->have_logical_nblocks)
+ {
+ lookup_state->logical_nblocks =
+ umnblocks_for_access(reln, forknum, access);
+ lookup_state->have_logical_nblocks = true;
+ }
- if (!MapSBlockEnsureLoaded(ctx, reln->smgr_rlocator.locator))
- elog(ERROR, "could not load MAP superblock for relation %u/%u/%u",
- reln->smgr_rlocator.locator.spcOid,
- reln->smgr_rlocator.locator.dbOid,
- reln->smgr_rlocator.locator.relNumber);
+ /*
+ * During recovery, an auxiliary mapped fork block at or beyond the known
+ * logical EOF reads as zeros instead of raising an error.
+ */
+ if (InRecovery && UmbraForkIsAuxiliaryMapped(forknum) &&
+ lookup_state->logical_nblocks != InvalidBlockNumber &&
+ um_is_stale_post_truncate_lblk_with_eof(forknum,
+ lookup_state->logical_nblocks,
+ lblkno))
+ {
+ return UMBRA_ACCESS_RESOLVED_ZERO;
+ }
+
+ /* Inside the known logical EOF but unmapped: reads as zeros. */
+ if (lookup_state->logical_nblocks != InvalidBlockNumber &&
+ lblkno < lookup_state->logical_nblocks)
+ {
+ return UMBRA_ACCESS_RESOLVED_ZERO;
+ }
+
+ /* Every remaining miss is reported; the bare braces add no scope logic. */
+ {
+ um_report_unmapped_map_entry(reln, forknum, access, lblkno);
+ }
+ pg_unreachable();
+}
+
+/*
+ * Miss handler for write-mode resolution.
+ *
+ * A write never creates a mapping on this path, so every miss is handed to
+ * um_report_unmapped_map_entry(); control is not expected to come back.
+ */
+static UmbraAccessResolveResult
+um_resolve_lblk_for_write(SMgrRelation reln, ForkNumber forknum,
+						  const UmbraAccessState *access,
+						  UmbraAccessLookupState *lookup_state,
+						  BlockNumber lblkno, BlockNumber *pblkno)
+{
+	/* Neither the output slot nor the cached EOF is consulted here. */
+	(void) pblkno;
+	(void) lookup_state;
+
+	um_report_unmapped_map_entry(reln, forknum, access, lblkno);
+	pg_unreachable();
+}
+
+/*
+ * Miss handler for writeback-mode resolution.
+ *
+ * A block inside the logical EOF resolves to the physical block reported by
+ * MapInflightLookupOwnedPblk() when that lookup succeeds and is skipped
+ * otherwise.  A block outside the logical EOF is skipped when it qualifies
+ * as a stale post-truncate block; anything else is reported through
+ * um_report_unmapped_map_entry().
+ */
+static UmbraAccessResolveResult
+um_resolve_lblk_for_writeback(SMgrRelation reln, ForkNumber forknum,
+							  const UmbraAccessState *access,
+							  UmbraAccessLookupState *lookup_state,
+							  BlockNumber lblkno, BlockNumber *pblkno)
+{
+	bool		inside_eof;
+
+	inside_eof = um_lblk_precedes_logical_eof_for_access(reln, forknum,
+														 access, lookup_state,
+														 lblkno);
+	if (!inside_eof)
+	{
+		if (um_is_stale_post_truncate_lblk_for_access(reln, forknum, access,
+													  lookup_state, lblkno))
+			return UMBRA_ACCESS_RESOLVED_SKIP;
+
+		um_report_unmapped_map_entry(reln, forknum, access, lblkno);
+		pg_unreachable();
+	}
+
+	return MapInflightLookupOwnedPblk(reln->smgr_rlocator.locator,
+									  forknum, lblkno, pblkno)
+		? UMBRA_ACCESS_RESOLVED_PBLK
+		: UMBRA_ACCESS_RESOLVED_SKIP;
+}
+
+/*
+ * Translate logical block lblkno to a physical block for the given access
+ * mode.
+ *
+ * When MAP is unavailable the translation is the identity and
+ * UMBRA_ACCESS_RESOLVED_PBLK is returned with *pblkno = lblkno.  Otherwise
+ * MapTryLookup() is consulted; a hit returns UMBRA_ACCESS_RESOLVED_PBLK, and
+ * a miss is dispatched to the mode-specific handler, which may return a
+ * ZERO/SKIP result or report the miss.
+ */
+static UmbraAccessResolveResult
+um_resolve_lblk_for_access(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber lblkno,
+ UmbraAccessResolveMode mode,
+ BlockNumber *pblkno)
+{
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ bool found;
+
+ Assert(pblkno != NULL);
+
+ if (!access->map_available)
+ {
+ *pblkno = lblkno;
+ return UMBRA_ACCESS_RESOLVED_PBLK;
+ }
- if (!fork_exists && forknum != MAIN_FORKNUM)
- logical_nblocks = InvalidBlockNumber;
+ /*
+ * NOTE(review): both arms below issue the identical MapTryLookup() call,
+ * so the mode test currently has no effect -- confirm whether a later
+ * patch in the series is meant to differentiate the read lookup.
+ */
+ if (mode == UMBRA_ACCESS_RESOLVE_READ)
+ {
+ found = MapTryLookup(ctx, reln->smgr_rlocator.locator,
+ forknum, lblkno, pblkno);
+ }
else
- logical_nblocks = nblocks;
+ {
+ found = MapTryLookup(ctx, reln->smgr_rlocator.locator,
+ forknum, lblkno, pblkno);
+ }
+
+ if (found)
+ return UMBRA_ACCESS_RESOLVED_PBLK;
- MapSBlockSetLogicalNblocks(ctx, reln->smgr_rlocator.locator,
- forknum, logical_nblocks,
- InvalidXLogRecPtr);
+ /* Miss: each mode decides between zero-fill, skip, or reporting. */
+ switch (mode)
+ {
+ case UMBRA_ACCESS_RESOLVE_READ:
+ return um_resolve_lblk_for_read(reln, forknum, access,
+ lookup_state, lblkno);
+ case UMBRA_ACCESS_RESOLVE_WRITE:
+ return um_resolve_lblk_for_write(reln, forknum, access,
+ lookup_state, lblkno, pblkno);
+ case UMBRA_ACCESS_RESOLVE_WRITEBACK:
+ return um_resolve_lblk_for_writeback(reln, forknum, access,
+ lookup_state, lblkno, pblkno);
+ }
+
+ pg_unreachable();
+}
- if (fork_exists || forknum == MAIN_FORKNUM)
+/*
+ * Resolve the longest read prefix beginning at blocknum whose translated
+ * physical blocks form one contiguous run within a single segment.
+ *
+ * MapTryLookupPblkRun() is tried first with a limit of maxblocks; a non-zero
+ * result is returned as the run length with *start_pblk set by that call.
+ * If it reports no run, the first block is resolved on its own in read mode:
+ * a physical block yields a run of 1, and any other outcome returns 0, in
+ * which case *start_pblk is not assigned by this function.
+ */
+static BlockNumber
+um_resolve_mapped_read_run(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber blocknum, BlockNumber maxblocks,
+ BlockNumber *start_pblk)
+{
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ BlockNumber run_blocks;
+ BlockNumber pblk;
+
+ Assert(access != NULL);
+ Assert(start_pblk != NULL);
+ Assert(maxblocks > 0);
+
+ run_blocks = MapTryLookupPblkRun(ctx, reln->smgr_rlocator.locator,
+ forknum, blocknum, maxblocks,
+ start_pblk);
+ if (run_blocks > 0)
+ return run_blocks;
+
+ /* Single-block fallback; this call may report an unmapped entry. */
+ if (um_resolve_lblk_for_access(reln, forknum, access, lookup_state,
+ blocknum, UMBRA_ACCESS_RESOLVE_READ,
+ &pblk) == UMBRA_ACCESS_RESOLVED_PBLK)
{
+ *start_pblk = pblk;
+ return 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Give logical block lblkno its first physical block and publish the mapping.
+ *
+ * A new physical block is obtained with MapGetNewPbkno(), installed with
+ * MapSetMapping(), the superblock's next-free physical block is bumped past
+ * it, and the in-flight entry for the logical block is released.  The result
+ * carries the chosen physical block and mapping_published, which is always
+ * true on return.  allow_wal_owned_firstborn is currently ignored.
+ */
+static UmbraMappedBirthResult
+um_publish_mapped_birth(SMgrRelation reln, ForkNumber forknum,
+ const UmbraAccessState *access,
+ BlockNumber lblkno, bool allow_wal_owned_firstborn)
+{
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ UmbraMappedBirthResult result;
+ BlockNumber old_pblkno;
+
+ Assert(access->map_available);
+ (void) allow_wal_owned_firstborn;
+
+ result.mapping_published = false;
+
+ MapGetNewPbkno(ctx, reln->smgr_rlocator.locator, forknum, lblkno,
+ &result.pblkno, &old_pblkno);
+ /* A birth expects no previous physical block for this logical block. */
+ Assert(old_pblkno == InvalidBlockNumber);
+
+ MapSetMapping(ctx, reln->smgr_rlocator.locator, forknum, lblkno,
+ result.pblkno, InvalidXLogRecPtr);
+ result.mapping_published = true;
+
+ /* NOTE(review): this test is always true at this point. */
+ if (result.mapping_published)
MapSBlockBumpNextFreePhysBlock(ctx, reln->smgr_rlocator.locator,
- forknum, nblocks,
+ forknum, result.pblkno + 1,
InvalidXLogRecPtr);
- MapSBlockBumpPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
- forknum, nblocks,
- InvalidXLogRecPtr);
- }
+
+ MapInflightRelease(reln->smgr_rlocator.locator, forknum, lblkno);
+ return result;
}
+/*
+ * Auxiliary mapped forks (FSM/VM) can observe stale logical blocks during
+ * replay after truncate/VACUUM maintenance has already shrunk the
+ * authoritative logical EOF. Those callers historically expect EOF-like
+ * semantics, not a hard mapped-fork corruption error.
+ *
+ * Recovery reads are synchronous, but they still come through the AIO/bufmgr
+ * pipeline. Complete the read locally as a zero page so the normal shared
+ * buffer completion callbacks still run and mark the buffer valid, without
+ * issuing any physical I/O against an unmapped/stale block.
+ *
+ * ioh must be the handle currently handed out to this backend and still in
+ * the HANDED_OUT state.  buffer receives BLCKSZ zero bytes.  blocknum is
+ * passed as both block arguments of the smgr target, with a count of one
+ * block.
+ */
static void
-um_refresh_identity_metadata(SMgrRelation reln)
+um_complete_zero_readv(PgAioHandle *ioh, SMgrRelation reln,
+ ForkNumber forknum, BlockNumber blocknum,
+ void *buffer)
{
- ForkNumber forknum;
+ Assert(ioh != NULL);
+ Assert(buffer != NULL);
+ Assert(!INTERRUPTS_CAN_BE_PROCESSED());
+ Assert(ioh->state == PGAIO_HS_HANDED_OUT);
+ Assert(pgaio_my_backend->handed_out_io == ioh);
- Assert(UmMetadataExists(reln));
+ memset(buffer, 0, BLCKSZ);
+
+ pgaio_io_set_target_smgr(ioh, reln, forknum,
+ blocknum,
+ blocknum,
+ 1, false);
+ pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
- for (forknum = MAIN_FORKNUM; forknum <= VISIBILITYMAP_FORKNUM; forknum++)
+ /*
+ * Mirror the minimal pgaio_io_stage() state transitions needed to invoke
+ * the normal smgr + buffer read completion callbacks, but do not start an
+ * actual readv against any file.
+ */
+ ioh->op = PGAIO_OP_READV;
+ ioh->result = 0;
+ ioh->state = PGAIO_HS_DEFINED;
+ pgaio_my_backend->handed_out_io = NULL;
+
+ pgaio_io_call_stage(ioh);
+
+ ioh->state = PGAIO_HS_STAGED;
+ pgaio_io_prepare_submit(ioh);
+
+ /* Report a full-block result so completion sees a complete page. */
+ START_CRIT_SECTION();
+ pgaio_io_process_completion(ioh, BLCKSZ);
+ END_CRIT_SECTION();
+}
+
+/*
+ * PG18 truncation order is:
+ * 1. RelationTruncate() emits truncate WAL
+ * 2. smgrpretruncate() lets Umbra preload MAP pages before entering the
+ * critical section
+ * 3. smgrtruncate() later drops old shared buffers and performs the
+ * truncate-time metadata update
+ *
+ * That means a backend can briefly attempt to flush a stale dirty buffer for
+ * a block that has already been truncated away logically, but has not yet
+ * been removed from shared buffers. Such a write must be ignored, not turned
+ * into a new mapping owner. Only blocks still inside the authoritative
+ * logical EOF are allowed to demand a mapping.
+ *
+ * Returns true when lblkno is at or beyond the known logical EOF of a mapped
+ * fork and the access qualifies for the lenient treatment described below.
+ * lookup_state may be NULL; when supplied, the logical size is read from and
+ * cached into it.
+ */
+static bool
+um_is_stale_post_truncate_lblk_for_access(SMgrRelation reln,
+ ForkNumber forknum,
+ const UmbraAccessState *access,
+ UmbraAccessLookupState *lookup_state,
+ BlockNumber lblkno)
+{
+ BlockNumber logical_nblocks;
+
+ if (!access->map_available)
+ return false;
+
+ if (!um_fork_uses_map_translation(forknum))
+ return false;
+
+ /*
+ * MAIN fork remains strict during recovery: missing mappings there are
+ * corruption signals. Auxiliary mapped forks (FSM/VM) can legitimately
+ * walk just beyond logical EOF during truncate/vacuum maintenance, both
+ * in normal execution and during replay.
+ */
+ if (InRecovery && !UmbraForkIsAuxiliaryMapped(forknum))
+ return false;
+
+ /*
+ * Use the same authoritative logical EOF that the rest of the system sees.
+ * Umbra should have only one logical-size source of truth; this stale
+ * post-truncate path must not invent a second one by scanning MAP pages.
+ */
+ if (lookup_state != NULL && lookup_state->have_logical_nblocks)
+ logical_nblocks = lookup_state->logical_nblocks;
+ else
{
- bool fork_exists;
- BlockNumber nblocks;
+ logical_nblocks = umnblocks_for_access(reln, forknum, access);
+ if (lookup_state != NULL)
+ {
+ lookup_state->logical_nblocks = logical_nblocks;
+ lookup_state->have_logical_nblocks = true;
+ }
+ }
- if (!um_tracks_identity_metadata(forknum))
- continue;
+ /* The comparison itself, including the invalid-EOF guard, lives here. */
+ return um_is_stale_post_truncate_lblk_with_eof(forknum,
+ logical_nblocks,
+ lblkno);
+}
+
+/*
+ * Pure comparison behind the stale post-truncate test: true when the fork
+ * uses MAP translation, the logical EOF is known (not InvalidBlockNumber),
+ * and lblkno is at or beyond that EOF.
+ */
+static bool
+um_is_stale_post_truncate_lblk_with_eof(ForkNumber forknum,
+										BlockNumber logical_nblocks,
+										BlockNumber lblkno)
+{
+	return um_fork_uses_map_translation(forknum) &&
+		logical_nblocks != InvalidBlockNumber &&
+		lblkno >= logical_nblocks;
+}
+
+/*
+ * Return the physical block backing logical block lblkno.
+ *
+ * Without MAP the logical block number is returned unchanged.  With MAP, a
+ * missing entry is reported through um_report_unmapped_map_entry().
+ */
+BlockNumber
+umphysicalblock(SMgrRelation reln, ForkNumber forknum, BlockNumber lblkno)
+{
+	UmbraFileContext *filectx = um_ctx_acquire(reln);
+	UmbraAccessState state = um_classify_access(reln, forknum);
+	BlockNumber translated;
+
+	if (state.map_available)
+	{
+		if (!MapTryLookup(filectx, reln->smgr_rlocator.locator,
+						  forknum, lblkno, &translated))
+		{
+			um_report_unmapped_map_entry(reln, forknum, &state, lblkno);
+			pg_unreachable();
+		}
+		return translated;
+	}
+
+	return lblkno;
+}
+
+/*
+ * Obtain a new physical block for lblkno, also returning the previous one.
+ *
+ * Without MAP both outputs are set to lblkno; with MAP the request is passed
+ * straight to MapGetNewPbkno().
+ */
+void
+UmMapGetNewPbkno(SMgrRelation reln, ForkNumber forknum,
+				 BlockNumber lblkno, BlockNumber *new_pblkno,
+				 BlockNumber *old_pblkno)
+{
+	UmbraFileContext *filectx = um_ctx_acquire(reln);
+	UmbraAccessState state;
+
+	Assert(old_pblkno != NULL);
+	Assert(new_pblkno != NULL);
+
+	state = um_classify_access(reln, forknum);
+	if (state.map_available)
+	{
+		MapGetNewPbkno(filectx, reln->smgr_rlocator.locator, forknum,
+					   lblkno, new_pblkno, old_pblkno);
+		return;
+	}
+
+	/* Identity translation: old and new physical block are the logical one. */
+	*new_pblkno = lblkno;
+	*old_pblkno = lblkno;
+}
- fork_exists = mdexists(reln, forknum);
- nblocks = fork_exists ? mdnblocks(reln, forknum) : 0;
- um_identity_update_metadata(reln, forknum, nblocks, fork_exists);
+/*
+ * Public wrapper: classify the access for this fork, then reserve a fresh
+ * physical block for lblkno via um_reserve_fresh_pblkno_for_access().
+ */
+void
+UmMapReserveFreshPbkno(SMgrRelation reln, ForkNumber forknum,
+					   BlockNumber lblkno, BlockNumber *new_pblkno)
+{
+	UmbraAccessState state = um_classify_access(reln, forknum);
+
+	um_reserve_fresh_pblkno_for_access(reln, forknum, &state,
+									   lblkno, new_pblkno);
+}
+
+/*
+ * Report whether accesses to this fork are classified as MAP-translated.
+ */
+bool
+UmMapAccessAvailable(SMgrRelation reln, ForkNumber forknum)
+{
+	return um_classify_access(reln, forknum).map_available;
+}
+
+/*
+ * Look up the physical block for lblkno without reporting a miss.
+ *
+ * Without MAP the translation is the identity and the lookup always
+ * succeeds.  With MAP the result of MapTryLookup() is returned as is.
+ */
+bool
+UmMapTryLookupPblkno(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno, BlockNumber *pblkno)
+{
+ UmbraAccessState access;
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+
+ access = um_classify_access(reln, forknum);
+ if (!access.map_available)
+ {
+ *pblkno = lblkno;
+ return true;
}
+
+ return MapTryLookup(ctx, reln->smgr_rlocator.locator,
+ forknum, lblkno, pblkno);
+}
+
+/*
+ * Public wrapper: classify the access for this fork, then ask
+ * um_is_logical_unmaterialized_for_access() whether lblkno is a logical
+ * block with no MAP entry.
+ */
+bool
+UmMapIsLogicalUnmaterialized(SMgrRelation reln, ForkNumber forknum,
+							 BlockNumber lblkno)
+{
+	UmbraAccessState state = um_classify_access(reln, forknum);
+
+	return um_is_logical_unmaterialized_for_access(reln, forknum, &state,
+												   lblkno);
+}
+
+/*
+ * Install the mapping lblkno -> new_pblkno, stamped with map_lsn.
+ *
+ * This is a no-op when MAP is unavailable for this fork; otherwise the call
+ * is forwarded unchanged to MapSetMapping().
+ */
+void
+UmMapSetMapping(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno, BlockNumber new_pblkno,
+ XLogRecPtr map_lsn)
+{
+ UmbraAccessState access;
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+
+ access = um_classify_access(reln, forknum);
+ if (!access.map_available)
+ return;
+
+ MapSetMapping(ctx, reln->smgr_rlocator.locator,
+ forknum, lblkno, new_pblkno, map_lsn);
}
static void
@@ -636,3 +1375,953 @@ um_filetag_path(const FileTag *ftag, char *path)
snprintf(path, MAXPGPATH, "%s.%llu",
base.str, (unsigned long long) ftag->segno);
}
+
+/*
+ * Create (or open) the given fork of a relation.
+ *
+ * Failure to create or open the file raises an ERROR.  When the file was
+ * newly created, the fork is an auxiliary mapped fork, and the relation has
+ * Umbra metadata, the fork's logical size in the MAP superblock is set to 0,
+ * stamped with the replay LSN in recovery or the WAL write pointer otherwise.
+ */
+void
+umcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+{
+	UmbraFileContext *filectx = um_ctx_acquire(reln);
+	bool		newly_created = false;
+	XLogRecPtr	seed_lsn;
+
+	if (!umfile_open_or_create(filectx, forknum, isRedo, &newly_created))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create or open relation %u/%u/%u fork %d",
+						reln->smgr_rlocator.locator.spcOid,
+						reln->smgr_rlocator.locator.dbOid,
+						reln->smgr_rlocator.locator.relNumber,
+						forknum)));
+
+	/* Only a freshly created auxiliary mapped fork needs its size seeded. */
+	if (!newly_created)
+		return;
+	if (!UmbraForkIsAuxiliaryMapped(forknum))
+		return;
+	if (!UmMetadataExists(reln))
+		return;
+
+	if (InRecovery)
+		seed_lsn = GetXLogReplayRecPtr(NULL);
+	else
+		seed_lsn = GetXLogWriteRecPtr();
+
+	MapSBlockSetLogicalNblocks(filectx, reln->smgr_rlocator.locator,
+							   forknum, 0, seed_lsn);
+}
+
+/*
+ * Report whether the given fork exists.
+ *
+ * Forks that use MAP translation under the REQUIRE_MAP policy are answered
+ * by um_mapped_exists_from_super(); every other case is answered by a
+ * UMFILE_EXISTS_DENSE check of the file.
+ */
+bool
+umexists(SMgrRelation reln, ForkNumber forknum)
+{
+	UmbraFileContext *filectx = um_ctx_acquire(reln);
+
+	/* The policy is consulted only for forks that use MAP translation. */
+	if (um_fork_uses_map_translation(forknum) &&
+		um_map_policy_for_access(reln, forknum) == UMBRA_MAP_POLICY_REQUIRE_MAP)
+		return um_mapped_exists_from_super(reln, forknum);
+
+	return umfile_exists(filectx, forknum, UMFILE_EXISTS_DENSE);
+}
+
+/*
+ * Unlink one fork of a relation, or all of them when forknum is
+ * InvalidForkNumber.
+ *
+ * Cached MAP state for the relation is invalidated before the metadata fork
+ * is removed, both in the single-fork case and in the all-forks case.
+ */
+void
+umunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
+{
+	if (forknum != InvalidForkNumber)
+	{
+		if (forknum == UMBRA_METADATA_FORKNUM)
+			MapInvalidateRelation(rlocator.locator);
+		umfile_unlink(rlocator, forknum, isRedo);
+		return;
+	}
+
+	/*
+	 * Keep MAIN and MAP fork deletion ordered so the mapping lifecycle tracks
+	 * the data fork during DROP processing and relfilenode reuse.
+	 */
+	MapInvalidateRelation(rlocator.locator);
+
+	umfile_unlink(rlocator, MAIN_FORKNUM, isRedo);
+	UmMetadataUnlink(rlocator, isRedo);
+
+	for (ForkNumber fork = FSM_FORKNUM; fork <= INIT_FORKNUM; fork++)
+		umfile_unlink(rlocator, fork, isRedo);
+}
+
+/*
+ * Extend a fork so that logical block blocknum exists and holds buffer.
+ *
+ * Forks without MAP are extended directly at the same physical block.  For
+ * mapped forks any gap between the current logical EOF and blocknum is first
+ * zero-filled, then blocknum is given a physical block (reusing an existing
+ * mapping if one is found) and the page is written there.  The logical size
+ * is bumped only when a new mapping was published by this call; the physical
+ * size is bumped only when the file was physically extended.
+ */
+void
+umextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+		 const void *buffer, bool skipFsync)
+{
+	UmbraFileContext *ctx = um_ctx_acquire(reln);
+	UmbraAccessState access;
+	BlockNumber pblkno;
+	BlockNumber logical_nblocks;
+	BlockNumber materialized_nblocks = 0;
+	bool		mapping_committed = false;
+	bool		extended_physical = false;
+
+	access = um_classify_access(reln, forknum);
+
+	/* Only MAP fork itself uses direct physical extend. */
+	if (!access.map_available)
+	{
+		umfile_extend(ctx, forknum, blocknum, buffer, skipFsync);
+		return;
+	}
+
+	/*
+	 * smgrextend() contract allows blocknum beyond current EOF and requires
+	 * intervening space to read as zeros. For mapped forks, we must explicitly
+	 * create mappings and materialize zero pages for the gap.
+	 */
+	logical_nblocks = umnblocks_for_access(reln, forknum, &access);
+	if (blocknum > logical_nblocks)
+	{
+		umzeroextend(reln, forknum, logical_nblocks,
+					 (int) (blocknum - logical_nblocks),
+					 skipFsync);
+	}
+
+	/*
+	 * Sample the materialized physical EOF only after the gap fill:
+	 * umzeroextend() grows the fork and bumps the recorded physical size, so
+	 * a value read earlier could be stale and misclassify an already
+	 * materialized block as needing a physical extend.  If the size cannot
+	 * be obtained it stays 0 and the page is written via the extend path.
+	 */
+	(void) MapSBlockTryGetPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+										  forknum, &materialized_nblocks);
+
+	/*
+	 * For mapped data forks, smgrextend is the point where an unmapped logical
+	 * block gets its first physical block. Existing mappings can appear during
+	 * redo/replay and should be reused.
+	 */
+	if (!MapTryLookup(ctx, reln->smgr_rlocator.locator, forknum, blocknum, &pblkno))
+	{
+		UmbraMappedBirthResult birth;
+
+		birth = um_publish_mapped_birth(reln, forknum, &access, blocknum, true);
+		pblkno = birth.pblkno;
+		mapping_committed = birth.mapping_published;
+	}
+
+	/*
+	 * Reservation/publication can make the mapping visible before the data
+	 * fork is physically materialized. Extend when the chosen pblk is still
+	 * beyond the materialized physical EOF; otherwise just write the page.
+	 *
+	 * Page checksum stays keyed by the logical block identity; callers
+	 * reaching smgrextend() have already set it using the logical blkno.
+	 */
+	if (pblkno >= materialized_nblocks)
+	{
+		umfile_extend(ctx, forknum, pblkno, buffer, skipFsync);
+		extended_physical = true;
+	}
+	else
+	{
+		const void *single_buffer[1];
+
+		single_buffer[0] = buffer;
+		umfile_writev(ctx, forknum, pblkno, single_buffer, 1, skipFsync);
+	}
+
+	if (mapping_committed)
+		MapSBlockBumpLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+									forknum, blocknum + 1,
+									InvalidXLogRecPtr);
+	if (extended_physical)
+		MapSBlockBumpPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+									 forknum, pblkno + 1,
+									 InvalidXLogRecPtr);
+}
+
+/*
+ * Extend a fork by nblocks zero-filled blocks starting at logical block
+ * blocknum.  A non-positive nblocks is a no-op.
+ *
+ * Forks without MAP are zero-extended directly.  For mapped forks each
+ * logical block is looked up, given a new mapping if it has none, and the
+ * resulting physical blocks are zero-extended in contiguous runs.  Afterwards
+ * the recorded physical size is bumped past the highest physical block seen
+ * and the logical size is bumped to blocknum + nblocks.
+ */
+void
+umzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ int nblocks, bool skipFsync)
+{
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ UmbraAccessState access;
+ BlockNumber run_start_pblk;
+ BlockNumber max_pblkno;
+ int run_len;
+
+ if (nblocks <= 0)
+ return;
+
+ access = um_classify_access(reln, forknum);
+
+ /* Direct physical path for non-mapped forks. */
+ if (!access.map_available)
+ {
+ umfile_zeroextend(ctx, forknum, blocknum, nblocks,
+ skipFsync);
+ return;
+ }
+
+ /*
+ * Per-block path for single-block, recovery, or callers that encountered
+ * pre-existing/pending MAP ownership in the requested range.
+ *
+ * For mapped forks we must materialize each newly-mapped physical page as
+ * a zero page, otherwise a later read through MAP would hit EOF/short read.
+ *
+ * Map allocator hands out sequential pblknos, so we can batch contiguous
+ * physical ranges with umfile_zeroextend().
+ *
+ * NOTE(review): blocks whose mapping already existed are batched into the
+ * zero-extend runs as well -- confirm that re-zeroing their physical
+ * pages is intended.
+ */
+ run_start_pblk = InvalidBlockNumber;
+ max_pblkno = InvalidBlockNumber;
+ run_len = 0;
+
+ for (int i = 0; i < nblocks; i++)
+ {
+ BlockNumber lblk = blocknum + (BlockNumber) i;
+ BlockNumber pblk;
+
+ if (!MapTryLookup(ctx, reln->smgr_rlocator.locator, forknum, lblk, &pblk))
+ {
+ UmbraMappedBirthResult birth;
+
+ birth = um_publish_mapped_birth(reln, forknum, &access, lblk, false);
+ pblk = birth.pblkno;
+ }
+
+ if (max_pblkno == InvalidBlockNumber || pblk > max_pblkno)
+ max_pblkno = pblk;
+
+ /* Start a run, grow it while pblks stay consecutive, else flush it. */
+ if (run_len == 0)
+ {
+ run_start_pblk = pblk;
+ run_len = 1;
+ }
+ else if (pblk == run_start_pblk + (BlockNumber) run_len)
+ {
+ run_len++;
+ }
+ else
+ {
+ umfile_zeroextend(ctx, forknum, run_start_pblk,
+ run_len, skipFsync);
+ run_start_pblk = pblk;
+ run_len = 1;
+ }
+ }
+
+ /* Flush the final run left open by the loop. */
+ if (run_len > 0)
+ umfile_zeroextend(ctx, forknum, run_start_pblk, run_len,
+ skipFsync);
+
+ if (max_pblkno != InvalidBlockNumber)
+ MapSBlockBumpPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, max_pblkno + 1,
+ InvalidXLogRecPtr);
+
+ MapSBlockBumpLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, blocknum + (BlockNumber) nblocks,
+ InvalidXLogRecPtr);
+}
+
+/*
+ * Issue prefetch requests for nblocks logical blocks starting at blocknum.
+ *
+ * Forks without MAP are prefetched directly.  For mapped forks each block is
+ * resolved in read mode; blocks that do not resolve to a physical block are
+ * passed over, and the first failed physical prefetch ends the call with
+ * false.  Returns true when no prefetch failed.
+ */
+bool
+umprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks)
+{
+	UmbraAccessState state;
+	UmbraAccessLookupState eof_cache = {0};
+	UmbraFileContext *filectx = um_ctx_acquire(reln);
+
+	state = um_classify_access(reln, forknum);
+	if (!state.map_available)
+		return umfile_prefetch(filectx, forknum, blocknum, nblocks);
+
+	for (int off = 0; off < nblocks; off++)
+	{
+		BlockNumber target;
+		UmbraAccessResolveResult outcome;
+
+		outcome = um_resolve_lblk_for_access(reln, forknum, &state, &eof_cache,
+											 blocknum + (BlockNumber) off,
+											 UMBRA_ACCESS_RESOLVE_READ,
+											 &target);
+
+		if (outcome == UMBRA_ACCESS_RESOLVED_PBLK &&
+			!umfile_prefetch(filectx, forknum, target, 1))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * Return how many blocks starting at blocknum may be combined into one read.
+ *
+ * Forks without MAP defer to umfile_maxcombine().  Auxiliary mapped forks in
+ * recovery are never combined.  Otherwise the answer is the length of the
+ * contiguous translated run, capped by io_max_combine_limit and the file
+ * layer's own limit, and never less than 1.
+ */
+uint32
+ummaxcombine(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+	UmbraAccessLookupState eof_cache = {0};
+	UmbraAccessState state = um_classify_access(reln, forknum);
+	BlockNumber first_pblk;
+	BlockNumber contiguous;
+
+	if (!state.map_available)
+		return umfile_maxcombine(forknum, blocknum);
+
+	if (InRecovery && UmbraForkIsAuxiliaryMapped(forknum))
+		return 1;
+
+	/*
+	 * For mapped forks we can only combine a read while the translated
+	 * physical blocks remain contiguous and stay inside one segment.
+	 */
+	contiguous = um_resolve_mapped_read_run(reln, forknum, &state,
+											&eof_cache, blocknum,
+											Min((BlockNumber) io_max_combine_limit,
+												(BlockNumber) umfile_maxcombine(forknum,
+																				blocknum)),
+											&first_pblk);
+
+	return Max((BlockNumber) 1, contiguous);
+}
+
+/*
+ * Synchronously read nblocks logical blocks starting at blocknum into
+ * buffers.
+ *
+ * Forks without MAP are read directly.  For mapped forks the request is
+ * split into runs of contiguous physical blocks; a block that resolves to no
+ * physical block is returned as a zero page.  Auxiliary mapped forks in
+ * recovery are read one block at a time.
+ */
+void
+umreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ void **buffers, BlockNumber nblocks)
+{
+ UmbraAccessState access;
+ UmbraAccessLookupState lookup_state = {0};
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ bool aux_recovery_read;
+
+ access = um_classify_access(reln, forknum);
+
+ if (!access.map_available)
+ {
+ umfile_readv(ctx, forknum, blocknum, buffers, nblocks);
+ return;
+ }
+
+ aux_recovery_read = InRecovery && UmbraForkIsAuxiliaryMapped(forknum);
+
+ /* i advances by the length of each run, or by one for a zero page. */
+ for (BlockNumber i = 0; i < nblocks;)
+ {
+ BlockNumber pblk;
+ BlockNumber run_blocks;
+
+ run_blocks = um_resolve_mapped_read_run(reln, forknum, &access,
+ &lookup_state, blocknum + i,
+ aux_recovery_read ? 1 :
+ (nblocks - i),
+ &pblk);
+ if (run_blocks == 0)
+ {
+ memset(buffers[i], 0, BLCKSZ);
+ i++;
+ continue;
+ }
+
+ /*
+ * Preserve md-style sync-read semantics for mapped forks by routing the
+ * translated physical block through umfile_readv().
+ *
+ * Callers such as VM/FSM redo use RBM_ZERO_ON_ERROR and expect
+ * InRecovery/zero_damaged_pages handling on short reads. A direct
+ * physical read would bypass that behavior and fail before bufmgr gets a
+ * chance to zero the page.
+ */
+ if (aux_recovery_read)
+ um_ensure_datafork_batch_ready_for_access(reln, forknum, &access,
+ pblk, true /* skipFsync */ );
+
+ umfile_readv(ctx, forknum, pblk, &buffers[i], run_blocks);
+ i += run_blocks;
+ }
+}
+
+/*
+ * Start an asynchronous read for a fork without MAP translation: the logical
+ * block number is used as the physical block number.  The smgr target and
+ * the MD_READV callbacks are set on the handle before the read is started.
+ */
+static void
+um_startreadv_direct_physical(PgAioHandle *ioh, SMgrRelation reln,
+ UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blocknum, void **buffers,
+ BlockNumber nblocks)
+{
+ pgaio_io_set_target_smgr(ioh, reln, forknum,
+ blocknum /* logical */,
+ blocknum /* physical */,
+ nblocks, false);
+ pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
+ umfile_startreadv(ioh, ctx, forknum, blocknum, buffers, nblocks);
+}
+
+/*
+ * Resolve the physical run for an asynchronous mapped read.
+ *
+ * Returns the length of the contiguous run starting at blocknum, with *pblk
+ * set to its first physical block.  When no physical block backs blocknum,
+ * the handle is completed locally as a one-block zero read into buffers[0]
+ * and 0 is returned; the caller must not start any further I/O on it.
+ */
+static BlockNumber
+um_startreadv_lookup_mapped(PgAioHandle *ioh, SMgrRelation reln,
+							ForkNumber forknum, BlockNumber blocknum,
+							void **buffers, BlockNumber nblocks,
+							const UmbraAccessState *access,
+							UmbraAccessLookupState *lookup_state,
+							bool aux_recovery_read,
+							BlockNumber *pblk)
+{
+	BlockNumber max_run = aux_recovery_read ? 1 : nblocks;
+	BlockNumber found_run;
+
+	Assert(pblk != NULL);
+
+	found_run = um_resolve_mapped_read_run(reln, forknum, access, lookup_state,
+										   blocknum, max_run, pblk);
+	if (found_run == 0)
+	{
+		/* Shrink the I/O to a single block and finish it without a read. */
+		ioh->handle_data_len = 1;
+		um_complete_zero_readv(ioh, reln, forknum, blocknum, buffers[0]);
+	}
+
+	return found_run;
+}
+
+/*
+ * Start an asynchronous read of nblocks blocks whose logical start is
+ * blocknum and whose translated physical start is pblk.
+ *
+ * For an auxiliary-fork read in recovery, the physical block is first made
+ * ready via um_ensure_datafork_batch_ready_for_access() if it does not exist
+ * in the file yet.  The handle's smgr target records both the logical and
+ * the physical block before the physical read is started.
+ */
+static void
+um_startreadv_mapped_physical(PgAioHandle *ioh, SMgrRelation reln,
+							  UmbraFileContext *ctx, ForkNumber forknum,
+							  BlockNumber blocknum, void **buffers,
+							  BlockNumber nblocks, BlockNumber pblk,
+							  bool aux_recovery_read,
+							  const UmbraAccessState *access)
+{
+	if (aux_recovery_read &&
+		!umfile_ctx_block_exists(ctx, forknum, pblk))
+		um_ensure_datafork_batch_ready_for_access(reln, forknum, access,
+												  pblk, true /* skipFsync */ );
+
+	pgaio_io_set_target_smgr(ioh, reln, forknum,
+							 blocknum /* logical */,
+							 pblk /* physical */,
+							 nblocks, false);
+	pgaio_io_register_callbacks(ioh, PGAIO_HCB_MD_READV, 0);
+	umfile_startreadv_physical(ioh, ctx, forknum,
+							   blocknum /* logical */,
+							   pblk /* physical */,
+							   buffers, nblocks);
+}
+
+/*
+ * Start an asynchronous read of nblocks logical blocks beginning at blocknum.
+ *
+ * Forks without MAP go straight to the direct physical path.  For mapped
+ * forks the request is limited to the leading run of contiguous physical
+ * blocks; if no physical block backs the first block, the lookup helper has
+ * already completed the handle and nothing more is started here.
+ */
+void
+umstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum,
+			 BlockNumber blocknum, void **buffers, BlockNumber nblocks)
+{
+	UmbraFileContext *filectx = um_ctx_acquire(reln);
+	UmbraAccessLookupState eof_cache = {0};
+	UmbraAccessState state;
+	BlockNumber first_pblk;
+	BlockNumber usable;
+	bool		aux_in_recovery;
+
+	state = um_classify_access(reln, forknum);
+	if (!state.map_available)
+	{
+		um_startreadv_direct_physical(ioh, reln, filectx, forknum,
+									  blocknum, buffers, nblocks);
+		return;
+	}
+
+	aux_in_recovery = InRecovery && UmbraForkIsAuxiliaryMapped(forknum);
+
+	/*
+	 * See ummaxcombine(): callers may only combine a prefix whose translated
+	 * physical blocks form one contiguous run. If the mapping changed since
+	 * that check, shrink this I/O to the currently valid prefix and let
+	 * WaitReadBuffers() retry the remainder.
+	 */
+	usable = um_startreadv_lookup_mapped(ioh, reln, forknum, blocknum,
+										 buffers, nblocks, &state,
+										 &eof_cache,
+										 aux_in_recovery, &first_pblk);
+	if (usable == 0)
+		return;
+
+	if (usable < nblocks)
+		ioh->handle_data_len = usable;
+
+	/*
+	 * Start I/O using physical addressing but preserve logical identity for
+	 * error reporting and reopen semantics.
+	 */
+	um_startreadv_mapped_physical(ioh, reln, filectx, forknum, blocknum,
+								  buffers, usable, first_pblk,
+								  aux_in_recovery, &state);
+}
+
+/*
+ * Claim the in-flight write barrier for logical block lblkno, waiting for a
+ * concurrent holder if necessary.
+ *
+ * barrier must be non-NULL and not currently valid.  Each failed claim is
+ * followed by an interrupt check and a sleep of
+ * UMBRA_WRITE_BARRIER_WAIT_USEC; after UMBRA_WRITE_BARRIER_WAIT_RETRIES
+ * sleeps one final claim is attempted, and if that also fails an ERROR with
+ * ERRCODE_LOCK_NOT_AVAILABLE is raised.  A successful claim that had to wait
+ * is logged at LOG level.
+ */
+static void
+um_claim_write_barrier(SMgrRelation reln, ForkNumber forknum,
+					   UmbraFileContext *ctx, BlockNumber lblkno,
+					   MapInflightBarrier *barrier)
+{
+	int			wait_retries = 0;
+
+	Assert(barrier != NULL);
+	Assert(!barrier->valid);
+
+	for (;;)
+	{
+		if (MapInflightTryClaimBarrier(ctx, reln->smgr_rlocator.locator,
+									   forknum, lblkno, barrier))
+			break;
+
+		/*
+		 * Test the retry budget before sleeping, so that every sleep is
+		 * followed by another claim attempt rather than ending in a timeout
+		 * with the last wait wasted.
+		 */
+		if (wait_retries >= UMBRA_WRITE_BARRIER_WAIT_RETRIES)
+			ereport(ERROR,
+					(errcode(ERRCODE_LOCK_NOT_AVAILABLE),
+					 errmsg("timed out waiting for in-flight remap of relation %u/%u/%u fork %d block %u",
+							reln->smgr_rlocator.locator.spcOid,
+							reln->smgr_rlocator.locator.dbOid,
+							reln->smgr_rlocator.locator.relNumber,
+							forknum, lblkno)));
+
+		CHECK_FOR_INTERRUPTS();
+		pg_usleep(UMBRA_WRITE_BARRIER_WAIT_USEC);
+		wait_retries++;
+	}
+
+	if (wait_retries > 0)
+		elog(LOG,
+			 "storage write waited for in-flight remap on relation %u/%u/%u fork %d block %u (%d retries, %d usec)",
+			 reln->smgr_rlocator.locator.spcOid,
+			 reln->smgr_rlocator.locator.dbOid,
+			 reln->smgr_rlocator.locator.relNumber,
+			 forknum, lblkno,
+			 wait_retries,
+			 wait_retries * UMBRA_WRITE_BARRIER_WAIT_USEC);
+}
+
+/*
+ * Release the first nbarriers entries of the barriers array, in order.
+ */
+static void
+um_release_write_barriers(MapInflightBarrier *barriers, BlockNumber nbarriers)
+{
+	MapInflightBarrier *stop = barriers + nbarriers;
+
+	for (MapInflightBarrier *cur = barriers; cur < stop; cur++)
+		MapInflightReleaseBarrier(cur);
+}
+
+/*
+ * Write out the pending run of *run_blocks buffers, starting at
+ * buffers[run_start_idx] and physical block run_start_pblk, then release the
+ * barriers held for that run and reset *run_blocks to 0.  An empty run is a
+ * no-op.
+ */
+static void
+um_flush_write_barrier_run(UmbraFileContext *ctx, ForkNumber forknum,
+						   BlockNumber run_start_pblk,
+						   const void **buffers,
+						   BlockNumber run_start_idx,
+						   BlockNumber *run_blocks,
+						   bool skipFsync,
+						   MapInflightBarrier *barriers)
+{
+	BlockNumber pending = *run_blocks;
+
+	if (pending > 0)
+	{
+		umfile_writev(ctx, forknum, run_start_pblk, buffers + run_start_idx,
+					  pending, skipFsync);
+		um_release_write_barriers(barriers, pending);
+		*run_blocks = 0;
+	}
+}
+
+/*
+ * Write nblocks logical blocks starting at blocknum from buffers.
+ *
+ * Forks without MAP are written directly.  For mapped forks each logical
+ * block first claims its in-flight write barrier and is then resolved in
+ * write mode.  Blocks whose physical block lies below the materialized
+ * physical EOF are gathered into runs of consecutive physical blocks and
+ * written with one umfile_writev() per run; a block at or beyond that EOF
+ * flushes the pending run and is written with umfile_extend().  Barriers are
+ * released after the corresponding physical write, or in the PG_CATCH block
+ * if an error is raised.  Finally the recorded physical size is bumped past
+ * the highest block that was extended.
+ *
+ * NOTE(review): run_blocks, run_barriers and pending_barrier are modified
+ * inside PG_TRY and read in PG_CATCH without being declared volatile --
+ * confirm against the PG_TRY usage rules in elog.h.
+ */
+void
+umwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void **buffers, BlockNumber nblocks, bool skipFsync)
+{
+ UmbraAccessState access;
+ UmbraAccessLookupState lookup_state = {0};
+ UmbraFileContext *ctx = um_ctx_acquire(reln);
+ BlockNumber materialized_nblocks = 0;
+ BlockNumber max_extended_pblk = InvalidBlockNumber;
+ BlockNumber run_start_pblk = InvalidBlockNumber;
+ BlockNumber run_start_idx = 0;
+ BlockNumber run_blocks = 0;
+ MapInflightBarrier run_barriers[PG_IOV_MAX];
+ MapInflightBarrier pending_barrier = {0};
+
+ access = um_classify_access(reln, forknum);
+ if (!access.map_available)
+ {
+ umfile_writev(ctx, forknum, blocknum, buffers, nblocks,
+ skipFsync);
+ return;
+ }
+
+ /* If the size cannot be obtained it stays 0 and every block extends. */
+ (void) MapSBlockTryGetPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, &materialized_nblocks);
+
+ PG_TRY();
+ {
+ for (BlockNumber i = 0; i < nblocks; i++)
+ {
+ BlockNumber lblk = blocknum + i;
+ BlockNumber pblk;
+ UmbraAccessResolveResult resolve;
+ bool can_extend_run;
+
+ pending_barrier.valid = false;
+ pending_barrier.slot_id = -1;
+ pending_barrier.entry_idx = -1;
+
+ /*
+ * The barrier serializes physical writes with concurrent remap publication for
+ * the same logical block. Claim before lookup so a later relocation
+ * cannot publish a new mapping while this write is still targeting the
+ * old physical page.
+ */
+ um_claim_write_barrier(reln, forknum, ctx, lblk, &pending_barrier);
+
+ /* NOTE(review): resolve is only read by the Assert below. */
+ resolve = um_resolve_lblk_for_access(reln, forknum, &access,
+ &lookup_state,
+ lblk,
+ UMBRA_ACCESS_RESOLVE_WRITE,
+ &pblk);
+ Assert(resolve == UMBRA_ACCESS_RESOLVED_PBLK);
+
+ /*
+ * Checksum identity stays logical (lblk); callers reaching smgrwritev()
+ * have already set it before Umbra translates to physical blocks.
+ */
+
+ /*
+ * A run may grow only if this pblk directly follows it, the barrier
+ * array still has room, and the run stays inside one segment.
+ */
+ can_extend_run =
+ (run_blocks > 0) &&
+ (pblk == run_start_pblk + run_blocks) &&
+ (run_blocks < (BlockNumber) lengthof(run_barriers)) &&
+ ((run_start_pblk % ((BlockNumber) RELSEG_SIZE)) + run_blocks <
+ ((BlockNumber) RELSEG_SIZE));
+
+ if (pblk < materialized_nblocks)
+ {
+ if (run_blocks == 0)
+ {
+ run_start_pblk = pblk;
+ run_start_idx = i;
+ }
+ else if (!can_extend_run)
+ {
+ um_flush_write_barrier_run(ctx, forknum, run_start_pblk,
+ buffers, run_start_idx,
+ &run_blocks, skipFsync,
+ run_barriers);
+ run_start_pblk = pblk;
+ run_start_idx = i;
+ }
+
+ /* Hand the claimed barrier over to the run; it is released on flush. */
+ Assert(run_blocks < (BlockNumber) lengthof(run_barriers));
+ run_barriers[run_blocks] = pending_barrier;
+ pending_barrier.valid = false;
+ run_blocks++;
+ continue;
+ }
+
+ /* Block lies at or beyond the physical EOF: flush, then extend. */
+ um_flush_write_barrier_run(ctx, forknum, run_start_pblk,
+ buffers, run_start_idx,
+ &run_blocks, skipFsync, run_barriers);
+ run_start_pblk = InvalidBlockNumber;
+
+ umfile_extend(ctx, forknum, pblk, buffers[i], skipFsync);
+ MapInflightReleaseBarrier(&pending_barrier);
+
+ if (max_extended_pblk == InvalidBlockNumber || pblk > max_extended_pblk)
+ max_extended_pblk = pblk;
+ if (materialized_nblocks < pblk + 1)
+ materialized_nblocks = pblk + 1;
+ }
+
+ um_flush_write_barrier_run(ctx, forknum, run_start_pblk,
+ buffers, run_start_idx,
+ &run_blocks, skipFsync, run_barriers);
+ }
+ PG_CATCH();
+ {
+ /* Drop the barrier being processed and those of the unflushed run. */
+ MapInflightReleaseBarrier(&pending_barrier);
+ um_release_write_barriers(run_barriers, run_blocks);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ if (max_extended_pblk != InvalidBlockNumber)
+ MapSBlockBumpPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, max_extended_pblk + 1,
+ InvalidXLogRecPtr);
+
+ /* Sanity check only: the written range must lie inside the logical EOF. */
+ if (nblocks > 0)
+ {
+ BlockNumber logical_nblocks;
+
+ logical_nblocks = umnblocks_for_access(reln, forknum, &access);
+ Assert(logical_nblocks >= blocknum + nblocks);
+ }
+}
+
+/*
+ * umwriteback() -- forward a writeback request for a logical block range.
+ *
+ * If the handle has no usable MAP for this fork, the logical range is handed
+ * to umfile_writeback() unchanged.  Otherwise every logical block is resolved
+ * on its own, and physically consecutive results are merged so that
+ * umfile_writeback() is called once per contiguous physical run.  A block
+ * that resolves to UMBRA_ACCESS_RESOLVED_SKIP is not written back and ends
+ * the run collected so far.
+ */
+void
+umwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+			BlockNumber nblocks)
+{
+	UmbraFileContext *ctx = um_ctx_acquire(reln);
+	UmbraAccessState access = um_classify_access(reln, forknum);
+	UmbraAccessLookupState lookup_state = {0};
+	BlockNumber pending_start = InvalidBlockNumber;
+	BlockNumber pending_len = 0;
+	BlockNumber offset;
+
+	if (!access.map_available)
+	{
+		umfile_writeback(ctx, forknum, blocknum, nblocks);
+		return;
+	}
+
+	for (offset = 0; offset < nblocks; offset++)
+	{
+		UmbraAccessResolveResult resolve;
+		BlockNumber pblk;
+
+		resolve = um_resolve_lblk_for_access(reln, forknum, &access,
+											 &lookup_state,
+											 blocknum + offset,
+											 UMBRA_ACCESS_RESOLVE_WRITEBACK,
+											 &pblk);
+
+		if (resolve != UMBRA_ACCESS_RESOLVED_SKIP)
+		{
+			Assert(resolve == UMBRA_ACCESS_RESOLVED_PBLK);
+
+			/* Physically adjacent to the pending run: just grow it. */
+			if (pending_len > 0 && pblk == pending_start + pending_len)
+			{
+				pending_len++;
+				continue;
+			}
+		}
+
+		/* The pending run (if any) ends here; hand it off. */
+		if (pending_len > 0)
+			umfile_writeback(ctx, forknum, pending_start, pending_len);
+
+		if (resolve == UMBRA_ACCESS_RESOLVED_SKIP)
+		{
+			/* pblk is not consulted for skipped blocks. */
+			pending_start = InvalidBlockNumber;
+			pending_len = 0;
+		}
+		else
+		{
+			pending_start = pblk;
+			pending_len = 1;
+		}
+	}
+
+	/* Flush whatever is left after the last block. */
+	if (pending_len > 0)
+		umfile_writeback(ctx, forknum, pending_start, pending_len);
+}
+
+/*
+ * umnblocks() -- block count of a fork as seen through this handle.
+ *
+ * Classifies the handle's access state for the fork and lets
+ * umnblocks_for_access() produce the answer for that state.
+ */
+BlockNumber
+umnblocks(SMgrRelation reln, ForkNumber forknum)
+{
+	UmbraAccessState state = um_classify_access(reln, forknum);
+
+	return umnblocks_for_access(reln, forknum, &state);
+}
+
+/*
+ * umnblocks_cached() -- return the block count cached on the handle.
+ *
+ * Only reads reln->smgr_cached_nblocks[forknum]; nothing is recomputed here.
+ */
+BlockNumber
+umnblocks_cached(SMgrRelation reln, ForkNumber forknum)
+{
+	BlockNumber cached = reln->smgr_cached_nblocks[forknum];
+
+	return cached;
+}
+
+/*
+ * umnblocks_for_access() -- block count for an already classified access.
+ *
+ * Without a usable MAP the count comes from umfile_nblocks() in
+ * UMFILE_NBLOCKS_DENSE mode.  With a MAP, the logical block count is fetched
+ * through MapSBlockTryGetLogicalNblocks(); if that fails the relation is
+ * reported as corrupted (ERRCODE_DATA_CORRUPTED).
+ */
+static BlockNumber
+umnblocks_for_access(SMgrRelation reln, ForkNumber forknum,
+					 const UmbraAccessState *access)
+{
+	UmbraFileContext *ctx = um_ctx_acquire(reln);
+	BlockNumber logical_nblocks;
+
+	if (access->map_available)
+	{
+		if (!MapSBlockTryGetLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+										   forknum, &logical_nblocks))
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("missing or invalid MAP superblock for relation %u/%u/%u fork %d",
+							reln->smgr_rlocator.locator.spcOid,
+							reln->smgr_rlocator.locator.dbOid,
+							reln->smgr_rlocator.locator.relNumber,
+							forknum)));
+
+		return logical_nblocks;
+	}
+
+	return umfile_nblocks(ctx, forknum, UMFILE_NBLOCKS_DENSE);
+}
+
+/*
+ * umpretruncate() -- preparation step run ahead of umtruncate().
+ *
+ * For forks under MAP translation whose handle either requires the MAP or
+ * reports it available, asks MapPreloadTruncatePages() to preload for the new
+ * length.  Other forks need no preparation.
+ *
+ * old_blocks and truncate_lsn are accepted for the callback signature but
+ * are not used here.
+ */
+void
+umpretruncate(SMgrRelation reln, ForkNumber forknum,
+			  BlockNumber old_blocks, BlockNumber nblocks,
+			  XLogRecPtr truncate_lsn)
+{
+	UmbraFileContext *ctx = um_ctx_acquire(reln);
+	UmbraAccessState access = um_classify_access(reln, forknum);
+
+	(void) old_blocks;
+	(void) truncate_lsn;
+
+	if (!um_fork_uses_map_translation(forknum))
+		return;
+
+	if (!access.map_available &&
+		access.policy != UMBRA_MAP_POLICY_REQUIRE_MAP)
+		return;
+
+	MapPreloadTruncatePages(ctx, reln->smgr_rlocator.locator,
+							forknum, nblocks);
+}
+
+/*
+ * umtruncate() -- truncate a fork to nblocks.
+ *
+ * Forks under MAP translation whose handle either requires the MAP or
+ * reports it available are truncated in MAP state: MapTruncate() and
+ * MapSBlockSetLogicalNblocks() are both stamped with the replay pointer when
+ * in recovery, or with the WAL write pointer otherwise, and the pages
+ * preloaded for the truncate are released afterwards.
+ *
+ * Every other fork is truncated through umfile_truncate().
+ */
+void
+umtruncate(SMgrRelation reln, ForkNumber forknum,
+		   BlockNumber old_blocks, BlockNumber nblocks)
+{
+	UmbraFileContext *ctx = um_ctx_acquire(reln);
+	UmbraAccessState access = um_classify_access(reln, forknum);
+	XLogRecPtr	map_lsn;
+	bool		map_owned;
+
+	map_owned = um_fork_uses_map_translation(forknum) &&
+		(access.map_available ||
+		 access.policy == UMBRA_MAP_POLICY_REQUIRE_MAP);
+
+	if (!map_owned)
+	{
+		/* Non-mapped forks (and MAP fork itself) truncate physically. */
+		umfile_truncate(ctx, forknum, old_blocks, nblocks);
+		return;
+	}
+
+	if (InRecovery)
+		map_lsn = GetXLogReplayRecPtr(NULL);
+	else
+		map_lsn = GetXLogWriteRecPtr();
+
+	MapTruncate(ctx, reln->smgr_rlocator.locator,
+				forknum, nblocks, map_lsn);
+	MapSBlockSetLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+							   forknum, nblocks, map_lsn);
+	MapReleasePreloadedTruncatePages(reln->smgr_rlocator.locator, forknum);
+}
+
+/*
+ * umimmedsync() -- synchronously sync one fork.
+ *
+ * For the Umbra metadata fork, MapCheckpointRelation() is run for the
+ * relation before the file-level sync; all forks then go through
+ * umfile_immedsync().
+ */
+void
+umimmedsync(SMgrRelation reln, ForkNumber forknum)
+{
+	UmbraFileContext *file_ctx = um_ctx_acquire(reln);
+	bool		is_metadata_fork = (forknum == UMBRA_METADATA_FORKNUM);
+
+	if (is_metadata_fork)
+	{
+		MapCheckpointRelation(reln->smgr_rlocator.locator);
+	}
+
+	umfile_immedsync(file_ctx, forknum);
+}
+
+/*
+ * umregistersync() -- smgr "register sync" callback for Umbra.
+ *
+ * Nothing is queued here: the call is forwarded to umimmedsync(), so the
+ * fork is synced before this function returns.
+ */
+void
+umregistersync(SMgrRelation reln, ForkNumber forknum)
+{
+ umimmedsync(reln, forknum);
+}
+
+/*
+ * umpreparependingsync() -- prepare a relation for pending-sync handling.
+ *
+ * When the relation's locator is currently skipping WAL, its MAP and
+ * superblock are rebuilt first via UmRebuildMapAndSuperblockForSkipWAL().
+ * The return value is whatever um_relation_requires_durable_sync() reports
+ * for the relation afterwards.
+ */
+bool
+umpreparependingsync(SMgrRelation reln)
+{
+	bool		skipping_wal;
+
+	skipping_wal = RelFileLocatorSkippingWAL(reln->smgr_rlocator.locator);
+	if (skipping_wal)
+	{
+		UmRebuildMapAndSuperblockForSkipWAL(reln);
+	}
+
+	return um_relation_requires_durable_sync(reln);
+}
+
+/*
+ * umneedsrecoveryfsmvacuum() -- should replay run the post-truncate FSM
+ * vacuum step for this relation?
+ *
+ * Always answers "no", independent of reln.
+ *
+ * FSM is not WAL-logged.  During replay, Umbra already publishes the
+ * truncate result through mapped metadata, and auxiliary stale reads are
+ * handled with EOF-like semantics.  The generic post-truncate FSM vacuum
+ * would therefore only tidy up, at the price of making recovery walk stale
+ * upper-tree pages through the MAIN-oriented buffer model.
+ *
+ * So the replay-only cleanup is skipped, and later foreground FSM
+ * maintenance is left to refresh upper-level slots naturally.
+ */
+bool
+umneedsrecoveryfsmvacuum(SMgrRelation reln)
+{
+	const bool	needs_vacuum = false;
+
+	(void) reln;
+
+	return needs_vacuum;
+}
+
+/*
+ * umfd() -- return the file descriptor and offset for a block.
+ *
+ * smgrfd() is only used by the AIO reopen path, after the issuer has
+ * already resolved logical identity to a concrete physical target.
+ * blocknum is therefore taken as a physical block number here, and the
+ * corresponding segment/offset is reopened directly with no MAP lookup.
+ */
+int
+umfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
+{
+	UmbraFileContext *ctx = um_ctx_acquire(reln);
+
+	return umfile_fd(ctx, forknum, blocknum, off);
+}
+
+/*
+ * umsyncfiletag() -- sync the file identified by a FileTag.
+ *
+ * The file's path is written into *path by um_filetag_path().  Returns -1 if
+ * the file cannot be opened, otherwise the result of FileSync().  errno as
+ * left by FileSync() is restored after the file has been closed.
+ */
+int
+umsyncfiletag(const FileTag *ftag, char *path)
+{
+	int			result = -1;
+	File		vfd;
+
+	um_filetag_path(ftag, path);
+
+	vfd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+	if (vfd >= 0)
+	{
+		int			sync_errno;
+
+		result = FileSync(vfd, WAIT_EVENT_DATA_FILE_SYNC);
+
+		/* FileClose() must not disturb the errno reported to the caller. */
+		sync_errno = errno;
+		FileClose(vfd);
+		errno = sync_errno;
+	}
+
+	return result;
+}
+
+/*
+ * umunlinkfiletag() -- unlink the file identified by a FileTag.
+ *
+ * The file's path is written into *path by um_filetag_path(); the return
+ * value is that of unlink().
+ */
+int
+umunlinkfiletag(const FileTag *ftag, char *path)
+{
+	int			rc;
+
+	um_filetag_path(ftag, path);
+	rc = unlink(path);
+
+	return rc;
+}
+
+/*
+ * umfiletagmatches() -- does candidate fall under the filter tag ftag?
+ *
+ * InvalidForkNumber and InvalidBlockNumber act as wildcards in ftag:
+ *
+ *	fork and segment both wildcard, spcOid and relNumber zero:
+ *		database scope (DROP DATABASE / MOVE DATABASE paths); only dbOid
+ *		is compared.
+ *	fork and segment both wildcard otherwise:
+ *		relation scope; the whole locator is compared.
+ *	segment wildcard only:
+ *		fork scope; locator and fork are compared.
+ *	anything else:
+ *		exact match on locator, fork and segment.
+ */
+bool
+umfiletagmatches(const FileTag *ftag, const FileTag *candidate)
+{
+	const bool	any_fork = (ftag->forknum == InvalidForkNumber);
+	const bool	any_segment = (ftag->segno == InvalidBlockNumber);
+
+	if (any_fork && any_segment)
+	{
+		if (ftag->rlocator.spcOid == 0 && ftag->rlocator.relNumber == 0)
+			return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
+
+		return RelFileLocatorEquals(ftag->rlocator, candidate->rlocator);
+	}
+
+	/* Fork scope and exact match share the locator and fork comparison. */
+	if (!RelFileLocatorEquals(ftag->rlocator, candidate->rlocator))
+		return false;
+	if (ftag->forknum != candidate->forknum)
+		return false;
+
+	return any_segment || ftag->segno == candidate->segno;
+}
diff --git a/src/backend/storage/smgr/umfile.c b/src/backend/storage/smgr/umfile.c
index 17145405cf..63afc8546c 100644
--- a/src/backend/storage/smgr/umfile.c
+++ b/src/backend/storage/smgr/umfile.c
@@ -1620,7 +1620,6 @@ umfile_zeroextend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknu
nblocks -= numblocks;
blocknum += numblocks;
}
-
}
bool
@@ -1832,7 +1831,6 @@ umfile_readv(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
buffers += nblocks_this_segment;
blocknum += nblocks_this_segment;
}
-
}
void
@@ -1897,12 +1895,10 @@ umfile_startreadv_physical(PgAioHandle *ioh, UmbraFileContext *ctx,
* Umbra MAP translation enforces single-block I/O via ummaxcombine().
*/
Assert(nblocks >= 1);
- {
- v = umfile_getseg(ctx, ctx->rlocator,
- forknum, physical_blocknum, false /* skipFsync */,
- UM_EXTENSION_FAIL,
- RelFileLocatorBackendIsTemp(ctx->rlocator));
- }
+ v = umfile_getseg(ctx, ctx->rlocator,
+ forknum, physical_blocknum, false /* skipFsync */,
+ UM_EXTENSION_FAIL,
+ RelFileLocatorBackendIsTemp(ctx->rlocator));
seekpos = (off_t) BLCKSZ * (physical_blocknum % ((BlockNumber) RELSEG_SIZE));
Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
@@ -1926,10 +1922,8 @@ umfile_startreadv_physical(PgAioHandle *ioh, UmbraFileContext *ctx,
* Preserve logical identity for AIO completion reporting and reopen.
* The started I/O uses physical addressing (file/seekpos).
*/
- {
- ret = FileStartReadV(ioh, v->umfd_vfd, iovcnt, seekpos,
- WAIT_EVENT_DATA_FILE_READ);
- }
+ ret = FileStartReadV(ioh, v->umfd_vfd, iovcnt, seekpos,
+ WAIT_EVENT_DATA_FILE_READ);
if (ret != 0)
ereport(ERROR,
(errcode_for_file_access(),
@@ -2026,7 +2020,6 @@ umfile_writev(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
buffers += nblocks_this_segment;
blocknum += nblocks_this_segment;
}
-
}
void
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index e19f0d3e51..1b9cc0ecf3 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -3873,7 +3873,17 @@ RelationSetNewRelfilenumber(Relation relation, char persistence)
SMgrRelation srel;
srel = RelationCreateStorage(newrlocator, persistence, true);
- smgrclose(srel);
+
+ /*
+ * Keep the newly created storage handle as the relation's default
+ * smgr binding. The creator has already seeded the correct MAP policy
+ * on this handle, and subsequent writes in the same command should not
+ * fall back to shape-based reopening.
+ */
+ if (relation->rd_smgr != NULL)
+ RelationCloseSmgr(relation);
+ relation->rd_smgr = srel;
+ smgrpin(srel);
}
else
{
diff --git a/src/include/storage/aio_types.h b/src/include/storage/aio_types.h
index 17b59aeed7..d07911673c 100644
--- a/src/include/storage/aio_types.h
+++ b/src/include/storage/aio_types.h
@@ -63,7 +63,8 @@ typedef union PgAioTargetData
struct
{
RelFileLocator rlocator; /* physical relation identifier */
- BlockNumber blockNum; /* blknum relative to begin of reln */
+ BlockNumber blockNum; /* logical blknum relative to begin of reln */
+ BlockNumber physBlockNum; /* physical blknum for executing this IO */
BlockNumber nblocks;
ForkNumber forkNum:8; /* don't waste 4 byte for four values */
bool is_temp:1; /* proc can be inferred by owning AIO */
diff --git a/src/include/storage/map.h b/src/include/storage/map.h
index b4f6063f35..ccbc392835 100644
--- a/src/include/storage/map.h
+++ b/src/include/storage/map.h
@@ -77,6 +77,9 @@ typedef struct MapPage
uint32 pblknos[MAP_ENTRIES_PER_PAGE];
} MapPage;
+#define MAP_PENDING_BITS_PER_WORD 64
+#define MAP_PENDING_BITMAP_WORDS \
+ ((MAP_ENTRIES_PER_PAGE + MAP_PENDING_BITS_PER_WORD - 1) / MAP_PENDING_BITS_PER_WORD)
/* Shared memory control structure */
typedef struct MapSharedData
@@ -109,12 +112,20 @@ typedef struct MapBufferDesc
XLogRecPtr page_lsn; /* LSN of last modification */
int id; /* slot ID */
pg_atomic_uint32 state; /* state flags */
+ uint32 pending_count; /* in-flight remaps protected by pending_bits */
+ uint64 pending_bits[MAP_PENDING_BITMAP_WORDS];
int freeNext; /* next buffer in free list */
int wait_backend_pid; /* backend PID of pin-count waiter */
LWLock buffer_lock; /* lock for buffer content access */
LWLock io_in_progress_lock; /* lock for buffer I/O state */
} MapBufferDesc;
+/*
+ * Caller-owned handle passed to MapInflightTryClaimBarrier() and
+ * MapInflightReleaseBarrier().  The write path resets it to
+ * {false, -1, -1} before each claim.
+ */
+typedef struct MapInflightBarrier
+{
+ bool valid; /* cleared by callers when there is nothing to release */
+ int slot_id; /* -1 when unset; NOTE(review): presumably a MAP buffer slot id -- confirm */
+ int entry_idx; /* -1 when unset; NOTE(review): presumably the entry within that MAP page -- confirm */
+} MapInflightBarrier;
extern void MapBackendInit(void);
extern const ShmemCallbacks MapShmemCallbacks;
@@ -128,10 +139,34 @@ extern BlockNumber MapTryLookupPblkRun(UmbraFileContext *map_ctx,
ForkNumber forknum,
BlockNumber lblkno,
BlockNumber maxblocks,
- BlockNumber *start_pblkno);/* Buffer management */
+ BlockNumber *start_pblkno);
+extern bool MapReserveFreshPblkno(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ BlockNumber *new_pblkno);
+extern bool MapInflightLookupOwnedPblk(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno, BlockNumber *pblkno);
+extern bool MapInflightTryClaimBarrier(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ MapInflightBarrier *barrier);
+extern void MapInflightReleaseBarrier(MapInflightBarrier *barrier);
+extern void MapInflightRelease(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber lblkno);
+/* Buffer management used by direct mapping publication helpers. */
extern int MapReadBuffer(UmbraFileContext *map_ctx, RelFileLocator rnode,
ForkNumber forknum, BlockNumber map_blkno);
+/* Mapping publication helpers. */
+extern void MapGetNewPbkno(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber *new_pblkno, BlockNumber *old_pblkno);
+extern void MapSetMapping(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber new_pblkno, XLogRecPtr map_lsn);
+
/* MAP superblock helpers */
extern void MapSBlockInit(UmbraFileContext *map_ctx, RelFileLocator rnode,
XLogRecPtr map_lsn);
diff --git a/src/include/storage/map_internal.h b/src/include/storage/map_internal.h
index 8a2ee89deb..acac29b018 100644
--- a/src/include/storage/map_internal.h
+++ b/src/include/storage/map_internal.h
@@ -21,8 +21,31 @@ extern bool MapStartBufferIO(MapBufferDesc *buf, uint32 required_bits);
extern void MapTerminateBufferIO(MapBufferDesc *buf, bool clear_dirty,
uint32 set_flag_bits);
extern void MapFlushBuffer(int slot_id);
+extern void MapInflightCleanupOwned(void);
+extern void MapInflightBackendInit(void);
extern void MapResetAllTruncatePreloads(void);
extern BlockNumber MapForkPageIndexToMapBlkno(ForkNumber forknum,
BlockNumber fork_page_idx);
extern BlockNumber MapLblknoToMapBlkno(ForkNumber forknum, BlockNumber lblkno);
+extern bool MapReserveNextPblkno(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber *new_pblkno, bool nowait);
+extern bool MapTryReserveFreshPblkno(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ BlockNumber *new_pblkno,
+ bool nowait);
+extern bool MapInflightTryClaim(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno);
+extern void MapInflightFinishClaim(RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ BlockNumber pblkno);
+extern bool MapInflightBitIsSet(RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno);
+
#endif /* MAP_INTERNAL_H */
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 47dbf12643..b7f95ed5d3 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -14,6 +14,7 @@
#ifndef SMGR_H
#define SMGR_H
+#include "access/xlogdefs.h"
#include "lib/ilist.h"
#include "storage/aio_types.h"
#include "storage/block.h"
@@ -21,11 +22,11 @@
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
- * cached storage-manager handles for a relation. An SMgrRelation is created
- * (if not already present) by smgropen(), and destroyed by smgrdestroy().
- * Note that neither of these operations imply I/O, they just create or destroy
- * a hashtable entry. (But smgrdestroy() may release associated resources,
- * such as OS-level file descriptors.)
+ * cached file handles. An SMgrRelation is created (if not already present)
+ * by smgropen(), and destroyed by smgrdestroy(). Note that neither of these
+ * operations imply I/O, they just create or destroy a hashtable entry. (But
+ * smgrdestroy() may release associated resources, such as OS-level file
+ * descriptors.)
*
* An SMgrRelation may be "pinned", to prevent it from being destroyed while
* it's in use. We use this to prevent pointers in relcache to smgr from being
@@ -113,18 +114,34 @@ extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum);
extern BlockNumber smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum);
+extern void smgrbumpcachednblocks(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber nblocks);
+extern bool smgrisinternalfork(ForkNumber forknum);
extern void smgrcreaterelationmetadata(SMgrRelation reln);
extern void smgrcopyrelationmetadata(SMgrRelation src, SMgrRelation dst,
char relpersistence);
extern void smgrsyncrelationmetadata(SMgrRelation reln);
extern void smgrunlinkrelationmetadata(RelFileLocatorBackend rlocator,
bool isRedo);
+extern void smgrsetmapstate(SMgrRelation reln, uint8 map_state);
extern bool smgrcreatedballowswallog(void);
+extern void smgrinitnewrelation(SMgrRelation reln, bool needs_wal);
+extern void smgrredocreatefork(SMgrRelation reln, ForkNumber forknum,
+ XLogRecPtr lsn);
extern void smgrcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
const Oid *tablespace_ids);
extern void smgrinvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
const Oid *tablespace_ids);
extern void smgrinvalidatedatabase(Oid dbid);
+extern void smgrregistershutdowncleanup(void);
+extern void smgrmarkskipwalpending(RelFileLocator rlocator);
+extern void smgrclearskipwalpending(RelFileLocator rlocator);
+extern bool smgrpreparependingsync(SMgrRelation reln);
+extern bool smgrneedsrecoveryfsmvacuum(SMgrRelation reln);
+extern void smgrpretruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
+ BlockNumber *old_nblocks,
+ BlockNumber *nblocks,
+ XLogRecPtr truncate_lsn);
extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
BlockNumber *old_nblocks,
BlockNumber *nblocks);
@@ -150,7 +167,8 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
extern void pgaio_io_set_target_smgr(PgAioHandle *ioh,
SMgrRelationData *smgr,
ForkNumber forknum,
- BlockNumber blocknum,
+ BlockNumber logical_blocknum,
+ BlockNumber physical_blocknum,
int nblocks,
bool skip_fsync);
diff --git a/src/include/storage/um_defs.h b/src/include/storage/um_defs.h
index 3b567a397e..b7ad5f4490 100644
--- a/src/include/storage/um_defs.h
+++ b/src/include/storage/um_defs.h
@@ -1,14 +1,14 @@
/*-------------------------------------------------------------------------
*
* um_defs.h
- * Umbra low-level fork and metadata path definitions.
+ * Umbra low-level fork and metadata path definitions.
*
- * This header contains storage-layout facts shared by Umbra submodules.
- *
- * src/include/storage/um_defs.h
+ * This header intentionally contains only storage-layout facts shared by
+ * Umbra submodules. Higher-level MAP policy stays in umbra.h/umbra.c.
*
*-------------------------------------------------------------------------
*/
+
#ifndef UM_DEFS_H
#define UM_DEFS_H
@@ -16,13 +16,15 @@
#include "common/relpath.h"
#include "storage/relfilelocator.h"
+#include "storage/smgr.h"
/*
- * Umbra reserves an extra fork slot for relation-local metadata. This lives
- * outside PostgreSQL's built-in fork numbering so ordinary smgr loops do not
- * try to process it implicitly.
+ * Umbra internal metadata fork numbering.
+ *
+ * The numeric value still matches the historical MAP slot, but the definition
+ * lives here so low-level file/map code does not depend on umbra.h.
*/
-#define UMBRA_METADATA_FORKNUM ((ForkNumber) (INIT_FORKNUM + 1))
+#define UMBRA_METADATA_FORKNUM ((int) INIT_FORKNUM + 1)
#define UMBRA_FORK_SLOTS (UMBRA_METADATA_FORKNUM + 1)
static inline RelPathStr
diff --git a/src/include/storage/umbra.h b/src/include/storage/umbra.h
index b41fae75ea..0702f7b392 100644
--- a/src/include/storage/umbra.h
+++ b/src/include/storage/umbra.h
@@ -1,25 +1,61 @@
/*-------------------------------------------------------------------------
*
* umbra.h
- * Umbra storage manager public interface declarations.
+ * Umbra storage manager public interface declarations.
*
* This header declares the Umbra smgr callback surface used by smgr.c when
* the build is configured with --with-umbra.
*
- * src/include/storage/umbra.h
- *
*-------------------------------------------------------------------------
*/
+
#ifndef UMBRA_H
#define UMBRA_H
#include "storage/aio_types.h"
#include "storage/block.h"
+#include "common/relpath.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
#include "storage/sync.h"
#include "storage/um_defs.h"
+/*
+ * Umbra MAP policy.
+ *
+ * This is the handle-local Umbra access state. Mapped forks must not silently
+ * fall back to direct physical addressing unless upper layers explicitly allow
+ * it.
+ */
+typedef enum UmbraMapPolicy
+{
+ UMBRA_MAP_POLICY_UNKNOWN = 0, /* zero value; NOTE(review): presumably "not classified yet" -- confirm */
+ UMBRA_MAP_POLICY_BYPASS_MAP, /* NOTE(review): presumably direct physical addressing -- confirm */
+ UMBRA_MAP_POLICY_SKIP_WAL_PENDING_MAP, /* NOTE(review): presumably for relations currently skipping WAL -- confirm */
+ UMBRA_MAP_POLICY_REQUIRE_MAP, /* truncate paths take the MAP route even if the MAP is not reported available */
+} UmbraMapPolicy;
+
+/*
+ * UmbraForkUsesMapTranslation
+ *		True for the forks Umbra keeps under mapping translation: MAIN, FSM
+ *		and VM.  Every other fork number yields false.
+ *
+ * Of these, only MAIN uses the more involved page-WAL-owned first-born
+ * protocol.  FSM/VM are auxiliary mapped forks with a more explicit producer
+ * set and a simpler traced-extend model.
+ */
+static inline bool
+UmbraForkUsesMapTranslation(ForkNumber forknum)
+{
+	switch (forknum)
+	{
+		case MAIN_FORKNUM:
+		case FSM_FORKNUM:
+		case VISIBILITYMAP_FORKNUM:
+			return true;
+		default:
+			return false;
+	}
+}
+
+/*
+ * UmbraForkIsAuxiliaryMapped
+ *		True for the FSM and VM forks only.
+ */
+static inline bool
+UmbraForkIsAuxiliaryMapped(ForkNumber forknum)
+{
+	if (forknum == FSM_FORKNUM)
+		return true;
+
+	return forknum == VISIBILITYMAP_FORKNUM;
+}
+
extern bool UmMetadataExists(SMgrRelation reln);
extern bool UmMetadataOpenOrCreate(SMgrRelation reln, bool isRedo, bool *created);
extern BlockNumber UmMetadataNblocks(SMgrRelation reln);
@@ -31,20 +67,28 @@ extern void UmMetadataWriteSuperblock(RelFileLocatorBackend rlocator,
extern void UmMetadataExtend(SMgrRelation reln, BlockNumber blkno,
const void *buffer, bool skipFsync);
extern void UmMetadataImmediateSync(SMgrRelation reln);
+extern void UmMetadataRegisterSync(SMgrRelation reln);
extern void UmMetadataUnlink(RelFileLocatorBackend rlocator, bool isRedo);
-extern void UmInvalidateDatabase(Oid dbid);
+/* Umbra storage manager functionality (smgr callbacks). */
extern void uminit(void);
+extern void umbeforeshmemexitcleanup(void);
extern void umopen(SMgrRelation reln);
extern void umclose(SMgrRelation reln, ForkNumber forknum);
extern void umdestroy(SMgrRelation reln);
-extern bool umisinternalfork(ForkNumber forknum);
extern bool umcreatedballowswallog(void);
+extern void uminitnewrelation(SMgrRelation reln, bool needs_wal);
+extern void umsetmapstate(SMgrRelation reln, uint8 map_state);
+extern void ummarkskipwalpending(SMgrRelation reln);
+extern void umclearskipwalpending(SMgrRelation reln);
+extern bool umisinternalfork(ForkNumber forknum);
extern void umcreaterelationmetadata(SMgrRelation reln);
+extern void umredocreatefork(SMgrRelation reln, ForkNumber forknum,
+ XLogRecPtr lsn);
extern void umcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
const Oid *tablespace_ids);
extern void uminvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
- const Oid *tablespace_ids);
+ const Oid *tablespace_ids);
extern void umcopyrelationmetadata(SMgrRelation src, SMgrRelation dst,
char relpersistence);
extern void umsyncrelationmetadata(SMgrRelation reln);
@@ -55,8 +99,17 @@ extern bool umexists(SMgrRelation reln, ForkNumber forknum);
extern void umunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
extern void umextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, const void *buffer, bool skipFsync);
+extern bool umapplyreservedrange(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber firstblock, BlockNumber nblocks,
+ const BlockNumber *pblknos,
+ XLogRecPtr lsn, bool skipFsync);
extern void umzeroextend(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, int nblocks, bool skipFsync);
+extern void UmApplyReservedRangeRemap(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber firstblock, BlockNumber nblocks,
+ const BlockNumber *pblknos,
+ XLogRecPtr lsn, bool skipFsync);
+extern void UmRebuildMapAndSuperblockForSkipWAL(SMgrRelation reln);
extern bool umprefetch(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, int nblocks);
extern uint32 ummaxcombine(SMgrRelation reln, ForkNumber forknum,
@@ -64,22 +117,59 @@ extern uint32 ummaxcombine(SMgrRelation reln, ForkNumber forknum,
extern void umreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void **buffers, BlockNumber nblocks);
extern void umstartreadv(PgAioHandle *ioh,
- SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum,
+ SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void **buffers, BlockNumber nblocks);
extern void umwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
const void **buffers, BlockNumber nblocks, bool skipFsync);
extern void umwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks);
-extern BlockNumber umnblocks(SMgrRelation reln, ForkNumber forknum);
+extern void umpretruncate(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber old_blocks, BlockNumber nblocks,
+ XLogRecPtr truncate_lsn);
extern void umtruncate(SMgrRelation reln, ForkNumber forknum,
BlockNumber old_blocks, BlockNumber nblocks);
extern void umimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void umregistersync(SMgrRelation reln, ForkNumber forknum);
-extern int umfd(SMgrRelation reln, ForkNumber forknum,
- BlockNumber blocknum, uint32 *off);
+extern bool umpreparependingsync(SMgrRelation reln);
+extern bool umneedsrecoveryfsmvacuum(SMgrRelation reln);
+extern int umfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
extern int umsyncfiletag(const FileTag *ftag, char *path);
extern int umunlinkfiletag(const FileTag *ftag, char *path);
extern bool umfiletagmatches(const FileTag *ftag, const FileTag *candidate);
+/*
+ * Runtime semantic helpers.
+ *
+ * These consume Umbra access semantics (bypass/require-map/skip-pending) and
+ * expose the runtime answers upper layers use directly.
+ */
+extern BlockNumber umphysicalblock(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno);
+extern BlockNumber umnblocks(SMgrRelation reln, ForkNumber forknum);
+extern BlockNumber umnblocks_cached(SMgrRelation reln, ForkNumber forknum);
+
+/*
+ * MAP fact / mutation helpers used by WAL and replay code.
+ *
+ * These expose mapping facts and mapping-state updates only. Runtime read-miss
+ * interpretation stays in umbra.c.
+ */
+extern void UmMapGetNewPbkno(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno, BlockNumber *new_pblkno,
+ BlockNumber *old_pblkno);
+extern void UmMapReserveFreshPbkno(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno,
+ BlockNumber *new_pblkno);
+extern bool UmMapAccessAvailable(SMgrRelation reln, ForkNumber forknum);
+extern bool UmWalOwnedRemapAvailable(SMgrRelation reln, ForkNumber forknum);
+extern bool UmWalOwnedFirstbornAvailable(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno);
+extern bool UmMapTryLookupPblkno(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno, BlockNumber *pblkno);
+extern bool UmMapIsLogicalUnmaterialized(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno);
+extern void UmMapSetMapping(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber lblkno, BlockNumber new_pblkno,
+ XLogRecPtr map_lsn);
+
#endif /* UMBRA_H */
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 0cbdf133ca..0abe8ff1a1 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -63,6 +63,7 @@ tests += {
't/052_checkpoint_segment_missing.pl',
't/053_umbra_map_superblock_watermark.pl',
't/054_umbra_map_fork_policy.pl',
+ 't/061_umbra_fsm_vm_map_translation.pl',
't/063_umbra_mainfork_head_unlink_checkpoint.pl',
],
},
diff --git a/src/test/recovery/t/061_umbra_fsm_vm_map_translation.pl b/src/test/recovery/t/061_umbra_fsm_vm_map_translation.pl
new file mode 100644
index 0000000000..607afcb01e
--- /dev/null
+++ b/src/test/recovery/t/061_umbra_fsm_vm_map_translation.pl
@@ -0,0 +1,102 @@
+# Verify FSM/VM forks participate in UMBRA MAP translation.
+#
+# In UMBRA mode this test checks that FSM and VM logical block 0 both get a
+# valid mapping entry in relation MAP fork (entry != 0xFFFFFFFF), and that
+# both entries are still readable and valid after an immediate shutdown and
+# restart.
+#
+# In md mode, MAP fork does not exist and the test is skipped.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+plan skip_all => 'requires --with-umbra MAP fork'
+  unless check_pg_config('^#define USE_UMBRA 1$');
+
+# Return the first four bytes of MAP fork block $map_page of umb_fsm_vm_t as
+# a hex string.  The query passes missing_ok = true and folds a NULL result
+# into '', so an empty string means no bytes came back.
+sub read_map_entry
+{
+	my ($node, $map_page) = @_;
+
+	return $node->safe_psql(
+		'postgres', qq{
+SELECT COALESCE(
+  encode(
+    pg_read_binary_file(
+      pg_relation_filepath('umb_fsm_vm_t') || '_map',
+      current_setting('block_size')::int * $map_page,
+      4,
+      true),
+    'hex'),
+  '');
+});
+}
+
+my $node = PostgreSQL::Test::Cluster->new('master');
+$node->init();
+$node->append_conf(
+	'postgresql.conf', qq{
+autovacuum = off
+});
+$node->start();
+
+$node->safe_psql(
+	'postgres', q{
+CREATE TABLE umb_fsm_vm_t(id int, payload text);
+INSERT INTO umb_fsm_vm_t
+SELECT g, repeat('x', 2000) FROM generate_series(1, 15000) g;
+VACUUM (FREEZE, ANALYZE) umb_fsm_vm_t;
+});
+
+my $fsm_size = $node->safe_psql(
+	'postgres', q{
+SELECT pg_relation_size('umb_fsm_vm_t', 'fsm');
+});
+my $vm_size = $node->safe_psql(
+	'postgres', q{
+SELECT pg_relation_size('umb_fsm_vm_t', 'vm');
+});
+
+cmp_ok($fsm_size, '>', 0, 'FSM fork has at least one page');
+cmp_ok($vm_size, '>', 0, 'VM fork has at least one page');
+
+# Under the current proportional MAP layout:
+#   block 0 = superblock
+#   block 1 = FSM map page 0
+#   block 2 = VM map page 0
+my $fsm_map_page = 1;
+my $vm_map_page = 2;
+
+my $fsm_map_entry = read_map_entry($node, $fsm_map_page);
+my $vm_map_entry = read_map_entry($node, $vm_map_page);
+
+isnt($fsm_map_entry, '', 'FSM map entry is readable');
+isnt($vm_map_entry, '', 'VM map entry is readable');
+isnt($fsm_map_entry, 'ffffffff', 'FSM block 0 has a valid map entry');
+isnt($vm_map_entry, 'ffffffff', 'VM block 0 has a valid map entry');
+
+# Stop without a clean shutdown, restart, and read the same entries again.
+$node->stop('immediate');
+$node->start();
+
+my $fsm_map_entry_after_restart = read_map_entry($node, $fsm_map_page);
+my $vm_map_entry_after_restart = read_map_entry($node, $vm_map_page);
+
+# An unreadable MAP file yields '', which trivially differs from 'ffffffff'.
+# Check readability first so the validity checks below cannot pass
+# vacuously.
+isnt($fsm_map_entry_after_restart, '',
+	'FSM map entry is readable after restart');
+isnt($vm_map_entry_after_restart, '',
+	'VM map entry is readable after restart');
+isnt($fsm_map_entry_after_restart, 'ffffffff',
+	'FSM block 0 map entry survives restart');
+isnt($vm_map_entry_after_restart, 'ffffffff',
+	'VM block 0 map entry survives restart');
+
+done_testing();
--
2.50.1 (Apple Git-155)
| From | Date | Subject | |
|---|---|---|---|
| Next Message | Mingwei Jia | 2026-06-01 23:33:37 | [RFC PATCH v2 RESEND 07/10] umbra: add patch 6 WAL records, mapped birth, and redo state machine |
| Previous Message | Mingwei Jia | 2026-06-01 23:33:35 | [RFC PATCH v2 RESEND 05/10] umbra: add patch 4 shared-memory MAP cache and checkpoint flush |