[RFC PATCH v2 RESEND 05/10] umbra: add patch 4 shared-memory MAP cache and checkpoint flush

From: Mingwei Jia <i(at)nayishan(dot)top>
To: pgsql-hackers(at)lists(dot)postgresql(dot)org
Subject: [RFC PATCH v2 RESEND 05/10] umbra: add patch 4 shared-memory MAP cache and checkpoint flush
Date: 2026-06-01 23:33:35
Message-ID: 20260601233340.67949-4-i@nayishan.top
Views: Whole Thread | Raw Message | Download mbox | Resend email
Thread:
Lists: pgsql-hackers

---
src/backend/access/transam/xlog.c | 6 +
src/backend/commands/dbcommands.c | 19 +
src/backend/storage/map/Makefile | 4 +
src/backend/storage/map/map.c | 1047 ++++++-
src/backend/storage/map/mapbuf.c | 414 +++
src/backend/storage/map/mapclock.c | 457 +++
src/backend/storage/map/mapflush.c | 665 ++++
src/backend/storage/map/mapinit.c | 143 +
src/backend/storage/map/mapsuper.c | 1259 +++++++-
src/backend/storage/map/meson.build | 4 +
src/backend/storage/smgr/smgr.c | 52 +-
src/backend/storage/smgr/umbra.c | 339 ++-
src/backend/storage/smgr/umfile.c | 2700 ++++++++++++-----
src/backend/storage/sync/sync.c | 12 +-
.../utils/activity/wait_event_names.txt | 1 +
src/backend/utils/init/postinit.c | 8 +-
src/include/storage/lwlocklist.h | 1 +
src/include/storage/map.h | 247 +-
src/include/storage/map_internal.h | 28 +
src/include/storage/mapsuper.h | 28 +-
src/include/storage/mapsuper_internal.h | 157 +
src/include/storage/smgr.h | 6 +
src/include/storage/subsystemlist.h | 3 +
src/include/storage/sync.h | 3 +
src/include/storage/umbra.h | 12 +
src/include/storage/umfile.h | 118 +-
src/test/recovery/meson.build | 3 +
.../t/053_umbra_map_superblock_watermark.pl | 104 +
.../recovery/t/054_umbra_map_fork_policy.pl | 62 +
...3_umbra_mainfork_head_unlink_checkpoint.pl | 60 +
30 files changed, 6991 insertions(+), 971 deletions(-)
create mode 100644 src/backend/storage/map/mapbuf.c
create mode 100644 src/backend/storage/map/mapclock.c
create mode 100644 src/backend/storage/map/mapflush.c
create mode 100644 src/backend/storage/map/mapinit.c
create mode 100644 src/include/storage/map_internal.h
create mode 100644 src/include/storage/mapsuper_internal.h
create mode 100644 src/test/recovery/t/053_umbra_map_superblock_watermark.pl
create mode 100644 src/test/recovery/t/054_umbra_map_fork_policy.pl
create mode 100644 src/test/recovery/t/063_umbra_mainfork_head_unlink_checkpoint.pl

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f85b528608..d1bf13b951 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -98,6 +98,9 @@
#include "storage/spin.h"
#include "storage/subsystems.h"
#include "storage/sync.h"
+#ifdef USE_UMBRA
+#include "storage/map.h"
+#endif
#include "utils/guc_hooks.h"
#include "utils/guc_tables.h"
#include "utils/injection_point.h"
@@ -8062,6 +8065,9 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
CheckPointSUBTRANS();
CheckPointMultiXact();
CheckPointPredicate();
+#ifdef USE_UMBRA
+ MapCheckpoint();
+#endif
CheckPointBuffers(flags);

/* Perform all queued up fsyncs */
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index f0819d15ab..8751886bb6 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -1059,6 +1059,17 @@ createdb(ParseState *pstate, const CreatedbStmt *stmt)
errhint("Valid strategies are \"wal_log\" and \"file_copy\".")));
}

+ /*
+ * Umbra currently supports only the legacy file-copy CREATE DATABASE copy
+ * semantics.
+ *
+ * Accept STRATEGY = WAL_LOG for compatibility, but route through the
+ * file-copy path until Umbra owns a WAL_LOG implementation.
+ */
+ if (dbstrategy == CREATEDB_WAL_LOG &&
+ !smgrcreatedballowswallog())
+ dbstrategy = CREATEDB_FILE_COPY;
+
/* If encoding or locales are defaulted, use source's setting */
if (encoding < 0)
encoding = src_encoding;
@@ -1873,6 +1884,7 @@ dropdb(const char *dbname, bool missing_ok, bool force)
* dirty buffer to the dead database later...
*/
DropDatabaseBuffers(db_id);
+ smgrinvalidatedatabase(db_id);

/*
* Tell checkpointer to forget any pending fsync and unlink requests for
@@ -2164,6 +2176,7 @@ movedb(const char *dbname, const char *tblspcname)
* src_tblspcoid, but bufmgr.c presently provides no API for that.
*/
DropDatabaseBuffers(db_id);
+ smgrinvalidatedatabasetablespaces(db_id, 1, &src_tblspcoid);

/*
* Check for existence of files in the target directory, i.e., objects of
@@ -3370,9 +3383,12 @@ dbase_redo(XLogReaderState *record)
* up-to-date for the copy.
*/
FlushDatabaseBuffers(xlrec->src_db_id);
+ smgrcheckpointdatabasetablespaces(xlrec->src_db_id, 1,
+ &xlrec->src_tablespace_id);

/* Close all smgr fds in all backends. */
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
+ smgrreleaseall();

/*
* Copy this subdirectory to the new location
@@ -3431,6 +3447,8 @@ dbase_redo(XLogReaderState *record)

/* Drop pages for this database that are in the shared buffer cache */
DropDatabaseBuffers(xlrec->db_id);
+ smgrinvalidatedatabasetablespaces(xlrec->db_id, xlrec->ntablespaces,
+ xlrec->tablespace_ids);

/* Also, clean out any fsync requests that might be pending in md.c */
ForgetDatabaseSyncRequests(xlrec->db_id);
@@ -3440,6 +3458,7 @@ dbase_redo(XLogReaderState *record)

/* Close all smgr fds in all backends. */
WaitForProcSignalBarrier(EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE));
+ smgrreleaseall();

for (i = 0; i < xlrec->ntablespaces; i++)
{
diff --git a/src/backend/storage/map/Makefile b/src/backend/storage/map/Makefile
index ee9603de14..08c3b69679 100644
--- a/src/backend/storage/map/Makefile
+++ b/src/backend/storage/map/Makefile
@@ -14,6 +14,10 @@ include $(top_builddir)/src/Makefile.global

OBJS = \
map.o \
+ mapinit.o \
+ mapbuf.o \
+ mapflush.o \
+ mapclock.o \
mapsuper.o

include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/map/map.c b/src/backend/storage/map/map.c
index 563f38b21a..1c74aa94ef 100644
--- a/src/backend/storage/map/map.c
+++ b/src/backend/storage/map/map.c
@@ -1,10 +1,10 @@
/*-------------------------------------------------------------------------
*
* map.c
- * Umbra metadata-fork disk layout helpers.
+ * physical map layer implementation
*
- * This file contains address-translation and in-page access routines for the
- * metadata fork disk layout.
+ * This module owns MAP metadata layout helpers and shared cache/checkpoint
+ * support for MAP pages.
*
* src/backend/storage/map/map.c
*
@@ -12,47 +12,155 @@
*/
#include "postgres.h"

+#include "access/xlog.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "common/hashfn.h"
+#include "common/relpath.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/freespace.h"
#include "storage/map.h"
-#include "storage/um_defs.h"
+#include "storage/map_internal.h"
+#include "storage/mapsuper.h"
+#include "storage/mapsuper_internal.h"
+#include "storage/procnumber.h"
+#include "storage/shmem.h"
+#include "storage/sync.h"
+#include "storage/umfile.h"
+#include "utils/memutils.h"

-void
-MapPageInit(MapPage *page)
+/*
+ * Process-local (file-static) bookkeeping for MAP buffers that were pinned
+ * ahead of a truncate by MapPreloadTruncatePages().
+ *
+ * While "active" is true, slots[0 .. nslots-1] hold MAP buffer slot ids
+ * returned by MapReadBuffer(); MapTruncatePreloadResetEntry() unpins them.
+ * The slots array is allocated in TopMemoryContext and grown by doubling.
+ */
+typedef struct MapTruncatePreloadState
+{
+ bool active; /* entry currently tracks preloaded slots */
+ RelFileLocator rnode; /* relation the preloaded pages belong to */
+ ForkNumber forknum; /* fork the preloaded pages belong to */
+ int nslots; /* number of valid elements in slots[] */
+ int capacity; /* allocated length of slots[] */
+ int *slots; /* MAP buffer slot ids, or NULL if never allocated */
+} MapTruncatePreloadState;
+
+/* One preload entry per fork number; indexed directly by ForkNumber. */
+static MapTruncatePreloadState MapTruncatePreload[MAX_FORKNUM + 1];
+
+
+/*
+ * Outcome of a cache-only MAP entry lookup (see MapTryLookupCachedEntry()).
+ */
+typedef enum MapCachedLookupResult
{
- Assert(page != NULL);
+ MAP_CACHED_LOOKUP_MISS, /* page not cached, or cached slot was stale */
+ MAP_CACHED_LOOKUP_UNMAPPED, /* page cached, entry is InvalidBlockNumber */
+ MAP_CACHED_LOOKUP_MAPPED /* page cached, entry holds a physical block */
+} MapCachedLookupResult;
+
+/*
+ * Internal functions.
+ *
+ * Parameter names here match the definitions below; in particular the third
+ * argument of MapTruncateEntryRange() is the fork's previous logical block
+ * count, not a number of MAP pages.
+ */
+static bool MapTablespaceSelected(Oid spcOid, int ntablespaces,
+ const Oid *tablespace_ids);
+static bool MapTruncateEntryRange(ForkNumber forknum, BlockNumber n_lblknos,
+ BlockNumber old_n_lblknos,
+ BlockNumber *start_map_page,
+ int *start_entry_idx,
+ BlockNumber *end_map_page,
+ int *end_entry_idx);
+
+/*
+ * MapTruncatePreloadResetEntry - release one preload entry.
+ *
+ * Does nothing when the entry is inactive. Otherwise unpins every tracked
+ * MAP buffer slot and returns the entry to the inactive state. The slots
+ * array itself is kept (not freed) so a later preload can reuse it.
+ */
+static void
+MapTruncatePreloadResetEntry(MapTruncatePreloadState *state)
+{
+ int i;
+
+ if (!state->active)
+ return;
+
+ /* Drop the pins taken by MapPreloadTruncatePages(). */
+ for (i = 0; i < state->nslots; i++)
+ MapUnpinBuffer(state->slots[i]);

- MemSet(page->pblknos, 0xFF, sizeof(page->pblknos));
+ /* Clear the identity; capacity and slots are intentionally left alone. */
+ state->active = false;
+ state->nslots = 0;
+ state->forknum = InvalidForkNumber;
+ memset(&state->rnode, 0, sizeof(state->rnode));
}

-BlockNumber
-MapPageGetEntry(const MapPage *page, int entry_idx)
+/*
+ * MapTruncatePreloadEntry - fetch the preload entry for a fork.
+ *
+ * Returns the per-fork entry. If that entry is active but tracks a different
+ * relation (or fork), it is reset first, which unpins its buffers. An entry
+ * that already matches, or is inactive, is returned unchanged.
+ */
+static MapTruncatePreloadState *
+MapTruncatePreloadEntry(RelFileLocator rnode, ForkNumber forknum)
{
- Assert(page != NULL);
+ MapTruncatePreloadState *state;
+
+ Assert(forknum >= 0 && forknum <= MAX_FORKNUM);
+ state = &MapTruncatePreload[forknum];

- if (entry_idx < 0 || entry_idx >= MAP_ENTRIES_PER_PAGE)
- elog(ERROR, "map entry index %d is out of range", entry_idx);
+ /* Stale entry for another relation: release its pins before reuse. */
+ if (state->active &&
+ (!RelFileLocatorEquals(state->rnode, rnode) ||
+ state->forknum != forknum))
+ MapTruncatePreloadResetEntry(state);

- return page->pblknos[entry_idx];
+ return state;
}

+BlockNumber MapForkPageIndexToMapBlkno(ForkNumber forknum,
+ BlockNumber fork_page_idx);
+BlockNumber MapLblknoToMapBlkno(ForkNumber forknum, BlockNumber lblkno);
+static bool MapDecodeMapBlkno(BlockNumber map_blkno, ForkNumber *forknum,
+ BlockNumber *fork_page_idx);
+static bool MapMapPageWithinLogicalRange(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber map_blkno);
+static MapCachedLookupResult MapTryLookupCachedEntry(RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber map_blkno,
+ int entry_idx,
+ bool adjust_usage,
+ BlockNumber *pblkno);
+static MapCachedLookupResult MapTryLookupCachedPblknoInternal(RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ bool adjust_usage,
+ BlockNumber *pblkno);
+/*
+ * MapResetAllTruncatePreloads - forget every truncate preload entry.
+ *
+ * Marks all per-fork entries inactive and empties their slot lists.
+ *
+ * NOTE(review): unlike MapTruncatePreloadResetEntry(), this does not call
+ * MapUnpinBuffer() on the tracked slots and leaves rnode/forknum untouched.
+ * Presumably callers use it only where the pins are released by other means
+ * (e.g. error cleanup) -- confirm, otherwise the pins are leaked.
+ */
void
-MapPageSetEntry(MapPage *page, int entry_idx, BlockNumber pblkno)
+MapResetAllTruncatePreloads(void)
+{
+ int slot_id;
+
+ /* slot_id here indexes the per-fork array, not a MAP buffer slot. */
+ for (slot_id = 0; slot_id <= MAX_FORKNUM; slot_id++)
+ {
+ MapTruncatePreload[slot_id].active = false;
+ MapTruncatePreload[slot_id].nslots = 0;
+ }
+}
+
+
+/*
+ * MapTruncateEntryRange - compute which MAP entries a truncate invalidates.
+ *
+ * Given the new logical block count (n_lblknos) and the previous one
+ * (old_n_lblknos), reports the inclusive range of entries covering logical
+ * blocks [n_lblknos, old_n_lblknos - 1].
+ *
+ * Returns false, leaving the outputs untouched, when the fork is not
+ * shrinking. Otherwise returns true and sets:
+ * *start_map_page / *start_entry_idx - first page and entry to clear
+ * *end_map_page / *end_entry_idx - last page and entry to clear
+ *
+ * Despite their names, the page outputs are fork-relative page indexes;
+ * callers convert them with MapForkPageIndexToMapBlkno().
+ */
+static bool
+MapTruncateEntryRange(ForkNumber forknum, BlockNumber n_lblknos,
+ BlockNumber old_n_lblknos,
+ BlockNumber *start_map_page,
+ int *start_entry_idx,
+ BlockNumber *end_map_page,
+ int *end_entry_idx)
{
- Assert(page != NULL);
+ BlockNumber start_page_idx;
+ BlockNumber end_page_idx;

- if (entry_idx < 0 || entry_idx >= MAP_ENTRIES_PER_PAGE)
- elog(ERROR, "map entry index %d is out of range", entry_idx);
+ /* forknum does not influence the computation; silence unused warnings. */
+ (void) forknum;

- page->pblknos[entry_idx] = pblkno;
+ /* Nothing to invalidate unless the logical size actually shrinks. */
+ if (old_n_lblknos <= n_lblknos)
+ return false;
+
+ /* old_n_lblknos > n_lblknos >= 0 here, so old_n_lblknos - 1 is safe. */
+ start_page_idx = n_lblknos / MAP_ENTRIES_PER_PAGE;
+ end_page_idx = (old_n_lblknos - 1) / MAP_ENTRIES_PER_PAGE;
+
+ *start_map_page = start_page_idx;
+ *start_entry_idx = n_lblknos % MAP_ENTRIES_PER_PAGE;
+ *end_map_page = end_page_idx;
+ *end_entry_idx = (old_n_lblknos - 1) % MAP_ENTRIES_PER_PAGE;
+ return true;
}

BlockNumber
MapForkPageIndexToMapBlkno(ForkNumber forknum, BlockNumber fork_page_idx)
{
- uint64 group_no;
- uint64 blkno64;
+ uint64 group_no;
+ uint64 blkno64;

if (forknum == UMBRA_METADATA_FORKNUM)
- elog(ERROR, "Umbra metadata fork cannot be addressed as a map target");
+ elog(ERROR, "Umbra metadata fork should not call MapForkPageIndexToMapBlkno");

switch (forknum)
{
@@ -71,8 +179,7 @@ MapForkPageIndexToMapBlkno(ForkNumber forknum, BlockNumber fork_page_idx)

case MAIN_FORKNUM:
{
- uint64 group_page_idx = (uint64) fork_page_idx;
-
+ uint64 group_page_idx = (uint64) fork_page_idx;
group_no = group_page_idx / (uint64) MAP_GROUP_MAIN_PAGES;
blkno64 = (uint64) MAP_BLOCK_FIRST_GROUP +
group_no * (uint64) MAP_GROUP_TOTAL_PAGES +
@@ -83,19 +190,26 @@ MapForkPageIndexToMapBlkno(ForkNumber forknum, BlockNumber fork_page_idx)
}

default:
- elog(ERROR, "unsupported fork number %d in map layout", (int) forknum);
- pg_unreachable();
+ elog(ERROR, "unsupported fork number %d in map lookup", (int) forknum);
+ return 0;
}

if (blkno64 > (uint64) MaxBlockNumber)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("cannot address map page %u for fork %d",
+ errmsg("cannot address map page %u for fork %d in MAP",
fork_page_idx, forknum)));

return (BlockNumber) blkno64;
}

+/*
+ * MapLblknoToMapBlkno - convert (forknum, lblkno) to linear MAP entry index.
+ *
+ * The metadata fork stores repeated proportional groups:
+ * [FSM page][VM page][8192 MAIN pages].
+ * Each fork page still maps MAP_ENTRIES_PER_PAGE logical blocks.
+ */
BlockNumber
MapLblknoToMapBlkno(ForkNumber forknum, BlockNumber lblkno)
{
@@ -110,22 +224,19 @@ MapLblknoToMapBlkno(ForkNumber forknum, BlockNumber lblkno)
if (entry64 > (uint64) MaxBlockNumber)
ereport(ERROR,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("cannot address logical block %u for fork %d in map",
+ errmsg("cannot address logical block %u for fork %d in MAP",
lblkno, forknum)));

return (BlockNumber) entry64;
}

-bool
+static bool
MapDecodeMapBlkno(BlockNumber map_blkno, ForkNumber *forknum,
BlockNumber *fork_page_idx)
{
- uint64 offset;
- uint64 group_no;
- uint64 in_group;
-
- Assert(forknum != NULL);
- Assert(fork_page_idx != NULL);
+ uint64 offset;
+ uint64 group_no;
+ uint64 in_group;

if (map_blkno == MAP_BLOCK_SUPER || map_blkno < MAP_BLOCK_FIRST_GROUP)
return false;
@@ -160,3 +271,869 @@ MapDecodeMapBlkno(BlockNumber map_blkno, ForkNumber *forknum,

return false;
}
+
+/*
+ * MapMapPageWithinLogicalRange
+ *      Does MAP page "map_blkno" cover at least one logical block of
+ *      "forknum" that lies below the fork's recorded logical block count?
+ *
+ * When the superblock cannot supply a logical block count the page is
+ * treated as in range (true).  A page that does not decode to a fork page,
+ * or that decodes to a different fork, is out of range (false).
+ */
+static bool
+MapMapPageWithinLogicalRange(UmbraFileContext *map_ctx, RelFileLocator rnode,
+                             ForkNumber forknum, BlockNumber map_blkno)
+{
+    BlockNumber logical_nblocks;
+    ForkNumber  decoded_fork;
+    BlockNumber decoded_page_idx;
+
+    /* No superblock answer: be permissive. */
+    if (!MapSBlockTryGetLogicalNblocks(map_ctx, rnode, forknum,
+                                       &logical_nblocks))
+        return true;
+
+    /* The page must decode, and must belong to the requested fork. */
+    if (!MapDecodeMapBlkno(map_blkno, &decoded_fork, &decoded_page_idx) ||
+        decoded_fork != forknum)
+        return false;
+
+    /* In range iff the first logical block covered by the page exists. */
+    return ((uint64) decoded_page_idx * (uint64) MAP_ENTRIES_PER_PAGE) <
+        (uint64) logical_nblocks;
+}
+
+/*
+ * MapTryLookupCachedEntry
+ *      Fetch one MAP entry from the shared cache; never performs I/O.
+ *
+ * The caller passes the already-decoded MAP page number and the entry index
+ * within that page.  Possible results:
+ *   MAP_CACHED_LOOKUP_MISS     - page not cached, or the slot no longer holds
+ *                                the requested page by the time it is locked
+ *   MAP_CACHED_LOOKUP_UNMAPPED - page cached, entry is InvalidBlockNumber
+ *   MAP_CACHED_LOOKUP_MAPPED   - page cached, *pblkno receives the entry
+ *
+ * *pblkno is written only for MAP_CACHED_LOOKUP_MAPPED.
+ */
+static MapCachedLookupResult
+MapTryLookupCachedEntry(RelFileLocator rnode, ForkNumber forknum,
+                        BlockNumber map_blkno, int entry_idx,
+                        bool adjust_usage, BlockNumber *pblkno)
+{
+    int         slot = MapCacheLookup(rnode, forknum, map_blkno);
+    MapBufferDesc *desc;
+    BlockNumber entry = InvalidBlockNumber;
+    bool        identity_ok;
+
+    if (slot < 0)
+        return MAP_CACHED_LOOKUP_MISS;
+
+    desc = &MapBuffers[slot];
+    MapPinBuffer(slot, adjust_usage);
+    LWLockAcquire(&desc->buffer_lock, LW_SHARED);
+
+    /* Re-verify the slot's identity now that it is pinned and locked. */
+    identity_ok = (desc->page_number == map_blkno &&
+                   desc->page_number >= 0 &&
+                   RelFileLocatorEquals(desc->rnode, rnode) &&
+                   desc->forknum == forknum);
+    if (identity_ok)
+        entry = MapGetPage(slot)->pblknos[entry_idx];
+
+    LWLockRelease(&desc->buffer_lock);
+    MapUnpinBuffer(slot);
+
+    if (!identity_ok)
+        return MAP_CACHED_LOOKUP_MISS;
+    if (entry == InvalidBlockNumber)
+        return MAP_CACHED_LOOKUP_UNMAPPED;
+
+    *pblkno = entry;
+    return MAP_CACHED_LOOKUP_MAPPED;
+}
+
+/*
+ * MapTryLookupCachedPblknoInternal
+ *      Cache-only lookup keyed by logical block number.
+ *
+ * Splits the linear MAP entry number for (forknum, lblkno) into a MAP page
+ * and an entry index, then defers to MapTryLookupCachedEntry().  The
+ * metadata fork has no MAP entries and always reports a miss.
+ */
+static MapCachedLookupResult
+MapTryLookupCachedPblknoInternal(RelFileLocator rnode, ForkNumber forknum,
+                                 BlockNumber lblkno, bool adjust_usage,
+                                 BlockNumber *pblkno)
+{
+    BlockNumber entry_no;
+
+    Assert(pblkno != NULL);
+
+    if (forknum == UMBRA_METADATA_FORKNUM)
+        return MAP_CACHED_LOOKUP_MISS;
+
+    entry_no = MapLblknoToMapBlkno(forknum, lblkno);
+
+    return MapTryLookupCachedEntry(rnode, forknum,
+                                   entry_no / MAP_ENTRIES_PER_PAGE,
+                                   (int) (entry_no % MAP_ENTRIES_PER_PAGE),
+                                   adjust_usage, pblkno);
+}
+
+/*
+ * MapTryLookup
+ *      Translate a logical block number into its physical block number.
+ *
+ * Returns true and stores the physical block in *pblkno when the entry is
+ * mapped.  Returns false when the entry is unmapped; on the uncached path
+ * *pblkno is then left holding InvalidBlockNumber.
+ *
+ * The shared cache is consulted first; only on a miss is the MAP page
+ * brought in through MapReadBuffer().  Raises an error for the metadata
+ * fork, which has no MAP entries.
+ */
+bool
+MapTryLookup(UmbraFileContext *map_ctx, RelFileLocator rnode, ForkNumber forknum,
+             BlockNumber lblkno, BlockNumber *pblkno)
+{
+    MapCachedLookupResult cached;
+    BlockNumber entry_no;
+    BlockNumber map_page_no;
+    int         entry_off;
+    int         slot;
+    MapBufferDesc *desc;
+    MapPage    *map_page;
+
+    Assert(pblkno != NULL);
+
+    if (forknum == UMBRA_METADATA_FORKNUM)
+        elog(ERROR, "MapTryLookup does not accept Umbra metadata fork");
+
+    /* Fast path: answer straight from the cache, without any I/O. */
+    cached = MapTryLookupCachedPblknoInternal(rnode, forknum, lblkno,
+                                              true, pblkno);
+    switch (cached)
+    {
+        case MAP_CACHED_LOOKUP_MAPPED:
+            return true;
+        case MAP_CACHED_LOOKUP_UNMAPPED:
+            return false;
+        case MAP_CACHED_LOOKUP_MISS:
+            break;
+    }
+
+    /* Slow path: locate the MAP page and entry, then load the page. */
+    entry_no = MapLblknoToMapBlkno(forknum, lblkno);
+    map_page_no = entry_no / MAP_ENTRIES_PER_PAGE;
+    entry_off = entry_no % MAP_ENTRIES_PER_PAGE;
+
+    /* MapReadBuffer() hands back the slot already pinned. */
+    slot = MapReadBuffer(map_ctx, rnode, forknum, map_page_no);
+    desc = &MapBuffers[slot];
+    map_page = MapGetPage(slot);
+
+    /* Sanity check on the buffer state word before trusting the page. */
+    if ((pg_atomic_read_u32(&desc->state) & MAPBUF_VALID_MASK) == 0)
+        elog(ERROR, "map buffer not pinned");
+
+    LWLockAcquire(&desc->buffer_lock, LW_SHARED);
+    *pblkno = map_page->pblknos[entry_off];
+    LWLockRelease(&desc->buffer_lock);
+    MapUnpinBuffer(slot);
+
+    return *pblkno != InvalidBlockNumber;
+}
+
+/*
+ * MapTryLookupPblkRun - find the longest contiguous mapped pblk run.
+ *
+ * Returns the number of blocks in the run beginning at lblkno, up to
+ * maxblocks. Returns 0 if the first entry is unmapped or if the metadata
+ * fork does not exist. When the result is non-zero, *start_pblkno holds the
+ * physical block of lblkno; it is not written otherwise.
+ *
+ * A run ends at the first unmapped entry, at the first entry whose physical
+ * block is not exactly one past the previous one, or just before the run
+ * would cross a RELSEG_SIZE boundary measured from *start_pblkno.
+ *
+ * This batches translation by MAP page, so callers don't need a full
+ * MapTryLookup() round trip for every block in a contiguous run.
+ */
+BlockNumber
+MapTryLookupPblkRun(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber maxblocks, BlockNumber *start_pblkno)
+{
+ BlockNumber current_lblk = lblkno;
+ BlockNumber remaining = maxblocks;
+ BlockNumber run_blocks = 0;
+ BlockNumber expected_next_pblk = InvalidBlockNumber;
+ BlockNumber current_map_blkno = InvalidBlockNumber;
+ int current_slot = -1; /* pinned MAP buffer slot, or -1 if none */
+
+ Assert(start_pblkno != NULL);
+ Assert(maxblocks > 0);
+
+ if (forknum == UMBRA_METADATA_FORKNUM)
+ elog(ERROR, "MapTryLookupPblkRun does not accept Umbra metadata fork");
+
+ /* Without a metadata fork there is nothing mapped at all. */
+ if (!umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+ UMFILE_EXISTS_DENSE))
+ return 0;
+
+ /* Each iteration scans the entries of one MAP page. */
+ while (remaining > 0)
+ {
+ BlockNumber map_entry_no;
+ BlockNumber map_blkno;
+ int entry_idx;
+ int entries_this_page;
+ MapBufferDesc *buf;
+ MapPage *page;
+
+ /* Split the linear entry number into MAP page and in-page index. */
+ map_entry_no = MapLblknoToMapBlkno(forknum, current_lblk);
+ entry_idx = map_entry_no % MAP_ENTRIES_PER_PAGE;
+ map_blkno = map_entry_no / MAP_ENTRIES_PER_PAGE;
+ /* Never scan past the end of this page or past the caller's limit. */
+ entries_this_page = Min((BlockNumber) (MAP_ENTRIES_PER_PAGE - entry_idx),
+ remaining);
+
+ /* Switch pins when the scan moves on to a different MAP page. */
+ if (current_slot < 0 || current_map_blkno != map_blkno)
+ {
+ if (current_slot >= 0)
+ MapUnpinBuffer(current_slot);
+ current_slot = MapReadBuffer(map_ctx, rnode, forknum, map_blkno);
+ current_map_blkno = map_blkno;
+ }
+
+ buf = &MapBuffers[current_slot];
+ page = MapGetPage(current_slot);
+
+ /* Every exit from the loop below must release this lock first. */
+ LWLockAcquire(&buf->buffer_lock, LW_SHARED);
+ for (int i = 0; i < entries_this_page; i++)
+ {
+ BlockNumber pblkno = page->pblknos[entry_idx + i];
+
+ /* An unmapped entry terminates the run. */
+ if (pblkno == InvalidBlockNumber)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ goto done;
+ }
+
+ /* The first mapped entry starts the run. */
+ if (run_blocks == 0)
+ {
+ *start_pblkno = pblkno;
+ expected_next_pblk = pblkno + 1;
+ run_blocks = 1;
+ current_lblk++;
+ remaining--;
+ continue;
+ }
+
+ /* Physical blocks must be strictly consecutive. */
+ if (pblkno != expected_next_pblk)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ goto done;
+ }
+
+ /* Stop before the run would extend past a RELSEG_SIZE boundary. */
+ if (((*start_pblkno % ((BlockNumber) RELSEG_SIZE)) + run_blocks) >=
+ ((BlockNumber) RELSEG_SIZE))
+ {
+ LWLockRelease(&buf->buffer_lock);
+ goto done;
+ }
+
+ expected_next_pblk++;
+ run_blocks++;
+ current_lblk++;
+ remaining--;
+ }
+ LWLockRelease(&buf->buffer_lock);
+ }
+
+done:
+ /* Drop the pin on whichever MAP page was examined last. */
+ if (current_slot >= 0)
+ MapUnpinBuffer(current_slot);
+
+ return run_blocks;
+}
+
+
+/*
+ * MapReadBuffer - read a map page into buffer
+ *
+ * Returns the slot_id of the buffer, with the buffer pinned.
+ *
+ * The caller owns the returned buffer pin.
+ *
+ * The function loops until it either finds the page already cached or
+ * manages to claim a victim slot and re-tag it for the requested page. A
+ * page that lies beyond the metadata fork, outside the fork's logical
+ * range, or in a relation without a metadata fork is not read from disk;
+ * the buffer is filled with 0xFF bytes instead, i.e. every entry reads as
+ * InvalidBlockNumber.
+ *
+ * Raises an error for the superblock page, and when an on-disk page that
+ * the superblock says is needed turns out to be all zeros.
+ *
+ * NOTE(review): the slot is re-tagged and inserted into the cache before
+ * umfile_ctx_read() and the all-zeros check run. If either raises ERROR,
+ * nothing visible here removes the cache mapping or drops the pin, so a
+ * later cache hit could see stale or zeroed contents -- confirm that error
+ * cleanup elsewhere handles this.
+ */
+int
+MapReadBuffer(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber map_blkno)
+{
+ int slot_id;
+ uint32_t state;
+ MapPage *page;
+ MapBufferDesc *buf;
+ BlockNumber map_nblocks;
+ int old_page_number;
+ ForkNumber old_forknum;
+ RelFileLocator old_rnode;
+
+ if (map_blkno == MAP_BLOCK_SUPER)
+ elog(ERROR, "MapReadBuffer cannot be used for MAP superblock");
+
+ for (;;)
+ {
+ int existing_slot_id;
+ bool retry = false;
+
+ /* Step 1: is the page already cached? */
+ slot_id = MapCacheLookup(rnode, forknum, map_blkno);
+ if (slot_id >= 0)
+ {
+ buf = &MapBuffers[slot_id];
+
+ MapPinBuffer(slot_id, true);
+ LWLockAcquire(&buf->buffer_lock, LW_SHARED);
+
+ /* Confirm the slot still holds our page now that it is pinned. */
+ if (buf->page_number == map_blkno &&
+ buf->page_number >= 0 &&
+ RelFileLocatorEquals(buf->rnode, rnode) &&
+ buf->forknum == forknum)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ return slot_id;
+ }
+
+ /* The slot was re-tagged underneath us; start over. */
+ LWLockRelease(&buf->buffer_lock);
+ MapUnpinBuffer(slot_id);
+ continue;
+ }
+
+ /* Step 2: cache miss, so pick a victim slot to reuse. */
+ slot_id = MapClockGetBuffer();
+ buf = &MapBuffers[slot_id];
+ MapPinBuffer(slot_id, false);
+
+ LWLockAcquire(&buf->buffer_lock, LW_EXCLUSIVE);
+
+ /* The victim may turn out to hold the wanted page already. */
+ if (buf->page_number == map_blkno &&
+ buf->page_number >= 0 &&
+ RelFileLocatorEquals(buf->rnode, rnode) &&
+ buf->forknum == forknum)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ return slot_id;
+ }
+
+ /* Our pin must be the only one, else the slot is in use; retry. */
+ state = pg_atomic_read_u32(&buf->state);
+ if (MAPBUF_GET_REFCOUNT(state) != 1)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ MapUnpinBuffer(slot_id);
+ continue;
+ }
+
+ /*
+ * A dirty victim is written out first. The lock is dropped around
+ * the flush, so identity, refcount and dirty state are all rechecked
+ * afterwards.
+ */
+ if (state & MAPBUF_DIRTY)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ MapFlushBuffer(slot_id);
+
+ LWLockAcquire(&buf->buffer_lock, LW_EXCLUSIVE);
+ if (buf->page_number == map_blkno &&
+ buf->page_number >= 0 &&
+ RelFileLocatorEquals(buf->rnode, rnode) &&
+ buf->forknum == forknum)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ return slot_id;
+ }
+
+ state = pg_atomic_read_u32(&buf->state);
+ if (MAPBUF_GET_REFCOUNT(state) != 1 ||
+ (state & MAPBUF_DIRTY))
+ {
+ LWLockRelease(&buf->buffer_lock);
+ MapUnpinBuffer(slot_id);
+ continue;
+ }
+ }
+ /* Step 3: remember the old tag, then publish the new mapping. */
+ old_page_number = buf->page_number;
+ old_forknum = buf->forknum;
+ old_rnode = buf->rnode;
+ existing_slot_id = MapCacheInsert(rnode, forknum, map_blkno, slot_id);
+ /* A different slot already maps this page; retry to find it. */
+ if (existing_slot_id >= 0 && existing_slot_id != slot_id)
+ retry = true;
+ if (retry)
+ {
+ LWLockRelease(&buf->buffer_lock);
+ MapUnpinBuffer(slot_id);
+ continue;
+ }
+
+ /* Remove the mapping for the page this slot held before. */
+ if (old_page_number >= 0)
+ MapCacheDelete(old_rnode, old_forknum,
+ (BlockNumber) old_page_number, slot_id);
+
+ /* Re-tag the slot for the requested page. */
+ buf->page_number = map_blkno;
+ buf->rnode = rnode;
+ buf->forknum = forknum;
+ buf->page_lsn = 0;
+ /* presumably the arguments are (buf, bits to set, bits to clear) -- TODO confirm */
+ MapBufferUpdateStateBits(buf, MAPBUF_USAGECOUNT_ONE, 0);
+
+ /* Step 4: fill the page image. */
+ page = MapGetPage(slot_id);
+ if (umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+ UMFILE_EXISTS_DENSE))
+ {
+ map_nblocks = umfile_ctx_get_nblocks(map_ctx, UMBRA_METADATA_FORKNUM,
+ UMFILE_NBLOCKS_DENSE);
+ /* Read from disk only if the page exists and is in logical range. */
+ if (map_blkno < map_nblocks &&
+ MapMapPageWithinLogicalRange(map_ctx, rnode, forknum, map_blkno))
+ {
+ umfile_ctx_read(map_ctx, UMBRA_METADATA_FORKNUM, map_blkno,
+ (char *) page, BLCKSZ);
+ MapBufferUpdateStateBits(buf, 0, MAPBUF_NOT_MATERIALIZED);
+
+ /*
+ * An all-zeros page is an error if the superblock says this
+ * page covers live logical blocks; otherwise it is treated
+ * as a page of unmapped entries.
+ */
+ if (pg_memory_is_all_zeros(page, BLCKSZ))
+ {
+ BlockNumber n_lblknos = 0;
+ ForkNumber page_forknum;
+ BlockNumber page_idx;
+ bool need_this_page = false;
+
+ if (MapSBlockTryGetLogicalNblocks(map_ctx, rnode, forknum,
+ &n_lblknos) &&
+ n_lblknos > 0 &&
+ MapDecodeMapBlkno(map_blkno, &page_forknum, &page_idx) &&
+ page_forknum == forknum)
+ {
+ uint64 page_first_lblk =
+ (uint64) page_idx * (uint64) MAP_ENTRIES_PER_PAGE;
+
+ need_this_page = page_first_lblk < (uint64) n_lblknos;
+ }
+
+ if (need_this_page)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("MAP page %u is all-zeros for relation %u/%u/%u fork %d",
+ map_blkno, rnode.spcOid, rnode.dbOid,
+ rnode.relNumber, forknum)));
+
+ MemSet(page, 0xFF, BLCKSZ);
+ }
+ }
+ else
+ {
+ /* Not read from disk: every entry starts out unmapped. */
+ MemSet(page, 0xFF, BLCKSZ);
+ if (map_blkno >= map_nblocks)
+ MapBufferUpdateStateBits(buf, MAPBUF_NOT_MATERIALIZED, 0);
+ else
+ MapBufferUpdateStateBits(buf, 0, MAPBUF_NOT_MATERIALIZED);
+ }
+ }
+ else
+ {
+ /* No metadata fork at all: the page cannot exist on disk yet. */
+ MemSet(page, 0xFF, BLCKSZ);
+ MapBufferUpdateStateBits(buf, MAPBUF_NOT_MATERIALIZED, 0);
+ }
+
+ LWLockRelease(&buf->buffer_lock);
+ return slot_id;
+ }
+}
+
+/*
+ * MapDrop
+ *      Discard all MAP state belonging to a relation.
+ *
+ * Cached MAP pages and the superblock cache entry are invalidated first;
+ * the relation's metadata fork is then unlinked.  The relation is addressed
+ * with INVALID_PROC_NUMBER as its backend.
+ */
+void
+MapDrop(RelFileLocator rnode)
+{
+    RelFileLocatorBackend target = {
+        .locator = rnode,
+        .backend = INVALID_PROC_NUMBER
+    };
+
+    MapInvalidateRelation(rnode);
+    umfile_ctx_unlinkfork(target, UMBRA_METADATA_FORKNUM, false);
+}
+
+/*
+ * MapTruncate
+ *      Invalidate the MAP entries of logical blocks removed by a truncate.
+ *
+ * Every entry for logical blocks in [n_lblknos, previous logical size) is
+ * set to InvalidBlockNumber, and each touched MAP buffer is marked dirty
+ * with map_lsn, the LSN of the truncate WAL record.
+ *
+ * Nothing is done for the metadata fork, when the superblock cannot supply
+ * the previous logical size, or when the fork is not shrinking.  MAP pages
+ * beyond the current end of the metadata fork are not visited.
+ *
+ * Raises an error if map_lsn is invalid or the metadata fork is missing.
+ *
+ * Dirty pages are deliberately left in the cache for checkpoint/bgwriter to
+ * write out; invalidating the relation's slots here would discard the dirty
+ * state before writeback.
+ */
+void
+MapTruncate(UmbraFileContext *map_ctx, RelFileLocator rnode,
+            ForkNumber forknum, BlockNumber n_lblknos,
+            XLogRecPtr map_lsn)
+{
+    BlockNumber prev_nblocks = 0;
+    BlockNumber first_page;
+    BlockNumber last_page;
+    int         first_entry;
+    int         last_entry;
+    BlockNumber cur;
+
+    if (forknum == UMBRA_METADATA_FORKNUM)
+        return;
+
+    Assert(map_ctx != NULL);
+    Assert(map_lsn != InvalidXLogRecPtr);
+    if (map_lsn == InvalidXLogRecPtr)
+        ereport(ERROR,
+                (errcode(ERRCODE_DATA_CORRUPTED),
+                 errmsg("invalid truncate WAL LSN for relation %u/%u/%u fork %d",
+                        rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum),
+                 errdetail("truncate target logical block count: %u", n_lblknos)));
+
+    if (!umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+                                UMFILE_EXISTS_DENSE))
+    {
+        Assert(false);
+        ereport(ERROR,
+                (errcode(ERRCODE_DATA_CORRUPTED),
+                 errmsg("required MAP fork is missing during truncate for relation %u/%u/%u fork %d",
+                        rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum),
+                 errdetail("truncate target logical block count: %u", n_lblknos)));
+    }
+
+    /* Work out the affected entry range; bail out if there is none. */
+    if (!MapSBlockTryGetLogicalNblocks(map_ctx, rnode, forknum, &prev_nblocks) ||
+        !MapTruncateEntryRange(forknum, n_lblknos, prev_nblocks,
+                               &first_page, &first_entry,
+                               &last_page, &last_entry))
+        return;
+
+    for (cur = first_page; cur <= last_page; cur++)
+    {
+        BlockNumber map_blkno = MapForkPageIndexToMapBlkno(forknum, cur);
+        int         lo;
+        int         hi;
+        int         slot;
+        MapBufferDesc *desc;
+        MapPage    *map_page;
+
+        /* Stop at the first page that lies beyond the metadata fork. */
+        if (map_blkno >= umfile_ctx_get_nblocks(map_ctx, UMBRA_METADATA_FORKNUM,
+                                                UMFILE_NBLOCKS_DENSE))
+            break;
+
+        slot = MapReadBuffer(map_ctx, rnode, forknum, map_blkno);
+        desc = &MapBuffers[slot];
+        map_page = MapGetPage(slot);
+
+        /* Only the first and last pages can be partially cleared. */
+        lo = (cur == first_page) ? first_entry : 0;
+        hi = (cur == last_page) ? last_entry : (MAP_ENTRIES_PER_PAGE - 1);
+
+        LWLockAcquire(&desc->buffer_lock, LW_EXCLUSIVE);
+
+        /* 0xFF bytes make each cleared entry read as InvalidBlockNumber. */
+        MemSet(&map_page->pblknos[lo], 0xFF,
+               ((Size) (hi - lo + 1)) * sizeof(uint32));
+
+        /* Tie this MAP rewrite to the truncate's WAL record. */
+        MapMarkBufferDirty(map_ctx, desc, map_lsn);
+
+        LWLockRelease(&desc->buffer_lock);
+        MapUnpinBuffer(slot);
+    }
+}
+
+/*
+ * MapPreloadTruncatePages
+ *      Load and pin the MAP pages that a truncate to n_lblknos will modify.
+ *
+ * The pinned slots are recorded in the per-fork preload entry and stay
+ * pinned until MapReleasePreloadedTruncatePages() (or another reset of the
+ * entry) releases them.  Any previous preload held by the entry is released
+ * first.
+ *
+ * Does nothing for the metadata fork, when the metadata fork does not exist,
+ * when the superblock cannot supply the current logical size, or when the
+ * fork would not shrink.
+ */
+void
+MapPreloadTruncatePages(UmbraFileContext *map_ctx, RelFileLocator rnode,
+                        ForkNumber forknum, BlockNumber n_lblknos)
+{
+    MapTruncatePreloadState *entry;
+    BlockNumber prev_nblocks = 0;
+    BlockNumber first_page;
+    BlockNumber last_page;
+    int         first_entry;
+    int         last_entry;
+    BlockNumber cur;
+
+    if (forknum == UMBRA_METADATA_FORKNUM)
+        return;
+    if (!umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+                                UMFILE_EXISTS_DENSE))
+        return;
+
+    if (!MapSBlockTryGetLogicalNblocks(map_ctx, rnode, forknum, &prev_nblocks) ||
+        !MapTruncateEntryRange(forknum, n_lblknos, prev_nblocks,
+                               &first_page, &first_entry,
+                               &last_page, &last_entry))
+        return;
+
+    /* Start from an empty entry, then claim it for this relation/fork. */
+    entry = MapTruncatePreloadEntry(rnode, forknum);
+    MapTruncatePreloadResetEntry(entry);
+
+    entry->active = true;
+    entry->rnode = rnode;
+    entry->forknum = forknum;
+
+    for (cur = first_page; cur <= last_page; cur++)
+    {
+        BlockNumber map_blkno;
+        int         slot;
+
+        /* Grow the slot array (4, 8, 16, ...) once it is full. */
+        if (entry->nslots == entry->capacity)
+        {
+            int         grown = (entry->capacity == 0) ? 4 : entry->capacity * 2;
+            Size        nbytes = sizeof(int) * grown;
+
+            entry->slots = (entry->slots == NULL)
+                ? MemoryContextAlloc(TopMemoryContext, nbytes)
+                : repalloc(entry->slots, nbytes);
+            entry->capacity = grown;
+        }
+
+        /* Pages beyond the end of the metadata fork are not preloaded. */
+        map_blkno = MapForkPageIndexToMapBlkno(forknum, cur);
+        if (map_blkno >= umfile_ctx_get_nblocks(map_ctx, UMBRA_METADATA_FORKNUM,
+                                                UMFILE_NBLOCKS_DENSE))
+            break;
+
+        /* Record the slot only after the read has succeeded. */
+        slot = MapReadBuffer(map_ctx, rnode, forknum, map_blkno);
+        entry->slots[entry->nslots++] = slot;
+    }
+}
+
+/*
+ * MapReleasePreloadedTruncatePages
+ *      Unpin the MAP pages preloaded for a truncate of (rnode, forknum).
+ *
+ * A no-op unless the fork's preload entry is active and was filled for
+ * exactly this relation and fork; an entry owned by another relation is
+ * left untouched.
+ */
+void
+MapReleasePreloadedTruncatePages(RelFileLocator rnode, ForkNumber forknum)
+{
+    MapTruncatePreloadState *entry;
+
+    Assert(forknum >= 0 && forknum <= MAX_FORKNUM);
+    entry = &MapTruncatePreload[forknum];
+
+    if (entry->active &&
+        RelFileLocatorEquals(entry->rnode, rnode) &&
+        entry->forknum == forknum)
+        MapTruncatePreloadResetEntry(entry);
+}
+
+/*
+ * MapInvalidateRelation
+ *      Drop every cached MAP page, and the superblock cache entry, that
+ *      belongs to one relation.
+ *
+ * Each buffer's tag is sampled under a shared lock and acted upon after the
+ * lock is released; the sampled tag is handed to MapCacheDelete() and
+ * MapInvalidateBuffer().
+ */
+void
+MapInvalidateRelation(RelFileLocator rnode)
+{
+    for (int slot = 0; slot < map_buffers; slot++)
+    {
+        MapBufferDesc *desc = &MapBuffers[slot];
+        RelFileLocator owner;
+        ForkNumber  owner_fork;
+        int         cached_page;
+
+        /* Take a consistent snapshot of the slot's tag. */
+        LWLockAcquire(&desc->buffer_lock, LW_SHARED);
+        cached_page = desc->page_number;
+        owner_fork = desc->forknum;
+        owner = desc->rnode;
+        LWLockRelease(&desc->buffer_lock);
+
+        if (cached_page >= 0 && RelFileLocatorEquals(owner, rnode))
+        {
+            MapCacheDelete(owner, owner_fork, (BlockNumber) cached_page, slot);
+            MapInvalidateBuffer(slot, owner, owner_fork,
+                                (BlockNumber) cached_page);
+        }
+    }
+
+    /* The relation's superblock has its own dedicated cache entry. */
+    MapSuperDeleteEntry(rnode);
+}
+
+/*
+ * MapTablespaceSelected
+ *      Is spcOid one of the tablespaces named by the filter list?
+ *
+ * An empty filter (ntablespaces <= 0 or a NULL array) selects every
+ * tablespace.
+ */
+static bool
+MapTablespaceSelected(Oid spcOid, int ntablespaces, const Oid *tablespace_ids)
+{
+    const Oid  *cur;
+    const Oid  *end;
+
+    if (tablespace_ids == NULL || ntablespaces <= 0)
+        return true;
+
+    end = tablespace_ids + ntablespaces;
+    for (cur = tablespace_ids; cur < end; cur++)
+    {
+        if (*cur == spcOid)
+            return true;
+    }
+
+    return false;
+}
+
+/*
+ * MapInvalidateDatabaseTablespaces
+ *      Drop cached MAP pages and superblock cache entries of one database.
+ *
+ * With ntablespaces <= 0 (or a NULL list) every tablespace of the database
+ * is covered; otherwise only entries whose spcOid appears in tablespace_ids.
+ *
+ * Database OIDs and relfilenodes can be reused after DROP/CREATE churn.
+ * Without database-wide invalidation, stale MAP buffers, cache mappings and
+ * superblock entries could survive and be picked up by relations of the
+ * recreated database.
+ */
+void
+MapInvalidateDatabaseTablespaces(Oid dbid, int ntablespaces,
+                                 const Oid *tablespace_ids)
+{
+    RelFileLocator *doomed;
+    int         doomed_cap = 256;
+    int         ndoomed = 0;
+
+    /* Pass 1: cached MAP pages. */
+    for (int slot = 0; slot < map_buffers; slot++)
+    {
+        MapBufferDesc *desc = &MapBuffers[slot];
+        RelFileLocator owner;
+        ForkNumber  owner_fork;
+        int         cached_page;
+
+        /* Sample the tag under the lock, act on it afterwards. */
+        LWLockAcquire(&desc->buffer_lock, LW_SHARED);
+        cached_page = desc->page_number;
+        owner_fork = desc->forknum;
+        owner = desc->rnode;
+        LWLockRelease(&desc->buffer_lock);
+
+        if (cached_page >= 0 &&
+            owner.dbOid == dbid &&
+            MapTablespaceSelected(owner.spcOid, ntablespaces, tablespace_ids))
+        {
+            MapCacheDelete(owner, owner_fork, (BlockNumber) cached_page, slot);
+            MapInvalidateBuffer(slot, owner, owner_fork,
+                                (BlockNumber) cached_page);
+        }
+    }
+
+    /*
+     * Pass 2: superblock cache entries.  Matching relations are collected
+     * first and deleted only after the scan, so no entry lock is held while
+     * MapSuperDeleteEntry() runs.
+     */
+    doomed = palloc(sizeof(RelFileLocator) * doomed_cap);
+
+    for (int slot = 0; slot < MapSuperCapacity; slot++)
+    {
+        MapSuperEntry *entry = MapSuperEntryBySlot(slot);
+        RelFileLocator key_rnode;
+
+        LWLockAcquire(&entry->lock, LW_SHARED);
+        if (!(entry->in_use &&
+              entry->key.rnode.dbOid == dbid &&
+              MapTablespaceSelected(entry->key.rnode.spcOid, ntablespaces,
+                                    tablespace_ids)))
+        {
+            LWLockRelease(&entry->lock);
+            continue;
+        }
+        key_rnode = entry->key.rnode;
+        LWLockRelease(&entry->lock);
+
+        /* Double the collection array whenever it fills up. */
+        if (ndoomed >= doomed_cap)
+        {
+            doomed_cap *= 2;
+            doomed = repalloc(doomed, sizeof(RelFileLocator) * doomed_cap);
+        }
+        doomed[ndoomed++] = key_rnode;
+    }
+
+    for (int i = 0; i < ndoomed; i++)
+        MapSuperDeleteEntry(doomed[i]);
+
+    pfree(doomed);
+}
+
+/*
+ * MapInvalidateDatabase - invalidate MAP metadata/cache for a whole database.
+ *
+ * Convenience wrapper around MapInvalidateDatabaseTablespaces() with an empty
+ * tablespace filter, i.e. every tablespace of dbid is covered.
+ */
+void
+MapInvalidateDatabase(Oid dbid)
+{
+	const Oid  *no_filter = NULL;
+
+	MapInvalidateDatabaseTablespaces(dbid, 0, no_filter);
+}
+
+/*
+ * MapGetLogicalBlockCount - return the persisted logical block count.
+ *
+ * Asks MapSBlockTryGetLogicalNblocks() for the count of the given fork and
+ * reports 0 when that lookup does not succeed.
+ */
+BlockNumber
+MapGetLogicalBlockCount(UmbraFileContext *map_ctx, RelFileLocator rnode, ForkNumber forknum)
+{
+	BlockNumber logical_nblocks = 0;
+	bool		have_count;
+
+	have_count = MapSBlockTryGetLogicalNblocks(map_ctx, rnode, forknum,
+											   &logical_nblocks);
+
+	return have_count ? logical_nblocks : 0;
+}
+
+/*
+ * MapGetPhysicalBlockCount - physical block count needed for first n lblknos
+ *
+ * Returns max(mapped pblkno in [0, n_lblknos)) + 1.
+ * This is used by truncate to avoid cutting off still-referenced physical
+ * blocks when logical->physical mapping is non-identity.
+ *
+ * Special cases:
+ * - n_lblknos == 0 returns 0.
+ * - If the MAP metadata fork does not exist, n_lblknos is returned unchanged.
+ * - If the MAP fork exists but is empty, or no entry in range is mapped,
+ *   returns 0.
+ * - MAP pages at or beyond the current fork length are not visited.
+ *
+ * Raises ERROR (ERRCODE_PROGRAM_LIMIT_EXCEEDED) when the largest mapped
+ * physical block is InvalidBlockNumber - 1, because the resulting count would
+ * collide with InvalidBlockNumber.
+ *
+ * Each visited MAP page is pinned via MapReadBuffer() and scanned under its
+ * buffer_lock in shared mode; the last pin is released before returning.
+ */
+BlockNumber
+MapGetPhysicalBlockCount(UmbraFileContext *map_ctx, RelFileLocator rnode,
+						 ForkNumber forknum, BlockNumber n_lblknos)
+{
+	BlockNumber n_map_pages;
+	BlockNumber current_page = InvalidBlockNumber;
+	BlockNumber page_idx;
+	BlockNumber page_count;
+	BlockNumber tail_entries;
+	BlockNumber max_pblkno = InvalidBlockNumber;
+	int			current_slot = -1;
+
+	if (n_lblknos == 0)
+		return 0;
+
+	if (!umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+								UMFILE_EXISTS_DENSE))
+		return n_lblknos;
+
+	n_map_pages = umfile_ctx_get_nblocks(map_ctx, UMBRA_METADATA_FORKNUM,
+										 UMFILE_NBLOCKS_DENSE);
+	if (n_map_pages == 0)
+		return 0;
+
+	/*
+	 * page_count = ceil(n_lblknos / MAP_ENTRIES_PER_PAGE).  Do not compute
+	 * this as (n_lblknos + MAP_ENTRIES_PER_PAGE - 1) / MAP_ENTRIES_PER_PAGE:
+	 * in 32-bit BlockNumber arithmetic that sum wraps for n_lblknos close to
+	 * the maximum, which would make us scan too few pages and under-report
+	 * the physical size.
+	 */
+	tail_entries = n_lblknos % MAP_ENTRIES_PER_PAGE;
+	page_count = n_lblknos / MAP_ENTRIES_PER_PAGE;
+	if (tail_entries != 0)
+		page_count++;
+
+	for (page_idx = 0; page_idx < page_count; page_idx++)
+	{
+		BlockNumber page_no = MapForkPageIndexToMapBlkno(forknum, page_idx);
+		int			entry_idx;
+		int			limit_idx;
+		MapPage    *page;
+		MapBufferDesc *buf;
+
+		/* Pages past the end of the MAP fork hold no mappings. */
+		if (page_no >= n_map_pages)
+			break;
+
+		if (page_no != current_page)
+		{
+			/* Swap the pin over to the page we are about to scan. */
+			if (current_slot >= 0)
+				MapUnpinBuffer(current_slot);
+			current_slot = MapReadBuffer(map_ctx, rnode, forknum, page_no);
+			current_page = page_no;
+		}
+
+		buf = &MapBuffers[current_slot];
+		page = MapGetPage(current_slot);
+		LWLockAcquire(&buf->buffer_lock, LW_SHARED);
+
+		/* Only the final page may be partially inside [0, n_lblknos). */
+		limit_idx = MAP_ENTRIES_PER_PAGE;
+		if (page_idx == page_count - 1 && tail_entries != 0)
+			limit_idx = (int) tail_entries;
+
+		for (entry_idx = 0; entry_idx < limit_idx; entry_idx++)
+		{
+			BlockNumber pblkno = page->pblknos[entry_idx];
+
+			/* Unmapped logical block. */
+			if (pblkno == InvalidBlockNumber)
+				continue;
+
+			if (max_pblkno == InvalidBlockNumber || pblkno > max_pblkno)
+				max_pblkno = pblkno;
+		}
+		LWLockRelease(&buf->buffer_lock);
+	}
+
+	if (current_slot >= 0)
+		MapUnpinBuffer(current_slot);
+
+	if (max_pblkno == InvalidBlockNumber)
+		return 0;
+	if (max_pblkno == InvalidBlockNumber - 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("cannot represent physical block count beyond %u",
+						InvalidBlockNumber - 1)));
+
+	return max_pblkno + 1;
+}
diff --git a/src/backend/storage/map/mapbuf.c b/src/backend/storage/map/mapbuf.c
new file mode 100644
index 0000000000..cb8b59dfbc
--- /dev/null
+++ b/src/backend/storage/map/mapbuf.c
@@ -0,0 +1,414 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapbuf.c
+ * MAP buffer state, pinning, and I/O helpers.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "storage/map.h"
+#include "storage/map_internal.h"
+#include "utils/memutils.h"
+
+/* local state for MapStartBufferIO and related functions */
+static MapBufferDesc *InProgressMapBuf = NULL;
+static int *MapPrivateRefCount = NULL;
+static MemoryContext MapLocalCxt = NULL;
+
+static void MapWaitIO(MapBufferDesc *buf);
+static void MapEnsureBufferMaterialized(UmbraFileContext *map_ctx,
+ MapBufferDesc *buf);
+
+/*
+ * MapEnsurePrivateRefCount - lazily create this backend's pin-count array.
+ *
+ * On first use, creates the "MapLocal" memory context under TopMemoryContext
+ * (allowed inside critical sections) and allocates a zero-filled array with
+ * one int per MAP buffer slot.  Later calls return immediately.
+ */
+void
+MapEnsurePrivateRefCount(void)
+{
+	/* Fast path: the array already exists. */
+	if (MapPrivateRefCount != NULL)
+		return;
+
+	if (MapLocalCxt == NULL)
+	{
+		MapLocalCxt = AllocSetContextCreate(TopMemoryContext,
+											"MapLocal",
+											ALLOCSET_DEFAULT_SIZES);
+		MemoryContextAllowInCriticalSection(MapLocalCxt, true);
+	}
+
+	MapPrivateRefCount = MemoryContextAllocZero(MapLocalCxt,
+												map_buffers * sizeof(int));
+}
+
+/*
+ * MapBufferUpdateStateBits - atomically set and clear bits in buf->state.
+ *
+ * The new value is (old | set_bits) & ~clear_bits, so a bit named in both
+ * masks ends up cleared.  All other bits of the state word are preserved.
+ */
+void
+MapBufferUpdateStateBits(MapBufferDesc *buf, uint32 set_bits, uint32 clear_bits)
+{
+	uint32		cur = pg_atomic_read_u32(&buf->state);
+
+	/* A failed compare-exchange refreshes 'cur' with the value it found. */
+	while (!pg_atomic_compare_exchange_u32(&buf->state, &cur,
+										   (cur | set_bits) & ~clear_bits))
+	{
+		/* retry with the refreshed value */
+	}
+}
+
+/*
+ * MapEnsureBufferMaterialized - make sure a MAP page has backing storage.
+ *
+ * Caller must hold buf->buffer_lock exclusively, and buf must be tagged with
+ * a regular MAP page (not MAP_BLOCK_SUPER).  If MAPBUF_NOT_MATERIALIZED is
+ * set, the MAP fork is zero-extended up to and including the page when it
+ * lies beyond the fork's current length, and the flag is then cleared.
+ *
+ * PANICs if the relation has no MAP fork at all.
+ */
+static void
+MapEnsureBufferMaterialized(UmbraFileContext *map_ctx, MapBufferDesc *buf)
+{
+	BlockNumber fork_nblocks;
+	BlockNumber wanted_blkno;
+
+	Assert(map_ctx != NULL);
+	Assert(buf != NULL);
+	Assert(LWLockHeldByMeInMode(&buf->buffer_lock, LW_EXCLUSIVE));
+	Assert(buf->page_number >= 0);
+	Assert(buf->page_number != MAP_BLOCK_SUPER);
+
+	/* Nothing to do once the page is already materialized. */
+	if (!(pg_atomic_read_u32(&buf->state) & MAPBUF_NOT_MATERIALIZED))
+		return;
+
+	if (!umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+								UMFILE_EXISTS_DENSE))
+		elog(PANIC,
+			 "cannot materialize MAP page %d for relation %u/%u/%u without MAP fork",
+			 buf->page_number,
+			 buf->rnode.spcOid,
+			 buf->rnode.dbOid,
+			 buf->rnode.relNumber);
+
+	fork_nblocks = umfile_ctx_get_nblocks(map_ctx, UMBRA_METADATA_FORKNUM,
+										  UMFILE_NBLOCKS_DENSE);
+	wanted_blkno = (BlockNumber) buf->page_number;
+
+	if (wanted_blkno >= fork_nblocks)
+	{
+		/*
+		 * Like buffer-pool relation extension, the physical block is created
+		 * when the page is first dirtied rather than at checkpoint flush.
+		 */
+		int			nextend = (int) (wanted_blkno + 1 - fork_nblocks);
+
+		umfile_zeroextend(map_ctx, UMBRA_METADATA_FORKNUM,
+						  fork_nblocks, nextend, false);
+	}
+
+	MapBufferUpdateStateBits(buf, 0, MAPBUF_NOT_MATERIALIZED);
+}
+
+/*
+ * MapMarkBufferDirty - record that a MAP buffer was modified.
+ *
+ * Caller must hold buf->buffer_lock exclusively.  Regular MAP pages are
+ * materialized first (MAP_BLOCK_SUPER is exempt); then page_lsn is stored in
+ * the descriptor and MAPBUF_DIRTY plus MAPBUF_JUST_DIRTIED are set.
+ */
+void
+MapMarkBufferDirty(UmbraFileContext *map_ctx, MapBufferDesc *buf,
+				   XLogRecPtr page_lsn)
+{
+	const uint32 dirty_bits = MAPBUF_DIRTY | MAPBUF_JUST_DIRTIED;
+
+	Assert(buf != NULL);
+	Assert(LWLockHeldByMeInMode(&buf->buffer_lock, LW_EXCLUSIVE));
+
+	if (buf->page_number != MAP_BLOCK_SUPER)
+	{
+		MapEnsureBufferMaterialized(map_ctx, buf);
+	}
+
+	buf->page_lsn = page_lsn;
+	MapBufferUpdateStateBits(buf, dirty_bits, 0);
+}
+
+/*
+ * MapWaitIO -- Block until MAPBUF_IO_IN_PROGRESS is cleared.
+ *
+ * While the flag is set, waits by taking and immediately releasing the
+ * buffer's io_in_progress_lock in shared mode, then re-checks the flag.
+ * Returns at once if no I/O is flagged.
+ */
+static void
+MapWaitIO(MapBufferDesc *buf)
+{
+	while (pg_atomic_read_u32(&buf->state) & MAPBUF_IO_IN_PROGRESS)
+	{
+		LWLockAcquire(&buf->io_in_progress_lock, LW_SHARED);
+		LWLockRelease(&buf->io_in_progress_lock);
+	}
+}
+
+/*
+ * MapStartBufferIO -- begin output I/O on this map buffer.
+ *
+ * Returns true if caller should perform I/O; false if page is already clean or
+ * no longer has the caller-required state bits.
+ *
+ * On a true return the caller holds buf->io_in_progress_lock exclusively,
+ * MAPBUF_IO_IN_PROGRESS is set, MAPBUF_IO_ERROR and MAPBUF_JUST_DIRTIED are
+ * cleared, and buf is remembered in InProgressMapBuf.  On a false return the
+ * lock has been released again.
+ */
+bool
+MapStartBufferIO(MapBufferDesc *buf, uint32 required_bits)
+{
+	uint32		cur;
+
+	Assert(!InProgressMapBuf);
+
+	/*
+	 * Take the I/O lock.  If the in-progress bit is still set (another
+	 * backend is finishing I/O or recovering from an error), back off, wait
+	 * for it to clear, and try again.
+	 */
+	LWLockAcquire(&buf->io_in_progress_lock, LW_EXCLUSIVE);
+	while ((cur = pg_atomic_read_u32(&buf->state)) & MAPBUF_IO_IN_PROGRESS)
+	{
+		LWLockRelease(&buf->io_in_progress_lock);
+		MapWaitIO(buf);
+		LWLockAcquire(&buf->io_in_progress_lock, LW_EXCLUSIVE);
+	}
+
+	/*
+	 * Claim the I/O.  The eligibility test is repeated for every state value
+	 * the compare-exchange hands back.
+	 */
+	do
+	{
+		if (!(cur & MAPBUF_DIRTY) ||
+			(cur & required_bits) != required_bits)
+		{
+			LWLockRelease(&buf->io_in_progress_lock);
+			return false;
+		}
+	} while (!pg_atomic_compare_exchange_u32(&buf->state, &cur,
+											 (cur | MAPBUF_IO_IN_PROGRESS) &
+											 ~(MAPBUF_IO_ERROR | MAPBUF_JUST_DIRTIED)));
+
+	InProgressMapBuf = buf;
+	return true;
+}
+
+/*
+ * MapTerminateBufferIO -- complete output I/O state transition.
+ *
+ * Assumes this backend owns I/O on buf.
+ *
+ * Clears MAPBUF_IO_IN_PROGRESS and MAPBUF_IO_ERROR.  With clear_dirty,
+ * MAPBUF_CHECKPOINT_NEEDED is cleared too, and MAPBUF_DIRTY is cleared
+ * unless MAPBUF_JUST_DIRTIED shows the page was modified again meanwhile.
+ * set_flag_bits are OR'ed in last.  Finally InProgressMapBuf is reset and
+ * the io_in_progress_lock is released.
+ */
+void
+MapTerminateBufferIO(MapBufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
+{
+	uint32		seen = pg_atomic_read_u32(&buf->state);
+
+	for (;;)
+	{
+		uint32		next;
+
+		Assert(seen & MAPBUF_IO_IN_PROGRESS);
+
+		next = seen & ~(MAPBUF_IO_IN_PROGRESS | MAPBUF_IO_ERROR);
+		if (clear_dirty)
+		{
+			uint32		drop = MAPBUF_CHECKPOINT_NEEDED;
+
+			if (!(seen & MAPBUF_JUST_DIRTIED))
+				drop |= MAPBUF_DIRTY;
+			next &= ~drop;
+		}
+		next |= set_flag_bits;
+
+		/* On failure 'seen' is refreshed and the new value recomputed. */
+		if (pg_atomic_compare_exchange_u32(&buf->state, &seen, next))
+			break;
+	}
+
+	InProgressMapBuf = NULL;
+	LWLockRelease(&buf->io_in_progress_lock);
+}
+
+/*
+ * MapAbortBufferIO -- cleanup map buffer I/O after an ERROR.
+ *
+ * No-op when this backend has no I/O recorded in InProgressMapBuf.
+ * Otherwise takes the buffer's io_in_progress_lock and, if the in-progress
+ * bit is still set, ends the I/O via MapTerminateBufferIO() without clearing
+ * the dirty flag and with MAPBUF_IO_ERROR set.  Either way InProgressMapBuf
+ * is reset and the lock released before returning.
+ */
+void
+MapAbortBufferIO(void)
+{
+	MapBufferDesc *buf = InProgressMapBuf;
+
+	if (buf == NULL)
+		return;
+
+	LWLockAcquire(&buf->io_in_progress_lock, LW_EXCLUSIVE);
+
+	if (!(pg_atomic_read_u32(&buf->state) & MAPBUF_IO_IN_PROGRESS))
+	{
+		/* The flag is already clear; just forget about the buffer. */
+		InProgressMapBuf = NULL;
+		LWLockRelease(&buf->io_in_progress_lock);
+		return;
+	}
+
+	/* Resets InProgressMapBuf and releases the lock for us. */
+	MapTerminateBufferIO(buf, false, MAPBUF_IO_ERROR);
+}
+
+/*
+ * MapBackendExitCleanup - release MAP resources still held by this backend.
+ *
+ * Aborts any in-progress MAP I/O first, so that waiters can make progress
+ * even when this backend leaves via ERROR/abort.  If the backend ever
+ * allocated its private pin array, every remaining pin is then dropped and
+ * truncate preloads are reset; otherwise the function returns right after
+ * the I/O abort.
+ */
+void
+MapBackendExitCleanup(void)
+{
+	int			slot_id;
+
+	MapAbortBufferIO();
+
+	/* Without a private refcount array there are no pins to drop. */
+	if (MapPrivateRefCount == NULL)
+		return;
+
+	for (slot_id = 0; slot_id < map_buffers; slot_id++)
+	{
+		/* MapUnpinBuffer() decrements the private count on each call. */
+		while (MapPrivateRefCount[slot_id] > 0)
+			MapUnpinBuffer(slot_id);
+	}
+
+	MapResetAllTruncatePreloads();
+
+#ifdef USE_ASSERT_CHECKING
+	Assert(InProgressMapBuf == NULL);
+	for (slot_id = 0; slot_id < map_buffers; slot_id++)
+	{
+		MapBufferDesc *buf = &MapBuffers[slot_id];
+
+		Assert(MapPrivateRefCount[slot_id] == 0);
+		Assert(!LWLockHeldByMe(&buf->buffer_lock));
+		Assert(!LWLockHeldByMe(&buf->io_in_progress_lock));
+	}
+#endif
+}
+
+/*
+ * MapPinBuffer - pin a map buffer
+ *
+ * Increments the refcount for the buffer. If adjust_usage is true,
+ * also increments the usage_count (up to max 5).
+ *
+ * Both adjustments are applied in a single compare-exchange, and the
+ * usage_count cap is re-evaluated against the freshly observed state on
+ * every retry.  Checking the cap once and bumping in a separate loop would
+ * let concurrent pinners push usage_count past the cap.
+ *
+ * Raises ERROR if the shared refcount would overflow.  The backend-private
+ * pin count for the slot is incremented after the shared update succeeds.
+ */
+void
+MapPinBuffer(int slot_id, bool adjust_usage)
+{
+	uint32_t	old_state;
+
+	MapEnsurePrivateRefCount();
+
+	old_state = pg_atomic_read_u32(&MapBuffers[slot_id].state);
+	for (;;)
+	{
+		uint32_t	new_state;
+
+		if (MAPBUF_GET_REFCOUNT(old_state) >= MAPBUF_VALID_MASK)
+			elog(ERROR, "map buffer reference count overflow");
+
+		/* One more pin ... */
+		new_state = old_state + 1;
+
+		/* ... and, if requested, one more usage tick while below the cap. */
+		if (adjust_usage && MAPBUF_GET_USAGECOUNT(old_state) < 5)
+			new_state += MAPBUF_USAGECOUNT_ONE;
+
+		/* On failure old_state is refreshed and everything is recomputed. */
+		if (pg_atomic_compare_exchange_u32(&MapBuffers[slot_id].state,
+										   &old_state, new_state))
+			break;
+	}
+
+	MapPrivateRefCount[slot_id]++;
+	Assert(MapPrivateRefCount[slot_id] > 0);
+}
+
+/*
+ * MapUnpinBuffer - unpin a map buffer
+ *
+ * Decrements the refcount for the buffer, then the backend-private pin
+ * count for the slot.  Raises ERROR if either counter is already zero.
+ */
+void
+MapUnpinBuffer(int slot_id)
+{
+	uint32_t	cur;
+
+	MapEnsurePrivateRefCount();
+
+	if (MapPrivateRefCount[slot_id] == 0)
+		elog(ERROR, "map buffer private refcount underflow");
+
+	cur = pg_atomic_read_u32(&MapBuffers[slot_id].state);
+	do
+	{
+		/* Re-checked for every value the compare-exchange hands back. */
+		if (MAPBUF_GET_REFCOUNT(cur) == 0)
+			elog(ERROR, "map buffer refcount underflow");
+	} while (!pg_atomic_compare_exchange_u32(&MapBuffers[slot_id].state,
+											 &cur, cur - 1));
+
+	MapPrivateRefCount[slot_id]--;
+}
+
+/*
+ * MapInvalidateBuffer - invalidate a buffer slot for a specific mapping tag.
+ *
+ * This follows buffer-pool invalidation semantics:
+ * - caller identifies expected tag and slot
+ * - if slot tag changed while waiting, do nothing
+ * - if slot is still pinned, wait/retry until safe to invalidate
+ *
+ * Locks are taken in the order io_in_progress_lock, then buffer_lock, both
+ * exclusively.  On success the slot's tag and page_lsn are cleared, the
+ * whole state word is reset to zero (so a dirty flag is dropped without the
+ * page being written) and the slot is handed to MapClockFreeBuffer().
+ *
+ * Raises ERROR if the calling backend itself still holds a pin on the slot.
+ *
+ * NOTE(review): when another backend holds the pin, the retry path only
+ * calls MapWaitIO(), which returns immediately unless MAPBUF_IO_IN_PROGRESS
+ * is set.  A pin held without I/O in flight therefore appears to make this
+ * loop spin on the two LWLocks with no sleep or interrupt check -- confirm
+ * that foreign pins are always short-lived.
+ */
+void
+MapInvalidateBuffer(int slot_id, RelFileLocator expected_rnode,
+ ForkNumber expected_forknum,
+ BlockNumber expected_map_blkno)
+{
+ MapBufferDesc *buf = &MapBuffers[slot_id];
+ uint32 state;
+
+retry:
+ LWLockAcquire(&buf->io_in_progress_lock, LW_EXCLUSIVE);
+
+ LWLockAcquire(&buf->buffer_lock, LW_EXCLUSIVE);
+ /* Slot is empty or was re-tagged since the caller looked: nothing to do. */
+ if (buf->page_number < 0 ||
+ buf->page_number != expected_map_blkno ||
+ buf->forknum != expected_forknum ||
+ !RelFileLocatorEquals(buf->rnode, expected_rnode))
+ {
+ LWLockRelease(&buf->buffer_lock);
+ LWLockRelease(&buf->io_in_progress_lock);
+ return;
+ }
+
+ state = pg_atomic_read_u32(&buf->state);
+ if (MAPBUF_GET_REFCOUNT(state) != 0)
+ {
+ /* Still pinned: drop both locks before waiting and retrying. */
+ LWLockRelease(&buf->buffer_lock);
+ LWLockRelease(&buf->io_in_progress_lock);
+
+ /* Waiting on our own pin would never finish, so report it instead. */
+ if (MapPrivateRefCount != NULL &&
+ MapPrivateRefCount[slot_id] > 0)
+ elog(ERROR, "map buffer is pinned in MapInvalidateBuffer");
+
+ MapWaitIO(buf);
+ goto retry;
+ }
+ /* Clear the tag while still holding buffer_lock exclusively. */
+ buf->page_number = -1;
+ buf->forknum = InvalidForkNumber;
+ memset(&buf->rnode, 0, sizeof(RelFileLocator));
+ buf->page_lsn = 0;
+ LWLockRelease(&buf->buffer_lock);
+
+ /* Reset full state before returning slot to free list. */
+ pg_atomic_write_u32(&buf->state, 0);
+ MapClockFreeBuffer(slot_id);
+ LWLockRelease(&buf->io_in_progress_lock);
+}
diff --git a/src/backend/storage/map/mapclock.c b/src/backend/storage/map/mapclock.c
new file mode 100644
index 0000000000..6fa62e1c1a
--- /dev/null
+++ b/src/backend/storage/map/mapclock.c
@@ -0,0 +1,457 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapclock.c
+ * clock sweep algorithm for map buffer replacement
+ *
+ * This implements a clock sweep algorithm similar to freelist.c,
+ * but for managing map buffers instead of data buffers.
+ *
+ * Also handles the map cache hash table, similar to buf_table.c.
+ *
+ * src/backend/storage/map/mapclock.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/map.h"
+#include "storage/map_internal.h"
+#include "storage/lwlock.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/hsearch.h"
+
+#define LOG2_NUM_MAP_CACHE_PARTITIONS 5
+#define NUM_MAP_CACHE_PARTITIONS (1 << LOG2_NUM_MAP_CACHE_PARTITIONS)
+
+typedef struct MapCacheTag
+{
+ RelFileLocator rnode; /* relation part of the lookup key */
+ ForkNumber forknum; /* fork part of the lookup key */
+ BlockNumber map_blkno; /* MAP block number part of the lookup key */
+} MapCacheTag;
+
+typedef struct MapCacheEntry
+{
+ MapCacheTag key; /* hash key; the table is created with HASH_BLOBS */
+ int slot_id; /* buffer slot that was registered for this key */
+} MapCacheEntry;
+
+static HTAB *MapCacheHash = NULL;
+static LWLockPadded *MapCachePartitionLocks = NULL;
+
+/*
+ * MapCacheHashCode - compute the MapCacheHash hash value for a tag.
+ *
+ * The value is used both to choose the partition lock and as the
+ * precomputed hash for hash_search_with_hash_value().
+ */
+static inline uint32
+MapCacheHashCode(MapCacheTag *tag)
+{
+	uint32		hashcode;
+
+	Assert(MapCacheHash != NULL);
+
+	hashcode = get_hash_value(MapCacheHash, (void *) tag);
+	return hashcode;
+}
+
+/*
+ * MapCachePartitionLock - map a hash code to the lock guarding its partition.
+ *
+ * The partition is selected by the low LOG2_NUM_MAP_CACHE_PARTITIONS bits of
+ * the hash code.
+ */
+static inline LWLock *
+MapCachePartitionLock(uint32 hashcode)
+{
+	uint32		partition = hashcode & (NUM_MAP_CACHE_PARTITIONS - 1);
+
+	return &MapCachePartitionLocks[partition].lock;
+}
+
+/*
+ * MapCacheTableShmemRequest - request shared memory for the MAP cache table.
+ *
+ * Requests two objects: the array of NUM_MAP_CACHE_PARTITIONS padded LWLocks
+ * (published through MapCachePartitionLocks) and the partitioned lookup hash
+ * keyed by MapCacheTag (published through MapCacheHash).
+ */
+void
+MapCacheTableShmemRequest(void)
+{
+ long hash_size;
+
+ /*
+ * For non-negative map_buffers this evaluates to 2 * map_buffers.
+ * NOTE(review): the Max() presumably guards against the doubled value
+ * overflowing long -- confirm the intent.
+ */
+ hash_size = Max((long) map_buffers, (long) map_buffers * 2L);
+
+ ShmemRequestStruct(.name = "Map Cache Partition Locks",
+ .size = NUM_MAP_CACHE_PARTITIONS * sizeof(LWLockPadded),
+ .ptr = (void **) &MapCachePartitionLocks,
+ );
+
+ /* HASH_BLOBS: keys are hashed and compared as raw bytes of MapCacheTag. */
+ ShmemRequestHash(.name = "Map Cache Lookup Table",
+ .nelems = hash_size,
+ .ptr = &MapCacheHash,
+ .hash_info.keysize = sizeof(MapCacheTag),
+ .hash_info.entrysize = sizeof(MapCacheEntry),
+ .hash_info.num_partitions = NUM_MAP_CACHE_PARTITIONS,
+ .hash_flags = HASH_ELEM | HASH_BLOBS | HASH_PARTITION,
+ );
+}
+
+/*
+ * MapCacheTableShmemInit - initialize the MAP cache partition locks.
+ *
+ * Every one of the NUM_MAP_CACHE_PARTITIONS locks is initialized in the
+ * LWTRANCHE_MAP_BUFFER_CONTENT tranche.
+ */
+void
+MapCacheTableShmemInit(void)
+{
+	LWLockPadded *slot = MapCachePartitionLocks;
+	int			remaining;
+
+	for (remaining = NUM_MAP_CACHE_PARTITIONS; remaining > 0; remaining--)
+	{
+		LWLockInitialize(&slot->lock, LWTRANCHE_MAP_BUFFER_CONTENT);
+		slot++;
+	}
+}
+
+/*
+ * MapCacheLookup - lookup a buffer slot in the cache
+ * Returns slot_id if found, -1 otherwise
+ *
+ * The probe runs under the tag's partition lock in shared mode.  The lock is
+ * released before returning, so the answer is only a snapshot.
+ */
+int
+MapCacheLookup(RelFileLocator rnode, ForkNumber forknum, BlockNumber map_blkno)
+{
+	MapCacheTag tag;
+	MapCacheEntry *hit;
+	LWLock	   *partition_lock;
+	uint32		hashcode;
+	int			result;
+
+	tag.map_blkno = map_blkno;
+	tag.forknum = forknum;
+	tag.rnode = rnode;
+
+	hashcode = MapCacheHashCode(&tag);
+	partition_lock = MapCachePartitionLock(hashcode);
+
+	LWLockAcquire(partition_lock, LW_SHARED);
+	hit = (MapCacheEntry *)
+		hash_search_with_hash_value(MapCacheHash,
+									(void *) &tag,
+									hashcode,
+									HASH_FIND,
+									NULL);
+	result = (hit != NULL) ? hit->slot_id : -1;
+	LWLockRelease(partition_lock);
+
+	return result;
+}
+
+/*
+ * MapCacheInsert - insert a buffer slot into the cache.
+ *
+ * Returns -1 on successful insertion. If another slot already owns the tag,
+ * returns that slot id and leaves the existing entry unchanged.
+ *
+ * The partition lock is held exclusively for the duration of the update.
+ */
+int
+MapCacheInsert(RelFileLocator rnode, ForkNumber forknum, BlockNumber map_blkno, int slot_id)
+{
+	MapCacheTag tag;
+	MapCacheEntry *entry;
+	LWLock	   *partition_lock;
+	uint32		hashcode;
+	bool		found;
+	int			result = -1;
+
+	Assert(slot_id >= 0);
+
+	tag.map_blkno = map_blkno;
+	tag.forknum = forknum;
+	tag.rnode = rnode;
+
+	hashcode = MapCacheHashCode(&tag);
+	partition_lock = MapCachePartitionLock(hashcode);
+
+	LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+	entry = (MapCacheEntry *)
+		hash_search_with_hash_value(MapCacheHash,
+									(void *) &tag,
+									hashcode,
+									HASH_ENTER,
+									&found);
+	if (found)
+		result = entry->slot_id;	/* collision: report the current owner */
+	else
+		entry->slot_id = slot_id;	/* fresh entry: record our slot */
+	LWLockRelease(partition_lock);
+
+	return result;
+}
+
+/*
+ * MapCacheDelete - remove a buffer slot from the cache
+ *
+ * The entry is removed only when it exists and still points at slot_id; an
+ * entry that meanwhile belongs to a different slot is left untouched.
+ */
+void
+MapCacheDelete(RelFileLocator rnode, ForkNumber forknum, BlockNumber map_blkno,
+			   int slot_id)
+{
+	MapCacheTag tag;
+	MapCacheEntry *entry;
+	LWLock	   *partition_lock;
+	uint32		hashcode;
+
+	Assert(slot_id >= 0);
+
+	tag.map_blkno = map_blkno;
+	tag.forknum = forknum;
+	tag.rnode = rnode;
+
+	hashcode = MapCacheHashCode(&tag);
+	partition_lock = MapCachePartitionLock(hashcode);
+
+	LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+	entry = (MapCacheEntry *)
+		hash_search_with_hash_value(MapCacheHash,
+									(void *) &tag,
+									hashcode,
+									HASH_FIND,
+									NULL);
+	if (entry == NULL || entry->slot_id != slot_id)
+	{
+		/* Absent, or owned by some other slot. */
+		LWLockRelease(partition_lock);
+		return;
+	}
+
+	(void) hash_search_with_hash_value(MapCacheHash,
+									   (void *) &tag,
+									   hashcode,
+									   HASH_REMOVE,
+									   NULL);
+	LWLockRelease(partition_lock);
+}
+
+/*
+ * ClockSweepTick - advance the clock hand
+ *
+ * Returns the next slot to examine.
+ *
+ * next_victim_buffer is advanced with an atomic fetch-add and is allowed to
+ * run past num_slots; the returned slot is the fetched value modulo
+ * num_slots.  The caller whose fetched value lands exactly on slot 0 after
+ * running past the end folds the counter back into range with a
+ * compare-exchange under clock_lock, and bumps complete_passes when that
+ * exchange succeeds.
+ */
+static inline uint32
+ClockSweepTick(void)
+{
+ uint32 victim;
+ int num_slots;
+
+ num_slots = MapShared->num_slots;
+
+ /*
+ * Atomically move hand ahead one slot.
+ * Multiple processes can do this concurrently.
+ */
+ victim = pg_atomic_fetch_add_u32(&MapShared->next_victim_buffer, 1);
+
+ /* Handle wraparound */
+ if (victim >= (uint32) num_slots)
+ {
+ uint32 originalVictim = victim;
+
+ /* What we actually look up in MapBuffers */
+ victim = victim % num_slots;
+
+ /*
+ * If we're the one that just caused a wraparound, increment
+ * completePasses while holding the lock.
+ */
+ if (victim == 0)
+ {
+ uint32 expected;
+ uint32 wrapped;
+ bool success = false;
+
+ /* Counter value right after our own fetch-add. */
+ expected = originalVictim + 1;
+
+ while (!success)
+ {
+ SpinLockAcquire(&MapShared->clock_lock);
+
+ /*
+ * A failed exchange refreshes 'expected' with the current
+ * counter, so 'wrapped' is recomputed on every attempt.
+ */
+ wrapped = expected % num_slots;
+
+ success = pg_atomic_compare_exchange_u32(
+ &MapShared->next_victim_buffer,
+ &expected, wrapped);
+ if (success)
+ MapShared->complete_passes++;
+
+ SpinLockRelease(&MapShared->clock_lock);
+ }
+ }
+ }
+
+ return victim;
+}
+
+/*
+ * MapClockGetBuffer - select a buffer slot using clock algorithm
+ *
+ * Returns a slot ID whose refcount and usage_count were both observed to be
+ * zero.  The slot is neither pinned nor locked on return.
+ * The caller is responsible for initializing the slot, and for writing back
+ * a dirty victim.
+ *
+ * The free list is consulted first; slots popped from it that turn out to be
+ * in use are simply left off the list.  After that the clock hand is
+ * advanced, ageing usage_count by one per visit, until a usable slot is
+ * found.
+ *
+ * Raises ERROR ("no unpinned map buffers available") after num_slots
+ * pinned slots have been seen with no usage_count decrement in between.
+ */
+int
+MapClockGetBuffer(void)
+{
+	MapBufferDesc *buf;
+	int			trycounter;
+	uint32		local_buf_state;
+	int			num_slots = MapShared->num_slots;
+
+	/*
+	 * First, check if there's a buffer on the free list.  The unlocked peek
+	 * only skips the spinlock in the common empty case; the list head is
+	 * re-read under the lock below.
+	 */
+	if (MapShared->first_free_buffer >= 0)
+	{
+		while (true)
+		{
+			int			slot_id;
+
+			SpinLockAcquire(&MapShared->clock_lock);
+
+			if (MapShared->first_free_buffer < 0)
+			{
+				SpinLockRelease(&MapShared->clock_lock);
+				break;
+			}
+
+			slot_id = MapShared->first_free_buffer;
+			buf = &MapBuffers[slot_id];
+
+			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+
+			/* Remove from free list */
+			MapShared->first_free_buffer = buf->freeNext;
+			buf->freeNext = FREENEXT_NOT_IN_LIST;
+
+			SpinLockRelease(&MapShared->clock_lock);
+
+			/*
+			 * Check if the buffer is actually usable.
+			 * (It might have been used after being put on free list)
+			 */
+			local_buf_state = pg_atomic_read_u32(&buf->state);
+
+			if (MAPBUF_GET_REFCOUNT(local_buf_state) == 0 &&
+				MAPBUF_GET_USAGECOUNT(local_buf_state) == 0)
+			{
+				/* Found a usable buffer */
+				pg_atomic_fetch_add_u32(&MapShared->num_allocs, 1);
+				return slot_id;
+			}
+
+			/*
+			 * Buffer not usable (pinned or still has usage_count).
+			 *
+			 * Keep it off free list and let normal clock sweep handle it.
+			 * Re-queuing it at free-list head can livelock when the same
+			 * non-usable slot is popped repeatedly.
+			 */
+		}
+	}
+
+	/*
+	 * No free buffers, run the clock sweep algorithm.
+	 */
+	trycounter = num_slots;
+
+	for (;;)
+	{
+		uint32		victim_slot;
+
+		victim_slot = ClockSweepTick();
+		buf = &MapBuffers[victim_slot];
+
+		local_buf_state = pg_atomic_read_u32(&buf->state);
+
+		/*
+		 * If the buffer is pinned, we cannot use it.
+		 * If it has a non-zero usage_count, decrement it and continue.
+		 */
+		if (MAPBUF_GET_REFCOUNT(local_buf_state) == 0)
+		{
+			if (MAPBUF_GET_USAGECOUNT(local_buf_state) != 0)
+			{
+				uint32_t	old_state = local_buf_state;
+
+				/*
+				 * Decrement usage_count, re-checking that it is still
+				 * non-zero for every state value the compare-exchange hands
+				 * back.  Without the re-check, two sweepers racing on a
+				 * count of 1 would both subtract and the field would
+				 * underflow into the neighbouring state bits.
+				 */
+				while (MAPBUF_GET_USAGECOUNT(old_state) != 0)
+				{
+					if (pg_atomic_compare_exchange_u32(&buf->state,
+													   &old_state,
+													   old_state - MAPBUF_USAGECOUNT_ONE))
+						break;
+				}
+
+				/* Reset try counter since we made progress */
+				trycounter = num_slots;
+			}
+			else
+			{
+				/* Found a usable buffer */
+				pg_atomic_fetch_add_u32(&MapShared->num_allocs, 1);
+
+				/* Dirty-victim writeback is handled by caller (MapReadBuffer). */
+
+				return (int) victim_slot;
+			}
+		}
+		else if (--trycounter == 0)
+		{
+			/*
+			 * We've scanned all buffers and all are pinned.
+			 * This shouldn't happen with reasonable sizing.
+			 */
+			elog(ERROR, "no unpinned map buffers available");
+		}
+	}
+}
+
+/*
+ * MapClockFreeBuffer - return a buffer to the free list
+ *
+ * Low-level helper that pushes slot_id onto the head of the free list under
+ * clock_lock.  A slot that is already linked into the list is left alone.
+ * The caller must have cleared the slot's refcount and usage_count before
+ * calling; this is asserted.  Called by MapInvalidateBuffer.
+ */
+void
+MapClockFreeBuffer(int slot_id)
+{
+	MapBufferDesc *buf = &MapBuffers[slot_id];
+	uint32		state;
+
+	SpinLockAcquire(&MapShared->clock_lock);
+
+	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
+	{
+		/*
+		 * Free list must only contain fully reusable slots.
+		 * Caller is responsible for clearing refcount/usage first.
+		 */
+		state = pg_atomic_read_u32(&buf->state);
+		Assert(MAPBUF_GET_REFCOUNT(state) == 0);
+		Assert(MAPBUF_GET_USAGECOUNT(state) == 0);
+
+		/* Link in at the head. */
+		buf->freeNext = MapShared->first_free_buffer;
+		MapShared->first_free_buffer = slot_id;
+	}
+
+	SpinLockRelease(&MapShared->clock_lock);
+}
+
+/*
+ * MapSyncStart - tell checkpoint where to start syncing
+ *
+ * Returns the starting slot ID for checkpoint sync, i.e. the current clock
+ * hand position reduced modulo num_slots.
+ *
+ * If complete_passes is not NULL it receives the pass counter, including
+ * any passes still folded into the un-wrapped clock hand.  If num_allocs is
+ * not NULL it receives the allocation counter, which is reset to zero.
+ * Everything is sampled while holding clock_lock.
+ */
+int
+MapSyncStart(uint32 *complete_passes, uint32 *num_allocs)
+{
+	uint32		next_victim;
+	int			result;
+
+	SpinLockAcquire(&MapShared->clock_lock);
+
+	next_victim = pg_atomic_read_u32(&MapShared->next_victim_buffer);
+	result = next_victim % MapShared->num_slots;
+
+	if (complete_passes != NULL)
+		*complete_passes = MapShared->complete_passes +
+			next_victim / MapShared->num_slots;
+
+	if (num_allocs != NULL)
+		*num_allocs = pg_atomic_exchange_u32(&MapShared->num_allocs, 0);
+
+	SpinLockRelease(&MapShared->clock_lock);
+
+	return result;
+}
diff --git a/src/backend/storage/map/mapflush.c b/src/backend/storage/map/mapflush.c
new file mode 100644
index 0000000000..def1943dee
--- /dev/null
+++ b/src/backend/storage/map/mapflush.c
@@ -0,0 +1,665 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapflush.c
+ * MAP checkpoint and writeback implementation.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "storage/map.h"
+#include "storage/map_internal.h"
+#include "storage/mapsuper_internal.h"
+#include "storage/umbra.h"
+#include "storage/umfile.h"
+
+typedef struct MapFlushWriteCache
+{
+ bool valid; /* true while rlocator/ctx describe a live context */
+ RelFileLocatorBackend rlocator; /* relation the cached context was made for */
+ UmbraFileContext *ctx; /* temporary context; destroyed on reset */
+} MapFlushWriteCache;
+
+typedef struct MapFlushBufferTarget
+{
+ int slot_id; /* index into MapBuffers[] */
+ RelFileLocator rnode; /* relation tag seen when the slot was collected */
+ BlockNumber map_blkno; /* MAP block number seen when collected */
+} MapFlushBufferTarget;
+
+static void MapFlushWriteCacheReset(MapFlushWriteCache *cache);
+static UmbraFileContext *MapFlushContextFor(MapFlushWriteCache *cache,
+ RelFileLocator rnode);
+static void MapFlushWritePage(RelFileLocatorBackend rlocator,
+ UmbraFileContext *ctx,
+ BlockNumber map_blkno,
+ const void *page,
+ XLogRecPtr page_lsn);
+static void MapFlushWriteSuperblockEntry(RelFileLocator rnode,
+ MapSuperEntry *entry);
+static int MapCollectDirtyBufferTargets(MapFlushBufferTarget **targets_out,
+ const RelFileLocator *filter_rnode,
+ bool mark_checkpoint_needed);
+static int MapFlushDirtyBuffers(int max_pages, bool checkpoint);
+static int MapFlushRelationBuffers(RelFileLocator rnode, bool checkpoint);
+static int MapFlushDirtySuperblocks(void);
+static int MapFlushRelationSuperblocks(RelFileLocator rnode);
+static void MapFlushBufferCached(int slot_id, MapFlushWriteCache *write_cache,
+ bool checkpoint);
+static bool MapTablespaceSelected(Oid spcOid, int ntablespaces,
+ const Oid *tablespace_ids);
+static inline int map_flush_buffer_target_comparator(
+ const MapFlushBufferTarget *a,
+ const MapFlushBufferTarget *b);
+
+#define ST_SORT sort_map_flush_buffer_targets
+#define ST_ELEMENT_TYPE MapFlushBufferTarget
+#define ST_COMPARE(a, b) map_flush_buffer_target_comparator(a, b)
+#define ST_SCOPE static
+#define ST_DEFINE
+#include "lib/sort_template.h"
+
+/*
+ * MapFlushWriteCacheReset - discard the context held by a write cache.
+ *
+ * Destroys the cached temporary file context and marks the cache invalid.
+ * A NULL or already-invalid cache is left untouched.
+ */
+static void
+MapFlushWriteCacheReset(MapFlushWriteCache *cache)
+{
+	if (cache != NULL && cache->valid)
+	{
+		umfile_ctx_destroy_temporary(cache->ctx);
+
+		memset(&cache->rlocator, 0, sizeof(cache->rlocator));
+		cache->valid = false;
+		cache->ctx = NULL;
+	}
+}
+
+/*
+ * MapFlushContextFor - get a file context for rnode through a one-entry cache.
+ *
+ * Reuses the cached context when it was created for the same relation.
+ * Otherwise the previous context (if any) is destroyed and a new temporary
+ * one is created with backend = INVALID_PROC_NUMBER.  The returned context
+ * stays owned by the cache.
+ */
+static UmbraFileContext *
+MapFlushContextFor(MapFlushWriteCache *cache, RelFileLocator rnode)
+{
+	bool		cache_hit;
+
+	Assert(cache != NULL);
+
+	cache_hit = cache->valid &&
+		RelFileLocatorEquals(cache->rlocator.locator, rnode);
+
+	if (!cache_hit)
+	{
+		/* Different relation (or nothing cached yet): rebuild the entry. */
+		MapFlushWriteCacheReset(cache);
+
+		cache->rlocator.locator = rnode;
+		cache->rlocator.backend = INVALID_PROC_NUMBER;
+		cache->ctx = umfile_ctx_create_temporary(cache->rlocator);
+		cache->valid = true;
+	}
+
+	return cache->ctx;
+}
+
+/*
+ * MapFlushWritePage - write one regular MAP page to the MAP metadata fork.
+ *
+ * Outside recovery, WAL is flushed up to page_lsn first (when it is valid),
+ * so the page never reaches disk ahead of its WAL.  The BLCKSZ-sized page is
+ * then written at map_blkno and the block is registered as dirty with the
+ * file context.  Not for MAP_BLOCK_SUPER.
+ */
+static void
+MapFlushWritePage(RelFileLocatorBackend rlocator, UmbraFileContext *ctx,
+				  BlockNumber map_blkno, const void *page,
+				  XLogRecPtr page_lsn)
+{
+	bool		need_wal_flush;
+
+	Assert(ctx != NULL);
+	Assert(page != NULL);
+	Assert(map_blkno != MAP_BLOCK_SUPER);
+	Assert(umfile_ctx_fork_exists(ctx, UMBRA_METADATA_FORKNUM,
+								  UMFILE_EXISTS_DENSE));
+
+	need_wal_flush = !InRecovery && page_lsn != InvalidXLogRecPtr;
+	if (need_wal_flush)
+		XLogFlush(page_lsn);
+
+	umfile_ctx_write(ctx, UMBRA_METADATA_FORKNUM, map_blkno,
+					 page, BLCKSZ, false);
+	umfile_ctx_register_dirty(ctx, UMBRA_METADATA_FORKNUM, map_blkno,
+							  false,
+							  RelFileLocatorBackendIsTemp(rlocator));
+}
+
+/*
+ * MapFlushWriteSuperblockEntry - write a cached superblock to disk.
+ *
+ * Outside recovery, WAL is flushed up to entry->page_lsn first (when it is
+ * valid).  The cached superblock is then stamped with that LSN, its CRC is
+ * refreshed, and the packed MAP_SUPERBLOCK_SIZE sector is written via
+ * UmMetadataWriteSuperblock().  Note that entry->super is modified in place.
+ */
+static void
+MapFlushWriteSuperblockEntry(RelFileLocator rnode, MapSuperEntry *entry)
+{
+	RelFileLocatorBackend rlocator = {0};
+	char		sector[MAP_SUPERBLOCK_SIZE];
+
+	Assert(entry != NULL);
+
+	rlocator.backend = INVALID_PROC_NUMBER;
+	rlocator.locator = rnode;
+
+	if (!InRecovery && entry->page_lsn != InvalidXLogRecPtr)
+	{
+		XLogFlush(entry->page_lsn);
+	}
+
+	/* Stamp, checksum, serialize, write -- in that order. */
+	MapSuperblockSetLastUpdatedLSN(&entry->super, entry->page_lsn);
+	MapSuperblockRefreshCRC(&entry->super);
+	MapSuperblockPackSector(&entry->super, sector);
+	UmMetadataWriteSuperblock(rlocator, sector, false);
+}
+
+/*
+ * MapCollectDirtyBufferTargets - snapshot the dirty regular MAP buffers.
+ *
+ * Scans every buffer slot and records those that are dirty and tagged with a
+ * regular MAP page (empty slots and MAP_BLOCK_SUPER are skipped).  When
+ * filter_rnode is not NULL, only buffers of that relation are recorded.
+ * When mark_checkpoint_needed is set, each recorded buffer also gets
+ * MAPBUF_CHECKPOINT_NEEDED while its buffer_lock is held in shared mode.
+ *
+ * *targets_out receives a palloc'd array (allocated even when no target is
+ * found), ordered by map_flush_buffer_target_comparator.  Returns the number
+ * of entries.
+ */
+static int
+MapCollectDirtyBufferTargets(MapFlushBufferTarget **targets_out,
+							 const RelFileLocator *filter_rnode,
+							 bool mark_checkpoint_needed)
+{
+	MapFlushBufferTarget *found;
+	int			found_cap = 256;
+	int			nfound = 0;
+
+	Assert(targets_out != NULL);
+
+	found = palloc(sizeof(MapFlushBufferTarget) * found_cap);
+
+	for (int slot = 0; slot < map_buffers; slot++)
+	{
+		MapBufferDesc *buf = &MapBuffers[slot];
+		RelFileLocator slot_rnode;
+		int			page_number;
+		bool		wanted;
+
+		LWLockAcquire(&buf->buffer_lock, LW_SHARED);
+
+		page_number = buf->page_number;
+		slot_rnode = buf->rnode;
+
+		wanted = (pg_atomic_read_u32(&buf->state) & MAPBUF_DIRTY) != 0 &&
+			page_number >= 0 &&
+			page_number != MAP_BLOCK_SUPER &&
+			(filter_rnode == NULL ||
+			 RelFileLocatorEquals(slot_rnode, *filter_rnode));
+
+		if (wanted && mark_checkpoint_needed)
+			MapBufferUpdateStateBits(buf, MAPBUF_CHECKPOINT_NEEDED, 0);
+
+		LWLockRelease(&buf->buffer_lock);
+
+		if (!wanted)
+			continue;
+
+		/* Double the array whenever it fills up. */
+		if (nfound >= found_cap)
+		{
+			found_cap *= 2;
+			found = repalloc(found,
+							 sizeof(MapFlushBufferTarget) * found_cap);
+		}
+
+		found[nfound].slot_id = slot;
+		found[nfound].rnode = slot_rnode;
+		found[nfound].map_blkno = (BlockNumber) page_number;
+		nfound++;
+	}
+
+	if (nfound > 1)
+		sort_map_flush_buffer_targets(found, nfound);
+
+	*targets_out = found;
+	return nfound;
+}
+
+/*
+ * map_flush_buffer_target_comparator - ordering for flush targets.
+ *
+ * Sorts by tablespace OID, then database OID, then relation number, then
+ * MAP block number.  Returns -1, 0 or 1 in the usual comparator fashion.
+ */
+static inline int
+map_flush_buffer_target_comparator(const MapFlushBufferTarget *a,
+								   const MapFlushBufferTarget *b)
+{
+	if (a->rnode.spcOid != b->rnode.spcOid)
+		return (a->rnode.spcOid < b->rnode.spcOid) ? -1 : 1;
+
+	if (a->rnode.dbOid != b->rnode.dbOid)
+		return (a->rnode.dbOid < b->rnode.dbOid) ? -1 : 1;
+
+	if (a->rnode.relNumber != b->rnode.relNumber)
+		return (a->rnode.relNumber < b->rnode.relNumber) ? -1 : 1;
+
+	if (a->map_blkno != b->map_blkno)
+		return (a->map_blkno < b->map_blkno) ? -1 : 1;
+
+	return 0;
+}
+
+/*
+ * MapPreCheckpoint - pre-checkpoint hook of the MAP subsystem.
+ *
+ * Intentionally empty: checkpoint work is handled by MapCheckpoint().
+ */
+void
+MapPreCheckpoint(void)
+{
+}
+
+/*
+ * MapCheckpoint - sync dirty map pages during checkpoint
+ *
+ * Flushes all dirty regular MAP pages, then all dirty superblocks.  Other
+ * backends may be using the buffers concurrently.
+ *
+ * The order matters: writing regular pages before superblocks keeps the
+ * on-disk superblock a checkpoint-boundary snapshot that never gets ahead
+ * of the durability of the mapping pages.
+ */
+void
+MapCheckpoint(void)
+{
+	/* Step 1: regular MAP pages, no page limit, checkpoint mode. */
+	(void) MapFlushDirtyBuffers(-1, true);
+
+	/* Step 2: superblocks. */
+	(void) MapFlushDirtySuperblocks();
+}
+
+/*
+ * MapCheckpointRelation - checkpoint the MAP state of a single relation.
+ *
+ * Uses the same order as MapCheckpoint(): the relation's dirty regular MAP
+ * pages are flushed first, its superblock(s) afterwards.
+ */
+void
+MapCheckpointRelation(RelFileLocator rnode)
+{
+	/* Regular pages, in checkpoint mode. */
+	(void) MapFlushRelationBuffers(rnode, true);
+
+	/* Then the superblock side. */
+	(void) MapFlushRelationSuperblocks(rnode);
+}
+
+/*
+ * qsort comparator ordering RelFileLocators by (spcOid, dbOid, relNumber).
+ * Its only job is to place equal locators next to each other so duplicates
+ * can be skipped in a single pass.
+ */
+static int
+map_checkpoint_rlocator_cmp(const void *a, const void *b)
+{
+	const RelFileLocator *lhs = (const RelFileLocator *) a;
+	const RelFileLocator *rhs = (const RelFileLocator *) b;
+
+	if (lhs->spcOid != rhs->spcOid)
+		return (lhs->spcOid < rhs->spcOid) ? -1 : 1;
+	if (lhs->dbOid != rhs->dbOid)
+		return (lhs->dbOid < rhs->dbOid) ? -1 : 1;
+	if (lhs->relNumber != rhs->relNumber)
+		return (lhs->relNumber < rhs->relNumber) ? -1 : 1;
+	return 0;
+}
+
+/*
+ * MapCheckpointDatabaseTablespaces - checkpoint MAP state of one database
+ *
+ * Collects every relation of database "dbid" that has a dirty MAP buffer or
+ * a dirty superblock entry and lives in one of the given tablespaces, then
+ * runs MapCheckpointRelation() once per distinct relation.
+ *
+ * dbid            database whose relations are checkpointed
+ * ntablespaces    number of entries in tablespace_ids
+ * tablespace_ids  tablespaces to include; when NULL or ntablespaces <= 0,
+ *                 MapTablespaceSelected() accepts every tablespace
+ *
+ * The candidate list can contain one entry per dirty buffer plus one per
+ * dirty superblock, so duplicates are removed by sorting and skipping equal
+ * neighbours rather than by a quadratic "seen before?" scan.  As a
+ * consequence relations are processed in locator order, not discovery order.
+ */
+void
+MapCheckpointDatabaseTablespaces(Oid dbid, int ntablespaces,
+								 const Oid *tablespace_ids)
+{
+	RelFileLocator *targets;
+	int			target_cap = 256;
+	int			target_count = 0;
+	int			i;
+
+	targets = palloc(sizeof(RelFileLocator) * target_cap);
+
+	/* Pass 1: relations owning at least one dirty MAP buffer. */
+	for (i = 0; i < map_buffers; i++)
+	{
+		MapBufferDesc *buf = &MapBuffers[i];
+		uint32		state_before;
+		int			page_number;
+		RelFileLocator slot_rnode;
+
+		/* Unlocked pre-check; clean slots are skipped without locking. */
+		state_before = pg_atomic_read_u32(&buf->state);
+		if ((state_before & MAPBUF_DIRTY) == 0)
+			continue;
+
+		/* Snapshot the slot identity under the content lock. */
+		LWLockAcquire(&buf->buffer_lock, LW_SHARED);
+		page_number = buf->page_number;
+		slot_rnode = buf->rnode;
+		LWLockRelease(&buf->buffer_lock);
+
+		if (page_number < 0 ||
+			slot_rnode.dbOid != dbid ||
+			!MapTablespaceSelected(slot_rnode.spcOid, ntablespaces,
+								   tablespace_ids))
+			continue;
+
+		if (target_count >= target_cap)
+		{
+			target_cap *= 2;
+			targets = repalloc(targets, sizeof(RelFileLocator) * target_cap);
+		}
+		targets[target_count++] = slot_rnode;
+	}
+
+	/* Pass 2: relations whose superblock entry is dirty. */
+	for (i = 0; i < MapSuperCapacity; i++)
+	{
+		MapSuperEntry *entry = MapSuperEntryBySlot(i);
+		RelFileLocator rnode;
+
+		LWLockAcquire(&entry->lock, LW_SHARED);
+		if (!entry->in_use ||
+			(entry->flags & MAPSUPER_FLAG_DIRTY) == 0 ||
+			entry->key.rnode.dbOid != dbid ||
+			!MapTablespaceSelected(entry->key.rnode.spcOid, ntablespaces,
+								   tablespace_ids))
+		{
+			LWLockRelease(&entry->lock);
+			continue;
+		}
+		rnode = entry->key.rnode;
+		LWLockRelease(&entry->lock);
+
+		if (target_count >= target_cap)
+		{
+			target_cap *= 2;
+			targets = repalloc(targets, sizeof(RelFileLocator) * target_cap);
+		}
+		targets[target_count++] = rnode;
+	}
+
+	/* Bring duplicates together so each relation is checkpointed once. */
+	if (target_count > 1)
+		qsort(targets, target_count, sizeof(RelFileLocator),
+			  map_checkpoint_rlocator_cmp);
+
+	for (i = 0; i < target_count; i++)
+	{
+		if (i > 0 && RelFileLocatorEquals(targets[i - 1], targets[i]))
+			continue;
+
+		MapCheckpointRelation(targets[i]);
+	}
+
+	pfree(targets);
+}
+
+/*
+ * MapPostCheckpoint - checkpoint-phase entry point that currently does
+ * nothing.
+ *
+ * All MAP checkpoint work is performed by MapCheckpoint().
+ */
+void
+MapPostCheckpoint(void)
+{
+ /* no-op: checkpoint work is handled by MapCheckpoint(). */
+}
+
+/*
+ * MapBgWriterFlush - non-checkpoint flush of up to max_pages MAP pages
+ *
+ * Returns the count reported by MapFlushDirtyBuffers(), or 0 without doing
+ * any work when max_pages is not positive.
+ */
+int
+MapBgWriterFlush(int max_pages)
+{
+	/* Non-checkpoint flushes regular MAP pages only; superblock is checkpoint-owned. */
+	return (max_pages > 0) ? MapFlushDirtyBuffers(max_pages, false) : 0;
+}
+
+/*
+ * MapFlushDirtyBuffers - flush dirty MAP buffers across all relations
+ *
+ * max_pages   stop once this many buffers were cleaned; negative = no limit
+ * checkpoint  when true, only buffers carrying MAPBUF_CHECKPOINT_NEEDED are
+ *             flushed, and "cleaned" means that bit got cleared; otherwise
+ *             "cleaned" means MAPBUF_DIRTY got cleared
+ *
+ * Returns the number of buffers cleaned.
+ */
+static int
+MapFlushDirtyBuffers(int max_pages, bool checkpoint)
+{
+	MapFlushWriteCache write_cache = {0};
+	MapFlushBufferTarget *targets;
+	uint32		tracked_bit;
+	int			ntargets;
+	int			cleaned = 0;
+	int			idx;
+
+	/* The state bit whose clearing counts as one cleaned buffer. */
+	tracked_bit = checkpoint ? MAPBUF_CHECKPOINT_NEEDED : MAPBUF_DIRTY;
+
+	ntargets = MapCollectDirtyBufferTargets(&targets, NULL, checkpoint);
+
+	for (idx = 0; idx < ntargets; idx++)
+	{
+		int			slot_id = targets[idx].slot_id;
+		MapBufferDesc *buf = &MapBuffers[slot_id];
+		uint32		state_now;
+
+		if (max_pages >= 0 && cleaned >= max_pages)
+			break;
+
+		/* Re-check: the slot may have been cleaned since it was collected. */
+		state_now = pg_atomic_read_u32(&buf->state);
+		if ((state_now & MAPBUF_DIRTY) == 0)
+			continue;
+		if (checkpoint && (state_now & MAPBUF_CHECKPOINT_NEEDED) == 0)
+			continue;
+
+		MapFlushBufferCached(slot_id, &write_cache, checkpoint);
+
+		/* tracked_bit was set before the flush, so "now clear" == cleaned. */
+		state_now = pg_atomic_read_u32(&buf->state);
+		if ((state_now & tracked_bit) == 0)
+			cleaned++;
+	}
+
+	MapFlushWriteCacheReset(&write_cache);
+	pfree(targets);
+
+	return cleaned;
+}
+
+/*
+ * MapFlushRelationBuffers - flush dirty MAP buffers of a single relation
+ *
+ * rnode       relation whose buffers are collected
+ * checkpoint  when true, only buffers carrying MAPBUF_CHECKPOINT_NEEDED are
+ *             flushed, and "cleaned" means that bit got cleared; otherwise
+ *             "cleaned" means MAPBUF_DIRTY got cleared
+ *
+ * Returns the number of buffers cleaned.
+ */
+static int
+MapFlushRelationBuffers(RelFileLocator rnode, bool checkpoint)
+{
+	MapFlushWriteCache write_cache = {0};
+	MapFlushBufferTarget *targets;
+	uint32		tracked_bit;
+	int			ntargets;
+	int			cleaned = 0;
+	int			idx;
+
+	/* The state bit whose clearing counts as one cleaned buffer. */
+	tracked_bit = checkpoint ? MAPBUF_CHECKPOINT_NEEDED : MAPBUF_DIRTY;
+
+	ntargets = MapCollectDirtyBufferTargets(&targets, &rnode, checkpoint);
+
+	for (idx = 0; idx < ntargets; idx++)
+	{
+		int			slot_id = targets[idx].slot_id;
+		MapBufferDesc *buf = &MapBuffers[slot_id];
+		uint32		state_now;
+
+		/* Re-check: the slot may have been cleaned since it was collected. */
+		state_now = pg_atomic_read_u32(&buf->state);
+		if ((state_now & MAPBUF_DIRTY) == 0)
+			continue;
+		if (checkpoint && (state_now & MAPBUF_CHECKPOINT_NEEDED) == 0)
+			continue;
+
+		MapFlushBufferCached(slot_id, &write_cache, checkpoint);
+
+		/* tracked_bit was set before the flush, so "now clear" == cleaned. */
+		state_now = pg_atomic_read_u32(&buf->state);
+		if ((state_now & tracked_bit) == 0)
+			cleaned++;
+	}
+
+	MapFlushWriteCacheReset(&write_cache);
+	pfree(targets);
+
+	return cleaned;
+}
+
+/*
+ * MapFlushDirtySuperblocks - write out every dirty superblock table entry
+ *
+ * Works in rounds.  Each round scans the slot array and records the locator
+ * of every in-use, dirty entry (at most target_cap of them), then re-finds
+ * each recorded entry under an exclusive lock and writes it.  If the target
+ * array filled up before the scan finished, the array is doubled and another
+ * round starts from slot 0; entries flushed in earlier rounds are no longer
+ * dirty and are therefore not collected again.
+ *
+ * Returns the number of entries written.
+ */
+static int
+MapFlushDirtySuperblocks(void)
+{
+ typedef struct MapSuperDirtyTarget
+ {
+ RelFileLocator rnode;
+ } MapSuperDirtyTarget;
+
+ MapSuperEntry *entry;
+ MapSuperDirtyTarget *targets;
+ int target_cap = 256;
+ int target_count;
+ bool need_rescan;
+ int cleaned = 0;
+
+ targets = palloc(sizeof(MapSuperDirtyTarget) * target_cap);
+
+ do
+ {
+ int i;
+ int slot_id;
+
+ target_count = 0;
+ need_rescan = false;
+
+ /* Collection phase: only the entry lock is taken, in shared mode. */
+ for (slot_id = 0; slot_id < MapSuperCapacity; slot_id++)
+ {
+ entry = MapSuperEntryBySlot(slot_id);
+ LWLockAcquire(&entry->lock, LW_SHARED);
+ if (entry->in_use &&
+ (entry->flags & MAPSUPER_FLAG_DIRTY) != 0)
+ {
+ if (target_count >= target_cap)
+ {
+ /* Array full: flush what we have, then scan again. */
+ need_rescan = true;
+ LWLockRelease(&entry->lock);
+ break;
+ }
+ targets[target_count].rnode = entry->key.rnode;
+ target_count++;
+ }
+ LWLockRelease(&entry->lock);
+ }
+
+ /* Flush phase: look each target up again by locator. */
+ for (i = 0; i < target_count; i++)
+ {
+ RelFileLocator rnode = targets[i].rnode;
+
+ /* The entry may have been deleted since it was collected. */
+ if (!MapSuperFindEntryLocked(rnode, LW_EXCLUSIVE, &entry))
+ continue;
+
+ /* ... or already flushed by someone else. */
+ if ((entry->flags & MAPSUPER_FLAG_DIRTY) == 0)
+ {
+ LWLockRelease(&entry->lock);
+ continue;
+ }
+
+ if (!MapSuperblockHasValidIdentity(&entry->super))
+ {
+ LWLockRelease(&entry->lock);
+ /* Raises ERROR via ereport(); control does not come back here. */
+ MapSBlockReportCorrupt(rnode,
+ "invalid identity while flushing");
+ }
+
+ MapFlushWriteSuperblockEntry(rnode, entry);
+
+ /* The dirty bit is cleared while the exclusive lock is still held. */
+ entry->flags &= ~MAPSUPER_FLAG_DIRTY;
+ cleaned++;
+ LWLockRelease(&entry->lock);
+ }
+
+ if (need_rescan)
+ {
+ target_cap *= 2;
+ targets = repalloc(targets,
+ sizeof(MapSuperDirtyTarget) * target_cap);
+ }
+ }
+ while (need_rescan);
+
+ pfree(targets);
+
+ return cleaned;
+}
+
+/*
+ * MapFlushRelationSuperblocks - write one relation's superblock if dirty
+ *
+ * Returns 1 when the entry was written, 0 when the relation has no entry in
+ * the superblock table or the entry is clean.  An entry whose superblock
+ * fails the identity check is reported through MapSBlockReportCorrupt().
+ */
+static int
+MapFlushRelationSuperblocks(RelFileLocator rnode)
+{
+	MapSuperEntry *entry;
+	bool		was_dirty;
+
+	if (!MapSuperFindEntryLocked(rnode, LW_EXCLUSIVE, &entry))
+		return 0;
+
+	was_dirty = (entry->flags & MAPSUPER_FLAG_DIRTY) != 0;
+	if (was_dirty)
+	{
+		if (!MapSuperblockHasValidIdentity(&entry->super))
+		{
+			/* Drop the lock first; the report raises ERROR. */
+			LWLockRelease(&entry->lock);
+			MapSBlockReportCorrupt(rnode, "invalid identity while flushing");
+		}
+
+		MapFlushWriteSuperblockEntry(rnode, entry);
+		entry->flags &= ~MAPSUPER_FLAG_DIRTY;
+	}
+
+	LWLockRelease(&entry->lock);
+
+	return was_dirty ? 1 : 0;
+}
+
+/*
+ * MapFlushBuffer - flush a single MAP buffer slot outside of a checkpoint
+ *
+ * Calls MapFlushBufferCached() without a write cache, so that function
+ * creates and destroys a temporary file context for this one write.
+ */
+void
+MapFlushBuffer(int slot_id)
+{
+ MapFlushBufferCached(slot_id, NULL, false);
+}
+
+/*
+ * MapFlushBufferCached - write one MAP buffer slot to its relation's MAP file
+ *
+ * slot_id      index into MapBuffers of the slot to write
+ * write_cache  optional cache handed to MapFlushContextFor() to obtain the
+ *              file context; when NULL a temporary context is created for
+ *              this call and destroyed before returning
+ * checkpoint   when true, MAPBUF_CHECKPOINT_NEEDED is passed to
+ *              MapStartBufferIO()
+ *
+ * Returns silently when MapStartBufferIO() declines or the slot holds no page
+ * (page_number < 0).  Raises ERROR if the slot claims to hold the superblock
+ * block.
+ *
+ * NOTE(review): the meaning of the second and third MapTerminateBufferIO()
+ * arguments is not visible in this file; they look like "clear dirty" and
+ * "extra state bits to set" -- confirm against its definition.
+ */
+static void
+MapFlushBufferCached(int slot_id, MapFlushWriteCache *write_cache,
+ bool checkpoint)
+{
+ int page_number;
+ BlockNumber map_blkno;
+ RelFileLocator rnode;
+ RelFileLocatorBackend rlocator;
+ XLogRecPtr page_lsn;
+ MapBufferDesc *buf;
+ MapPage *page;
+ UmbraFileContext *ctx;
+
+ buf = &MapBuffers[slot_id];
+ page = MapGetPage(slot_id);
+
+ /*
+ * First lock I/O state so only one backend writes this slot. Hold content
+ * lock exclusively while writing, so page content and page_lsn stay in
+ * sync for writeback.
+ */
+ if (!MapStartBufferIO(buf,
+ checkpoint ? MAPBUF_CHECKPOINT_NEEDED : 0))
+ return;
+
+ LWLockAcquire(&buf->buffer_lock, LW_EXCLUSIVE);
+
+ /* Read the slot identity only now that the content lock is held. */
+ page_number = buf->page_number;
+ rnode = buf->rnode;
+ page_lsn = buf->page_lsn;
+
+ if (page_number < 0)
+ {
+ /* Defensive cleanup: invalid slot must not stay dirty. */
+ MapTerminateBufferIO(buf, true, 0);
+ LWLockRelease(&buf->buffer_lock);
+ return;
+ }
+ map_blkno = (BlockNumber) page_number;
+
+ if (map_blkno == MAP_BLOCK_SUPER)
+ {
+ /*
+ * Superblock is managed by the dedicated superblock table and must not
+ * be present in the regular MAP buffer cache.
+ */
+ MapTerminateBufferIO(buf, false, MAPBUF_IO_ERROR);
+ LWLockRelease(&buf->buffer_lock);
+ elog(ERROR, "MAP superblock cannot be flushed via map buffer cache");
+ }
+
+ /*
+ * Flush by slot owner rnode without going through smgr/umopen again.
+ * MapReadBuffer() can call this while a data-fork read already has an AIO
+ * handle handed out, so reopening through smgr would recurse into Umbra
+ * map-state lookup on the read path.
+ */
+ if (write_cache != NULL)
+ {
+ ctx = MapFlushContextFor(write_cache, rnode);
+ rlocator = write_cache->rlocator;
+ }
+ else
+ {
+ rlocator.locator = rnode;
+ rlocator.backend = INVALID_PROC_NUMBER;
+ ctx = umfile_ctx_create_temporary(rlocator);
+ }
+
+ /*
+ * NOTE(review): if the write raises ERROR, the temporary context created
+ * above is not destroyed here and the I/O state is not terminated by this
+ * function -- confirm that error cleanup elsewhere covers both.
+ */
+ MapFlushWritePage(rlocator, ctx, map_blkno, (char *) page, page_lsn);
+
+ MapTerminateBufferIO(buf, true, 0);
+ LWLockRelease(&buf->buffer_lock);
+
+ /* Only contexts created by this call are destroyed; cached ones persist. */
+ if (write_cache == NULL)
+ umfile_ctx_destroy_temporary(ctx);
+}
+
+/*
+ * MapTablespaceSelected - does spcOid pass the tablespace filter?
+ *
+ * An empty filter (tablespace_ids == NULL or ntablespaces <= 0) accepts
+ * every tablespace; otherwise spcOid must appear in tablespace_ids.
+ */
+static bool
+MapTablespaceSelected(Oid spcOid, int ntablespaces, const Oid *tablespace_ids)
+{
+	const Oid  *cursor;
+	const Oid  *stop;
+
+	if (tablespace_ids == NULL || ntablespaces <= 0)
+		return true;
+
+	stop = tablespace_ids + ntablespaces;
+	for (cursor = tablespace_ids; cursor < stop; cursor++)
+	{
+		if (*cursor == spcOid)
+			return true;
+	}
+
+	return false;
+}
diff --git a/src/backend/storage/map/mapinit.c b/src/backend/storage/map/mapinit.c
new file mode 100644
index 0000000000..a0880113ed
--- /dev/null
+++ b/src/backend/storage/map/mapinit.c
@@ -0,0 +1,143 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapinit.c
+ * shared-memory and backend initialization for the MAP layer
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/map.h"
+#include "storage/map_internal.h"
+#include "storage/mapsuper.h"
+#include "storage/mapsuper_internal.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+
+/* GUCs */
+int map_buffers = 1024; /* Number of map buffer slots */
+/*
+ * Dedicated shared-memory slots for MAP superblocks.
+ *
+ * These entries back extremely hot runtime metadata. They are not managed as
+ * an LRU-style cache; instead they remain resident until explicit relation or
+ * database invalidation releases the slot. Keep the default large so hot
+ * relations do not churn through repeated ensure/load cycles.
+ */
+int map_superblocks = 262144;
+
+/* Shared memory pointer */
+MapSharedData *MapShared = NULL;
+
+/* Buffer descriptor array in shared memory (the pointer is set in each process) */
+MapBufferDesc *MapBuffers = NULL;
+
+/* Actual page data (contiguous block) */
+char *MapPageData = NULL;
+
+static void MapShmemRequest(void *arg);
+static void MapShmemInit(void *arg);
+static void MapShmemAttach(void *arg);
+
+const ShmemCallbacks MapShmemCallbacks = {
+ .request_fn = MapShmemRequest,
+ .init_fn = MapShmemInit,
+ .attach_fn = MapShmemAttach,
+};
+
+/*
+ * Recompute map_buffers from NBuffers: one slot per 128 shared buffers,
+ * but never fewer than 4096 slots.
+ */
+static void
+MapRefreshBufferSlots(void)
+{
+	const int	min_slots = 4096;
+	int			derived_slots = NBuffers >> 7;
+
+	map_buffers = (derived_slots < min_slots) ? min_slots : derived_slots;
+}
+
+/*
+ * MapBackendInit - one-time MAP setup for the calling process
+ *
+ * Recomputes map_buffers and calls MapEnsurePrivateRefCount().  Calls after
+ * the first successful one return immediately.
+ */
+void
+MapBackendInit(void)
+{
+	static bool initialized = false;
+
+	if (!initialized)
+	{
+		MapRefreshBufferSlots();
+		MapEnsurePrivateRefCount();
+
+		/* Set last, so a failure above leaves the next call free to retry. */
+		initialized = true;
+	}
+}
+
+/*
+ * MapShmemRequest - register the MAP layer's shared memory areas
+ *
+ * Requests the control struct, the buffer descriptor array and the page
+ * array, then lets the cache table and the superblock table register their
+ * own areas.  "arg" is unused.
+ *
+ * Sizes are computed with mul_size() so that the arithmetic is done in Size
+ * and overflow is detected: map_buffers and BLCKSZ are both int, and their
+ * plain product exceeds INT_MAX once map_buffers grows past 262144 slots
+ * with the default 8kB block size.
+ */
+static void
+MapShmemRequest(void *arg)
+{
+	MapRefreshBufferSlots();
+
+	ShmemRequestStruct(.name = "Map Shared Data",
+					   .size = sizeof(MapSharedData),
+					   .ptr = (void **) &MapShared,
+		);
+
+	ShmemRequestStruct(.name = "Map Buffers",
+					   .size = mul_size(map_buffers, sizeof(MapBufferDesc)),
+					   .ptr = (void **) &MapBuffers,
+		);
+
+	ShmemRequestStruct(.name = "Map Page Data",
+					   .size = mul_size(map_buffers, BLCKSZ),
+					   .ptr = (void **) &MapPageData,
+		);
+
+	MapCacheTableShmemRequest();
+	MapSuperTableShmemRequest();
+}
+
+/*
+ * Initialize shared memory for map layer during postmaster startup.
+ *
+ * Resets the control struct, threads every buffer descriptor onto the free
+ * list in slot order, marks each slot empty (page_number = -1), zeroes the
+ * page array and initializes the cache and superblock tables.  "arg" is
+ * unused.
+ */
+static void
+MapShmemInit(void *arg)
+{
+	/*
+	 * Compute the page array size in Size: map_buffers and BLCKSZ are both
+	 * int, and their plain product overflows for large buffer pools.
+	 */
+	Size		page_bytes = (Size) map_buffers * BLCKSZ;
+	int			i;
+
+	MapShared->num_slots = map_buffers;
+	MapShared->first_free_buffer = 0;
+	pg_atomic_init_u32(&MapShared->next_victim_buffer, 0);
+	pg_atomic_init_u32(&MapShared->num_allocs, 0);
+	MapShared->complete_passes = 0;
+	SpinLockInit(&MapShared->clock_lock);
+
+	for (i = 0; i < map_buffers; i++)
+	{
+		MapBufferDesc *buf = &MapBuffers[i];
+
+		buf->id = i;
+		/* Free list links slot i to i + 1; the last slot terminates it. */
+		buf->freeNext = (i == map_buffers - 1) ? FREENEXT_END_OF_LIST : i + 1;
+		pg_atomic_init_u32(&buf->state, 0);
+		buf->wait_backend_pid = 0;
+
+		memset(&buf->rnode, 0, sizeof(RelFileLocator));
+		buf->forknum = InvalidForkNumber;
+		buf->page_number = -1;
+		buf->page_lsn = 0;
+		LWLockInitialize(&buf->buffer_lock, LWTRANCHE_MAP_BUFFER_CONTENT);
+		/* NOTE(review): the I/O lock shares the content-lock tranche -- confirm intended. */
+		LWLockInitialize(&buf->io_in_progress_lock, LWTRANCHE_MAP_BUFFER_CONTENT);
+	}
+
+	memset(MapPageData, 0, page_bytes);
+
+	MapCacheTableShmemInit();
+	MapSuperTableShmemInit();
+}
+
+/*
+ * MapShmemAttach - attach callback for the MAP shared memory areas
+ *
+ * Asserts that the three shared pointers are set and that the slot count
+ * recorded in shared memory matches this process's map_buffers, then lets the
+ * superblock table attach.  "arg" is unused.
+ *
+ * NOTE(review): the request and init callbacks also call the MapCacheTable
+ * counterparts, but no cache-table attach call is made here -- confirm that
+ * none is needed.
+ */
+static void
+MapShmemAttach(void *arg)
+{
+ Assert(MapShared != NULL);
+ Assert(MapBuffers != NULL);
+ Assert(MapPageData != NULL);
+ Assert(MapShared->num_slots == map_buffers);
+
+ MapSuperTableShmemAttach();
+}
diff --git a/src/backend/storage/map/mapsuper.c b/src/backend/storage/map/mapsuper.c
index b376d513fd..cf8bde182e 100644
--- a/src/backend/storage/map/mapsuper.c
+++ b/src/backend/storage/map/mapsuper.c
@@ -1,22 +1,98 @@
/*-------------------------------------------------------------------------
*
* mapsuper.c
- * Umbra metadata superblock helpers.
- *
- * This file contains on-disk superblock encoding and direct metadata-file I/O
- * helpers.
- *
- * src/backend/storage/map/mapsuper.c
+ * MAP superblock metadata helpers.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"

+#include "access/xlog.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "common/hashfn.h"
+#include "miscadmin.h"
#include "storage/map.h"
#include "storage/mapsuper.h"
-#include "storage/umbra.h"
+#include "storage/mapsuper_internal.h"
+#include "storage/shmem.h"
+
+#define MAP_SUPER_NPARTITIONS 128
+#define MAP_SUPER_NPARTITION_BITS 7
+#define MAPSUPER_INDEX_EMPTY (-1)
+#define MAPSUPER_INDEX_DELETED (-2)
+#define MAPSUPER_FREENEXT_END (-1)
+#define MAPSUPER_FREENEXT_NOT_IN_LIST (-2)
+
+#if MAP_SUPER_NPARTITIONS != (1 << MAP_SUPER_NPARTITION_BITS)
+#error "MAP_SUPER_NPARTITIONS must match MAP_SUPER_NPARTITION_BITS"
+#endif

-static void MapSBlockReportCorrupt(SMgrRelation reln, const char *reason);
+typedef struct MapSuperIndexSlot
+{
+ int slot_id;
+} MapSuperIndexSlot;
+
+typedef struct MapSuperCtl
+{
+ int free_head;
+ slock_t free_list_lock;
+} MapSuperCtl;
+
+typedef enum MapSBlockReadStatus
+{
+ MAP_SBLOCK_READ_OK,
+ MAP_SBLOCK_READ_MISSING,
+ MAP_SBLOCK_READ_CORRUPT
+} MapSBlockReadStatus;
+
+MapSuperEntry *MapSuperEntries = NULL;
+int MapSuperCapacity = 0;
+
+static MapSuperIndexSlot *MapSuperIndex = NULL;
+static MapSuperCtl *MapSuperCtlData = NULL;
+static LWLockPadded *MapSuperPartitionLocks = NULL;
+static int MapSuperIndexCapacityPerPartition = 0;
+
+static void MapSuperTableRefreshDerivedState(void);
+static MapSBlockReadStatus MapSuperLoadFromDisk(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ MapSuperblock *super);
+static int MapSuperIndexCapacityForPartition(int capacity);
+static uint32 MapSuperHashCode(RelFileLocator rnode);
+static int MapSuperPartitionForHash(uint32 hashcode);
+static LWLock *MapSuperPartitionLock(uint32 hashcode);
+static int MapSuperLookupSlotLocked(RelFileLocator rnode, uint32 hashcode,
+ int partition, int *insert_bucket);
+static bool MapForkUsesAbsentSentinel(ForkNumber forknum);
+static uint32 MapSuperExtendingFlag(ForkNumber forknum);
+static BlockNumber MapSuperGetExtendingTarget(const MapSuperEntry *entry,
+ ForkNumber forknum);
+static void MapSuperSetExtendingTarget(MapSuperEntry *entry,
+ ForkNumber forknum,
+ BlockNumber nblocks);
+static bool MapSuperPrepareEntryForUpdate(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ XLogRecPtr map_lsn,
+ const char *missing_errmsg,
+ MapSuperEntry **entry_p);
+static void MapSBlockUpdateLogicalNblocks(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber nblocks,
+ XLogRecPtr map_lsn,
+ bool bump_only);
+static void MapSBlockSetPendingFlag(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ bool pending,
+ XLogRecPtr map_lsn);
+void MapSBlockBumpPhysicalState(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber nblocks,
+ bool bump_next_free,
+ bool bump_capacity,
+ XLogRecPtr map_lsn);

void
MapSuperblockRefreshCRC(MapSuperblock *super)
@@ -251,88 +327,1171 @@ MapSuperblockSetLogicalNblocks(MapSuperblock *super, ForkNumber forknum,
}

+/*
+ * Copy the MAP_SUPERBLOCK_SIZE-byte superblock image into "sector".
+ * Both pointers must be non-NULL.
+ */
void
-MapSuperblockPackPage(const MapSuperblock *super, char page[BLCKSZ])
+MapSuperblockPackSector(const MapSuperblock *super, char sector[MAP_SUPERBLOCK_SIZE])
{
Assert(super != NULL);
- Assert(page != NULL);
+ Assert(sector != NULL);

- MemSet(page, 0, BLCKSZ);
- memcpy(page, super->padding, MAP_SUPERBLOCK_SIZE);
+ memcpy(sector, super->padding, MAP_SUPERBLOCK_SIZE);
}

+/*
+ * Copy a MAP_SUPERBLOCK_SIZE-byte image from "sector" into the superblock.
+ * No validation is done here; callers check identity and CRC afterwards.
+ */
void
-MapSuperblockUnpackPage(MapSuperblock *super, const char page[BLCKSZ])
+MapSuperblockUnpackSector(MapSuperblock *super,
+ const char sector[MAP_SUPERBLOCK_SIZE])
{
Assert(super != NULL);
- Assert(page != NULL);
+ Assert(sector != NULL);
+
+ memcpy(super->padding, sector, MAP_SUPERBLOCK_SIZE);
+}
+
+/*
+ * MapSBlockReportCorrupt - raise ERRCODE_DATA_CORRUPTED for a superblock
+ *
+ * rnode   relation named in the message as spcOid/dbOid/relNumber
+ * reason  short explanation appended to the message
+ *
+ * Reports at ERROR level, so control does not return to the caller.
+ */
+void
+MapSBlockReportCorrupt(RelFileLocator rnode, const char *reason)
+{
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("map superblock is corrupted for relation %u/%u/%u: %s",
+ rnode.spcOid, rnode.dbOid, rnode.relNumber, reason)));
+}
+
+/*
+ * MapSuperLoadFromDisk - read and validate the on-disk superblock
+ *
+ * Returns MAP_SBLOCK_READ_MISSING when the metadata fork does not exist,
+ * MAP_SBLOCK_READ_CORRUPT when the image fails the identity or CRC check,
+ * and MAP_SBLOCK_READ_OK otherwise.  *super is overwritten whenever the fork
+ * exists, even if the image turns out to be corrupt.  "rnode" is not used.
+ */
+static MapSBlockReadStatus
+MapSuperLoadFromDisk(UmbraFileContext *map_ctx, RelFileLocator rnode,
+					 MapSuperblock *super)
+{
+	char		raw_image[MAP_SUPERBLOCK_SIZE];
+	bool		fork_present;
+
+	fork_present = umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+										  UMFILE_EXISTS_DENSE);
+	if (!fork_present)
+		return MAP_SBLOCK_READ_MISSING;
+
+	umfile_ctx_read(map_ctx, UMBRA_METADATA_FORKNUM, MAP_BLOCK_SUPER,
+					raw_image, MAP_SUPERBLOCK_SIZE);
+	MapSuperblockUnpackSector(super, raw_image);
+
+	/* Identity is checked first; the CRC is only looked at if it passes. */
+	if (!MapSuperblockHasValidIdentity(super))
+		return MAP_SBLOCK_READ_CORRUPT;
+	if (!MapSuperblockCheckCRC(super))
+		return MAP_SBLOCK_READ_CORRUPT;
+
+	return MAP_SBLOCK_READ_OK;
+}
+
+/*
+ * Hash a relation locator for the superblock table by feeding the raw bytes
+ * of the RelFileLocator struct to hash_any().
+ */
+static uint32
+MapSuperHashCode(RelFileLocator rnode)
+{
+ return DatumGetUInt32(hash_any((const unsigned char *) &rnode,
+ sizeof(RelFileLocator)));
+}
+
+/*
+ * MapSuperIndexCapacityForPartition - index buckets per hash partition
+ *
+ * Sizes the index at twice "capacity" buckets in total, split evenly over
+ * MAP_SUPER_NPARTITIONS partitions (rounding up), and rounds the
+ * per-partition figure up to a power of two so probing can mask instead of
+ * taking a modulus.  The result is at least 1.
+ *
+ * The intermediate arithmetic uses int64 rather than long: long is only 32
+ * bits wide on LLP64 platforms, where doubling a large capacity would
+ * overflow.
+ */
+static int
+MapSuperIndexCapacityForPartition(int capacity)
+{
+	int64		total_target = (int64) capacity * 2;
+	int64		per_partition_target;
+	int			index_capacity = 1;
+
+	per_partition_target =
+		(total_target + MAP_SUPER_NPARTITIONS - 1) / MAP_SUPER_NPARTITIONS;
+	while ((int64) index_capacity < per_partition_target)
+		index_capacity <<= 1;
+
+	return index_capacity;
+}
+
+/*
+ * Map a hash code to its partition number, 0 .. MAP_SUPER_NPARTITIONS - 1.
+ * The partition count is a power of two (enforced at compile time above), so
+ * this selects the low MAP_SUPER_NPARTITION_BITS bits of the hash.
+ */
+static int
+MapSuperPartitionForHash(uint32 hashcode)
+{
+	uint32		npartitions = MAP_SUPER_NPARTITIONS;
+
+	return (int) (hashcode % npartitions);
+}
+
+/*
+ * Return the LWLock guarding the index partition that "hashcode" falls into.
+ */
+static LWLock *
+MapSuperPartitionLock(uint32 hashcode)
+{
+ return &MapSuperPartitionLocks[MapSuperPartitionForHash(hashcode)].lock;
+}
+
+/*
+ * MapSuperLookupSlotLocked - probe one index partition for a relation
+ *
+ * The caller must hold the partition lock (asserted).  The partition's
+ * sub-table is searched by linear probing, starting at the bucket chosen by
+ * the hash bits above the partition bits.
+ *
+ * Returns the entry slot id when the relation is found, otherwise -1.
+ *
+ * If insert_bucket is not NULL it receives an absolute index into
+ * MapSuperIndex:
+ *   - on a hit, the bucket holding the entry;
+ *   - on a miss, the first tombstone seen if any, else the empty bucket that
+ *     ended the probe;
+ *   - -1 when every bucket was probed without meeting an empty one and no
+ *     tombstone was seen (partition full).
+ *
+ * Tombstones (MAPSUPER_INDEX_DELETED) do not end a probe, so a miss in a
+ * partition without empty buckets inspects every bucket of the partition.
+ */
+static int
+MapSuperLookupSlotLocked(RelFileLocator rnode, uint32 hashcode, int partition,
+ int *insert_bucket)
+{
+ int mask = MapSuperIndexCapacityPerPartition - 1;
+ int base = partition * MapSuperIndexCapacityPerPartition;
+ int bucket = (hashcode >> MAP_SUPER_NPARTITION_BITS) & mask;
+ int first_deleted = -1;
+ int probes;
+ LWLock *partition_lock = MapSuperPartitionLock(hashcode);
+
+ Assert(LWLockHeldByMe(partition_lock));
+
+ for (probes = 0; probes < MapSuperIndexCapacityPerPartition; probes++)
+ {
+ int slot_id = MapSuperIndex[base + bucket].slot_id;
+
+ if (slot_id == MAPSUPER_INDEX_EMPTY)
+ {
+ /* An empty bucket ends the probe sequence: definite miss. */
+ if (insert_bucket != NULL)
+ *insert_bucket = (first_deleted >= 0) ?
+ (base + first_deleted) : (base + bucket);
+ return -1;
+ }
+
+ if (slot_id == MAPSUPER_INDEX_DELETED)
+ {
+ /* Remember the first tombstone (partition-relative) for reuse. */
+ if (first_deleted < 0)
+ first_deleted = bucket;
+ }
+ else
+ {
+ MapSuperEntry *entry = MapSuperEntryBySlot(slot_id);
+
+ if (entry->in_use && RelFileLocatorEquals(entry->key.rnode, rnode))
+ {
+ if (insert_bucket != NULL)
+ *insert_bucket = base + bucket;
+ return slot_id;
+ }
+ }
+
+ bucket = (bucket + 1) & mask;
+ }
+
+ /* Probed the whole partition without finding an empty bucket. */
+ if (insert_bucket != NULL)
+ *insert_bucket = (first_deleted >= 0) ? (base + first_deleted) : -1;

- memcpy(super->padding, page, MAP_SUPERBLOCK_SIZE);
+ return -1;
}

+/*
+ * MapSuperFindEntryLocked - look up a relation's entry and lock it
+ *
+ * On a hit, *entry points at the entry, its lock is held in "mode", and true
+ * is returned; the caller must release that lock.  On a miss, *entry is set
+ * to NULL and false is returned.
+ *
+ * The entry lock is acquired before the partition lock is released.
+ */
bool
-MapSBlockRead(SMgrRelation reln, MapSuperblock *super)
+MapSuperFindEntryLocked(RelFileLocator rnode, LWLockMode mode,
+ MapSuperEntry **entry)
+{
+ uint32 hashcode;
+ int partition;
+ int slot_id;
+ LWLock *partition_lock;
+
+ hashcode = MapSuperHashCode(rnode);
+ partition = MapSuperPartitionForHash(hashcode);
+ partition_lock = &MapSuperPartitionLocks[partition].lock;
+
+ LWLockAcquire(partition_lock, LW_SHARED);
+ slot_id = MapSuperLookupSlotLocked(rnode, hashcode, partition, NULL);
+ if (slot_id >= 0)
+ {
+ *entry = MapSuperEntryBySlot(slot_id);
+ LWLockAcquire(&(*entry)->lock, mode);
+ LWLockRelease(partition_lock);
+ return true;
+ }
+
+ LWLockRelease(partition_lock);
+ *entry = NULL;
+ return false;
+}
+
+/*
+ * MapSuperFindEntryTryLocked - non-blocking variant of the entry lookup
+ *
+ * Returns true with *entry set and its lock held in "mode" only when the
+ * relation has an entry AND that entry's lock could be taken without
+ * waiting.  In every other case *entry is NULL and false is returned, so a
+ * false result does not tell "absent" apart from "busy".
+ */
+bool
+MapSuperFindEntryTryLocked(RelFileLocator rnode, LWLockMode mode,
+						   MapSuperEntry **entry)
+{
+	uint32		hashcode = MapSuperHashCode(rnode);
+	int			partition = MapSuperPartitionForHash(hashcode);
+	LWLock	   *partition_lock = &MapSuperPartitionLocks[partition].lock;
+	MapSuperEntry *candidate = NULL;
+	int			slot_id;
+
+	LWLockAcquire(partition_lock, LW_SHARED);
+
+	slot_id = MapSuperLookupSlotLocked(rnode, hashcode, partition, NULL);
+	if (slot_id >= 0)
+	{
+		candidate = MapSuperEntryBySlot(slot_id);
+
+		/* Never wait for the entry lock; give up if it is contended. */
+		if (!LWLockConditionalAcquire(&candidate->lock, mode))
+			candidate = NULL;
+	}
+
+	LWLockRelease(partition_lock);
+
+	*entry = candidate;
+	return candidate != NULL;
+}
+
+/*
+ * MapSuperEnsureEntryLocked - find or create a relation's entry
+ *
+ * Always returns an entry whose lock is held exclusively; the caller must
+ * release it.  An existing entry is returned as is.  Otherwise a slot is
+ * popped from the free list, reset to an empty state (flags = 0, so not
+ * yet MAPSUPER_FLAG_VALID) and published in the index; the caller is expected
+ * to fill it in before releasing the lock.
+ *
+ * Raises ERROR when the index partition has no usable bucket or the free
+ * list is empty.
+ */
+MapSuperEntry *
+MapSuperEnsureEntryLocked(RelFileLocator rnode)
+{
+ MapSuperEntry *entry;
+ uint32 hashcode;
+ int partition;
+ int slot_id;
+ int insert_bucket = -1;
+ LWLock *partition_lock;
+
+ hashcode = MapSuperHashCode(rnode);
+ partition = MapSuperPartitionForHash(hashcode);
+ partition_lock = &MapSuperPartitionLocks[partition].lock;
+
+ /* Exclusive: we may insert into this partition's index. */
+ LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+ slot_id = MapSuperLookupSlotLocked(rnode, hashcode, partition, &insert_bucket);
+ if (slot_id >= 0)
+ {
+ entry = MapSuperEntryBySlot(slot_id);
+ LWLockAcquire(&entry->lock, LW_EXCLUSIVE);
+ LWLockRelease(partition_lock);
+ return entry;
+ }
+
+ if (insert_bucket < 0)
+ {
+ LWLockRelease(partition_lock);
+ ereport(ERROR,
+ (errmsg("map superblock index table is full"),
+ errhint("Increase map_superblocks and restart the server.")));
+ }
+
+ /* Pop a slot from the global free list. */
+ SpinLockAcquire(&MapSuperCtlData->free_list_lock);
+ slot_id = MapSuperCtlData->free_head;
+ if (slot_id == MAPSUPER_FREENEXT_END)
+ {
+ SpinLockRelease(&MapSuperCtlData->free_list_lock);
+ LWLockRelease(partition_lock);
+ ereport(ERROR,
+ (errmsg("map superblock slot table is full"),
+ errhint("Increase map_superblocks and restart the server.")));
+ }
+
+ entry = MapSuperEntryBySlot(slot_id);
+ MapSuperCtlData->free_head = entry->next_free;
+ SpinLockRelease(&MapSuperCtlData->free_list_lock);
+
+ /*
+ * NOTE(review): the fields below are written before entry->lock is taken,
+ * while by-slot scanners read in_use/flags under that lock -- confirm this
+ * window is harmless.
+ */
+ entry->next_free = MAPSUPER_FREENEXT_NOT_IN_LIST;
+ entry->in_use = true;
+ entry->key.rnode = rnode;
+ MemSet(&entry->super, 0, sizeof(entry->super));
+ entry->page_lsn = InvalidXLogRecPtr;
+ entry->flags = 0;
+ entry->runtime_flags = 0;
+ entry->reserved_next_free_main = 0;
+ entry->reserved_next_free_fsm = 0;
+ entry->reserved_next_free_vm = 0;
+ entry->extending_target_main = InvalidBlockNumber;
+ entry->extending_target_fsm = InvalidBlockNumber;
+ entry->extending_target_vm = InvalidBlockNumber;
+ /* Publish the slot in the index (reuses a tombstone when one was seen). */
+ MapSuperIndex[insert_bucket].slot_id = slot_id;
+
+ LWLockAcquire(&entry->lock, LW_EXCLUSIVE);
+ LWLockRelease(partition_lock);
+
+ return entry;
+}
+
+/*
+ * MapSuperDeleteEntry - drop a relation's entry from the superblock table
+ *
+ * Does nothing when the relation has no entry.  Otherwise the entry's
+ * runtime state is reset, the slot is pushed onto the free list and its
+ * index bucket becomes a tombstone.  The flags are cleared without writing
+ * the superblock, so any dirty state in the entry is discarded.
+ */
+void
+MapSuperDeleteEntry(RelFileLocator rnode)
{
- char page[BLCKSZ];
+ MapSuperEntry *entry = NULL;
+ uint32 hashcode;
+ int partition;
+ int slot_id;
+ int bucket = -1;
+ LWLock *partition_lock;
+
+ hashcode = MapSuperHashCode(rnode);
+ partition = MapSuperPartitionForHash(hashcode);
+ partition_lock = &MapSuperPartitionLocks[partition].lock;

- Assert(reln != NULL);
+ LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+ slot_id = MapSuperLookupSlotLocked(rnode, hashcode, partition, &bucket);
+ if (slot_id >= 0)
+ {
+ entry = MapSuperEntryBySlot(slot_id);
+ /* Waits for any current holder of the entry lock to finish. */
+ LWLockAcquire(&entry->lock, LW_EXCLUSIVE);
+ entry->flags = 0;
+ entry->runtime_flags = 0;
+ entry->page_lsn = InvalidXLogRecPtr;
+ entry->reserved_next_free_main = 0;
+ entry->reserved_next_free_fsm = 0;
+ entry->reserved_next_free_vm = 0;
+ entry->extending_target_main = InvalidBlockNumber;
+ entry->extending_target_fsm = InvalidBlockNumber;
+ entry->extending_target_vm = InvalidBlockNumber;
+ entry->in_use = false;
+ SpinLockAcquire(&MapSuperCtlData->free_list_lock);
+ entry->next_free = MapSuperCtlData->free_head;
+ MapSuperCtlData->free_head = slot_id;
+ SpinLockRelease(&MapSuperCtlData->free_list_lock);
+ LWLockRelease(&entry->lock);
+ /* Tombstone, not EMPTY, so probe chains through this bucket stay intact. */
+ MapSuperIndex[bucket].slot_id = MAPSUPER_INDEX_DELETED;
+ }
+ LWLockRelease(partition_lock);
+}
+
+/*
+ * MapSBlockRead - fetch a copy of a relation's superblock
+ *
+ * Fast path: the relation has an entry with MAPSUPER_FLAG_VALID set.  The
+ * shared copy is returned in *super without touching disk, and the status
+ * reflects the entry's MAPSUPER_FLAG_CORRUPT bit.
+ *
+ * Slow path: no entry exists, or the entry is not yet valid.  The superblock
+ * is read from disk with no entry lock held, then the entry is created or
+ * re-found with an exclusive lock and populated unless another backend made
+ * it valid in the meantime (in which case that entry's state wins).
+ *
+ * Returns MAP_SBLOCK_READ_OK, MAP_SBLOCK_READ_MISSING (no metadata fork on
+ * disk; no entry is created) or MAP_SBLOCK_READ_CORRUPT.  On the slow path
+ * *super is written only for MAP_SBLOCK_READ_OK.
+ *
+ * NOTE(review): the "no entry" and "entry not valid" branches below are
+ * line-for-line identical apart from the initial lock release; a shared
+ * helper would remove the duplication.
+ */
+static MapSBlockReadStatus
+MapSBlockRead(UmbraFileContext *map_ctx, RelFileLocator rnode, MapSuperblock *super)
+{
+ MapSuperEntry *entry;
+ MapSBlockReadStatus status = MAP_SBLOCK_READ_OK;
+ MapSuperblock disk_super;
+
+ Assert(map_ctx != NULL);
Assert(super != NULL);

- if (!UmMetadataExists(reln))
- return false;
+ if (!MapSuperFindEntryLocked(rnode, LW_SHARED, &entry))
+ {
+ /* No entry yet: read from disk first, with no locks held. */
+ status = MapSuperLoadFromDisk(map_ctx, rnode, &disk_super);
+ if (status == MAP_SBLOCK_READ_MISSING)
+ return MAP_SBLOCK_READ_MISSING;
+
+ entry = MapSuperEnsureEntryLocked(rnode);
+ if ((entry->flags & MAPSUPER_FLAG_VALID) == 0)
+ {
+ if (status == MAP_SBLOCK_READ_OK)
+ {
+ entry->super = disk_super;
+ entry->page_lsn = MapSuperblockGetLastUpdatedLSN(&disk_super);
+ entry->flags = MAPSUPER_FLAG_VALID;
+ MapSuperResetReservedNextFrees(entry);
+ }
+ else
+ {
+ /* Remember the corruption so later reads skip the disk. */
+ MapSuperblockInit(&entry->super, 0);
+ entry->page_lsn = InvalidXLogRecPtr;
+ entry->flags = MAPSUPER_FLAG_VALID | MAPSUPER_FLAG_CORRUPT;
+ MapSuperResetReservedNextFrees(entry);
+ }
+ }
+ /* Someone else populated the entry meanwhile: trust its state. */
+ else if (entry->flags & MAPSUPER_FLAG_CORRUPT)
+ status = MAP_SBLOCK_READ_CORRUPT;
+ else
+ status = MAP_SBLOCK_READ_OK;
+ }
+ else if ((entry->flags & MAPSUPER_FLAG_VALID) == 0)
+ {
+ /* Entry exists but is unpopulated: drop the shared lock and load. */
+ LWLockRelease(&entry->lock);
+ status = MapSuperLoadFromDisk(map_ctx, rnode, &disk_super);
+ if (status == MAP_SBLOCK_READ_MISSING)
+ return MAP_SBLOCK_READ_MISSING;
+
+ entry = MapSuperEnsureEntryLocked(rnode);
+ if ((entry->flags & MAPSUPER_FLAG_VALID) == 0)
+ {
+ if (status == MAP_SBLOCK_READ_OK)
+ {
+ entry->super = disk_super;
+ entry->page_lsn = MapSuperblockGetLastUpdatedLSN(&disk_super);
+ entry->flags = MAPSUPER_FLAG_VALID;
+ MapSuperResetReservedNextFrees(entry);
+ }
+ else
+ {
+ MapSuperblockInit(&entry->super, 0);
+ entry->page_lsn = InvalidXLogRecPtr;
+ entry->flags = MAPSUPER_FLAG_VALID | MAPSUPER_FLAG_CORRUPT;
+ MapSuperResetReservedNextFrees(entry);
+ }
+ }
+ else if (entry->flags & MAPSUPER_FLAG_CORRUPT)
+ status = MAP_SBLOCK_READ_CORRUPT;
+ else
+ status = MAP_SBLOCK_READ_OK;
+ }
+ else
+ {
+ /*
+ * Once a superblock is loaded into a valid shared entry, hot reads
+ * should consume that runtime state directly. Disk identity/CRC
+ * validation belongs to the slow path that populates shared state.
+ */
+ *super = entry->super;
+ status = (entry->flags & MAPSUPER_FLAG_CORRUPT) ?
+ MAP_SBLOCK_READ_CORRUPT : MAP_SBLOCK_READ_OK;
+ LWLockRelease(&entry->lock);
+ return status;
+ }
+
+ /* Slow paths arrive here holding entry->lock exclusively. */
+ switch (status)
+ {
+ case MAP_SBLOCK_READ_OK:
+ break;
+ case MAP_SBLOCK_READ_MISSING:
+ LWLockRelease(&entry->lock);
+ return MAP_SBLOCK_READ_MISSING;
+ case MAP_SBLOCK_READ_CORRUPT:
+ LWLockRelease(&entry->lock);
+ return MAP_SBLOCK_READ_CORRUPT;
+ }
+
+ *super = entry->super;
+ LWLockRelease(&entry->lock);
+ return MAP_SBLOCK_READ_OK;
+}
+
+/*
+ * Does the MAP layer keep state for this fork?  True for the main, free
+ * space map and visibility map forks; false for every other fork number.
+ */
+bool
+MapForkHasMappedState(ForkNumber forknum)
+{
+	return forknum == MAIN_FORKNUM ||
+		forknum == FSM_FORKNUM ||
+		forknum == VISIBILITYMAP_FORKNUM;
+}
+
+/*
+ * True for forks whose block count may hold InvalidBlockNumber as an
+ * "absent" marker: the free space map and visibility map forks.
+ */
+static bool
+MapForkUsesAbsentSentinel(ForkNumber forknum)
+{
+	return forknum == FSM_FORKNUM || forknum == VISIBILITYMAP_FORKNUM;
+}

- if (UmMetadataNblocks(reln) == 0)
+/*
+ * Turn a stored block count into a usable one: for forks that use the
+ * absent marker, InvalidBlockNumber reads as 0 blocks.  Any other value, and
+ * any value for other forks, is returned unchanged.
+ */
+BlockNumber
+MapNormalizeForkBlockCount(ForkNumber forknum, BlockNumber raw)
+{
+	bool		marked_absent;
+
+	marked_absent = (raw == InvalidBlockNumber) &&
+		MapForkUsesAbsentSentinel(forknum);
+
+	return marked_absent ? 0 : raw;
+}
+
+/*
+ * MapSuperForkExists - does the superblock say this fork exists?
+ *
+ * Forks without MAP state never exist here.  A fork that does not use the
+ * absent marker (the main fork) always exists.  For the remaining forks the
+ * answer is whether the stored logical block count differs from
+ * InvalidBlockNumber.
+ */
+bool
+MapSuperForkExists(const MapSuperblock *super, ForkNumber forknum)
+{
+ if (!MapForkHasMappedState(forknum))
return false;

- UmMetadataRead(reln, MAP_BLOCK_SUPER, page);
- MapSuperblockUnpackPage(super, page);
+ if (!MapForkUsesAbsentSentinel(forknum))
+ return true;

- if (!MapSuperblockHasValidIdentity(super))
- MapSBlockReportCorrupt(reln, "invalid identity");
- if (!MapSuperblockCheckCRC(super))
- MapSBlockReportCorrupt(reln, "CRC mismatch");
+ return MapSuperblockGetLogicalNblocks(super, forknum) != InvalidBlockNumber;
+}
+
+
+/*
+ * Return the runtime "extending" flag bit that belongs to a fork, or 0 for
+ * forks that have none.
+ */
+static uint32
+MapSuperExtendingFlag(ForkNumber forknum)
+{
+	if (forknum == MAIN_FORKNUM)
+		return MAPSUPER_RUNTIME_FLAG_EXTENDING_MAIN;
+
+	if (forknum == FSM_FORKNUM)
+		return MAPSUPER_RUNTIME_FLAG_EXTENDING_FSM;
+
+	if (forknum == VISIBILITYMAP_FORKNUM)
+		return MAPSUPER_RUNTIME_FLAG_EXTENDING_VM;
+
+	return 0;
+}

+/*
+ * Read the per-fork extension target stored in a superblock entry.  Forks
+ * without such a field yield InvalidBlockNumber.
+ */
+static BlockNumber
+MapSuperGetExtendingTarget(const MapSuperEntry *entry, ForkNumber forknum)
+{
+	BlockNumber target = InvalidBlockNumber;
+
+	Assert(entry != NULL);
+
+	if (forknum == MAIN_FORKNUM)
+		target = entry->extending_target_main;
+	else if (forknum == FSM_FORKNUM)
+		target = entry->extending_target_fsm;
+	else if (forknum == VISIBILITYMAP_FORKNUM)
+		target = entry->extending_target_vm;
+
+	return target;
+}
+
+/*
+ * Store the per-fork extension target in a superblock entry.  Raises ERROR
+ * for a fork that has no such field.
+ */
+static void
+MapSuperSetExtendingTarget(MapSuperEntry *entry, ForkNumber forknum,
+						   BlockNumber nblocks)
+{
+	Assert(entry != NULL);
+
+	if (forknum == MAIN_FORKNUM)
+		entry->extending_target_main = nblocks;
+	else if (forknum == FSM_FORKNUM)
+		entry->extending_target_fsm = nblocks;
+	else if (forknum == VISIBILITYMAP_FORKNUM)
+		entry->extending_target_vm = nblocks;
+	else
+		elog(ERROR, "unsupported fork number for extend target: %d", forknum);
+}
+
+
+
+
+
+/*
+ * MapSuperPrepareEntryForUpdate - get a trusted, exclusively locked entry
+ *
+ * map_ctx         file context used to read the superblock from disk
+ * rnode           relation whose entry is wanted
+ * map_lsn         LSN of the update being applied, or InvalidXLogRecPtr
+ * missing_errmsg  message for the ERROR raised when the superblock is
+ *                 missing on disk outside recovery
+ * entry_p         receives the entry on success
+ *
+ * Returns true with *entry_p set and entry->lock held exclusively; the caller
+ * releases the lock.  Returns false, holding no lock, only when the
+ * superblock is missing on disk during recovery.
+ *
+ * An untrusted superblock (corrupt flag, bad identity, or bad CRC on a clean
+ * entry) raises ERROR unless we are in recovery with a valid map_lsn; in
+ * that case the superblock is reinitialized and the entry flags are reset to
+ * just MAPSUPER_FLAG_VALID.
+ *
+ * NOTE(review): unlike MapSBlockRead(), the load path here does not call
+ * MapSuperResetReservedNextFrees() after populating the entry -- confirm
+ * this difference is intended.
+ */
+static bool
+MapSuperPrepareEntryForUpdate(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ XLogRecPtr map_lsn, const char *missing_errmsg,
+ MapSuperEntry **entry_p)
+{
+ MapSuperEntry *entry;
+ uint32 flags;
+
+ Assert(map_ctx != NULL);
+ Assert(entry_p != NULL);
+
+ if (!MapSuperFindEntryLocked(rnode, LW_EXCLUSIVE, &entry))
+ {
+ MapSuperblock disk_super;
+ MapSBlockReadStatus status;
+
+ /* No entry: read from disk while holding no locks. */
+ status = MapSuperLoadFromDisk(map_ctx, rnode, &disk_super);
+ if (status == MAP_SBLOCK_READ_MISSING)
+ {
+ if (InRecovery)
+ return false;
+ elog(ERROR, "%s", missing_errmsg);
+ }
+
+ entry = MapSuperEnsureEntryLocked(rnode);
+ /* Populate only if nobody else did so while we were reading. */
+ if ((entry->flags & MAPSUPER_FLAG_VALID) == 0)
+ {
+ if (status == MAP_SBLOCK_READ_OK)
+ {
+ entry->super = disk_super;
+ entry->page_lsn = MapSuperblockGetLastUpdatedLSN(&disk_super);
+ entry->flags = MAPSUPER_FLAG_VALID;
+ }
+ else
+ {
+ MapSuperblockInit(&entry->super, 0);
+ entry->page_lsn = InvalidXLogRecPtr;
+ entry->flags = MAPSUPER_FLAG_VALID | MAPSUPER_FLAG_CORRUPT;
+ }
+ }
+ }
+
+ flags = entry->flags;
+
+ /*
+ * The CRC is only checked on clean entries: a dirty entry has in-memory
+ * changes not yet reflected in a freshly written image.
+ * NOTE(review): presumably the CRC is refreshed at write time -- confirm.
+ */
+ if ((flags & MAPSUPER_FLAG_CORRUPT) ||
+ !MapSuperblockHasValidIdentity(&entry->super) ||
+ ((flags & MAPSUPER_FLAG_DIRTY) == 0 &&
+ !MapSuperblockCheckCRC(&entry->super)))
+ {
+ /* The report raises ERROR with entry->lock still held. */
+ if (!InRecovery || map_lsn == InvalidXLogRecPtr)
+ MapSBlockReportCorrupt(rnode, "invalid identity or CRC");
+
+ /*
+ * Update paths rebuild superblock state from WAL-backed metadata.
+ * Never continue from an untrusted superblock image.
+ */
+ MapSuperblockInit(&entry->super, 0);
+ entry->flags = MAPSUPER_FLAG_VALID;
+ }
+
+ *entry_p = entry;
return true;
}

+/*
+ * MapSBlockUpdateLogicalNblocks - set or raise a fork's logical block count
+ *
+ * map_ctx    file context used if the superblock must be loaded from disk
+ * rnode      relation being updated
+ * forknum    fork to update; forks without MAP state are ignored
+ * nblocks    new logical block count
+ * map_lsn    LSN to stamp on the superblock; when InvalidXLogRecPtr the
+ *            current replay (in recovery) or write position is used
+ * bump_only  when true the count is only ever raised, never lowered
+ *
+ * When a change is applied the entry is stamped with the LSN and marked
+ * dirty.  Returns without effect if the entry cannot be prepared.
+ */
+static void
+MapSBlockUpdateLogicalNblocks(UmbraFileContext *map_ctx, RelFileLocator rnode,
+							  ForkNumber forknum, BlockNumber nblocks,
+							  XLogRecPtr map_lsn, bool bump_only)
+{
+	MapSuperEntry *entry;
+	BlockNumber known_nblocks;
+
+	if (!MapForkHasMappedState(forknum))
+		return;
+
+	if (!MapSuperPrepareEntryForUpdate(map_ctx, rnode, map_lsn,
+									   "MAP fork is missing while updating superblock",
+									   &entry))
+		return;
+
+	/* An "absent" fork counts as zero blocks for the comparison. */
+	known_nblocks = MapNormalizeForkBlockCount(forknum,
+						MapSuperblockGetLogicalNblocks(&entry->super, forknum));
+
+	if (!bump_only || known_nblocks < nblocks)
+	{
+		XLogRecPtr	stamp_lsn = map_lsn;
+
+		MapSuperblockSetLogicalNblocks(&entry->super, forknum, nblocks);
+
+		if (stamp_lsn == InvalidXLogRecPtr)
+			stamp_lsn = InRecovery ? GetXLogReplayRecPtr(NULL) :
+				GetXLogWriteRecPtr();
+
+		MapSuperblockSetLastUpdatedLSN(&entry->super, stamp_lsn);
+		entry->page_lsn = stamp_lsn;
+		entry->flags |= MAPSUPER_FLAG_DIRTY;
+	}
+
+	LWLockRelease(&entry->lock);
+}
+
+/*
+ * Set or clear MAP_SUPERBLOCK_FLAG_SKIP_WAL_PENDING in the cached superblock.
+ *
+ * The entry is touched only when the flag word actually changes; in that
+ * case it is stamped with map_lsn (or the current replay/write position when
+ * map_lsn is invalid) and marked dirty.  Returns quietly when
+ * MapSuperPrepareEntryForUpdate() reports that no entry is available.
+ */
+static void
+MapSBlockSetPendingFlag(UmbraFileContext *map_ctx, RelFileLocator rnode,
+						bool pending, XLogRecPtr map_lsn)
+{
+	MapSuperEntry *sb_entry;
+	uint32		old_flags;
+	uint32		new_flags;
+
+	if (!MapSuperPrepareEntryForUpdate(map_ctx, rnode, map_lsn,
+									   "MAP fork is missing while updating superblock state",
+									   &sb_entry))
+		return;
+
+	old_flags = MapSuperblockGetFlags(&sb_entry->super);
+	new_flags = pending ?
+		(old_flags | MAP_SUPERBLOCK_FLAG_SKIP_WAL_PENDING) :
+		(old_flags & ~MAP_SUPERBLOCK_FLAG_SKIP_WAL_PENDING);
+
+	if (new_flags != old_flags)
+	{
+		XLogRecPtr	stamp_lsn = map_lsn;
+
+		/* Caller supplied no LSN: fall back to the current WAL position. */
+		if (stamp_lsn == InvalidXLogRecPtr)
+			stamp_lsn = InRecovery ? GetXLogReplayRecPtr(NULL) :
+				GetXLogWriteRecPtr();
+
+		MapSuperblockSetFlags(&sb_entry->super, new_flags);
+		MapSuperblockSetLastUpdatedLSN(&sb_entry->super, stamp_lsn);
+		sb_entry->page_lsn = stamp_lsn;
+		sb_entry->flags |= MAPSUPER_FLAG_DIRTY;
+	}
+
+	LWLockRelease(&sb_entry->lock);
+}
+
+/*
+ * Raise the cached next-free physical block and/or physical capacity of one
+ * fork to at least nblocks; neither value is ever lowered here.
+ *
+ * bump_next_free and bump_capacity select which of the two watermarks may be
+ * raised.  When the entry is marked dirty it is stamped with map_lsn, or with
+ * the current replay/write position if map_lsn is invalid.  Nothing happens
+ * for forks without mapped state, or when MapSuperPrepareEntryForUpdate()
+ * reports that no entry is available.
+ */
void
-MapSBlockWrite(SMgrRelation reln, const MapSuperblock *super, bool skipFsync)
+MapSBlockBumpPhysicalState(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber nblocks,
+ bool bump_next_free, bool bump_capacity,
+ XLogRecPtr map_lsn)
{
- MapSuperblock write_super;
- char page[BLCKSZ];
+ MapSuperEntry *entry;
+ BlockNumber current_next;
+ BlockNumber current_capacity;
+ bool changed = false;

- Assert(reln != NULL);
- Assert(super != NULL);
+ if (!MapForkHasMappedState(forknum))
+ return;
+
+ if (!MapSuperPrepareEntryForUpdate(map_ctx, rnode, map_lsn,
+ "MAP fork is missing while updating superblock",
+ &entry))
+ return;
+
+ current_next = MapSuperblockGetNextFreePhysBlock(&entry->super, forknum);
+ current_capacity = MapSuperblockGetPhysCapacity(&entry->super, forknum);
+ current_next = MapNormalizeForkBlockCount(forknum, current_next);
+ current_capacity = MapNormalizeForkBlockCount(forknum, current_capacity);
+
+ if (bump_next_free && current_next < nblocks)
+ {
+ MapSuperblockSetNextFreePhysBlock(&entry->super, forknum, nblocks);
+ /*
+ * NOTE(review): a next-free bump on its own marks the entry dirty only
+ * during recovery; outside recovery the new value stays in the cache
+ * without MAPSUPER_FLAG_DIRTY unless capacity changes too -- confirm
+ * this asymmetry is intended.
+ */
+ if (InRecovery)
+ changed = true;
+ }
+ if (bump_capacity && current_capacity < nblocks)
+ {
+ MapSuperblockSetPhysCapacity(&entry->super, forknum, nblocks);
+ changed = true;
+ }
+
+ if (changed)
+ {
+ /* No LSN from the caller: use the current replay/write position. */
+ if (map_lsn == InvalidXLogRecPtr)
+ {
+ if (InRecovery)
+ map_lsn = GetXLogReplayRecPtr(NULL);
+ else
+ map_lsn = GetXLogWriteRecPtr();
+ }
+ MapSuperblockSetLastUpdatedLSN(&entry->super, map_lsn);
+ entry->page_lsn = map_lsn;
+ entry->flags |= MAPSUPER_FLAG_DIRTY;
+ }
+
+ LWLockRelease(&entry->lock);
+}
+
+/*
+ * MapSBlockEnsurePhysicalNblocks
+ *		Make sure at least nblocks physical blocks of forknum exist and that
+ *		the cached superblock's physical capacity reflects it.
+ *
+ * Only one backend at a time extends a given fork: the extender sets the
+ * fork's "extending" runtime flag and publishes its target.  Concurrent
+ * callers raise that target if they need more, then sleep and retry until the
+ * recorded capacity covers their request.  The extender keeps looping until
+ * the capacity reaches the largest target published so far.
+ *
+ * Returns false when the fork has no mapped state, the superblock cannot be
+ * loaded, or MapSuperPrepareEntryForUpdate() reports the entry unavailable
+ * before extension starts; true once capacity >= nblocks (trivially so for
+ * nblocks == 0).  If an error is raised while extending, the extending flag
+ * and target are cleared before the error is re-thrown.
+ */
+bool
+MapSBlockEnsurePhysicalNblocks(UmbraFileContext *map_ctx, RelFileLocator rnode,
+							   ForkNumber forknum, BlockNumber nblocks,
+							   bool skipFsync)
+{
+	MapSuperEntry *entry;
+	uint32		extend_flag;
+	BlockNumber current;
+	BlockNumber desired;
+
+	if (!MapForkHasMappedState(forknum))
+		return false;
+
+	if (nblocks == 0)
+		return true;
+
+	if (!MapSBlockEnsureLoaded(map_ctx, rnode))
+		return false;
+
+	extend_flag = MapSuperExtendingFlag(forknum);
+	Assert(extend_flag != 0);
+
+retry:
+	if (!MapSuperPrepareEntryForUpdate(map_ctx, rnode, InvalidXLogRecPtr,
+									   "MAP fork is missing while materializing physical blocks",
+									   &entry))
+		return false;
+
+	current = MapSuperblockGetPhysCapacity(&entry->super, forknum);
+	current = MapNormalizeForkBlockCount(forknum, current);
+	if (current >= nblocks)
+	{
+		LWLockRelease(&entry->lock);
+		return true;
+	}
+
+	if ((entry->runtime_flags & extend_flag) != 0)
+	{
+		/*
+		 * Someone else is extending.  Make sure their target covers our
+		 * request, then back off and re-check.
+		 */
+		if (MapSuperGetExtendingTarget(entry, forknum) < nblocks)
+			MapSuperSetExtendingTarget(entry, forknum, nblocks);
+		LWLockRelease(&entry->lock);
+		pg_usleep(1000L);
+
+		CHECK_FOR_INTERRUPTS();
+		goto retry;
+	}
+
+	/* We are the extender now; do the I/O without holding the entry lock. */
+	entry->runtime_flags |= extend_flag;
+	MapSuperSetExtendingTarget(entry, forknum, nblocks);
+	LWLockRelease(&entry->lock);
+
+	PG_TRY();
+	{
+		desired = nblocks;
+
+		for (;;)
+		{
+			BlockNumber blk;
+
+			for (blk = current; blk < desired; blk++)
+			{
+				if (!umfile_ctx_block_exists(map_ctx, forknum, blk))
+					umfile_zeroextend(map_ctx, forknum, blk, 1, skipFsync);
+			}
+
+			if (!MapSuperPrepareEntryForUpdate(map_ctx, rnode, InvalidXLogRecPtr,
+											   "MAP fork is missing while materializing physical blocks",
+											   &entry))
+				elog(ERROR,
+					 "MAP fork disappeared while materializing relation %u/%u/%u fork %d",
+					 rnode.spcOid, rnode.dbOid, rnode.relNumber, forknum);
+
+			current = MapSuperblockGetPhysCapacity(&entry->super, forknum);
+			current = MapNormalizeForkBlockCount(forknum, current);
+			if (current < desired)
+			{
+				XLogRecPtr	map_lsn;
+
+				if (InRecovery)
+					map_lsn = GetXLogReplayRecPtr(NULL);
+				else
+					map_lsn = GetXLogWriteRecPtr();
+
+				MapSuperblockSetPhysCapacity(&entry->super, forknum, desired);
+				MapSuperblockSetLastUpdatedLSN(&entry->super, map_lsn);
+				entry->page_lsn = map_lsn;
+				entry->flags |= MAPSUPER_FLAG_DIRTY;
+				current = desired;
+			}
+
+			/* Pick up any larger target published by waiting backends. */
+			desired = Max(desired, MapSuperGetExtendingTarget(entry, forknum));
+			if (current >= desired)
+			{
+				entry->runtime_flags &= ~extend_flag;
+				MapSuperSetExtendingTarget(entry, forknum, InvalidBlockNumber);
+				LWLockRelease(&entry->lock);
+
+				/*
+				 * Done.  Leave via "break" instead of returning: control
+				 * must not exit a PG_TRY block with return/goto, or the
+				 * exception stack would keep pointing into this frame.
+				 * The break only leaves the enclosing for loop, so the
+				 * PG_TRY block still ends normally.
+				 */
+				break;
+			}
+
+			MapSuperSetExtendingTarget(entry, forknum, desired);
+			LWLockRelease(&entry->lock);
+		}
+	}
+	PG_CATCH();
+	{
+		/*
+		 * Give up the extender role so waiters do not spin forever.
+		 *
+		 * NOTE(review): if the error was raised while this backend still
+		 * held entry->lock (e.g. from inside
+		 * MapSuperPrepareEntryForUpdate()), re-acquiring it here could
+		 * self-deadlock -- confirm MapSuperFindEntryLocked() is safe then.
+		 */
+		if (MapSuperFindEntryLocked(rnode, LW_EXCLUSIVE, &entry))
+		{
+			entry->runtime_flags &= ~extend_flag;
+			MapSuperSetExtendingTarget(entry, forknum, InvalidBlockNumber);
+			LWLockRelease(&entry->lock);
+		}
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	/* The loop above only exits normally once capacity covers the target. */
+	return true;
+}
+
+/*
+ * Create a fresh superblock for rnode in the shared cache and write it to
+ * block MAP_BLOCK_SUPER of the metadata fork right away.
+ *
+ * The image is stamped with map_lsn when that is valid; otherwise with the
+ * current replay position during recovery or the WAL write position in
+ * normal running, the same choice the superblock update paths in this file
+ * make.  Outside recovery, WAL is flushed up to that LSN before the sector
+ * is written.  Raises ERROR if the metadata fork does not exist.
+ */
+void
+MapSBlockInit(UmbraFileContext *map_ctx, RelFileLocator rnode, XLogRecPtr map_lsn)
+{
+ MapSuperEntry *entry;
+ MapSuperblock super;
+ MapSuperblock write_super;
+ char sector[MAP_SUPERBLOCK_SIZE];
+ XLogRecPtr write_lsn;
+
+ Assert(map_ctx != NULL);
+ if (!umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+ UMFILE_EXISTS_DENSE))
+ elog(ERROR, "MAP fork is missing while initializing superblock");
+
+ entry = MapSuperEnsureEntryLocked(rnode);
+
+ MapSuperblockInit(&super, 0);
+
+ entry->super = super;
+ /*
+ * Choose the stamp LSN like the update paths do.  This function is also
+ * reached from redo, where the replay position is used instead of the WAL
+ * write pointer.
+ */
+ if (map_lsn != InvalidXLogRecPtr)
+ entry->page_lsn = map_lsn;
+ else if (InRecovery)
+ entry->page_lsn = GetXLogReplayRecPtr(NULL);
+ else
+ entry->page_lsn = GetXLogWriteRecPtr();
+ MapSuperblockSetLastUpdatedLSN(&entry->super, entry->page_lsn);
+ entry->flags = MAPSUPER_FLAG_VALID | MAPSUPER_FLAG_DIRTY;
+
+ /*
+ * Persist superblock immediately so later backends in bootstrap/initdb can
+ * read block 0 even before checkpoint gets a chance to flush.
+ * This keeps create-time O(1): only one 512-byte sector is written.
+ */
+ write_super = entry->super;
+ write_lsn = entry->page_lsn;
+ LWLockRelease(&entry->lock);
+
+ /* WAL up to the stamped LSN goes to disk before the sector (not in redo). */
+ if (!InRecovery && write_lsn != InvalidXLogRecPtr)
+ XLogFlush(write_lsn);

- write_super = *super;
MapSuperblockRefreshCRC(&write_super);
- MapSuperblockPackPage(&write_super, page);
+ MapSuperblockPackSector(&write_super, sector);
+ umfile_ctx_write(map_ctx, UMBRA_METADATA_FORKNUM, MAP_BLOCK_SUPER,
+ sector, MAP_SUPERBLOCK_SIZE, false);
+ umfile_ctx_register_dirty(map_ctx, UMBRA_METADATA_FORKNUM, MAP_BLOCK_SUPER,
+ false, false);
+}
+
+/*
+ * Make sure the shared superblock cache holds an entry for rnode.
+ *
+ * Returns false when the metadata fork does not exist or its superblock is
+ * reported missing on disk; true otherwise.  An image that is neither OK nor
+ * missing is still cached, as a freshly initialized superblock flagged
+ * MAPSUPER_FLAG_CORRUPT.  The entry lock is released before returning true.
+ */
+bool
+MapSBlockEnsureLoaded(UmbraFileContext *map_ctx, RelFileLocator rnode)
+{
+ MapSuperEntry *entry;

- if (!UmMetadataOpenOrCreate(reln, false, NULL))
- elog(ERROR, "could not open Umbra metadata file for superblock write");
+ Assert(map_ctx != NULL);

- if (UmMetadataNblocks(reln) == 0)
- UmMetadataExtend(reln, MAP_BLOCK_SUPER, page, skipFsync);
- else
- UmMetadataWrite(reln, MAP_BLOCK_SUPER, page, skipFsync);
+ if (!umfile_ctx_fork_exists(map_ctx, UMBRA_METADATA_FORKNUM,
+ UMFILE_EXISTS_DENSE))
+ return false;
+
+ /* Cache miss: read the on-disk image before creating the entry. */
+ if (!MapSuperFindEntryLocked(rnode, LW_SHARED, &entry))
+ {
+ MapSuperblock disk_super;
+ MapSBlockReadStatus status;
+
+ status = MapSuperLoadFromDisk(map_ctx, rnode, &disk_super);
+ if (status == MAP_SBLOCK_READ_MISSING)
+ return false;
+
+ entry = MapSuperEnsureEntryLocked(rnode);
+ /* Populate only if the entry is not already valid. */
+ if ((entry->flags & MAPSUPER_FLAG_VALID) == 0)
+ {
+ if (status == MAP_SBLOCK_READ_OK)
+ {
+ entry->super = disk_super;
+ entry->page_lsn = MapSuperblockGetLastUpdatedLSN(&disk_super);
+ entry->flags = MAPSUPER_FLAG_VALID;
+ }
+ else
+ {
+ MapSuperblockInit(&entry->super, 0);
+ entry->page_lsn = InvalidXLogRecPtr;
+ entry->flags = MAPSUPER_FLAG_VALID | MAPSUPER_FLAG_CORRUPT;
+ }
+ }
+ }
+
+ LWLockRelease(&entry->lock);
+ return true;
+}
+
+/*
+ * Fetch the logical block count recorded in the superblock for one fork.
+ *
+ * Returns true and sets *nblocks (normalized for the fork) on success.
+ * Returns false, leaving *nblocks untouched, when the fork has no mapped
+ * state, the superblock is missing or corrupt, or the image lacks a valid
+ * identity.  Corruption is reported through MapSBlockReportCorrupt() unless
+ * we are in recovery.
+ */
+bool
+MapSBlockTryGetLogicalNblocks(UmbraFileContext *map_ctx, RelFileLocator rnode,
+							  ForkNumber forknum,
+							  BlockNumber *nblocks)
+{
+	MapSuperblock sb;
+	MapSBlockReadStatus rc;
+
+	Assert(nblocks != NULL);
+
+	if (!MapForkHasMappedState(forknum))
+		return false;
+
+	rc = MapSBlockRead(map_ctx, rnode, &sb);
+	if (rc == MAP_SBLOCK_READ_MISSING)
+		return false;
+	if (rc == MAP_SBLOCK_READ_CORRUPT)
+	{
+		if (!InRecovery)
+			MapSBlockReportCorrupt(rnode, "invalid identity/CRC or short file");
+		return false;
+	}
+
+	if (!MapSuperblockHasValidIdentity(&sb))
+		return false;
+
+	*nblocks =
+		MapNormalizeForkBlockCount(forknum,
+								   MapSuperblockGetLogicalNblocks(&sb, forknum));
+	return true;
+}
+
+/*
+ * Does the superblock say that forknum exists for this relation?
+ *
+ * Answers false for forks without mapped state, for a missing or corrupt
+ * superblock (no corruption report is raised here), and for an image without
+ * a valid identity; otherwise defers to MapSuperForkExists().
+ */
+bool
+MapSBlockForkExists(UmbraFileContext *map_ctx, RelFileLocator rnode,
+					ForkNumber forknum)
+{
+	MapSuperblock sb;
+	MapSBlockReadStatus rc;
+
+	if (!MapForkHasMappedState(forknum))
+		return false;
+
+	rc = MapSBlockRead(map_ctx, rnode, &sb);
+	if (rc == MAP_SBLOCK_READ_MISSING || rc == MAP_SBLOCK_READ_CORRUPT)
+		return false;
+
+	if (!MapSuperblockHasValidIdentity(&sb))
+		return false;
+
+	return MapSuperForkExists(&sb, forknum);
+}
+
+/*
+ * Fetch the physical capacity recorded in the superblock for one fork.
+ *
+ * Returns true and sets *nblocks (normalized for the fork) on success.
+ * Returns false, leaving *nblocks untouched, when the fork has no mapped
+ * state, the superblock is missing or corrupt, or the image lacks a valid
+ * identity.  Corruption is reported through MapSBlockReportCorrupt() unless
+ * we are in recovery.
+ */
+bool
+MapSBlockTryGetPhysicalNblocks(UmbraFileContext *map_ctx, RelFileLocator rnode,
+							   ForkNumber forknum, BlockNumber *nblocks)
+{
+	MapSuperblock sb;
+	MapSBlockReadStatus rc;
+
+	Assert(nblocks != NULL);
+
+	if (!MapForkHasMappedState(forknum))
+		return false;
+
+	rc = MapSBlockRead(map_ctx, rnode, &sb);
+	if (rc == MAP_SBLOCK_READ_MISSING)
+		return false;
+	if (rc == MAP_SBLOCK_READ_CORRUPT)
+	{
+		if (!InRecovery)
+			MapSBlockReportCorrupt(rnode, "invalid identity/CRC or short file");
+		return false;
+	}
+
+	if (!MapSuperblockHasValidIdentity(&sb))
+		return false;
+
+	*nblocks =
+		MapNormalizeForkBlockCount(forknum,
+								   MapSuperblockGetPhysCapacity(&sb, forknum));
+	return true;
+}
+
+/*
+ * Fetch the next-free physical block recorded in the superblock for one fork.
+ *
+ * Returns true and sets *next_free_pblk (normalized for the fork) on success.
+ * Returns false, leaving the output untouched, when the fork has no mapped
+ * state, the superblock is missing or corrupt, or the image lacks a valid
+ * identity.  Corruption is reported through MapSBlockReportCorrupt() unless
+ * we are in recovery.
+ */
+bool
+MapSBlockTryGetNextFreePhysBlock(UmbraFileContext *map_ctx, RelFileLocator rnode,
+								 ForkNumber forknum, BlockNumber *next_free_pblk)
+{
+	MapSuperblock sb;
+	MapSBlockReadStatus rc;
+
+	Assert(next_free_pblk != NULL);
+
+	if (!MapForkHasMappedState(forknum))
+		return false;
+
+	rc = MapSBlockRead(map_ctx, rnode, &sb);
+	if (rc == MAP_SBLOCK_READ_MISSING)
+		return false;
+	if (rc == MAP_SBLOCK_READ_CORRUPT)
+	{
+		if (!InRecovery)
+			MapSBlockReportCorrupt(rnode, "invalid identity/CRC or short file");
+		return false;
+	}
+
+	if (!MapSuperblockHasValidIdentity(&sb))
+		return false;
+
+	*next_free_pblk =
+		MapNormalizeForkBlockCount(forknum,
+								   MapSuperblockGetNextFreePhysBlock(&sb, forknum));
+	return true;
+}
+
+
+
+/*
+ * Raise the cached logical block count of forknum to nblocks if it is
+ * currently smaller (bump_only mode of MapSBlockUpdateLogicalNblocks()).
+ */
+void
+MapSBlockBumpLogicalNblocks(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber nblocks,
+ XLogRecPtr map_lsn)
+{
+ MapSBlockUpdateLogicalNblocks(map_ctx, rnode, forknum, nblocks,
+ map_lsn, true);
+}
+
+/*
+ * Raise the cached physical capacity of forknum to at least nblocks; the
+ * next-free watermark is left alone (bump_capacity only).
+ */
+void
+MapSBlockBumpPhysicalNblocks(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber nblocks,
+ XLogRecPtr map_lsn)
+{
+ MapSBlockBumpPhysicalState(map_ctx, rnode, forknum, nblocks,
+ false, true, map_lsn);
}

+/*
+ * Raise the cached next-free physical block of forknum to at least
+ * next_free_pblk; the capacity is left alone (bump_next_free only).
+ */
void
-MapSBlockInitNew(SMgrRelation reln, uint32 flags, XLogRecPtr lsn, bool skipFsync)
+MapSBlockBumpNextFreePhysBlock(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber next_free_pblk,
+ XLogRecPtr map_lsn)
+{
+ MapSBlockBumpPhysicalState(map_ctx, rnode, forknum, next_free_pblk,
+ true, false, map_lsn);
+}
+
+/*
+ * Store nblocks as the cached logical block count of forknum
+ * unconditionally, i.e. the count may also shrink (bump_only = false).
+ */
+void
+MapSBlockSetLogicalNblocks(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber nblocks,
+ XLogRecPtr map_lsn)
+{
+ MapSBlockUpdateLogicalNblocks(map_ctx, rnode, forknum, nblocks,
+ map_lsn, false);
+}
+
+/*
+ * Public entry point for setting or clearing the superblock's
+ * MAP_SUPERBLOCK_FLAG_SKIP_WAL_PENDING flag; see MapSBlockSetPendingFlag().
+ */
+void
+MapSBlockSetSkipWalPending(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ bool pending, XLogRecPtr map_lsn)
+{
+ MapSBlockSetPendingFlag(map_ctx, rnode, pending, map_lsn);
+}
+
+/*
+ * Report whether MAP_SUPERBLOCK_FLAG_SKIP_WAL_PENDING is set in the
+ * relation's superblock.
+ *
+ * A missing or corrupt superblock, or one without a valid identity, reads as
+ * "not pending"; no corruption report is raised here.
+ */
+bool
+MapSBlockIsSkipWalPending(UmbraFileContext *map_ctx, RelFileLocator rnode)
{
MapSuperblock super;

- MapSuperblockInit(&super, flags);
- MapSuperblockSetLastUpdatedLSN(&super, lsn);
- MapSBlockWrite(reln, &super, skipFsync);
+ switch (MapSBlockRead(map_ctx, rnode, &super))
+ {
+ case MAP_SBLOCK_READ_OK:
+ break;
+ case MAP_SBLOCK_READ_MISSING:
+ case MAP_SBLOCK_READ_CORRUPT:
+ return false;
+ }
+
+ if (!MapSuperblockHasValidIdentity(&super))
+ return false;
+
+ return (MapSuperblockGetFlags(&super) &
+ MAP_SUPERBLOCK_FLAG_SKIP_WAL_PENDING) != 0;
}

+/*
+ * Recompute the sizing values derived from the map_superblocks setting:
+ * the entry capacity (never below MAP_SUPERBLOCK_MIN_ENTRIES) and the
+ * per-partition index capacity that follows from it.
+ */
static void
-MapSBlockReportCorrupt(SMgrRelation reln, const char *reason)
+MapSuperTableRefreshDerivedState(void)
{
- RelFileLocator rlocator = reln->smgr_rlocator.locator;
+ MapSuperCapacity = Max(map_superblocks, MAP_SUPERBLOCK_MIN_ENTRIES);
+ MapSuperIndexCapacityPerPartition =
+ MapSuperIndexCapacityForPartition(MapSuperCapacity);
+}

- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("Umbra metadata superblock is corrupted for relation %u/%u/%u: %s",
- rlocator.spcOid, rlocator.dbOid, rlocator.relNumber, reason)));
+/*
+ * Request the shared-memory areas backing the superblock table: the control
+ * struct, the partition locks, the entry array and the index slots.  Sizes
+ * follow from the values computed by MapSuperTableRefreshDerivedState().
+ */
+void
+MapSuperTableShmemRequest(void)
+{
+ int total_index_slots;
+
+ MapSuperTableRefreshDerivedState();
+ /* One index region of equal size per partition. */
+ total_index_slots =
+ MapSuperIndexCapacityPerPartition * MAP_SUPER_NPARTITIONS;
+
+ ShmemRequestStruct(.name = "Map Superblock Table Ctl",
+ .size = sizeof(MapSuperCtl),
+ .ptr = (void **) &MapSuperCtlData,
+ );
+
+ ShmemRequestStruct(.name = "Map Superblock Partition Locks",
+ .size = MAP_SUPER_NPARTITIONS * sizeof(LWLockPadded),
+ .ptr = (void **) &MapSuperPartitionLocks,
+ );
+
+ ShmemRequestStruct(.name = "Map Superblock Table Entries",
+ .size = MapSuperCapacity * sizeof(MapSuperEntry),
+ .ptr = (void **) &MapSuperEntries,
+ );
+
+ ShmemRequestStruct(.name = "Map Superblock Table Index",
+ .size = total_index_slots * sizeof(MapSuperIndexSlot),
+ .ptr = (void **) &MapSuperIndex,
+ );
+}
+
+/*
+ * Initialize the superblock table's shared state: partition locks, the free
+ * list threading every entry, per-entry locks and extending targets, and an
+ * empty index.
+ */
+void
+MapSuperTableShmemInit(void)
+{
+ int total_index_slots;
+ int i;
+
+ MapSuperTableRefreshDerivedState();
+ total_index_slots = MapSuperIndexCapacityPerPartition * MAP_SUPER_NPARTITIONS;
+
+ for (i = 0; i < MAP_SUPER_NPARTITIONS; i++)
+ LWLockInitialize(&MapSuperPartitionLocks[i].lock,
+ LWTRANCHE_MAP_BUFFER_CONTENT);
+
+ /* Chain all entries into the free list: 0 -> 1 -> ... -> END. */
+ MapSuperCtlData->free_head = 0;
+ SpinLockInit(&MapSuperCtlData->free_list_lock);
+ for (i = 0; i < MapSuperCapacity; i++)
+ {
+ MapSuperEntry *entry = &MapSuperEntries[i];
+
+ MemSet(entry, 0, sizeof(*entry));
+ entry->next_free =
+ (i == MapSuperCapacity - 1) ? MAPSUPER_FREENEXT_END : (i + 1);
+ entry->in_use = false;
+ /* No extension in progress for any fork. */
+ entry->extending_target_main = InvalidBlockNumber;
+ entry->extending_target_fsm = InvalidBlockNumber;
+ entry->extending_target_vm = InvalidBlockNumber;
+ /* Entry locks share the tranche used for the partition locks above. */
+ LWLockInitialize(&entry->lock, LWTRANCHE_MAP_BUFFER_CONTENT);
+ }
+
+ for (i = 0; i < total_index_slots; i++)
+ MapSuperIndex[i].slot_id = MAPSUPER_INDEX_EMPTY;
+}
+
+/*
+ * Attach-time hook: only the process-local derived sizing values need to be
+ * recomputed; nothing in shared memory is touched here.
+ */
+void
+MapSuperTableShmemAttach(void)
+{
+ MapSuperTableRefreshDerivedState();
}
diff --git a/src/backend/storage/map/meson.build b/src/backend/storage/map/meson.build
index 0f780fe522..8747f0b714 100644
--- a/src/backend/storage/map/meson.build
+++ b/src/backend/storage/map/meson.build
@@ -2,5 +2,9 @@

backend_sources += files(
'map.c',
+ 'mapinit.c',
+ 'mapbuf.c',
+ 'mapflush.c',
+ 'mapclock.c',
'mapsuper.c',
)
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index c9a3ef6461..631d09d4b4 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -134,6 +134,13 @@ typedef struct f_smgr
void (*smgr_sync_relation_metadata) (SMgrRelation reln);
void (*smgr_unlink_relation_metadata) (RelFileLocatorBackend rlocator,
bool isRedo);
+ bool (*smgr_createdb_allows_wal_log) (void);
+ void (*smgr_checkpoint_database_tablespaces) (Oid dbid,
+ int ntablespaces,
+ const Oid *tablespace_ids);
+ void (*smgr_invalidate_database_tablespaces) (Oid dbid,
+ int ntablespaces,
+ const Oid *tablespace_ids);
int (*smgr_fd) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off);
} f_smgr;

@@ -172,6 +179,9 @@ static const f_smgr smgrsw[] = {
.smgr_copy_relation_metadata = NULL,
.smgr_sync_relation_metadata = NULL,
.smgr_unlink_relation_metadata = NULL,
+ .smgr_createdb_allows_wal_log = NULL,
+ .smgr_checkpoint_database_tablespaces = NULL,
+ .smgr_invalidate_database_tablespaces = NULL,
.smgr_fd = mdfd,
},
#ifdef USE_UMBRA
@@ -201,6 +211,9 @@ static const f_smgr smgrsw[] = {
.smgr_copy_relation_metadata = umcopyrelationmetadata,
.smgr_sync_relation_metadata = umsyncrelationmetadata,
.smgr_unlink_relation_metadata = umunlinkrelationmetadata,
+ .smgr_createdb_allows_wal_log = umcreatedballowswallog,
+ .smgr_checkpoint_database_tablespaces = umcheckpointdatabasetablespaces,
+ .smgr_invalidate_database_tablespaces = uminvalidatedatabasetablespaces,
.smgr_fd = umfd,
},
#endif
@@ -569,8 +582,43 @@ smgrsyncrelationmetadata(SMgrRelation reln)
void
smgrunlinkrelationmetadata(RelFileLocatorBackend rlocator, bool isRedo)
{
- if (smgrsw[0].smgr_unlink_relation_metadata)
- smgrsw[0].smgr_unlink_relation_metadata(rlocator, isRedo);
+ /* Dispatch to the default storage manager's callback, if it has one. */
+ if (smgrsw[SMGR_DEFAULT].smgr_unlink_relation_metadata)
+ smgrsw[SMGR_DEFAULT].smgr_unlink_relation_metadata(rlocator, isRedo);
+}
+
+/*
+ * smgrcreatedballowswallog() -- return the default storage manager's
+ * smgr_createdb_allows_wal_log answer.
+ *
+ * Storage managers that do not provide the callback are treated as allowing
+ * it (returns true).
+ */
+bool
+smgrcreatedballowswallog(void)
+{
+ if (smgrsw[SMGR_DEFAULT].smgr_createdb_allows_wal_log)
+ return smgrsw[SMGR_DEFAULT].smgr_createdb_allows_wal_log();
+
+ return true;
+}
+
+/*
+ * smgrcheckpointdatabasetablespaces() -- forward a per-database checkpoint
+ * request, with its list of tablespace OIDs, to the default storage manager.
+ *
+ * Does nothing when the storage manager provides no such callback.
+ */
+void
+smgrcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids)
+{
+ if (smgrsw[SMGR_DEFAULT].smgr_checkpoint_database_tablespaces)
+ smgrsw[SMGR_DEFAULT].smgr_checkpoint_database_tablespaces(dbid,
+ ntablespaces,
+ tablespace_ids);
+}
+
+/*
+ * smgrinvalidatedatabasetablespaces() -- forward a per-database invalidation
+ * request, with its list of tablespace OIDs, to the default storage manager.
+ *
+ * Does nothing when the storage manager provides no such callback.
+ */
+void
+smgrinvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids)
+{
+ if (smgrsw[SMGR_DEFAULT].smgr_invalidate_database_tablespaces)
+ smgrsw[SMGR_DEFAULT].smgr_invalidate_database_tablespaces(dbid,
+ ntablespaces,
+ tablespace_ids);
+}
+
+/*
+ * smgrinvalidatedatabase() -- invalidate storage-manager state for a whole
+ * database by passing an empty tablespace list (0, NULL).
+ *
+ * NOTE(review): presumably the callee reads an empty list as "all
+ * tablespaces" -- confirm against MapInvalidateDatabaseTablespaces().
+ */
+void
+smgrinvalidatedatabase(Oid dbid)
+{
+ smgrinvalidatedatabasetablespaces(dbid, 0, NULL);
}
/*
* smgrdosyncall() -- Immediately sync all forks of all given relations
diff --git a/src/backend/storage/smgr/umbra.c b/src/backend/storage/smgr/umbra.c
index fc6e480276..bbb870ab8e 100644
--- a/src/backend/storage/smgr/umbra.c
+++ b/src/backend/storage/smgr/umbra.c
@@ -3,10 +3,9 @@
* umbra.c
* Umbra storage manager skeleton.
*
- * This file establishes Umbra as a separate smgr implementation from md.c.
- * maintains identity mapping state (logical block number == physical block
- * number) in the relation-local metadata file while using md.c for data-fork
- * I/O and umfile for metadata-file I/O.
+ * This file establishes Umbra as a separate smgr implementation from md.c. It
+ * maintains relation-local metadata and MAP checkpoint/cache state while using
+ * md.c for data-fork I/O and umfile for metadata-file I/O.
*
* src/backend/storage/smgr/umbra.c
*
@@ -14,13 +13,17 @@
*/
#include "postgres.h"

+#include "access/xlogutils.h"
#include "catalog/pg_class.h"
+#include "common/relpath.h"
+#include "storage/bufmgr.h"
+#include "storage/map.h"
#include "storage/md.h"
-#include "storage/mapsuper.h"
#include "storage/smgr.h"
#include "storage/umfile.h"
#include "storage/umbra.h"
#include "utils/memutils.h"
+#include "utils/wait_event.h"

typedef struct UmbraSmgrRelationState
{
@@ -29,9 +32,11 @@ typedef struct UmbraSmgrRelationState

static bool um_tracks_identity_metadata(ForkNumber forknum);
static UmbraFileContext *um_relation_filectx(SMgrRelation reln);
+static void um_ensure_redo_metadata(SMgrRelation reln, ForkNumber forknum);
static void um_identity_update_metadata(SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks, bool fork_exists,
- bool skipFsync);
+ BlockNumber nblocks, bool fork_exists);
+static void um_refresh_identity_metadata(SMgrRelation reln);
+static void um_filetag_path(const FileTag *ftag, char *path);

bool
UmMetadataExists(SMgrRelation reln)
@@ -72,11 +77,30 @@ void
UmMetadataWrite(SMgrRelation reln, BlockNumber blkno, const void *buffer,
bool skipFsync)
{
- const void *buffers[1];
+ UmbraFileContext *ctx = um_relation_filectx(reln);

- buffers[0] = buffer;
- umfile_writev(um_relation_filectx(reln), UMBRA_METADATA_FORKNUM, blkno,
- buffers, 1, skipFsync);
+ umfile_ctx_write(ctx, UMBRA_METADATA_FORKNUM, blkno,
+ buffer, BLCKSZ, skipFsync);
+ umfile_ctx_register_dirty(ctx, UMBRA_METADATA_FORKNUM, blkno,
+ skipFsync,
+ RelFileLocatorBackendIsTemp(reln->smgr_rlocator));
+}
+
+/*
+ * Write one MAP_SUPERBLOCK_SIZE-byte superblock image to block
+ * MAP_BLOCK_SUPER of the relation's metadata fork and register that block as
+ * dirty with umfile.  Works from a RelFileLocatorBackend and a umfile
+ * context, without going through an SMgrRelation.
+ *
+ * NOTE(review): the context obtained from umfile_ctx_acquire() is not
+ * released here -- confirm that acquire does not need a matching release.
+ */
+void
+UmMetadataWriteSuperblock(RelFileLocatorBackend rlocator, const void *sector,
+ bool skipFsync)
+{
+ UmbraFileContext *ctx = umfile_ctx_acquire(rlocator);
+
+ /*
+ * Superblock checkpoint flush can run while holding MapSuperEntry->lock,
+ * so it must not recurse through smgr/umopen.
+ */
+ umfile_ctx_write(ctx, UMBRA_METADATA_FORKNUM, MAP_BLOCK_SUPER,
+ sector, MAP_SUPERBLOCK_SIZE, skipFsync);
+ umfile_ctx_register_dirty(ctx, UMBRA_METADATA_FORKNUM, MAP_BLOCK_SUPER,
+ skipFsync,
+ RelFileLocatorBackendIsTemp(rlocator));
}

void
@@ -90,6 +114,7 @@ UmMetadataExtend(SMgrRelation reln, BlockNumber blkno, const void *buffer,
void
UmMetadataImmediateSync(SMgrRelation reln)
{
+ /* Checkpoint the relation's MAP state first, then sync the metadata fork. */
+ MapCheckpointRelation(reln->smgr_rlocator.locator);
umfile_immedsync(um_relation_filectx(reln), UMBRA_METADATA_FORKNUM);
}

@@ -99,10 +124,32 @@ UmMetadataUnlink(RelFileLocatorBackend rlocator, bool isRedo)
umfile_unlink(rlocator, UMBRA_METADATA_FORKNUM, isRedo);
}

+/*
+ * Drop cached MAP state for a whole database and queue a sync-filter request
+ * covering it.
+ *
+ * The filter tag carries only the database OID: tablespace and relation
+ * number are zero and fork/segment are invalid, which is the shape
+ * umfiletagmatches() compares by database OID alone.
+ */
+void
+UmInvalidateDatabase(Oid dbid)
+{
+	FileTag		filter_tag;
+
+	MapInvalidateDatabase(dbid);
+
+	memset(&filter_tag, 0, sizeof(filter_tag));
+	filter_tag.handler = SYNC_HANDLER_UMBRA;
+	filter_tag.rlocator.spcOid = 0;
+	filter_tag.rlocator.dbOid = dbid;
+	filter_tag.rlocator.relNumber = 0;
+	filter_tag.forknum = InvalidForkNumber;
+	filter_tag.segno = InvalidBlockNumber;
+
+	RegisterSyncRequest(&filter_tag, SYNC_FILTER_REQUEST, true);
+}
+
+/* Per-backend startup for Umbra: set up umfile, then MAP backend state. */
void
uminit(void)
{
umfile_init();
+ MapBackendInit();
}

void
@@ -131,10 +178,9 @@ umdestroy(SMgrRelation reln)
{
UmbraSmgrRelationState *state = reln->smgr_private;

- umfile_ctx_release(reln->smgr_rlocator);
-
if (state != NULL)
{
+ umfile_ctx_forget(reln->smgr_rlocator);
pfree(state);
reln->smgr_private = NULL;
}
@@ -146,18 +192,56 @@ umisinternalfork(ForkNumber forknum)
return forknum == UMBRA_METADATA_FORKNUM;
}

+/* Umbra's smgr_createdb_allows_wal_log callback: always answers false. */
+bool
+umcreatedballowswallog(void)
+{
+ return false;
+}
+
+/*
+ * Umbra's smgr_checkpoint_database_tablespaces callback: hands the request
+ * straight to MapCheckpointDatabaseTablespaces().
+ */
+void
+umcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids)
+{
+ MapCheckpointDatabaseTablespaces(dbid, ntablespaces, tablespace_ids);
+}
+
+/*
+ * Umbra's smgr_invalidate_database_tablespaces callback: hands the request
+ * straight to MapInvalidateDatabaseTablespaces().
+ */
+void
+uminvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids)
+{
+ MapInvalidateDatabaseTablespaces(dbid, ntablespaces, tablespace_ids);
+}
+
+/*
+ * Create (or open, if it already exists) the relation's metadata fork and
+ * bring its MAP superblock into the shared cache.
+ *
+ * A newly created fork gets a fresh superblock via MapSBlockInit(); an
+ * existing one is loaded via MapSBlockEnsureLoaded(), whose result is
+ * ignored.  Either way the per-fork identity metadata is refreshed
+ * afterwards.  Raises ERROR if the fork can be neither opened nor created.
+ */
void
umcreaterelationmetadata(SMgrRelation reln)
{
+ UmbraFileContext *ctx = um_relation_filectx(reln);
bool created = false;

- if (!UmMetadataOpenOrCreate(reln, false, &created))
+ /*
+ * smgrcreaterelationmetadata() is used both in normal create and redo
+ * paths, so tolerate an already-existing metadata fork here.
+ */
+ if (!UmMetadataOpenOrCreate(reln, true, &created))
ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not create Umbra metadata fork for relation %u/%u/%u",
reln->smgr_rlocator.locator.spcOid,
reln->smgr_rlocator.locator.dbOid,
reln->smgr_rlocator.locator.relNumber)));
+
+ elog(DEBUG1, "umbra metadata open/create %u/%u/%u created=%s",
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ created ? "true" : "false");
+
+ /* Only a fork we just created needs a brand-new superblock. */
+ if (created)
+ MapSBlockInit(ctx, reln->smgr_rlocator.locator, InvalidXLogRecPtr);
+ else
+ (void) MapSBlockEnsureLoaded(ctx, reln->smgr_rlocator.locator);
+
+ um_refresh_identity_metadata(reln);
}

void
@@ -166,7 +250,6 @@ umcopyrelationmetadata(SMgrRelation src, SMgrRelation dst, char relpersistence)
BlockNumber src_nblocks;
BlockNumber dst_nblocks;
PGIOAlignedBlock pagebuf;
- bool created = false;

if (relpersistence != RELPERSISTENCE_PERMANENT)
return;
@@ -174,13 +257,7 @@ umcopyrelationmetadata(SMgrRelation src, SMgrRelation dst, char relpersistence)
if (!UmMetadataExists(src))
return;

- if (!UmMetadataOpenOrCreate(dst, false, &created))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not create Umbra metadata fork for relation %u/%u/%u",
- dst->smgr_rlocator.locator.spcOid,
- dst->smgr_rlocator.locator.dbOid,
- dst->smgr_rlocator.locator.relNumber)));
+ umcreaterelationmetadata(dst);

src_nblocks = UmMetadataNblocks(src);
dst_nblocks = UmMetadataNblocks(dst);
@@ -209,7 +286,7 @@ umsyncrelationmetadata(SMgrRelation reln)
void
umunlinkrelationmetadata(RelFileLocatorBackend rlocator, bool isRedo)
{
- umfile_ctx_forget(rlocator);
+ /* Drop cached MAP state for the relation before removing its file. */
+ MapInvalidateRelation(rlocator.locator);
UmMetadataUnlink(rlocator, isRedo);
}

@@ -218,8 +295,20 @@ umcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
{
mdcreate(reln, forknum, isRedo);

- if (um_tracks_identity_metadata(forknum))
- um_identity_update_metadata(reln, forknum, 0, true, true);
+ /*
+ * Redo for permanent relation creation reaches smgrcreate() directly, so
+ * make sure the metadata fork exists before later recovery steps touch the
+ * relation again.
+ */
+ if (isRedo &&
+ forknum == MAIN_FORKNUM &&
+ !UmMetadataExists(reln))
+ umcreaterelationmetadata(reln);
+
+ if (forknum != MAIN_FORKNUM &&
+ um_tracks_identity_metadata(forknum) &&
+ UmMetadataExists(reln))
+ um_identity_update_metadata(reln, forknum, 0, true);
}

bool
@@ -234,7 +323,12 @@ umexists(SMgrRelation reln, ForkNumber forknum)
void
umunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
{
- umfile_ctx_forget(rlocator);
+ if (forknum == UMBRA_METADATA_FORKNUM ||
+ forknum == MAIN_FORKNUM ||
+ forknum == InvalidForkNumber)
+ {
+ MapInvalidateRelation(rlocator.locator);
+ }

if (forknum == UMBRA_METADATA_FORKNUM)
{
@@ -252,11 +346,11 @@ void
umextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
const void *buffer, bool skipFsync)
{
+ um_ensure_redo_metadata(reln, forknum);
mdextend(reln, forknum, blocknum, buffer, skipFsync);

- if (um_tracks_identity_metadata(forknum))
- um_identity_update_metadata(reln, forknum, blocknum + 1, true,
- skipFsync);
+ if (um_tracks_identity_metadata(forknum) && UmMetadataExists(reln))
+ um_identity_update_metadata(reln, forknum, blocknum + 1, true);
}

void
@@ -265,18 +359,19 @@ umzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
{
BlockNumber target_nblocks;

+ um_ensure_redo_metadata(reln, forknum);
mdzeroextend(reln, forknum, blocknum, nblocks, skipFsync);

- if (um_tracks_identity_metadata(forknum))
- {
- target_nblocks = blocknum + (BlockNumber) nblocks;
- if (target_nblocks < blocknum)
- ereport(ERROR,
- (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
- errmsg("Umbra identity mapping block count overflow")));
- um_identity_update_metadata(reln, forknum, target_nblocks, true,
- skipFsync);
- }
+ if (!um_tracks_identity_metadata(forknum) || !UmMetadataExists(reln))
+ return;
+
+ target_nblocks = blocknum + (BlockNumber) nblocks;
+ if (target_nblocks < blocknum)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("Umbra identity mapping block count overflow")));
+
+ um_identity_update_metadata(reln, forknum, target_nblocks, true);
}

bool
@@ -296,6 +391,7 @@ void
umreadv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
void **buffers, BlockNumber nblocks)
{
+ um_ensure_redo_metadata(reln, forknum);
mdreadv(reln, forknum, blocknum, buffers, nblocks);
}

@@ -303,6 +399,7 @@ void
umstartreadv(PgAioHandle *ioh, SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, void **buffers, BlockNumber nblocks)
{
+ um_ensure_redo_metadata(reln, forknum);
mdstartreadv(ioh, reln, forknum, blocknum, buffers, nblocks);
}

@@ -310,7 +407,14 @@ void
umwritev(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
const void **buffers, BlockNumber nblocks, bool skipFsync)
{
+ um_ensure_redo_metadata(reln, forknum);
mdwritev(reln, forknum, blocknum, buffers, nblocks, skipFsync);
+
+ if (InRecovery &&
+ um_tracks_identity_metadata(forknum) &&
+ UmMetadataExists(reln))
+ um_identity_update_metadata(reln, forknum, mdnblocks(reln, forknum),
+ true);
}

void
@@ -324,9 +428,8 @@ BlockNumber
umnblocks(SMgrRelation reln, ForkNumber forknum)
{
/*
- * Keep md.c responsible for the physical fork size query. mdtruncate()
- * relies on a preceding mdnblocks() call to have opened all active
- * segments.
+ * Keep md.c responsible for physical fork size queries. mdtruncate()
+ * relies on a preceding mdnblocks() call to have opened active segments.
*/
return mdnblocks(reln, forknum);
}
@@ -337,8 +440,8 @@ umtruncate(SMgrRelation reln, ForkNumber forknum,
{
mdtruncate(reln, forknum, old_blocks, nblocks);

- if (um_tracks_identity_metadata(forknum))
- um_identity_update_metadata(reln, forknum, nblocks, true, false);
+ if (um_tracks_identity_metadata(forknum) && UmMetadataExists(reln))
+ um_identity_update_metadata(reln, forknum, nblocks, true);
}

void
@@ -362,6 +465,56 @@ umfd(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
return mdfd(reln, forknum, blocknum, off);
}

+/*
+ * Sync-handler callback: fsync the file identified by ftag.
+ *
+ * The file's path is written into *path by um_filetag_path().  Returns -1 if
+ * the file cannot be opened, otherwise FileSync()'s result; errno as left by
+ * FileSync() is preserved across the close.
+ */
+int
+umsyncfiletag(const FileTag *ftag, char *path)
+{
+ File fd;
+ int ret;
+ int save_errno;
+
+ um_filetag_path(ftag, path);
+
+ fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ if (fd < 0)
+ return -1;
+
+ ret = FileSync(fd, WAIT_EVENT_DATA_FILE_SYNC);
+ /* FileClose() may change errno; report the one from FileSync(). */
+ save_errno = errno;
+
+ FileClose(fd);
+ errno = save_errno;
+ return ret;
+}
+
+/*
+ * Sync-handler callback: unlink the file identified by ftag.  The path is
+ * written into *path by um_filetag_path(); returns unlink()'s result.
+ */
+int
+umunlinkfiletag(const FileTag *ftag, char *path)
+{
+ um_filetag_path(ftag, path);
+ return unlink(path);
+}
+
+/*
+ * Sync-handler callback: does the filter tag ftag cover candidate?
+ *
+ * The filter is progressively more specific depending on which of its
+ * fields are set:
+ *	- fork and segment invalid, tablespace and relation number zero:
+ *	  matches on database OID alone;
+ *	- fork and segment invalid: matches on the full relation locator;
+ *	- segment invalid: matches on locator and fork;
+ *	- otherwise: matches on locator, fork and segment.
+ */
+bool
+umfiletagmatches(const FileTag *ftag, const FileTag *candidate)
+{
+	bool		any_fork_and_segment;
+
+	any_fork_and_segment = (ftag->forknum == InvalidForkNumber &&
+							ftag->segno == InvalidBlockNumber);
+
+	/* Database-wide filter. */
+	if (any_fork_and_segment &&
+		ftag->rlocator.spcOid == 0 &&
+		ftag->rlocator.relNumber == 0)
+		return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
+
+	/* Every remaining filter shape requires the same relation. */
+	if (!RelFileLocatorEquals(ftag->rlocator, candidate->rlocator))
+		return false;
+	if (any_fork_and_segment)
+		return true;
+
+	if (ftag->forknum != candidate->forknum)
+		return false;
+	if (ftag->segno == InvalidBlockNumber)
+		return true;
+
+	return ftag->segno == candidate->segno;
+}
+
static UmbraFileContext *
um_relation_filectx(SMgrRelation reln)
{
@@ -384,32 +537,102 @@ um_tracks_identity_metadata(ForkNumber forknum)
forknum == VISIBILITYMAP_FORKNUM;
}

+/*
+ * During recovery, create the relation's metadata fork on demand before a
+ * data-fork operation proceeds.
+ *
+ * No-op outside recovery, for temporary relations, for forks that do not
+ * track identity metadata, and when the metadata fork already exists.
+ */
+static void
+um_ensure_redo_metadata(SMgrRelation reln, ForkNumber forknum)
+{
+ Assert(reln != NULL);
+
+ if (!InRecovery ||
+ RelFileLocatorBackendIsTemp(reln->smgr_rlocator) ||
+ !um_tracks_identity_metadata(forknum) ||
+ UmMetadataExists(reln))
+ return;
+
+ /*
+ * Redo can materialize a new data fork via mdwritev()/mdextend() without a
+ * preceding smgrcreate() callback, for example during CREATE DATABASE
+ * WAL-log replay. Ensure metadata exists before MAP state is consulted or
+ * checkpointed for that relation.
+ */
+ elog(DEBUG1, "umbra redo ensure metadata %u/%u/%u fork=%d",
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ forknum);
+ umcreaterelationmetadata(reln);
+}
+
+/*
+ * um_identity_update_metadata() -- record one fork's size in the MAP
+ * superblock.
+ *
+ * A missing non-main fork is stored with logical_nblocks set to
+ * InvalidBlockNumber.  Otherwise logical_nblocks is 'nblocks', and
+ * MapSBlockBumpNextFreePhysBlock() / MapSBlockBumpPhysicalNblocks() are also
+ * called with 'nblocks'.  Every update passes InvalidXLogRecPtr as its LSN.
+ * Raises ERROR if the superblock cannot be loaded.
+ *
+ * NOTE(review): ctx is initialized from reln before Assert(reln != NULL)
+ * runs, so the assertion cannot catch a NULL reln ahead of that call.
+ */
static void
um_identity_update_metadata(SMgrRelation reln, ForkNumber forknum,
- BlockNumber nblocks, bool fork_exists,
- bool skipFsync)
+ BlockNumber nblocks, bool fork_exists)
{
- MapSuperblock super;
+ UmbraFileContext *ctx = um_relation_filectx(reln);
+ BlockNumber logical_nblocks;

Assert(reln != NULL);
Assert(um_tracks_identity_metadata(forknum));
+ Assert(UmMetadataExists(reln));

- if (!MapSBlockRead(reln, &super))
- MapSuperblockInit(&super, 0);
+ if (!MapSBlockEnsureLoaded(ctx, reln->smgr_rlocator.locator))
+ elog(ERROR, "could not load MAP superblock for relation %u/%u/%u",
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber);

+ /* Only a non-main fork can be recorded as absent. */
if (!fork_exists && forknum != MAIN_FORKNUM)
+ logical_nblocks = InvalidBlockNumber;
+ else
+ logical_nblocks = nblocks;
+
+ MapSBlockSetLogicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, logical_nblocks,
+ InvalidXLogRecPtr);
+
+ /* Exact negation of the "absent" test above. */
+ if (fork_exists || forknum == MAIN_FORKNUM)
{
- MapSuperblockSetLogicalNblocks(&super, forknum, InvalidBlockNumber);
- MapSuperblockSetNextFreePhysBlock(&super, forknum, InvalidBlockNumber);
- MapSuperblockSetPhysCapacity(&super, forknum, InvalidBlockNumber);
+ MapSBlockBumpNextFreePhysBlock(ctx, reln->smgr_rlocator.locator,
+ forknum, nblocks,
+ InvalidXLogRecPtr);
+ MapSBlockBumpPhysicalNblocks(ctx, reln->smgr_rlocator.locator,
+ forknum, nblocks,
+ InvalidXLogRecPtr);
}
- else
+}
+
+/*
+ * um_refresh_identity_metadata() -- resync superblock sizes from md storage.
+ *
+ * Walks MAIN_FORKNUM..VISIBILITYMAP_FORKNUM, skips forks for which
+ * um_tracks_identity_metadata() is false, and passes each remaining fork's
+ * mdexists()/mdnblocks() result to um_identity_update_metadata().  The
+ * relation's metadata must already exist.
+ */
+static void
+um_refresh_identity_metadata(SMgrRelation reln)
+{
+ ForkNumber forknum;
+
+ Assert(UmMetadataExists(reln));
+
+ for (forknum = MAIN_FORKNUM; forknum <= VISIBILITYMAP_FORKNUM; forknum++)
{
- MapSuperblockSetLogicalNblocks(&super, forknum, nblocks);
- MapSuperblockSetNextFreePhysBlock(&super, forknum, nblocks);
- MapSuperblockSetPhysCapacity(&super, forknum, nblocks);
+ bool fork_exists;
+ BlockNumber nblocks;
+
+ if (!um_tracks_identity_metadata(forknum))
+ continue;
+
+ /* mdnblocks() is only consulted for forks that exist. */
+ fork_exists = mdexists(reln, forknum);
+ nblocks = fork_exists ? mdnblocks(reln, forknum) : 0;
+ um_identity_update_metadata(reln, forknum, nblocks, fork_exists);
}
+}

- MapSuperblockSetLastUpdatedLSN(&super, InvalidXLogRecPtr);
- MapSBlockWrite(reln, &super, skipFsync);
+/*
+ * um_filetag_path() -- build the on-disk path named by a FileTag.
+ *
+ * 'path' must have room for MAXPGPATH bytes.  The metadata fork uses
+ * UmMetadataRelPathPerm(); every other fork uses relpathperm().  Segment 0
+ * is the bare path; other segments get a ".<segno>" suffix.
+ */
+static void
+um_filetag_path(const FileTag *ftag, char *path)
+{
+ RelPathStr base = (ftag->forknum == UMBRA_METADATA_FORKNUM) ?
+ UmMetadataRelPathPerm(ftag->rlocator) :
+ relpathperm(ftag->rlocator, ftag->forknum);
+
+ /* Non-zero segments carry a numeric suffix. */
+ if (ftag->segno != 0)
+ {
+ snprintf(path, MAXPGPATH, "%s.%llu",
+ base.str, (unsigned long long) ftag->segno);
+ return;
+ }
+
+ strlcpy(path, base.str, MAXPGPATH);
}
diff --git a/src/backend/storage/smgr/umfile.c b/src/backend/storage/smgr/umfile.c
index f8d1140840..17145405cf 100644
--- a/src/backend/storage/smgr/umfile.c
+++ b/src/backend/storage/smgr/umfile.c
@@ -1,32 +1,40 @@
/*-------------------------------------------------------------------------
*
* umfile.c
- * Umbra backend-local file/segment helpers.
+ * Umbra file/segment manager.
*
* This layer owns backend-local file contexts keyed by RelFileLocatorBackend
- * and provides physical fork/segment management beneath Umbra metadata and
- * mapping code.
- *
- * src/backend/storage/smgr/umfile.c
+ * and provides low-level physical file/segment handling for Umbra forks.
*
*-------------------------------------------------------------------------
*/
+
#include "postgres.h"

-#include <fcntl.h>
#include <unistd.h>
+#include <fcntl.h>
+#include <sys/uio.h>

#include "access/xlogutils.h"
-#include "commands/tablespace.h"
+#include "catalog/pg_tablespace_d.h"
#include "common/relpath.h"
+#include "commands/tablespace.h"
+#include "common/file_utils.h"
#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/aio.h"
#include "storage/fd.h"
-#include "storage/um_defs.h"
+#include "storage/sync.h"
#include "storage/umfile.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/wait_event.h"

+/*
+ * Like md.c, we split relation storage into segments of RELSEG_SIZE blocks.
+ */
+
/* Behavior flags for segment open helpers. */
#define UM_EXTENSION_FAIL (1 << 0)
#define UM_EXTENSION_RETURN_NULL (1 << 1)
@@ -34,6 +42,10 @@
#define UM_EXTENSION_CREATE_RECOVERY (1 << 3)
#define UM_EXTENSION_DONT_OPEN (1 << 5)

+/* local state */
+static MemoryContext UmCxt = NULL;
+static HTAB *UmCtxRegistry = NULL;
+
typedef struct UmCtxRegistryEntry
{
RelFileLocatorBackend rlocator;
@@ -49,71 +61,48 @@ typedef struct UmfdVec
+/*
+ * Per-relation file context: for each fork slot, a vector of segment
+ * descriptors and the current length of that vector.
+ */
struct UmbraFileContext
{
RelFileLocatorBackend rlocator;
+
+ /* Length of seg_fds[fork]; maintained by umfile_fdvec_resize(). */
int num_open_segs[UMBRA_FORK_SLOTS];
- UmfdVec *seg_fds[UMBRA_FORK_SLOTS];
- uint32 refcount;
+ UmfdVec *seg_fds[UMBRA_FORK_SLOTS]; /* array [0..num_open_segs) */
};

-static MemoryContext UmFileCxt = NULL;
-static HTAB *UmFileContextHash = NULL;
-
-static void umfile_ctx_registry_init(void);
-static UmbraFileContext *umfile_ctx_create(RelFileLocatorBackend rlocator);
-static void umfile_ctx_destroy(UmbraFileContext *ctx);
-static void umfile_close_open_segments(UmbraFileContext *ctx,
- ForkNumber forknum);
-static bool umfile_create(UmbraFileContext *ctx, ForkNumber forknum,
- bool isRedo);
-static int umfile_open_flags(void);
-static void umfile_fdvec_resize(UmbraFileContext *ctx, ForkNumber forknum,
- int nseg);
-static inline UmfdVec *umfile_v_get(UmbraFileContext *ctx,
- ForkNumber forknum, int segindex);
-static BlockNumber umfile_nblocks_in_seg(File vfd);
-static RelPathStr umfile_segpath(RelFileLocatorBackend rlocator,
- ForkNumber forknum, BlockNumber segno);
-static UmfdVec *umfile_openseg(UmbraFileContext *ctx,
- RelFileLocatorBackend rlocator,
- ForkNumber forknum,
- BlockNumber segno, int oflags);
-static UmfdVec *umfile_openfork(UmbraFileContext *ctx,
- RelFileLocatorBackend rlocator,
+/* Forward declarations for internal ctx+rlocator core helpers. */
+static UmfdVec *umfile_openfork(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
ForkNumber forknum, int behavior);
-static UmfdVec *umfile_getseg(UmbraFileContext *ctx,
- RelFileLocatorBackend rlocator,
+static UmfdVec *umfile_openseg(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
+ ForkNumber forknum, BlockNumber segno, int oflags);
+static UmfdVec *umfile_getseg(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
ForkNumber forknum, BlockNumber blkno,
- bool skipFsync, int behavior);
-static bool umfile_fork_has_open_segment(UmbraFileContext *ctx,
+ bool skipFsync, int behavior,
+ bool isTempRelation);
+static void umfile_register_dirty_seg(RelFileLocatorBackend rlocator,
+ bool isTempRelation,
+ ForkNumber forknum, UmfdVec *seg);
+static bool umfile_fork_allows_sparse_segments(ForkNumber forknum);
+static BlockNumber umfile_nblocks_sparse(UmbraFileContext *ctx,
+ RelFileLocatorBackend rlocator,
ForkNumber forknum);
+static BlockNumber umfile_nblocks_dense(UmbraFileContext *ctx,
+ RelFileLocatorBackend rlocator,
+ ForkNumber forknum);
+static BlockNumber umfile_nblocks_in_seg(File vfd);
+static bool umfile_collect_existing_segnos_by_path(const char *seg0path,
+ BlockNumber **segnos_out,
+ int *nsegnos_out);
+static bool umfile_any_segment_exists_by_path(const char *seg0path);
+static inline UmfdVec *umfile_v_get(UmbraFileContext *ctx, ForkNumber forknum,
+ int segindex);
+static bool umfile_fork_has_open_segment(UmbraFileContext *ctx, ForkNumber forknum);
static bool umfile_fork_has_open_segment_on_disk(UmbraFileContext *ctx,
RelFileLocatorBackend rlocator,
ForkNumber forknum);
static inline bool umfile_seg_entry_is_open(const UmfdVec *seg);
static inline void umfile_seg_entry_reset(UmfdVec *seg);
-
-void
-umfile_init(void)
-{
- HASHCTL ctl;
-
- if (UmFileContextHash != NULL)
- return;
-
- UmFileCxt = AllocSetContextCreate(TopMemoryContext,
- "UmFile",
- ALLOCSET_DEFAULT_SIZES);
- MemoryContextAllowInCriticalSection(UmFileCxt, true);
-
- memset(&ctl, 0, sizeof(ctl));
- ctl.keysize = sizeof(RelFileLocatorBackend);
- ctl.entrysize = sizeof(UmCtxRegistryEntry);
- ctl.hcxt = UmFileCxt;
-
- UmFileContextHash = hash_create("Umbra file context registry",
- 256,
- &ctl,
- HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
-}
+static void umfile_build_segpath(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber segno, char *path, size_t pathlen);
+static void umfile_ctx_registry_init(void);
+static UmbraFileContext *umfile_ctx_create(RelFileLocatorBackend rlocator);
+static void umfile_ctx_destroy_internal(UmbraFileContext *ctx);

UmbraFileContext *
umfile_ctx_lookup(RelFileLocatorBackend rlocator)
@@ -121,7 +110,7 @@ umfile_ctx_lookup(RelFileLocatorBackend rlocator)
UmCtxRegistryEntry *entry;

umfile_ctx_registry_init();
- entry = hash_search(UmFileContextHash, &rlocator, HASH_FIND, NULL);
+ entry = hash_search(UmCtxRegistry, &rlocator, HASH_FIND, NULL);
if (entry == NULL)
return NULL;

@@ -132,13 +121,12 @@ UmbraFileContext *
umfile_ctx_acquire(RelFileLocatorBackend rlocator)
{
UmCtxRegistryEntry *entry;
- bool found;
+ bool found;

umfile_ctx_registry_init();
- entry = hash_search(UmFileContextHash, &rlocator, HASH_ENTER, &found);
+ entry = hash_search(UmCtxRegistry, &rlocator, HASH_ENTER, &found);
if (!found)
entry->ctx = umfile_ctx_create(rlocator);
- entry->ctx->refcount++;

return entry->ctx;
}
@@ -150,79 +138,43 @@ umfile_ctx_create_temporary(RelFileLocatorBackend rlocator)
return umfile_ctx_create(rlocator);
}

-void
-umfile_ctx_destroy_temporary(UmbraFileContext *ctx)
-{
- if (ctx == NULL)
- return;
-
- umfile_ctx_destroy(ctx);
-}
-
-void
-umfile_ctx_release(RelFileLocatorBackend rlocator)
-{
- UmCtxRegistryEntry *entry;
- UmbraFileContext *ctx;
-
- if (UmFileContextHash == NULL)
- return;
-
- entry = hash_search(UmFileContextHash, &rlocator, HASH_FIND, NULL);
- if (entry == NULL)
- return;
-
- ctx = entry->ctx;
- Assert(ctx->refcount > 0);
- ctx->refcount--;
-
- if (ctx->refcount == 0)
- {
- umfile_ctx_destroy(ctx);
- (void) hash_search(UmFileContextHash, &rlocator, HASH_REMOVE, NULL);
- }
-}
-
+/*
+ * umfile_ctx_forget() -- drop the registered context for 'rlocator', if any.
+ *
+ * Does nothing when the registry has not been created or holds no entry for
+ * the locator.  Otherwise the context is destroyed and its registry entry
+ * removed unconditionally.
+ *
+ * NOTE(review): there is no reference count any more, so pointers handed out
+ * earlier by umfile_ctx_acquire()/umfile_ctx_lookup() for this locator
+ * presumably must not be used afterwards -- confirm against callers.
+ */
void
umfile_ctx_forget(RelFileLocatorBackend rlocator)
{
UmCtxRegistryEntry *entry;
- UmbraFileContext *ctx;

- if (UmFileContextHash == NULL)
+ if (UmCtxRegistry == NULL)
return;

- entry = hash_search(UmFileContextHash, &rlocator, HASH_FIND, NULL);
+ entry = hash_search(UmCtxRegistry, &rlocator, HASH_FIND, NULL);
if (entry == NULL)
return;

- ctx = entry->ctx;
- for (ForkNumber forknum = 0; forknum <= UMBRA_METADATA_FORKNUM; forknum++)
- umfile_close_open_segments(ctx, forknum);
-
- if (ctx->refcount == 0)
- {
- umfile_ctx_destroy(ctx);
- (void) hash_search(UmFileContextHash, &rlocator, HASH_REMOVE, NULL);
- }
+ /* Destroy first; entry->ctx is read before the entry is removed. */
+ umfile_ctx_destroy_internal(entry->ctx);
+ (void) hash_search(UmCtxRegistry, &rlocator, HASH_REMOVE, NULL);
}

+/*
+ * umfile_ctx_destroy_temporary() -- destroy a context obtained from
+ * umfile_ctx_create_temporary().
+ *
+ * A NULL ctx is accepted and ignored, as in the previous implementation, so
+ * existing callers that pass NULL on cleanup paths keep working.
+ */
void
-umfile_ctx_close_fork(UmbraFileContext *ctx, ForkNumber forknum)
+umfile_ctx_destroy_temporary(UmbraFileContext *ctx)
{
if (ctx == NULL)
return;

- umfile_close_open_segments(ctx, forknum);
+ umfile_ctx_destroy_internal(ctx);
}

+/*
+ * MAP-layer context helpers
+ *
+ * These operate directly on the ctx+rlocator core.
+ *
+ * Important: these helpers intentionally do not register fsync requests for
+ * writes/extends. The MAP layer calls umfile_ctx_register_dirty() explicitly.
+ */
+
+/*
+ * umfile_ctx_fork_exists() -- NULL-tolerant wrapper around umfile_exists().
+ *
+ * Returns false for a NULL ctx; otherwise returns umfile_exists() for the
+ * given fork and mode.
+ */
bool
umfile_ctx_fork_exists(UmbraFileContext *ctx, ForkNumber forknum,
UmFileExistsMode mode)
{
if (ctx == NULL)
return false;
-
return umfile_exists(ctx, forknum, mode);
}

@@ -234,83 +186,117 @@ umfile_ctx_get_nblocks(UmbraFileContext *ctx, ForkNumber forknum,
return umfile_nblocks(ctx, forknum, mode);
}

+/*
+ * umfile_ctx_ensure_fork() -- create the fork's file if it is not there yet.
+ *
+ * Existence is probed in sparse mode for forks that allow sparse segments
+ * and in dense mode otherwise.  Creation is always requested with
+ * isRedo = false.
+ */
+static void
+umfile_ctx_ensure_fork(UmbraFileContext *ctx, ForkNumber forknum)
+{
+ UmFileExistsMode probe_mode;
+
+ Assert(ctx != NULL);
+
+ if (umfile_fork_allows_sparse_segments(forknum))
+ probe_mode = UMFILE_EXISTS_SPARSE;
+ else
+ probe_mode = UMFILE_EXISTS_DENSE;
+
+ if (umfile_exists(ctx, forknum, probe_mode))
+ return;
+
+ umfile_create(ctx, forknum, false /* isRedo */ );
+}
+
+/*
+ * umfile_ctx_ensure_block_exists() -- make 'blkno' addressable on disk.
+ *
+ * If umfile_ctx_block_exists() says the block is missing, zero-extend by
+ * exactly one block at 'blkno' with fsync registration suppressed.
+ */
+static void
+umfile_ctx_ensure_block_exists(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blkno)
+{
+ Assert(ctx != NULL);
+
+ if (!umfile_ctx_block_exists(ctx, forknum, blkno))
+ {
+ /*
+ * Materialize just the requested block. For sparse mapped forks we
+ * do not need an authoritative current EOF here; FileZero() can
+ * create the target segment and make blkno BLCKSZ-addressable
+ * directly.
+ */
+ Assert(blkno < MaxBlockNumber);
+ umfile_zeroextend(ctx, forknum, blkno, 1, true /* skipFsync */ );
+ }
+}
+
+/*
+ * umfile_ctx_read() -- read the first 'nbytes' bytes of block 'blkno'.
+ *
+ * 'nbytes' must be in 1..BLCKSZ.  The segment is looked up with
+ * UM_EXTENSION_FAIL.  A failed read raises ERROR with the file-access error
+ * code; a short read raises ERROR with ERRCODE_DATA_CORRUPTED.
+ */
void
umfile_ctx_read(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno,
char *buffer, int nbytes)
{
- UmfdVec *seg;
- off_t offset;
- ssize_t got;
+ UmfdVec *v;
+ off_t seekpos;
+ int got;

Assert(ctx != NULL);
Assert(buffer != NULL);
Assert(nbytes > 0 && nbytes <= BLCKSZ);

- seg = umfile_getseg(ctx, ctx->rlocator, forknum, blkno,
- false,
- UM_EXTENSION_FAIL | UM_EXTENSION_CREATE_RECOVERY);
- offset = (off_t) BLCKSZ * (blkno % ((BlockNumber) RELSEG_SIZE));
- got = FileRead(seg->umfd_vfd, buffer, nbytes, offset,
+ v = umfile_getseg(ctx, ctx->rlocator, forknum, blkno,
+ false /* skipFsync */,
+ UM_EXTENSION_FAIL,
+ false /* isTempRelation */);
+ /* Byte offset of the block inside its segment file. */
+ seekpos = (off_t) BLCKSZ * (blkno % ((BlockNumber) RELSEG_SIZE));
+
+ got = FileRead(v->umfd_vfd, buffer, nbytes, seekpos,
WAIT_EVENT_DATA_FILE_READ);
- if (got < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not read block %u in file \"%s\": %m",
- blkno, FilePathName(seg->umfd_vfd))));
if (got != nbytes)
+ {
+ /* Negative result: the read itself failed and errno is set. */
+ if (got < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ FilePathName(v->umfd_vfd))));
+ /* Otherwise fewer bytes than requested were returned. */
ereport(ERROR,
(errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("could not read block %u in file \"%s\"",
- blkno, FilePathName(seg->umfd_vfd)),
- errdetail("Read only %zd of %d bytes.", got, nbytes)));
+ errmsg("could not read file \"%s\": read only %d of %d bytes at block %u",
+ FilePathName(v->umfd_vfd), got, nbytes, blkno)));
+ }
}

+/*
+ * umfile_ctx_write() -- write the first 'nbytes' bytes of block 'blkno'.
+ *
+ * 'nbytes' must be in 1..BLCKSZ.  The block is materialized first if it does
+ * not exist yet.  'skipFsync' is accepted for interface compatibility but is
+ * not used: no sync request is registered here.
+ *
+ * A failed write raises ERROR with the file-access error code (plus a hint
+ * for ENOSPC).  A short write raises ERROR with ERRCODE_DISK_FULL, because
+ * errno is not set by a short write and must not drive the error code.
+ */
void
umfile_ctx_write(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno,
const char *buffer, int nbytes, bool skipFsync)
{
- UmfdVec *seg;
- BlockNumber nblocks;
- off_t offset;
- ssize_t wrote;
+ UmfdVec *v;
+ off_t seekpos;
+ int wrote;

Assert(ctx != NULL);
Assert(buffer != NULL);
Assert(nbytes > 0 && nbytes <= BLCKSZ);

- nblocks = umfile_nblocks(ctx, forknum, UMFILE_NBLOCKS_DENSE);
- if (blkno >= nblocks)
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("cannot overwrite block %u in relation %u/%u/%u fork %d",
- blkno,
- ctx->rlocator.locator.spcOid,
- ctx->rlocator.locator.dbOid,
- ctx->rlocator.locator.relNumber,
- forknum),
- errdetail("Current fork size is %u blocks.", nblocks)));
-
- seg = umfile_getseg(ctx, ctx->rlocator, forknum, blkno,
- skipFsync,
- UM_EXTENSION_FAIL | UM_EXTENSION_CREATE_RECOVERY);
- offset = (off_t) BLCKSZ * (blkno % ((BlockNumber) RELSEG_SIZE));
- wrote = FileWrite(seg->umfd_vfd, buffer, nbytes, offset,
+ /*
+ * Ensure the target block exists at BLCKSZ granularity even if we're about
+ * to write only a sector-sized header.
+ */
+ umfile_ctx_ensure_block_exists(ctx, forknum, blkno);
+
+ v = umfile_getseg(ctx, ctx->rlocator, forknum, blkno,
+ true /* skipFsync */,
+ UM_EXTENSION_FAIL | UM_EXTENSION_CREATE,
+ false /* isTempRelation */);
+ seekpos = (off_t) BLCKSZ * (blkno % ((BlockNumber) RELSEG_SIZE));
+
+ wrote = FileWrite(v->umfd_vfd, buffer, nbytes, seekpos,
WAIT_EVENT_DATA_FILE_WRITE);
- if (wrote < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not write block %u in file \"%s\": %m",
- blkno, FilePathName(seg->umfd_vfd))));
if (wrote != nbytes)
+ {
+ if (wrote < 0 && errno == ENOSPC)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write file \"%s\": %m",
+ FilePathName(v->umfd_vfd)),
+ errhint("Check free disk space.")));
+ if (wrote < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write file \"%s\": %m",
+ FilePathName(v->umfd_vfd))));
+ /* Short write: errno is meaningless here, so report disk full. */
ereport(ERROR,
(errcode(ERRCODE_DISK_FULL),
- errmsg("could not write block %u in file \"%s\"",
- blkno, FilePathName(seg->umfd_vfd)),
- errdetail("Wrote only %zd of %d bytes.", wrote, nbytes)));
+ errmsg("could not write file \"%s\": wrote only %d of %d bytes at block %u",
+ FilePathName(v->umfd_vfd), wrote, nbytes, blkno),
+ errhint("Check free disk space.")));
+ }

/*
- * Sync policy is explicit at this layer: callers use
- * umfile_registersync()/umfile_immedsync() for durable requests.
+ * Intentionally do not register dirty here. The MAP layer does that via
+ * umfile_ctx_register_dirty() so it can control skipFsync consistently.
*/
(void) skipFsync;
}
@@ -319,828 +305,2074 @@ void
umfile_ctx_extend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno,
const char *buffer)
{
- BlockNumber nblocks;
-
Assert(ctx != NULL);
Assert(buffer != NULL);

- (void) umfile_open_or_create(ctx, forknum, false, NULL);
- nblocks = umfile_nblocks(ctx, forknum, UMFILE_NBLOCKS_DENSE);
- if (blkno != nblocks)
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("cannot extend relation %u/%u/%u fork %d at block %u",
- ctx->rlocator.locator.spcOid,
- ctx->rlocator.locator.dbOid,
- ctx->rlocator.locator.relNumber,
- forknum, blkno),
- errdetail("Expected next block %u.", nblocks)));
+ umfile_ctx_ensure_fork(ctx, forknum);

- umfile_extend(ctx, forknum, blkno, buffer, true);
+ /* Use the existing extension path but suppress fsync registration. */
+ umfile_extend(ctx, forknum, blkno, buffer, true /* skipFsync */ );
}

+/*
+ * umfile_ctx_prefetch() -- request prefetch of a single block.
+ *
+ * A NULL ctx is ignored.  The result of umfile_prefetch() is discarded.
+ */
void
-umfile_ctx_unlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
- bool isRedo)
+umfile_ctx_prefetch(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno)
{
- umfile_unlink(rlocator, forknum, isRedo);
+ if (ctx == NULL)
+ return;
+ (void) umfile_prefetch(ctx, forknum, blkno, 1);
}

+/*
+ * umfile_ctx_block_exists() -- is 'blkno' backed by a full block on disk?
+ *
+ * Returns false for a NULL ctx, when the owning segment cannot be opened, or
+ * when the segment file is shorter than the block's end.  The segment is
+ * opened and cached in ctx if it was not already.
+ *
+ * For forks that allow sparse segments, a closed slot below num_open_segs
+ * is probed on disk rather than assumed absent: reporting an existing block
+ * as missing would make umfile_ctx_ensure_block_exists() zero-extend over
+ * it.
+ */
bool
-umfile_metadata_exists(UmbraFileContext *ctx)
+umfile_ctx_block_exists(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blkno)
{
- return umfile_exists(ctx, UMBRA_METADATA_FORKNUM, UMFILE_EXISTS_DENSE);
+ UmfdVec *v = NULL;
+ BlockNumber segno;
+ BlockNumber segblocks;
+
+ if (ctx == NULL)
+ return false;
+
+ segno = blkno / ((BlockNumber) RELSEG_SIZE);
+
+ if (segno < (BlockNumber) ctx->num_open_segs[forknum])
+ v = umfile_v_get(ctx, forknum, (int) segno);
+
+ if (!umfile_seg_entry_is_open(v))
+ {
+ /*
+ * Dense forks can only materialize the next segment in order. Sparse
+ * forks may legitimately skip lower segments, and may also have
+ * closed slots below num_open_segs that still exist on disk.
+ */
+ if (!umfile_fork_allows_sparse_segments(forknum) &&
+ segno != (BlockNumber) ctx->num_open_segs[forknum])
+ return false;
+
+ /*
+ * The last argument is ORed into the open(2) flags, so pass no extra
+ * flags; UM_EXTENSION_* behavior bits do not belong here.
+ * umfile_openseg() already returns NULL when the open fails.
+ */
+ v = umfile_openseg(ctx, ctx->rlocator, forknum, segno, 0);
+ if (v == NULL)
+ return false;
+ }
+
+ segblocks = umfile_nblocks_in_seg(v->umfd_vfd);
+ return (blkno % ((BlockNumber) RELSEG_SIZE)) < segblocks;
}

+/*
+ * umfile_ctx_segment_exists() -- does the segment file exist on disk?
+ *
+ * Returns false for a NULL ctx.  Otherwise builds the segment path and tests
+ * it with access(F_OK); any access() failure, not only a missing file, is
+ * reported as "does not exist".
+ */
bool
-umfile_metadata_open_or_create(UmbraFileContext *ctx, bool isRedo, bool *created)
+umfile_ctx_segment_exists(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber segno)
{
- return umfile_open_or_create(ctx, UMBRA_METADATA_FORKNUM, isRedo, created);
-}
+ char path[MAXPGPATH];

-BlockNumber
-umfile_metadata_nblocks(UmbraFileContext *ctx)
-{
- return umfile_nblocks(ctx, UMBRA_METADATA_FORKNUM, UMFILE_NBLOCKS_DENSE);
+ if (ctx == NULL)
+ return false;
+
+ umfile_build_segpath(ctx, forknum, segno, path, sizeof(path));
+ return access(path, F_OK) == 0;
}

+/*
+ * umfile_ctx_register_dirty() -- register the segment holding 'blkno' as
+ * dirty.
+ *
+ * No-op when 'skipFsync' or 'isTempRelation' is true; in that case ctx is
+ * not examined at all.  Otherwise the segment is looked up with
+ * UM_EXTENSION_FAIL and handed to umfile_register_dirty_seg().
+ */
void
-umfile_metadata_read(UmbraFileContext *ctx, BlockNumber blkno, void *buffer)
+umfile_ctx_register_dirty(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blkno, bool skipFsync,
+ bool isTempRelation)
{
- void *buffers[1];
+ UmfdVec *v;

- buffers[0] = buffer;
- umfile_readv(ctx, UMBRA_METADATA_FORKNUM, blkno, buffers, 1);
-}
+ if (skipFsync || isTempRelation)
+ return;

-void
-umfile_metadata_write(UmbraFileContext *ctx, BlockNumber blkno, const void *buffer)
-{
- const void *buffers[1];
+ Assert(ctx != NULL);

- buffers[0] = buffer;
- umfile_writev(ctx, UMBRA_METADATA_FORKNUM, blkno, buffers, 1, false);
+ /*
+ * Ensure we can fall back to immediate fsync if the sync request queue is
+ * full, mirroring md.c behavior.
+ */
+ v = umfile_getseg(ctx, ctx->rlocator, forknum, blkno,
+ false /* skipFsync */,
+ UM_EXTENSION_FAIL,
+ isTempRelation);
+ umfile_register_dirty_seg(ctx->rlocator, isTempRelation, forknum, v);
}

+/*
+ * umfile_ctx_unlinkfork() -- thin wrapper that forwards to umfile_unlink().
+ */
void
-umfile_metadata_extend(UmbraFileContext *ctx, BlockNumber blkno, const void *buffer)
+umfile_ctx_unlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum,
+ bool isRedo)
{
- umfile_extend(ctx, UMBRA_METADATA_FORKNUM, blkno, buffer, false);
+ umfile_unlink(rlocator, forkNum, isRedo);
}

-void
-umfile_metadata_immedsync(UmbraFileContext *ctx)
+/*
+ * Build a FileTag for Umbra relation segment files. MAP fork uses Umbra-only
+ * naming and cannot safely reuse md's unlink callback.
+ */
+#define INIT_UM_FILETAG(tag, rlocator_, forknum_, segno_) \
+ do { \
+ memset(&(tag), 0, sizeof(FileTag)); \
+ (tag).handler = SYNC_HANDLER_UMBRA; \
+ (tag).rlocator = (rlocator_); \
+ (tag).forknum = (forknum_); \
+ (tag).segno = (segno_); \
+ } while (0)
+
+/*
+ * _umfd_open_flags() -- base open(2) flags for Umbra segment files.
+ *
+ * Always O_RDWR | PG_BINARY; PG_O_DIRECT is added when io_direct_flags has
+ * IO_DIRECT_DATA set.
+ */
+static inline int
+_umfd_open_flags(void)
{
- umfile_immedsync(ctx, UMBRA_METADATA_FORKNUM);
-}
+ int flags = O_RDWR | PG_BINARY;

-void
-umfile_metadata_unlink(RelFileLocatorBackend rlocator, bool isRedo)
-{
- umfile_unlink(rlocator, UMBRA_METADATA_FORKNUM, isRedo);
+ if (io_direct_flags & IO_DIRECT_DATA)
+ flags |= PG_O_DIRECT;
+
+ return flags;
}

-bool
-umfile_exists(UmbraFileContext *ctx, ForkNumber forknum, UmFileExistsMode mode)
+/*
+ * umfile_fdvec_resize() -- set the length of a fork's segment vector.
+ *
+ * nseg == 0 frees the vector.  Growing allocates in UmCxt or repallocs.
+ * Shrinking to a non-zero length keeps the existing allocation and only
+ * lowers the recorded length.  Entries added by growth are not initialized
+ * here; the caller must fill them in.
+ */
+static void
+umfile_fdvec_resize(UmbraFileContext *ctx, ForkNumber forknum, int nseg)
{
- Assert(ctx != NULL);
- (void) mode;
+ Assert(nseg >= 0);

- if (umfile_fork_has_open_segment(ctx, forknum))
+ if (nseg == 0)
{
- if (umfile_fork_has_open_segment_on_disk(ctx, ctx->rlocator, forknum))
- return true;
+ /* Only free a vector that was actually allocated. */
+ if (ctx->num_open_segs[forknum] > 0)
+ pfree(ctx->seg_fds[forknum]);
+ ctx->seg_fds[forknum] = NULL;
+ ctx->num_open_segs[forknum] = 0;
+ return;
+ }

- umfile_close_open_segments(ctx, forknum);
+ if (ctx->num_open_segs[forknum] == 0)
+ {
+ ctx->seg_fds[forknum] =
+ MemoryContextAlloc(UmCxt, sizeof(UmfdVec) * nseg);
+ }
+ else if (nseg > ctx->num_open_segs[forknum])
+ {
+ ctx->seg_fds[forknum] =
+ repalloc(ctx->seg_fds[forknum],
+ sizeof(UmfdVec) * nseg);
+ }
+ else
+ {
+ /*
+ * Don't reallocate a smaller array: keep truncate usable in critical
+ * sections (mirrors md.c behavior).
+ */
}

- return (umfile_openfork(ctx, ctx->rlocator, forknum,
- UM_EXTENSION_RETURN_NULL) != NULL);
+ ctx->num_open_segs[forknum] = nseg;
}

-bool
-umfile_open_or_create(UmbraFileContext *ctx, ForkNumber forknum,
- bool isRedo, bool *created)
+/*
+ * umfile_v_get() -- address of slot 'segindex' in a fork's segment vector.
+ *
+ * The index is only range-checked by assertions.
+ */
+static inline UmfdVec *
+umfile_v_get(UmbraFileContext *ctx, ForkNumber forknum, int segindex)
+{
+ UmfdVec *slots = ctx->seg_fds[forknum];
+
+ Assert(segindex >= 0);
+ Assert(segindex < ctx->num_open_segs[forknum]);
+
+ return slots + segindex;
+}
+
+/*
+ * umfile_nblocks_in_seg() -- number of whole blocks in an open segment file.
+ *
+ * Raises ERROR if FileSize() fails.  A trailing partial block is not
+ * counted, because the division truncates.
+ */
+static BlockNumber
+umfile_nblocks_in_seg(File vfd)
{
- UmfdVec *seg;
- bool was_created;
+ off_t len;

- Assert(ctx != NULL);
+ len = FileSize(vfd);
+ if (len < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to end of file \"%s\": %m",
+ FilePathName(vfd))));

- if (created != NULL)
- *created = false;
+ return (BlockNumber) (len / BLCKSZ);
+}

- seg = umfile_openfork(ctx, ctx->rlocator, forknum,
- UM_EXTENSION_RETURN_NULL);
- if (seg != NULL)
- return true;
+/*
+ * umfile_segpath() -- path of segment 'segno' of a fork, returned by value.
+ *
+ * The metadata fork uses UmMetadataRelPathBackend(); every other fork uses
+ * relpath().  Segment 0 is the bare path; other segments get ".<segno>".
+ *
+ * NOTE(review): the suffixed path is formatted into a RelPathStr of the same
+ * capacity as the base path, and snprintf() truncates silently.  If
+ * RelPathStr is sized only for an unsegmented relation path, a maximal-length
+ * path plus suffix would be cut short -- confirm the buffer has headroom.
+ */
+static RelPathStr
+umfile_segpath(RelFileLocatorBackend rlocator, ForkNumber forknum, BlockNumber segno)
+{
+ RelPathStr base;
+ RelPathStr fullpath;

- was_created = umfile_create(ctx, forknum, isRedo);
- if (created != NULL)
- *created = was_created;
+ if (forknum == UMBRA_METADATA_FORKNUM)
+ base = UmMetadataRelPathBackend(rlocator);
+ else
+ base = relpath(rlocator, forknum);

- return true;
+ if (segno == 0)
+ return base;
+
+ snprintf(fullpath.str, sizeof(fullpath.str), "%s.%u", base.str, segno);
+ return fullpath;
}

-BlockNumber
-umfile_nblocks(UmbraFileContext *ctx, ForkNumber forknum, UmFileNblocksMode mode)
+/*
+ * umfile_openseg() -- open one segment file and record it in ctx.
+ *
+ * 'oflags' is ORed into the open(2) flags from _umfd_open_flags(); it is not
+ * a UM_EXTENSION_* behavior mask.  Returns NULL if the open fails for any
+ * reason, leaving ctx untouched.
+ *
+ * For forks that allow sparse segments the vector is grown as needed and the
+ * newly added slots are reset; the target slot is asserted to be closed.  For
+ * other forks 'segno' is asserted to be exactly the next slot.
+ *
+ * NOTE(review): the "slot is closed" and "next slot" conditions are checked
+ * by assertions only, so a violation in a non-assert build would overwrite
+ * the slot -- confirm callers guarantee them.
+ */
+static UmfdVec *
+umfile_openseg(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
+ ForkNumber forknum, BlockNumber segno, int oflags)
{
- UmfdVec *seg;
- BlockNumber segno;
- BlockNumber nblocks;
+ UmfdVec *v;
+ RelPathStr fullpath;
+ File fd;
+ int old_nseg;
+ int i;

- Assert(ctx != NULL);
- (void) mode;
+ fullpath = umfile_segpath(rlocator, forknum, segno);

- if (umfile_openfork(ctx, ctx->rlocator, forknum,
- UM_EXTENSION_RETURN_NULL) == NULL)
- return 0;
+ fd = PathNameOpenFile(fullpath.str, _umfd_open_flags() | oflags);

- Assert(ctx->num_open_segs[forknum] > 0);
- segno = ctx->num_open_segs[forknum] - 1;
- seg = umfile_v_get(ctx, forknum, segno);
+ if (fd < 0)
+ return NULL;

- for (;;)
+ old_nseg = ctx->num_open_segs[forknum];
+ if (umfile_fork_allows_sparse_segments(forknum))
{
- nblocks = umfile_nblocks_in_seg(seg->umfd_vfd);
- if (nblocks > (BlockNumber) RELSEG_SIZE)
- elog(FATAL, "Umbra segment too big");
- if (nblocks < (BlockNumber) RELSEG_SIZE)
- return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
-
- segno++;
- seg = umfile_openseg(ctx, ctx->rlocator, forknum, segno, 0);
- if (seg == NULL)
- return segno * ((BlockNumber) RELSEG_SIZE);
+ if (segno >= (BlockNumber) old_nseg)
+ {
+ /* Grow the vector and mark every new slot as not open. */
+ umfile_fdvec_resize(ctx, forknum, segno + 1);
+ for (i = old_nseg; i < ctx->num_open_segs[forknum]; i++)
+ umfile_seg_entry_reset(umfile_v_get(ctx, forknum, i));
+ }
+ v = umfile_v_get(ctx, forknum, (int) segno);
+ Assert(!umfile_seg_entry_is_open(v));
+ }
+ else
+ {
+ /*
+ * Segments are opened in increasing order, so we must be adding a new
+ * one at the end.
+ */
+ Assert(segno == (BlockNumber) old_nseg);
+ umfile_fdvec_resize(ctx, forknum, segno + 1);
+ v = umfile_v_get(ctx, forknum, (int) segno);
}
-}

-void
-umfile_readv(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
- void **buffers, BlockNumber nblocks)
-{
- for (BlockNumber i = 0; i < nblocks; i++)
- umfile_ctx_read(ctx, forknum, blocknum + i, buffers[i], BLCKSZ);
+ v->umfd_vfd = fd;
+ v->umfd_segno = segno;
+ Assert(umfile_nblocks_in_seg(v->umfd_vfd) <= (BlockNumber) RELSEG_SIZE);
+ return v;
}

-void
-umfile_writev(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
- const void **buffers, BlockNumber nblocks, bool skipFsync)
+/*
+ * umfile_openfork() -- open segment 0 of a fork, or return the cached slot.
+ *
+ * If the fork's vector is non-empty, slot 0 is returned without touching the
+ * file system.  Otherwise the first segment is opened.  On failure NULL is
+ * returned only when 'behavior' has UM_EXTENSION_RETURN_NULL and errno
+ * satisfies FILE_POSSIBLY_DELETED(); every other failure raises ERROR.
+ *
+ * NOTE(review): for forks that allow sparse segments, umfile_openseg() can
+ * leave reset (closed) slots in the vector, so slot 0 may be returned here
+ * without being open -- confirm callers cope with that.
+ */
+static UmfdVec *
+umfile_openfork(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
+ ForkNumber forknum, int behavior)
{
- for (BlockNumber i = 0; i < nblocks; i++)
- umfile_ctx_write(ctx, forknum, blocknum + i, buffers[i], BLCKSZ,
- skipFsync);
-}
+ RelPathStr path;
+ File fd;
+ UmfdVec *v;

-void
-umfile_extend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
- const void *buffer, bool skipFsync)
-{
- UmfdVec *seg;
- off_t offset;
- ssize_t wrote;
+ /* No work if already open */
+ if (ctx->num_open_segs[forknum] > 0)
+ return umfile_v_get(ctx, forknum, 0);

- Assert(ctx != NULL);
- Assert(buffer != NULL);
+ if (forknum == UMBRA_METADATA_FORKNUM)
+ path = UmMetadataRelPathBackend(rlocator);
+ else
+ path = relpath(rlocator, forknum);
+ fd = PathNameOpenFile(path.str, _umfd_open_flags());

- seg = umfile_getseg(ctx, ctx->rlocator, forknum, blocknum,
- skipFsync,
- UM_EXTENSION_FAIL | UM_EXTENSION_CREATE);
- offset = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
- wrote = FileWrite(seg->umfd_vfd, buffer, BLCKSZ, offset,
- WAIT_EVENT_DATA_FILE_EXTEND);
- if (wrote < 0)
+ if (fd < 0)
+ {
+ /* A missing file is not an error when the caller asked for NULL. */
+ if ((behavior & UM_EXTENSION_RETURN_NULL) &&
+ FILE_POSSIBLY_DELETED(errno))
+ return NULL;
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not extend file \"%s\": %m",
- FilePathName(seg->umfd_vfd))));
- if (wrote != BLCKSZ)
- ereport(ERROR,
- (errcode(ERRCODE_DISK_FULL),
- errmsg("could not extend file \"%s\" at block %u",
- FilePathName(seg->umfd_vfd), blocknum),
- errdetail("Wrote only %zd of %d bytes.", wrote, BLCKSZ)));
+ errmsg("could not open file \"%s\": %m", path.str)));
+ }

- (void) skipFsync;
+ umfile_fdvec_resize(ctx, forknum, 1);
+ v = umfile_v_get(ctx, forknum, 0);
+ v->umfd_vfd = fd;
+ v->umfd_segno = 0;
+
+ Assert(umfile_nblocks_in_seg(v->umfd_vfd) <= (BlockNumber) RELSEG_SIZE);
+
+ return v;
}

-void
-umfile_zeroextend(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber blocknum, int nblocks, bool skipFsync)
+/*
+ * umfile_fork_allows_sparse_segments() -- true for the main, FSM and
+ * visibility-map forks; false for every other fork number.
+ */
+static bool
+umfile_fork_allows_sparse_segments(ForkNumber forknum)
{
- Assert(ctx != NULL);
- Assert(nblocks >= 0);
-
- while (nblocks > 0)
+ switch (forknum)
{
- UmfdVec *seg;
- BlockNumber nblocks_this_segment;
- off_t offset;
- int ret;
-
- seg = umfile_getseg(ctx, ctx->rlocator, forknum, blocknum,
- skipFsync,
- UM_EXTENSION_FAIL | UM_EXTENSION_CREATE);
- offset = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
- nblocks_this_segment =
- Min((BlockNumber) nblocks,
- ((BlockNumber) RELSEG_SIZE) -
- (blocknum % ((BlockNumber) RELSEG_SIZE)));
-
- ret = FileZero(seg->umfd_vfd,
- offset,
- (off_t) BLCKSZ * nblocks_this_segment,
- WAIT_EVENT_DATA_FILE_EXTEND);
- if (ret < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not zero-extend file \"%s\": %m",
- FilePathName(seg->umfd_vfd))));
-
- nblocks -= nblocks_this_segment;
- blocknum += nblocks_this_segment;
+ case MAIN_FORKNUM:
+ case FSM_FORKNUM:
+ case VISIBILITYMAP_FORKNUM:
+ return true;
+ default:
+ return false;
}
}

-void
-umfile_truncate(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber old_blocks, BlockNumber nblocks)
+static bool umfile_collect_existing_segnos_by_path(const char *seg0path,
+ BlockNumber **segnos_out,
+ int *nsegnos_out);
+
+/*
+ * umfile_sparse_fork_scan_segments() -- report the lowest and highest
+ * segment numbers that exist on disk for a sparse-capable fork.
+ *
+ * Returns false when the scan fails or finds no segment; otherwise stores
+ * the first and last collected segment numbers through the non-NULL output
+ * pointers and returns true.
+ *
+ * NOTE(review): this assumes umfile_collect_existing_segnos_by_path()
+ * returns the segment numbers in ascending order -- confirm.
+ */
+static bool
+umfile_sparse_fork_scan_segments(UmbraFileContext *ctx,
+ ForkNumber forknum,
+ BlockNumber *minsegno,
+ BlockNumber *maxsegno)
{
- int curopensegs;
+ char seg0path[MAXPGPATH];
+ BlockNumber *segnos = NULL;
+ int nsegnos = 0;

- Assert(ctx != NULL);
+ Assert(umfile_fork_allows_sparse_segments(forknum));

- if (nblocks > old_blocks)
- {
- if (InRecovery)
- return;
+ umfile_build_segpath(ctx, forknum, 0, seg0path, sizeof(seg0path));
+ if (!umfile_collect_existing_segnos_by_path(seg0path, &segnos, &nsegnos))
+ return false;
+ if (nsegnos == 0)
+ {
+ /* Do not leak an array handed back with zero entries. */
+ if (segnos != NULL)
+ pfree(segnos);
+ return false;
+ }

- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("cannot truncate relation %u/%u/%u fork %d to %u blocks: current size is only %u blocks",
- ctx->rlocator.locator.spcOid,
- ctx->rlocator.locator.dbOid,
- ctx->rlocator.locator.relNumber,
- forknum,
- nblocks,
- old_blocks)));
- }
+ if (minsegno != NULL)
+ *minsegno = segnos[0];
+ if (maxsegno != NULL)
+ *maxsegno = segnos[nsegnos - 1];
+ pfree(segnos);
+ return true;
+}

- if (nblocks == old_blocks)
- return;
+/*
+ * umfile_any_segment_exists_by_path() -- does segment 0 or any numbered
+ * segment of the given file exist?
+ *
+ * 'seg0path' is split at its last '/'; the directory is scanned for an entry
+ * equal to the base name, or equal to "<base>.<n>" where <n> parses fully as
+ * a decimal number no larger than MaxBlockNumber.  Returns false if the path
+ * has no '/' or the directory cannot be opened.
+ *
+ * NOTE(review): after AllocateDir() fails, the ENOENT branch and the
+ * fall-through both return false, so permission or resource errors are
+ * silently reported as "no segment exists" -- confirm that is intended.
+ */
+static bool
+umfile_any_segment_exists_by_path(const char *seg0path)
+{
+ char dirpath[MAXPGPATH];
+ char *slash;
+ const char *basename;
+ size_t baselen;
+ DIR *dir;
+ struct dirent *de;
+
+ Assert(seg0path != NULL);
+
+ strlcpy(dirpath, seg0path, sizeof(dirpath));
+ slash = strrchr(dirpath, '/');
+ if (slash == NULL)
+ return false;

- /*
- * Bring all dense segments into the local array first, then trim from the
- * tail. This keeps the truncate contract local to the file manager.
- */
- (void) umfile_nblocks(ctx, forknum, UMFILE_NBLOCKS_DENSE);
- curopensegs = ctx->num_open_segs[forknum];
+ /* Cut dirpath at the last slash; basename points just past it. */
+ *slash = '\0';
+ basename = slash + 1;
+ baselen = strlen(basename);

- while (curopensegs > 0)
+ dir = AllocateDir(dirpath);
+ if (dir == NULL)
{
- UmfdVec *seg;
- BlockNumber priorblocks;
+ if (errno == ENOENT)
+ return false;
+ return false;
+ }

- priorblocks = (curopensegs - 1) * ((BlockNumber) RELSEG_SIZE);
- seg = umfile_v_get(ctx, forknum, curopensegs - 1);
+ while ((de = ReadDir(dir, dirpath)) != NULL)
+ {
+ const char *name = de->d_name;

- if (priorblocks >= nblocks)
+ /* Exact match: segment 0. */
+ if (strcmp(name, basename) == 0)
{
- if (FileTruncate(seg->umfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not truncate file \"%s\": %m",
- FilePathName(seg->umfd_vfd))));
+ FreeDir(dir);
+ return true;
+ }

- if (seg != umfile_v_get(ctx, forknum, 0))
+ /* "<base>." prefix: accept only a fully numeric, in-range suffix. */
+ if (strncmp(name, basename, baselen) == 0 &&
+ name[baselen] == '.')
+ {
+ char *endptr = NULL;
+ unsigned long parsed;
+
+ errno = 0;
+ parsed = strtoul(name + baselen + 1, &endptr, 10);
+ if (errno == 0 &&
+ endptr != name + baselen + 1 &&
+ *endptr == '\0' &&
+ parsed <= MaxBlockNumber)
{
- FileClose(seg->umfd_vfd);
- umfile_fdvec_resize(ctx, forknum, curopensegs - 1);
+ FreeDir(dir);
+ return true;
}
}
- else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
- {
- BlockNumber lastsegblocks;
+ }

- lastsegblocks = nblocks - priorblocks;
- if (FileTruncate(seg->umfd_vfd,
- (off_t) lastsegblocks * BLCKSZ,
- WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not truncate file \"%s\" to %u blocks: %m",
- FilePathName(seg->umfd_vfd),
- nblocks)));
- }
- else
- break;
+ FreeDir(dir);
+ return false;
+}

- curopensegs--;
- }
+/*
+ * umfile_seg_entry_is_open() -- Does this fd-vector slot hold an open file?
+ *
+ * A NULL slot pointer and a negative umfd_vfd both count as "not open".
+ */
+static inline bool
+umfile_seg_entry_is_open(const UmfdVec *seg)
+{
+ if (seg == NULL)
+ return false;
+
+ return seg->umfd_vfd >= 0;
}

-void
-umfile_immedsync(UmbraFileContext *ctx, ForkNumber forknum)
+/*
+ * umfile_fork_has_open_segment() -- Is any tracked segment of this fork open?
+ *
+ * Walks the fork's fd vector and returns true at the first slot for which
+ * umfile_seg_entry_is_open() holds; returns false if there is none.
+ */
+static bool
+umfile_fork_has_open_segment(UmbraFileContext *ctx, ForkNumber forknum)
{
- int segno;
- int min_inactive_seg;
+ int i;

- Assert(ctx != NULL);
+ for (i = 0; i < ctx->num_open_segs[forknum]; i++)
+ {
+ if (umfile_seg_entry_is_open(umfile_v_get(ctx, forknum, i)))
+ return true;
+ }

- (void) umfile_nblocks(ctx, forknum, UMFILE_NBLOCKS_DENSE);
- min_inactive_seg = segno = ctx->num_open_segs[forknum];
+ return false;
+}

- while (umfile_openseg(ctx, ctx->rlocator, forknum, segno, 0) != NULL)
- segno++;
+/*
+ * umfile_fork_has_open_segment_on_disk() -- Check open segments against the
+ * filesystem, dropping the ones whose path is gone.
+ *
+ * For every open slot in the fork's fd vector the segment path is rebuilt
+ * from the slot's umfd_segno and probed with access(F_OK).  Slots whose path
+ * is not accessible are closed and reset.  Returns true if at least one open
+ * slot still had an accessible path.
+ */
+static bool
+umfile_fork_has_open_segment_on_disk(UmbraFileContext *ctx,
+ RelFileLocatorBackend rlocator,
+ ForkNumber forknum)
+{
+ int i;
+ bool have_live = false;

- while (segno > 0)
+ for (i = 0; i < ctx->num_open_segs[forknum]; i++)
{
- UmfdVec *seg = umfile_v_get(ctx, forknum, segno - 1);
+ UmfdVec *seg = umfile_v_get(ctx, forknum, i);
+ RelPathStr path;

- if (FileSync(seg->umfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not fsync file \"%s\": %m",
- FilePathName(seg->umfd_vfd))));
+ if (!umfile_seg_entry_is_open(seg))
+ continue;

- if (segno > min_inactive_seg)
+ path = umfile_segpath(rlocator, forknum, seg->umfd_segno);
+ if (access(path.str, F_OK) == 0)
{
- FileClose(seg->umfd_vfd);
- umfile_fdvec_resize(ctx, forknum, segno - 1);
+ have_live = true;
+ continue;
}

- segno--;
+ /* Any access() failure is treated as a stale fd: close and clear it. */
+ FileClose(seg->umfd_vfd);
+ umfile_seg_entry_reset(seg);
}
+
+ return have_live;
}

-void
-umfile_registersync(UmbraFileContext *ctx, ForkNumber forknum)
+/*
+ * umfile_seg_entry_reset() -- Mark an fd-vector slot as not open.
+ *
+ * Only overwrites the slot's fields; it does not close umfd_vfd.
+ */
+static inline void
+umfile_seg_entry_reset(UmfdVec *seg)
{
- /*
- * Registering durability at this boundary is implemented as an immediate
- * fsync.
- */
- umfile_immedsync(ctx, forknum);
+ seg->umfd_vfd = -1;
+ seg->umfd_segno = InvalidBlockNumber;
}

-void
-umfile_unlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
+/*
+ * umfile_compare_blocknumbers() -- qsort() comparator for BlockNumber values.
+ *
+ * Orders ascending; returns -1, 0 or 1.
+ */
+static int
+umfile_compare_blocknumbers(const void *a, const void *b)
{
- if (forknum == InvalidForkNumber)
+ const BlockNumber lhs = *(const BlockNumber *) a;
+ const BlockNumber rhs = *(const BlockNumber *) b;
+
+ if (lhs == rhs)
+ return 0;
+
+ return (lhs < rhs) ? -1 : 1;
+}
+
+/*
+ * umfile_collect_existing_segnos_by_path() -- List the segment numbers that
+ * exist on disk for the fork whose segment-0 path is seg0path.
+ *
+ * The directory part of seg0path is scanned for the base name (segment 0) and
+ * for "<base>.<N>" entries, where N is a plain decimal number no larger than
+ * MaxBlockNumber.  On success *segnos_out receives a sorted, de-duplicated
+ * array allocated in UmCxt (NULL when nothing was found) and *nsegnos_out its
+ * length; the caller frees the array with pfree().
+ *
+ * Returns true on success, including a missing directory (ENOENT) and an
+ * empty result.  Returns false when seg0path contains no '/' or when the
+ * directory cannot be opened for a reason other than ENOENT.
+ */
+static bool
+umfile_collect_existing_segnos_by_path(const char *seg0path,
+ BlockNumber **segnos_out,
+ int *nsegnos_out)
+{
+ char dirpath[MAXPGPATH];
+ char *slash;
+ const char *basename;
+ size_t baselen;
+ DIR *dir;
+ struct dirent *de;
+ BlockNumber *segnos = NULL;
+ int nsegnos = 0;
+ int capacity = 0;
+ int i;
+ int uniq;
+
+ Assert(seg0path != NULL);
+ Assert(segnos_out != NULL);
+ Assert(nsegnos_out != NULL);
+
+ *segnos_out = NULL;
+ *nsegnos_out = 0;
+
+ strlcpy(dirpath, seg0path, sizeof(dirpath));
+ slash = strrchr(dirpath, '/');
+ if (slash == NULL)
+ return false;
+
+ *slash = '\0';
+ basename = slash + 1;
+ baselen = strlen(basename);
+
+ dir = AllocateDir(dirpath);
+ if (dir == NULL)
{
- for (forknum = 0; forknum <= UMBRA_METADATA_FORKNUM; forknum++)
- umfile_unlink(rlocator, forknum, isRedo);
- return;
+ if (errno == ENOENT)
+ return true;
+ return false;
}

- for (BlockNumber segno = 0;; segno++)
+ while ((de = ReadDir(dir, dirpath)) != NULL)
{
- RelPathStr path;
+ const char *name = de->d_name;
+ BlockNumber segno;

- path = umfile_segpath(rlocator, forknum, segno);
- if (unlink(path.str) < 0)
+ /*
+ * strtoul() skips leading whitespace and accepts a sign, so insist
+ * that the suffix starts with a digit before parsing it.
+ */
+ if (strcmp(name, basename) == 0)
+ segno = 0;
+ else if (strncmp(name, basename, baselen) == 0 &&
+ name[baselen] == '.' &&
+ name[baselen + 1] >= '0' && name[baselen + 1] <= '9')
{
- if (FILE_POSSIBLY_DELETED(errno))
- {
- if (segno == 0 && isRedo)
- return;
- break;
- }
+ char *endptr = NULL;
+ unsigned long parsed;
+
+ errno = 0;
+ parsed = strtoul(name + baselen + 1, &endptr, 10);
+ if (errno != 0 ||
+ endptr == name + baselen + 1 ||
+ *endptr != '\0' ||
+ parsed > MaxBlockNumber)
+ continue;
+ segno = (BlockNumber) parsed;
+ }
+ else
+ continue;

- ereport(WARNING,
- (errcode_for_file_access(),
- errmsg("could not remove file \"%s\": %m", path.str)));
- break;
+ /* Grow the result array geometrically, starting at 16 entries. */
+ if (nsegnos == capacity)
+ {
+ int new_capacity = (capacity == 0) ? 16 : capacity * 2;
+
+ if (segnos == NULL)
+ segnos = (BlockNumber *) MemoryContextAlloc(UmCxt,
+ sizeof(BlockNumber) * new_capacity);
+ else
+ segnos = (BlockNumber *) repalloc(segnos,
+ sizeof(BlockNumber) * new_capacity);
+ capacity = new_capacity;
}
+ segnos[nsegnos++] = segno;
}
-}

-static void
-umfile_ctx_registry_init(void)
-{
- if (UmFileContextHash == NULL)
- umfile_init();
-
- Assert(UmFileContextHash != NULL);
-}
-
-static UmbraFileContext *
-umfile_ctx_create(RelFileLocatorBackend rlocator)
-{
- UmbraFileContext *ctx;
+ FreeDir(dir);

- Assert(UmFileCxt != NULL);
+ if (nsegnos == 0)
+ {
+ if (segnos != NULL)
+ pfree(segnos);
+ return true;
+ }

- ctx = MemoryContextAllocZero(UmFileCxt, sizeof(UmbraFileContext));
- ctx->rlocator = rlocator;
+ qsort(segnos, nsegnos, sizeof(BlockNumber), umfile_compare_blocknumbers);

- for (ForkNumber forknum = 0; forknum <= UMBRA_METADATA_FORKNUM; forknum++)
+ /* Squeeze out duplicates (e.g. "<base>.5" and "<base>.05") in place. */
+ uniq = 1;
+ for (i = 1; i < nsegnos; i++)
{
- ctx->num_open_segs[forknum] = 0;
- ctx->seg_fds[forknum] = NULL;
+ if (segnos[i] != segnos[uniq - 1])
+ segnos[uniq++] = segnos[i];
}

- return ctx;
+ *segnos_out = segnos;
+ *nsegnos_out = uniq;
+ return true;
}

+/*
+ * umfile_build_segpath() -- Build segment path in caller-provided buffer.
+ *
+ * This is a no-allocation path builder so callers can use it safely in
+ * critical sections.
+ *
+ * The base path comes from UmMetadataRelPathBackend() for the Umbra metadata
+ * fork and from relpath() for every other fork.  Segment 0 uses the base path
+ * unchanged; any other segment appends ".<segno>".  If the base path does not
+ * fit in pathlen bytes it is left truncated (and NUL-terminated by strlcpy)
+ * and no suffix is appended.
+ */
static void
-umfile_ctx_destroy(UmbraFileContext *ctx)
+umfile_build_segpath(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber segno, char *path, size_t pathlen)
{
- if (ctx == NULL)
- return;
-
- for (ForkNumber forknum = 0; forknum <= UMBRA_METADATA_FORKNUM; forknum++)
- umfile_close_open_segments(ctx, forknum);
-
- pfree(ctx);
-}
+ size_t n;
+ RelFileLocatorBackend rlocator;

-static void
-umfile_close_open_segments(UmbraFileContext *ctx, ForkNumber forknum)
-{
- int nopensegs;
+ Assert(forknum >= 0 && forknum <= UMBRA_METADATA_FORKNUM);

- Assert(ctx != NULL);
+ /* Build RelFileLocatorBackend for use with relpath */
+ rlocator.locator = ctx->rlocator.locator;
+ rlocator.backend = ctx->rlocator.backend;

- nopensegs = ctx->num_open_segs[forknum];
- while (nopensegs > 0)
+ /* Build the base path using public forks or Umbra private metadata. */
{
- UmfdVec *seg = umfile_v_get(ctx, forknum, nopensegs - 1);
+ RelPathStr relpath_str;

- if (umfile_seg_entry_is_open(seg))
- FileClose(seg->umfd_vfd);
- umfile_fdvec_resize(ctx, forknum, nopensegs - 1);
- nopensegs--;
+ if (forknum == UMBRA_METADATA_FORKNUM)
+ relpath_str = UmMetadataRelPathBackend(rlocator);
+ else
+ relpath_str = relpath(rlocator, forknum);
+ n = strlcpy(path, relpath_str.str, pathlen);
}
+
+ /*
+ * strlcpy() returns the length of the source string, so n >= pathlen
+ * means the base path was truncated.  Appending at path + n would then
+ * write past the buffer, and pathlen - n would wrap around.
+ */
+ if (segno == 0 || n >= pathlen)
+ return;
+
+ /*
+ * Segment numbers are not bounded by RELSEG_SIZE (blocks per segment);
+ * only the "no segment" sentinel is invalid here.
+ */
+ Assert(segno != InvalidBlockNumber);
+ snprintf(path + n, pathlen - n, ".%u", segno);
}

-static bool
-umfile_create(UmbraFileContext *ctx, ForkNumber forknum, bool isRedo)
+/*
+ * umfile_getseg() -- Find (and if allowed, open or create) the segment that
+ * holds block blkno of the given fork.
+ *
+ * The target segment is blkno / RELSEG_SIZE.  The steps are:
+ *
+ * 1. If the target index lies inside the fork's fd vector, return that slot
+ *    when the fork does not allow sparse segments or the slot is open.
+ * 2. With UM_EXTENSION_DONT_OPEN, return NULL instead of opening anything.
+ * 3. For forks that allow sparse segments, open the target segment directly,
+ *    adding O_CREAT for UM_EXTENSION_CREATE or, in recovery,
+ *    UM_EXTENSION_CREATE_RECOVERY.
+ * 4. Otherwise walk from the last open segment (opening the fork first if
+ *    none is open) up to the target.  In create mode a short predecessor is
+ *    padded to RELSEG_SIZE blocks through umfile_extend(); in non-create mode
+ *    a short predecessor yields NULL with errno = ENOENT under
+ *    UM_EXTENSION_RETURN_NULL, or an ERROR.
+ *
+ * A failed open returns NULL only under UM_EXTENSION_RETURN_NULL with
+ * FILE_POSSIBLY_DELETED(errno); otherwise it raises an ERROR.
+ *
+ * The isTempRelation parameter is not referenced in the body.
+ */
+static UmfdVec *
+umfile_getseg(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
+ ForkNumber forknum, BlockNumber blkno,
+ bool skipFsync, int behavior, bool isTempRelation)
{
- RelPathStr path;
- File fd;
- UmfdVec *seg;
- bool created = false;
+ UmfdVec *v;
+ BlockNumber targetseg;
+ BlockNumber nextsegno;

- Assert(ctx != NULL);
+ Assert(behavior &
+ (UM_EXTENSION_FAIL | UM_EXTENSION_CREATE | UM_EXTENSION_RETURN_NULL |
+ UM_EXTENSION_DONT_OPEN));

- if (isRedo && ctx->num_open_segs[forknum] > 0)
- return false;
+ targetseg = blkno / ((BlockNumber) RELSEG_SIZE);

- if (ctx->num_open_segs[forknum] > 0)
- umfile_close_open_segments(ctx, forknum);
+ /* if an existing and opened segment, we're done */
+ if (targetseg < (BlockNumber) ctx->num_open_segs[forknum])
+ {
+ v = umfile_v_get(ctx, forknum, (int) targetseg);
+ if (!umfile_fork_allows_sparse_segments(forknum) ||
+ umfile_seg_entry_is_open(v))
+ return v;
+ }

- TablespaceCreateDbspace(ctx->rlocator.locator.spcOid,
- ctx->rlocator.locator.dbOid,
- isRedo);
+ /* The caller only wants the segment if we already had it open. */
+ if (behavior & UM_EXTENSION_DONT_OPEN)
+ return NULL;

- path = umfile_segpath(ctx->rlocator, forknum, 0);
- fd = PathNameOpenFile(path.str, umfile_open_flags() | O_CREAT | O_EXCL);
- if (fd < 0)
+ /*
+ * Mapped data forks can use sparse physical segment numbering. Open/create
+ * the target segment directly without checking continuity of previous
+ * segments.
+ */
+ if (umfile_fork_allows_sparse_segments(forknum))
{
- int save_errno = errno;
+ int flags = 0;

- if (isRedo)
- fd = PathNameOpenFile(path.str, umfile_open_flags());
- if (fd < 0)
+ if ((behavior & UM_EXTENSION_CREATE) ||
+ (InRecovery && (behavior & UM_EXTENSION_CREATE_RECOVERY)))
+ flags = O_CREAT;
+
+ v = umfile_openseg(ctx, rlocator, forknum, targetseg, flags);
+ if (v == NULL)
{
- errno = save_errno;
+ if ((behavior & UM_EXTENSION_RETURN_NULL) &&
+ FILE_POSSIBLY_DELETED(errno))
+ return NULL;
ereport(ERROR,
(errcode_for_file_access(),
- errmsg("could not create file \"%s\": %m", path.str)));
+ errmsg("could not open file \"%s\" (target block %u): %m",
+ umfile_segpath(rlocator, forknum, targetseg).str,
+ blkno)));
}
+ return v;
}
+
+ /*
+ * The target segment is not yet open. Iterate over all the segments between
+ * the last opened and the target segment.
+ */
+ if (ctx->num_open_segs[forknum] > 0)
+ v = umfile_v_get(ctx, forknum, ctx->num_open_segs[forknum] - 1);
else
- created = true;
+ {
+ v = umfile_openfork(ctx, rlocator, forknum, behavior);
+ if (!v)
+ return NULL;
+ }

- umfile_fdvec_resize(ctx, forknum, 1);
- seg = umfile_v_get(ctx, forknum, 0);
- seg->umfd_vfd = fd;
- seg->umfd_segno = 0;
+ for (nextsegno = ctx->num_open_segs[forknum];
+ nextsegno <= targetseg;
+ nextsegno++)
+ {
+ BlockNumber nblocks = umfile_nblocks_in_seg(v->umfd_vfd);
+ int flags = 0;

- return created;
-}
+ Assert(nextsegno == v->umfd_segno + 1);

-static int
-umfile_open_flags(void)
-{
- int flags = O_RDWR | PG_BINARY;
+ if (nblocks > ((BlockNumber) RELSEG_SIZE))
+ elog(FATAL, "segment too big");

- if (io_direct_flags & IO_DIRECT_DATA)
- flags |= PG_O_DIRECT;
+ if ((behavior & UM_EXTENSION_CREATE) ||
+ (InRecovery && (behavior & UM_EXTENSION_CREATE_RECOVERY)))
+ {
+ /*
+ * Maintain the invariant that segments before the last active
+ * segment are exactly RELSEG_SIZE blocks. Pad with zeros if needed.
+ * This can happen e.g. in recovery or for discontiguous extension.
+ */
+ if (nblocks < ((BlockNumber) RELSEG_SIZE))
+ {
+ char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
+ MCXT_ALLOC_ZERO);

- return flags;
+ umfile_extend(ctx, forknum,
+ nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+ zerobuf, skipFsync);
+ pfree(zerobuf);
+ }
+ flags = O_CREAT;
+ }
+ else if (nblocks < ((BlockNumber) RELSEG_SIZE))
+ {
+ if (behavior & UM_EXTENSION_RETURN_NULL)
+ {
+ errno = ENOENT;
+ return NULL;
+ }
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
+ umfile_segpath(rlocator, forknum, nextsegno).str,
+ blkno, nblocks)));
+ }
+
+ v = umfile_openseg(ctx, rlocator, forknum, nextsegno, flags);
+ if (v == NULL)
+ {
+ if ((behavior & UM_EXTENSION_RETURN_NULL) &&
+ FILE_POSSIBLY_DELETED(errno))
+ return NULL;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\" (target block %u): %m",
+ umfile_segpath(rlocator, forknum, nextsegno).str,
+ blkno)));
+ }
+ }
+
+ return v;
}

+/*
+ * umfile_register_dirty_seg() -- Ask for a segment to be fsync'd later.
+ *
+ * Queues a SYNC_REQUEST for the segment's file tag.  If the request cannot be
+ * queued (RegisterSyncRequest() returns false), the segment is fsync'd right
+ * here instead, and the fsync is counted in the I/O statistics.
+ *
+ * PANICs if the locator's relNumber or spcOid is invalid.  Must not be called
+ * for temporary relations (asserted).
+ */
static void
-umfile_fdvec_resize(UmbraFileContext *ctx, ForkNumber forknum, int nseg)
+umfile_register_dirty_seg(RelFileLocatorBackend rlocator, bool isTempRelation,
+ ForkNumber forknum, UmfdVec *seg)
{
- Assert(ctx != NULL);
- Assert(nseg >= 0);
+ FileTag tag;

- if (nseg == 0)
+ if (!RelFileNumberIsValid(rlocator.locator.relNumber) ||
+ !OidIsValid(rlocator.locator.spcOid))
+ elog(PANIC,
+ "invalid Umbra relation locator in fsync registration %u/%u/%u fork=%d seg=%u",
+ rlocator.locator.spcOid,
+ rlocator.locator.dbOid,
+ rlocator.locator.relNumber,
+ forknum,
+ seg->umfd_segno);
+
+ INIT_UM_FILETAG(tag, rlocator.locator, forknum, seg->umfd_segno);
+
+ /* Temp relations should never be fsync'd */
+ Assert(!isTempRelation);
+
+ if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
+ {
+ instr_time io_start;
+
+ ereport(DEBUG1,
+ (errmsg_internal("could not forward fsync request because request queue is full")));
+
+ io_start = pgstat_prepare_io_time(track_io_timing);
+
+ /* Fall back to a synchronous fsync of this segment. */
+ if (FileSync(seg->umfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ FilePathName(seg->umfd_vfd))));
+
+ pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+ IOOP_FSYNC, io_start, 1, 0);
+ }
+}
+
+/*
+ * umfile_register_unlink_seg() -- Queue a deferred unlink of one segment.
+ *
+ * Sends a SYNC_UNLINK_REQUEST for the segment's file tag, retrying on error.
+ * Must not be used for temporary relations (asserted).
+ */
+static void
+umfile_register_unlink_seg(RelFileLocatorBackend rlocator, ForkNumber forknum,
+ BlockNumber segno)
+{
+ FileTag unlink_tag;
+
+ Assert(!RelFileLocatorBackendIsTemp(rlocator));
+ INIT_UM_FILETAG(unlink_tag, rlocator.locator, forknum, segno);
+
+ RegisterSyncRequest(&unlink_tag, SYNC_UNLINK_REQUEST,
+ true /* retryOnError */ );
+}
+
+/*
+ * umfile_register_forget_seg() -- Cancel pending sync requests for a segment.
+ *
+ * Sends a SYNC_FORGET_REQUEST for the segment's file tag, retrying on error.
+ */
+static void
+umfile_register_forget_seg(RelFileLocatorBackend rlocator, ForkNumber forknum,
+ BlockNumber segno)
+{
+ FileTag forget_tag;
+
+ INIT_UM_FILETAG(forget_tag, rlocator.locator, forknum, segno);
+
+ RegisterSyncRequest(&forget_tag, SYNC_FORGET_REQUEST,
+ true /* retryOnError */ );
+}
+
+/*
+ * umfile_register_dense_existing_segs_for_unlink() -- Queue deferred unlinks
+ * for a densely numbered run of segment files.
+ *
+ * Starting at segment 0 (seg0path itself) and continuing with
+ * "<seg0path>.<N>", registers an unlink request for each file that exists and
+ * stops at the first one that does not.
+ */
+static void
+umfile_register_dense_existing_segs_for_unlink(RelFileLocatorBackend rlocator,
+ ForkNumber forknum,
+ const char *seg0path)
+{
+ char segpath[MAXPGPATH];
+ BlockNumber segno = 0;
+
+ Assert(seg0path != NULL);
+
+ /* Segment 0 carries no numeric suffix. */
+ strlcpy(segpath, seg0path, sizeof(segpath));
+
+ while (pg_file_exists(segpath))
+ {
+ umfile_register_unlink_seg(rlocator, forknum, segno);
+
+ segno++;
+ snprintf(segpath, sizeof(segpath), "%s.%u", seg0path, segno);
+ }
+}
+
+/*
+ * umfile_init() -- Create the Umbra file memory context and the per-relation
+ * file context registry.
+ *
+ * Safe to call repeatedly: it returns immediately once the registry exists.
+ * The early-out is keyed on the registry rather than the memory context so
+ * that a call which created the context but failed in hash_create() can be
+ * completed by a later call instead of leaving the registry NULL for good.
+ */
+void
+umfile_init(void)
+{
+ HASHCTL info;
+
+ if (UmCtxRegistry != NULL)
+ return;
+
+ if (UmCxt == NULL)
+ {
+ UmCxt = AllocSetContextCreate(TopMemoryContext,
+ "UmFile",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /*
+ * smgr callbacks (including truncate during WAL replay) can run
+ * inside a critical section. Umbra's per-relation file context is
+ * used by openfork and fdvec management, so it must be permitted to
+ * allocate there.
+ *
+ * This matches the expectation in core smgr implementations that
+ * their internal contexts can allocate while in a critical section.
+ */
+ MemoryContextAllowInCriticalSection(UmCxt, true);
+ }
+
+ MemSet(&info, 0, sizeof(info));
+ info.keysize = sizeof(RelFileLocatorBackend);
+ info.entrysize = sizeof(UmCtxRegistryEntry);
+ info.hcxt = UmCxt;
+ UmCtxRegistry = hash_create("Umbra file context registry",
+ 256,
+ &info,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+}
+
+/*
+ * umfile_ctx_registry_init() -- Make sure the context registry is available.
+ *
+ * Calls umfile_init() whenever the registry itself is missing.  Testing the
+ * registry, not the memory context, keeps the guard aligned with the
+ * postcondition asserted below.
+ */
+static void
+umfile_ctx_registry_init(void)
+{
+ if (UmCtxRegistry == NULL)
+ umfile_init();
+
+ Assert(UmCtxRegistry != NULL);
+}
+
+/*
+ * umfile_ctx_create() -- Allocate a fresh file context for one relation.
+ *
+ * The context is zero-allocated in UmCxt, tagged with rlocator, and starts
+ * with no open segments for any fork up to UMBRA_METADATA_FORKNUM.
+ */
+static UmbraFileContext *
+umfile_ctx_create(RelFileLocatorBackend rlocator)
+{
+ UmbraFileContext *ctx;
+
+ ctx = MemoryContextAllocZero(UmCxt, sizeof(UmbraFileContext));
+ ctx->rlocator = rlocator;
+
+ /* Explicitly reset the per-fork fd vectors (already zeroed above). */
+ for (int forknum = 0; forknum <= UMBRA_METADATA_FORKNUM; forknum++)
{
- if (ctx->num_open_segs[forknum] > 0)
- pfree(ctx->seg_fds[forknum]);
- ctx->seg_fds[forknum] = NULL;
ctx->num_open_segs[forknum] = 0;
+ ctx->seg_fds[forknum] = NULL;
+ }
+
+ return ctx;
+}
+
+/*
+ * umfile_ctx_destroy_internal() -- Close every tracked segment and free the
+ * context.
+ *
+ * For each fork the fd vector is drained from the tail: an open slot is
+ * closed, then the vector is shrunk by one.  A NULL ctx is a no-op.
+ */
+static void
+umfile_ctx_destroy_internal(UmbraFileContext *ctx)
+{
+ int forknum;
+
+ if (ctx == NULL)
+ return;
+
+ for (forknum = 0; forknum <= UMBRA_METADATA_FORKNUM; forknum++)
+ {
+ for (;;)
+ {
+ int last = ctx->num_open_segs[forknum] - 1;
+ UmfdVec *tail;
+
+ if (last < 0)
+ break;
+
+ tail = umfile_v_get(ctx, forknum, last);
+ if (umfile_seg_entry_is_open(tail))
+ FileClose(tail->umfd_vfd);
+ umfile_fdvec_resize(ctx, forknum, last);
+ }
+ }
+
+ pfree(ctx);
+}
+
+/*
+ * umfile_create() -- Create segment 0 of a fork and track it as open.
+ *
+ * In redo, a fork that already has tracked segments is left untouched.
+ * Otherwise any tracked segments are closed first, the tablespace/database
+ * directory is created if needed, and the file is created with O_EXCL.  In
+ * redo a failed exclusive create falls back to opening the existing file.
+ * Raises an ERROR, reporting the errno of the create attempt, if no file
+ * could be obtained.
+ *
+ * For non-temporary relations the new segment is registered for fsync.
+ */
+void
+umfile_create(UmbraFileContext *ctx, ForkNumber forknum, bool isRedo)
+{
+ RelFileLocatorBackend rlocator;
+ bool isTempRelation;
+ RelPathStr path;
+ File fd;
+ UmfdVec *v;
+
+ Assert(ctx != NULL);
+ rlocator = ctx->rlocator;
+ isTempRelation = RelFileLocatorBackendIsTemp(rlocator);
+
+ if (isRedo && ctx->num_open_segs[forknum] > 0)
return;
+
+ /* Drop every tracked segment, closing the ones that are open. */
+ while (ctx->num_open_segs[forknum] > 0)
+ {
+ UmfdVec *seg = umfile_v_get(ctx, forknum,
+ ctx->num_open_segs[forknum] - 1);
+
+ if (umfile_seg_entry_is_open(seg))
+ FileClose(seg->umfd_vfd);
+ umfile_fdvec_resize(ctx, forknum, ctx->num_open_segs[forknum] - 1);
+ }
+
+ Assert(ctx->num_open_segs[forknum] == 0);
+
+ TablespaceCreateDbspace(rlocator.locator.spcOid,
+ rlocator.locator.dbOid,
+ isRedo);
+
+ if (forknum == UMBRA_METADATA_FORKNUM)
+ path = UmMetadataRelPathBackend(rlocator);
+ else
+ path = relpath(rlocator, forknum);
+
+ fd = PathNameOpenFile(path.str, _umfd_open_flags() | O_CREAT | O_EXCL);
+ if (fd < 0)
+ {
+ int save_errno = errno;
+
+ if (isRedo)
+ fd = PathNameOpenFile(path.str, _umfd_open_flags());
+ if (fd < 0)
+ {
+ /* be sure to report the error reported by create, not open */
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", path.str)));
+ }
}

- if (ctx->num_open_segs[forknum] == 0)
+ umfile_fdvec_resize(ctx, forknum, 1);
+ v = umfile_v_get(ctx, forknum, 0);
+ v->umfd_vfd = fd;
+ v->umfd_segno = 0;
+
+ if (!isTempRelation)
+ umfile_register_dirty_seg(rlocator, false, forknum, v);
+}
+
+/*
+ * umfile_ctx_close_fork() -- Close and forget every tracked segment of a fork.
+ *
+ * Drains the fork's fd vector from the tail, closing each open slot before
+ * shrinking the vector by one.  A NULL ctx or an empty vector is a no-op.
+ */
+void
+umfile_ctx_close_fork(UmbraFileContext *ctx, ForkNumber forknum)
+{
+ int nopensegs;
+
+ if (ctx == NULL)
+ return;
+
+ nopensegs = ctx->num_open_segs[forknum];
+ if (nopensegs == 0)
+ return;
+
+ while (nopensegs > 0)
{
- ctx->seg_fds[forknum] =
- MemoryContextAlloc(UmFileCxt, sizeof(UmfdVec) * nseg);
+ UmfdVec *v = umfile_v_get(ctx, forknum, nopensegs - 1);
+
+ if (umfile_seg_entry_is_open(v))
+ FileClose(v->umfd_vfd);
+ umfile_fdvec_resize(ctx, forknum, nopensegs - 1);
+ nopensegs--;
}
- else if (nseg > ctx->num_open_segs[forknum])
+}
+
+/*
+ * umfile_exists() -- Does the given fork exist?
+ *
+ * Outside recovery, an open segment whose path is still present on disk
+ * settles the question immediately.  If the fork has open segments but none
+ * of their paths exist any more, all tracked segments are dropped before
+ * probing.
+ *
+ * The probe itself tries to open the fork with UM_EXTENSION_RETURN_NULL.  In
+ * UMFILE_EXISTS_SPARSE mode a failed open is followed by a directory scan for
+ * any segment file of the fork.
+ */
+bool
+umfile_exists(UmbraFileContext *ctx, ForkNumber forknum, UmFileExistsMode mode)
+{
+ RelFileLocatorBackend rlocator;
+
+ Assert(ctx != NULL);
+ rlocator = ctx->rlocator;
+
+ if (!InRecovery &&
+ umfile_fork_has_open_segment(ctx, forknum))
{
- ctx->seg_fds[forknum] =
- repalloc(ctx->seg_fds[forknum], sizeof(UmfdVec) * nseg);
+ /*
+ * Any still-open segment whose path still exists is enough evidence
+ * that the fork exists. If all open fds are stale after an unlink or
+ * rewrite, drop them before falling back to slower on-disk probes.
+ */
+ if (umfile_fork_has_open_segment_on_disk(ctx, rlocator, forknum))
+ return true;
+
+ /* Empty the fd vector so the probes below start from scratch. */
+ while (ctx->num_open_segs[forknum] > 0)
+ {
+ UmfdVec *v = umfile_v_get(ctx, forknum, ctx->num_open_segs[forknum] - 1);
+
+ if (umfile_seg_entry_is_open(v))
+ FileClose(v->umfd_vfd);
+ umfile_fdvec_resize(ctx, forknum, ctx->num_open_segs[forknum] - 1);
+ }
}

- ctx->num_open_segs[forknum] = nseg;
+ if (mode == UMFILE_EXISTS_SPARSE)
+ {
+ /*
+ * Most sparse forks still keep seg0 around. Probe that first so the
+ * common case stays as cheap as the legacy exists() path. Only fall
+ * back to a directory scan when seg0 is absent and the fork may still
+ * exist solely via higher sparse segments.
+ */
+ if (umfile_openfork(ctx, rlocator, forknum, UM_EXTENSION_RETURN_NULL) != NULL)
+ return true;
+
+ return umfile_any_segment_exists_by_path(
+ umfile_segpath(rlocator, forknum, 0).str);
+ }
+
+ return (umfile_openfork(ctx, rlocator, forknum, UM_EXTENSION_RETURN_NULL) != NULL);
}

-static inline UmfdVec *
-umfile_v_get(UmbraFileContext *ctx, ForkNumber forknum, int segindex)
+/*
+ * umfile_open_or_create() -- Open a fork, creating it when necessary.
+ *
+ * In redo an already existing fork is reused as is.  In every other case the
+ * fork is created through umfile_create() (exclusive create on the normal
+ * path) and then reopened to confirm it is usable.
+ *
+ * Returns true when the fork is open on return.  *created, if supplied, is
+ * set to true only when the create path was taken and the fork could be
+ * opened afterwards; otherwise it is false.
+ */
+bool
+umfile_open_or_create(UmbraFileContext *ctx, ForkNumber forknum,
+ bool isRedo, bool *created)
+{
+ if (created != NULL)
+ *created = false;
+
+ /* Redo may find the file already in place; reuse it if so. */
+ if (isRedo &&
+ umfile_openfork(ctx, ctx->rlocator, forknum,
+ UM_EXTENSION_RETURN_NULL) != NULL)
+ return true;
+
+ umfile_create(ctx, forknum, isRedo);
+
+ /* Confirm that the fork can actually be opened now. */
+ if (umfile_openfork(ctx, ctx->rlocator, forknum,
+ UM_EXTENSION_RETURN_NULL) == NULL)
+ return false;
+
+ if (created != NULL)
+ *created = true;
+
+ return true;
+}
+
+/*
+ * Unlink logic mirrors mdunlink(), but uses Umbra segment tracking.
+ *
+ * With forknum == InvalidForkNumber every fork up to UMBRA_METADATA_FORKNUM
+ * is processed in turn.  For a single fork:
+ *
+ * - Outside redo, the metadata fork of a non-temporary relation is never
+ *   removed here; its existing segments are only registered for deferred
+ *   unlink.
+ * - In redo, in binary upgrade, for non-main forks and for temporary
+ *   relations, segment 0 is truncated (non-temporary only) and unlinked
+ *   immediately.
+ * - Otherwise segment 0 is truncated and its unlink is deferred via a
+ *   registered request.
+ *
+ * Additional segments are then truncated and removed one by one until the
+ * first missing file.  Failures other than ENOENT are reported as WARNINGs
+ * and never raised as errors.
+ *
+ * NOTE(review): the additional-segment loop assumes dense numbering starting
+ * at 1 -- confirm that forks with sparse segment numbers are cleaned up
+ * elsewhere.
+ */
+void
+umfile_unlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
+{
+ RelPathStr path;
+ int ret;
+ int save_errno;
+
+ if (forknum == InvalidForkNumber)
+ {
+ for (forknum = 0; forknum <= UMBRA_METADATA_FORKNUM; forknum++)
+ umfile_unlink(rlocator, forknum, isRedo);
+ return;
+ }
+
+ if (forknum == UMBRA_METADATA_FORKNUM)
+ path = UmMetadataRelPathBackend(rlocator);
+ else
+ path = relpath(rlocator, forknum);
+
+ /*
+ * Keep all MAP segments physically intact until checkpoint-time unlink, so
+ * remap-related lookup state is preserved throughout the checkpoint window.
+ */
+ if (!isRedo &&
+ !RelFileLocatorBackendIsTemp(rlocator) &&
+ forknum == UMBRA_METADATA_FORKNUM)
+ {
+ umfile_register_dense_existing_segs_for_unlink(rlocator, forknum,
+ path.str);
+ return;
+ }
+
+ if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
+ RelFileLocatorBackendIsTemp(rlocator))
+ {
+ if (!RelFileLocatorBackendIsTemp(rlocator))
+ {
+ /* Release the disk space even if other backends hold fds. */
+ ret = pg_truncate(path.str, 0);
+ if (ret < 0 && errno != ENOENT)
+ {
+ save_errno = errno;
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m", path.str)));
+ errno = save_errno;
+ }
+
+ /* Forget pending sync requests; keep errno for the test below. */
+ save_errno = errno;
+ umfile_register_forget_seg(rlocator, forknum, 0 /* first seg */ );
+ errno = save_errno;
+ }
+ else
+ ret = 0;
+
+ /* Unlink the file unless truncate already found it missing. */
+ if (ret >= 0 || errno != ENOENT)
+ {
+ ret = unlink(path.str);
+ if (ret < 0 && errno != ENOENT)
+ {
+ save_errno = errno;
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path.str)));
+ errno = save_errno;
+ }
+ }
+ }
+ else
+ {
+ ret = pg_truncate(path.str, 0);
+ if (ret < 0 && errno != ENOENT)
+ {
+ save_errno = errno;
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m", path.str)));
+ errno = save_errno;
+ }
+
+ /* Register request to unlink the first segment later. */
+ save_errno = errno;
+ umfile_register_unlink_seg(rlocator, forknum, 0 /* first seg */ );
+ errno = save_errno;
+ }
+
+ /* Remove additional segments. */
+ if (ret >= 0 || errno != ENOENT)
+ {
+ char segpath[MAXPGPATH];
+ BlockNumber segno;
+
+ for (segno = 1;; segno++)
+ {
+ snprintf(segpath, sizeof(segpath), "%s.%u", path.str, segno);
+
+ if (!RelFileLocatorBackendIsTemp(rlocator))
+ {
+ if (pg_truncate(segpath, 0) < 0)
+ {
+ /* ENOENT marks the end of the segment chain. */
+ if (errno == ENOENT)
+ break;
+
+ /*
+ * Any other failure is reported, but removal of this and
+ * later segments is still attempted rather than silently
+ * abandoned.
+ */
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m", segpath)));
+ }
+
+ /* Forget pending sync requests before unlinking. */
+ umfile_register_forget_seg(rlocator, forknum, segno);
+ }
+
+ if (unlink(segpath) < 0)
+ {
+ /* ENOENT is expected after the last segment. */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", segpath)));
+ break;
+ }
+ }
+ }
+}
+
+/*
+ * umfile_extend() -- Write one block at blocknum, extending the fork.
+ *
+ * The target segment is opened or created through umfile_getseg() with
+ * UM_EXTENSION_CREATE.  Exactly BLCKSZ bytes are written from buffer.
+ *
+ * Raises an ERROR if the write fails (reporting errno, with a disk-space hint
+ * for ENOSPC) or if fewer than BLCKSZ bytes were written (reported as
+ * disk full).  Unless skipFsync is set or the relation is temporary, the
+ * segment is registered for fsync afterwards.
+ */
+void
+umfile_extend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync)
{
+ RelFileLocatorBackend rlocator;
+ bool isTempRelation;
+ UmfdVec *v;
+ off_t seekpos;
+ int nbytes;
+
Assert(ctx != NULL);
- Assert(segindex >= 0);
- Assert(segindex < ctx->num_open_segs[forknum]);
- return &ctx->seg_fds[forknum][segindex];
+ rlocator = ctx->rlocator;
+ isTempRelation = RelFileLocatorBackendIsTemp(rlocator);
+
+ v = umfile_getseg(ctx, rlocator, forknum, blocknum, skipFsync,
+ UM_EXTENSION_FAIL | UM_EXTENSION_CREATE,
+ isTempRelation);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
+ rlocator.locator.spcOid,
+ rlocator.locator.dbOid,
+ rlocator.locator.relNumber,
+ rlocator.backend);
+
+ nbytes = FileWrite(v->umfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND);
+
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
+ rlocator.locator.spcOid,
+ rlocator.locator.dbOid,
+ rlocator.locator.relNumber,
+ rlocator.backend,
+ nbytes,
+ BLCKSZ);
+
+ if (nbytes != BLCKSZ)
+ {
+ if (nbytes < 0)
+ {
+ /*
+ * The write itself failed: always report errno via %m.  Only
+ * ENOSPC gets the disk-space hint.
+ */
+ int save_errno = errno;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not extend file \"%s\": %m",
+ FilePathName(v->umfd_vfd)),
+ (save_errno == ENOSPC) ? errhint("Check free disk space.") : 0));
+ }
+
+ /*
+ * Short write: errno is not meaningful here, so report it as disk
+ * full instead of deriving an error code from errno.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
+ FilePathName(v->umfd_vfd), nbytes, BLCKSZ, blocknum),
+ errhint("Check free disk space.")));
+ }
+
+ if (!skipFsync && !isTempRelation)
+ umfile_register_dirty_seg(rlocator, false, forknum, v);
+}
+
+/*
+ * umfile_zeroextend() -- Extend a fork by nblocks zero-filled blocks starting
+ * at blocknum.
+ *
+ * The range is processed in chunks that never cross a segment boundary and
+ * never exceed PG_IOV_MAX blocks.  Each chunk's segment is opened or created
+ * through umfile_getseg() and zero-filled with FileZero().
+ *
+ * Raises an ERROR if FileZero() fails, adding a disk-space hint only for
+ * ENOSPC.  Unless skipFsync is set or the relation is temporary, every
+ * touched segment is registered for fsync.
+ */
+void
+umfile_zeroextend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ int nblocks, bool skipFsync)
+{
+ RelFileLocatorBackend rlocator;
+ bool isTempRelation;
+
+ Assert(ctx != NULL);
+ rlocator = ctx->rlocator;
+ isTempRelation = RelFileLocatorBackendIsTemp(rlocator);
+
+ while (nblocks > 0)
+ {
+ int numblocks;
+ off_t seekpos;
+ UmfdVec *v;
+
+ /* Stay inside the current segment and within one batch. */
+ numblocks = Min(nblocks, RELSEG_SIZE - (blocknum % RELSEG_SIZE));
+ numblocks = Min(numblocks, PG_IOV_MAX);
+
+ v = umfile_getseg(ctx, rlocator, forknum, blocknum, skipFsync,
+ UM_EXTENSION_FAIL | UM_EXTENSION_CREATE,
+ isTempRelation);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ if (FileZero(v->umfd_vfd,
+ seekpos,
+ (off_t) BLCKSZ * numblocks,
+ WAIT_EVENT_DATA_FILE_EXTEND) < 0)
+ {
+ int save_errno = errno;
+
+ /*
+ * errhint() must never be handed a NULL format string, so the
+ * hint is attached conditionally instead.
+ */
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not extend file \"%s\": %m",
+ FilePathName(v->umfd_vfd)),
+ (save_errno == ENOSPC) ? errhint("Check free disk space.") : 0));
+ }
+
+ if (!skipFsync && !isTempRelation)
+ umfile_register_dirty_seg(rlocator, false, forknum, v);
+
+ nblocks -= numblocks;
+ blocknum += numblocks;
+ }
+}
+
+/*
+ * umfile_prefetch() -- Hint that a range of blocks will be read soon.
+ *
+ * Without USE_PREFETCH this does nothing and returns true.  With it, the
+ * range is walked segment by segment and a prefetch is issued for each piece.
+ * Returns false if the range runs past MaxBlockNumber, or if a segment cannot
+ * be obtained (only possible in recovery, where a missing segment is
+ * tolerated rather than raised as an error).
+ */
+bool
+umfile_prefetch(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ int nblocks)
+{
+ RelFileLocatorBackend rlocator;
+ bool isTempRelation;
+
+ Assert(ctx != NULL);
+ rlocator = ctx->rlocator;
+ isTempRelation = RelFileLocatorBackendIsTemp(rlocator);
+
+#ifdef USE_PREFETCH
+ Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
+ if ((uint64) blocknum + nblocks > (uint64) MaxBlockNumber + 1)
+ return false;
+
+ while (nblocks > 0)
+ {
+ int behavior;
+ UmfdVec *seg;
+ off_t offset;
+ int chunk;
+
+ /* A missing segment is tolerated only while in recovery. */
+ if (InRecovery)
+ behavior = UM_EXTENSION_RETURN_NULL;
+ else
+ behavior = UM_EXTENSION_FAIL;
+
+ seg = umfile_getseg(ctx, rlocator, forknum, blocknum,
+ false /* skipFsync */,
+ behavior,
+ isTempRelation);
+ if (seg == NULL)
+ return false;
+
+ offset = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(offset < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ /* Never let one request run past the end of the segment. */
+ chunk = Min(nblocks, RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
+
+ (void) FilePrefetch(seg->umfd_vfd, offset, BLCKSZ * chunk,
+ WAIT_EVENT_DATA_FILE_PREFETCH);
+
+ nblocks -= chunk;
+ blocknum += chunk;
+ }
+#endif
+ return true;
+}
+
+/*
+ * umfile_maxcombine() -- How many blocks starting at blocknum fit in the
+ * same segment?
+ *
+ * forknum is accepted for interface symmetry and is not used.
+ */
+uint32
+umfile_maxcombine(ForkNumber forknum, BlockNumber blocknum)
+{
+ BlockNumber offset_in_seg = blocknum % ((BlockNumber) RELSEG_SIZE);
+ uint32 remaining = RELSEG_SIZE - offset_in_seg;
+
+ (void) forknum;
+
+ return remaining;
+}
+
+/*
+ * umfile_buffers_to_iovec() -- Describe nblocks block buffers as an iovec
+ * array, merging buffers that are adjacent in memory.
+ *
+ * iov must have room for nblocks entries; nblocks must be at least 1.
+ * Returns the number of iovec entries actually filled in.
+ */
+static int
+umfile_buffers_to_iovec(struct iovec *iov, void **buffers, int nblocks)
+{
+ int last = 0;
+
+ Assert(nblocks >= 1);
+
+ /* If this build supports direct I/O, buffers must be I/O aligned. */
+ for (int blk = 0; blk < nblocks; ++blk)
+ {
+ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+ Assert((uintptr_t) buffers[blk] ==
+ TYPEALIGN(PG_IO_ALIGN_SIZE, buffers[blk]));
+ }
+
+ /* The first buffer always opens the first entry. */
+ iov[0].iov_base = buffers[0];
+ iov[0].iov_len = BLCKSZ;
+
+ for (int blk = 1; blk < nblocks; ++blk)
+ {
+ struct iovec *tail = &iov[last];
+ void *next = buffers[blk];
+
+ if (((char *) tail->iov_base + tail->iov_len) != next)
+ {
+ /* Not contiguous with the current entry: start a new one. */
+ last++;
+ iov[last].iov_base = next;
+ iov[last].iov_len = BLCKSZ;
+ }
+ else
+ {
+ /* Contiguous: grow the current entry by one block. */
+ tail->iov_len += BLCKSZ;
+ }
+ }
+
+ return last + 1;
+}
+
+/*
+ * umfile_readv() -- Synchronously read nblocks blocks starting at blocknum
+ * into the given buffers.
+ *
+ * The whole request must fit in one segment and in PG_IOV_MAX iovec entries;
+ * otherwise an ERROR ("read crosses segment boundary") is raised.  Short
+ * reads are retried from where they stopped until the request is complete.
+ *
+ * A read error raises an ERROR.  Hitting end of file before the request is
+ * complete raises a data-corruption ERROR, unless zero_damaged_pages is set
+ * or we are in recovery, in which case the blocks not yet transferred are
+ * zero-filled.
+ */
+void
+umfile_readv(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ void **buffers, BlockNumber nblocks)
+{
+ while (nblocks > 0)
+ {
+ struct iovec iov[PG_IOV_MAX];
+ int iovcnt;
+ off_t seekpos;
+ int nbytes;
+ UmfdVec *v;
+ BlockNumber nblocks_this_segment;
+ size_t transferred_this_segment;
+ size_t size_this_segment;
+
+ v = umfile_getseg(ctx, ctx->rlocator,
+ forknum, blocknum, false /* skipFsync */,
+ UM_EXTENSION_FAIL | UM_EXTENSION_CREATE_RECOVERY,
+ RelFileLocatorBackendIsTemp(ctx->rlocator));
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ nblocks_this_segment =
+ Min(nblocks,
+ RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
+ nblocks_this_segment = Min(nblocks_this_segment, (BlockNumber) lengthof(iov));
+
+ /*
+ * A request that cannot be served in a single pass is rejected, so
+ * this loop body completes at most once per call.
+ */
+ if (nblocks_this_segment != nblocks)
+ elog(ERROR, "read crosses segment boundary");
+
+ iovcnt = umfile_buffers_to_iovec(iov, buffers, (int) nblocks_this_segment);
+ size_this_segment = nblocks_this_segment * BLCKSZ;
+ transferred_this_segment = 0;
+
+ /*
+ * Inner loop to continue after a short read. We'll keep going until
+ * we hit EOF rather than assuming that a short read means we hit the
+ * end.
+ */
+ for (;;)
+ {
+ TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
+ ctx->rlocator.locator.spcOid,
+ ctx->rlocator.locator.dbOid,
+ ctx->rlocator.locator.relNumber,
+ ctx->rlocator.backend);
+
+ nbytes = FileReadV(v->umfd_vfd, iov, iovcnt, seekpos,
+ WAIT_EVENT_DATA_FILE_READ);
+
+ TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
+ ctx->rlocator.locator.spcOid,
+ ctx->rlocator.locator.dbOid,
+ ctx->rlocator.locator.relNumber,
+ ctx->rlocator.backend,
+ nbytes,
+ size_this_segment - transferred_this_segment);
+
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read blocks %u..%u in file \"%s\": %m",
+ blocknum,
+ blocknum + nblocks_this_segment - 1,
+ FilePathName(v->umfd_vfd))));
+
+ if (nbytes == 0)
+ {
+ /*
+ * Mirror mdreadv() behavior: in production builds we can
+ * zero-fill if zero_damaged_pages or in recovery, but this
+ * codepath is expected to be unreachable for normal reads.
+ */
+ if (zero_damaged_pages || InRecovery)
+ {
+ Assert(false); /* see md.c commentary */
+
+ /* Zero every block from the first incomplete one on. */
+ for (BlockNumber i = transferred_this_segment / BLCKSZ;
+ i < nblocks_this_segment;
+ ++i)
+ memset(buffers[i], 0, BLCKSZ);
+ break;
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read blocks %u..%u in file \"%s\": read only %zu of %zu bytes",
+ blocknum,
+ blocknum + nblocks_this_segment - 1,
+ FilePathName(v->umfd_vfd),
+ transferred_this_segment,
+ size_this_segment)));
+ }
+
+ /* One loop should usually be enough. */
+ transferred_this_segment += nbytes;
+ Assert(transferred_this_segment <= size_this_segment);
+ if (transferred_this_segment == size_this_segment)
+ break;
+
+ /* Adjust position and vectors after a short read. */
+ seekpos += nbytes;
+ iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
+ }
+
+ nblocks -= nblocks_this_segment;
+ buffers += nblocks_this_segment;
+ blocknum += nblocks_this_segment;
+ }
+
+}
+
+/*
+ * umfile_startreadv() -- begin an asynchronous vectored read.
+ *
+ * Starts a read of nblocks blocks beginning at blocknum of the given fork
+ * into buffers[], using the AIO handle ioh. The whole request has to fit
+ * inside one segment file: a request that would cross a segment boundary is
+ * rejected with an ERROR instead of being split. Failure to start the I/O
+ * is reported with ereport(ERROR).
+ */
+void
+umfile_startreadv(PgAioHandle *ioh, UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blocknum, void **buffers, BlockNumber nblocks)
+{
+ off_t seekpos;
+ UmfdVec *v;
+ BlockNumber nblocks_this_segment;
+ struct iovec *iov;
+ int iovcnt;
+ int ret;
+
+ /* Look up the segment entry for blocknum. */
+ v = umfile_getseg(ctx, ctx->rlocator,
+ forknum, blocknum, false /* skipFsync */,
+ UM_EXTENSION_FAIL | UM_EXTENSION_CREATE_RECOVERY,
+ RelFileLocatorBackendIsTemp(ctx->rlocator));
+
+ /* Byte offset of blocknum within its segment file. */
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ /* Number of requested blocks that lie before the end of this segment. */
+ nblocks_this_segment =
+ Min(nblocks, RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
+
+ if (nblocks_this_segment != nblocks)
+ elog(ERROR, "read crossing segment boundary");
+
+ /* Use the handle's iovec array; it must have room for every block. */
+ iovcnt = pgaio_io_get_iovec(ioh, &iov);
+ Assert(nblocks <= (BlockNumber) iovcnt);
+
+ iovcnt = umfile_buffers_to_iovec(iov, buffers, (int) nblocks_this_segment);
+
+ /* Flag the I/O as buffered unless direct I/O is enabled for data files. */
+ if (!(io_direct_flags & IO_DIRECT_DATA))
+ pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
+
+ ret = FileStartReadV(ioh, v->umfd_vfd, iovcnt, seekpos, WAIT_EVENT_DATA_FILE_READ);
+ if (ret != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
+ blocknum,
+ blocknum + nblocks_this_segment - 1,
+ FilePathName(v->umfd_vfd))));
}

-static BlockNumber
-umfile_nblocks_in_seg(File vfd)
-{
- pgoff_t size;
+/*
+ * umfile_startreadv_physical() -- begin an asynchronous vectored read that
+ * is addressed by physical block number.
+ *
+ * The segment file and the offset within it are derived from
+ * physical_blocknum. logical_blocknum is not used for addressing; in this
+ * function it only appears in the error message, so that a failure is
+ * reported in terms of the blocks the caller asked for.
+ *
+ * The request must fit within a single physical segment; a request that
+ * would cross a segment boundary is rejected with an ERROR. Failure to
+ * start the I/O is reported with ereport(ERROR).
+ */
+void
+umfile_startreadv_physical(PgAioHandle *ioh, UmbraFileContext *ctx,
+ ForkNumber forknum,
+ BlockNumber logical_blocknum,
+ BlockNumber physical_blocknum,
+ void **buffers, BlockNumber nblocks)
+{
+ off_t seekpos;
+ UmfdVec *v;
+ BlockNumber nblocks_this_segment;
+ struct iovec *iov;
+ int iovcnt;
+ int ret;
+
+ /*
+ * Caller is responsible for not crossing physical segment boundaries.
+ * Umbra MAP translation enforces single-block I/O via ummaxcombine().
+ */
+ Assert(nblocks >= 1);
+
+ /* Unlike umfile_startreadv(), never create a missing segment here. */
+ v = umfile_getseg(ctx, ctx->rlocator,
+ forknum, physical_blocknum, false /* skipFsync */,
+ UM_EXTENSION_FAIL,
+ RelFileLocatorBackendIsTemp(ctx->rlocator));
+
+ /* Byte offset of the physical block within its segment file. */
+ seekpos = (off_t) BLCKSZ * (physical_blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ nblocks_this_segment =
+ Min(nblocks,
+ RELSEG_SIZE - (physical_blocknum % ((BlockNumber) RELSEG_SIZE)));
+
+ if (nblocks_this_segment != nblocks)
+ elog(ERROR, "read crossing segment boundary");
+
+ /* Use the handle's iovec array; it must have room for every block. */
+ iovcnt = pgaio_io_get_iovec(ioh, &iov);
+ Assert(nblocks <= (BlockNumber) iovcnt);
+
+ iovcnt = umfile_buffers_to_iovec(iov, buffers, (int) nblocks_this_segment);
+
+ /* Flag the I/O as buffered unless direct I/O is enabled for data files. */
+ if (!(io_direct_flags & IO_DIRECT_DATA))
+ pgaio_io_set_flag(ioh, PGAIO_HF_BUFFERED);
+
+ /*
+ * The I/O is started with physical addressing (file/seekpos).
+ *
+ * NOTE(review): nothing in this function records the logical block
+ * identity on the AIO handle; presumably the caller sets the handle's
+ * target before calling us -- confirm.
+ */
+ ret = FileStartReadV(ioh, v->umfd_vfd, iovcnt, seekpos,
+ WAIT_EVENT_DATA_FILE_READ);
+ if (ret != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not start reading blocks %u..%u in file \"%s\": %m",
+ logical_blocknum,
+ logical_blocknum + nblocks_this_segment - 1,
+ FilePathName(v->umfd_vfd))));
+}
+
+/*
+ * umfile_writev() -- synchronously write nblocks blocks starting at blocknum
+ * of the given fork from buffers[].
+ *
+ * Short writes are retried until every byte has been transferred; a write
+ * error is reported with ereport(ERROR), with a disk-space hint on ENOSPC.
+ * Unless skipFsync is set or the relation is temporary, the written segment
+ * is registered as dirty afterwards.
+ *
+ * The request must fit in one segment and in one iovec array of PG_IOV_MAX
+ * entries; otherwise it is rejected with an ERROR. Consequently the outer
+ * loop body runs at most once per call.
+ */
+void
+umfile_writev(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ const void **buffers, BlockNumber nblocks, bool skipFsync)
+{
+ while (nblocks > 0)
+ {
+ struct iovec iov[PG_IOV_MAX];
+ int iovcnt;
+ off_t seekpos;
+ int nbytes;
+ UmfdVec *v;
+ BlockNumber nblocks_this_segment;
+ size_t transferred_this_segment;
+ size_t size_this_segment;
+
+ /*
+ * NOTE(review): the literal false is passed in the skipFsync position
+ * rather than this function's skipFsync argument -- confirm that is
+ * intended.
+ */
+ v = umfile_getseg(ctx, ctx->rlocator,
+ forknum, blocknum, false /* skipFsync */,
+ UM_EXTENSION_FAIL | UM_EXTENSION_CREATE_RECOVERY,
+ RelFileLocatorBackendIsTemp(ctx->rlocator));
+
+ /* Byte offset of blocknum within its segment file. */
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ /* Clamp to the end of the segment and to the size of iov[]. */
+ nblocks_this_segment =
+ Min(nblocks, RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
+ nblocks_this_segment = Min(nblocks_this_segment, (BlockNumber) lengthof(iov));
+
+ /* Fires for either clamp above, not only for a segment crossing. */
+ if (nblocks_this_segment != nblocks)
+ elog(ERROR, "write crosses segment boundary");
+
+ iovcnt = umfile_buffers_to_iovec(iov, (void **) buffers,
+ (int) nblocks_this_segment);
+
+ size_this_segment = nblocks_this_segment * BLCKSZ;
+ transferred_this_segment = 0;
+
+ /* Inner loop to continue after a short write. */
+ for (;;)
+ {
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
+ ctx->rlocator.locator.spcOid,
+ ctx->rlocator.locator.dbOid,
+ ctx->rlocator.locator.relNumber,
+ ctx->rlocator.backend);
+
+ nbytes = FileWriteV(v->umfd_vfd, iov, iovcnt, seekpos,
+ WAIT_EVENT_DATA_FILE_WRITE);
+
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
+ ctx->rlocator.locator.spcOid,
+ ctx->rlocator.locator.dbOid,
+ ctx->rlocator.locator.relNumber,
+ ctx->rlocator.backend,
+ nbytes,
+ size_this_segment - transferred_this_segment);
+
+ if (nbytes < 0)
+ {
+ /* Capture errno before anything else can overwrite it. */
+ bool enospc = errno == ENOSPC;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write blocks %u..%u in file \"%s\": %m",
+ blocknum,
+ blocknum + nblocks_this_segment - 1,
+ FilePathName(v->umfd_vfd)),
+ enospc ? errhint("Check free disk space.") : 0));
+ }
+
+ transferred_this_segment += nbytes;
+ Assert(transferred_this_segment <= size_this_segment);
+ if (transferred_this_segment == size_this_segment)
+ break;

- size = FileSize(vfd);
- if (size < 0)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not determine size of file \"%s\": %m",
- FilePathName(vfd))));
- if ((size % BLCKSZ) != 0)
- ereport(ERROR,
- (errcode(ERRCODE_DATA_CORRUPTED),
- errmsg("file \"%s\" has partial block contents",
- FilePathName(vfd)),
- errdetail("File size %lld is not a multiple of %d bytes.",
- (long long) size, BLCKSZ)));
+ /* Adjust position and vectors after a short write. */
+ seekpos += nbytes;
+ iovcnt = compute_remaining_iovec(iov, iov, iovcnt, nbytes);
+ }
+
+ /*
+ * The is-temp argument below is always false here, because the guard
+ * already excludes temporary relations.
+ */
+ if (!skipFsync && !RelFileLocatorBackendIsTemp(ctx->rlocator))
+ umfile_register_dirty_seg(ctx->rlocator,
+ RelFileLocatorBackendIsTemp(ctx->rlocator),
+ forknum, v);
+
+ nblocks -= nblocks_this_segment;
+ buffers += nblocks_this_segment;
+ blocknum += nblocks_this_segment;
+ }

- return (BlockNumber) (size / BLCKSZ);
}

-static RelPathStr
-umfile_segpath(RelFileLocatorBackend rlocator, ForkNumber forknum,
- BlockNumber segno)
+/*
+ * umfile_writeback() -- issue FileWriteback() for nblocks blocks starting at
+ * blocknum of the given fork.
+ *
+ * The range is split at segment boundaries, so a request may span several
+ * segment files; one FileWriteback() call is made per segment touched.
+ */
+void
+umfile_writeback(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ BlockNumber nblocks)
{
- RelPathStr base;
- RelPathStr fullpath;
+ UmfdVec *v;
+ off_t seekpos;

- if (forknum == UMBRA_METADATA_FORKNUM)
- base = UmMetadataRelPathBackend(rlocator);
- else
- base = relpath(rlocator, forknum);
+ while (nblocks > 0)
+ {
+ BlockNumber nflush;

- if (segno == 0)
- return base;
+ v = umfile_getseg(ctx, ctx->rlocator,
+ forknum, blocknum, false /* skipFsync */,
+ UM_EXTENSION_FAIL | UM_EXTENSION_CREATE_RECOVERY,
+ RelFileLocatorBackendIsTemp(ctx->rlocator));
+ /* Byte offset of blocknum within its segment file. */
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

- snprintf(fullpath.str, sizeof(fullpath.str), "%s.%u", base.str, segno);
- return fullpath;
+ /* Blocks of the remaining range that fall inside this segment. */
+ nflush = Min(nblocks, (BlockNumber) RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)));
+ Assert(nflush >= 1);
+ Assert(nflush <= nblocks);
+
+ FileWriteback(v->umfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
+
+ nblocks -= nflush;
+ blocknum += nflush;
+ }
}

-static UmfdVec *
-umfile_openseg(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
- ForkNumber forknum, BlockNumber segno, int oflags)
+/*
+ * umfile_nblocks_sparse() -- size in blocks of a fork that allows sparse
+ * segments.
+ *
+ * The result is (highest existing segment number * RELSEG_SIZE) plus the
+ * number of blocks in that segment; lower segments are not inspected.
+ * Returns 0 when umfile_sparse_fork_scan_segments() reports no segments.
+ * Raises ERROR if the highest segment cannot be opened, and FATAL if it
+ * holds more than RELSEG_SIZE blocks.
+ */
+static BlockNumber
+umfile_nblocks_sparse(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
+ ForkNumber forknum)
{
- UmfdVec *seg;
- RelPathStr path;
- File fd;
- int old_nseg;
-
- Assert(ctx != NULL);
+ UmfdVec *v;
+ BlockNumber nblocks;
+ BlockNumber minsegno;
+ BlockNumber maxsegno;

- old_nseg = ctx->num_open_segs[forknum];
- if (segno < (BlockNumber) old_nseg)
- {
- seg = umfile_v_get(ctx, forknum, (int) segno);
- if (umfile_seg_entry_is_open(seg))
- return seg;
- }
+ Assert(umfile_fork_allows_sparse_segments(forknum));

- path = umfile_segpath(rlocator, forknum, segno);
- fd = PathNameOpenFile(path.str, umfile_open_flags() | oflags);
- if (fd < 0)
- return NULL;
+ /* minsegno is filled in by the scan but not used here. */
+ if (!umfile_sparse_fork_scan_segments(ctx, forknum, &minsegno, &maxsegno))
+ return 0;

- if (segno >= (BlockNumber) old_nseg)
+ /* Open the highest segment unless this context already has it open. */
+ if (maxsegno >= (BlockNumber) ctx->num_open_segs[forknum] ||
+ !umfile_seg_entry_is_open(umfile_v_get(ctx, forknum, (int) maxsegno)))
{
- umfile_fdvec_resize(ctx, forknum, segno + 1);
- for (int i = old_nseg; i < ctx->num_open_segs[forknum]; i++)
- umfile_seg_entry_reset(umfile_v_get(ctx, forknum, i));
+ v = umfile_openseg(ctx, rlocator, forknum, maxsegno, 0);
+ if (v == NULL)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ umfile_segpath(rlocator, forknum, maxsegno).str)));
}
+ else
+ v = umfile_v_get(ctx, forknum, (int) maxsegno);

- seg = umfile_v_get(ctx, forknum, (int) segno);
- seg->umfd_vfd = fd;
- seg->umfd_segno = segno;
-
- Assert(umfile_nblocks_in_seg(seg->umfd_vfd) <= (BlockNumber) RELSEG_SIZE);
- return seg;
+ nblocks = umfile_nblocks_in_seg(v->umfd_vfd);
+ if (nblocks > (BlockNumber) RELSEG_SIZE)
+ elog(FATAL, "segment too big");
+ return (maxsegno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
}

-static UmfdVec *
-umfile_openfork(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
- ForkNumber forknum, int behavior)
+/*
+ * umfile_nblocks_dense() -- size in blocks of a fork whose segments are
+ * contiguous.
+ *
+ * Starts at the last segment slot this context already tracks and walks
+ * forward, opening the next segment each time the current one is exactly
+ * RELSEG_SIZE blocks long. The walk stops at the first segment that is
+ * shorter than RELSEG_SIZE or that cannot be opened. Returns 0 if the fork
+ * does not exist; raises FATAL if a segment exceeds RELSEG_SIZE blocks.
+ */
+static BlockNumber
+umfile_nblocks_dense(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
+ ForkNumber forknum)
{
- RelPathStr path;
- File fd;
- UmfdVec *seg;
+ UmfdVec *v;
+ BlockNumber nblocks;
+ BlockNumber segno;

- Assert(ctx != NULL);
+ /*
+ * Match md.c semantics: missing forks read as size 0.
+ *
+ * This is relied on by size-reporting code paths (pg_table_size, psql \d+),
+ * and by callers that probe optional forks without doing smgrexists() first.
+ */
+ if (umfile_openfork(ctx, rlocator, forknum, UM_EXTENSION_RETURN_NULL) == NULL)
+ return 0;
+ Assert(ctx->num_open_segs[forknum] > 0);

- if (ctx->num_open_segs[forknum] > 0)
- {
- seg = umfile_v_get(ctx, forknum, 0);
- if (umfile_seg_entry_is_open(seg))
- return seg;
- }
+ /* Begin with the highest segment slot this context already tracks. */
+ segno = ctx->num_open_segs[forknum] - 1;
+ v = umfile_v_get(ctx, forknum, segno);

- path = umfile_segpath(rlocator, forknum, 0);
- fd = PathNameOpenFile(path.str, umfile_open_flags());
- if (fd < 0)
+ for (;;)
{
- if ((behavior & UM_EXTENSION_RETURN_NULL) &&
- FILE_POSSIBLY_DELETED(errno))
- return NULL;
+ nblocks = umfile_nblocks_in_seg(v->umfd_vfd);
+ if (nblocks > (BlockNumber) RELSEG_SIZE)
+ elog(FATAL, "segment too big");
+ /* A partially filled segment is the last one. */
+ if (nblocks < (BlockNumber) RELSEG_SIZE)
+ return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open file \"%s\": %m", path.str)));
+ /* Segment is full: the fork ends here unless a next segment opens. */
+ segno++;
+ v = umfile_openseg(ctx, rlocator, forknum, segno, 0);
+ if (v == NULL)
+ return segno * ((BlockNumber) RELSEG_SIZE);
}
-
- if (ctx->num_open_segs[forknum] == 0)
- umfile_fdvec_resize(ctx, forknum, 1);
- seg = umfile_v_get(ctx, forknum, 0);
- seg->umfd_vfd = fd;
- seg->umfd_segno = 0;
-
- Assert(umfile_nblocks_in_seg(seg->umfd_vfd) <= (BlockNumber) RELSEG_SIZE);
- return seg;
}

-static UmfdVec *
-umfile_getseg(UmbraFileContext *ctx, RelFileLocatorBackend rlocator,
- ForkNumber forknum, BlockNumber blkno,
- bool skipFsync, int behavior)
+/*
+ * umfile_nblocks() -- size in blocks of the given fork.
+ *
+ * Dispatches on mode: UMFILE_NBLOCKS_SPARSE uses the sparse-segment
+ * computation, every other mode value uses the dense one.
+ */
+BlockNumber
+umfile_nblocks(UmbraFileContext *ctx, ForkNumber forknum, UmFileNblocksMode mode)
{
- UmfdVec *seg;
- BlockNumber targetseg;
- BlockNumber nextsegno;
+ RelFileLocatorBackend rlocator;

Assert(ctx != NULL);
- Assert(behavior &
- (UM_EXTENSION_FAIL | UM_EXTENSION_CREATE |
- UM_EXTENSION_RETURN_NULL | UM_EXTENSION_DONT_OPEN));
+ rlocator = ctx->rlocator;

- targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
+ if (mode == UMFILE_NBLOCKS_SPARSE)
+ return umfile_nblocks_sparse(ctx, rlocator, forknum);

- if (targetseg < (BlockNumber) ctx->num_open_segs[forknum])
- {
- seg = umfile_v_get(ctx, forknum, (int) targetseg);
- if (umfile_seg_entry_is_open(seg))
- return seg;
- }
+ /* Any mode other than UMFILE_NBLOCKS_SPARSE is treated as dense. */
+ return umfile_nblocks_dense(ctx, rlocator, forknum);
+}

- if (behavior & UM_EXTENSION_DONT_OPEN)
- return NULL;
+/*
+ * umfile_truncate() -- shrink the given fork from curnblk to nblocks blocks.
+ *
+ * A request to "truncate" to more blocks than the fork currently has is
+ * silently ignored during recovery and is an ERROR otherwise. A request for
+ * the current size is a no-op.
+ *
+ * Segments are processed from the highest open one downwards: segments lying
+ * entirely beyond the new end are truncated to zero length and closed, the
+ * segment containing the new end is shortened, and the loop stops at the
+ * first segment that needs no change.
+ *
+ * NOTE(review): only segments already open in ctx (num_open_segs) are
+ * visited, so this presumably relies on the caller having opened every
+ * active segment beforehand -- confirm against the callers.
+ */
+void
+umfile_truncate(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber curnblk, BlockNumber nblocks)
+{
+ BlockNumber priorblocks;
+ int curopensegs;

- if (ctx->num_open_segs[forknum] > 0)
- seg = umfile_v_get(ctx, forknum, ctx->num_open_segs[forknum] - 1);
- else
+ if (nblocks > curnblk)
{
- seg = umfile_openfork(ctx, rlocator, forknum, behavior);
- if (seg == NULL)
- return NULL;
+ /* Bogus request ... but no complaint if InRecovery */
+ if (InRecovery)
+ return;
+ ereport(ERROR,
+ (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
+ relpath(ctx->rlocator, forknum).str,
+ nblocks, curnblk)));
}
+ if (nblocks == curnblk)
+ return;

- for (nextsegno = ctx->num_open_segs[forknum];
- nextsegno <= targetseg;
- nextsegno++)
+ curopensegs = ctx->num_open_segs[forknum];
+ while (curopensegs > 0)
{
- BlockNumber nblocks;
- int flags = 0;
+ UmfdVec *v;

- Assert(nextsegno == seg->umfd_segno + 1);
+ /* Number of blocks in all segments before the current one. */
+ priorblocks = (curopensegs - 1) * RELSEG_SIZE;
+ v = umfile_v_get(ctx, forknum, curopensegs - 1);

- nblocks = umfile_nblocks_in_seg(seg->umfd_vfd);
- if (nblocks > (BlockNumber) RELSEG_SIZE)
- elog(FATAL, "Umbra segment too big");
+ if (priorblocks > nblocks)
+ {
+ /* Segment is wholly past the new end: empty it, then close it. */
+ if (FileTruncate(v->umfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m",
+ FilePathName(v->umfd_vfd))));

- if ((behavior & UM_EXTENSION_CREATE) ||
- (InRecovery && (behavior & UM_EXTENSION_CREATE_RECOVERY)))
+ /* Register before closing, while v->umfd_vfd is still open. */
+ if (!RelFileLocatorBackendIsTemp(ctx->rlocator))
+ umfile_register_dirty_seg(ctx->rlocator,
+ RelFileLocatorBackendIsTemp(ctx->rlocator),
+ forknum, v);
+
+ /* we never drop the 1st segment */
+ Assert(v != umfile_v_get(ctx, forknum, 0));
+
+ FileClose(v->umfd_vfd);
+ umfile_fdvec_resize(ctx, forknum, curopensegs - 1);
+ }
+ else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
{
- if (nblocks < (BlockNumber) RELSEG_SIZE)
- {
- char *zerobuf;
+ /* Segment contains the new end: cut it to the remaining blocks. */
+ BlockNumber lastsegblocks = nblocks - priorblocks;

- zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
- MCXT_ALLOC_ZERO);
- umfile_extend(ctx, forknum,
- nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
- zerobuf, skipFsync);
- pfree(zerobuf);
- }
- flags = O_CREAT;
+ if (FileTruncate(v->umfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\" to %u blocks: %m",
+ FilePathName(v->umfd_vfd),
+ nblocks)));
+
+ if (!RelFileLocatorBackendIsTemp(ctx->rlocator))
+ umfile_register_dirty_seg(ctx->rlocator,
+ RelFileLocatorBackendIsTemp(ctx->rlocator),
+ forknum, v);
}
- else if (nblocks < (BlockNumber) RELSEG_SIZE)
+ else
{
- if (behavior & UM_EXTENSION_RETURN_NULL)
+ /* Segment lies entirely before the new end: nothing left to do. */
+ break;
+ }
+ curopensegs--;
+ }
+}
+
+/*
+ * umfile_registersync() -- register every segment of the given fork as
+ * dirty via umfile_register_dirty_seg().
+ *
+ * Forks that allow sparse segments are handled by enumerating the segment
+ * files that exist on disk; segments opened on that path stay open. For
+ * all other forks the segments are found by opening consecutive segment
+ * numbers until one fails; segments opened only for this call are closed
+ * again before returning.
+ */
+void
+umfile_registersync(UmbraFileContext *ctx, ForkNumber forknum)
+{
+ int segno;
+ int min_inactive_seg;
+
+ if (umfile_fork_allows_sparse_segments(forknum))
+ {
+ RelPathStr path = umfile_segpath(ctx->rlocator, forknum, 0);
+ BlockNumber *segnos = NULL;
+ int nsegnos = 0;
+ int i;
+
+ /* Nothing to register if the segment enumeration reports failure. */
+ if (!umfile_collect_existing_segnos_by_path(path.str, &segnos, &nsegnos))
+ return;
+
+ for (i = 0; i < nsegnos; i++)
+ {
+ BlockNumber curseg = segnos[i];
+ UmfdVec *v;
+
+ /* Reuse an already-open entry, otherwise open the segment now. */
+ if (curseg < (BlockNumber) ctx->num_open_segs[forknum] &&
+ umfile_seg_entry_is_open(umfile_v_get(ctx, forknum, (int) curseg)))
+ v = umfile_v_get(ctx, forknum, (int) curseg);
+ else
{
- errno = ENOENT;
- return NULL;
+ v = umfile_openseg(ctx, ctx->rlocator, forknum, curseg, 0);
+ if (v == NULL)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ umfile_segpath(ctx->rlocator, forknum, curseg).str)));
}

- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
- umfile_segpath(rlocator, forknum, nextsegno).str,
- blkno, nblocks)));
+ umfile_register_dirty_seg(ctx->rlocator,
+ RelFileLocatorBackendIsTemp(ctx->rlocator),
+ forknum, v);
}

- seg = umfile_openseg(ctx, rlocator, forknum, nextsegno, flags);
- if (seg == NULL)
- {
- if ((behavior & UM_EXTENSION_RETURN_NULL) &&
- FILE_POSSIBLY_DELETED(errno))
- return NULL;
+ if (segnos != NULL)
+ pfree(segnos);
+ return;
+ }

- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not open file \"%s\" (target block %u): %m",
- umfile_segpath(rlocator, forknum, nextsegno).str,
- blkno)));
+ /* Called for its side effect of opening segments; result is unused. */
+ (void) umfile_nblocks(ctx, forknum, UMFILE_NBLOCKS_DENSE);
+
+ /* Segments at or above this index are opened only for this call. */
+ min_inactive_seg = segno = ctx->num_open_segs[forknum];
+
+ /* Open any further segments that exist beyond the ones already open. */
+ while (umfile_openseg(ctx, ctx->rlocator, forknum, segno, 0) != NULL)
+ segno++;
+
+ /* Walk back down, registering each segment. */
+ while (segno > 0)
+ {
+ UmfdVec *v = umfile_v_get(ctx, forknum, segno - 1);
+
+ umfile_register_dirty_seg(ctx->rlocator,
+ RelFileLocatorBackendIsTemp(ctx->rlocator),
+ forknum, v);
+
+ /* Close the segments that were opened just above. */
+ if (segno > min_inactive_seg)
+ {
+ FileClose(v->umfd_vfd);
+ umfile_fdvec_resize(ctx, forknum, segno - 1);
}
- }

- return seg;
+ segno--;
+ }
}

-static bool
-umfile_fork_has_open_segment(UmbraFileContext *ctx, ForkNumber forknum)
+/*
+ * umfile_immedsync() -- immediately fsync every segment of the given fork.
+ *
+ * Forks that allow sparse segments are handled by enumerating the segment
+ * files that exist on disk; segments opened on that path stay open. For
+ * all other forks the segments are found by opening consecutive segment
+ * numbers until one fails; segments opened only for this call are closed
+ * again before returning.
+ *
+ * An fsync failure is reported at the level chosen by
+ * data_sync_elevel(ERROR).
+ */
+void
+umfile_immedsync(UmbraFileContext *ctx, ForkNumber forknum)
{
- Assert(ctx != NULL);
+ int segno;
+ int min_inactive_seg;

- for (int i = 0; i < ctx->num_open_segs[forknum]; i++)
+ if (umfile_fork_allows_sparse_segments(forknum))
{
- if (umfile_seg_entry_is_open(umfile_v_get(ctx, forknum, i)))
- return true;
+ RelPathStr path = umfile_segpath(ctx->rlocator, forknum, 0);
+ BlockNumber *segnos = NULL;
+ int nsegnos = 0;
+ int i;
+
+ /* Nothing to sync if the segment enumeration reports failure. */
+ if (!umfile_collect_existing_segnos_by_path(path.str, &segnos, &nsegnos))
+ return;
+
+ for (i = 0; i < nsegnos; i++)
+ {
+ BlockNumber curseg = segnos[i];
+ UmfdVec *v;
+
+ /* Reuse an already-open entry, otherwise open the segment now. */
+ if (curseg < (BlockNumber) ctx->num_open_segs[forknum] &&
+ umfile_seg_entry_is_open(umfile_v_get(ctx, forknum, (int) curseg)))
+ v = umfile_v_get(ctx, forknum, (int) curseg);
+ else
+ {
+ v = umfile_openseg(ctx, ctx->rlocator, forknum, curseg, 0);
+ if (v == NULL)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m",
+ umfile_segpath(ctx->rlocator, forknum, curseg).str)));
+ }
+
+ if (FileSync(v->umfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ FilePathName(v->umfd_vfd))));
+ }
+
+ if (segnos != NULL)
+ pfree(segnos);
+ return;
}

- return false;
-}
+ /* Called for its side effect of opening segments; result is unused. */
+ (void) umfile_nblocks(ctx, forknum, UMFILE_NBLOCKS_DENSE);

-static bool
-umfile_fork_has_open_segment_on_disk(UmbraFileContext *ctx,
- RelFileLocatorBackend rlocator,
- ForkNumber forknum)
-{
- bool have_live = false;
+ /* Segments at or above this index are opened only for this call. */
+ min_inactive_seg = segno = ctx->num_open_segs[forknum];

- Assert(ctx != NULL);
+ /* Open any further segments that exist beyond the ones already open. */
+ while (umfile_openseg(ctx, ctx->rlocator, forknum, segno, 0) != NULL)
+ segno++;

- for (int i = 0; i < ctx->num_open_segs[forknum]; i++)
+ /* Walk back down, syncing each segment. */
+ while (segno > 0)
{
- UmfdVec *seg = umfile_v_get(ctx, forknum, i);
- RelPathStr path;
+ UmfdVec *v = umfile_v_get(ctx, forknum, segno - 1);

- if (!umfile_seg_entry_is_open(seg))
- continue;
+ if (FileSync(v->umfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ FilePathName(v->umfd_vfd))));

- path = umfile_segpath(rlocator, forknum, seg->umfd_segno);
- if (access(path.str, F_OK) == 0)
+ /* Close the segments that were opened just above. */
+ if (segno > min_inactive_seg)
{
- have_live = true;
- continue;
+ FileClose(v->umfd_vfd);
+ umfile_fdvec_resize(ctx, forknum, segno - 1);
}

- FileClose(seg->umfd_vfd);
- umfile_seg_entry_reset(seg);
+ segno--;
}

- return have_live;
}

-static inline bool
-umfile_seg_entry_is_open(const UmfdVec *seg)
+/*
+ * umfile_fd() -- return the raw file descriptor of the segment that holds
+ * blocknum, and store the block's byte offset within that segment in *off.
+ *
+ * The segment lookup uses UM_EXTENSION_FAIL, so a missing segment is not
+ * created here.
+ */
+int
+umfile_fd(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum, uint32 *off)
{
- return (seg != NULL && seg->umfd_vfd >= 0);
-}
+ UmfdVec *v;

-static inline void
-umfile_seg_entry_reset(UmfdVec *seg)
-{
- seg->umfd_vfd = -1;
- seg->umfd_segno = InvalidBlockNumber;
+ /*
+ * Sparse mapped forks can address a target segment even when segment 0 is
+ * absent. Reopen the specific target segment directly instead of insisting
+ * that segment 0 exists.
+ */
+ if (!umfile_fork_allows_sparse_segments(forknum))
+ (void) umfile_openfork(ctx, ctx->rlocator, forknum, UM_EXTENSION_FAIL);
+
+ v = umfile_getseg(ctx, ctx->rlocator,
+ forknum, blocknum, false /* skipFsync */,
+ UM_EXTENSION_FAIL,
+ RelFileLocatorBackendIsTemp(ctx->rlocator));
+
+ /* The offset is computed as off_t and stored through a uint32 pointer. */
+ *off = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+ Assert(*off < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ return FileGetRawDesc(v->umfd_vfd);
}
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
index 2c964b6f3d..51ed171c33 100644
--- a/src/backend/storage/sync/sync.c
+++ b/src/backend/storage/sync/sync.c
@@ -29,6 +29,9 @@
#include "storage/fd.h"
#include "storage/latch.h"
#include "storage/md.h"
+#ifdef USE_UMBRA
+#include "storage/umbra.h"
+#endif
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/wait_event.h"
@@ -115,7 +118,14 @@ static const SyncOps syncsw[] = {
/* pg_multixact/members */
[SYNC_HANDLER_MULTIXACT_MEMBER] = {
.sync_syncfiletag = multixactmemberssyncfiletag
- }
+ },
+#ifdef USE_UMBRA
+ [SYNC_HANDLER_UMBRA] = {
+ .sync_syncfiletag = umsyncfiletag,
+ .sync_unlinkfiletag = umunlinkfiletag,
+ .sync_filetagmatches = umfiletagmatches
+ },
+#endif
};

/*
diff --git a/src/backend/utils/activity/wait_event_names.txt b/src/backend/utils/activity/wait_event_names.txt
index 7bda529855..a1de5a08d4 100644
--- a/src/backend/utils/activity/wait_event_names.txt
+++ b/src/backend/utils/activity/wait_event_names.txt
@@ -387,6 +387,7 @@ ReplicationOriginState "Waiting to read or update the progress of one replicatio
ReplicationSlotIO "Waiting for I/O on a replication slot."
LockFastPath "Waiting to read or update a process' fast-path lock information."
BufferMapping "Waiting to associate a data block with a buffer in the buffer pool."
+MapBufferContent "Waiting to read or update an Umbra metadata map cache page."
LockManager "Waiting to read or update information about <quote>heavyweight</quote> locks."
PredicateLockManager "Waiting to access predicate lock information used by serializable transactions."
ParallelHashJoin "Waiting to synchronize workers during Parallel Hash Join plan execution."
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 6f074013aa..62476de48e 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -823,10 +823,10 @@ InitPostgres(const char *in_dbname, Oid dboid,
before_shmem_exit(ShutdownXLOG, 0);
}

- /*
- * Initialize the relation cache and the system catalog caches. Note that
- * no catalog access happens here; we only set up the hashtable structure.
- * We must do this before starting a transaction because transaction abort
+ /*
+ * Initialize the relation cache and the system catalog caches. Note that
+ * no catalog access happens here; we only set up the hashtable structure.
+ * We must do this before starting a transaction because transaction abort
* would try to touch these hashtables.
*/
RelationCacheInitialize();
diff --git a/src/include/storage/lwlocklist.h b/src/include/storage/lwlocklist.h
index af8553bcb6..a8a476c19d 100644
--- a/src/include/storage/lwlocklist.h
+++ b/src/include/storage/lwlocklist.h
@@ -110,6 +110,7 @@ PG_LWLOCKTRANCHE(REPLICATION_ORIGIN_STATE, ReplicationOriginState)
PG_LWLOCKTRANCHE(REPLICATION_SLOT_IO, ReplicationSlotIO)
PG_LWLOCKTRANCHE(LOCK_FASTPATH, LockFastPath)
PG_LWLOCKTRANCHE(BUFFER_MAPPING, BufferMapping)
+PG_LWLOCKTRANCHE(MAP_BUFFER_CONTENT, MapBufferContent)
PG_LWLOCKTRANCHE(LOCK_MANAGER, LockManager)
PG_LWLOCKTRANCHE(PREDICATE_LOCK_MANAGER, PredicateLockManager)
PG_LWLOCKTRANCHE(PARALLEL_HASH_JOIN, ParallelHashJoin)
diff --git a/src/include/storage/map.h b/src/include/storage/map.h
index b0887794c3..b4f6063f35 100644
--- a/src/include/storage/map.h
+++ b/src/include/storage/map.h
@@ -1,53 +1,254 @@
/*-------------------------------------------------------------------------
*
* map.h
- * Umbra metadata-fork disk layout helpers.
+ * physical map layer: logical block number to physical block number mapping
*
- * This header defines the stable on-disk page layout and address translation
- * helpers for Umbra's relation-local metadata file.
- *
- * src/include/storage/map.h
+ * This header defines MAP metadata page layout, shared cache APIs, and address
+ * translation helpers for Umbra relation-local metadata.
*
*-------------------------------------------------------------------------
*/
#ifndef MAP_H
#define MAP_H

+#include "access/xlogdefs.h"
+#include "lib/ilist.h"
+#include "port/atomics.h"
#include "storage/block.h"
+#include "storage/buf.h"
+#include "storage/latch.h"
+#include "storage/lwlock.h"
+#include "storage/mapsuper.h"
#include "storage/relfilelocator.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "storage/spin.h"
+#include "storage/umfile.h"
+
+/* Forward declarations */
+typedef struct MapSharedData MapSharedData;
+typedef struct MapBufferDesc MapBufferDesc;

+/* Map buffer configuration */
#define MAP_ENTRIES_PER_PAGE (BLCKSZ / sizeof(uint32))
+#define MAP_SUPERBLOCK_MIN_ENTRIES 50000

/*
- * Umbra metadata file page layout:
- * - block 0: superblock payload
+ * Umbra metadata fork page layout:
+ * - block 0: superblock (512-byte payload)
* - blocks 1..: repeated proportional groups
*
- * Each group reserves one FSM map page, one VM map page, and 8192 MAIN map
- * pages. That keeps the mapping formula stable while leaving room for the
- * auxiliary forks to grow alongside MAIN.
+ * Each proportional group contains:
+ * - 1 FSM map page
+ * - 1 VM map page
+ * - 8192 MAIN map pages
+ *
+ * This keeps MAIN close to the front of the file while preserving a stable
+ * formula-based layout and reserving room for auxiliary forks as MAIN grows.
*/
-#define MAP_BLOCK_SUPER 0
-#define MAP_BLOCK_FIRST_GROUP 1
-#define MAP_GROUP_FSM_PAGES 1
-#define MAP_GROUP_VM_PAGES 1
-#define MAP_GROUP_MAIN_PAGES 8192
+#define MAP_BLOCK_SUPER 0
+#define MAP_BLOCK_FIRST_GROUP 1
+#define MAP_GROUP_FSM_PAGES 1
+#define MAP_GROUP_VM_PAGES 1
+#define MAP_GROUP_MAIN_PAGES 8192
#define MAP_GROUP_TOTAL_PAGES \
(MAP_GROUP_FSM_PAGES + MAP_GROUP_VM_PAGES + MAP_GROUP_MAIN_PAGES)

+/*
+ * Map buffer state bits.
+ *
+ * Layout of a 32-bit state word as decoded by the macros below:
+ *   bits 0..8    reference count (MAPBUF_VALID_MASK; despite its name this
+ *                mask selects the refcount field, max 511)
+ *   bits 9..13   usage count (MAPBUF_USAGE_COUNT_MASK, max 31)
+ *   bits 14..19  single-bit flags (MAPBUF_DIRTY .. MAPBUF_CHECKPOINT_NEEDED)
+ */
+#define MAPBUF_VALID_MASK 0x000001FF /* refcount (max 511) */
+#define MAPBUF_USAGE_COUNT_SHIFT 9
+#define MAPBUF_USAGE_COUNT_MASK 0x00003E00
+#define MAPBUF_DIRTY 0x00004000
+#define MAPBUF_IO_IN_PROGRESS 0x00008000
+#define MAPBUF_IO_ERROR 0x00010000
+#define MAPBUF_JUST_DIRTIED 0x00020000
+#define MAPBUF_NOT_MATERIALIZED 0x00040000
+#define MAPBUF_CHECKPOINT_NEEDED 0x00080000
+
+/* Extract the reference count field from a state word. */
+#define MAPBUF_GET_REFCOUNT(state) \
+ ((state) & MAPBUF_VALID_MASK)
+/* Extract the usage count field from a state word. */
+#define MAPBUF_GET_USAGECOUNT(state) \
+ (((state) & MAPBUF_USAGE_COUNT_MASK) >> MAPBUF_USAGE_COUNT_SHIFT)
+/* A usage count of one, i.e. 1 << MAPBUF_USAGE_COUNT_SHIFT. */
+#define MAPBUF_USAGECOUNT_ONE 0x00000200
+
+/* Map page: a pure array of pblkno values */
typedef struct MapPage
{
uint32 pblknos[MAP_ENTRIES_PER_PAGE];
} MapPage;

-extern void MapPageInit(MapPage *page);
-extern BlockNumber MapPageGetEntry(const MapPage *page, int entry_idx);
-extern void MapPageSetEntry(MapPage *page, int entry_idx, BlockNumber pblkno);

-extern BlockNumber MapForkPageIndexToMapBlkno(ForkNumber forknum,
- BlockNumber fork_page_idx);
-extern BlockNumber MapLblknoToMapBlkno(ForkNumber forknum, BlockNumber lblkno);
-extern bool MapDecodeMapBlkno(BlockNumber map_blkno, ForkNumber *forknum,
- BlockNumber *fork_page_idx);
+/*
+ * Shared memory control structure.
+ *
+ * Holds the clock-sweep position, the free-list head, allocation statistics
+ * and the configured slot count for the map buffer cache.
+ *
+ * NOTE(review): next_victim_buffer and num_allocs are atomics, while
+ * first_free_buffer and complete_passes are plain fields -- presumably these
+ * are protected by clock_lock; confirm against mapclock.c.
+ */
+typedef struct MapSharedData
+{
+ /* clock sweep algorithm */
+ pg_atomic_uint32 next_victim_buffer;
+ slock_t clock_lock;
+ int first_free_buffer; /* head of free list, -1 if empty */
+
+ /* statistics */
+ pg_atomic_uint32 num_allocs;
+ uint32 complete_passes;
+
+ /* configuration */
+ int num_slots;
+} MapSharedData;
+
+/* Values for freeNext field */
+#define FREENEXT_END_OF_LIST (-1)
+#define FREENEXT_NOT_IN_LIST (-2)
+
+/*
+ * MapBufferDesc -- shared descriptor/state data for a single map buffer.
+ *
+ * rnode, forknum and page_number identify which map page the slot holds.
+ * The state word packs a reference count, a usage count and flag bits; see
+ * the MAPBUF_* definitions.
+ *
+ * NOTE(review): freeNext presumably takes a slot index or one of the
+ * FREENEXT_* sentinel values -- confirm against the free-list code.
+ */
+typedef struct MapBufferDesc
+{
+ RelFileLocator rnode; /* relation identifier */
+ ForkNumber forknum; /* fork number */
+ int page_number; /* map page number in this slot, -1 if empty */
+ XLogRecPtr page_lsn; /* LSN of last modification */
+ int id; /* slot ID */
+ pg_atomic_uint32 state; /* state flags */
+ int freeNext; /* next buffer in free list */
+ int wait_backend_pid; /* backend PID of pin-count waiter */
+ LWLock buffer_lock; /* lock for buffer content access */
+ LWLock io_in_progress_lock; /* lock for buffer I/O state */
+} MapBufferDesc;
+
+
+extern void MapBackendInit(void);
+extern const ShmemCallbacks MapShmemCallbacks;
+
+/* Lookup/modification */
+extern bool MapTryLookup(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber lblkno,
+ BlockNumber *pblkno);
+extern BlockNumber MapTryLookupPblkRun(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber lblkno,
+ BlockNumber maxblocks,
+ BlockNumber *start_pblkno);
+
+/* Buffer management */
+extern int MapReadBuffer(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber map_blkno);
+
+/* MAP superblock helpers */
+extern void MapSBlockInit(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ XLogRecPtr map_lsn);
+extern bool MapSBlockEnsureLoaded(UmbraFileContext *map_ctx, RelFileLocator rnode);
+extern bool MapSBlockTryGetLogicalNblocks(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber *nblocks);
+extern bool MapSBlockForkExists(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum);
+extern bool MapSBlockTryGetNextFreePhysBlock(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber *next_free_pblk);
+extern bool MapSBlockTryGetPhysicalNblocks(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber *nblocks);
+extern void MapSBlockBumpLogicalNblocks(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber nblocks,
+ XLogRecPtr map_lsn);
+extern void MapSBlockBumpPhysicalNblocks(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber nblocks,
+ XLogRecPtr map_lsn);
+extern bool MapSBlockEnsurePhysicalNblocks(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber nblocks,
+ bool skipFsync);
+extern void MapSBlockBumpNextFreePhysBlock(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber next_free_pblk,
+ XLogRecPtr map_lsn);
+extern void MapSBlockSetLogicalNblocks(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber nblocks,
+ XLogRecPtr map_lsn);
+extern void MapSBlockSetSkipWalPending(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ bool pending,
+ XLogRecPtr map_lsn);
+extern bool MapSBlockIsSkipWalPending(UmbraFileContext *map_ctx,
+ RelFileLocator rnode);
+
+/* Checkpoint interface */
+extern void MapPreCheckpoint(void);
+extern void MapCheckpoint(void);
+extern void MapCheckpointRelation(RelFileLocator rnode);
+extern void MapCheckpointDatabaseTablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids);
+extern void MapPostCheckpoint(void);
+extern int MapBgWriterFlush(int max_pages);
+extern void MapAbortBufferIO(void);
+extern void MapBackendExitCleanup(void);
+
+/* Relation lifecycle */
+extern void MapDrop(RelFileLocator rnode);
+extern void MapTruncate(UmbraFileContext *map_ctx, RelFileLocator rnode,
+ ForkNumber forknum, BlockNumber n_lblknos,
+ XLogRecPtr map_lsn);
+extern void MapPreloadTruncatePages(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber n_lblknos);
+extern void MapReleasePreloadedTruncatePages(RelFileLocator rnode,
+ ForkNumber forknum);
+extern void MapInvalidateRelation(RelFileLocator rnode);
+extern void MapInvalidateDatabaseTablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids);
+extern void MapInvalidateDatabase(Oid dbid);
+
+/* Scan helpers */
+extern BlockNumber MapGetLogicalBlockCount(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum);
+extern BlockNumber MapGetPhysicalBlockCount(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber n_lblknos);
+
+/* Clock algorithm */
+extern int MapClockGetBuffer(void);
+extern void MapClockFreeBuffer(int slot_id);
+extern int MapSyncStart(uint32 *complete_passes, uint32 *num_allocs);
+
+/* Map cache hash table (in mapclock.c) */
+extern int MapCacheLookup(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber map_blkno);
+extern int MapCacheInsert(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber map_blkno, int slot_id);
+extern void MapCacheDelete(RelFileLocator rnode, ForkNumber forknum,
+ BlockNumber map_blkno, int slot_id);
+
+/* Buffer pin/unpin */
+extern void MapPinBuffer(int slot_id, bool adjust_usage);
+extern void MapUnpinBuffer(int slot_id);
+extern void MapInvalidateBuffer(int slot_id, RelFileLocator expected_rnode,
+ ForkNumber expected_forknum,
+ BlockNumber expected_map_blkno);
+
+/* GUCs */
+extern int map_buffers;
+extern int map_superblocks;
+
+/* Global data (defined in map.c) */
+extern MapSharedData *MapShared;
+extern MapBufferDesc *MapBuffers;
+extern char *MapPageData; /* actual page data (contiguous block) */
+
+/*
+ * MapGetPage -- address of the page image belonging to cache slot slot_id.
+ *
+ * The slot index is widened to Size before it is scaled by BLCKSZ, so the
+ * byte offset is computed in a pointer-sized type. A plain int multiply
+ * would overflow once slot_id * BLCKSZ exceeds INT_MAX.
+ */
+#define MapGetPage(slot_id) \
+ ((MapPage *) (MapPageData + ((Size) (slot_id) * BLCKSZ)))

#endif /* MAP_H */
diff --git a/src/include/storage/map_internal.h b/src/include/storage/map_internal.h
new file mode 100644
index 0000000000..8a2ee89deb
--- /dev/null
+++ b/src/include/storage/map_internal.h
@@ -0,0 +1,28 @@
+/*-------------------------------------------------------------------------
+ *
+ * map_internal.h
+ * Internal interfaces shared by split MAP implementation files.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MAP_INTERNAL_H
+#define MAP_INTERNAL_H
+
+#include "storage/map.h"
+
+extern void MapEnsurePrivateRefCount(void);
+extern void MapCacheTableShmemRequest(void);
+extern void MapCacheTableShmemInit(void);
+extern void MapBufferUpdateStateBits(MapBufferDesc *buf, uint32 set_bits,
+ uint32 clear_bits);
+extern void MapMarkBufferDirty(UmbraFileContext *map_ctx, MapBufferDesc *buf,
+ XLogRecPtr page_lsn);
+extern bool MapStartBufferIO(MapBufferDesc *buf, uint32 required_bits);
+extern void MapTerminateBufferIO(MapBufferDesc *buf, bool clear_dirty,
+ uint32 set_flag_bits);
+extern void MapFlushBuffer(int slot_id);
+extern void MapResetAllTruncatePreloads(void);
+extern BlockNumber MapForkPageIndexToMapBlkno(ForkNumber forknum,
+ BlockNumber fork_page_idx);
+extern BlockNumber MapLblknoToMapBlkno(ForkNumber forknum, BlockNumber lblkno);
+#endif /* MAP_INTERNAL_H */
diff --git a/src/include/storage/mapsuper.h b/src/include/storage/mapsuper.h
index 1f6a5dca5a..3421c0ba58 100644
--- a/src/include/storage/mapsuper.h
+++ b/src/include/storage/mapsuper.h
@@ -1,12 +1,11 @@
/*-------------------------------------------------------------------------
*
* mapsuper.h
- * Umbra metadata superblock helpers.
+ * MAP superblock metadata helpers.
*
- * The superblock is stored in metadata block 0. Its first 512 bytes contain a
- * versioned payload plus CRC, and the remainder of the block is reserved.
- *
- * src/include/storage/mapsuper.h
+ * The on-disk layout is a 512-byte sector:
+ * - first 64 bytes: MapSuperblockData payload
+ * - remaining 448 bytes: zero padding
*
*-------------------------------------------------------------------------
*/
@@ -14,9 +13,9 @@
#define MAPSUPER_H

#include "access/xlogdefs.h"
+#include "common/relpath.h"
#include "port/pg_crc32c.h"
#include "storage/block.h"
-#include "storage/smgr.h"

#define MAP_SUPERBLOCK_MAGIC 0x554D4252U /* "UMBR" */
#define MAP_SUPERBLOCK_VERSION 1U
@@ -27,11 +26,13 @@

typedef struct pg_attribute_packed() MapSuperblockData
{
+ /* identity/version */
uint32 magic;
uint32 version;
uint32 blcksz;
uint32 flags;

+ /* physical allocator state */
BlockNumber next_free_phys_block_main;
BlockNumber phys_capacity_main;
BlockNumber next_free_phys_block_fsm;
@@ -39,10 +40,12 @@ typedef struct pg_attribute_packed() MapSuperblockData
BlockNumber next_free_phys_block_vm;
BlockNumber phys_capacity_vm;

+ /* logical block count cache */
BlockNumber logical_nblocks_main;
BlockNumber logical_nblocks_fsm;
BlockNumber logical_nblocks_vm;

+ /* crash-safety metadata */
XLogRecPtr last_updated_lsn;
pg_crc32c crc;
} MapSuperblockData;
@@ -88,13 +91,10 @@ extern BlockNumber MapSuperblockGetLogicalNblocks(const MapSuperblock *super,
extern void MapSuperblockSetLogicalNblocks(MapSuperblock *super, ForkNumber forknum,
BlockNumber nblocks);

-extern void MapSuperblockPackPage(const MapSuperblock *super, char page[BLCKSZ]);
-extern void MapSuperblockUnpackPage(MapSuperblock *super, const char page[BLCKSZ]);
-
-extern bool MapSBlockRead(SMgrRelation reln, MapSuperblock *super);
-extern void MapSBlockWrite(SMgrRelation reln, const MapSuperblock *super,
- bool skipFsync);
-extern void MapSBlockInitNew(SMgrRelation reln, uint32 flags, XLogRecPtr lsn,
- bool skipFsync);
+/* 512-byte sector I/O helpers */
+extern void MapSuperblockPackSector(const MapSuperblock *super,
+ char sector[MAP_SUPERBLOCK_SIZE]);
+extern void MapSuperblockUnpackSector(MapSuperblock *super,
+ const char sector[MAP_SUPERBLOCK_SIZE]);

#endif /* MAPSUPER_H */
diff --git a/src/include/storage/mapsuper_internal.h b/src/include/storage/mapsuper_internal.h
new file mode 100644
index 0000000000..960469538f
--- /dev/null
+++ b/src/include/storage/mapsuper_internal.h
@@ -0,0 +1,157 @@
+/*-------------------------------------------------------------------------
+ *
+ * mapsuper_internal.h
+ * Shared-memory MAP superblock table internals.
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef MAPSUPER_INTERNAL_H
+#define MAPSUPER_INTERNAL_H
+
+#include "storage/lwlock.h"
+#include "storage/mapsuper.h"
+#include "storage/relfilelocator.h"
+#include "storage/umfile.h"
+
+#define MAPSUPER_FLAG_VALID 0x01
+#define MAPSUPER_FLAG_DIRTY 0x02
+#define MAPSUPER_FLAG_CORRUPT 0x04
+
+#define MAPSUPER_RUNTIME_FLAG_EXTENDING_MAIN 0x08
+#define MAPSUPER_RUNTIME_FLAG_EXTENDING_FSM 0x10
+#define MAPSUPER_RUNTIME_FLAG_EXTENDING_VM 0x20
+
+/*
+ * Key of a superblock table entry: entries are identified by relation file
+ * locator only.
+ */
+typedef struct MapSuperTag
+{
+ RelFileLocator rnode;
+} MapSuperTag;
+
+/*
+ * One slot of the MapSuperEntries array.
+ *
+ * NOTE(review): field descriptions marked "presumably" are inferred from
+ * names only; the code that maintains them is not visible from this header.
+ */
+typedef struct MapSuperEntry
+{
+ /* relation this slot describes */
+ MapSuperTag key;
+ /* superblock contents held for the relation */
+ MapSuperblock super;
+ /* presumably the LSN of the last change applied to "super" -- confirm */
+ XLogRecPtr page_lsn;
+ /* presumably MAPSUPER_FLAG_* bits -- confirm */
+ uint32 flags;
+ /* presumably MAPSUPER_RUNTIME_FLAG_* bits -- confirm */
+ uint32 runtime_flags;
+ /*
+  * Per-fork reservation frontier.  Read and written through
+  * MapSuperGetReservedNextFree()/MapSuperSetReservedNextFree(), and reset
+  * to the superblock's next-free physical block by
+  * MapSuperResetReservedNextFrees().
+  */
+ BlockNumber reserved_next_free_main;
+ BlockNumber reserved_next_free_fsm;
+ BlockNumber reserved_next_free_vm;
+ /* presumably the per-fork target of an in-progress extension -- confirm */
+ BlockNumber extending_target_main;
+ BlockNumber extending_target_fsm;
+ BlockNumber extending_target_vm;
+ /* presumably the free-list link to the next unused slot -- confirm */
+ int next_free;
+ /* presumably true while the slot is assigned to a relation -- confirm */
+ bool in_use;
+ /* per-entry lock */
+ LWLock lock;
+} MapSuperEntry;
+
+extern MapSuperEntry *MapSuperEntries;
+extern int MapSuperCapacity;
+
+/*
+ * MapSuperEntryBySlot
+ *		Translate a slot index into the matching MapSuperEntries element.
+ *
+ * The index is range-checked against MapSuperCapacity by assertion only.
+ */
+static inline MapSuperEntry *
+MapSuperEntryBySlot(int slot_id)
+{
+	Assert(slot_id >= 0 && slot_id < MapSuperCapacity);
+
+	return MapSuperEntries + slot_id;
+}
+
+extern void MapSBlockReportCorrupt(RelFileLocator rnode, const char *reason);
+extern bool MapForkHasMappedState(ForkNumber forknum);
+extern BlockNumber MapNormalizeForkBlockCount(ForkNumber forknum,
+ BlockNumber raw);
+
+/*
+ * MapSuperGetReservedNextFree
+ *		Fetch the reservation frontier recorded in "entry" for "forknum".
+ *
+ * Only the main, FSM and visibility-map forks carry a frontier; any other
+ * fork number raises an ERROR.
+ */
+static inline BlockNumber
+MapSuperGetReservedNextFree(const MapSuperEntry *entry, ForkNumber forknum)
+{
+	Assert(entry != NULL);
+
+	if (forknum == MAIN_FORKNUM)
+		return entry->reserved_next_free_main;
+	if (forknum == FSM_FORKNUM)
+		return entry->reserved_next_free_fsm;
+	if (forknum == VISIBILITYMAP_FORKNUM)
+		return entry->reserved_next_free_vm;
+
+	elog(ERROR, "unsupported fork number for reservation frontier: %d",
+		 forknum);
+
+	/* elog(ERROR) does not return; keep the compiler quiet */
+	pg_unreachable();
+}
+
+/*
+ * MapSuperSetReservedNextFree
+ *		Store a new reservation frontier in "entry" for "forknum".
+ *
+ * The value is passed through MapNormalizeForkBlockCount() before it is
+ * stored.  Fork numbers other than main, FSM and visibility map raise an
+ * ERROR.
+ */
+static inline void
+MapSuperSetReservedNextFree(MapSuperEntry *entry, ForkNumber forknum,
+							BlockNumber blkno)
+{
+	BlockNumber normalized;
+
+	Assert(entry != NULL);
+
+	/* Normalize first, exactly as before, then pick the per-fork field. */
+	normalized = MapNormalizeForkBlockCount(forknum, blkno);
+
+	if (forknum == MAIN_FORKNUM)
+		entry->reserved_next_free_main = normalized;
+	else if (forknum == FSM_FORKNUM)
+		entry->reserved_next_free_fsm = normalized;
+	else if (forknum == VISIBILITYMAP_FORKNUM)
+		entry->reserved_next_free_vm = normalized;
+	else
+		elog(ERROR, "unsupported fork number for reservation frontier: %d",
+			 forknum);
+}
+
+/*
+ * MapSuperMaybeBumpReservedNextFree
+ *		Advance the reservation frontier of "forknum" to "blkno" if that is
+ *		beyond the current value; the frontier is never moved backwards.
+ */
+static inline void
+MapSuperMaybeBumpReservedNextFree(MapSuperEntry *entry, ForkNumber forknum,
+								  BlockNumber blkno)
+{
+	BlockNumber candidate;
+
+	Assert(entry != NULL);
+
+	candidate = MapNormalizeForkBlockCount(forknum, blkno);
+
+	/* Frontier already at or past the candidate: nothing to do. */
+	if (MapSuperGetReservedNextFree(entry, forknum) >= candidate)
+		return;
+
+	MapSuperSetReservedNextFree(entry, forknum, candidate);
+}
+
+/*
+ * MapSuperResetReservedNextFrees
+ *		Re-seed every per-fork reservation frontier from the next-free
+ *		physical block recorded in the entry's superblock.
+ */
+static inline void
+MapSuperResetReservedNextFrees(MapSuperEntry *entry)
+{
+	const ForkNumber forks[] = {
+		MAIN_FORKNUM,
+		FSM_FORKNUM,
+		VISIBILITYMAP_FORKNUM
+	};
+
+	Assert(entry != NULL);
+
+	/* Same fork order as before: main, FSM, visibility map. */
+	for (int i = 0; i < (int) lengthof(forks); i++)
+	{
+		BlockNumber next_free;
+
+		next_free = MapSuperblockGetNextFreePhysBlock(&entry->super,
+													  forks[i]);
+		MapSuperSetReservedNextFree(entry, forks[i], next_free);
+	}
+}
+
+extern bool MapSuperFindEntryLocked(RelFileLocator rnode, LWLockMode mode,
+ MapSuperEntry **entry);
+extern bool MapSuperFindEntryTryLocked(RelFileLocator rnode, LWLockMode mode,
+ MapSuperEntry **entry);
+extern MapSuperEntry *MapSuperEnsureEntryLocked(RelFileLocator rnode);
+extern void MapSuperDeleteEntry(RelFileLocator rnode);
+extern bool MapSuperForkExists(const MapSuperblock *super,
+ ForkNumber forknum);
+extern void MapSBlockBumpPhysicalState(UmbraFileContext *map_ctx,
+ RelFileLocator rnode,
+ ForkNumber forknum,
+ BlockNumber nblocks,
+ bool bump_next_free,
+ bool bump_capacity,
+ XLogRecPtr map_lsn);
+extern void MapSuperTableShmemRequest(void);
+extern void MapSuperTableShmemInit(void);
+extern void MapSuperTableShmemAttach(void);
+
+#endif /* MAPSUPER_INTERNAL_H */
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 8d06d69b51..47dbf12643 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -119,6 +119,12 @@ extern void smgrcopyrelationmetadata(SMgrRelation src, SMgrRelation dst,
extern void smgrsyncrelationmetadata(SMgrRelation reln);
extern void smgrunlinkrelationmetadata(RelFileLocatorBackend rlocator,
bool isRedo);
+extern bool smgrcreatedballowswallog(void);
+extern void smgrcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids);
+extern void smgrinvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids);
+extern void smgrinvalidatedatabase(Oid dbid);
extern void smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks,
BlockNumber *old_nblocks,
BlockNumber *nblocks);
diff --git a/src/include/storage/subsystemlist.h b/src/include/storage/subsystemlist.h
index 9ad619080b..3b9bd4e9ee 100644
--- a/src/include/storage/subsystemlist.h
+++ b/src/include/storage/subsystemlist.h
@@ -42,6 +42,9 @@ PG_SHMEM_SUBSYSTEM(MultiXactShmemCallbacks)
PG_SHMEM_SUBSYSTEM(BufferManagerShmemCallbacks)
PG_SHMEM_SUBSYSTEM(StrategyCtlShmemCallbacks)
PG_SHMEM_SUBSYSTEM(BufTableShmemCallbacks)
+#ifdef USE_UMBRA
+PG_SHMEM_SUBSYSTEM(MapShmemCallbacks)
+#endif

/* lock manager */
PG_SHMEM_SUBSYSTEM(LockManagerShmemCallbacks)
diff --git a/src/include/storage/sync.h b/src/include/storage/sync.h
index 88290500bc..559a8eea6c 100644
--- a/src/include/storage/sync.h
+++ b/src/include/storage/sync.h
@@ -39,6 +39,9 @@ typedef enum SyncRequestHandler
SYNC_HANDLER_COMMIT_TS,
SYNC_HANDLER_MULTIXACT_OFFSET,
SYNC_HANDLER_MULTIXACT_MEMBER,
+#ifdef USE_UMBRA
+ SYNC_HANDLER_UMBRA,
+#endif
SYNC_HANDLER_NONE,
} SyncRequestHandler;

diff --git a/src/include/storage/umbra.h b/src/include/storage/umbra.h
index 2fb3c2f75e..b41fae75ea 100644
--- a/src/include/storage/umbra.h
+++ b/src/include/storage/umbra.h
@@ -17,6 +17,7 @@
#include "storage/block.h"
#include "storage/relfilelocator.h"
#include "storage/smgr.h"
+#include "storage/sync.h"
#include "storage/um_defs.h"

extern bool UmMetadataExists(SMgrRelation reln);
@@ -25,17 +26,25 @@ extern BlockNumber UmMetadataNblocks(SMgrRelation reln);
extern void UmMetadataRead(SMgrRelation reln, BlockNumber blkno, void *buffer);
extern void UmMetadataWrite(SMgrRelation reln, BlockNumber blkno,
const void *buffer, bool skipFsync);
+extern void UmMetadataWriteSuperblock(RelFileLocatorBackend rlocator,
+ const void *sector, bool skipFsync);
extern void UmMetadataExtend(SMgrRelation reln, BlockNumber blkno,
const void *buffer, bool skipFsync);
extern void UmMetadataImmediateSync(SMgrRelation reln);
extern void UmMetadataUnlink(RelFileLocatorBackend rlocator, bool isRedo);
+extern void UmInvalidateDatabase(Oid dbid);

extern void uminit(void);
extern void umopen(SMgrRelation reln);
extern void umclose(SMgrRelation reln, ForkNumber forknum);
extern void umdestroy(SMgrRelation reln);
extern bool umisinternalfork(ForkNumber forknum);
+extern bool umcreatedballowswallog(void);
extern void umcreaterelationmetadata(SMgrRelation reln);
+extern void umcheckpointdatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids);
+extern void uminvalidatedatabasetablespaces(Oid dbid, int ntablespaces,
+ const Oid *tablespace_ids);
extern void umcopyrelationmetadata(SMgrRelation src, SMgrRelation dst,
char relpersistence);
extern void umsyncrelationmetadata(SMgrRelation reln);
@@ -69,5 +78,8 @@ extern void umimmedsync(SMgrRelation reln, ForkNumber forknum);
extern void umregistersync(SMgrRelation reln, ForkNumber forknum);
extern int umfd(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, uint32 *off);
+extern int umsyncfiletag(const FileTag *ftag, char *path);
+extern int umunlinkfiletag(const FileTag *ftag, char *path);
+extern bool umfiletagmatches(const FileTag *ftag, const FileTag *candidate);

#endif /* UMBRA_H */
diff --git a/src/include/storage/umfile.h b/src/include/storage/umfile.h
index 56936aa697..8b7400140d 100644
--- a/src/include/storage/umfile.h
+++ b/src/include/storage/umfile.h
@@ -1,22 +1,20 @@
/*-------------------------------------------------------------------------
*
* umfile.h
- * Umbra backend-local file/context helpers.
+ * Umbra file/segment manager (backend-local).
*
* This layer owns backend-local file contexts keyed by RelFileLocatorBackend.
- * It is the low-level file access boundary beneath Umbra metadata and mapping
- * code.
- *
- * src/include/storage/umfile.h
+ * It provides low-level physical file/segment handling for Umbra forks.
*
*-------------------------------------------------------------------------
*/
+
#ifndef UMFILE_H
#define UMFILE_H

+#include "storage/fd.h"
#include "storage/aio_types.h"
#include "storage/block.h"
-#include "storage/fd.h"
#include "storage/relfilelocator.h"
#include "storage/um_defs.h"

@@ -34,68 +32,88 @@ typedef enum UmFileExistsMode
UMFILE_EXISTS_SPARSE
} UmFileExistsMode;

-extern void umfile_init(void);
-
+/*
+ * Backend-local context registry.
+ *
+ * umfile owns physical file contexts keyed by RelFileLocatorBackend. smgr and
+ * MAP may borrow a context, but umfile is the only owner.
+ */
extern UmbraFileContext *umfile_ctx_lookup(RelFileLocatorBackend rlocator);
extern UmbraFileContext *umfile_ctx_acquire(RelFileLocatorBackend rlocator);
-extern UmbraFileContext *umfile_ctx_create_temporary(RelFileLocatorBackend rlocator);
-extern void umfile_ctx_destroy_temporary(UmbraFileContext *ctx);
-extern void umfile_ctx_release(RelFileLocatorBackend rlocator);
extern void umfile_ctx_forget(RelFileLocatorBackend rlocator);
extern void umfile_ctx_close_fork(UmbraFileContext *ctx, ForkNumber forknum);
+extern UmbraFileContext *umfile_ctx_create_temporary(RelFileLocatorBackend rlocator);
+extern void umfile_ctx_destroy_temporary(UmbraFileContext *ctx);

+/*
+ * Low-level context I/O helpers for Umbra MAP subsystem.
+ *
+ * These provide direct physical addressing against fork files without going
+ * through smgr mapping translation.
+ */
extern bool umfile_ctx_fork_exists(UmbraFileContext *ctx, ForkNumber forknum,
UmFileExistsMode mode);
-extern BlockNumber umfile_ctx_get_nblocks(UmbraFileContext *ctx,
- ForkNumber forknum,
+extern BlockNumber umfile_ctx_get_nblocks(UmbraFileContext *ctx, ForkNumber forknum,
UmFileNblocksMode mode);
-extern void umfile_ctx_read(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber blkno, char *buffer, int nbytes);
-extern void umfile_ctx_write(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber blkno, const char *buffer,
- int nbytes, bool skipFsync);
-extern void umfile_ctx_extend(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber blkno, const char *buffer);
-extern void umfile_ctx_unlinkfork(RelFileLocatorBackend rlocator,
- ForkNumber forknum, bool isRedo);
+extern void umfile_ctx_read(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno,
+ char *buffer, int nbytes);
+extern void umfile_ctx_write(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno,
+ const char *buffer, int nbytes, bool skipFsync);
+extern void umfile_ctx_extend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno,
+ const char *buffer);
+extern void umfile_ctx_prefetch(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blkno);
+extern bool umfile_ctx_block_exists(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blkno);
+extern bool umfile_ctx_segment_exists(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber segno);
+extern void umfile_ctx_register_dirty(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blkno, bool skipFsync,
+ bool isTempRelation);
+extern void umfile_ctx_unlinkfork(RelFileLocatorBackend rlocator, ForkNumber forkNum,
+ bool isRedo);

+/* lifecycle */
+extern void umfile_init(void);
+
+/* smgr-equivalent operations (physical file semantics) */
+extern void umfile_create(UmbraFileContext *ctx, ForkNumber forknum, bool isRedo);
extern bool umfile_exists(UmbraFileContext *ctx, ForkNumber forknum,
UmFileExistsMode mode);
extern bool umfile_open_or_create(UmbraFileContext *ctx, ForkNumber forknum,
bool isRedo, bool *created);
+extern void umfile_unlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo);
+extern void umfile_extend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync);
+extern void umfile_zeroextend(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ int nblocks, bool skipFsync);
+extern bool umfile_prefetch(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum, int nblocks);
+extern uint32 umfile_maxcombine(ForkNumber forknum, BlockNumber blocknum);
+extern void umfile_readv(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ void **buffers, BlockNumber nblocks);
+extern void umfile_startreadv(PgAioHandle *ioh, UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blocknum, void **buffers, BlockNumber nblocks);
+/*
+ * Start an async read using physical addressing, while preserving the logical
+ * identity (block number) for error reporting and reopen semantics.
+ *
+ * This is used by Umbra's MAP translation: the file/offset are based on the
+ * physical block number, but smgr target identity remains logical.
+ */
+extern void umfile_startreadv_physical(PgAioHandle *ioh, UmbraFileContext *ctx,
+ ForkNumber forknum,
+ BlockNumber logical_blocknum,
+ BlockNumber physical_blocknum,
+ void **buffers, BlockNumber nblocks);
+extern void umfile_writev(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum,
+ const void **buffers, BlockNumber nblocks, bool skipFsync);
+extern void umfile_writeback(UmbraFileContext *ctx, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
extern BlockNumber umfile_nblocks(UmbraFileContext *ctx, ForkNumber forknum,
UmFileNblocksMode mode);
-extern void umfile_readv(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber blocknum, void **buffers,
- BlockNumber nblocks);
-extern void umfile_writev(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber blocknum, const void **buffers,
- BlockNumber nblocks, bool skipFsync);
-extern void umfile_extend(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber blocknum, const void *buffer,
- bool skipFsync);
-extern void umfile_zeroextend(UmbraFileContext *ctx, ForkNumber forknum,
- BlockNumber blocknum, int nblocks,
- bool skipFsync);
extern void umfile_truncate(UmbraFileContext *ctx, ForkNumber forknum,
BlockNumber old_blocks, BlockNumber nblocks);
extern void umfile_immedsync(UmbraFileContext *ctx, ForkNumber forknum);
extern void umfile_registersync(UmbraFileContext *ctx, ForkNumber forknum);
-extern void umfile_unlink(RelFileLocatorBackend rlocator, ForkNumber forknum,
- bool isRedo);
-
-/* Metadata-only convenience wrappers over the generic umfile surface. */
-extern bool umfile_metadata_exists(UmbraFileContext *ctx);
-extern bool umfile_metadata_open_or_create(UmbraFileContext *ctx,
- bool isRedo, bool *created);
-extern BlockNumber umfile_metadata_nblocks(UmbraFileContext *ctx);
-extern void umfile_metadata_read(UmbraFileContext *ctx, BlockNumber blkno,
- void *buffer);
-extern void umfile_metadata_write(UmbraFileContext *ctx, BlockNumber blkno,
- const void *buffer);
-extern void umfile_metadata_extend(UmbraFileContext *ctx, BlockNumber blkno,
- const void *buffer);
-extern void umfile_metadata_immedsync(UmbraFileContext *ctx);
-extern void umfile_metadata_unlink(RelFileLocatorBackend rlocator, bool isRedo);
+extern int umfile_fd(UmbraFileContext *ctx, ForkNumber forknum, BlockNumber blocknum, uint32 *off);

#endif /* UMFILE_H */
diff --git a/src/test/recovery/meson.build b/src/test/recovery/meson.build
index 36d789720a..0cbdf133ca 100644
--- a/src/test/recovery/meson.build
+++ b/src/test/recovery/meson.build
@@ -61,6 +61,9 @@ tests += {
't/050_redo_segment_missing.pl',
't/051_effective_wal_level.pl',
't/052_checkpoint_segment_missing.pl',
+ 't/053_umbra_map_superblock_watermark.pl',
+ 't/054_umbra_map_fork_policy.pl',
+ 't/063_umbra_mainfork_head_unlink_checkpoint.pl',
],
},
}
diff --git a/src/test/recovery/t/053_umbra_map_superblock_watermark.pl b/src/test/recovery/t/053_umbra_map_superblock_watermark.pl
new file mode 100644
index 0000000000..5f254146d4
--- /dev/null
+++ b/src/test/recovery/t/053_umbra_map_superblock_watermark.pl
@@ -0,0 +1,104 @@
+# Verify MAP superblock watermarks don't regress across crash restart.
+#
+# This test is UMBRA-specific. In md mode there is no MAP fork, so skip.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+plan skip_all => 'requires --with-umbra MAP fork'
+ unless check_pg_config('^#define USE_UMBRA 1$');
+
+# Decode an unsigned 32-bit little-endian integer from a hex dump.
+#
+# $hex is a string of hex digits (two per byte) and $offset is the byte
+# offset of the field inside the dumped data.  Returns the decoded number.
+#
+# Dies with a descriptive message when fewer than four bytes are available
+# at $offset or the data is not hexadecimal.  The callers read the file with
+# missing_ok = true, so an absent file yields an empty dump; without this
+# check that would only surface as "uninitialized value" warnings and a
+# bogus 0.
+#
+# NOTE(review): this assumes the superblock fields are stored little-endian.
+# If the server writes them in host byte order, the test will misdecode on
+# big-endian machines -- confirm against the pack/unpack code.
+sub u32le_from_hex
+{
+	my ($hex, $offset) = @_;
+	my $start = $offset * 2;
+	my $chunk =
+	  (defined $hex && length($hex) >= $start + 8)
+	  ? substr($hex, $start, 8)
+	  : '';
+
+	die "cannot decode uint32 at byte offset $offset: "
+	  . "need 8 hex digits, got \"$chunk\"\n"
+	  unless $chunk =~ /\A[0-9a-fA-F]{8}\z/;
+
+	# 'H8' converts the eight hex digits back into four raw bytes; 'V'
+	# interprets those bytes as an unsigned 32-bit little-endian integer.
+	return unpack('V', pack('H8', $chunk));
+}
+
+# Start a single node with autovacuum switched off.
+my $node = PostgreSQL::Test::Cluster->new('master');
+$node->init();
+$node->append_conf(
+ 'postgresql.conf', qq{
+autovacuum = off
+});
+$node->start();
+
+# Create and fill the test table, then checkpoint before the on-disk
+# superblock is sampled below.
+$node->safe_psql(
+ 'postgres', q{
+CREATE TABLE map_super_t(a int, b text);
+INSERT INTO map_super_t
+SELECT g, repeat('x', 400) FROM generate_series(1, 20000) g;
+CHECKPOINT;
+});
+
+# Hex dump of the first 64 bytes of the relation's "_map" file.  The final
+# "true" argument is pg_read_binary_file()'s missing_ok flag.
+my $map_super_hex = $node->safe_psql(
+ 'postgres',
+ q{SELECT encode(pg_read_binary_file(pg_relation_filepath('map_super_t') || '_map', 0, 64, true), 'hex');}
+);
+
+# Relation size in blocks as reported by the server, using its own
+# block_size setting.
+my $logical_expected_1 = $node->safe_psql(
+ 'postgres',
+ q{SELECT pg_relation_size('map_super_t') / current_setting('block_size')::int;}
+);
+
+# Decode the superblock fields checked below.  The numeric arguments are
+# byte offsets into the 64-byte dump read above.
+my $magic_1 = u32le_from_hex($map_super_hex, 0);
+my $version_1 = u32le_from_hex($map_super_hex, 4);
+my $blcksz_1 = u32le_from_hex($map_super_hex, 8);
+my $next_free_main_1 = u32le_from_hex($map_super_hex, 16);
+my $phys_capacity_main_1 = u32le_from_hex($map_super_hex, 20);
+my $logical_main_1 = u32le_from_hex($map_super_hex, 40);
+
+# Ask the server for its block size instead of assuming the default 8192,
+# so the test also passes on builds configured with another BLCKSZ.
+my $blcksz_expected = $node->safe_psql('postgres', q{SHOW block_size;});
+
+is($magic_1, 0x554D4252, 'superblock magic matches UMBR');
+is($version_1, 1, 'superblock version matches');
+is($blcksz_1, $blcksz_expected, 'superblock block size matches');
+cmp_ok($logical_main_1, '==', $logical_expected_1,
+	'logical_nblocks_main matches relation size in blocks');
+cmp_ok($next_free_main_1, '>=', $logical_main_1,
+	'next_free_phys_block_main not behind logical_nblocks_main');
+cmp_ok($phys_capacity_main_1, '>=', $next_free_main_1,
+	'phys_capacity_main not behind next_free_phys_block_main');
+
+# Grow the table further and checkpoint again.
+$node->safe_psql(
+ 'postgres', q{
+INSERT INTO map_super_t
+SELECT g, repeat('y', 400) FROM generate_series(20001, 40000) g;
+CHECKPOINT;
+});
+
+# Immediate-mode stop followed by a start, to exercise crash restart.
+$node->stop('immediate');
+$node->start();
+
+# Sample the superblock and the relation size again after the restart.
+$map_super_hex = $node->safe_psql(
+ 'postgres',
+ q{SELECT encode(pg_read_binary_file(pg_relation_filepath('map_super_t') || '_map', 0, 64, true), 'hex');}
+);
+
+my $logical_expected_2 = $node->safe_psql(
+ 'postgres',
+ q{SELECT pg_relation_size('map_super_t') / current_setting('block_size')::int;}
+);
+
+# Same byte offsets as used for the first sample.
+my $next_free_main_2 = u32le_from_hex($map_super_hex, 16);
+my $phys_capacity_main_2 = u32le_from_hex($map_super_hex, 20);
+my $logical_main_2 = u32le_from_hex($map_super_hex, 40);
+
+# The second sample must match the current relation size and must not be
+# smaller than the first sample in any of the three watermarks.
+cmp_ok($logical_main_2, '==', $logical_expected_2,
+ 'logical_nblocks_main survives crash restart');
+cmp_ok($logical_main_2, '>=', $logical_main_1,
+ 'logical_nblocks_main does not regress');
+cmp_ok($next_free_main_2, '>=', $next_free_main_1,
+ 'next_free_phys_block_main does not regress');
+cmp_ok($phys_capacity_main_2, '>=', $phys_capacity_main_1,
+ 'phys_capacity_main does not regress');
+cmp_ok($phys_capacity_main_2, '>=', $next_free_main_2,
+ 'phys_capacity_main remains ahead of next_free_phys_block_main');
+
+done_testing();
diff --git a/src/test/recovery/t/054_umbra_map_fork_policy.pl b/src/test/recovery/t/054_umbra_map_fork_policy.pl
new file mode 100644
index 0000000000..99616152bb
--- /dev/null
+++ b/src/test/recovery/t/054_umbra_map_fork_policy.pl
@@ -0,0 +1,62 @@
+# Verify UMBRA MAP fork policy and drop lifecycle behavior.
+#
+# In UMBRA mode:
+# - permanent relations should have MAP fork
+# - unlogged/temp relations should not have MAP fork
+# - dropped permanent relation's MAP fork should disappear after checkpoint
+#
+# In md mode, skip this test.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+plan skip_all => 'requires --with-umbra MAP fork'
+ unless check_pg_config('^#define USE_UMBRA 1$');
+
+# Start a single node with autovacuum switched off.
+my $node = PostgreSQL::Test::Cluster->new('master');
+$node->init();
+$node->append_conf(
+ 'postgresql.conf', qq{
+autovacuum = off
+});
+$node->start();
+
+# Each probe below creates a table and then tries to read one byte of its
+# "_map" file.  pg_read_binary_file() is called with missing_ok = true, so a
+# missing file yields NULL, COALESCE turns that into '', and the query
+# returns 'f'; a readable byte returns 't'.
+my $perm_map_exists = $node->safe_psql(
+ 'postgres', q{
+CREATE TABLE umb_perm_t(a int);
+SELECT COALESCE(encode(pg_read_binary_file(pg_relation_filepath('umb_perm_t') || '_map', 0, 1, true), 'hex'), '') <> '';
+});
+
+my $unlogged_map_exists = $node->safe_psql(
+ 'postgres', q{
+CREATE UNLOGGED TABLE umb_unlogged_t(a int);
+SELECT COALESCE(encode(pg_read_binary_file(pg_relation_filepath('umb_unlogged_t') || '_map', 0, 1, true), 'hex'), '') <> '';
+});
+
+# The temp table is created and probed inside the same safe_psql call.
+my $temp_map_exists = $node->safe_psql(
+ 'postgres', q{
+CREATE TEMP TABLE umb_temp_t(a int);
+SELECT COALESCE(encode(pg_read_binary_file(pg_relation_filepath('umb_temp_t') || '_map', 0, 1, true), 'hex'), '') <> '';
+});
+
+is($perm_map_exists, 't', 'permanent relation has MAP fork');
+is($unlogged_map_exists, 'f', 'unlogged relation has no MAP fork');
+is($temp_map_exists, 'f', 'temp relation has no MAP fork');
+
+# Capture the MAP file path before the drop; it is probed by path afterwards.
+my $perm_map_path =
+ $node->safe_psql('postgres',
+ q{SELECT pg_relation_filepath('umb_perm_t') || '_map';});
+
+$node->safe_psql('postgres', q{
+DROP TABLE umb_perm_t;
+CHECKPOINT;
+});
+
+# Poll until the file can no longer be read.
+ok($node->poll_query_until('postgres',
+ "SELECT COALESCE(encode(pg_read_binary_file('$perm_map_path', 0, 1, true), 'hex'), '') = '';", 't'),
+ 'dropped permanent relation MAP fork disappears after checkpoint');
+
+done_testing();
diff --git a/src/test/recovery/t/063_umbra_mainfork_head_unlink_checkpoint.pl b/src/test/recovery/t/063_umbra_mainfork_head_unlink_checkpoint.pl
new file mode 100644
index 0000000000..a8dc86d728
--- /dev/null
+++ b/src/test/recovery/t/063_umbra_mainfork_head_unlink_checkpoint.pl
@@ -0,0 +1,60 @@
+# Verify UMBRA delayed unlink behavior for MAIN fork segment 0.
+#
+# In UMBRA mode for permanent relations:
+# - DROP first truncates MAIN seg0 to 0 bytes
+# - actual unlink of MAIN seg0 is delayed to checkpoint
+#
+# In md mode, skip this test.
+use strict;
+use warnings;
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+plan skip_all => 'requires --with-umbra MAP fork'
+ unless check_pg_config('^#define USE_UMBRA 1$');
+
+# Start a single node with autovacuum switched off.
+my $node = PostgreSQL::Test::Cluster->new('master');
+$node->init();
+$node->append_conf(
+ 'postgresql.conf', qq{
+autovacuum = off
+});
+$node->start();
+
+# Create and fill the table whose first MAIN segment is observed below.
+$node->safe_psql(
+ 'postgres', q{
+CREATE TABLE umb_head_unlink_t(id int, payload text);
+INSERT INTO umb_head_unlink_t
+SELECT g, repeat('z', 2000) FROM generate_series(1, 15000) g;
+});
+
+# Capture the relation's file path before the drop; later probes use it.
+my $main_path = $node->safe_psql(
+ 'postgres',
+ q{SELECT pg_relation_filepath('umb_head_unlink_t');}
+);
+
+# pg_stat_file() is called with missing_ok = true, so a missing file gives
+# NULL and COALESCE maps that to -1 in every probe below.
+cmp_ok(
+ $node->safe_psql(
+ 'postgres',
+ "SELECT COALESCE((pg_stat_file('$main_path', true)).size, -1);"),
+ '>',
+ 0,
+ 'MAIN seg0 size is non-zero before DROP');
+
+$node->safe_psql('postgres', q{DROP TABLE umb_head_unlink_t;});
+
+# NOTE(review): this poll only succeeds while the file exists with size 0.
+# If a checkpoint happens to run between the DROP and this poll, the file is
+# already gone (-1) and the poll can only time out -- confirm that no
+# checkpoint can intervene here.
+ok($node->poll_query_until(
+ 'postgres',
+ "SELECT COALESCE((pg_stat_file('$main_path', true)).size, -1) = 0;"),
+ 'MAIN seg0 is truncated to 0 before checkpoint (delayed unlink stage)');
+
+$node->safe_psql('postgres', q{CHECKPOINT;});
+
+# After the checkpoint the file is expected to be gone (-1 means missing).
+ok($node->poll_query_until(
+ 'postgres',
+ "SELECT COALESCE((pg_stat_file('$main_path', true)).size, -1) = -1;"),
+ 'MAIN seg0 is physically removed after checkpoint');
+
+done_testing();
--
2.50.1 (Apple Git-155)

In response to

Browse pgsql-hackers by date

  From Date Subject
Next Message Mingwei Jia 2026-06-01 23:33:36 [RFC PATCH v2 RESEND 06/10] umbra: add patch 5 MAP access policy, translation, and materialization
Previous Message Mingwei Jia 2026-06-01 23:33:34 [RFC PATCH v2 RESEND 04/10] umbra: add patch 3 metadata disk format and identity mapping bootstrap