From 8211ba8be8f8d2da4fc3237c817b411ad9ebe728 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Thu, 13 Nov 2025 12:38:41 +0200
Subject: [PATCH v25 05/10] Reintroduce MultiXactMemberFreezeThreshold

---
 src/backend/access/transam/multixact.c | 202 ++++++++++++++++++++-----
 src/backend/access/transam/xlog.c      |   4 +-
 src/backend/commands/vacuum.c          |   6 +-
 src/backend/postmaster/autovacuum.c    |   4 +-
 src/include/access/multixact.h         |   4 +-
 5 files changed, 170 insertions(+), 50 deletions(-)

diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 78ba6d72a92..c72b2cd7090 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -91,13 +91,13 @@
 
 
 /*
- * Multixact members warning threshold.
- *
- * If the difference between nextOffset and oldestOffset exceeds this value,
- * we trigger autovacuum in order to release disk space consumed by the
- * members SLRU.
+ * Thresholds used to keep members disk usage in check when multixids have a
+ * lot of members.  When MULTIXACT_MEMBER_LOW_THRESHOLD is reached, vacuum
+ * starts freezing multixids more aggressively, even if the normal multixid
+ * age limits haven't been reached yet.
  */
-#define MULTIXACT_MEMBER_AUTOVAC_THRESHOLD		UINT64CONST(4000000000)
+#define MULTIXACT_MEMBER_LOW_THRESHOLD		UINT64CONST(2000000000)
+#define MULTIXACT_MEMBER_HIGH_THRESHOLD		UINT64CONST(4000000000)
 
 static inline MultiXactId
 PreviousMultiXactId(MultiXactId multi)
@@ -140,6 +140,12 @@ typedef struct MultiXactStateData
 	MultiXactId oldestMultiXactId;
 	Oid			oldestMultiXactDB;
 
+	/*
+	 * Oldest multixact offset that is potentially referenced by a multixact
+	 * referenced by a relation.
+	 */
+	MultiXactOffset oldestOffset;
+
 	/* support for anti-wraparound measures */
 	MultiXactId multiVacLimit;
 	MultiXactId multiWarnLimit;
@@ -276,7 +282,7 @@ static bool MultiXactOffsetPrecedes(MultiXactOffset offset1,
 									MultiXactOffset offset2);
 static void ExtendMultiXactOffset(MultiXactId multi);
 static void ExtendMultiXactMember(MultiXactOffset offset, int nmembers);
-static bool SetOffsetVacuumLimit(bool is_startup);
+static void SetOffsetVacuumLimit(void);
 static bool find_multixact_start(MultiXactId multi, MultiXactOffset *result);
 static void WriteMTruncateXlogRec(Oid oldestMultiDB,
 								  MultiXactId startTruncOff,
@@ -1945,8 +1951,8 @@ TrimMultiXact(void)
 	MultiXactState->finishedStartup = true;
 	LWLockRelease(MultiXactGenLock);
 
-	/* Now compute how far away the next members wraparound is. */
-	SetMultiXactIdLimit(oldestMXact, oldestMXactDB, true);
+	/* Now compute how far away the next multixid wraparound is. */
+	SetMultiXactIdLimit(oldestMXact, oldestMXactDB);
 }
 
 /*
@@ -2015,28 +2021,24 @@ MultiXactSetNextMXact(MultiXactId nextMulti,
  * datminmxid (ie, the oldest MultiXactId that might exist in any database
  * of our cluster), and the OID of the (or a) database with that value.
  *
- * is_startup is true when we are just starting the cluster, false when we
- * are updating state in a running cluster.  This only affects log messages.
+ * This also updates MultiXactState->oldestOffset, by looking up the offset of
+ * MultiXactState->oldestMultiXactId.
  */
 void
-SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
-					bool is_startup)
+SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid)
 {
 	MultiXactId multiVacLimit;
 	MultiXactId multiWarnLimit;
 	MultiXactId multiStopLimit;
 	MultiXactId multiWrapLimit;
 	MultiXactId curMulti;
-	bool		needs_offset_vacuum;
 
 	Assert(MultiXactIdIsValid(oldest_datminmxid));
 
 	/*
 	 * We pretend that a wrap will happen halfway through the multixact ID
 	 * space, but that's not really true, because multixacts wrap differently
-	 * from transaction IDs.  Note that, separately from any concern about
-	 * multixact IDs wrapping, we must ensure that multixact members do not
-	 * wrap.  Limits for that are set in SetOffsetVacuumLimit, not here.
+	 * from transaction IDs.
 	 */
 	multiWrapLimit = oldest_datminmxid + (MaxMultiXactId >> 1);
 	if (multiWrapLimit < FirstMultiXactId)
@@ -2104,8 +2106,13 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
 
 	Assert(!InRecovery);
 
-	/* Set limits for offset vacuum. */
-	needs_offset_vacuum = SetOffsetVacuumLimit(is_startup);
+	/*
+	 * Offsets are 64-bits wide and never wrap around, so we don't need to
+	 * consider them for emergency autovacuum purposes.  But now that we're in
+	 * a consistent state, determine MultiXactState->oldestOffset, to be used
+	 * to calculate freezing cutoff to keep the offsets disk usage in check.
+	 */
+	SetOffsetVacuumLimit();
 
 	/*
 	 * If past the autovacuum force point, immediately signal an autovac
@@ -2114,8 +2121,7 @@ SetMultiXactIdLimit(MultiXactId oldest_datminmxid, Oid oldest_datoid,
 	 * database, it'll call here, and we'll signal the postmaster to start
 	 * another iteration immediately if there are still any old databases.
 	 */
-	if ((MultiXactIdPrecedes(multiVacLimit, curMulti) ||
-		 needs_offset_vacuum) && IsUnderPostmaster)
+	if (MultiXactIdPrecedes(multiVacLimit, curMulti) && IsUnderPostmaster)
 		SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_LAUNCHER);
 
 	/* Give an immediate warning if past the wrap warn point */
@@ -2198,7 +2204,7 @@ MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB)
 	Assert(InRecovery);
 
 	if (MultiXactIdPrecedes(MultiXactState->oldestMultiXactId, oldestMulti))
-		SetMultiXactIdLimit(oldestMulti, oldestMultiDB, false);
+		SetMultiXactIdLimit(oldestMulti, oldestMultiDB);
 }
 
 /*
@@ -2348,22 +2354,17 @@ GetOldestMultiXactId(void)
 }
 
 /*
- * Determine if we need to vacuum to keep the size of the members SLRU in
- * check.
- *
- * To do so determine what's the oldest member offset and install the limit
- * info in MultiXactState, where it can be used to prevent overrun of old data
- * in the members SLRU area.
- *
- * The return value is true if autovacuum is required and false otherwise.
+ * Determine what's the oldest member offset and install it in MultiXactState,
+ * where it can be used to adjust multixid freezing cutoffs.
  */
-static bool
-SetOffsetVacuumLimit(bool is_startup)
+static void
+SetOffsetVacuumLimit(void)
 {
 	MultiXactId oldestMultiXactId;
 	MultiXactId nextMXact;
 	MultiXactOffset oldestOffset = 0;	/* placate compiler */
 	MultiXactOffset nextOffset;
+	bool		oldestOffsetKnown = false;
 
 	/*
 	 * NB: Have to prevent concurrent truncation, we might otherwise try to
@@ -2393,20 +2394,37 @@ SetOffsetVacuumLimit(bool is_startup)
 		 * offset.
 		 */
 		oldestOffset = nextOffset;
+		oldestOffsetKnown = true;
 	}
-	else if (!find_multixact_start(oldestMultiXactId, &oldestOffset))
+	else
 	{
-		ereport(LOG,
-				(errmsg("oldest checkpointed MultiXact %u does not exist on disk",
-						oldestMultiXactId)));
+		/*
+		 * Figure out the offset at which oldest existing multixact's members
+		 * are stored.  If we cannot find it, be careful not to fail.  (We had
+		 * bugs in early releases of PostgreSQL 9.3.X and 9.4.X, the
+		 * supposedly-earliest multixact might not really exist.  Those should
+		 * be long gone by now, but let's nevertheless be careful not to fail
+		 * in that case.)
+		 */
+		oldestOffsetKnown =
+			find_multixact_start(oldestMultiXactId, &oldestOffset);
+
+		if (!oldestOffsetKnown)
+			ereport(LOG,
+					(errmsg("oldest checkpointed MultiXact %u does not exist on disk",
+							oldestMultiXactId)));
+		return;
 	}
 
 	LWLockRelease(MultiXactTruncationLock);
 
-	/*
-	 * Do we need autovacuum?	If we're not sure, assume yes.
-	 */
-	return nextOffset - oldestOffset > MULTIXACT_MEMBER_AUTOVAC_THRESHOLD;
+	/* Install the computed value */
+	if (oldestOffsetKnown)
+	{
+		LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
+		MultiXactState->oldestOffset = oldestOffset;
+		LWLockRelease(MultiXactGenLock);
+	}
 }
 
 /*
@@ -2453,6 +2471,107 @@ find_multixact_start(MultiXactId multi, MultiXactOffset *result)
 	return true;
 }
 
+/*
+ * Determine how many multixacts, and how many multixact members, currently
+ * exist.
+ */
+static void
+ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members)
+{
+	MultiXactOffset nextOffset;
+	MultiXactOffset oldestOffset;
+	MultiXactId oldestMultiXactId;
+	MultiXactId nextMultiXactId;
+
+	LWLockAcquire(MultiXactGenLock, LW_SHARED);
+	nextOffset = MultiXactState->nextOffset;
+	oldestMultiXactId = MultiXactState->oldestMultiXactId;
+	nextMultiXactId = MultiXactState->nextMXact;
+	oldestOffset = MultiXactState->oldestOffset;
+	LWLockRelease(MultiXactGenLock);
+
+	*members = nextOffset - oldestOffset;
+	*multixacts = nextMultiXactId - oldestMultiXactId;
+}
+
+/*
+ * Multixact members can be removed once the multixacts that refer to them are
+ * older than every datminmxid.  autovacuum_multixact_freeze_max_age and
+ * vacuum_multixact_freeze_table_age work together to make sure we never have
+ * too many multixacts; we hope that, at least under normal circumstances,
+ * this will also be sufficient to keep us from using too many offsets.
+ * However, if the average multixact has many members, we might accumulate a
+ * huge amount of members, consuming disk space, while still using few enough
+ * multixids that the multixid limits fail to trigger relminmxid advancement
+ * by VACUUM.
+ *
+ * To prevent that, if more than a certain amount of members space is used
+ * (MULTIXACT_MEMBER_LOW_THRESHOLD), we effectively reduce
+ * autovacuum_multixact_freeze_max_age to a value just less than the number of
+ * multixacts in use.  We hope that this will quickly trigger autovacuuming on
+ * the table or tables with the oldest relminmxid, thus allowing datminmxid
+ * values to advance and removing some members.
+ *
+ * As the amount of the member space in use grows, we become more aggressive
+ * in clamping this value.  That not only causes autovacuum to ramp up, but
+ * also makes any manual vacuums the user issues more aggressive.  This
+ * happens because vacuum_get_cutoffs() will clamp the freeze table and the
+ * minimum freeze age cutoffs based on the effective
+ * autovacuum_multixact_freeze_max_age this function returns.  At the extreme,
+ * when the members usage reaches MULTIXACT_MEMBER_HIGH_THRESHOLD, we'll clamp
+ * freeze_max_age to zero, and every vacuum of any table will freeze every
+ * multixact.
+ */
+int
+MultiXactMemberFreezeThreshold(void)
+{
+	MultiXactOffset members;
+	uint32		multixacts;
+	uint32		victim_multixacts;
+	double		fraction;
+	int			result;
+
+	/*
+	 * Read the current offsets and members usage.
+	 *
+	 * Note: In the case that we have been unable to calculate oldestOffset,
+	 * because we failed to find the offset of the oldest multixid, we assume
+	 * the worst because oldestOffset will be left to zero in that case.
+	 */
+	ReadMultiXactCounts(&multixacts, &members);
+
+	/* If member space utilization is low, no special action is required. */
+	if (members <= MULTIXACT_MEMBER_LOW_THRESHOLD)
+		return autovacuum_multixact_freeze_max_age;
+
+	/*
+	 * Compute a target for relminmxid advancement.  The number of multixacts
+	 * we try to eliminate from the system is based on how far we are past
+	 * MULTIXACT_MEMBER_LOW_THRESHOLD.
+	 *
+	 * The way this formula works is that when members is exactly at the low
+	 * threshold, fraction == 0.0, and we set freeze_max_age equal to
+	 * mxid_age(oldestMultiXactId).  As members grows further, towards the
+	 * high threshold, fraction grows linearly from 0.0 to 1.0, and the result
+	 * shrinks from mxid_age(oldestMultiXactId) to 0.  Beyond the high
+	 * threshold, fraction > 1.0 and the result is clamped to 0.
+	 */
+	fraction = (double) (members - MULTIXACT_MEMBER_LOW_THRESHOLD) /
+		(MULTIXACT_MEMBER_HIGH_THRESHOLD - MULTIXACT_MEMBER_LOW_THRESHOLD);
+	victim_multixacts = multixacts * fraction;
+
+	/* fraction could be > 1.0, but lowest possible freeze age is zero */
+	if (victim_multixacts > multixacts)
+		return 0;
+	result = multixacts - victim_multixacts;
+
+	/*
+	 * Clamp to autovacuum_multixact_freeze_max_age, so that we never make
+	 * autovacuum less aggressive than it would otherwise be.
+	 */
+	return Min(result, autovacuum_multixact_freeze_max_age);
+}
+
 typedef struct mxtruncinfo
 {
 	int64		earliestExistingPage;
@@ -2669,6 +2788,7 @@ TruncateMultiXact(MultiXactId newOldestMulti, Oid newOldestMultiDB)
 	LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE);
 	MultiXactState->oldestMultiXactId = newOldestMulti;
 	MultiXactState->oldestMultiXactDB = newOldestMultiDB;
+	MultiXactState->oldestOffset = newOldestOffset;
 	LWLockRelease(MultiXactGenLock);
 
 	/* First truncate members */
@@ -2864,7 +2984,7 @@ multixact_redo(XLogReaderState *record)
 		 * Advance the horizon values, so they're current at the end of
 		 * recovery.
 		 */
-		SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB, false);
+		SetMultiXactIdLimit(xlrec.endTruncOff, xlrec.oldestMultiDB);
 
 		PerformMembersTruncation(xlrec.startTruncMemb, xlrec.endTruncMemb);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index ef405d66b3b..a000b8bd509 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -5155,7 +5155,7 @@ BootStrapXLOG(uint32 data_checksum_version)
 	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
 	AdvanceOldestClogXid(checkPoint.oldestXid);
 	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
-	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
+	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
 	SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
 
 	/* Set up the XLOG page header */
@@ -5636,7 +5636,7 @@ StartupXLOG(void)
 	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
 	AdvanceOldestClogXid(checkPoint.oldestXid);
 	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
-	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
+	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
 	SetCommitTsLimit(checkPoint.oldestCommitTsXid,
 					 checkPoint.newestCommitTsXid);
 
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 100e1a72c22..bd4278cd250 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1146,9 +1146,9 @@ vacuum_get_cutoffs(Relation rel, const VacuumParams params,
 	/*
 	 * Also compute the multixact age for which freezing is urgent.  This is
 	 * normally autovacuum_multixact_freeze_max_age, but may be less if we are
-	 * short of multixact member space.
+	 * short of multixact member space. XXX update comment
 	 */
-	effective_multixact_freeze_max_age = autovacuum_multixact_freeze_max_age;
+	effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
 
 	/*
 	 * Almost ready to set freeze output parameters; check if OldestXmin or
@@ -1971,7 +1971,7 @@ vac_truncate_clog(TransactionId frozenXID,
 	 * signaling twice?
 	 */
 	SetTransactionIdLimit(frozenXID, oldestxid_datoid);
-	SetMultiXactIdLimit(minMulti, minmulti_datoid, false);
+	SetMultiXactIdLimit(minMulti, minmulti_datoid);
 
 	LWLockRelease(WrapLimitsVacuumLock);
 }
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index bf66f494e3a..1c38488f2cb 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -1151,7 +1151,7 @@ do_start_worker(void)
 
 	/* Also determine the oldest datminmxid we will consider. */
 	recentMulti = ReadNextMultiXactId();
-	multiForceLimit = recentMulti - autovacuum_multixact_freeze_max_age;
+	multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold();
 	if (multiForceLimit < FirstMultiXactId)
 		multiForceLimit -= FirstMultiXactId;
 
@@ -1939,7 +1939,7 @@ do_autovacuum(void)
 	 * normally autovacuum_multixact_freeze_max_age, but may be less if we are
 	 * short of multixact member space.
 	 */
-	effective_multixact_freeze_max_age = autovacuum_multixact_freeze_max_age;
+	effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
 
 	/*
 	 * Find the pg_database entry and select the default freeze ages. We use
diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h
index d688b547c54..cfff86f655f 100644
--- a/src/include/access/multixact.h
+++ b/src/include/access/multixact.h
@@ -126,8 +126,7 @@ extern void BootStrapMultiXact(void);
 extern void StartupMultiXact(void);
 extern void TrimMultiXact(void);
 extern void SetMultiXactIdLimit(MultiXactId oldest_datminmxid,
-								Oid oldest_datoid,
-								bool is_startup);
+								Oid oldest_datoid);
 extern void MultiXactGetCheckptMulti(bool is_shutdown,
 									 MultiXactId *nextMulti,
 									 MultiXactOffset *nextMultiOffset,
@@ -142,6 +141,7 @@ extern void MultiXactSetNextMXact(MultiXactId nextMulti,
 extern void MultiXactAdvanceNextMXact(MultiXactId minMulti,
 									  MultiXactOffset minMultiOffset);
 extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB);
+extern int	MultiXactMemberFreezeThreshold(void);
 
 extern void multixact_twophase_recover(FullTransactionId fxid, uint16 info,
 									   void *recdata, uint32 len);
-- 
2.47.3

