From 232bc8a73711c743e74d964abfb202563e176aa5 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathan@postgresql.org>
Date: Fri, 10 Oct 2025 12:28:37 -0500
Subject: [PATCH v4 1/1] autovacuum scheduling improvements

---
 src/backend/postmaster/autovacuum.c | 142 +++++++++++++++++++++++++---
 src/tools/pgindent/typedefs.list    |   1 +
 2 files changed, 128 insertions(+), 15 deletions(-)

diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 5084af7dfb6..e8cd414c445 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -97,6 +97,7 @@
 #include "storage/procsignal.h"
 #include "storage/smgr.h"
 #include "tcop/tcopprot.h"
+#include "utils/float.h"
 #include "utils/fmgroids.h"
 #include "utils/fmgrprotos.h"
 #include "utils/guc_hooks.h"
@@ -310,6 +311,12 @@ static AutoVacuumShmemStruct *AutoVacuumShmem;
 static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
 static MemoryContext DatabaseListCxt = NULL;
 
+typedef struct
+{
+	Oid			oid;			/* OID of the relation to process */
+	double		score;			/* from relation_needs_vacanalyze(); sort key */
+} TableToProcess;
+
 /*
  * Dummy pointer to persuade Valgrind that we've not leaked the array of
  * avl_dbase structs.  Make it global to ensure the compiler doesn't
@@ -351,7 +358,8 @@ static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
 									  Form_pg_class classForm,
 									  PgStat_StatTabEntry *tabentry,
 									  int effective_multixact_freeze_max_age,
-									  bool *dovacuum, bool *doanalyze, bool *wraparound);
+									  bool *dovacuum, bool *doanalyze, bool *wraparound,
+									  double *score);
 
 static void autovacuum_do_vac_analyze(autovac_table *tab,
 									  BufferAccessStrategy bstrategy);
@@ -1889,6 +1897,15 @@ get_database_list(void)
 	return dblist;
 }
 
+static int
+TableToProcessComparator(const ListCell *a, const ListCell *b)
+{
+	const TableToProcess *lhs = lfirst(a);
+	const TableToProcess *rhs = lfirst(b);
+
+	return float8_cmp_internal(rhs->score, lhs->score);	/* operands swapped */
+}
+
 /*
  * Process a database table-by-table
  *
@@ -1902,7 +1919,7 @@ do_autovacuum(void)
 	HeapTuple	tuple;
 	TableScanDesc relScan;
 	Form_pg_database dbForm;
-	List	   *table_oids = NIL;
+	List	   *tables_to_process = NIL;
 	List	   *orphan_oids = NIL;
 	HASHCTL		ctl;
 	HTAB	   *table_toast_map;
@@ -2014,6 +2031,7 @@ do_autovacuum(void)
 		bool		dovacuum;
 		bool		doanalyze;
 		bool		wraparound;
+		double		score = 0.0;
 
 		if (classForm->relkind != RELKIND_RELATION &&
 			classForm->relkind != RELKIND_MATVIEW)
@@ -2054,11 +2072,19 @@ do_autovacuum(void)
 		/* Check if it needs vacuum or analyze */
 		relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
 								  effective_multixact_freeze_max_age,
-								  &dovacuum, &doanalyze, &wraparound);
+								  &dovacuum, &doanalyze, &wraparound,
+								  &score);
 
-		/* Relations that need work are added to table_oids */
+		/* Relations that need work are added to tables_to_process */
 		if (dovacuum || doanalyze)
-			table_oids = lappend_oid(table_oids, relid);
+		{
+			TableToProcess *table = palloc(sizeof(TableToProcess));
+
+			table->oid = relid;
+			table->score = score;
+
+			tables_to_process = lappend(tables_to_process, table);
+		}
 
 		/*
 		 * Remember TOAST associations for the second pass.  Note: we must do
@@ -2114,6 +2140,7 @@ do_autovacuum(void)
 		bool		dovacuum;
 		bool		doanalyze;
 		bool		wraparound;
+		double		score = 0.0;
 
 		/*
 		 * We cannot safely process other backends' temp tables, so skip 'em.
@@ -2146,11 +2173,19 @@ do_autovacuum(void)
 
 		relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
 								  effective_multixact_freeze_max_age,
-								  &dovacuum, &doanalyze, &wraparound);
+								  &dovacuum, &doanalyze, &wraparound,
+								  &score);
 
 		/* ignore analyze for toast tables */
 		if (dovacuum)
-			table_oids = lappend_oid(table_oids, relid);
+		{
+			TableToProcess *table = palloc(sizeof(TableToProcess));
+
+			table->oid = relid;
+			table->score = score;
+
+			tables_to_process = lappend(tables_to_process, table);
+		}
 
 		/* Release stuff to avoid leakage */
 		if (free_relopts)
@@ -2274,6 +2309,8 @@ do_autovacuum(void)
 		MemoryContextSwitchTo(AutovacMemCxt);
 	}
 
+	list_sort(tables_to_process, TableToProcessComparator);
+
 	/*
 	 * Optionally, create a buffer access strategy object for VACUUM to use.
 	 * We use the same BufferAccessStrategy object for all tables VACUUMed by
@@ -2302,9 +2339,9 @@ do_autovacuum(void)
 	/*
 	 * Perform operations on collected tables.
 	 */
-	foreach(cell, table_oids)
+	foreach_ptr(TableToProcess, table, tables_to_process)
 	{
-		Oid			relid = lfirst_oid(cell);
+		Oid			relid = table->oid;
 		HeapTuple	classTup;
 		autovac_table *tab;
 		bool		isshared;
@@ -2535,7 +2572,7 @@ deleted:
 		pg_atomic_test_set_flag(&MyWorkerInfo->wi_dobalance);
 	}
 
-	list_free(table_oids);
+	list_free_deep(tables_to_process);
 
 	/*
 	 * Perform additional work items, as requested by backends.
@@ -2934,6 +2971,7 @@ recheck_relation_needs_vacanalyze(Oid relid,
 								  bool *wraparound)
 {
 	PgStat_StatTabEntry *tabentry;
+	double		score;
 
 	/* fetch the pgstat table entry */
 	tabentry = pgstat_fetch_stat_tabentry_ext(classForm->relisshared,
@@ -2941,7 +2979,8 @@ recheck_relation_needs_vacanalyze(Oid relid,
 
 	relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
 							  effective_multixact_freeze_max_age,
-							  dovacuum, doanalyze, wraparound);
+							  dovacuum, doanalyze, wraparound,
+							  &score);
 
 	/* Release tabentry to avoid leakage */
 	if (tabentry)
@@ -2990,6 +3029,32 @@ recheck_relation_needs_vacanalyze(Oid relid,
  * autovacuum_vacuum_threshold GUC variable.  Similarly, a vac_scale_factor
  * value < 0 is substituted with the value of
  * autovacuum_vacuum_scale_factor GUC variable.  Ditto for analyze.
+ *
+ * This function also returns a score that can be used to sort the list of
+ * tables to process.  The idea is to have autovacuum prioritize tables that
+ * are furthest beyond their thresholds (e.g., a table nearing transaction ID
+ * wraparound should be vacuumed first).  This prioritization scheme is
+ * certainly far from perfect; there are simply too many possibilities for any
+ * scoring technique to work across all workloads, and the situation might
+ * change significantly between the time we calculate the score and the time
+ * that autovacuum gets to processing it.  However, we have attempted to
+ * develop something that is expected to work for a large portion of workloads
+ * with reasonable parameter settings.
+ *
+ * The score is calculated as the maximum of the ratios of each of the table's
+ * relevant values to its threshold.  For example, if the number of inserted
+ * tuples is 100, and the insert threshold for the table is 80, the insert
+ * score is 1.25.  If all other scores are below that value, the returned score
+ * will be 1.25.  The other criteria considered for the score are the table
+ * ages (both relfrozenxid and relminmxid) compared to the corresponding
+ * freeze-max-age setting, the number of updated/deleted tuples compared to the
+ * vacuum threshold, and the number of inserted/updated/deleted tuples compared
+ * to the analyze threshold.
+ *
+ * One exception to the previous paragraph is for tables nearing wraparound,
+ * i.e., those that have surpassed the effective failsafe ages.  In that case,
+ * the relfrozenxid/relminmxid-based score is scaled aggressively so that the
+ * table has a decent chance of sorting to the top of the list.
  */
 static void
 relation_needs_vacanalyze(Oid relid,
@@ -3000,7 +3065,8 @@ relation_needs_vacanalyze(Oid relid,
  /* output params below */
 						  bool *dovacuum,
 						  bool *doanalyze,
-						  bool *wraparound)
+						  bool *wraparound,
+						  double *score)
 {
 	bool		force_vacuum;
 	bool		av_enabled;
@@ -3029,7 +3095,14 @@ relation_needs_vacanalyze(Oid relid,
 	int			multixact_freeze_max_age;
 	TransactionId xidForceLimit;
 	TransactionId relfrozenxid;
+	TransactionId relminmxid;
 	MultiXactId multiForceLimit;
+	uint32		xid_age;
+	uint32		mxid_age;
+	double		xid_score;
+	double		mxid_score;
+	int			effective_xid_failsafe_age;
+	int			effective_mxid_failsafe_age;
 
 	Assert(classForm != NULL);
 	Assert(OidIsValid(relid));
@@ -3081,17 +3154,17 @@ relation_needs_vacanalyze(Oid relid,
 
 	av_enabled = (relopts ? relopts->enabled : true);
 
+	relfrozenxid = classForm->relfrozenxid;
+	relminmxid = classForm->relminmxid;
+
 	/* Force vacuum if table is at risk of wraparound */
 	xidForceLimit = recentXid - freeze_max_age;
 	if (xidForceLimit < FirstNormalTransactionId)
 		xidForceLimit -= FirstNormalTransactionId;
-	relfrozenxid = classForm->relfrozenxid;
 	force_vacuum = (TransactionIdIsNormal(relfrozenxid) &&
 					TransactionIdPrecedes(relfrozenxid, xidForceLimit));
 	if (!force_vacuum)
 	{
-		MultiXactId relminmxid = classForm->relminmxid;
-
 		multiForceLimit = recentMulti - multixact_freeze_max_age;
 		if (multiForceLimit < FirstMultiXactId)
 			multiForceLimit -= FirstMultiXactId;
@@ -3100,6 +3173,33 @@ relation_needs_vacanalyze(Oid relid,
 	}
 	*wraparound = force_vacuum;
 
+	/*
+	 * The (M)XID age is how far relfrozenxid/relminmxid lags behind the most
+	 * recent (M)XID.  Its score is that age divided by *_freeze_max_age.
+	 */
+	xid_age = TransactionIdIsNormal(relfrozenxid) ? recentXid - relfrozenxid : 0;
+	mxid_age = MultiXactIdIsValid(relminmxid) ? recentMulti - relminmxid : 0;
+
+	xid_score = (double) xid_age / freeze_max_age;
+	mxid_score = (double) mxid_age / multixact_freeze_max_age;
+
+	/*
+	 * To ensure tables are given increased priority once they begin
+	 * approaching wraparound, we scale the score aggressively if the ages
+	 * surpass vacuum_failsafe_age or vacuum_multixact_failsafe_age.
+	 */
+	effective_xid_failsafe_age = Max(vacuum_failsafe_age,
+									 autovacuum_freeze_max_age * 1.05);
+	effective_mxid_failsafe_age = Max(vacuum_multixact_failsafe_age,
+									  autovacuum_multixact_freeze_max_age * 1.05);
+
+	if (xid_age >= effective_xid_failsafe_age)
+		xid_score = pow(xid_score, Max(1.0, (double) xid_age / 100000000));
+	if (mxid_age >= effective_mxid_failsafe_age)
+		mxid_score = pow(mxid_score, Max(1.0, (double) mxid_age / 100000000));
+
+	*score = Max(xid_score, mxid_score);
+
 	/* User disabled it in pg_class.reloptions?  (But ignore if at risk) */
 	if (!av_enabled && !force_vacuum)
 	{
@@ -3173,6 +3273,18 @@ relation_needs_vacanalyze(Oid relid,
 		*dovacuum = force_vacuum || (vactuples > vacthresh) ||
 			(vac_ins_base_thresh >= 0 && instuples > vacinsthresh);
 		*doanalyze = (anltuples > anlthresh);
+
+		/*
+		 * Update the score, being careful to avoid division by zero and to
+		 * skip cases where auto-analyze does not apply, i.e., pg_statistic
+		 * and TOAST tables.
+		 */
+		*score = Max(*score, (double) vactuples / Max(vacthresh, 1));
+		if (relid != StatisticRelationId &&
+			classForm->relkind != RELKIND_TOASTVALUE)
+			*score = Max(*score, (double) anltuples / Max(anlthresh, 1));
+		if (vac_ins_base_thresh >= 0)
+			*score = Max(*score, (double) instuples / Max(vacinsthresh, 1));
 	}
 	else
 	{
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 43fe3bcd593..3e0739c7808 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -3008,6 +3008,7 @@ TableScanDesc
 TableScanDescData
 TableSpaceCacheEntry
 TableSpaceOpts
+TableToProcess
 TablespaceList
 TablespaceListCell
 TapeBlockTrailer
-- 
2.39.5 (Apple Git-154)