From 71407bf47926c707401278d6274db7641549d975 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossartn@amazon.com>
Date: Sun, 5 Dec 2021 22:02:40 -0800
Subject: [PATCH v16 2/4] Move removal of old serialized snapshots to
 custodian.

This was only done during checkpoints because it was a convenient
place to put it.  However, if there are many snapshots to remove,
it can significantly extend checkpoint time.  To avoid this, move
this work to the newly-introduced custodian process.
---
 contrib/test_decoding/expected/spill.out    | 21 +++++++++++++++++++++
 contrib/test_decoding/sql/spill.sql         | 17 +++++++++++++++++
 src/backend/access/transam/xlog.c           |  6 ++++--
 src/backend/postmaster/custodian.c          |  2 ++
 src/backend/replication/logical/snapbuild.c |  9 ++++-----
 src/include/postmaster/custodian.h          |  2 +-
 src/include/replication/snapbuild.h         |  2 +-
 7 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/contrib/test_decoding/expected/spill.out b/contrib/test_decoding/expected/spill.out
index 10734bdb6a..75acbd5d5c 100644
--- a/contrib/test_decoding/expected/spill.out
+++ b/contrib/test_decoding/expected/spill.out
@@ -248,6 +248,27 @@ GROUP BY 1 ORDER BY 1;
 (2 rows)
 
 DROP TABLE spill_test;
+-- make sure custodian cleans up files
+CHECKPOINT;
+DO $$
+DECLARE
+    snaps_removed bool;
+    loops int := 0;
+BEGIN
+    LOOP
+        snaps_removed := count(*) = 0 FROM pg_ls_logicalsnapdir();
+        IF snaps_removed OR loops > 120 * 100 THEN EXIT; END IF;
+        PERFORM pg_sleep(0.01);
+        loops := loops + 1;
+    END LOOP;
+END
+$$;
+SELECT count(*) = 0 FROM pg_ls_logicalsnapdir();
+ ?column? 
+----------
+ t
+(1 row)
+
 SELECT pg_drop_replication_slot('regression_slot');
  pg_drop_replication_slot 
 --------------------------
diff --git a/contrib/test_decoding/sql/spill.sql b/contrib/test_decoding/sql/spill.sql
index e638cacd3f..94d522f548 100644
--- a/contrib/test_decoding/sql/spill.sql
+++ b/contrib/test_decoding/sql/spill.sql
@@ -176,4 +176,21 @@ GROUP BY 1 ORDER BY 1;
 
 DROP TABLE spill_test;
 
+-- make sure custodian cleans up files
+CHECKPOINT;
+DO $$
+DECLARE
+    snaps_removed bool;
+    loops int := 0;
+BEGIN
+    LOOP
+        snaps_removed := count(*) = 0 FROM pg_ls_logicalsnapdir();
+        IF snaps_removed OR loops > 120 * 100 THEN EXIT; END IF;
+        PERFORM pg_sleep(0.01);
+        loops := loops + 1;
+    END LOOP;
+END
+$$;
+SELECT count(*) = 0 FROM pg_ls_logicalsnapdir();
+
 SELECT pg_drop_replication_slot('regression_slot');
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a31fbbff78..c153c32a77 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -76,12 +76,12 @@
 #include "port/atomics.h"
 #include "port/pg_iovec.h"
 #include "postmaster/bgwriter.h"
+#include "postmaster/custodian.h"
 #include "postmaster/startup.h"
 #include "postmaster/walwriter.h"
 #include "replication/logical.h"
 #include "replication/origin.h"
 #include "replication/slot.h"
-#include "replication/snapbuild.h"
 #include "replication/walreceiver.h"
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
@@ -7001,10 +7001,12 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 {
 	CheckPointRelationMap();
 	CheckPointReplicationSlots();
-	CheckPointSnapBuild();
 	CheckPointLogicalRewriteHeap();
 	CheckPointReplicationOrigin();
 
+	/* tasks offloaded to custodian */
+	RequestCustodian(CUSTODIAN_REMOVE_SERIALIZED_SNAPSHOTS, (Datum) 0);
+
 	/* Write out all dirty data in SLRUs and the main buffer pool */
 	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
 	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();
diff --git a/src/backend/postmaster/custodian.c b/src/backend/postmaster/custodian.c
index a94381bc21..d0fd955d4b 100644
--- a/src/backend/postmaster/custodian.c
+++ b/src/backend/postmaster/custodian.c
@@ -25,6 +25,7 @@
 #include "pgstat.h"
 #include "postmaster/custodian.h"
 #include "postmaster/interrupt.h"
+#include "replication/snapbuild.h"
 #include "storage/bufmgr.h"
 #include "storage/condition_variable.h"
 #include "storage/fd.h"
@@ -70,6 +71,7 @@ struct cust_task_funcs_entry
  * whether the task is already enqueued.
  */
 static const struct cust_task_funcs_entry cust_task_functions[] = {
+	{CUSTODIAN_REMOVE_SERIALIZED_SNAPSHOTS, RemoveOldSerializedSnapshots, NULL},
 	{INVALID_CUSTODIAN_TASK, NULL, NULL}	/* must be last */
 };
 
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index beddcbcdea..e7c4f69b42 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -2036,14 +2036,13 @@ SnapBuildRestoreContents(int fd, char *dest, Size size, const char *path)
 
 /*
  * Remove all serialized snapshots that are not required anymore because no
- * slot can need them. This doesn't actually have to run during a checkpoint,
- * but it's a convenient point to schedule this.
+ * slot can need them.
  *
- * NB: We run this during checkpoints even if logical decoding is disabled so
- * we cleanup old slots at some point after it got disabled.
+ * NB: We run this even if logical decoding is disabled so we cleanup old slots
+ * at some point after it got disabled.
  */
 void
-CheckPointSnapBuild(void)
+RemoveOldSerializedSnapshots(void)
 {
 	XLogRecPtr	cutoff;
 	XLogRecPtr	redo;
diff --git a/src/include/postmaster/custodian.h b/src/include/postmaster/custodian.h
index 73d0bc5f02..ab6d4283b9 100644
--- a/src/include/postmaster/custodian.h
+++ b/src/include/postmaster/custodian.h
@@ -18,7 +18,7 @@
  */
 typedef enum CustodianTask
 {
-	FAKE_TASK,						/* placeholder until we have a real task */
+	CUSTODIAN_REMOVE_SERIALIZED_SNAPSHOTS,
 
 	NUM_CUSTODIAN_TASKS,			/* new tasks go above */
 	INVALID_CUSTODIAN_TASK
diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h
index 2a697e57c3..9eba403e0c 100644
--- a/src/include/replication/snapbuild.h
+++ b/src/include/replication/snapbuild.h
@@ -57,7 +57,7 @@ struct ReorderBuffer;
 struct xl_heap_new_cid;
 struct xl_running_xacts;
 
-extern void CheckPointSnapBuild(void);
+extern void RemoveOldSerializedSnapshots(void);
 
 extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *reorder,
 										  TransactionId xmin_horizon, XLogRecPtr start_lsn,
-- 
2.25.1

