From 5354b935e2927de5d78cdb9a94b700855ba3f350 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <bossartn@amazon.com>
Date: Sun, 5 Dec 2021 21:16:44 -0800
Subject: [PATCH v12 3/6] Split pgsql_tmp cleanup into two stages.

First, pgsql_tmp directories will be moved to a staging directory
and renamed to prepare them for removal.  Then, all files in these
directories are removed before removing the directories themselves.
This change is being made in preparation for a follow-up change to
offload most temporary file cleanup to the new custodian process.

Note that temporary relation files cannot be cleaned up via the
aforementioned strategy and will not be offloaded to the custodian.

This change also modifies several ereport(LOG, ...) calls within
the temporary file cleanup code to ERROR instead.  While temporary
file cleanup is typically not urgent enough to prevent startup,
excessive lenience might mask bugs.
---
 src/backend/postmaster/postmaster.c |   4 +
 src/backend/storage/file/fd.c       | 215 +++++++++++++++++++++++-----
 src/include/storage/fd.h            |   1 +
 3 files changed, 182 insertions(+), 38 deletions(-)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index faaf13d537..54548e28e9 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -1399,6 +1399,7 @@ PostmasterMain(int argc, char *argv[])
 	 * Remove old temporary files.  At this point there can be no other
 	 * Postgres processes running in this directory, so this should be safe.
 	 */
+	StagePgTempFilesForRemoval();
 	RemovePgTempFiles();
 
 	/*
@@ -4030,7 +4031,10 @@ PostmasterStateMachine(void)
 
 		/* remove leftover temporary files after a crash */
 		if (remove_temp_files_after_crash)
+		{
+			StagePgTempFilesForRemoval();
 			RemovePgTempFiles();
+		}
 
 		/* allow background workers to immediately restart */
 		ResetBackgroundWorkerCrashTimes();
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 0e1398d07c..9610850d45 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -77,6 +77,7 @@
 #include <sys/param.h>
 #include <sys/resource.h>		/* for getrlimit */
 #include <sys/stat.h>
+#include <sys/time.h>
 #include <sys/types.h>
 #ifndef WIN32
 #include <sys/mman.h>
@@ -88,6 +89,7 @@
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "catalog/pg_tablespace.h"
+#include "common/int.h"
 #include "common/file_perm.h"
 #include "common/file_utils.h"
 #include "common/pg_prng.h"
@@ -109,6 +111,8 @@
 #define PG_FLUSH_DATA_WORKS 1
 #endif
 
+#define PG_TEMP_TO_REMOVE_DIR (PG_TEMP_FILES_DIR "_staged_for_removal")
+
 /*
  * We must leave some file descriptors free for system(), the dynamic loader,
  * and other code that tries to open files without consulting fd.c.  This
@@ -335,6 +339,8 @@ static void BeforeShmemExit_Files(int code, Datum arg);
 static void CleanupTempFiles(bool isCommit, bool isProcExit);
 static void RemovePgTempRelationFiles(const char *tsdirname);
 static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
+static void StagePgTempDirForRemoval(const char *tmp_dir);
+static void RemoveStagedPgTempDirs(const char *spc_dir);
 
 static void walkdir(const char *path,
 					void (*action) (const char *fname, bool isdir, int elevel),
@@ -3049,29 +3055,24 @@ CleanupTempFiles(bool isCommit, bool isProcExit)
 		FreeDesc(&allocatedDescs[0]);
 }
 
-
 /*
- * Remove temporary and temporary relation files left over from a prior
- * postmaster session
+ * Stage temporary files left over from a prior postmaster session for removal.
  *
- * This should be called during postmaster startup.  It will forcibly
- * remove any leftover files created by OpenTemporaryFile and any leftover
- * temporary relation files created by mdcreate.
+ * This function also removes any leftover temporary relation files.  Unlike
+ * temporary files stored in pgsql_tmp directories, temporary relation files do
+ * not live in their own directory, so there isn't a tremendously beneficial way
+ * to stage them for removal at a later time.
  *
- * During post-backend-crash restart cycle, this routine is called when
- * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
- * queries are using temp files could result in useless storage usage that can
- * only be reclaimed by a service restart. The argument against enabling it is
- * that someone might want to examine the temporary files for debugging
- * purposes. This does however mean that OpenTemporaryFile had better allow for
- * collision with an existing temp file name.
+ * RemovePgTempFiles() should be called at some point after this function in
+ * order to remove the staged temporary directories.
  *
- * NOTE: this function and its subroutines generally report syscall failures
- * with ereport(LOG) and keep going.  Removing temp files is not so critical
- * that we should fail to start the database when we can't do it.
+ * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
+ * DataDir as well.  However, that is *not* cleaned here because doing so would
+ * create a race condition.  It's done separately, earlier in postmaster
+ * startup.
  */
 void
-RemovePgTempFiles(void)
+StagePgTempFilesForRemoval(void)
 {
 	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
 	DIR		   *spc_dir;
@@ -3081,7 +3082,8 @@ RemovePgTempFiles(void)
 	 * First process temp files in pg_default ($PGDATA/base)
 	 */
 	snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
-	RemovePgTempDir(temp_path, true, false);
+	StagePgTempDirForRemoval(temp_path);
+
 	RemovePgTempRelationFiles("base");
 
 	/*
@@ -3089,7 +3091,7 @@ RemovePgTempFiles(void)
 	 */
 	spc_dir = AllocateDir("pg_tblspc");
 
-	while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
+	while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
 	{
 		if (strcmp(spc_de->d_name, ".") == 0 ||
 			strcmp(spc_de->d_name, "..") == 0)
@@ -3097,7 +3099,7 @@ RemovePgTempFiles(void)
 
 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
-		RemovePgTempDir(temp_path, true, false);
+		StagePgTempDirForRemoval(temp_path);
 
 		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
 				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
@@ -3105,21 +3107,160 @@ RemovePgTempFiles(void)
 	}
 
 	FreeDir(spc_dir);
+}
+
+/*
+ * Remove temporary files that have been previously staged for removal by
+ * StagePgTempFilesForRemoval().
+ */
+void
+RemovePgTempFiles(void)
+{
+	char		temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
+	DIR		   *spc_dir;
+	struct dirent *spc_de;
+
+	/*
+	 * First process temp files in pg_default ($PGDATA/base)
+	 */
+	RemoveStagedPgTempDirs("base");
 
 	/*
-	 * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
-	 * DataDir as well.  However, that is *not* cleaned here because doing so
-	 * would create a race condition.  It's done separately, earlier in
-	 * postmaster startup.
+	 * Cycle through temp directories for all non-default tablespaces.
 	 */
+	spc_dir = AllocateDir("pg_tblspc");
+
+	while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
+	{
+		if (strcmp(spc_de->d_name, ".") == 0 ||
+			strcmp(spc_de->d_name, "..") == 0)
+			continue;
+
+		snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+				 spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+		RemoveStagedPgTempDirs(temp_path);
+	}
+
+	FreeDir(spc_dir);
 }
 
 /*
- * Process one pgsql_tmp directory for RemovePgTempFiles.
+ * StagePgTempDirForRemoval
+ *
+ * This function moves the given directory to a staging directory and renames
+ * it in preparation for removal by a later call to RemoveStagedPgTempDirs().
+ * The current timestamp is appended to the end of the new directory name in
+ * case previously staged pgsql_tmp directories have not yet been removed.
+ */
+static void
+StagePgTempDirForRemoval(const char *tmp_dir)
+{
+	struct stat st;
+	char		stage_path[MAXPGPATH * 2];
+	char		parent_path[MAXPGPATH * 2];
+	char		to_remove_path[MAXPGPATH * 2];
+	struct timeval tv;
+	uint64		epoch;
+
+	/*
+	 * If tmp_dir doesn't exist, there is nothing to stage.
+	 */
+	if (stat(tmp_dir, &st) != 0)
+	{
+		if (errno != ENOENT)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not stat file \"%s\": %m", tmp_dir)));
+		return;
+	}
+	else if (!S_ISDIR(st.st_mode))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("\"%s\" is not a directory", tmp_dir)));
+
+	strlcpy(parent_path, tmp_dir, MAXPGPATH * 2);
+	get_parent_directory(parent_path);
+
+	/*
+	 * get_parent_directory() returns an empty string if the input argument is
+	 * just a file name (see comments in path.c), so handle that as being the
+	 * current directory.
+	 */
+	if (strlen(parent_path) == 0)
+		strlcpy(parent_path, ".", MAXPGPATH * 2);
+
+	/*
+	 * Make sure the pgsql_tmp_staged_for_removal directory exists.
+	 */
+	snprintf(to_remove_path, sizeof(to_remove_path), "%s/%s", parent_path,
+			 PG_TEMP_TO_REMOVE_DIR);
+	if (MakePGDirectory(to_remove_path) != 0 && errno != EEXIST)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not create directory \"%s\": %m",
+						to_remove_path)));
+
+	/*
+	 * Pick a sufficiently unique name for the stage directory.  We just append
+	 * the current timestamp to the end of the name.
+	 */
+	gettimeofday(&tv, NULL);
+	if (pg_mul_u64_overflow((uint64) 1000, (uint64) tv.tv_sec, &epoch) ||
+		pg_add_u64_overflow(epoch, (uint64) tv.tv_usec, &epoch))
+		elog(ERROR, "could not stage temporary file directory for removal");
+
+	snprintf(stage_path, sizeof(stage_path), "%s/%s." UINT64_FORMAT,
+			 to_remove_path, PG_TEMP_FILES_DIR, epoch);
+
+	/*
+	 * Rename the temporary directory.
+	 */
+	if (rename(tmp_dir, stage_path) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not rename directory \"%s\" to \"%s\": %m",
+						tmp_dir, stage_path)));
+}
+
+/*
+ * RemoveStagedPgTempDirs
+ *
+ * This function removes all pgsql_tmp directories that have been staged for
+ * removal by StagePgTempDirForRemoval() in the given tablespace directory.
+ */
+static void
+RemoveStagedPgTempDirs(const char *spc_dir)
+{
+	char		stage_path[MAXPGPATH * 2];
+	char		temp_path[MAXPGPATH * 2];
+	DIR		   *dir;
+	struct dirent *de;
+
+	snprintf(stage_path, sizeof(stage_path), "%s/%s", spc_dir,
+			 PG_TEMP_TO_REMOVE_DIR);
+
+	dir = AllocateDir(stage_path);
+	if (dir == NULL && errno == ENOENT)
+		return;
+
+	while ((de = ReadDir(dir, stage_path)) != NULL)
+	{
+		if (strncmp(de->d_name, PG_TEMP_FILES_DIR,
+					strlen(PG_TEMP_FILES_DIR)) != 0)
+			continue;
+
+		snprintf(temp_path, sizeof(temp_path), "%s/%s", stage_path, de->d_name);
+		RemovePgTempDir(temp_path, true, false);
+	}
+	FreeDir(dir);
+}
+
+/*
+ * Process one pgsql_tmp directory for RemoveStagedPgTempDirs.
  *
  * If missing_ok is true, it's all right for the named directory to not exist.
- * Any other problem results in a LOG message.  (missing_ok should be true at
- * the top level, since pgsql_tmp directories are not created until needed.)
+ * Any other problem results in an ERROR.  (missing_ok should be true at the
+ * top level, since pgsql_tmp directories are not created until needed.)
  *
  * At the top level, this should be called with unlink_all = false, so that
  * only files matching the temporary name prefix will be unlinked.  When
@@ -3141,7 +3282,7 @@ RemovePgTempDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
 	if (temp_dir == NULL && errno == ENOENT && missing_ok)
 		return;
 
-	while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
+	while ((temp_de = ReadDir(temp_dir, tmpdirname)) != NULL)
 	{
 		if (strcmp(temp_de->d_name, ".") == 0 ||
 			strcmp(temp_de->d_name, "..") == 0)
@@ -3155,11 +3296,9 @@ RemovePgTempDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
 					PG_TEMP_FILE_PREFIX,
 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
 		{
-			PGFileType	type = get_dirent_type(rm_path, temp_de, false, LOG);
+			PGFileType	type = get_dirent_type(rm_path, temp_de, false, ERROR);
 
-			if (type == PGFILETYPE_ERROR)
-				continue;
-			else if (type == PGFILETYPE_DIR)
+			if (type == PGFILETYPE_DIR)
 			{
 				/* recursively remove contents, then directory itself */
 				RemovePgTempDir(rm_path, false, true);
@@ -3167,14 +3306,14 @@ RemovePgTempDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
 			else
 			{
 				if (unlink(rm_path) < 0)
-					ereport(LOG,
+					ereport(ERROR,
 							(errcode_for_file_access(),
 							 errmsg("could not remove file \"%s\": %m",
 									rm_path)));
 			}
 		}
 		else
-			ereport(LOG,
+			ereport(ERROR,
 					(errmsg("unexpected file found in temporary-files directory: \"%s\"",
 							rm_path)));
 	}
@@ -3182,7 +3321,7 @@ RemovePgTempDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
 	FreeDir(temp_dir);
 
 	if (rmdir(tmpdirname) < 0)
-		ereport(LOG,
+		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not remove directory \"%s\": %m",
 						tmpdirname)));
@@ -3198,7 +3337,7 @@ RemovePgTempRelationFiles(const char *tsdirname)
 
 	ts_dir = AllocateDir(tsdirname);
 
-	while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
+	while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
 	{
 		/*
 		 * We're only interested in the per-database directories, which have
@@ -3226,7 +3365,7 @@ RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
 
 	dbspace_dir = AllocateDir(dbspacedirname);
 
-	while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
+	while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
 	{
 		if (!looks_like_temp_rel_name(de->d_name))
 			continue;
@@ -3235,7 +3374,7 @@ RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
 				 dbspacedirname, de->d_name);
 
 		if (unlink(rm_path) < 0)
-			ereport(LOG,
+			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not remove file \"%s\": %m",
 							rm_path)));
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 790b9a9a14..cf27e90aea 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -166,6 +166,7 @@ extern Oid	GetNextTempTableSpace(void);
 extern void AtEOXact_Files(bool isCommit);
 extern void AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
 							  SubTransactionId parentSubid);
+extern void StagePgTempFilesForRemoval(void);
 extern void RemovePgTempFiles(void);
 extern void RemovePgTempDir(const char *tmpdirname, bool missing_ok,
 							bool unlink_all);
-- 
2.25.1

