From ee18ce65f9e89a2319fd137a4ac65deb29064a84 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Thu, 18 Dec 2025 18:21:59 +0100 Subject: [PATCH v1] Use open file description locks for data directory lockfile When starting up, postmaster checks for an existing data directory lockfile. If this file contains current process PID, it's assumed to be stale. Turns out there is another possibility: we might be running in a PID namespace, and there is another postgres running inside another PID namespace using the same data directory. The result is that we don't see another process due to namespace isolation and start concurrently with the other. To prevent such situations, at startup use fcntl to get an exclusive open file description lock for data directory lockfile. Since such locks are associated with open file descriptors, meaning they're not affected by PID namespace isolation. It's a "best effort" locking, intended to work with already existing mechanism, not replace it. This approach was discussed multiple times in the past, and usually was rejected as the main work horse for the data directory lockfile due to: * Portability issues. Open file description lock was a non-POSIX extension in Linux and similar flock is from BSD standard. But looks like everybody agrees that such locks make more sense than a typical advisory locks, and F_OFD_SETLK made its way into POSIX.1 2024 [1]. * Issues with NFS. The current state of things here looks like this: - NFSv3 doesn't implement open file description locks, they're converted to advisory locks instead. Advisory locks are subject to namespace isolation, meaning that processes in different PID namespaces will not see each other advisory lock, and it's still possible to run multiple postgres instances on the same data directory. - NFSv4 uses a lease system for locking, I haven't found any mention of conversion to advisory locks neither in the man page nor in RFC [2]. To summarize, the approach is now considered POSIX and should fix the described problem everywhere, except NFSv3. [1]: https://pubs.opengroup.org/onlinepubs/9799919799/functions/fcntl.html [2]: https://www.rfc-editor.org/rfc/rfc7530 --- configure | 14 ++++ configure.ac | 3 + meson.build | 1 + src/backend/utils/init/miscinit.c | 107 +++++++++++++++++++++++++----- src/include/pg_config.h.in | 4 ++ 5 files changed, 111 insertions(+), 18 deletions(-) diff --git a/configure b/configure index 14ad0a5006f..b176ac39799 100755 --- a/configure +++ b/configure @@ -16177,6 +16177,20 @@ cat >>confdefs.h <<_ACEOF _ACEOF +# Linux open file descriptor locks +ac_fn_c_check_decl "$LINENO" "F_OFD_SETLK" "ac_cv_have_decl_F_OFD_SETLK" "#include +" +if test "x$ac_cv_have_decl_F_OFD_SETLK" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_F_OFD_SETLK $ac_have_decl +_ACEOF + + ac_fn_c_check_func "$LINENO" "explicit_bzero" "ac_cv_func_explicit_bzero" if test "x$ac_cv_func_explicit_bzero" = xyes; then : $as_echo "#define HAVE_EXPLICIT_BZERO 1" >>confdefs.h diff --git a/configure.ac b/configure.ac index 01b3bbc1be8..d6cf1f27771 100644 --- a/configure.ac +++ b/configure.ac @@ -1838,6 +1838,9 @@ AC_CHECK_DECLS([memset_s], [], [], [#define __STDC_WANT_LIB_EXT1__ 1 # This is probably only present on macOS, but may as well check always AC_CHECK_DECLS(F_FULLFSYNC, [], [], [#include ]) +# Linux open file descriptor locks +AC_CHECK_DECLS([F_OFD_SETLK], [], [], [#include ]) + AC_REPLACE_FUNCS(m4_normalize([ explicit_bzero getopt diff --git a/meson.build b/meson.build index d7c5193d4ce..ad9ecad829f 100644 --- a/meson.build +++ b/meson.build @@ -2667,6 +2667,7 @@ decl_checks = [ ['strnlen', 'string.h'], ['strsep', 'string.h'], ['timingsafe_bcmp', 'string.h'], + ['F_OFD_SETLK', 'fcntl.h'], ] # Need to check for function declarations for these functions, because diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index fec79992c8d..78bb7df543e 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -68,6 +68,11 @@ static List *lock_files = NIL; static Latch LocalLatchData; +#ifdef HAVE_DECL_F_OFD_SETLK +/* File descriptor for data directory lock file. */ +static int DataDirLockFD; +#endif + /* ---------------------------------------------------------------- * ignoring system indexes support stuff * @@ -1117,6 +1122,45 @@ RestoreClientConnectionInfo(char *conninfo) *------------------------------------------------------------------------- */ +/* + * Flock the data directory lockfile. + * + * Lock the data directory lockfile with an open file description lock. If the + * lock is already taken, it's a hard stop. It's only a best effort test, and + * any other errors are ignored. On succes the file descriptor is duplicated, + * to make sure there will be at least one open copy of it to keep the lock. + * + * filename is used only for reporting purposes. + */ +static void +FlockDataDirLockFile(int fd, const char *filename) +{ + +#ifdef HAVE_DECL_F_OFD_SETLK + struct flock lock; + + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; + + if (fcntl(fd, F_OFD_SETLK, &lock) == -1) + { + if (errno == EAGAIN) + ereport(FATAL, + (errcode(ERRCODE_LOCK_FILE_EXISTS), + errmsg("cannot lock the lock file \"%s\"", filename), + errhint("Another server is starting."))); + else + elog(WARNING, "Failed locking file \"%s\", %m", filename); + } + else + DataDirLockFD = dup(fd); +#endif + +} + /* * proc_exit callback to remove lockfiles. */ @@ -1125,6 +1169,11 @@ UnlinkLockFiles(int status, Datum arg) { ListCell *l; +#ifdef HAVE_DECL_F_OFD_SETLK + /* Close the file descriptor, which keeps the open file description lock */ + close(DataDirLockFD); +#endif + foreach(l, lock_files) { char *curfile = (char *) lfirst(l); @@ -1171,22 +1220,32 @@ CreateLockFile(const char *filename, bool amPostmaster, const char *envvar; /* - * If the PID in the lockfile is our own PID or our parent's or - * grandparent's PID, then the file must be stale (probably left over from - * a previous system boot cycle). We need to check this because of the - * likelihood that a reboot will assign exactly the same PID as we had in - * the previous reboot, or one that's only one or two counts larger and - * hence the lockfile's PID now refers to an ancestor shell process. We - * allow pg_ctl to pass down its parent shell PID (our grandparent PID) - * via the environment variable PG_GRANDPARENT_PID; this is so that - * launching the postmaster via pg_ctl can be just as reliable as - * launching it directly. There is no provision for detecting - * further-removed ancestor processes, but if the init script is written - * carefully then all but the immediate parent shell will be root-owned - * processes and so the kill test will fail with EPERM. Note that we - * cannot get a false negative this way, because an existing postmaster - * would surely never launch a competing postmaster or pg_ctl process - * directly. + * If we find an already existing lockfile containing our own PID, + * there are few options: + * + * - There is another process, that we don't see due to PID namespace + * isolation, which is already running in this data directory. + * + * To prevent two concurrent processes working with the same data + * directory, we first try to lock the lockfile exclusively. + * + * - The file must be stale, probably left over from a previous system boot + * cycle. The same if the lockfile contains our parent's or grandparent's + * PID. + * + * We need to check this because of the likelihood that a reboot will + * assign exactly the same PID as we had in the previous reboot, or one + * that's only one or two counts larger and hence the lockfile's PID now + * refers to an ancestor shell process. We allow pg_ctl to pass down its + * parent shell PID (our grandparent PID) via the environment variable + * PG_GRANDPARENT_PID; this is so that launching the postmaster via + * pg_ctl can be just as reliable as launching it directly. There is no + * provision for detecting further-removed ancestor processes, but if the + * init script is written carefully then all but the immediate parent + * shell will be root-owned processes and so the kill test will fail with + * EPERM. Note that we cannot get a false negative this way, because an + * existing postmaster would surely never launch a competing postmaster + * or pg_ctl process directly. */ my_pid = getpid(); @@ -1222,7 +1281,11 @@ CreateLockFile(const char *filename, bool amPostmaster, */ fd = open(filename, O_RDWR | O_CREAT | O_EXCL, pg_file_create_mode); if (fd >= 0) - break; /* Success; exit the retry loop */ + { + /* Success; lock and exit the retry loop */ + FlockDataDirLockFile(fd, filename); + break; + } /* * Couldn't create the pid file. Probably it already exists. @@ -1236,8 +1299,12 @@ CreateLockFile(const char *filename, bool amPostmaster, /* * Read the file to get the old owner's PID. Note race condition * here: file might have been deleted since we tried to create it. + * + * We're going to use the same fd for flock, and want to create a write + * lock for the latter one. Since both fd and the lock have to be of + * the same type, open the file for read and write. */ - fd = open(filename, O_RDONLY, pg_file_create_mode); + fd = open(filename, O_RDWR, pg_file_create_mode); if (fd < 0) { if (errno == ENOENT) @@ -1247,6 +1314,10 @@ CreateLockFile(const char *filename, bool amPostmaster, errmsg("could not open lock file \"%s\": %m", filename))); } + + /* Try to lock the file. We stop here, if it's already locked. */ + FlockDataDirLockFile(fd, filename); + pgstat_report_wait_start(WAIT_EVENT_LOCK_FILE_CREATE_READ); if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0) ereport(FATAL, diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 92fcc5f3063..c19a50f108e 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -83,6 +83,10 @@ don't. */ #undef HAVE_DECL_F_FULLFSYNC +/* Define to 1 if you have the declaration of `F_OFD_SETLK', and to 0 if you + don't. */ +#undef HAVE_DECL_F_OFD_SETLK + /* Define to 1 if you have the declaration of `LLVMCreateGDBRegistrationListener', and to 0 if you don't. */ #undef HAVE_DECL_LLVMCREATEGDBREGISTRATIONLISTENER base-commit: 44f49511b7940adf3be4337d4feb2de38fe92297 -- 2.49.0