From 6301d334002036aa3ddf97236b69ad58eace9e56 Mon Sep 17 00:00:00 2001 From: Dmitrii Dolgov <9erthalion6@gmail.com> Date: Thu, 18 Dec 2025 18:21:59 +0100 Subject: [PATCH v3] Use open file description locks for lockfiles When starting up, postmaster checks for an existing data directory lockfile. If this file contains current process PID, it's assumed to be stale. Turns out there is another possibility: we might be running in a PID namespace, and there is another postgres running inside another PID namespace using the same data directory. The result is that we don't see another process due to namespace isolation and start concurrently with the other. To prevent such situations, at startup use fcntl to get an exclusive open file description lock for data directory lockfile. Since such locks are associated with open file descriptors, meaning they're not affected by PID namespace isolation. It's a "best effort" locking, intended to work with already existing mechanism, not replace it. This approach was discussed multiple times in the past, and usually was rejected as the main work horse for the data directory lockfile due to: * Portability issues. Open file description lock was a non-POSIX extension in Linux and similar flock is from BSD standard. But looks like everybody agrees that such locks make more sense than a typical advisory locks, and F_OFD_SETLK made its way into POSIX.1 2024 [1]. * Issues with NFS. The current state of things here looks like this: - NFSv3 doesn't implement open file description locks, they're converted to advisory locks instead. Advisory locks are subject to namespace isolation, meaning that processes in different PID namespaces will not see each other advisory lock, and it's still possible to run multiple postgres instances on the same data directory. - NFSv4 uses a lease system for locking, I haven't found any mention of conversion to advisory locks neither in the man page nor in RFC [2]. To summarize, the approach is now considered POSIX and should fix the described problem everywhere, except NFSv3. Use open file description lock for both data directory and socker lockfiles, since both are affected in the same way. [1]: https://pubs.opengroup.org/onlinepubs/9799919799/functions/fcntl.html [2]: https://www.rfc-editor.org/rfc/rfc7530 Reviewed-by: Ilmar Yunusov --- configure | 14 +++ configure.ac | 3 + meson.build | 1 + src/backend/utils/init/miscinit.c | 136 ++++++++++++++++++++++++------ src/include/pg_config.h.in | 4 + src/tools/pgindent/typedefs.list | 1 + 6 files changed, 134 insertions(+), 25 deletions(-) diff --git a/configure b/configure index 5f77f3cac29..15bda6c6413 100755 --- a/configure +++ b/configure @@ -16444,6 +16444,20 @@ cat >>confdefs.h <<_ACEOF _ACEOF +# Linux open file descriptor locks +ac_fn_c_check_decl "$LINENO" "F_OFD_SETLK" "ac_cv_have_decl_F_OFD_SETLK" "#include +" +if test "x$ac_cv_have_decl_F_OFD_SETLK" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_F_OFD_SETLK $ac_have_decl +_ACEOF + + ac_fn_c_check_func "$LINENO" "explicit_bzero" "ac_cv_func_explicit_bzero" if test "x$ac_cv_func_explicit_bzero" = xyes; then : $as_echo "#define HAVE_EXPLICIT_BZERO 1" >>confdefs.h diff --git a/configure.ac b/configure.ac index 61cee42daa7..e24cb7a6f01 100644 --- a/configure.ac +++ b/configure.ac @@ -1913,6 +1913,9 @@ AC_CHECK_DECLS([memset_s], [], [], [#define __STDC_WANT_LIB_EXT1__ 1 # This is probably only present on macOS, but may as well check always AC_CHECK_DECLS(F_FULLFSYNC, [], [], [#include ]) +# Linux open file descriptor locks +AC_CHECK_DECLS([F_OFD_SETLK], [], [], [#include ]) + AC_REPLACE_FUNCS(m4_normalize([ explicit_bzero getopt diff --git a/meson.build b/meson.build index 568e0e150bf..3d641dc0403 100644 --- a/meson.build +++ b/meson.build @@ -2901,6 +2901,7 @@ decl_checks = [ ['strlcpy', 'string.h'], ['strsep', 'string.h'], ['timingsafe_bcmp', 'string.h'], + ['F_OFD_SETLK', 'fcntl.h'], ] # Need to check for function declarations for these functions, because diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 7ffc808073a..26c3324542c 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -69,6 +69,15 @@ static List *lock_files = NIL; static Latch LocalLatchData; +typedef struct +{ + /* LockFile name. */ + const char *filename; + + /* File descriptor for open file description lock. */ + int fd; +} LockFile; + /* ---------------------------------------------------------------- * ignoring system indexes support stuff * @@ -1119,6 +1128,48 @@ RestoreClientConnectionInfo(char *conninfo) *------------------------------------------------------------------------- */ +/* + * OFD lock the specified lockfile. + * + * Lock the lockfile with an open file description lock. If the lock is already + * taken, it's a hard stop. It's only a best effort test, and any other errors + * are ignored. On succes the file descriptor is duplicated, to make sure there + * will be at least one open copy of it to keep the lock. + * + * filename argument is used only for reporting purposes. + */ +static int +OFDLockFile(int fd, const char *filename) +{ +#if HAVE_DECL_F_OFD_SETLK + struct flock lock; + + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + lock.l_pid = 0; + + if (fcntl(fd, F_OFD_SETLK, &lock) == -1) + { + if (errno == EAGAIN) + ereport(FATAL, + (errcode(ERRCODE_LOCK_FILE_EXISTS), + errmsg("cannot lock the lock file \"%s\"", filename), + errhint("Another server is starting."))); + else + { + elog(WARNING, "Failed locking file \"%s\", %m", filename); + return -1; + } + } + else + return dup(fd); +#else + return -1 +#endif +} + /* * proc_exit callback to remove lockfiles. */ @@ -1129,9 +1180,16 @@ UnlinkLockFiles(int status, Datum arg) foreach(l, lock_files) { - char *curfile = (char *) lfirst(l); + LockFile *lock_file = (LockFile *) lfirst(l); - unlink(curfile); + /* + * Close the file descriptor, which keeps the open file description + * lock. + */ + if (lock_file->fd > 0) + close(lock_file->fd); + + unlink(lock_file->filename); /* Should we complain if the unlink fails? */ } /* Since we're about to exit, no need to reclaim storage */ @@ -1161,7 +1219,9 @@ CreateLockFile(const char *filename, bool amPostmaster, const char *socketDir, bool isDDLock, const char *refName) { - int fd; + int fd, + flock_fd = -1; + LockFile *lock_file; char buffer[MAXPGPATH * 2 + 256]; int ntries; int len; @@ -1173,22 +1233,32 @@ CreateLockFile(const char *filename, bool amPostmaster, const char *envvar; /* - * If the PID in the lockfile is our own PID or our parent's or - * grandparent's PID, then the file must be stale (probably left over from - * a previous system boot cycle). We need to check this because of the - * likelihood that a reboot will assign exactly the same PID as we had in - * the previous reboot, or one that's only one or two counts larger and - * hence the lockfile's PID now refers to an ancestor shell process. We - * allow pg_ctl to pass down its parent shell PID (our grandparent PID) - * via the environment variable PG_GRANDPARENT_PID; this is so that - * launching the postmaster via pg_ctl can be just as reliable as - * launching it directly. There is no provision for detecting - * further-removed ancestor processes, but if the init script is written - * carefully then all but the immediate parent shell will be root-owned - * processes and so the kill test will fail with EPERM. Note that we - * cannot get a false negative this way, because an existing postmaster - * would surely never launch a competing postmaster or pg_ctl process - * directly. + * If we find an already existing lockfile containing our own PID, there + * are few options: + * + * - There is another process, that we don't see due to PID namespace + * isolation, which is already running in this data directory. + * + * To prevent two concurrent processes working with the same data + * directory, we first try to lock the lockfile exclusively. + * + * - The file must be stale, probably left over from a previous system + * boot cycle. The same if the lockfile contains our parent's or + * grandparent's PID. + * + * We need to check this because of the likelihood that a reboot will + * assign exactly the same PID as we had in the previous reboot, or one + * that's only one or two counts larger and hence the lockfile's PID now + * refers to an ancestor shell process. We allow pg_ctl to pass down its + * parent shell PID (our grandparent PID) via the environment variable + * PG_GRANDPARENT_PID; this is so that launching the postmaster via pg_ctl + * can be just as reliable as launching it directly. There is no + * provision for detecting further-removed ancestor processes, but if the + * init script is written carefully then all but the immediate parent + * shell will be root-owned processes and so the kill test will fail with + * EPERM. Note that we cannot get a false negative this way, because an + * existing postmaster would surely never launch a competing postmaster or + * pg_ctl process directly. */ my_pid = getpid(); @@ -1224,7 +1294,11 @@ CreateLockFile(const char *filename, bool amPostmaster, */ fd = open(filename, O_RDWR | O_CREAT | O_EXCL, pg_file_create_mode); if (fd >= 0) - break; /* Success; exit the retry loop */ + { + /* Success; lock and exit the retry loop */ + flock_fd = OFDLockFile(fd, filename); + break; + } /* * Couldn't create the pid file. Probably it already exists. @@ -1238,8 +1312,12 @@ CreateLockFile(const char *filename, bool amPostmaster, /* * Read the file to get the old owner's PID. Note race condition * here: file might have been deleted since we tried to create it. + * + * We're going to use the same fd for flock, and want to create a + * write lock for the latter one. Since both fd and the lock have to + * be of the same type, open the file for read and write. */ - fd = open(filename, O_RDONLY, pg_file_create_mode); + fd = open(filename, O_RDWR, pg_file_create_mode); if (fd < 0) { if (errno == ENOENT) @@ -1249,6 +1327,10 @@ CreateLockFile(const char *filename, bool amPostmaster, errmsg("could not open lock file \"%s\": %m", filename))); } + + /* Try to lock the file. We stop here, if it's already locked. */ + flock_fd = OFDLockFile(fd, filename); + pgstat_report_wait_start(WAIT_EVENT_LOCK_FILE_CREATE_READ); if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0) ereport(FATAL, @@ -1448,7 +1530,11 @@ CreateLockFile(const char *filename, bool amPostmaster, * Use lcons so that the lock files are unlinked in reverse order of * creation; this is critical! */ - lock_files = lcons(pstrdup(filename), lock_files); + lock_file = palloc0_object(LockFile); + lock_file->filename = pstrdup(filename); + lock_file->fd = flock_fd; + + lock_files = lcons(lock_file, lock_files); } /* @@ -1495,14 +1581,14 @@ TouchSocketLockFiles(void) foreach(l, lock_files) { - char *socketLockFile = (char *) lfirst(l); + LockFile *lock_file = (LockFile *) lfirst(l); /* No need to touch the data directory lock file, we trust */ - if (strcmp(socketLockFile, DIRECTORY_LOCK_FILE) == 0) + if (strcmp(lock_file->filename, DIRECTORY_LOCK_FILE) == 0) continue; /* we just ignore any error here */ - (void) utime(socketLockFile, NULL); + (void) utime(lock_file->filename, NULL); } } diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 4f8113c144b..cc38c06dc13 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -85,6 +85,10 @@ don't. */ #undef HAVE_DECL_F_FULLFSYNC +/* Define to 1 if you have the declaration of `F_OFD_SETLK', and to 0 if you + don't. */ +#undef HAVE_DECL_F_OFD_SETLK + /* Define to 1 if you have the declaration of `memset_s', and to 0 if you don't. */ #undef HAVE_DECL_MEMSET_S diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 1969d467c1d..185d69b5520 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1653,6 +1653,7 @@ LocationLen LockAcquireResult LockClauseStrength LockData +LockFile LockInfoData LockInstanceData LockMethod base-commit: 031904048aa22e7c70dc8e9c170e2743f9b0f090 -- 2.52.0