From 8b7285e61e425fad4d07a011efe2ccd7fd280d54 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 18 Oct 2018 23:09:59 +0200 Subject: [PATCH v6] pg_upgrade: Allow use of file cloning Add another transfer mode --clone to pg_upgrade (besides the existing --link and the default copy), using special file cloning calls. This makes the file transfer faster and more space efficient, achieving speed similar to --link mode without the associated drawbacks. On Linux, file cloning is supported on Btrfs and XFS (if formatted with reflink support). On macOS, file cloning is supported on APFS. --- configure | 2 +- configure.in | 1 + doc/src/sgml/ref/pgupgrade.sgml | 37 ++++++++++--- src/bin/pg_upgrade/check.c | 13 ++++- src/bin/pg_upgrade/file.c | 90 ++++++++++++++++++++++++++++++++ src/bin/pg_upgrade/option.c | 8 +++ src/bin/pg_upgrade/pg_upgrade.h | 6 ++- src/bin/pg_upgrade/relfilenode.c | 44 ++++++++++------ src/include/pg_config.h.in | 3 ++ 9 files changed, 179 insertions(+), 25 deletions(-) diff --git a/configure b/configure index 43ae8c869d..5a7aa02b27 100755 --- a/configure +++ b/configure @@ -15130,7 +15130,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt clock_gettime fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul symlink sync_file_range utime utimes wcstombs_l +for ac_func in cbrt clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open strchrnul symlink sync_file_range utime utimes wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.in b/configure.in index 519ecd5e1e..7f22bcee75 100644 --- a/configure.in +++ 
b/configure.in @@ -1602,6 +1602,7 @@ LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` AC_CHECK_FUNCS(m4_normalize([ cbrt clock_gettime + copyfile fdatasync getifaddrs getpeerucred diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index d51146d641..e616ea9dbc 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -182,6 +182,25 @@ Options display version information, then exit + <varlistentry> + <term><option>--clone</option></term> + <listitem> + <para> + Use efficient file cloning (also known as reflinks) + instead of copying files to the new cluster. This can result in + near-instantaneous copying of the data files, giving the speed + advantages of <option>-k</option>/<option>--link</option> while + leaving the old cluster untouched. + </para> + + <para> + At present, reflinks are supported on Linux (kernel 4.5 or later) with + Btrfs and XFS (on file systems created with reflink support, which is + not the default for XFS at this writing), and on macOS with APFS. + </para> + </listitem> + </varlistentry> + + @@ -340,7 +359,7 @@ Run <application>pg_upgrade</application> Always run the pg_upgrade binary of the new server, not the old one. pg_upgrade requires the specification of the old and new cluster's data and executable (bin) directories. You can also specify - user and port values, and whether you want the data files linked + user and port values, and whether you want the data files linked or cloned instead of the default copy behavior. @@ -351,8 +370,12 @@ Run <application>pg_upgrade</application> once you start the new cluster after the upgrade. Link mode also requires that the old and new cluster data directories be in the same file system. (Tablespaces and pg_wal can be on - different file systems.) See pg_upgrade --help for a full - list of options. + different file systems.) + The clone mode provides the same speed and disk space advantages but will + not leave the old cluster unusable after the upgrade. The clone mode + also requires that the old and new data directories be in the same file + system. 
The clone mode is only available on certain operating systems + and file systems. @@ -388,8 +411,9 @@ Run <application>pg_upgrade</application> to perform only the checks, even if the old server is still running. pg_upgrade --check will also outline any manual adjustments you will need to make after the upgrade. If you - are going to be using link mode, you should use the <option>--link</option> - option with <option>--check</option> to enable link-mode-specific checks. + are going to be using link or clone mode, you should use the option <option>--link</option> + or <option>--clone</option> with <option>--check</option> + to enable mode-specific checks. pg_upgrade requires write permission in the current directory. @@ -722,7 +746,8 @@ Notes If you want to use link mode and you do not want your old cluster - to be modified when the new cluster is started, make a copy of the + to be modified when the new cluster is started, consider using the clone mode. + If that is not available, make a copy of the old cluster and upgrade that in link mode. To make a valid copy of the old cluster, use rsync to create a dirty copy of the old cluster while the server is running, then shut down diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 5a78d603dc..555e5dcbba 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -149,8 +149,17 @@ check_new_cluster(void) check_loadable_libraries(); - if (user_opts.transfer_mode == TRANSFER_MODE_LINK) - check_hard_link(); + switch (user_opts.transfer_mode) + { + case TRANSFER_MODE_CLONE: + check_file_clone(); + break; + case TRANSFER_MODE_COPY: + break; + case TRANSFER_MODE_LINK: + check_hard_link(); + break; + } check_is_install_user(&new_cluster); diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index c27cc93dc2..244dd4d88b 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -18,6 +18,13 @@ #include <sys/stat.h> #include <fcntl.h> +#ifdef HAVE_COPYFILE +#include <copyfile.h> +#endif +#ifdef __linux__ +#include <sys/ioctl.h> +#include <linux/fs.h> +#endif #ifdef WIN32 @@ -25,6 +32,47 @@ static int win32_pghardlink(const char *src, const char 
*dst); #endif +/* + * cloneFile() + * + * Clones/reflinks a relation file from src to dst. + * + * schemaName/relName are relation's SQL name (used for error messages only). + */ +void +cloneFile(const char *src, const char *dst, + const char *schemaName, const char *relName) +{ +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0) + pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); +#elif defined(__linux__) && defined(FICLONE) + int src_fd; + int dest_fd; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s\n", + schemaName, relName, src, strerror(errno)); + + if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s\n", + schemaName, relName, dst, strerror(errno)); + + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + { + int save_errno = errno; /* unlink() below may overwrite errno; keep the ioctl's error */ + unlink(dst); + pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", schemaName, relName, src, dst, strerror(save_errno)); + } + + close(src_fd); + close(dest_fd); +#endif +} + + /* * copyFile() * @@ -270,6 +318,48 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, close(src_fd); } +void +check_file_clone(void) +{ + char existing_file[MAXPGPATH]; + char new_link_file[MAXPGPATH]; + + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); + snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.clonetest", new_cluster.pgdata); + unlink(new_link_file); /* might fail */ + +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0) + pg_fatal("could not clone file between old and new data directories: %s\n", + strerror(errno)); +#elif defined(__linux__) && 
defined(FICLONE) + { + int src_fd; + int dest_fd; + + if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %s\n", + existing_file, strerror(errno)); + + if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not create file \"%s\": %s\n", + new_link_file, strerror(errno)); + + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + pg_fatal("could not clone file between old and new data directories: %s\n", + strerror(errno)); + + close(src_fd); + close(dest_fd); + } +#else + pg_fatal("file cloning not supported on this platform\n"); +#endif + + unlink(new_link_file); +} + void check_hard_link(void) { diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 9dbc9225a6..eb3b9f603d 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -53,6 +53,9 @@ parseCommandLine(int argc, char *argv[]) {"retain", no_argument, NULL, 'r'}, {"jobs", required_argument, NULL, 'j'}, {"verbose", no_argument, NULL, 'v'}, + + {"clone", no_argument, NULL, 1}, + {NULL, 0, NULL, 0} }; int option; /* Command line option */ @@ -203,6 +206,10 @@ parseCommandLine(int argc, char *argv[]) log_opts.verbose = true; break; + case 1: + user_opts.transfer_mode = TRANSFER_MODE_CLONE; + break; + default: pg_fatal("Try \"%s --help\" for more information.\n", os_info.progname); @@ -293,6 +300,7 @@ usage(void) printf(_(" -U, --username=NAME cluster superuser (default \"%s\")\n"), os_info.user); printf(_(" -v, --verbose enable verbose internal logging\n")); printf(_(" -V, --version display version information, then exit\n")); + printf(_(" --clone clone instead of copying files to new cluster\n")); printf(_(" -?, --help show this help, then exit\n")); printf(_("\n" "Before running pg_upgrade you must:\n" diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index f83a3eeb67..51bd211d46 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ 
b/src/bin/pg_upgrade/pg_upgrade.h @@ -230,10 +230,11 @@ typedef struct } ControlData; /* - * Enumeration to denote link modes + * Enumeration to denote transfer modes */ typedef enum { + TRANSFER_MODE_CLONE, TRANSFER_MODE_COPY, TRANSFER_MODE_LINK } transferMode; @@ -372,12 +373,15 @@ bool pid_lock_file_exists(const char *datadir); /* file.c */ +void cloneFile(const char *src, const char *dst, + const char *schemaName, const char *relName); void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName); void linkFile(const char *src, const char *dst, const char *schemaName, const char *relName); void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName); +void check_file_clone(void); void check_hard_link(void); /* fopen_priv() is no longer different from fopen() */ diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index ed604f26ca..3b16c92a02 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -30,10 +30,18 @@ void transfer_all_new_tablespaces(DbInfoArr *old_db_arr, DbInfoArr *new_db_arr, char *old_pgdata, char *new_pgdata) { - if (user_opts.transfer_mode == TRANSFER_MODE_LINK) - pg_log(PG_REPORT, "Linking user relation files\n"); - else - pg_log(PG_REPORT, "Copying user relation files\n"); + switch (user_opts.transfer_mode) + { + case TRANSFER_MODE_CLONE: + pg_log(PG_REPORT, "Cloning user relation files\n"); + break; + case TRANSFER_MODE_COPY: + pg_log(PG_REPORT, "Copying user relation files\n"); + break; + case TRANSFER_MODE_LINK: + pg_log(PG_REPORT, "Linking user relation files\n"); + break; + } /* * Transferring files by tablespace is tricky because a single database @@ -250,17 +258,23 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro old_file, new_file); rewriteVisibilityMap(old_file, new_file, map->nspname, map->relname); } - else if (user_opts.transfer_mode == 
TRANSFER_MODE_COPY) - { - pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", - old_file, new_file); - copyFile(old_file, new_file, map->nspname, map->relname); - } else - { - pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", - old_file, new_file); - linkFile(old_file, new_file, map->nspname, map->relname); - } + switch (user_opts.transfer_mode) + { + case TRANSFER_MODE_CLONE: + pg_log(PG_VERBOSE, "cloning \"%s\" to \"%s\"\n", + old_file, new_file); + cloneFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_COPY: + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", + old_file, new_file); + copyFile(old_file, new_file, map->nspname, map->relname); + break; + case TRANSFER_MODE_LINK: + pg_log(PG_VERBOSE, "linking \"%s\" to \"%s\"\n", + old_file, new_file); + linkFile(old_file, new_file, map->nspname, map->relname); + } } } diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 9798bd24b4..d2a356a356 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -114,6 +114,9 @@ /* Define to 1 if your compiler handles computed gotos. */ #undef HAVE_COMPUTED_GOTO +/* Define to 1 if you have the `copyfile' function. */ +#undef HAVE_COPYFILE + /* Define to 1 if you have the header file. */ #undef HAVE_CRTDEFS_H base-commit: e74dd00f53cd6dc1887f76b9672e5f6dcf0fd8a2 -- 2.19.1