From 9a58fc2589e50d69b4b158ea5e8f3898483290d0 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Thu, 27 Sep 2018 22:42:33 +0200 Subject: [PATCH v5] pg_upgrade: Allow use of file cloning For file copying in pg_upgrade, allow using special file cloning calls if available. This makes the copying faster and more space efficient. This achieves speed similar to --link mode without the associated drawbacks. Add an option --reflink to select whether file cloning is turned on, off, or automatic. Automatic is the default. On Linux, file cloning is supported on Btrfs and XFS (if formatted with reflink support). On macOS, file cloning is supported on APFS. --- configure | 2 +- configure.in | 2 +- doc/src/sgml/ref/pgupgrade.sgml | 33 +++++++++ src/bin/pg_upgrade/check.c | 2 + src/bin/pg_upgrade/file.c | 123 +++++++++++++++++++++++++++++++ src/bin/pg_upgrade/option.c | 14 ++++ src/bin/pg_upgrade/pg_upgrade.h | 15 ++++ src/bin/pg_upgrade/relfilenode.c | 31 +++++++- src/include/pg_config.h.in | 3 + 9 files changed, 220 insertions(+), 5 deletions(-) diff --git a/configure b/configure index 6414ec1ea6..ae6f1a2e17 100755 --- a/configure +++ b/configure @@ -15100,7 +15100,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt clock_gettime fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open symlink sync_file_range utime utimes wcstombs_l +for ac_func in cbrt clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open symlink sync_file_range utime utimes wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.in b/configure.in index 158d5a1ac8..265faf1b99 100644 --- 
a/configure.in +++ b/configure.in @@ -1571,7 +1571,7 @@ PGAC_FUNC_WCSTOMBS_L LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -AC_CHECK_FUNCS([cbrt clock_gettime fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) +AC_CHECK_FUNCS([cbrt clock_gettime copyfile fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate ppoll pstat pthread_is_threaded_np readlink setproctitle setproctitle_fast setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index d51146d641..d994218c44 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -182,6 +182,39 @@ Options display version information, then exit + <varlistentry> + <term><option>--reflink</option><literal>={always|auto|never}</literal></term> + <listitem> + <para> + Determines whether <application>pg_upgrade</application>, when in copy + mode, should use efficient file cloning (also known as + <quote>reflinks</quote>) on some operating systems and file systems. + This can result in near-instantaneous copying of the data files, + giving the speed advantages of + <option>-k</option>/<option>--link</option> while leaving the old + cluster untouched. + </para> + + <para> + The setting <literal>always</literal> requires the use of reflinks. If + they are not supported, the <application>pg_upgrade</application> run + will abort. Use this in production to limit the upgrade run time. + The setting <literal>auto</literal> uses reflinks when available, + otherwise it falls back to a normal copy. This is the default. The + setting <literal>never</literal> prevents use of reflinks and always + uses a normal copy. This can be useful to ensure that the upgraded + cluster has its disk space fully allocated and not shared with the old + cluster. 
+ </para> + + <para> + At present, reflinks are supported on Linux (kernel 4.5 or later) with + Btrfs and XFS (on file systems created with reflink support, which is + not the default for XFS at this writing), and on macOS with APFS. + </para> + </listitem> + </varlistentry> + diff --git a/src/bin/pg_upgrade/check.c b/src/bin/pg_upgrade/check.c index 5a78d603dc..eb1f18180a 100644 --- a/src/bin/pg_upgrade/check.c +++ b/src/bin/pg_upgrade/check.c @@ -151,6 +151,8 @@ check_new_cluster(void) if (user_opts.transfer_mode == TRANSFER_MODE_LINK) check_hard_link(); + else if (user_opts.transfer_mode == TRANSFER_MODE_COPY && user_opts.reflink_mode != REFLINK_NEVER) + check_reflink(); check_is_install_user(&new_cluster); diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index c27cc93dc2..2e864cd6bb 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -18,6 +18,13 @@ #include <sys/stat.h> #include <fcntl.h> +#ifdef HAVE_COPYFILE +#include <copyfile.h> +#endif +#ifdef __linux__ +#include <sys/ioctl.h> +#include <linux/fs.h> +#endif #ifdef WIN32 @@ -93,6 +100,68 @@ copyFile(const char *src, const char *dst, #endif /* WIN32 */ } +/* + * cloneFile() + * + * Clones/reflinks a relation file from src to dst. + * + * schemaName/relName are relation's SQL name (used for error messages only). + * + * If unsupported_ok is true, then if the cloning fails because the OS or file + * system don't support it, don't error, instead return false. Otherwise, + * true is returned. Based on this, the caller can then try to call + * copyFile() instead, for example. 
+ */ +bool +cloneFile(const char *src, const char *dst, + const char *schemaName, const char *relName, + bool unsupported_ok) +{ +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(src, dst, NULL, COPYFILE_CLONE_FORCE) < 0) + { + if (unsupported_ok && (errno == ENOTSUP || errno == EXDEV)) /* EXDEV: src and dst on different file systems */ + return false; + else + pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); + } + return true; +#elif defined(__linux__) && defined(FICLONE) + int src_fd; + int dest_fd; + + if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("error while cloning relation \"%s.%s\": could not open file \"%s\": %s\n", + schemaName, relName, src, strerror(errno)); + + if ((dest_fd = open(dst, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("error while cloning relation \"%s.%s\": could not create file \"%s\": %s\n", + schemaName, relName, dst, strerror(errno)); + + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + { + int save_errno = errno; /* unlink() below may clobber errno */ + unlink(dst); /* remove the empty file so a fallback copy can create it */ + if (unsupported_ok && (save_errno == EOPNOTSUPP || save_errno == EXDEV)) /* EXDEV: different file systems */ + { + close(src_fd); + close(dest_fd); + return false; + } + pg_fatal("error while cloning relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(save_errno)); + } + + close(src_fd); + close(dest_fd); + return true; +#else + return false; +#endif +} + /* * linkFile() @@ -270,6 +339,60 @@ rewriteVisibilityMap(const char *fromfile, const char *tofile, close(src_fd); } +void +check_reflink(void) +{ + char existing_file[MAXPGPATH]; + char new_link_file[MAXPGPATH]; + + snprintf(existing_file, sizeof(existing_file), "%s/PG_VERSION", old_cluster.pgdata); + snprintf(new_link_file, sizeof(new_link_file), "%s/PG_VERSION.reflinktest", new_cluster.pgdata); + unlink(new_link_file); /* might fail */ + +#if defined(HAVE_COPYFILE) && defined(COPYFILE_CLONE_FORCE) + if (copyfile(existing_file, new_link_file, NULL, COPYFILE_CLONE_FORCE) < 0) + { + if (user_opts.reflink_mode == REFLINK_ALWAYS) + 
pg_fatal("could not clone file between old and new data directories: %s\n", + strerror(errno)); + else if (user_opts.check) + pg_log(PG_REPORT, "could not clone file between old and new data directories: %s\n", + strerror(errno)); + } +#elif defined(__linux__) && defined(FICLONE) + { + int src_fd; + int dest_fd; + + if ((src_fd = open(existing_file, O_RDONLY | PG_BINARY, 0)) < 0) + pg_fatal("could not open file \"%s\": %s\n", + existing_file, strerror(errno)); + + if ((dest_fd = open(new_link_file, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + pg_file_create_mode)) < 0) + pg_fatal("could not create file \"%s\": %s\n", + new_link_file, strerror(errno)); + + if (ioctl(dest_fd, FICLONE, src_fd) < 0) + { + if (user_opts.reflink_mode == REFLINK_ALWAYS) + pg_fatal("could not clone file between old and new data directories: %s\n", + strerror(errno)); + else if (user_opts.check) + pg_log(PG_REPORT, "could not clone file between old and new data directories: %s\n", + strerror(errno)); + } + + close(src_fd); + close(dest_fd); + } +#else + if (user_opts.reflink_mode == REFLINK_ALWAYS) /* in auto mode, silently fall back to copying */ + pg_fatal("file cloning not supported on this platform\n"); +#endif + unlink(new_link_file); +} + void check_hard_link(void) { diff --git a/src/bin/pg_upgrade/option.c b/src/bin/pg_upgrade/option.c index 9dbc9225a6..d52a1bcee3 100644 --- a/src/bin/pg_upgrade/option.c +++ b/src/bin/pg_upgrade/option.c @@ -53,6 +53,9 @@ parseCommandLine(int argc, char *argv[]) {"retain", no_argument, NULL, 'r'}, {"jobs", required_argument, NULL, 'j'}, {"verbose", no_argument, NULL, 'v'}, + + {"reflink", required_argument, NULL, 1}, + {NULL, 0, NULL, 0} }; int option; /* Command line option */ @@ -203,6 +206,17 @@ parseCommandLine(int argc, char *argv[]) log_opts.verbose = true; break; + case 1: + if (strcmp(optarg, "always") == 0) + user_opts.reflink_mode = REFLINK_ALWAYS; + else if (strcmp(optarg, "auto") == 0) + user_opts.reflink_mode = REFLINK_AUTO; + else if (strcmp(optarg, "never") == 0) + user_opts.reflink_mode = REFLINK_NEVER; + else + pg_fatal("invalid 
reflink mode: %s\n", optarg); + break; + default: pg_fatal("Try \"%s --help\" for more information.\n", os_info.progname); diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h index f83a3eeb67..16eb34e14c 100644 --- a/src/bin/pg_upgrade/pg_upgrade.h +++ b/src/bin/pg_upgrade/pg_upgrade.h @@ -238,6 +238,16 @@ typedef enum TRANSFER_MODE_LINK } transferMode; +/* + * Enumeration to denote reflink modes; REFLINK_AUTO must stay first (zero) so it is the default + */ +typedef enum +{ + REFLINK_AUTO, + REFLINK_NEVER, + REFLINK_ALWAYS +} reflinkMode; + /* * Enumeration to denote pg_log modes */ @@ -297,6 +307,7 @@ typedef struct bool check; /* true -> ask user for permission to make * changes */ transferMode transfer_mode; /* copy files or link them? */ + reflinkMode reflink_mode; /* use file cloning in copy mode? */ int jobs; } UserOpts; @@ -374,10 +385,14 @@ bool pid_lock_file_exists(const char *datadir); void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName); +bool cloneFile(const char *src, const char *dst, + const char *schemaName, const char *relName, + bool unsupported_ok); void linkFile(const char *src, const char *dst, const char *schemaName, const char *relName); void rewriteVisibilityMap(const char *fromfile, const char *tofile, const char *schemaName, const char *relName); +void check_reflink(void); void check_hard_link(void); /* fopen_priv() is no longer different from fopen() */ diff --git a/src/bin/pg_upgrade/relfilenode.c b/src/bin/pg_upgrade/relfilenode.c index ed604f26ca..fc00cfdfae 100644 --- a/src/bin/pg_upgrade/relfilenode.c +++ b/src/bin/pg_upgrade/relfilenode.c @@ -252,9 +252,34 @@ transfer_relfile(FileNameMap *map, const char *type_suffix, bool vm_must_add_fro } else if (user_opts.transfer_mode == TRANSFER_MODE_COPY) { - pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", - old_file, new_file); - copyFile(old_file, new_file, map->nspname, map->relname); + if (user_opts.reflink_mode == REFLINK_ALWAYS) + { + pg_log(PG_VERBOSE, "cloning \"%s\" to \"%s\"\n", + old_file, new_file); + 
cloneFile(old_file, new_file, map->nspname, map->relname, false); + } + else if (user_opts.reflink_mode == REFLINK_AUTO) + { + static bool cloning_ok = true; + + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", + old_file, new_file); + if (cloning_ok && + !cloneFile(old_file, new_file, map->nspname, map->relname, true)) + { + pg_log(PG_VERBOSE, "cloning not supported, switching to copying\n"); + cloning_ok = false; + copyFile(old_file, new_file, map->nspname, map->relname); + } + else if (!cloning_ok) /* a successful clone must not be followed by a copy */ + copyFile(old_file, new_file, map->nspname, map->relname); + } + else + { + pg_log(PG_VERBOSE, "copying \"%s\" to \"%s\"\n", + old_file, new_file); + copyFile(old_file, new_file, map->nspname, map->relname); + } } else { diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 90dda8ea05..2c57e31dcd 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -114,6 +114,9 @@ /* Define to 1 if your compiler handles computed gotos. */ #undef HAVE_COMPUTED_GOTO +/* Define to 1 if you have the `copyfile' function. */ +#undef HAVE_COPYFILE + /* Define to 1 if you have the <crtdefs.h> header file. */ #undef HAVE_CRTDEFS_H base-commit: 27e082b0c6e564facfbf54b56090fdcc4bf44cca -- 2.19.0