From 56b5b574f6d900d5eb4932be499cf3bae0e7ba86 Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 20 Feb 2018 10:41:16 -0500 Subject: [PATCH] Use file cloning in pg_upgrade and CREATE DATABASE For file copying in pg_upgrade and CREATE DATABASE, use special file cloning calls if available. This makes the copying faster and more space efficient. For pg_upgrade, this achieves speed similar to --link mode without the associated drawbacks. On Linux, use copy_file_range(). This supports file cloning automatically on Btrfs and XFS (if formatted with reflink support). On macOS, use copyfile(), which supports file cloning on APFS. Even on file systems without cloning/reflink support, this is faster than the existing code, because it avoids copying the file contents out of kernel space and allows the OS to apply other optimizations. --- configure | 2 +- configure.in | 2 +- doc/src/sgml/ref/pgupgrade.sgml | 11 ++++++++ src/backend/storage/file/copydir.c | 55 +++++++++++++++++++++++++++++++++----- src/bin/pg_upgrade/file.c | 37 ++++++++++++++++++++++++- src/include/pg_config.h.in | 6 +++++ 6 files changed, 104 insertions(+), 9 deletions(-) diff --git a/configure b/configure index 7dcca506f8..eb8b321723 100755 --- a/configure +++ b/configure @@ -13079,7 +13079,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l +for ac_func in cbrt clock_gettime copy_file_range copyfile dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" 
diff --git a/configure.in b/configure.in index 4d26034579..dfe3507b25 100644 --- a/configure.in +++ b/configure.in @@ -1425,7 +1425,7 @@ PGAC_FUNC_WCSTOMBS_L LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -AC_CHECK_FUNCS([cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) +AC_CHECK_FUNCS([cbrt clock_gettime copy_file_range copyfile dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 6dafb404a1..3873e71dd1 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -737,6 +737,17 @@ Notes is down. + + In PostgreSQL 11 and later, pg_upgrade + automatically uses efficient file cloning (also known as + reflinks) on some operating systems and file systems. This + can result in near-instantaneous copying of the data files, giving the + speed advantages of -k/--link while + leaving the old cluster untouched. At present, this is supported on Linux + (kernel 4.5 or later, glibc 2.27 or later) with Btrfs and XFS (on file + systems created with reflink support, which is not the default for XFS at + this writing), and on macOS with APFS. 
+ diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index ca6342db0d..cd6398d69a 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -21,6 +21,9 @@ #include #include #include +#ifdef HAVE_COPYFILE +#include +#endif #include "storage/copydir.h" #include "storage/fd.h" @@ -74,7 +77,22 @@ copydir(char *fromdir, char *todir, bool recurse) copydir(fromfile, tofile, true); } else if (S_ISREG(fst.st_mode)) + { +#ifdef HAVE_COPYFILE + if (copyfile(fromfile, tofile, NULL, +#ifdef COPYFILE_CLONE + COPYFILE_CLONE +#else + COPYFILE_DATA +#endif + ) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not copy file \"%s\" to \"%s\": %m", fromfile, tofile))); +#else copy_file(fromfile, tofile); +#endif + } } FreeDir(xldir); @@ -126,12 +144,17 @@ copydir(char *fromdir, char *todir, bool recurse) void copy_file(char *fromfile, char *tofile) { - char *buffer; int srcfd; int dstfd; +#ifdef HAVE_COPY_FILE_RANGE + struct stat stat; + size_t len; +#else + char *buffer; int nbytes; off_t offset; off_t flush_offset; +#endif /* Size of copy buffer (read and write requests) */ #define COPY_BUF_SIZE (8 * BLCKSZ) @@ -148,9 +171,6 @@ copy_file(char *fromfile, char *tofile) #define FLUSH_DISTANCE (1024 * 1024) #endif - /* Use palloc to ensure we get a maxaligned buffer */ - buffer = palloc(COPY_BUF_SIZE); - /* * Open the files */ @@ -166,6 +186,28 @@ copy_file(char *fromfile, char *tofile) (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tofile))); +#ifdef HAVE_COPY_FILE_RANGE + if (fstat(srcfd, &stat) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", fromfile))); + + len = stat.st_size; + + do { + ssize_t ret = copy_file_range(srcfd, NULL, dstfd, NULL, len, 0); + if (ret < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not copy file \"%s\" to \"%s\": %m", + fromfile, tofile))); + + len -= ret; + } while (len > 0); 
+#else + /* Use palloc to ensure we get a maxaligned buffer */ + buffer = palloc(COPY_BUF_SIZE); + /* * Do the data copying. */ @@ -213,12 +255,13 @@ copy_file(char *fromfile, char *tofile) if (offset > flush_offset) pg_flush_data(dstfd, flush_offset, offset - flush_offset); + pfree(buffer); +#endif + if (CloseTransientFile(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); CloseTransientFile(srcfd); - - pfree(buffer); } diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index f38bfacf02..f05fd9db9c 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -17,6 +17,9 @@ #include #include +#ifdef HAVE_COPYFILE +#include +#endif #ifdef WIN32 @@ -34,10 +37,25 @@ void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName) { -#ifndef WIN32 +#ifdef HAVE_COPYFILE + if (copyfile(src, dst, NULL, +#ifdef COPYFILE_CLONE + COPYFILE_CLONE +#else + COPYFILE_DATA +#endif + ) < 0) + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); +#elif !defined(WIN32) int src_fd; int dest_fd; +#ifdef HAVE_COPY_FILE_RANGE + struct stat stat; + size_t len; +#else char *buffer; +#endif if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n", @@ -48,6 +66,22 @@ copyFile(const char *src, const char *dst, pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n", schemaName, relName, dst, strerror(errno)); +#ifdef HAVE_COPY_FILE_RANGE + if (fstat(src_fd, &stat) < 0) + pg_fatal("could not stat file \"%s\": %s", + src, strerror(errno)); + + len = stat.st_size; + + do { + ssize_t ret = copy_file_range(src_fd, NULL, dest_fd, NULL, len, 0); + if (ret < 0) + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); + + len -= ret; + } 
while (len > 0); +#else /* copy in fairly large chunks for best efficiency */ #define COPY_BUF_SIZE (50 * BLCKSZ) @@ -77,6 +111,7 @@ copyFile(const char *src, const char *dst, } pg_free(buffer); +#endif close(src_fd); close(dest_fd); diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index f98f773ff0..38e88e0395 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -114,6 +114,12 @@ /* Define to 1 if your compiler handles computed gotos. */ #undef HAVE_COMPUTED_GOTO +/* Define to 1 if you have the `copyfile' function. */ +#undef HAVE_COPYFILE + +/* Define to 1 if you have the `copy_file_range' function. */ +#undef HAVE_COPY_FILE_RANGE + /* Define to 1 if you have the header file. */ #undef HAVE_CRTDEFS_H base-commit: 9a44a26b65d3d36867267624b76d3dea3dc4f6f6 -- 2.16.2