From bd8fe105f6b1c64098e344c4a7d0fc9c94d2e31d Mon Sep 17 00:00:00 2001 From: Peter Eisentraut Date: Tue, 20 Mar 2018 10:21:47 -0400 Subject: [PATCH v2] Use file cloning in pg_upgrade and CREATE DATABASE For file copying in pg_upgrade and CREATE DATABASE, use special file cloning calls if available. This makes the copying faster and more space efficient. For pg_upgrade, this achieves speed similar to --link mode without the associated drawbacks. Other backend users of copydir.c will also take advantage of these changes, but the performance improvement will probably not be as noticeable there. On Linux, use copy_file_range(). This supports file cloning automatically on Btrfs and XFS (if formatted with reflink support). On macOS, use copyfile(), which supports file cloning on APFS. Even on file systems without cloning/reflink support, this is faster than the existing code, because it avoids copying the file contents out of kernel space and allows the OS to apply other optimizations. --- configure | 2 +- configure.in | 2 +- doc/src/sgml/monitoring.sgml | 8 +++- doc/src/sgml/ref/pgupgrade.sgml | 11 +++++ src/backend/postmaster/pgstat.c | 3 ++ src/backend/storage/file/copydir.c | 84 ++++++++++++++++++++++++++++++-------- src/backend/storage/file/reinit.c | 3 +- src/bin/pg_upgrade/file.c | 56 +++++++++++++++++++------ src/include/pg_config.h.in | 6 +++ src/include/pgstat.h | 1 + 10 files changed, 141 insertions(+), 35 deletions(-) diff --git a/configure b/configure index 3943711283..f27c78f63a 100755 --- a/configure +++ b/configure @@ -13085,7 +13085,7 @@ fi LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -for ac_func in cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l +for ac_func in cbrt clock_gettime copy_file_range copyfile dlopen fdatasync 
getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l do : as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" diff --git a/configure.in b/configure.in index 1babdbb755..7eb8673753 100644 --- a/configure.in +++ b/configure.in @@ -1428,7 +1428,7 @@ PGAC_FUNC_WCSTOMBS_L LIBS_including_readline="$LIBS" LIBS=`echo "$LIBS" | sed -e 's/-ledit//g' -e 's/-lreadline//g'` -AC_CHECK_FUNCS([cbrt clock_gettime dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) +AC_CHECK_FUNCS([cbrt clock_gettime copy_file_range copyfile dlopen fdatasync getifaddrs getpeerucred getrlimit mbstowcs_l memmove poll posix_fallocate pstat pthread_is_threaded_np readlink setproctitle setsid shm_open symlink sync_file_range utime utimes wcstombs_l]) AC_REPLACE_FUNCS(fseeko) case $host_os in diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 3bc4de57d5..02029e81bc 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1418,7 +1418,7 @@ <structname>wait_event</structname> Description Waiting to apply WAL at recovery because it is delayed. - IO + IO BufFileRead Waiting for a read from a buffered file. @@ -1446,6 +1446,12 @@ <structname>wait_event</structname> Description ControlFileWriteUpdate Waiting for a write to update the control file. + + CopyFileCopy + Waiting for a file copy operation (if the copying is done by + an operating system call rather than as separate read and write + operations). + CopyFileRead Waiting for a read during a file copy operation. 
diff --git a/doc/src/sgml/ref/pgupgrade.sgml b/doc/src/sgml/ref/pgupgrade.sgml index 6dafb404a1..3873e71dd1 100644 --- a/doc/src/sgml/ref/pgupgrade.sgml +++ b/doc/src/sgml/ref/pgupgrade.sgml @@ -737,6 +737,17 @@ Notes is down. + + In PostgreSQL 11 and later, pg_upgrade + automatically uses efficient file cloning (also known as + reflinks) on some operating systems and file systems. This + can result in near-instantaneous copying of the data files, giving the + speed advantages of -k/--link while + leaving the old cluster untouched. At present, this is supported on Linux + (kernel 4.5 or later, glibc 2.27 or later) with Btrfs and XFS (on file + systems created with reflink support, which is not the default for XFS at + this writing), and on macOS with APFS. + diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 96ba216387..4feb3a5289 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3744,6 +3744,9 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE: event_name = "ControlFileWriteUpdate"; break; + case WAIT_EVENT_COPY_FILE_COPY: + event_name = "CopyFileCopy"; + break; case WAIT_EVENT_COPY_FILE_READ: event_name = "CopyFileRead"; break; diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index ca6342db0d..5aa9742b51 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -21,6 +21,9 @@ #include #include #include +#ifdef HAVE_COPYFILE +#include <copyfile.h> +#endif #include "storage/copydir.h" #include "storage/fd.h" @@ -126,13 +129,71 @@ copydir(char *fromdir, char *todir, bool recurse) void copy_file(char *fromfile, char *tofile) { - char *buffer; +#ifdef HAVE_COPYFILE + int ret; + + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_COPY); + ret = copyfile(fromfile, tofile, NULL, +#ifdef COPYFILE_CLONE + COPYFILE_CLONE +#else + COPYFILE_DATA +#endif + ); + pgstat_report_wait_end(); + if (ret < 0) + ereport(ERROR, + 
(errcode_for_file_access(), + errmsg("could not copy file \"%s\" to \"%s\": %m", fromfile, tofile))); +#else int srcfd; int dstfd; +#ifdef HAVE_COPY_FILE_RANGE + struct stat stat; + size_t len; +#else + char *buffer; int nbytes; off_t offset; off_t flush_offset; +#endif + + /* + * Open the files + */ + srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY); + if (srcfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fromfile))); + + dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (dstfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tofile))); + +#ifdef HAVE_COPY_FILE_RANGE + if (fstat(srcfd, &stat) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", fromfile))); + + len = stat.st_size; + do { + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_COPY); + ssize_t ret = copy_file_range(srcfd, NULL, dstfd, NULL, len, 0); + pgstat_report_wait_end(); + if (ret < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not copy file \"%s\" to \"%s\": %m", + fromfile, tofile))); + + len -= ret; + } while (len > 0); +#else /* Size of copy buffer (read and write requests) */ #define COPY_BUF_SIZE (8 * BLCKSZ) @@ -151,21 +212,6 @@ copy_file(char *fromfile, char *tofile) /* Use palloc to ensure we get a maxaligned buffer */ buffer = palloc(COPY_BUF_SIZE); - /* - * Open the files - */ - srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY); - if (srcfd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", fromfile))); - - dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); - if (dstfd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not create file \"%s\": %m", tofile))); - /* * Do the data copying. 
*/ @@ -213,12 +259,14 @@ copy_file(char *fromfile, char *tofile) if (offset > flush_offset) pg_flush_data(dstfd, flush_offset, offset - flush_offset); + pfree(buffer); +#endif + if (CloseTransientFile(dstfd)) ereport(ERROR, (errcode_for_file_access(), errmsg("could not close file \"%s\": %m", tofile))); CloseTransientFile(srcfd); - - pfree(buffer); +#endif } diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c index 92363ae6ad..2614b27307 100644 --- a/src/backend/storage/file/reinit.c +++ b/src/backend/storage/file/reinit.c @@ -314,8 +314,7 @@ ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) FreeDir(dbspace_dir); /* - * copy_file() above has already called pg_flush_data() on the files - * it created. Now we need to fsync those files, because a checkpoint + * Now we need to fsync the copied files, because a checkpoint * won't do it for us while we're in recovery. We do this in a * separate pass to allow the kernel to perform all the flushes * (especially the metadata ones) at once. 
diff --git a/src/bin/pg_upgrade/file.c b/src/bin/pg_upgrade/file.c index f38bfacf02..4354b64b50 100644 --- a/src/bin/pg_upgrade/file.c +++ b/src/bin/pg_upgrade/file.c @@ -17,6 +17,9 @@ #include #include +#ifdef HAVE_COPYFILE +#include <copyfile.h> +#endif #ifdef WIN32 @@ -34,10 +37,32 @@ void copyFile(const char *src, const char *dst, const char *schemaName, const char *relName) { -#ifndef WIN32 +#if defined(HAVE_COPYFILE) + if (copyfile(src, dst, NULL, +#ifdef COPYFILE_CLONE + COPYFILE_CLONE +#else + COPYFILE_DATA +#endif + ) < 0) + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); +#elif defined(WIN32) + if (CopyFile(src, dst, true) == 0) + { + _dosmaperr(GetLastError()); + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); + } +#else int src_fd; int dest_fd; +#ifdef HAVE_COPY_FILE_RANGE + struct stat stat; + size_t len; +#else char *buffer; +#endif if ((src_fd = open(src, O_RDONLY | PG_BINARY, 0)) < 0) pg_fatal("error while copying relation \"%s.%s\": could not open file \"%s\": %s\n", @@ -48,6 +73,22 @@ copyFile(const char *src, const char *dst, pg_fatal("error while copying relation \"%s.%s\": could not create file \"%s\": %s\n", schemaName, relName, dst, strerror(errno)); +#ifdef HAVE_COPY_FILE_RANGE + if (fstat(src_fd, &stat) < 0) + pg_fatal("could not stat file \"%s\": %s", + src, strerror(errno)); + + len = stat.st_size; + + do { + ssize_t ret = copy_file_range(src_fd, NULL, dest_fd, NULL, len, 0); + if (ret < 0) + pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", + schemaName, relName, src, dst, strerror(errno)); + + len -= ret; + } while (len > 0); +#else /* copy in fairly large chunks for best efficiency */ #define COPY_BUF_SIZE (50 * BLCKSZ) @@ -77,19 +118,10 @@ copyFile(const char *src, const char *dst, } pg_free(buffer); +#endif close(src_fd); close(dest_fd); - -#else /* WIN32 */ - - if 
(CopyFile(src, dst, true) == 0) - { - _dosmaperr(GetLastError()); - pg_fatal("error while copying relation \"%s.%s\" (\"%s\" to \"%s\"): %s\n", - schemaName, relName, src, dst, strerror(errno)); - } - -#endif /* WIN32 */ +#endif } diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index f98f773ff0..38e88e0395 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -114,6 +114,12 @@ /* Define to 1 if your compiler handles computed gotos. */ #undef HAVE_COMPUTED_GOTO +/* Define to 1 if you have the `copyfile' function. */ +#undef HAVE_COPYFILE + +/* Define to 1 if you have the `copy_file_range' function. */ +#undef HAVE_COPY_FILE_RANGE + /* Define to 1 if you have the <crtdefs.h> header file. */ #undef HAVE_CRTDEFS_H diff --git a/src/include/pgstat.h b/src/include/pgstat.h index be2f59239b..934bce0fa9 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -863,6 +863,7 @@ typedef enum WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE, WAIT_EVENT_CONTROL_FILE_WRITE, WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE, + WAIT_EVENT_COPY_FILE_COPY, WAIT_EVENT_COPY_FILE_READ, WAIT_EVENT_COPY_FILE_WRITE, WAIT_EVENT_DATA_FILE_EXTEND, base-commit: 13c7c65ec900a30bcddcb27f5fd138dcdbc2ca2e -- 2.16.2