diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index 69d6924b3a..7f752b2ed0 100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -334,15 +334,6 @@ GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text, bigint, bigint, b - - When executing pg_rewind using an online - cluster as source which has been recently promoted, it is necessary - to execute a CHECKPOINT after promotion such that its - control file reflects up-to-date timeline information, which is used by - pg_rewind to check if the target cluster - can be rewound using the designated source cluster. - - How It Works diff --git a/src/bin/pg_rewind/file_ops.c b/src/bin/pg_rewind/file_ops.c index 6cb288f099..2a407da1e4 100644 --- a/src/bin/pg_rewind/file_ops.c +++ b/src/bin/pg_rewind/file_ops.c @@ -309,9 +309,11 @@ sync_target_dir(void) * buffer is actually *filesize + 1. That's handy when reading a text file. * This function can be used to read binary files as well, you can just * ignore the zero-terminator in that case. + * + * If noerror is true, returns NULL when the file is not found. 
*/ char * -slurpFile(const char *datadir, const char *path, size_t *filesize) +slurpFile(const char *datadir, const char *path, size_t *filesize, bool noerror) { int fd; char *buffer; @@ -323,8 +325,13 @@ slurpFile(const char *datadir, const char *path, size_t *filesize) snprintf(fullpath, sizeof(fullpath), "%s/%s", datadir, path); if ((fd = open(fullpath, O_RDONLY | PG_BINARY, 0)) == -1) + { + if (noerror && errno == ENOENT) + return NULL; + pg_fatal("could not open file \"%s\" for reading: %m", fullpath); + } if (fstat(fd, &statbuf) < 0) pg_fatal("could not open file \"%s\" for reading: %m", diff --git a/src/bin/pg_rewind/file_ops.h b/src/bin/pg_rewind/file_ops.h index 54a853bd42..92e19042cb 100644 --- a/src/bin/pg_rewind/file_ops.h +++ b/src/bin/pg_rewind/file_ops.h @@ -21,7 +21,8 @@ extern void create_target(file_entry_t *t); extern void remove_target(file_entry_t *t); extern void sync_target_dir(void); -extern char *slurpFile(const char *datadir, const char *path, size_t *filesize); +extern char *slurpFile(const char *datadir, const char *path, size_t *filesize, + bool noerror); typedef void (*process_file_callback_t) (const char *path, file_type_t type, size_t size, const char *link_target); extern void traverse_datadir(const char *datadir, process_file_callback_t callback); diff --git a/src/bin/pg_rewind/libpq_source.c b/src/bin/pg_rewind/libpq_source.c index 011c9cce6e..751c96e6e4 100644 --- a/src/bin/pg_rewind/libpq_source.c +++ b/src/bin/pg_rewind/libpq_source.c @@ -68,7 +68,7 @@ static void libpq_queue_fetch_range(rewind_source *source, const char *path, off_t off, size_t len); static void libpq_finish_fetch(rewind_source *source); static char *libpq_fetch_file(rewind_source *source, const char *path, - size_t *filesize); + size_t *filesize, bool noerror); static XLogRecPtr libpq_get_current_wal_insert_lsn(rewind_source *source); static void libpq_destroy(rewind_source *source); @@ -620,9 +620,12 @@ appendArrayEscapedString(StringInfo buf, const char 
*str) /* * Fetch a single file as a malloc'd buffer. + * + * If noerror is true, returns NULL when the file is not found on the source. */ static char * -libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize) +libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize, + bool noerror) { PGconn *conn = ((libpq_source *) source)->conn; PGresult *res; @@ -631,6 +634,37 @@ libpq_fetch_file(rewind_source *source, const char *path, size_t *filesize) const char *paramValues[1]; paramValues[0] = path; + + /* + * Check the existence of the file. We do this before executing + * pg_read_binary_file so that the server doesn't emit an error. + */ + if (noerror) + { + res = PQexecParams(conn, "SELECT pg_stat_file($1, true)", + 1, NULL, paramValues, NULL, NULL, 1); + + if (PQresultStatus(res) != PGRES_TUPLES_OK) + { + pg_fatal("could not stat remote file \"%s\": %s", + path, PQresultErrorMessage(res)); + } + + /* sanity check the result set */ + if (PQntuples(res) != 1) + pg_fatal("unexpected result set while stating remote file \"%s\"", + path); + + /* Return NULL if the file was not found; release the result first */ + if (PQgetisnull(res, 0, 0)) + { + PQclear(res); + return NULL; + } + + PQclear(res); + } + res = PQexecParams(conn, "SELECT pg_read_binary_file($1)", 1, NULL, paramValues, NULL, NULL, 1); diff --git a/src/bin/pg_rewind/local_source.c b/src/bin/pg_rewind/local_source.c index 2e50485c39..fc2e1e9f11 100644 --- a/src/bin/pg_rewind/local_source.c +++ b/src/bin/pg_rewind/local_source.c @@ -28,7 +28,7 @@ typedef struct static void local_traverse_files(rewind_source *source, process_file_callback_t callback); static char *local_fetch_file(rewind_source *source, const char *path, - size_t *filesize); + size_t *filesize, bool noerror); static void local_queue_fetch_file(rewind_source *source, const char *path, size_t len); static void local_queue_fetch_range(rewind_source *source, const char *path, @@ -63,9 +63,11 @@ local_traverse_files(rewind_source *source, process_file_callback_t callback) } 
static char * -local_fetch_file(rewind_source *source, const char *path, size_t *filesize) +local_fetch_file(rewind_source *source, const char *path, size_t *filesize, + bool noerror) { - return slurpFile(((local_source *) source)->datadir, path, filesize); + return slurpFile(((local_source *) source)->datadir, path, filesize, + noerror); } /* diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index 1ff8da1676..221e97c5d8 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -43,6 +43,8 @@ static void createBackupLabel(XLogRecPtr startpoint, TimeLineID starttli, static void digestControlFile(ControlFileData *ControlFile, const char *content, size_t size); +static TimeLineHistoryEntry *getTimelineHistory(ControlFileData *controlFile, + int *nentries); static void getRestoreCommand(const char *argv0); static void sanityChecks(void); static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex); @@ -70,9 +72,13 @@ bool do_sync = true; bool restore_wal = false; /* Target history */ -TimeLineHistoryEntry *targetHistory; +TimeLineHistoryEntry *targetHistory = NULL; int targetNentries; +/* Source history */ +TimeLineHistoryEntry *sourceHistory = NULL; +int sourceNentries; + /* Progress counters */ uint64 fetch_size; uint64 fetch_done; @@ -141,6 +147,7 @@ main(int argc, char **argv) bool rewind_needed; bool writerecoveryconf = false; filemap_t *filemap; + TimeLineID source_tli; pg_logging_init(argv[0]); set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_rewind")); @@ -311,7 +318,7 @@ main(int argc, char **argv) * need to make sure by themselves that the target cluster is in a clean * state. 
*/ - buffer = slurpFile(datadir_target, "global/pg_control", &size); + buffer = slurpFile(datadir_target, "global/pg_control", &size, false); digestControlFile(&ControlFile_target, buffer, size); pg_free(buffer); @@ -321,25 +328,43 @@ main(int argc, char **argv) { ensureCleanShutdown(argv[0]); - buffer = slurpFile(datadir_target, "global/pg_control", &size); + buffer = slurpFile(datadir_target, "global/pg_control", &size, false); digestControlFile(&ControlFile_target, buffer, size); pg_free(buffer); } - buffer = source->fetch_file(source, "global/pg_control", &size); + buffer = source->fetch_file(source, "global/pg_control", &size, false); digestControlFile(&ControlFile_source, buffer, size); pg_free(buffer); sanityChecks(); + /* Retrieve timelines for both source and target */ + sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries); + targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries); + + /* + * If the source has just been promoted but the end-of-recovery checkpoint + * has not completed, the source control file has a bit older content for + * identifying the source's timeline. Instead, look into timeline history, + * which is always refreshed up-to-date. + */ + source_tli = ControlFile_source.checkPointCopy.ThisTimeLineID; + + if (sourceHistory[sourceNentries - 1].tli > source_tli) + { + pg_log_info("source's actual timeline ID (%u) is newer than control file (%u)", + sourceHistory[sourceNentries - 1].tli, source_tli); + source_tli = sourceHistory[sourceNentries - 1].tli; + } + /* * Find the common ancestor timeline between the clusters. * * If both clusters are already on the same timeline, there's nothing to * do. 
*/ - if (ControlFile_target.checkPointCopy.ThisTimeLineID == - ControlFile_source.checkPointCopy.ThisTimeLineID) + if (ControlFile_target.checkPointCopy.ThisTimeLineID == source_tli) { pg_log_info("source and target cluster are on the same timeline"); rewind_needed = false; @@ -581,7 +606,7 @@ perform_rewind(filemap_t *filemap, rewind_source *source, * Fetch the control file from the source last. This ensures that the * minRecoveryPoint is up-to-date. */ - buffer = source->fetch_file(source, "global/pg_control", &size); + buffer = source->fetch_file(source, "global/pg_control", &size, false); digestControlFile(&ControlFile_source_after, buffer, size); pg_free(buffer); @@ -654,7 +679,22 @@ perform_rewind(filemap_t *filemap, rewind_source *source, pg_fatal("source system was in unexpected state at end of rewind"); endrec = source->get_current_wal_insert_lsn(source); - endtli = ControlFile_source_after.checkPointCopy.ThisTimeLineID; + + /* + * Find the timeline ID corresponding to endrec on the source. + * + * In most cases we can use the TLI in the source control file, but + * that is not the case after promotion until end-of-recovery + * checkpoint completes, where the control file is a bit old for + * this purpose. It should be the latest timeline in the source's + * history file. + */ + if (!((sourceHistory[sourceNentries - 1].begin == 0 || + sourceHistory[sourceNentries - 1].begin <= endrec) && + sourceHistory[sourceNentries - 1].end == 0)) + pg_fatal("source server's current insert LSN was not on the latest timeline in history file"); + + endtli = sourceHistory[sourceNentries - 1].tli; } } else @@ -796,46 +836,83 @@ MinXLogRecPtr(XLogRecPtr a, XLogRecPtr b) } /* - * Retrieve timeline history for given control file which should behold - * either source or target. + * Retrieve the latest timeline history for given control file which should + * behold either source or target. 
+ * + * This works on the assumption that the timeline IDs of existing history files + * are contiguous up to the latest history file. This is true for + * recently-promoted servers. See findNewestTimeLine() for this assumption. */ static TimeLineHistoryEntry * getTimelineHistory(ControlFileData *controlFile, int *nentries) { - TimeLineHistoryEntry *history; + TimeLineHistoryEntry *history = NULL; TimeLineID tli; + TimeLineID probe_tli; tli = controlFile->checkPointCopy.ThisTimeLineID; + Assert(tli > 0); + + /* Probe history files */ + for (probe_tli = tli ;; probe_tli++) + { + char path[MAXPGPATH]; + char *histfile; + TimeLineHistoryEntry *tmphistory; + int nent; + int i; + + if (probe_tli < 2) + continue; + + TLHistoryFilePath(path, probe_tli); + + /* Get history file from appropriate source */ + if (controlFile == &ControlFile_source) + histfile = source->fetch_file(source, path, NULL, true); + else if (controlFile == &ControlFile_target) + histfile = slurpFile(datadir_target, path, NULL, true); + else + pg_fatal("invalid control file"); + + /* no such history file, exit */ + if (!histfile) + break; + + /* preserve the history if we're part of it */ + tmphistory = rewind_parseTimeLineHistory(histfile, probe_tli, &nent); + pg_free(histfile); + + for (i = 0 ; i < nent ; i++) + { + if (tmphistory[i].tli == tli) + { + if (history) + pg_free(history); + + history = tmphistory; + *nentries = nent; + break; + } + } + if (tmphistory != history) + pg_free(tmphistory); + } + /* * Timeline 1 does not have a history file, so there is no need to check * and fake an entry with infinite start and end positions. 
*/ - if (tli == 1) + if (!history) { + if (tli != 1) + pg_fatal("could not find timeline history file for timeline %u", tli); history = (TimeLineHistoryEntry *) pg_malloc(sizeof(TimeLineHistoryEntry)); history->tli = tli; history->begin = history->end = InvalidXLogRecPtr; *nentries = 1; } - else - { - char path[MAXPGPATH]; - char *histfile; - - TLHistoryFilePath(path, tli); - - /* Get history file from appropriate source */ - if (controlFile == &ControlFile_source) - histfile = source->fetch_file(source, path, NULL); - else if (controlFile == &ControlFile_target) - histfile = slurpFile(datadir_target, path, NULL); - else - pg_fatal("invalid control file"); - - history = rewind_parseTimeLineHistory(histfile, tli, nentries); - pg_free(histfile); - } if (debug) { @@ -879,15 +956,9 @@ getTimelineHistory(ControlFileData *controlFile, int *nentries) static void findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex) { - TimeLineHistoryEntry *sourceHistory; - int sourceNentries; int i, n; - /* Retrieve timelines for both source and target */ - sourceHistory = getTimelineHistory(&ControlFile_source, &sourceNentries); - targetHistory = getTimelineHistory(&ControlFile_target, &targetNentries); - /* * Trace the history forward, until we hit the timeline diverge. It may * still be possible that the source and target nodes used the same @@ -910,7 +981,6 @@ findCommonAncestorTimeline(XLogRecPtr *recptr, int *tliIndex) *recptr = MinXLogRecPtr(sourceHistory[i].end, targetHistory[i].end); *tliIndex = i; - pg_free(sourceHistory); return; } else diff --git a/src/bin/pg_rewind/rewind_source.h b/src/bin/pg_rewind/rewind_source.h index 1310e86e75..6975848668 100644 --- a/src/bin/pg_rewind/rewind_source.h +++ b/src/bin/pg_rewind/rewind_source.h @@ -35,7 +35,7 @@ typedef struct rewind_source * handy for text files. */ char *(*fetch_file) (struct rewind_source *, const char *path, - size_t *filesize); + size_t *filesize, bool noerror); /* * Request to fetch (part of) a file in the source system, specified by an