commit b2d59c9532f3beb3a17b8c03341aa4557809d038 Author: Alexander Korotkov Date: Sat Jan 18 18:11:49 2020 +0300 pg_rewind: Add options to restore WAL files from archive Currently, pg_rewind fails when it cannot find required WAL files in the target data directory. One has to manually figure out which WAL files are required and copy them back from archive. This commit implements two new pg_rewind options, which allow pg_rewind to automatically retrieve missing WAL files from archival storage. The first option lets pg_rewind read restore_command from postgresql.conf, while the second one specifies restore_command directly. Discussion: https://postgr.es/m/a3acff50-5a0d-9a2c-b3b2-ee36168955c1%40postgrespro.ru Author: Alexey Kondratov Reviewed-by: Andrey Borodin, Alvaro Herrera, Michael Paquier Reviewed-by: Andres Freund, Alexander Korotkov diff --git a/doc/src/sgml/ref/pg_rewind.sgml b/doc/src/sgml/ref/pg_rewind.sgml index 42d29edd4e9..b601a5c7e44 100644 --- a/doc/src/sgml/ref/pg_rewind.sgml +++ b/doc/src/sgml/ref/pg_rewind.sgml @@ -66,11 +66,12 @@ PostgreSQL documentation can be found either on the target timeline, the source timeline, or their common ancestor. In the typical failover scenario where the target cluster was shut down soon after the divergence, this is not a problem, but if the - target cluster ran for a long time after the divergence, the old WAL - files might no longer be present. In that case, they can be manually - copied from the WAL archive to the pg_wal directory, or - fetched on startup by configuring primary_conninfo or - restore_command. The use of + target cluster ran for a long time after the divergence, its old WAL + files might no longer be present. In this case, you can manually copy them + from the WAL archive to the pg_wal directory, or run + pg_rewind with the -c or + -C option to automatically retrieve them from the WAL + archive. The use of pg_rewind is not limited to failover, e.g. 
a standby server can be promoted, run some write transactions, and then rewinded to become a standby again. @@ -232,6 +233,39 @@ PostgreSQL documentation + + + + + + Use the restore_command defined in + postgresql.conf to retrieve WAL files from + the WAL archive if these files are no longer available in the + pg_wal directory of the target cluster. + + + This option cannot be used together with -C. + + + + + + + + + + Specifies the restore_command to use for retrieving + WAL files from the WAL archive if these files are no longer available + in the pg_wal directory of the target cluster. + + + If restore_command is already set in + postgresql.conf, you can provide the + -c option instead. + + + + @@ -318,7 +352,10 @@ GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text, bigint, bigint, b history forked off from the target cluster. For each WAL record, record each data block that was touched. This yields a list of all the data blocks that were changed in the target cluster, after the - source cluster forked off. + source cluster forked off. If some of the WAL files are no longer + available, try re-running pg_rewind with + the -c or -C option to search + for the missing files in the WAL archive. 
diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index b6429827cfb..26f41581c00 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -12,6 +12,7 @@ #include "postgres_fe.h" #include +#include #include "access/rmgr.h" #include "access/xlog_internal.h" @@ -19,6 +20,7 @@ #include "catalog/pg_control.h" #include "catalog/storage_xlog.h" #include "commands/dbcommands_xlog.h" +#include "common/fe_archive.h" #include "filemap.h" #include "pg_rewind.h" @@ -41,6 +43,7 @@ static char xlogfpath[MAXPGPATH]; typedef struct XLogPageReadPrivate { + const char *restoreCommand; int tliIndex; } XLogPageReadPrivate; @@ -55,7 +58,7 @@ static int SimpleXLogPageRead(XLogReaderState *xlogreader, */ void extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, - XLogRecPtr endpoint) + XLogRecPtr endpoint, const char *restore_command) { XLogRecord *record; XLogReaderState *xlogreader; @@ -63,6 +66,7 @@ extractPageMap(const char *datadir, XLogRecPtr startpoint, int tliIndex, XLogPageReadPrivate private; private.tliIndex = tliIndex; + private.restoreCommand = restore_command; xlogreader = XLogReaderAllocate(WalSegSz, datadir, &SimpleXLogPageRead, &private); if (xlogreader == NULL) @@ -148,7 +152,7 @@ readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex) void findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, - XLogRecPtr *lastchkptredo) + XLogRecPtr *lastchkptredo, const char *restoreCommand) { /* Walk backwards, starting from the given record */ XLogRecord *record; @@ -172,6 +176,7 @@ findLastCheckpoint(const char *datadir, XLogRecPtr forkptr, int tliIndex, } private.tliIndex = tliIndex; + private.restoreCommand = restoreCommand; xlogreader = XLogReaderAllocate(WalSegSz, datadir, &SimpleXLogPageRead, &private); if (xlogreader == NULL) @@ -282,8 +287,29 @@ SimpleXLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, if 
(xlogreadfd < 0) { - pg_log_error("could not open file \"%s\": %m", xlogfpath); - return -1; + /* + * If we have no restore_command to execute, then exit. + */ + if (private->restoreCommand == NULL) + { + pg_log_error("could not open file \"%s\": %m", xlogfpath); + return -1; + } + + /* + * Since we have restore_command to execute, then try to retrieve + * missing WAL file from the archive. + */ + xlogreadfd = RestoreArchivedWALFile(xlogreader->segcxt.ws_dir, + xlogfname, + WalSegSz, + private->restoreCommand); + + if (xlogreadfd < 0) + return -1; + else + pg_log_debug("using file \"%s\" restored from archive", + xlogfpath); } } diff --git a/src/bin/pg_rewind/pg_rewind.c b/src/bin/pg_rewind/pg_rewind.c index c6d00bb0ab7..33d6aada4e5 100644 --- a/src/bin/pg_rewind/pg_rewind.c +++ b/src/bin/pg_rewind/pg_rewind.c @@ -53,11 +53,13 @@ int WalSegSz; char *datadir_target = NULL; char *datadir_source = NULL; char *connstr_source = NULL; +char *restore_command = NULL; static bool debug = false; bool showprogress = false; bool dry_run = false; bool do_sync = true; +bool restore_wal = false; /* Target history */ TimeLineHistoryEntry *targetHistory; @@ -74,19 +76,21 @@ usage(const char *progname) printf(_("%s resynchronizes a PostgreSQL cluster with another copy of the cluster.\n\n"), progname); printf(_("Usage:\n %s [OPTION]...\n\n"), progname); printf(_("Options:\n")); - printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n")); - printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n")); - printf(_(" --source-server=CONNSTR source server to synchronize with\n")); - printf(_(" -R, --write-recovery-conf write configuration for replication\n" - " (requires --source-server)\n")); - printf(_(" -n, --dry-run stop before modifying anything\n")); - printf(_(" -N, --no-sync do not wait for changes to be written\n" - " safely to disk\n")); - printf(_(" -P, --progress write progress messages\n")); - printf(_(" --no-ensure-shutdown do not 
automatically fix unclean shutdown\n")); - printf(_(" --debug write a lot of debug messages\n")); - printf(_(" -V, --version output version information, then exit\n")); - printf(_(" -?, --help show this help, then exit\n")); + printf(_(" -D, --target-pgdata=DIRECTORY existing data directory to modify\n")); + printf(_(" --source-pgdata=DIRECTORY source data directory to synchronize with\n")); + printf(_(" --source-server=CONNSTR source server to synchronize with\n")); + printf(_(" -R, --write-recovery-conf write configuration for replication\n" + " (requires --source-server)\n")); + printf(_(" -c, --restore-target-wal use restore_command in target config\n")); + printf(_(" to retrieve WAL files from archive\n")); + printf(_(" -n, --dry-run stop before modifying anything\n")); + printf(_(" -N, --no-sync do not wait for changes to be written\n" + " safely to disk\n")); + printf(_(" -P, --progress write progress messages\n")); + printf(_(" --no-ensure-shutdown do not automatically fix unclean shutdown\n")); + printf(_(" --debug write a lot of debug messages\n")); + printf(_(" -V, --version output version information, then exit\n")); + printf(_(" -?, --help show this help, then exit\n")); printf(_("\nReport bugs to .\n")); } @@ -105,6 +109,7 @@ main(int argc, char **argv) {"dry-run", no_argument, NULL, 'n'}, {"no-sync", no_argument, NULL, 'N'}, {"progress", no_argument, NULL, 'P'}, + {"restore-target-wal", no_argument, NULL, 'c'}, {"debug", no_argument, NULL, 3}, {NULL, 0, NULL, 0} }; @@ -143,7 +148,7 @@ main(int argc, char **argv) } } - while ((c = getopt_long(argc, argv, "D:nNPR", long_options, &option_index)) != -1) + while ((c = getopt_long(argc, argv, "D:nNPRC:c", long_options, &option_index)) != -1) { switch (c) { @@ -155,6 +160,10 @@ main(int argc, char **argv) showprogress = true; break; + case 'c': + restore_wal = true; + break; + case 'n': dry_run = true; break; @@ -252,6 +261,65 @@ main(int argc, char **argv) exit(1); } + if (restore_wal) + { + int rc; + char 
postgres_exec_path[MAXPGPATH], + postgres_cmd[MAXPGPATH], + cmd_output[MAXPGPATH]; + FILE *output_fp; + + /* Find postgres executable. */ + rc = find_other_exec(argv[0], "postgres", + PG_BACKEND_VERSIONSTR, + postgres_exec_path); + + if (rc < 0) + { + char full_path[MAXPGPATH]; + + if (find_my_exec(argv[0], full_path) < 0) + strlcpy(full_path, progname, sizeof(full_path)); + + if (rc == -1) + pg_log_error("The program \"postgres\" is needed by %s but was not found in the\n" + "same directory as \"%s\".\n" + "Check your installation.", + progname, full_path); + else + pg_log_error("The program \"postgres\" was found by \"%s\"\n" + "but was not the same version as %s.\n" + "Check your installation.", + full_path, progname); + exit(1); + } + + /* + * Build a command to execute for restore_command GUC retrieval if + * set. + */ + snprintf(postgres_cmd, sizeof(postgres_cmd), "%s -D %s -C restore_command", + postgres_exec_path, datadir_target); + + if ((output_fp = popen(postgres_cmd, "r")) == NULL || + fgets(cmd_output, sizeof(cmd_output), output_fp) == NULL) + pg_fatal("could not get restore_command using %s: %s", + postgres_cmd, strerror(errno)); + + pclose(output_fp); + + /* Remove trailing newline */ + if (strchr(cmd_output, '\n') != NULL) + *strchr(cmd_output, '\n') = '\0'; + + if (!strcmp(cmd_output, "")) + pg_fatal("restore_command is not set on the target cluster"); + + restore_command = pg_strdup(cmd_output); + + pg_log_debug("using config variable restore_command=\'%s\'.", restore_command); + } + umask(pg_mode_mask); atexit(disconnect_atexit); @@ -349,9 +417,8 @@ main(int argc, char **argv) exit(0); } - findLastCheckpoint(datadir_target, divergerec, - lastcommontliIndex, - &chkptrec, &chkpttli, &chkptredo); + findLastCheckpoint(datadir_target, divergerec, lastcommontliIndex, + &chkptrec, &chkpttli, &chkptredo, restore_command); pg_log_info("rewinding from last common checkpoint at %X/%X on timeline %u", (uint32) (chkptrec >> 32), (uint32) chkptrec, chkpttli); 
@@ -377,7 +444,7 @@ main(int argc, char **argv) if (showprogress) pg_log_info("reading WAL in target"); extractPageMap(datadir_target, chkptrec, lastcommontliIndex, - ControlFile_target.checkPoint); + ControlFile_target.checkPoint, restore_command); filemap_finalize(); if (showprogress) diff --git a/src/bin/pg_rewind/pg_rewind.h b/src/bin/pg_rewind/pg_rewind.h index e4e8d23c32d..b122ae43e5a 100644 --- a/src/bin/pg_rewind/pg_rewind.h +++ b/src/bin/pg_rewind/pg_rewind.h @@ -42,11 +42,13 @@ extern uint64 fetch_done; /* in parsexlog.c */ extern void extractPageMap(const char *datadir, XLogRecPtr startpoint, - int tliIndex, XLogRecPtr endpoint); + int tliIndex, XLogRecPtr endpoint, + const char *restoreCommand); extern void findLastCheckpoint(const char *datadir, XLogRecPtr searchptr, int tliIndex, XLogRecPtr *lastchkptrec, TimeLineID *lastchkpttli, - XLogRecPtr *lastchkptredo); + XLogRecPtr *lastchkptredo, + const char *restoreCommand); extern XLogRecPtr readOneRecord(const char *datadir, XLogRecPtr ptr, int tliIndex); diff --git a/src/bin/pg_rewind/t/001_basic.pl b/src/bin/pg_rewind/t/001_basic.pl index 95d8ccfced1..d97e4377419 100644 --- a/src/bin/pg_rewind/t/001_basic.pl +++ b/src/bin/pg_rewind/t/001_basic.pl @@ -1,7 +1,7 @@ use strict; use warnings; use TestLib; -use Test::More tests => 15; +use Test::More tests => 20; use FindBin; use lib $FindBin::RealBin; @@ -171,5 +171,6 @@ in master, before promotion # Run the test in both modes run_test('local'); run_test('remote'); +run_test('archive'); exit(0); diff --git a/src/bin/pg_rewind/t/002_databases.pl b/src/bin/pg_rewind/t/002_databases.pl index 1db534c0dc0..3027e09e0f4 100644 --- a/src/bin/pg_rewind/t/002_databases.pl +++ b/src/bin/pg_rewind/t/002_databases.pl @@ -1,7 +1,7 @@ use strict; use warnings; use TestLib; -use Test::More tests => 7; +use Test::More tests => 10; use FindBin; use lib $FindBin::RealBin; @@ -70,5 +70,6 @@ template1 # Run the test in both modes. 
run_test('local'); run_test('remote'); +run_test('archive'); exit(0); diff --git a/src/bin/pg_rewind/t/003_extrafiles.pl b/src/bin/pg_rewind/t/003_extrafiles.pl index f4710440fc3..bbad5ce9c94 100644 --- a/src/bin/pg_rewind/t/003_extrafiles.pl +++ b/src/bin/pg_rewind/t/003_extrafiles.pl @@ -3,7 +3,7 @@ use strict; use warnings; use TestLib; -use Test::More tests => 5; +use Test::More tests => 7; use File::Find; @@ -90,5 +90,6 @@ sub run_test # Run the test in both modes. run_test('local'); run_test('remote'); +run_test('archive'); exit(0); diff --git a/src/bin/pg_rewind/t/RewindTest.pm b/src/bin/pg_rewind/t/RewindTest.pm index 82fa220ac86..14cefe4517c 100644 --- a/src/bin/pg_rewind/t/RewindTest.pm +++ b/src/bin/pg_rewind/t/RewindTest.pm @@ -216,6 +216,46 @@ sub promote_standby return; } +# Moves every WAL segment out of $master_pgdata/pg_wal into a fresh archive +# directory under $tmp_dir; returns a restore_command that copies them back. +sub move_wal +{ + my ($tmp_dir, $master_pgdata) = @_; + my $wal_archive_path = "$tmp_dir/master_wal_archive"; + my $wal_path = "$master_pgdata/pg_wal"; + my $wal_dir; + my $restore_command; + + rmtree($wal_archive_path); + mkdir($wal_archive_path) or + die "mkdir($wal_archive_path) failed: $!"; + + # Move all old master WAL files (24 hex digit names) to the archive. + # Old master should be stopped at this point. + opendir($wal_dir, $wal_path) or + die "opendir($wal_path) failed: $!"; + while (my $wal_file = readdir($wal_dir)) + { + if ($wal_file =~ /^[0-9A-F]{24}$/) + { + move("$wal_path/$wal_file", "$wal_archive_path") or + die "move $wal_path/$wal_file -> $wal_archive_path failed: $!"; + } + } + closedir($wal_dir); + + if ($windows_os) + { + $restore_command = "copy $wal_archive_path\\\%f \%p"; + } + else + { + $restore_command = "cp $wal_archive_path/\%f \%p"; + } + + return $restore_command; +} + sub run_pg_rewind { my $test_mode = shift; @@ -227,10 +267,23 @@ sub run_pg_rewind # Append the rewind-specific role to the connection string. 
$standby_connstr = "$standby_connstr user=rewind_user"; - # Stop the master and be ready to perform the rewind. The cluster - # needs recovery to finish once, and pg_rewind makes sure that it - # happens automatically. - $node_master->stop('immediate'); + if ($test_mode eq 'archive' || $test_mode eq 'archive_conf') + { + # We test pg_rewind with restore_command by simply moving all WAL files + # to another location. It leads to failed ensureCleanShutdown + # execution. Since it is difficult to emulate a situation, when + # keeping the last WAL segment is enough for startup recovery, but + # not enough for successful pg_rewind run, we run these modes with + # --no-ensure-shutdown. So stop the master gracefully. + $node_master->stop; + } + else + { + # Stop the master and be ready to perform the rewind. The cluster + # needs recovery to finish once, and pg_rewind makes sure that it + # happens automatically. + $node_master->stop('immediate'); + } # At this point, the rewind processing is ready to run. # We now have a very simple scenario with a few diverged WAL record. @@ -284,6 +337,31 @@ sub run_pg_rewind $node_standby->safe_psql('postgres', "ALTER ROLE rewind_user WITH REPLICATION;"); } + elsif ($test_mode eq "archive") + { + + # Do rewind using a local pgdata as source and + # specified directory with target WAL archive. + my $restore_command = move_wal($tmp_folder, $master_pgdata); + + # Stop the new master and be ready to perform the rewind. + $node_standby->stop; + + # Add restore_command to postgresql.conf of target cluster. 
+ $node_master->append_conf("postgresql.conf", + "restore_command='$restore_command'"); + + command_ok( + [ + 'pg_rewind', + "--debug", + "--source-pgdata=$standby_pgdata", + "--target-pgdata=$master_pgdata", + "--no-sync", "--no-ensure-shutdown", + "-c" + ], + 'pg_rewind archive_conf'); + } else { diff --git a/src/common/Makefile b/src/common/Makefile index 44ca68fa6c3..b92c03c6331 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -86,6 +86,7 @@ endif # (Mkvcbuild.pm has a copy of this list, too) OBJS_FRONTEND = \ $(OBJS_COMMON) \ + fe_archive.o \ fe_memutils.o \ file_utils.o \ logging.o \ diff --git a/src/common/fe_archive.c b/src/common/fe_archive.c new file mode 100644 index 00000000000..34d2be86954 --- /dev/null +++ b/src/common/fe_archive.c @@ -0,0 +1,154 @@ +/*------------------------------------------------------------------------- + * + * fe_archive.c + * Routines to access WAL archive from frontend + * + * Copyright (c) 2020, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/common/fe_archive.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres_fe.h" + +#include +#include + +#include "access/xlog_internal.h" +#include "common/fe_archive.h" +#include "common/logging.h" + + +/* logging support */ +#define pg_fatal(...) do { pg_log_fatal(__VA_ARGS__); exit(1); } while(0) + +/* + * Attempt to retrieve the specified file from off-line archival storage. + * If successful return a file descriptor of the restored WAL file, else + * return -1. + * + * For fixed-size files, the caller may pass the expected size as an + * additional crosscheck on successful recovery. If the file size is not + * known, set expectedSize = 0. 
+ */
+int
+RestoreArchivedWALFile(const char *path, const char *xlogfname,
+					   off_t expectedSize, const char *restoreCommand)
+{
+	char		xlogpath[MAXPGPATH],
+				xlogRestoreCmd[MAXPGPATH],
+			   *dp,
+			   *endp;
+	const char *sp;
+	int			rc,
+				xlogfd;
+	struct stat stat_buf;
+
+	snprintf(xlogpath, MAXPGPATH, "%s/" XLOGDIR "/%s", path, xlogfname);
+
+	/*
+	 * Expand restoreCommand into xlogRestoreCmd, truncating at MAXPGPATH - 1.
+	 */
+	dp = xlogRestoreCmd;
+	endp = xlogRestoreCmd + MAXPGPATH - 1;
+	*endp = '\0';
+
+	for (sp = restoreCommand; *sp; sp++)
+	{
+		if (*sp == '%')
+		{
+			switch (sp[1])
+			{
+				case 'p':
+					/* %p: relative path of target file */
+					sp++;
+					StrNCpy(dp, xlogpath, endp - dp);
+					make_native_path(dp);
+					dp += strlen(dp);
+					break;
+				case 'f':
+					/* %f: filename of desired file */
+					sp++;
+					StrNCpy(dp, xlogfname, endp - dp);
+					dp += strlen(dp);
+					break;
+				case 'r':
+					/* %r: filename of last restartpoint; not supported here */
+					pg_fatal("restore_command with %%r cannot be used with pg_rewind.");
+					break;
+				case '%':
+					/* convert %% to a single % */
+					sp++;
+					if (dp < endp)
+						*dp++ = *sp;
+					break;
+				default:
+					/* otherwise treat the % as not special */
+					if (dp < endp)
+						*dp++ = *sp;
+					break;
+			}
+		}
+		else
+		{
+			if (dp < endp)
+				*dp++ = *sp;
+		}
+	}
+	*dp = '\0';
+
+	/*
+	 * Execute restore_command, which should copy the missing WAL file from
+	 * archival storage.
+	 */
+	rc = system(xlogRestoreCmd);
+
+	if (rc == 0)
+	{
+		/*
+		 * Command apparently succeeded, but let's make sure the file is
+		 * really there now and has the correct size.
+		 */
+		if (stat(xlogpath, &stat_buf) == 0)
+		{
+			if (expectedSize > 0 && stat_buf.st_size != expectedSize)
+			{
+				/* stat() succeeded, so errno holds nothing worth reporting */
+				pg_log_error("archive file \"%s\" has wrong size: %lu instead of %lu",
+							 xlogfname, (unsigned long) stat_buf.st_size,
+							 (unsigned long) expectedSize);
+			}
+			else
+			{
+				xlogfd = open(xlogpath, O_RDONLY | PG_BINARY, 0);
+
+				if (xlogfd < 0)
+					pg_log_error("could not open file \"%s\" restored from archive: %s",
+								 xlogpath, strerror(errno));
+				else
+					return xlogfd;
+			}
+		}
+		else
+		{
+			/* Stat failed */
+			pg_log_error("could not stat file \"%s\" restored from archive: %s",
+						 xlogpath, strerror(errno));
+		}
+	}
+
+	/*
+	 * If restore_command was terminated by a signal, reporting 'could not
+	 * restore file...' and returning to the caller would be misleading, so
+	 * exit right away.
+	 */
+	if (wait_result_is_any_signal(rc, false))
+		pg_fatal("restore_command failed due to the signal: %s",
+				 wait_result_to_str(rc));
+
+	pg_log_error("could not restore file \"%s\" from archive", xlogfname);
+	return -1;
+}
diff --git a/src/include/common/fe_archive.h b/src/include/common/fe_archive.h
new file mode 100644
index 00000000000..7e9d140ea8f
--- /dev/null
+++ b/src/include/common/fe_archive.h
@@ -0,0 +1,18 @@
+/*-------------------------------------------------------------------------
+ *
+ * fe_archive.h
+ *	  Routines to access WAL archive from frontend
+ *
+ * Copyright (c) 2020, PostgreSQL Global Development Group
+ *
+ * src/include/common/fe_archive.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef FE_ARCHIVE_H
+#define FE_ARCHIVE_H
+
+extern int	RestoreArchivedWALFile(const char *path, const char *xlogfname,
+								   off_t expectedSize, const char *restoreCommand);
+
+#endif							/* FE_ARCHIVE_H */