commit c1aa5f67df052467dd6d67cdf5dbd5388ecefb1f Author: Anastasia Date: Thu Dec 26 17:30:33 2019 +0300 Subject: [PATCH v7 3/3] Fix replay of create database records on standby Crash recovery on standby may encounter missing directories when replaying create database WAL records. Prior to this patch, the standby would fail to recover in such a case. However, the directories could be legitimately missing. Consider a sequence of WAL records as follows: CREATE DATABASE, DROP DATABASE, DROP TABLESPACE. If, after replaying the last WAL record and removing the tablespace directory, the standby crashes and has to replay the create database record again, crash recovery must be able to move on. This patch adds a mechanism, similar to the invalid page hash table, to track missing directories during crash recovery. If all the missing directory references are matched with corresponding drop records at the end of crash recovery, the standby can safely enter archive recovery. Bug identified by Paul. Authored by Paul, Kyotaro and Asim R P.
diff --git a/src/backend/access/rmgrdesc/dbasedesc.c b/src/backend/access/rmgrdesc/dbasedesc.c index d08c575..4858ce6 100644 --- a/src/backend/access/rmgrdesc/dbasedesc.c +++ b/src/backend/access/rmgrdesc/dbasedesc.c @@ -23,14 +23,17 @@ dbase_desc(StringInfo buf, XLogReaderState *record) { char *rec = XLogRecGetData(record); uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + char *dbpath1, *dbpath2; if (info == XLOG_DBASE_CREATE) { xl_dbase_create_rec *xlrec = (xl_dbase_create_rec *) rec; - appendStringInfo(buf, "copy dir %u/%u to %u/%u", - xlrec->src_tablespace_id, xlrec->src_db_id, - xlrec->tablespace_id, xlrec->db_id); + dbpath1 = GetDatabasePath(xlrec->src_db_id, xlrec->src_tablespace_id); + dbpath2 = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id); + appendStringInfo(buf, "copy dir %s to %s", dbpath1, dbpath2); + pfree(dbpath2); + pfree(dbpath1); } else if (info == XLOG_DBASE_DROP) { @@ -39,8 +42,11 @@ dbase_desc(StringInfo buf, XLogReaderState *record) appendStringInfo(buf, "dir"); for (i = 0; i < xlrec->ntablespaces; i++) - appendStringInfo(buf, " %u/%u", - xlrec->tablespace_ids[i], xlrec->db_id); + { + dbpath1 = GetDatabasePath(xlrec->db_id, xlrec->tablespace_ids[i]); + appendStringInfo(buf, "%s", dbpath1); + pfree(dbpath1); + } } } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 5658971..ea6661e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7890,6 +7890,12 @@ CheckRecoveryConsistency(void) */ XLogCheckInvalidPages(); + /* + * Check if the XLOG sequence contained any unresolved references to + * missing directories. 
+ */ + XLogCheckMissingDirs(); + reachedConsistency = true; ereport(LOG, (errmsg("consistent recovery state reached at %X/%X", diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 14efbf3..1417707 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -56,6 +56,136 @@ typedef struct xl_invalid_page static HTAB *invalid_page_tab = NULL; +/* + * If a create database WAL record is being replayed more than once during + * crash recovery on a standby, it is possible that either the tablespace + * directory or the template database directory is missing. This happens when + * the directories are removed by replay of subsequent drop records. Note + * that this problem happens only on standby and not on master. On master, a + * checkpoint is created at the end of create database operation. On standby, + * however, such a strategy (creating restart points during replay) is not + * viable because it will slow down WAL replay. + * + * The alternative is to track references to each missing directory + * encountered when performing crash recovery in the following hash table. + * Similar to invalid page table above, the expectation is that each missing + * directory entry should be matched with a drop database or drop tablespace + * WAL record by the end of crash recovery. + */ +typedef struct xl_missing_dir_key +{ + Oid spcNode; + Oid dbNode; +} xl_missing_dir_key; + +typedef struct xl_missing_dir +{ + xl_missing_dir_key key; + char path[MAXPGPATH]; +} xl_missing_dir; + +static HTAB *missing_dir_tab = NULL; + +void +XLogLogMissingDir(Oid spcNode, Oid dbNode, char *path) +{ + xl_missing_dir_key key; + bool found; + xl_missing_dir *entry; + + /* + * Database OID may be invalid but tablespace OID must be valid. If + * dbNode is InvalidOid, we are logging a missing tablespace directory, + * otherwise we are logging a missing database directory. 
+ */ + Assert(OidIsValid(spcNode)); + + if (reachedConsistency) + elog(PANIC, "cannot find directory %s tablespace %d database %d", + path, spcNode, dbNode); + + if (missing_dir_tab == NULL) + { + /* create hash table when first needed */ + HASHCTL ctl; + + memset(&ctl, 0, sizeof(ctl)); + ctl.keysize = sizeof(xl_missing_dir_key); + ctl.entrysize = sizeof(xl_missing_dir); + + missing_dir_tab = hash_create("XLOG missing directory table", + 100, + &ctl, + HASH_ELEM | HASH_BLOBS); + } + + key.spcNode = spcNode; + key.dbNode = dbNode; + + entry = hash_search(missing_dir_tab, &key, HASH_ENTER, &found); + + if (found) + elog(DEBUG2, "missing directory %s tablespace %d database %d already exists: %s", + path, spcNode, dbNode, entry->path); + else + { + strlcpy(entry->path, path, sizeof(entry->path)); + elog(DEBUG2, "logged missing dir %s tablespace %d database %d", + path, spcNode, dbNode); + } +} + +void +XLogForgetMissingDir(Oid spcNode, Oid dbNode, char *path) +{ + xl_missing_dir_key key; + + key.spcNode = spcNode; + key.dbNode = dbNode; + + /* Database OID may be invalid but tablespace OID must be valid. */ + Assert(OidIsValid(spcNode)); + + if (missing_dir_tab == NULL) + return; + + if (hash_search(missing_dir_tab, &key, HASH_REMOVE, NULL) == NULL) + elog(DEBUG2, "dir %s tablespace %d database %d is not missing", + path, spcNode, dbNode); + else + elog(DEBUG2, "forgot missing dir %s for tablespace %d database %d", + path, spcNode, dbNode); +} + +/* + * This is called at the end of crash recovery, before entering archive + * recovery on a standby. PANIC if the hash table is not empty. 
+ */ +void +XLogCheckMissingDirs(void) +{ + HASH_SEQ_STATUS status; + xl_missing_dir *hentry; + bool foundone = false; + + if (missing_dir_tab == NULL) + return; /* nothing to do */ + + hash_seq_init(&status, missing_dir_tab); + + while ((hentry = (xl_missing_dir *) hash_seq_search(&status)) != NULL) + { + elog(WARNING, "missing directory \"%s\" tablespace %d database %d", + hentry->path, hentry->key.spcNode, hentry->key.dbNode); + foundone = true; + } + + if (foundone) + elog(PANIC, "WAL contains references to missing directories"); + + hash_destroy(missing_dir_tab); + missing_dir_tab = NULL; +} /* Report a reference to an invalid page */ static void diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index da0e5d8..b7cbb88 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -46,6 +46,7 @@ #include "commands/defrem.h" #include "commands/seclabel.h" #include "commands/tablespace.h" +#include "common/file_perm.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "pgstat.h" @@ -2185,7 +2186,9 @@ dbase_redo(XLogReaderState *record) xl_dbase_create_rec *xlrec = (xl_dbase_create_rec *) XLogRecGetData(record); char *src_path; char *dst_path; + char *parent_path; struct stat st; + bool skip = false; src_path = GetDatabasePath(xlrec->src_db_id, xlrec->src_tablespace_id); dst_path = GetDatabasePath(xlrec->db_id, xlrec->tablespace_id); @@ -2203,6 +2206,54 @@ dbase_redo(XLogReaderState *record) (errmsg("some useless files may be left behind in old database directory \"%s\"", dst_path))); } + else + { + /* + * It is possible that drop tablespace record appearing later in + * the WAL as already been replayed. That means we are replaying + * the create database record second time, as part of crash + * recovery. In that case, the tablespace directory has already + * been removed and the create database operation cannot be + * replayed. 
We should skip the replay but remember the missing + * tablespace directory, to be matched with a drop tablespace + * record later. + */ + parent_path = pstrdup(dst_path); + get_parent_directory(parent_path); + if (!(stat(parent_path, &st) == 0 && S_ISDIR(st.st_mode))) + { + XLogLogMissingDir(xlrec->tablespace_id, InvalidOid, dst_path); + skip = true; + ereport(WARNING, + (errmsg("skipping create database WAL record"), + errdetail("Target tablespace \"%s\" not found. We " + "expect to encounter a WAL record that " + "removes this directory before reaching " + "consistent state.", parent_path))); + } + pfree(parent_path); + } + + /* + * Source directory may be missing. E.g. the template database used + * for creating this database may have been dropped, due to reasons + * noted above. Moving a database from one tablespace may also be a + * partner in the crime. + */ + if (!(stat(src_path, &st) == 0 && S_ISDIR(st.st_mode))) + { + XLogLogMissingDir(xlrec->src_tablespace_id, xlrec->src_db_id, src_path); + skip = true; + ereport(WARNING, + (errmsg("skipping create database WAL record"), + errdetail("Source database \"%s\" not found. 
We expect " + "to encounter a WAL record that removes this " + "directory before reaching consistent state.", + src_path))); + } + + if (skip) + return; /* * Force dirty buffers out to disk, to ensure source database is @@ -2260,6 +2311,9 @@ dbase_redo(XLogReaderState *record) ereport(WARNING, (errmsg("some useless files may be left behind in old database directory \"%s\"", dst_path))); + + XLogForgetMissingDir(xlrec->tablespace_ids[i], xlrec->db_id, dst_path); + pfree(dst_path); } diff --git a/src/backend/commands/tablespace.c b/src/backend/commands/tablespace.c index 570dcb2..e4f6aad 100644 --- a/src/backend/commands/tablespace.c +++ b/src/backend/commands/tablespace.c @@ -58,6 +58,7 @@ #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" +#include "access/xlogutils.h" #include "catalog/catalog.h" #include "catalog/dependency.h" #include "catalog/indexing.h" @@ -1516,6 +1517,8 @@ tblspc_redo(XLogReaderState *record) { xl_tblspc_drop_rec *xlrec = (xl_tblspc_drop_rec *) XLogRecGetData(record); + XLogForgetMissingDir(xlrec->ts_id, InvalidOid, ""); + /* * If we issued a WAL record for a drop tablespace it implies that * there were no files in it at all when the DROP was done. 
That means diff --git a/src/include/access/xlogutils.h b/src/include/access/xlogutils.h index 0572b24..938a35f 100644 --- a/src/include/access/xlogutils.h +++ b/src/include/access/xlogutils.h @@ -23,6 +23,10 @@ extern void XLogDropDatabase(Oid dbid); extern void XLogTruncateRelation(RelFileNode rnode, ForkNumber forkNum, BlockNumber nblocks); +extern void XLogLogMissingDir(Oid spcNode, Oid dbNode, char *path); +extern void XLogForgetMissingDir(Oid spcNode, Oid dbNode, char *path); +extern void XLogCheckMissingDirs(void); + /* Result codes for XLogReadBufferForRedo[Extended] */ typedef enum { diff --git a/src/include/commands/dbcommands.h b/src/include/commands/dbcommands.h index d1e91a2..9d321e7 100644 --- a/src/include/commands/dbcommands.h +++ b/src/include/commands/dbcommands.h @@ -19,6 +19,8 @@ #include "lib/stringinfo.h" #include "nodes/parsenodes.h" +extern void CheckMissingDirs4DbaseRedo(void); + extern Oid createdb(ParseState *pstate, const CreatedbStmt *stmt); extern void dropdb(const char *dbname, bool missing_ok, bool force); extern void DropDatabase(ParseState *pstate, DropdbStmt *stmt);