From 834235add60df8203c243fa30e78cfa62619076b Mon Sep 17 00:00:00 2001 From: Jehan-Guillaume de Rorthais Date: Wed, 1 Apr 2020 16:36:06 +0200 Subject: [PATCH 1/2] Fix WAL retention during production crash recovery During crash recovery of a production cluster with archive_mode=on, XLogArchiveCheckDone() considered the cluster to be in recovery without archive_mode=always, and therefore treated its WAL as not needing to be archived. Because of this, non-archived WAL segments and their related .ready files were recycled or removed. --- src/backend/access/transam/xlog.c | 25 +++++++++++++++++------- src/backend/access/transam/xlogarchive.c | 7 ++++--- src/include/access/xlog.h | 9 +++++++++ 3 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 977d448f50..55d06a8704 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -219,8 +219,8 @@ static TimeLineID receiveTLI = 0; static bool lastFullPageWrites; /* - * Local copy of SharedRecoveryInProgress variable. True actually means "not - * known, need to check the shared state". + * This is false when SharedRecoveryInProgress is NOT_IN_RECOVERY. + * True actually means "not known, need to check the shared state". */ static bool LocalRecoveryInProgress = true; @@ -653,10 +653,10 @@ typedef struct XLogCtlData TimeLineID PrevTimeLineID; /* - * SharedRecoveryInProgress indicates if we're still in crash or archive + * SharedRecoveryInProgress indicates if we're either in crash or archive * recovery. Protected by info_lck. */ - bool SharedRecoveryInProgress; + RecoveryState SharedRecoveryInProgress; /* * SharedHotStandbyActive indicates if we allow hot standby queries to be @@ -5131,7 +5131,7 @@ XLOGShmemInit(void) * in additional info.) 
*/ XLogCtl->XLogCacheBlck = XLOGbuffers - 1; - XLogCtl->SharedRecoveryInProgress = true; + XLogCtl->SharedRecoveryInProgress = IN_ARCHIVE_RECOVERY; XLogCtl->SharedHotStandbyActive = false; XLogCtl->SharedPromoteIsTriggered = false; XLogCtl->WalWriterSleeping = false; @@ -6901,6 +6901,9 @@ StartupXLOG(void) ControlFile->checkPointCopy.ThisTimeLineID, recoveryTargetTLI))); ControlFile->state = DB_IN_CRASH_RECOVERY; + SpinLockAcquire(&XLogCtl->info_lck); + XLogCtl->SharedRecoveryInProgress = IN_CRASH_RECOVERY; + SpinLockRelease(&XLogCtl->info_lck); } ControlFile->checkPoint = checkPointLoc; ControlFile->checkPointCopy = checkPoint; @@ -7928,7 +7931,7 @@ StartupXLOG(void) ControlFile->time = (pg_time_t) time(NULL); SpinLockAcquire(&XLogCtl->info_lck); - XLogCtl->SharedRecoveryInProgress = false; + XLogCtl->SharedRecoveryInProgress = NOT_IN_RECOVERY; SpinLockRelease(&XLogCtl->info_lck); UpdateControlFile(); @@ -8074,7 +8077,7 @@ RecoveryInProgress(void) */ volatile XLogCtlData *xlogctl = XLogCtl; - LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress; + LocalRecoveryInProgress = (xlogctl->SharedRecoveryInProgress != NOT_IN_RECOVERY); /* * Initialize TimeLineID and RedoRecPtr when we discover that recovery @@ -8103,6 +8106,25 @@ RecoveryInProgress(void) } } +/* + * GetRecoveryState -- return the current shared recovery state. + * + * Unlike RecoveryInProgress(), this distinguishes crash recovery from + * archive recovery. The value is read under info_lck, which protects + * SharedRecoveryInProgress. + */ +RecoveryState +GetRecoveryState(void) +{ + RecoveryState retval; + + SpinLockAcquire(&XLogCtl->info_lck); + retval = XLogCtl->SharedRecoveryInProgress; + SpinLockRelease(&XLogCtl->info_lck); + + return retval; +} + /* * Is HotStandby active yet? 
This is only important in special backends * since normal backends won't ever be able to connect until this returns diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c index d62c12310a..25666cdbcf 100644 --- a/src/backend/access/transam/xlogarchive.c +++ b/src/backend/access/transam/xlogarchive.c @@ -572,7 +572,7 @@ XLogArchiveCheckDone(const char *xlog) { char archiveStatusPath[MAXPGPATH]; struct stat stat_buf; - bool inRecovery = RecoveryInProgress(); + RecoveryState inRecoveryState = GetRecoveryState(); /* * The file is always deletable if archive_mode is "off". On standbys @@ -580,8 +580,13 @@ XLogArchiveCheckDone(const char *xlog) * "always". On a primary, archiving is enabled if archive_mode is "on" * or "always". */ - if (!((XLogArchivingActive() && !inRecovery) || - (XLogArchivingAlways() && inRecovery))) + /* + * Crash recovery is deliberately treated like a primary here: only + * archive recovery restricts archiving to archive_mode "always". + */ + if (!XLogArchivingActive() || + (inRecoveryState == IN_ARCHIVE_RECOVERY && + !XLogArchivingAlways())) return true; /* First check for .done --- this means archiver is done with it */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 9ec7b31cce..d8b08d6d17 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -165,6 +165,14 @@ typedef enum WalLevel WAL_LEVEL_LOGICAL } WalLevel; +/* Recovery state */ +typedef enum RecoveryState +{ + NOT_IN_RECOVERY = 0, + IN_CRASH_RECOVERY, + IN_ARCHIVE_RECOVERY +} RecoveryState; + extern PGDLLIMPORT int wal_level; /* Is WAL archiving enabled (always or only while server is running normally)? */ @@ -278,6 +286,7 @@ extern const char *xlog_identify(uint8 info); extern void issue_xlog_fsync(int fd, XLogSegNo segno); extern bool RecoveryInProgress(void); +extern RecoveryState GetRecoveryState(void); extern bool HotStandbyActive(void); extern bool HotStandbyActiveInReplay(void); extern bool XLogInsertAllowed(void); -- 2.20.1