Cap the number of times recovery will re-read the same invalid WAL record.

Adds invalidRecordCount and MAX_INVALID_RECORD_COUNT (5) plus a helper,
TallyInvalidRecord(), which ORs the failing source into failedSources and,
for any source other than XLOG_FROM_PG_XLOG, raises ERROR once the count
reaches the limit.  ReadRecord() zeroes the count on its first try, and the
two next_record_is_invalid labels touched here call the helper instead of
updating failedSources directly.

NOTE(review): this patch had lost its line breaks; they have been restored
from the hunk headers.  Leading indentation is reconstructed on the
assumption that the target file indents with tabs -- verify against the tree,
or apply with whitespace tolerance.

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 7165a3e..fd8520a 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -488,6 +488,13 @@ static int readSource = 0; /* XLOG_FROM_* code */
  * record from and failed.
  */
 static int failedSources = 0;
+static int invalidRecordCount = 0;
+
+/*
+ * Number of times we're willing to find the same record invalid before
+ * deciding it is hopeless.
+ */
+#define MAX_INVALID_RECORD_COUNT 5
 
 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 static char *readBuf = NULL;
@@ -577,6 +584,7 @@ static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
+static void TallyInvalidRecord(int source);
 static void CheckRecoveryConsistency(void);
 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
@@ -3691,6 +3699,7 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
 
 	/* This is the first try to read this page. */
 	failedSources = 0;
+	invalidRecordCount = 0;
 retry:
 	/* Read the page containing the record */
 	if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
@@ -3924,7 +3933,7 @@ retry:
 	return (XLogRecord *) buffer;
 
 next_record_is_invalid:
-	failedSources |= readSource;
+	TallyInvalidRecord(readSource);
 
 	if (readFile >= 0)
 	{
@@ -3940,6 +3949,26 @@ next_record_is_invalid:
 }
 
 /*
+ * If we keep rereading the same WAL location and the record at that location
+ * keeps being invalid, abort recovery. Administrator intervention is
+ * required; continuing to reread the same data won't help. This will shut
+ * down Hot Standby if it's running, but a standby which is unrecoverably out
+ * of sync with the master is not what most people want.
+ */
+static void
+TallyInvalidRecord(int source)
+{
+	failedSources |= source;
+	if (source != XLOG_FROM_PG_XLOG)
+	{
+		if (++invalidRecordCount >= MAX_INVALID_RECORD_COUNT)
+			ereport(ERROR,
+					(errmsg("invalid record in WAL stream"),
+					 errhint("Take a new base backup, or remove recovery.conf and restart in read-write mode.")));
+	}
+}
+
+/*
  * Check whether the xlog header of a page just read in looks valid.
  *
  * This is just a convenience subroutine to avoid duplicated code in
@@ -9203,7 +9232,7 @@ StartupProcessMain(void)
  * In standby mode, if after a successful return of XLogPageRead() the
  * caller finds the record it's interested in to be broken, it should
  * ereport the error with the level determined by
- * emode_for_corrupt_record(), and then set "failedSources |= readSource"
+ * emode_for_corrupt_record(), and then call TallyInvalidRecord(readSource)
  * and call XLogPageRead() again with the same arguments. This lets
  * XLogPageRead() to try fetching the record from another source, or to
  * sleep and retry.
@@ -9275,9 +9304,8 @@ retry:
 					/*
 					 * If we find an invalid record in the WAL streamed from
 					 * master, something is seriously wrong. There's little
-					 * chance that the problem will just go away, but PANIC
-					 * is not good for availability either, especially in
-					 * hot standby mode. Disconnect, and retry from
+					 * chance that the problem will just go away, but we have
+					 * to do something. Disconnect, and retry from
 					 * archive/pg_xlog again. The WAL in the archive should
 					 * be identical to what was streamed, so it's unlikely
 					 * that it helps, but one can hope...
@@ -9513,7 +9541,7 @@ retry:
 	return true;
 
 next_record_is_invalid:
-	failedSources |= readSource;
+	TallyInvalidRecord(readSource);
 
 	if (readFile >= 0)
 		close(readFile);