From 47511afed5f8acf92abaf1cd6fcfecc1faea9c87 Mon Sep 17 00:00:00 2001 From: Kyotaro Horiguchi Date: Fri, 28 Feb 2020 15:52:58 +0900 Subject: [PATCH v3] Make End-Of-Recovery error less scary When recovery in any type ends, we see a bit scary error message like "invalid record length" that suggests something serious is happening. Make this message less scary as "reached end of WAL". --- src/backend/access/transam/xlog.c | 69 ++++++++++++++++++------- src/backend/access/transam/xlogreader.c | 9 ++++ src/backend/replication/walreceiver.c | 11 ++-- 3 files changed, 67 insertions(+), 22 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 793c076da6..6c2924dfb7 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -4283,12 +4283,15 @@ ReadRecord(XLogReaderState *xlogreader, int emode, for (;;) { char *errormsg; + XLogRecPtr ErrRecPtr = InvalidXLogRecPtr; record = XLogReadRecord(xlogreader, &errormsg); ReadRecPtr = xlogreader->ReadRecPtr; EndRecPtr = xlogreader->EndRecPtr; if (record == NULL) { + ErrRecPtr = ReadRecPtr ? ReadRecPtr : EndRecPtr; + if (readFile >= 0) { close(readFile); @@ -4296,13 +4299,13 @@ ReadRecord(XLogReaderState *xlogreader, int emode, } /* - * We only end up here without a message when XLogPageRead() - * failed - in that case we already logged something. In - * StandbyMode that only happens if we have been triggered, so we - * shouldn't loop anymore in that case. + * If we are fetching checkpoint, we emit the error message right + * now. Otherwise the error is regarded as "end of WAL" and the + * message if any is shown as a part of the end-of-WAL message + * below. */ - if (errormsg) - ereport(emode_for_corrupt_record(emode, EndRecPtr), + if (fetching_ckpt && errormsg) + ereport(emode_for_corrupt_record(emode, ErrRecPtr), (errmsg_internal("%s", errormsg) /* already translated */ )); } @@ -4333,11 +4336,12 @@ ReadRecord(XLogReaderState *xlogreader, int emode, /* Great, got a record */ return record; } - else - { - /* No valid record available from this source */ - lastSourceFailed = true; + /* No valid record available from this source */ + lastSourceFailed = true; + + if (!fetching_ckpt) + { /* * If archive recovery was requested, but we were still doing * crash recovery, switch to archive recovery and retry using the @@ -4350,11 +4354,18 @@ ReadRecord(XLogReaderState *xlogreader, int emode, * we'd have no idea how far we'd have to replay to reach * consistency. So err on the safe side and give up. */ - if (!InArchiveRecovery && ArchiveRecoveryRequested && - !fetching_ckpt) + if (!InArchiveRecovery && ArchiveRecoveryRequested) { + /* + * We don't report this as LOG, since we don't stop recovery + * here + */ ereport(DEBUG1, - (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); + (errmsg_internal("reached end of WAL at %X/%X on timeline %u in %s during crash recovery, entering archive recovery", + (uint32) (ErrRecPtr >> 32), + (uint32) ErrRecPtr, + ThisTimeLineID, + xlogSourceNames[currentSource]))); InArchiveRecovery = true; if (StandbyModeRequested) StandbyMode = true; @@ -4392,12 +4403,34 @@ ReadRecord(XLogReaderState *xlogreader, int emode, continue; } - /* In standby mode, loop back to retry. Otherwise, give up. */ - if (StandbyMode && !CheckForStandbyTrigger()) - continue; - else - return NULL; + /* + * We reached the end of WAL, show the messages just once at the + * same LSN. + */ + if (emode_for_corrupt_record(LOG, ErrRecPtr) == LOG) + { + char *fmt; + + if (StandbyMode) + fmt = gettext_noop("reached end of WAL at %X/%X on timeline %u in %s during standby mode"); + else if (InArchiveRecovery) + fmt = gettext_noop("reached end of WAL at %X/%X on timeline %u in %s during archive recovery"); + else + fmt = gettext_noop("reached end of WAL at %X/%X on timeline %u in %s during crash recovery"); + + ereport(LOG, + (errmsg(fmt, (uint32) (EndRecPtr >> 32), + (uint32) EndRecPtr, ThisTimeLineID, + xlogSourceNames[currentSource]), + (errormsg ? errdetail_internal("%s", errormsg) : 0))); + } } + + /* In standby mode, loop back to retry. Otherwise, give up. */ + if (StandbyMode && !CheckForStandbyTrigger()) + continue; + else + return NULL; } } diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index 32f02256ed..9ea1305364 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -682,6 +682,15 @@ ValidXLogRecordHeader(XLogReaderState *state, XLogRecPtr RecPtr, XLogRecPtr PrevRecPtr, XLogRecord *record, bool randAccess) { + + if (record->xl_tot_len == 0) + { + /* This is strictly not an invalid state, so phrase it as so. */ + report_invalid_record(state, + "record length is 0 at %X/%X", + (uint32) (RecPtr >> 32), (uint32) RecPtr); + return false; + } if (record->xl_tot_len < SizeOfXLogRecord) { report_invalid_record(state, diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 25e0333c9e..da978d4047 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -479,12 +479,15 @@ WalReceiverMain(void) else if (len < 0) { ereport(LOG, - (errmsg("replication terminated by primary server"), - errdetail("End of WAL reached on timeline %u at %X/%X.", + (errmsg("replication terminated by primary server on timeline %u at %X/%X.", startpointTLI, (uint32) (LogstreamResult.Write >> 32), (uint32) LogstreamResult.Write))); - endofwal = true; - break; + + /* + * we have no longer anything to do on the broken + * connection other than exiting. + */ + proc_exit(1); } len = walrcv_receive(wrconn, &buf, &wait_fd); } -- 2.18.2