From 4f96baaece1cff35f89893d8e7aa1fdf57435f53 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Date: Tue, 8 Feb 2022 16:42:53 +0900
Subject: [PATCH v10 1/5] Get rid of unused path to handle concurrent
 checkpoints

CreateRestartPoint used to handle the case where a concurrent checkpoint
is running, but 7ff23c6d27 eliminated the possibility of multiple
checkpoints running simultaneously.  That code path, if it were ever
taken, could leave the database unrecoverable by removing WAL segments
that are still required by the last established restartpoint.

In passing, tighten and tidy up the code in the function in a few places
so that it is easier to read.
---
 src/backend/access/transam/xlog.c     | 67 ++++++++++++++++-----------
 src/backend/postmaster/checkpointer.c |  1 -
 2 files changed, 41 insertions(+), 27 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 958220c495..8c2882b49f 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -9659,6 +9659,9 @@ CreateRestartPoint(int flags)
 	XLogSegNo	_logSegNo;
 	TimestampTz xtime;

+	/* A concurrent checkpoint/restartpoint is not expected to be running. */
+	Assert (!IsUnderPostmaster || MyBackendType == B_CHECKPOINTER);
+
 	/* Get a local copy of the last safe checkpoint record. */
 	SpinLockAcquire(&XLogCtl->info_lck);
 	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
@@ -9724,7 +9727,7 @@ CreateRestartPoint(int flags)

 	/* Also update the info_lck-protected copy */
 	SpinLockAcquire(&XLogCtl->info_lck);
-	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
+	XLogCtl->RedoRecPtr = RedoRecPtr;
 	SpinLockRelease(&XLogCtl->info_lck);

 	/*
@@ -9743,7 +9746,12 @@ CreateRestartPoint(int flags)
 	/* Update the process title */
 	update_checkpoint_display(flags, true, false);

-	CheckPointGuts(lastCheckPoint.redo, flags);
+	CheckPointGuts(RedoRecPtr, flags);
+
+	/*
+	 * Update pg_control, using current time.
+	 */
+	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

 	/*
 	 * Remember the prior checkpoint's redo ptr for
@@ -9751,30 +9759,23 @@ CreateRestartPoint(int flags)
 	 */
 	PriorRedoPtr = ControlFile->checkPointCopy.redo;

+	ControlFile->checkPoint = lastCheckPointRecPtr;
+	ControlFile->checkPointCopy = lastCheckPoint;
+
 	/*
-	 * Update pg_control, using current time.  Check that it still shows
-	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
-	 * this is a quick hack to make sure nothing really bad happens if somehow
-	 * we get here after the end-of-recovery checkpoint.
+	 * Ensure minRecoveryPoint is past the checkpoint record while archive
+	 * recovery is still ongoing.  Normally, this will have happened already
+	 * while writing out dirty buffers, but not necessarily - e.g. because no
+	 * buffers were dirtied.  We do this because a non-exclusive base backup
+	 * uses minRecoveryPoint to determine which WAL files must be included in
+	 * the backup, and the file (or files) containing the checkpoint record
+	 * must be included, at a minimum. Note that for an ordinary restart of
+	 * recovery there's no value in having the minimum recovery point any
+	 * earlier than this anyway, because redo will begin just after the
+	 * checkpoint record.
 	 */
-	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
-		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
+	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
 	{
-		ControlFile->checkPoint = lastCheckPointRecPtr;
-		ControlFile->checkPointCopy = lastCheckPoint;
-
-		/*
-		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
-		 * this will have happened already while writing out dirty buffers,
-		 * but not necessarily - e.g. because no buffers were dirtied.  We do
-		 * this because a non-exclusive base backup uses minRecoveryPoint to
-		 * determine which WAL files must be included in the backup, and the
-		 * file (or files) containing the checkpoint record must be included,
-		 * at a minimum. Note that for an ordinary restart of recovery there's
-		 * no value in having the minimum recovery point any earlier than this
-		 * anyway, because redo will begin just after the checkpoint record.
-		 */
 		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
 		{
 			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
@@ -9786,15 +9787,29 @@ CreateRestartPoint(int flags)
 		}
 		if (flags & CHECKPOINT_IS_SHUTDOWN)
 			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
-		UpdateControlFile();
 	}
+	else
+	{
+		/*
+		 * Archive recovery has ended.  From here on, crash recovery should
+		 * always recover to the end of WAL.
+		 */
+		ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
+		ControlFile->minRecoveryPointTLI = 0;
+	}
+	UpdateControlFile();
 	LWLockRelease(ControlFileLock);

 	/*
 	 * Update the average distance between checkpoints/restartpoints if the
 	 * prior checkpoint exists.
 	 */
-	if (PriorRedoPtr != InvalidXLogRecPtr)
+
+	/*
+	 * The second condition below is only a safeguard: skip the update unless
+	 * the new redo pointer is actually ahead of the prior one.
+	 */
+	if (PriorRedoPtr != InvalidXLogRecPtr && RedoRecPtr > PriorRedoPtr)
 		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

 	/*
@@ -9864,7 +9879,7 @@ CreateRestartPoint(int flags)
 	xtime = GetLatestXTime();
 	ereport((log_checkpoints ? LOG : DEBUG2),
 			(errmsg("recovery restart point at %X/%X",
-					LSN_FORMAT_ARGS(lastCheckPoint.redo)),
+					LSN_FORMAT_ARGS(RedoRecPtr)),
 			 xtime ? errdetail("Last completed transaction was at log time %s.",
 							   timestamptz_to_str(xtime)) : 0));

diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c
index 23f691cd47..8bd4c47b6b 100644
--- a/src/backend/postmaster/checkpointer.c
+++ b/src/backend/postmaster/checkpointer.c
@@ -386,7 +386,6 @@ CheckpointerMain(void)

 			/* Check if we should perform a checkpoint or a restartpoint. */
 			do_restartpoint = RecoveryInProgress();
-
 			/*
 			 * Atomically fetch the request flags to figure out what kind of a
 			 * checkpoint we should perform, and increase the started-counter
-- 
2.27.0