From c926abcb1171cc99fec36e189d46b27afda3cc92 Mon Sep 17 00:00:00 2001
From: Julien Rouhaud
Date: Mon, 4 Nov 2019 08:40:23 +0100
Subject: [PATCH v14 1/2] Add backend infrastructure to check the validity of
an on-disk block.
A new CheckBuffer function is introduced. It takes care of the various locking
aspects to make sure that no false positive can be returned.
Author: Julien Rouhaud
Reviewed-by: Michael Paquier, Masahiko Sawada, Justin Pryzby
Discussion: https://postgr.es/m/CAOBaU_aVvMjQn%3Dge5qPiJOPMmOj5%3Dii3st5Q0Y%2BWuLML5sR17w%40mail.gmail.com
---
src/backend/storage/buffer/bufmgr.c | 216 ++++++++++++++++++++++++++++
src/include/storage/bufmgr.h | 8 ++
2 files changed, 224 insertions(+)
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e549fa1d30..e81e899594 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -45,7 +45,9 @@
#include "postmaster/bgwriter.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
+#include "storage/checksum.h"
#include "storage/ipc.h"
+#include "storage/lmgr.h"
#include "storage/proc.h"
#include "storage/smgr.h"
#include "storage/standby.h"
@@ -480,6 +482,42 @@ static int buffertag_comparator(const void *p1, const void *p2);
static int ckpt_buforder_comparator(const void *pa, const void *pb);
static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
+/* ----------------
+ * The rest of this module provides a set of functions that can be used to
+ * safely check all checksums on a running cluster.
+ *
+ * Please note that these only perform standard buffered reads, and don't try
+ * to bypass or discard the operating system cache. If you want to check the
+ * actual storage, you have to discard the operating system cache before
+ * running these functions.
+ *
+ * To avoid torn pages and possible false positives when reading data, the
+ * following heuristics are used:
+ *
+ * - a shared LWLock is taken on the target buffer pool partition mapping, and
+ * we detect if a block is in shared_buffers or not. See check_get_buffer()
+ * comments for more details about the locking strategy.
+ *
+ * - if a block is dirty in shared_buffers, it's ignored as it'll be flushed to
+ * disk either before the end of the next checkpoint or during recovery in
+ * case of unsafe shutdown. Buffers without a valid tag are skipped too.
+ *
+ * - if a block is otherwise found in shared_buffers, a shared IO lock is taken
+ * on the buffer and the block is then read from storage, ignoring the block
+ * in shared_buffers
+ *
+ * - if a block is not found in shared_buffers, the block is read from disk
+ * while holding the buffer pool partition mapping LWLock.
+ *
+ * The check can be performed using an SQL function, returning the list of
+ * problematic blocks.
+ * ----------------
+ */
+static bool check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected,
+ uint16 *chk_found);
+static bool check_get_buffer(Relation relation, ForkNumber forknum,
+ BlockNumber blkno, char *buffer);
+
/*
* Implementation of PrefetchBuffer() for shared buffers.
@@ -4583,3 +4621,181 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
(errcode(ERRCODE_SNAPSHOT_TOO_OLD),
errmsg("snapshot too old")));
}
+
+/*
+ * Perform a checksum check on the passed page. Return true iff the page is
+ * valid, and assign the expected and found checksums to chk_expected and
+ * chk_found, respectively. Note that a page can look like new but could be
+ * the result of corruption. We still check for this case, but we can't
+ * compute its checksum as pg_checksum_page() is explicitly checking for
+ * non-new pages, so NoComputedChecksum will be set in chk_expected.
+ */
+static bool
+check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected,
+ uint16 *chk_found)
+{
+ Page page = (Page) buffer;
+ PageHeader hdr = (PageHeader) page;
+
+ Assert(chk_expected && chk_found);
+
+ if (PageIsNew(page))
+ {
+ /*
+ * Check if the page is really new or if there's corruption that
+ * affected PageIsNew detection. Note that PageIsVerified won't try to
+ * detect checksum corruption in this case, so there's no risk of
+ * duplicated corruption report.
+ */
+ if (PageIsVerified(page, blkno))
+ {
+ /* No corruption; chk_expected and chk_found are left untouched. */
+ return true;
+ }
+
+ /*
+ * There's corruption, but since this affects PageIsNew, we
+ * can't compute a checksum, so set NoComputedChecksum for the
+ * expected checksum.
+ */
+ *chk_expected = NoComputedChecksum;
+ *chk_found = hdr->pd_checksum;
+ return false;
+ }
+
+ *chk_expected = pg_checksum_page(buffer, blkno);
+ *chk_found = hdr->pd_checksum;
+
+ return (*chk_expected == *chk_found);
+}
+
+/*
+ *-------------------------
+ * Safely read the wanted buffer from disk, dealing with possible concurrency
+ * issues. Note that if a buffer is found dirty in shared_buffers, no read will
+ * be performed and the caller will be informed that no check should be done.
+ * We can safely ignore such buffers as they'll be written before next
+ * checkpoint's completion.
+ *
+ * Returns true iff the block was read into the caller's buffer for checking.
+ *
+ * The following locks can be used in this function:
+ *
+ * - shared LWLock on the target buffer pool partition mapping.
+ * - IOLock on the buffer
+ *
+ * The IOLock is taken when reading the buffer from disk if it exists in
+ * shared_buffers, to avoid torn pages.
+ *
+ * If the buffer isn't in shared_buffers, it'll be read while the buffer
+ * mapping partition LWLock is still being held. Reading with this lock is to
+ * avoid the unlikely but possible case that a buffer wasn't present in shared
+ * buffers when we checked but it was then alloc'ed in shared_buffers, modified
+ * and flushed concurrently when we later try to read it, leading to false
+ * positives due to a torn page.
+ *
+ * Caller should hold an AccessShareLock on the Relation.
+ *-------------------------
+ */
+static bool
+check_get_buffer(Relation relation, ForkNumber forknum,
+ BlockNumber blkno, char *buffer)
+{
+ bool checkit = true;
+ BufferTag buf_tag; /* identity of requested block */
+ uint32 buf_hash; /* hash value for buf_tag */
+ LWLock *partLock; /* buffer partition lock for the buffer */
+ BufferDesc *bufdesc;
+ int buf_id;
+
+ /* create a tag so we can lookup the buffer */
+ INIT_BUFFERTAG(buf_tag, relation->rd_smgr->smgr_rnode.node, forknum, blkno);
+
+ /* determine its hash code and partition lock ID */
+ buf_hash = BufTableHashCode(&buf_tag);
+ partLock = BufMappingPartitionLock(buf_hash);
+
+ /* see if the block is in the buffer pool already */
+ LWLockAcquire(partLock, LW_SHARED);
+ buf_id = BufTableLookup(&buf_tag, buf_hash);
+ if (buf_id >= 0)
+ {
+ uint32 buf_state;
+
+ /*
+ * Found it. Now, retrieve its state to know what to do with it, and
+ * release the buffer header lock immediately. We do so to limit
+ * overhead as much as possible. We'll keep the shared lightweight lock
+ * on the target buffer mapping partition, so this buffer can't be
+ * evicted, and we'll acquire an IOLock on the buffer if we need to read
+ * the content on disk.
+ */
+ bufdesc = GetBufferDescriptor(buf_id);
+
+ buf_state = LockBufHdr(bufdesc);
+ UnlockBufHdr(bufdesc, buf_state);
+
+ /*
+ * Dirty pages are ignored as they'll be flushed soon. Invalid buffers
+ * are also skipped.
+ */
+ if ((buf_state & BM_DIRTY) || !(buf_state & BM_TAG_VALID))
+ checkit = false;
+
+ /*
+ * Read the buffer from disk, taking an IO lock to prevent torn-page
+ * reads, in the unlikely event that it was concurrently dirtied and
+ * flushed.
+ */
+ if (checkit)
+ {
+ LWLockAcquire(BufferDescriptorGetIOLock(bufdesc), LW_SHARED);
+ smgrread(relation->rd_smgr, forknum, blkno, buffer);
+ LWLockRelease(BufferDescriptorGetIOLock(bufdesc));
+ }
+ }
+ else
+ {
+ /*
+ * Simply read the buffer. There's no risk of modification on it as we
+ * kept the buffer pool partition mapping lock.
+ */
+ smgrread(relation->rd_smgr, forknum, blkno, buffer);
+
+ /* The buffer will have to be checked. */
+ Assert(checkit);
+ }
+
+ LWLockRelease(partLock);
+
+ return checkit;
+}
+
+/*
+ * Check data sanity for a specific block in the given fork of the given
+ * relation, always retrieved locally with smgrread even if a version exists in
+ * shared_buffers. Returns false if the block appears to be corrupted, true
+ * otherwise. Note that dirty and invalid blocks won't be checked. Caller
+ * must hold at least an AccessShareLock on the relation. chk_expected and
+ * chk_found are reset to NoComputedChecksum before the block is examined.
+ */
+bool
+CheckBuffer(Relation relation, ForkNumber forknum, BlockNumber blkno,
+			uint16 *chk_expected, uint16 *chk_found)
+{
+	/* Aligned storage: the block is accessed as a page, not as raw bytes. */
+	PGAlignedBlock buffer;
+
+	Assert(CheckRelationLockedByMe(relation, AccessShareLock, true));
+	Assert(blkno < RelationGetNumberOfBlocksInFork(relation, forknum));
+	Assert(smgrexists(relation->rd_smgr, forknum));
+
+	*chk_expected = *chk_found = NoComputedChecksum;
+
+	/* Blocks that were not read (dirty or invalid) are reported as valid. */
+	if (!check_get_buffer(relation, forknum, blkno, buffer.data))
+		return true;
+
+	/* check_buffer() returns false when a corruption is detected. */
+	return check_buffer(buffer.data, blkno, chk_expected, chk_found);
+}
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8fa26..24aa102175 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -244,6 +244,14 @@ extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
extern void FreeAccessStrategy(BufferAccessStrategy strategy);
+/* Zero marks "no checksum computed": pg_checksum_page() never yields it. */
+#define NoComputedChecksum 0
+
+/* Returns false if the block looks corrupted; see bufmgr.c for details. */
+extern bool CheckBuffer(Relation relation, ForkNumber forknum,
+ BlockNumber blkno, uint16 *chk_expected,
+ uint16 *chk_found);
+
/* inline functions */
--
2.20.1