From 2f2b6b3006be047f730530b0fdf4d219bcc77222 Mon Sep 17 00:00:00 2001 From: Julien Rouhaud Date: Mon, 4 Nov 2019 08:40:23 +0100 Subject: [PATCH v17 1/2] Add backend infrastructure to check the validity of an on-disk block. A new CheckBuffer function is introduced. It takes care of the various locking aspects to make sure that no false positive can be returned. Author: Julien Rouhaud Reviewed-by: Michael Paquier, Masahiko Sawada, Justin Pryzby Discussion: https://postgr.es/m/CAOBaU_aVvMjQn%3Dge5qPiJOPMmOj5%3Dii3st5Q0Y%2BWuLML5sR17w%40mail.gmail.com --- src/backend/storage/buffer/bufmgr.c | 216 ++++++++++++++++++++++++++++ src/include/storage/bufmgr.h | 8 ++ 2 files changed, 224 insertions(+) diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index e549fa1d30..e81e899594 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -45,7 +45,9 @@ #include "postmaster/bgwriter.h" #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "storage/checksum.h" #include "storage/ipc.h" +#include "storage/lmgr.h" #include "storage/proc.h" #include "storage/smgr.h" #include "storage/standby.h" @@ -480,6 +482,42 @@ static int buffertag_comparator(const void *p1, const void *p2); static int ckpt_buforder_comparator(const void *pa, const void *pb); static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg); +/* ---------------- + * The rest of this module provides a set of functions that can be used to + * safely check all checksums on a running cluster. + * + * Please note that those only perform standard buffered reads, and don't try + * to bypass or discard the operating system cache. If you want to check the + * actual storage, you have to discard the operating system cache before + * running those functions. 
+ *
+ * To avoid torn pages and possible false positives when reading data, the
+ * following heuristics are used:
+ *
+ * - a shared LWLock is taken on the target buffer pool partition mapping, and
+ *   we detect if a block is in shared_buffers or not.  See check_get_buffer()
+ *   comments for more details about the locking strategy.
+ *
+ * - if a block is dirty in shared_buffers, it's ignored as it'll be flushed to
+ *   disk either before the end of the next checkpoint or during recovery in
+ *   case of unsafe shutdown
+ *
+ * - if a block is otherwise found in shared_buffers, an IO lock is taken on
+ *   the block and the block is then read from storage, ignoring the block in
+ *   shared_buffers
+ *
+ * - if a block is not found in shared_buffers, the block is read from disk
+ *   while holding the buffer pool partition mapping LWLock.
+ *
+ * The check can be performed using an SQL function, returning the list of
+ * problematic blocks.
+ * ----------------
+ */
+static bool check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected,
+						 uint16 *chk_found);
+static bool check_get_buffer(Relation relation, ForkNumber forknum,
+							 BlockNumber blkno, char *buffer);
+

 /*
  * Implementation of PrefetchBuffer() for shared buffers.
@@ -4583,3 +4621,181 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
 				(errcode(ERRCODE_SNAPSHOT_TOO_OLD),
 				 errmsg("snapshot too old")));
 }
+
+/*
+ * Perform a checksum check on the passed page.  Returns true iff the page is
+ * valid.  For a non-new page, the expected and found checksums are assigned to
+ * chk_expected and chk_found, respectively.  Note that a page can look like
+ * new but could be the result of corruption.  We still check for this case,
+ * but we can't compute its checksum as pg_checksum_page() is explicitly
+ * checking for non-new pages, so NoComputedChecksum is set in chk_expected.
+ */
+static bool
+check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected,
+			 uint16 *chk_found)
+{
+	Page		page = (Page) buffer;
+	PageHeader	hdr = (PageHeader) page;
+
+	Assert(chk_expected && chk_found);	/* both output pointers are mandatory */
+
+	if (PageIsNew(page))
+	{
+		/*
+		 * Check if the page is really new or if there's corruption that
+		 * affected PageIsNew detection.  Note that PageIsVerified won't try to
+		 * detect checksum corruption in this case, so there's no risk of
+		 * duplicated corruption report.
+		 */
+		if (PageIsVerified(page, blkno))
+		{
+			/* No corruption; chk_expected and chk_found are left untouched. */
+			return true;
+		}
+
+		/*
+		 * There's corruption, but since this affects PageIsNew, we
+		 * can't compute a checksum, so set NoComputedChecksum for the
+		 * expected checksum.
+		 */
+		*chk_expected = NoComputedChecksum;
+		*chk_found = hdr->pd_checksum;
+		return false;
+	}
+
+	*chk_expected = pg_checksum_page(buffer, blkno);
+	*chk_found = hdr->pd_checksum;
+
+	return (*chk_expected == *chk_found);
+}
+
+/*
+ *-------------------------
+ * Safely read the wanted buffer from disk, dealing with possible concurrency
+ * issue.  Note that if a buffer is found dirty in shared_buffers, no read will
+ * be performed and the caller will be informed that no check should be done.
+ * We can safely ignore such buffers as they'll be written before next
+ * checkpoint's completion.
+ *
+ * Note that the given buffer will be retrieved in a private memory.
+ *
+ * The following locks can be used in this function:
+ *
+ *   - shared LWLock on the target buffer pool partition mapping.
+ *   - IOLock on the buffer
+ *
+ * The IOLock is taken when reading the buffer from disk if it exists in
+ * shared_buffers, to avoid torn pages.
+ *
+ * If the buffer isn't in shared_buffers, it'll be read while the buffer
+ * mapping partition LWLock is still being held.
Reading with this lock is to
+ * avoid the unlikely but possible case that a buffer wasn't present in shared
+ * buffers when we checked but it was then alloc'ed in shared_buffers, modified
+ * and flushed concurrently when we later try to read it, leading to false
+ * positives due to a torn page.
+ *
+ * Caller should hold an AccessShareLock on the Relation
+ *-------------------------
+ */
+static bool
+check_get_buffer(Relation relation, ForkNumber forknum,
+				 BlockNumber blkno, char *buffer)
+{
+	bool		checkit = true;
+	BufferTag	buf_tag;		/* identity of requested block */
+	uint32		buf_hash;		/* hash value for buf_tag */
+	LWLock	   *partLock;		/* buffer partition lock for the buffer */
+	BufferDesc *bufdesc;
+	int			buf_id;
+
+	/* create a lookup tag; NOTE(review): assumes rd_smgr is already open */
+	INIT_BUFFERTAG(buf_tag, relation->rd_smgr->smgr_rnode.node, forknum, blkno);
+
+	/* determine its hash code and partition lock ID */
+	buf_hash = BufTableHashCode(&buf_tag);
+	partLock = BufMappingPartitionLock(buf_hash);
+
+	/* see if the block is in the buffer pool already */
+	LWLockAcquire(partLock, LW_SHARED);
+	buf_id = BufTableLookup(&buf_tag, buf_hash);
+	if (buf_id >= 0)
+	{
+		uint32		buf_state;
+
+		/*
+		 * Found it.  Now, retrieve its state to know what to do with it, and
+		 * release the buffer header lock immediately.  We do so to limit
+		 * overhead as much as possible.  We'll keep the shared lightweight lock
+		 * on the target buffer mapping partition, so this buffer can't be
+		 * evicted, and we'll acquire an IOLock on the buffer if we need to read
+		 * the content on disk.
+		 */
+		bufdesc = GetBufferDescriptor(buf_id);
+
+		buf_state = LockBufHdr(bufdesc);
+		UnlockBufHdr(bufdesc, buf_state);
+
+		/*
+		 * Dirty pages are ignored as they'll be flushed soon. Invalid buffers
+		 * are also skipped.
+		 */
+		if ((buf_state & BM_DIRTY) || !(buf_state & BM_TAG_VALID))
+			checkit = false;
+
+		/*
+		 * Read the buffer from disk, taking an IO lock to prevent torn-page
+		 * reads, in the unlikely event that it was concurrently dirtied and
+		 * flushed.
+		 */
+		if (checkit)
+		{
+			LWLockAcquire(BufferDescriptorGetIOLock(bufdesc), LW_SHARED);
+			smgrread(relation->rd_smgr, forknum, blkno, buffer);
+			LWLockRelease(BufferDescriptorGetIOLock(bufdesc));
+		}
+	}
+	else
+	{
+		/*
+		 * Simply read the buffer.  There's no risk of modification on it as we
+		 * kept the buffer pool partition mapping lock.
+		 */
+		smgrread(relation->rd_smgr, forknum, blkno, buffer);
+
+		/* The buffer will have to be checked. */
+		Assert(checkit);
+	}
+
+	LWLockRelease(partLock);
+
+	return checkit;
+}
+
+/*
+ * Check data sanity for a specific block in the given fork of the given
+ * relation, always read with smgrread() even if a version exists in
+ * shared_buffers.  Returns false if the block appears to be corrupted, with
+ * *chk_expected and *chk_found filled in by check_buffer(); true otherwise.
+ * Dirty and invalid blocks aren't checked.  Caller must hold AccessShareLock.
+ */
+bool
+CheckBuffer(Relation relation, ForkNumber forknum, BlockNumber blkno,
+			uint16 *chk_expected, uint16 *chk_found)
+{
+	PGAlignedBlock buffer;		/* aligned: the data is accessed as a Page */
+
+	Assert(CheckRelationLockedByMe(relation, AccessShareLock, true));
+	Assert(blkno < RelationGetNumberOfBlocksInFork(relation, forknum));
+	Assert(smgrexists(relation->rd_smgr, forknum));
+
+	*chk_expected = *chk_found = NoComputedChecksum;
+
+	if (!check_get_buffer(relation, forknum, blkno, buffer.data))
+		return true;
+
+	if (check_buffer(buffer.data, blkno, chk_expected, chk_found))
+		return true;
+
+	/* A corruption is detected. */
+	return false;
+}
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index ee91b8fa26..24aa102175 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -244,6 +244,14 @@ extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
 extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);

+/*
+ * A zero checksum can never be computed, see pg_checksum_page() */
+#define NoComputedChecksum	0
+
+extern bool CheckBuffer(Relation relation, ForkNumber forknum,
+						BlockNumber blkno, uint16 *chk_expected,
+						uint16 *chk_found);
+

 /* inline functions */

-- 
2.20.1