From 84d0a4c5ec5d6b6f832fad02d421c204f1bee98b Mon Sep 17 00:00:00 2001
From: Julien Rouhaud
Date: Mon, 4 Nov 2019 08:40:23 +0100
Subject: [PATCH v12] Add a pg_check_relation() function.
This function checks the validity of the checksums for all non-dirty blocks of
a given relation, and optionally a given fork, and returns the list of all
blocks that don't match, along with the expected and found checksums.
Author: Julien Rouhaud
Reviewed-by: Michael Paquier, Masahiko Sawada, Justin Pryzby
Discussion: https://postgr.es/m/CAOBaU_aVvMjQn%3Dge5qPiJOPMmOj5%3Dii3st5Q0Y%2BWuLML5sR17w%40mail.gmail.com
---
doc/src/sgml/config.sgml | 85 +++++
doc/src/sgml/func.sgml | 51 +++
src/backend/postmaster/pgstat.c | 3 +
src/backend/storage/page/checksum.c | 322 +++++++++++++++++-
src/backend/utils/adt/Makefile | 1 +
src/backend/utils/adt/checksumfuncs.c | 218 ++++++++++++
src/backend/utils/init/globals.c | 7 +
src/backend/utils/misc/guc.c | 33 ++
src/backend/utils/misc/postgresql.conf.sample | 6 +
src/include/catalog/pg_proc.dat | 16 +
src/include/miscadmin.h | 7 +
src/include/pgstat.h | 3 +-
src/include/utils/checksumfuncs.h | 31 ++
src/include/utils/guc_tables.h | 1 +
src/test/modules/Makefile | 1 +
src/test/modules/check_relation/.gitignore | 2 +
src/test/modules/check_relation/Makefile | 14 +
src/test/modules/check_relation/README | 23 ++
.../check_relation/t/001_checksums_check.pl | 276 +++++++++++++++
19 files changed, 1096 insertions(+), 4 deletions(-)
create mode 100644 src/backend/utils/adt/checksumfuncs.c
create mode 100644 src/include/utils/checksumfuncs.h
create mode 100644 src/test/modules/check_relation/.gitignore
create mode 100644 src/test/modules/check_relation/Makefile
create mode 100644 src/test/modules/check_relation/README
create mode 100644 src/test/modules/check_relation/t/001_checksums_check.pl
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index c4ba49ffaf..b7629fde60 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2137,6 +2137,91 @@ include_dir 'conf.d'
+
+ Cost-based Checksum Verification Delay
+
+
+ During the execution of the pg_check_relation()
+ function, the system maintains an internal counter that keeps track of
+ the estimated cost of the various I/O operations that are performed.
+ When the accumulated cost reaches a limit (specified by
+ checksum_cost_limit), the process performing the
+ operation will sleep for a short period of time, as specified by
+ checksum_cost_delay. Then it will reset the counter
+ and continue execution.
+
+
+
+ This feature is disabled by default. To enable it, set the
+ checksum_cost_delay variable to a nonzero
+ value.
+
+
+
+
+ checksum_cost_delay (floating point)
+
+ checksum_cost_delay configuration parameter
+
+
+
+
+ The amount of time that the process will sleep
+ when the cost limit has been exceeded.
+ If this value is specified without units, it is taken as milliseconds.
+ The default value is zero, which disables the cost-based checksum
+ verification delay feature. Positive values enable cost-based
+ checksum verification.
+
+
+
+
+
+ checksum_cost_page (integer)
+
+ checksum_cost_page configuration parameter
+
+
+
+
+ The estimated cost for verifying a buffer, whether it's found in the
+ shared buffer cache or not. It represents the cost to lock the buffer
+ pool, lookup the shared hash table, read the content of the page from
+ disk and compute its checksum. The default value is 10.
+
+
+
+
+
+ checksum_cost_limit (integer)
+
+ checksum_cost_limit configuration parameter
+
+
+
+
+ The accumulated cost that will cause the verification process to sleep.
+ The default value is 200.
+
+
+
+
+
+
+
+ There are certain operations that hold critical locks and should
+ therefore complete as quickly as possible. Cost-based checksum
+ verification delays do not occur during such operations. Therefore it
+ is possible that the cost accumulates far higher than the specified
+ limit. To avoid uselessly long delays in such cases, the actual delay
+ is calculated as checksum_cost_delay *
+ accumulated_balance /
+ checksum_cost_limit with a maximum of
+ checksum_cost_delay * 4.
+
+
+
+
Background Writer
diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index e2e618791e..bcc97d1306 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -26221,6 +26221,57 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8');
+
+ Data Sanity Functions
+
+
+ The functions shown in the table below
+ provide a means to check the health of a data file in a cluster.
+
+
+
+ Data Sanity Functions
+
+
+ Name Return Type Description
+
+
+
+
+
+
+ pg_check_relation(relation regclass [, fork text])
+
+ setof record
+ Validate the checksum for all blocks of a relation.
+
+
+
+
+
+
+
+ pg_check_relation
+
+
+ pg_check_relation iterates over all blocks of a
+ given relation and verifies their checksums. If passed,
+ fork specifies that only checksums of the given
+ fork are to be verified. Fork should be 'main' for the
+ main data fork, 'fsm' for the free space map,
+ 'vm' for the visibility map, or
+ 'init' for the initialization fork.
+ The function returns a list of blocks for which the computed and stored
+ checksums don't match. See the Cost-based Checksum Verification Delay
+ section for information on how to configure cost-based verification delay.
+ You must be a member of the pg_stat_scan_tables role to use this
+ function. It can only be used if data checksums are enabled; see the
+ documentation on data checksums for more information.
+
+
+
+
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 5f4b168fd1..64664c0bc6 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3912,6 +3912,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w)
case WAIT_EVENT_VACUUM_DELAY:
event_name = "VacuumDelay";
break;
+ case WAIT_EVENT_CHECK_DELAY:
+ event_name = "CheckDelay";
+ break;
/* no default case, so that compiler will warn */
}
diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c
index e010691c9f..dbdc105060 100644
--- a/src/backend/storage/page/checksum.c
+++ b/src/backend/storage/page/checksum.c
@@ -15,8 +15,324 @@
#include "storage/checksum.h"
/*
- * The actual code is in storage/checksum_impl.h. This is done so that
- * external programs can incorporate the checksum code by #include'ing
- * that file from the exported Postgres headers. (Compare our CRC code.)
+ * The actual checksum computation code is in storage/checksum_impl.h. This
+ * is done so that external programs can incorporate the checksum code by
+ * #include'ing that file from the exported Postgres headers. (Compare our
+ * CRC code.)
*/
#include "storage/checksum_impl.h"
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/buf_internals.h"
+#include "storage/lmgr.h"
+#include "utils/checksumfuncs.h"
+
+/* ----------------
+ * The rest of this module provides a set of functions that can be used to
+ * safely check all checksums on a running cluster.
+ *
+ * Please note that those only perform standard buffered reads, and don't try
+ * to bypass or discard the operating system cache. If you want to check the
+ * actual storage, you have to discard the operating system cache before
+ * running those functions.
+ *
+ * To avoid torn pages and possible false positives when reading data, and to
+ * keep overhead as low as possible, the following heuristics are used:
+ *
+ * - a shared LWLock is taken on the target buffer pool partition mapping, and
+ * we detect if a block is in shared_buffers or not. See check_get_buffer()
+ * comments for more details about the locking strategy.
+ *
+ * - if a block is dirty in shared_buffers, it's ignored as it'll be flushed to
+ * disk either before the end of the next checkpoint or during recovery in
+ * case of unsafe shutdown
+ *
+ * - if a block is otherwise found in shared_buffers, an IO lock is taken on
+ * the block and the block is then read from storage, ignoring the block in
+ * shared_buffers
+ *
+ * - if a block is not found in shared_buffers, the LWLock is released and the
+ * block is read from disk without taking any lock. If an error is detected,
+ * the read block will be discarded and retrieved again while holding the
+ * LWLock. This is because an error due to concurrent write is possible but
+ * very unlikely, so it's better to have an optimistic approach to limit
+ * locking overhead
+ *
+ * The check can be performed using an SQL function, returning the list of
+ * problematic blocks.
+ * ----------------
+ */
+
+static bool check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected,
+ uint16 *chk_found);
+static void check_delay_point(void);
+static bool check_get_buffer(Relation relation, ForkNumber forknum,
+ BlockNumber blkno, char *buffer, bool needlock,
+ bool *found_in_sb);
+
+/*
+ * Check data sanity for a specific block in the given fork of the given
+ * relation.  The block is always read locally with smgrread(), even if a
+ * version of it exists in shared_buffers.
+ *
+ * Returns false if the block appears to be corrupted, true otherwise.  Dirty
+ * and invalid blocks are not checked and are reported as valid.
+ *
+ * *chk_expected and *chk_found are first reset to NoComputedChecksum and are
+ * then filled by check_buffer() whenever the block is actually verified.
+ *
+ * Caller must hold at least an AccessShareLock on the relation.
+ */
+bool
+check_one_block(Relation relation, ForkNumber forknum, BlockNumber blkno,
+				uint16 *chk_expected, uint16 *chk_found)
+{
+	/*
+	 * Use an aligned buffer: the page is accessed through PageHeader and is
+	 * checksummed by pg_checksum_page(), so a plain char array does not
+	 * guarantee suitable alignment.
+	 */
+	PGAlignedBlock buffer;
+	bool		force_lock = false;
+	bool		found_in_sb;
+
+	Assert(CheckRelationLockedByMe(relation, AccessShareLock, true));
+	Assert(blkno < RelationGetNumberOfBlocksInFork(relation, forknum));
+	Assert(smgrexists(relation->rd_smgr, forknum));
+
+	*chk_expected = *chk_found = NoComputedChecksum;
+
+	/*
+	 * To avoid excessive overhead, the buffer is first read without the
+	 * locks that would prevent false positives, as such events should be
+	 * quite rare.  At most two iterations are performed: an optimistic one,
+	 * and a second one with force_lock set if the first reported a failure.
+	 */
+	for (;;)
+	{
+		/* Nothing to verify (dirty or invalid buffer): report as valid. */
+		if (!check_get_buffer(relation, forknum, blkno, buffer.data,
+							  force_lock, &found_in_sb))
+			return true;
+
+		if (check_buffer(buffer.data, blkno, chk_expected, chk_found))
+			return true;
+
+		/*
+		 * The failure is trustworthy if the block was read under the
+		 * protection of shared_buffers locks, or if we already retried with
+		 * the partition lock held.
+		 */
+		if (found_in_sb || force_lock)
+			break;
+
+		/*
+		 * Otherwise reread the buffer with a suitable lock to avoid a false
+		 * positive.  See check_get_buffer for more details.
+		 */
+		force_lock = true;
+	}
+
+	/* A corruption is detected. */
+	return false;
+}
+
+/*
+ * Perform a checksum check on the passed page.
+ *
+ * Returns true iff the page is valid.  For a regular page, the computed
+ * checksum is stored in *chk_expected and the checksum found in the page
+ * header in *chk_found.
+ *
+ * A page can look new while actually being the result of corruption.  This
+ * case is still detected, but no checksum can be computed for it since
+ * pg_checksum_page() explicitly expects non-new pages, so *chk_expected is
+ * set to NoComputedChecksum.
+ */
+static bool
+check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected,
+			 uint16 *chk_found)
+{
+	Page		page = (Page) buffer;
+	PageHeader	phdr = (PageHeader) page;
+
+	Assert(chk_expected && chk_found);
+
+	/* Regular page: compare the stored checksum with the computed one. */
+	if (!PageIsNew(page))
+	{
+		*chk_expected = pg_checksum_page(buffer, blkno);
+		*chk_found = phdr->pd_checksum;
+
+		return (*chk_expected == *chk_found);
+	}
+
+	/*
+	 * The page looks new.  Check whether it is really new or whether some
+	 * corruption affected the PageIsNew detection.  Note that PageIsVerified
+	 * won't try to detect checksum corruption in this case, so there's no
+	 * risk of a duplicated corruption report.
+	 */
+	if (PageIsVerified(page, blkno))
+		return true;			/* No corruption. */
+
+	/*
+	 * There's corruption, but since it affects PageIsNew we can't compute a
+	 * checksum: report NoComputedChecksum as the expected checksum.
+	 */
+	*chk_expected = NoComputedChecksum;
+	*chk_found = phdr->pd_checksum;
+
+	return false;
+}
+
+/*
+ * Check for interrupts and apply the cost-based delay.
+ *
+ * When the cost-based delay is active and the accumulated cost
+ * (ChecksumCostBalance) has reached ChecksumCostLimit, sleep for
+ * ChecksumCostDelay * ChecksumCostBalance / ChecksumCostLimit milliseconds,
+ * capped at ChecksumCostDelay * 4, then reset the balance.
+ */
+static void
+check_delay_point(void)
+{
+	double		msec;
+
+	/* Always check for interrupts */
+	CHECK_FOR_INTERRUPTS();
+
+	if (!ChecksumCostActive || InterruptPending)
+		return;
+
+	/* Nothing to do until the cost limit is reached */
+	if (ChecksumCostBalance < ChecksumCostLimit)
+		return;
+
+	/*
+	 * ChecksumCostDelay is a floating-point number of milliseconds, so keep
+	 * the computation in floating point: an integer would truncate
+	 * sub-millisecond delays down to no sleep at all.
+	 */
+	msec = ChecksumCostDelay * ChecksumCostBalance / ChecksumCostLimit;
+	if (msec > ChecksumCostDelay * 4)
+		msec = ChecksumCostDelay * 4;
+
+	/* Nap; pg_usleep() expects microseconds */
+	pgstat_report_wait_start(WAIT_EVENT_CHECK_DELAY);
+	pg_usleep((long) (msec * 1000));
+	pgstat_report_wait_end();
+
+	ChecksumCostBalance = 0;
+
+	/* Might have gotten an interrupt while sleeping */
+	CHECK_FOR_INTERRUPTS();
+}
+
+/*
+ *-------------------------
+ * Safely read the wanted buffer from disk, dealing with possible concurrency
+ * issues. Note that if a buffer is found dirty in shared_buffers, no read will
+ * be performed and the caller will be informed that no check should be done.
+ * We can safely ignore such buffers as they'll be written before next
+ * checkpoint's completion.
+ *
+ * Returns true if the block was read into "buffer" and has to be checked by
+ * the caller, false if the block must be skipped (it was found in
+ * shared_buffers but is dirty or doesn't have a valid tag). *found_in_sb is
+ * set to true iff the block was found in the shared buffer mapping table.
+ *
+ * The following locks can be used in this function:
+ *
+ * - shared LWLock on the target buffer pool partition mapping.
+ * - IOLock on the buffer
+ *
+ * The IOLock is taken when reading the buffer from disk if it exists in
+ * shared_buffers, to avoid torn pages.
+ *
+ * If the buffer isn't in shared_buffers, it'll be read from disk without any
+ * lock unless caller asked otherwise, setting needlock. In this case, the
+ * read will be done while the buffer mapping partition LWLock is still being
+ * held. Reading with this lock is to avoid the unlikely but possible case
+ * that a buffer wasn't present in shared buffers when we checked but was then
+ * allocated in shared_buffers, modified and flushed concurrently when we
+ * later try to read it, leading to false positives due to a torn page. Caller
+ * can first read the buffer without holding the target buffer mapping
+ * partition LWLock to have an optimistic approach, and reread the buffer
+ * from disk in case of error.
+ *
+ * Caller should hold an AccessShareLock on the Relation
+ *-------------------------
+ */
+static bool
+check_get_buffer(Relation relation, ForkNumber forknum,
+ BlockNumber blkno, char *buffer, bool needlock,
+ bool *found_in_sb)
+{
+ bool checkit = true;
+ BufferTag buf_tag; /* identity of requested block */
+ uint32 buf_hash; /* hash value for buf_tag */
+ LWLock *partLock; /* buffer partition lock for the buffer */
+ BufferDesc *bufdesc;
+ int buf_id;
+
+ *found_in_sb = false;
+
+ /* Check for interrupts and take throttling into account. */
+ check_delay_point();
+
+ /* create a tag so we can lookup the buffer */
+ INIT_BUFFERTAG(buf_tag, relation->rd_smgr->smgr_rnode.node, forknum, blkno);
+
+ /* determine its hash code and partition lock ID */
+ buf_hash = BufTableHashCode(&buf_tag);
+ partLock = BufMappingPartitionLock(buf_hash);
+
+ /* see if the block is in the buffer pool already */
+ LWLockAcquire(partLock, LW_SHARED);
+ buf_id = BufTableLookup(&buf_tag, buf_hash);
+ if (buf_id >= 0)
+ {
+ uint32 buf_state;
+
+ *found_in_sb = true;
+
+ /*
+ * Found it. Now, retrieve its state to know what to do with it, and
+ * release the buffer header lock immediately. We do so to limit
+ * overhead as much as possible; note that no pin is taken on the
+ * buffer. We'll keep the shared lightweight lock on the target
+ * buffer mapping partition, so this buffer can't be evicted, and
+ * we'll acquire an IOLock on the buffer if we need to read the
+ * content on disk.
+ */
+ bufdesc = GetBufferDescriptor(buf_id);
+
+ buf_state = LockBufHdr(bufdesc);
+ UnlockBufHdr(bufdesc, buf_state);
+
+ /*
+ * Dirty pages are ignored as they'll be flushed soon. Invalid buffers
+ * are also skipped.
+ */
+ if ((buf_state & BM_DIRTY) || !(buf_state & BM_TAG_VALID))
+ checkit = false;
+
+ /*
+ * Read the buffer from disk, taking an IO lock to prevent torn-page
+ * reads, in the unlikely event that it was concurrently dirtied and
+ * flushed.
+ */
+ if (checkit)
+ {
+ LWLockAcquire(BufferDescriptorGetIOLock(bufdesc), LW_SHARED);
+ smgrread(relation->rd_smgr, forknum, blkno, buffer);
+ LWLockRelease(BufferDescriptorGetIOLock(bufdesc));
+
+ /* Add a page cost. */
+ ChecksumCostBalance += ChecksumCostPage;
+ }
+ }
+ else if (needlock)
+ {
+ /*
+ * Caller asked to read the buffer while we have a lock on the target
+ * partition.
+ */
+ smgrread(relation->rd_smgr, forknum, blkno, buffer);
+
+ /* The buffer will have to be checked. */
+ Assert(checkit);
+
+ /* Add a page cost. */
+ ChecksumCostBalance += ChecksumCostPage;
+ }
+
+ LWLockRelease(partLock);
+
+ /* In both of those cases the read, if any, has already been done. */
+ if (*found_in_sb || needlock)
+ return checkit;
+
+ /* After this point the buffer will always be checked. */
+ Assert(checkit);
+
+ /*
+ * Didn't find it in the buffer pool and didn't read it while holding the
+ * buffer mapping partition lock. We'll have to try to read it from
+ * disk, after releasing the target partition lock to avoid excessive
+ * overhead. It means that it's possible to get a torn page later, so
+ * we'll have to retry with a suitable lock in case of error to avoid
+ * false positive.
+ */
+ smgrread(relation->rd_smgr, forknum, blkno, buffer);
+
+ /* Add a page cost. */
+ ChecksumCostBalance += ChecksumCostPage;
+
+ return checkit;
+}
diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile
index b4d55e849b..603f63afb6 100644
--- a/src/backend/utils/adt/Makefile
+++ b/src/backend/utils/adt/Makefile
@@ -22,6 +22,7 @@ OBJS = \
bool.o \
cash.o \
char.o \
+ checksumfuncs.o \
cryptohashes.o \
date.o \
datetime.o \
diff --git a/src/backend/utils/adt/checksumfuncs.c b/src/backend/utils/adt/checksumfuncs.c
new file mode 100644
index 0000000000..700831cbc7
--- /dev/null
+++ b/src/backend/utils/adt/checksumfuncs.c
@@ -0,0 +1,218 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksumfuncs.c
+ * Functions for checksum related feature such as online verification
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/utils/adt/checksumfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/relation.h"
+#include "catalog/pg_authid_d.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/checksum.h"
+#include "storage/smgr.h"
+#include "utils/acl.h"
+#include "utils/checksumfuncs.h"
+#include "utils/rel.h"
+#include "utils/builtins.h"
+#include "utils/lsyscache.h"
+
+static void check_one_relation(TupleDesc tupdesc, Tuplestorestate *tupstore,
+							   Oid relid, ForkNumber single_forknum);
+static void check_relation_fork(TupleDesc tupdesc, Tuplestorestate *tupstore,
+								Relation relation, ForkNumber forknum);
+static void pg_check_relation_internal(FunctionCallInfo fcinfo, Oid relid,
+									   ForkNumber forknum);
+
+
+/*
+ * pg_check_relation(relation regclass)
+ *
+ * SQL-callable function checking all the forks of the given relation.
+ */
+Datum
+pg_check_relation(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+
+	pg_check_relation_internal(fcinfo, relid, InvalidForkNumber);
+
+	return (Datum) 0;
+}
+
+/*
+ * pg_check_relation(relation regclass, fork text)
+ *
+ * SQL-callable function checking only the given fork of the given relation.
+ */
+Datum
+pg_check_relation_fork(PG_FUNCTION_ARGS)
+{
+	Oid			relid = PG_GETARG_OID(0);
+	const char *forkname;
+	ForkNumber	forknum;
+
+	forkname = text_to_cstring(PG_GETARG_TEXT_PP(1));
+	forknum = forkname_to_number(forkname);
+
+	pg_check_relation_internal(fcinfo, relid, forknum);
+
+	return (Datum) 0;
+}
+
+/*
+ * Common code for all versions of pg_check_relation()
+ *
+ * Checks that data checksums are enabled and that the caller has sufficient
+ * privileges, sets up a materialized result (tuplestore) in the per-query
+ * memory context, resets the cost-based delay state and runs the check on
+ * relation "relid".  "forknum" is the single fork to check, or
+ * InvalidForkNumber to check all the forks.
+ */
+static void
+pg_check_relation_internal(FunctionCallInfo fcinfo, Oid relid,
+						   ForkNumber forknum)
+{
+	ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+	TupleDesc	tupdesc;
+	Tuplestorestate *tupstore;
+	MemoryContext per_query_ctx;
+	MemoryContext oldcontext;
+
+	if (!DataChecksumsEnabled())
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("data checksums are not enabled in cluster")));
+
+	if (!is_member_of_role(GetUserId(), DEFAULT_ROLE_STAT_SCAN_TABLES))
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+				 errmsg("only superuser or a member of the pg_stat_scan_tables role may use this function")));
+
+	/* check to see if caller supports us returning a tuplestore */
+	if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("set-valued function called in context that cannot accept a set")));
+	if (!(rsinfo->allowedModes & SFRM_Materialize))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("materialize mode required, but it is not allowed in this context")));
+
+	/* Switch into long-lived context to construct returned data structures */
+	per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+	oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+	/* Build a tuple descriptor for our result type */
+	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+		elog(ERROR, "return type must be a row type");
+
+	tupstore = tuplestore_begin_heap(true, false, work_mem);
+	rsinfo->returnMode = SFRM_Materialize;
+	rsinfo->setResult = tupstore;
+	rsinfo->setDesc = tupdesc;
+
+	MemoryContextSwitchTo(oldcontext);
+
+	/* Set cost-based checksum verification delay */
+	ChecksumCostActive = (ChecksumCostDelay > 0);
+	ChecksumCostBalance = 0;
+
+	check_one_relation(tupdesc, tupstore, relid, forknum);
+
+	tuplestore_donestoring(tupstore);
+}
+
+/*
+ * Perform the check on a single relation, possibly restricted to a single
+ * fork (single_forknum), or on all its existing forks if single_forknum is
+ * InvalidForkNumber.
+ *
+ * The relation is opened with an AccessShareLock, released once the check is
+ * done.  An error is raised if the relation has no storage or if it is a
+ * temporary relation of another session.  Forks that do not exist on disk
+ * are silently skipped.
+ */
+static void
+check_one_relation(TupleDesc tupdesc, Tuplestorestate *tupstore,
+				   Oid relid, ForkNumber single_forknum)
+{
+	Relation	rel = relation_open(relid, AccessShareLock);
+	bool		all_forks = (single_forknum == InvalidForkNumber);
+	ForkNumber	fork;
+
+	/* sanity checks */
+	if (!RELKIND_HAS_STORAGE(rel->rd_rel->relkind))
+		ereport(ERROR,
+				(errcode(ERRCODE_WRONG_OBJECT_TYPE),
+				 errmsg("relation \"%s\" does not have storage to be checked",
+						RelationGetRelationName(rel))));
+
+	if (RELATION_IS_OTHER_TEMP(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("cannot verify temporary tables of other sessions")));
+
+	RelationOpenSmgr(rel);
+
+	for (fork = 0; fork <= MAX_FORKNUM; fork++)
+	{
+		/* Only look at the requested fork(s), and only if they exist. */
+		if ((all_forks || fork == single_forknum) &&
+			smgrexists(rel->rd_smgr, fork))
+			check_relation_fork(tupdesc, tupstore, rel, fork);
+	}
+
+	/* release the lock */
+	relation_close(rel, AccessShareLock);
+}
+
+/*
+ * For a given relation and fork, do the real work of iterating over all pages
+ * and doing the check.  One row is added to tupstore, and one WARNING is
+ * emitted, for each block reported as corrupted by check_one_block().
+ *
+ * Caller must hold an AccessShareLock lock on the given relation.
+ */
+static void
+check_relation_fork(TupleDesc tupdesc, Tuplestorestate *tupstore,
+					Relation relation, ForkNumber forknum)
+{
+	BlockNumber blkno,
+				nblocks;
+
+	/*
+	 * We remember the number of blocks here.  Since caller must hold a lock on
+	 * the relation, we know that it won't be truncated while we're iterating
+	 * over the blocks.  Any block added after this function started won't be
+	 * checked, but this is out of scope as such pages will be flushed before
+	 * the next checkpoint's completion.
+	 */
+	nblocks = RelationGetNumberOfBlocksInFork(relation, forknum);
+
+#define PG_CHECK_RELATION_COLS			5	/* Number of output arguments in the SRF */
+	for (blkno = 0; blkno < nblocks; blkno++)
+	{
+		uint16		chk_expected,
+					chk_found;
+		Datum		values[PG_CHECK_RELATION_COLS];
+		bool		nulls[PG_CHECK_RELATION_COLS];
+		char	   *path;
+		int			i = 0;
+
+		/* Check the given buffer */
+		if (check_one_block(relation, forknum, blkno, &chk_expected,
+							&chk_found))
+		{
+			/* Buffer not corrupted or not worth checking, continue */
+			continue;
+		}
+
+		memset(values, 0, sizeof(values));
+		memset(nulls, 0, sizeof(nulls));
+
+		/*
+		 * The Datum conversions must match the declared output column types
+		 * (oid, int4, int8, int4, int4).  In particular failed_blocknum is
+		 * an int8, which requires Int64GetDatum() to be correct on platforms
+		 * where int8 is passed by reference.
+		 */
+		values[i++] = ObjectIdGetDatum(RelationGetRelid(relation));
+		values[i++] = Int32GetDatum(forknum);
+		values[i++] = Int64GetDatum((int64) blkno);
+
+		/*
+		 * The expected checksum is unknown if corruption makes the block
+		 * appear as PageIsNew() while it isn't a new page.
+		 */
+		if (chk_expected == NoComputedChecksum)
+			nulls[i++] = true;
+		else
+			values[i++] = Int32GetDatum((int32) chk_expected);
+		values[i++] = Int32GetDatum((int32) chk_found);
+
+		Assert(i == PG_CHECK_RELATION_COLS);
+
+		/* Report the failure to the stat collector and the logs. */
+		pgstat_report_checksum_failure();
+		path = relpath(relation->rd_smgr->smgr_rnode, forknum);
+		ereport(WARNING,
+				(errcode(ERRCODE_DATA_CORRUPTED),
+				 errmsg("invalid page in block %u of relation %s",
+						blkno, path)));
+		/* relpath() result is palloc'd: don't leak it for each bad block */
+		pfree(path);
+
+		/* Save the corrupted blocks in the tuplestore. */
+		tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+	}
+}
diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c
index 6ab8216839..68e6e0cfcb 100644
--- a/src/backend/utils/init/globals.c
+++ b/src/backend/utils/init/globals.c
@@ -135,6 +135,13 @@ int max_worker_processes = 8;
int max_parallel_workers = 8;
int MaxBackends = 0;
+int ChecksumCostPage = 10; /* GUC parameters for checksum check */
+int ChecksumCostLimit = 200;
+double ChecksumCostDelay = 0;
+
+int ChecksumCostBalance = 0; /* working state for checksums check */
+bool ChecksumCostActive = false;
+
int VacuumCostPageHit = 1; /* GUC parameters for vacuum */
int VacuumCostPageMiss = 10;
int VacuumCostPageDirty = 20;
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index de87ad6ef7..a18f85181d 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -691,6 +691,8 @@ const char *const config_group_names[] =
gettext_noop("Resource Usage / Disk"),
/* RESOURCES_KERNEL */
gettext_noop("Resource Usage / Kernel Resources"),
+ /* RESOURCES_CHECKSUM_DELAY */
+ gettext_noop("Resource Usage / Cost-Based Checksum Verification Delay"),
/* RESOURCES_VACUUM_DELAY */
gettext_noop("Resource Usage / Cost-Based Vacuum Delay"),
/* RESOURCES_BGWRITER */
@@ -2385,6 +2387,26 @@ static struct config_int ConfigureNamesInt[] =
NULL, NULL, NULL
},
+ {
+ {"checksum_cost_page", PGC_USERSET, RESOURCES_CHECKSUM_DELAY,
+ gettext_noop("Checksum cost for verifying a page."),
+ NULL
+ },
+ &ChecksumCostPage,
+ 10, 0, 10000,
+ NULL, NULL, NULL
+ },
+
+ {
+ {"checksum_cost_limit", PGC_USERSET, RESOURCES_CHECKSUM_DELAY,
+ gettext_noop("Checksum cost amount available before napping."),
+ NULL
+ },
+ &ChecksumCostLimit,
+ 200, 1, 10000,
+ NULL, NULL, NULL
+ },
+
{
{"vacuum_cost_page_hit", PGC_USERSET, RESOURCES_VACUUM_DELAY,
gettext_noop("Vacuum cost for a page found in the buffer cache."),
@@ -3585,6 +3607,17 @@ static struct config_real ConfigureNamesReal[] =
check_random_seed, assign_random_seed, show_random_seed
},
+ {
+ {"checksum_cost_delay", PGC_USERSET, RESOURCES_CHECKSUM_DELAY,
+ gettext_noop("Checksum cost delay in milliseconds."),
+ NULL,
+ GUC_UNIT_MS
+ },
+ &ChecksumCostDelay,
+ 0, 0, 100,
+ NULL, NULL, NULL
+ },
+
{
{"vacuum_cost_delay", PGC_USERSET, RESOURCES_VACUUM_DELAY,
gettext_noop("Vacuum cost delay in milliseconds."),
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 9cb571f7cc..0b45a3a9eb 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -160,6 +160,12 @@
#max_files_per_process = 1000 # min 64
# (change requires restart)
+# - Cost-Based Checksum Verification Delay -
+
+#checksum_cost_delay = 0 # 0-100 milliseconds (0 disables)
+#checksum_cost_page = 10 # 0-10000 credits
+#checksum_cost_limit = 200 # 1-10000 credits
+
# - Cost-Based Vacuum Delay -
#vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables)
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 687509ba92..e0afc7a1f0 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -10938,6 +10938,22 @@
proallargtypes => '{oid,text,int8,timestamptz}', proargmodes => '{i,o,o,o}',
proargnames => '{tablespace,name,size,modification}',
prosrc => 'pg_ls_tmpdir_1arg' },
+{ oid => '9147', descr => 'check data integrity for all forks of a relation',
+ proname => 'pg_check_relation', procost => '10000',
+ prorows => '20', proretset => 't', proparallel => 'r',
+ provolatile => 'v', prorettype => 'record', proargtypes => 'regclass',
+ proallargtypes => '{regclass,oid,int4,int8,int4,int4}',
+ proargmodes => '{i,o,o,o,o,o}',
+ proargnames => '{relation,relid,forknum,failed_blocknum,expected_checksum,found_checksum}',
+ prosrc => 'pg_check_relation' },
+{ oid => '9148', descr => 'check data integrity for one fork of a relation',
+ proname => 'pg_check_relation', procost => '10000',
+ prorows => '20', proretset => 't', proparallel => 'r',
+ provolatile => 'v', prorettype => 'record', proargtypes => 'regclass text',
+ proallargtypes => '{regclass,text,oid,int4,int8,int4,int4}',
+ proargmodes => '{i,i,o,o,o,o,o}',
+ proargnames => '{relation,fork,relid,forknum,failed_blocknum,expected_checksum,found_checksum}',
+ prosrc => 'pg_check_relation_fork' },
# hash partitioning constraint function
{ oid => '5028', descr => 'hash partition CHECK constraint',
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index 72e3352398..44c473995f 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -247,6 +247,13 @@ extern PGDLLIMPORT double hash_mem_multiplier;
extern PGDLLIMPORT int maintenance_work_mem;
extern PGDLLIMPORT int max_parallel_maintenance_workers;
+extern int ChecksumCostPage;
+extern int ChecksumCostLimit;
+extern double ChecksumCostDelay;
+
+extern int ChecksumCostBalance;
+extern bool ChecksumCostActive;
+
extern int VacuumCostPageHit;
extern int VacuumCostPageMiss;
extern int VacuumCostPageDirty;
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 0dfbac46b4..b9ff8392b8 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -902,7 +902,8 @@ typedef enum
WAIT_EVENT_PG_SLEEP,
WAIT_EVENT_RECOVERY_APPLY_DELAY,
WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL,
- WAIT_EVENT_VACUUM_DELAY
+ WAIT_EVENT_VACUUM_DELAY,
+ WAIT_EVENT_CHECK_DELAY
} WaitEventTimeout;
/* ----------
diff --git a/src/include/utils/checksumfuncs.h b/src/include/utils/checksumfuncs.h
new file mode 100644
index 0000000000..f3e13b64b1
--- /dev/null
+++ b/src/include/utils/checksumfuncs.h
@@ -0,0 +1,31 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksumfuncs.h
+ *	  Checksum verification implementation for data pages.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/utils/checksumfuncs.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef CHECKSUMFUNCS_H
+#define CHECKSUMFUNCS_H
+
+/*
+ * Note: postgres.h is deliberately not included here; it must be the first
+ * include of every backend .c file, never of a header.
+ */
+#include "access/tupdesc.h"
+#include "common/relpath.h"
+#include "storage/block.h"
+#include "utils/relcache.h"
+#include "utils/tuplestore.h"
+
+/*
+ * Value meaning that no checksum was computed.  A zero checksum can never be
+ * computed, see pg_checksum_page().
+ */
+#define NoComputedChecksum	0
+
+/*
+ * Check a single block of the given relation fork; returns false if the
+ * block appears to be corrupted.  Defined in storage/page/checksum.c.
+ */
+extern bool check_one_block(Relation relation, ForkNumber forknum,
+							BlockNumber blkno, uint16 *chk_expected,
+							uint16 *chk_found);
+
+#endif							/* CHECKSUMFUNCS_H */
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 04431d0eb2..4ed2a09600 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -62,6 +62,7 @@ enum config_group
RESOURCES_MEM,
RESOURCES_DISK,
RESOURCES_KERNEL,
+ RESOURCES_CHECKSUM_DELAY,
RESOURCES_VACUUM_DELAY,
RESOURCES_BGWRITER,
RESOURCES_ASYNCHRONOUS,
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile
index a6d2ffbf9e..a845af71fd 100644
--- a/src/test/modules/Makefile
+++ b/src/test/modules/Makefile
@@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global
SUBDIRS = \
brin \
commit_ts \
+ check_relation \
delay_execution \
dummy_index_am \
dummy_seclabel \
diff --git a/src/test/modules/check_relation/.gitignore b/src/test/modules/check_relation/.gitignore
new file mode 100644
index 0000000000..871e943d50
--- /dev/null
+++ b/src/test/modules/check_relation/.gitignore
@@ -0,0 +1,2 @@
+# Generated by test suite
+/tmp_check/
diff --git a/src/test/modules/check_relation/Makefile b/src/test/modules/check_relation/Makefile
new file mode 100644
index 0000000000..a540cdece2
--- /dev/null
+++ b/src/test/modules/check_relation/Makefile
@@ -0,0 +1,14 @@
+# src/test/modules/check_relation/Makefile
+
+TAP_TESTS = 1
+
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+subdir = src/test/modules/check_relation
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/src/test/modules/check_relation/README b/src/test/modules/check_relation/README
new file mode 100644
index 0000000000..415c4b21ad
--- /dev/null
+++ b/src/test/modules/check_relation/README
@@ -0,0 +1,23 @@
+src/test/modules/check_relation/README
+
+Regression tests for online checksums verification
+==================================================
+
+This directory contains a test suite for online checksums verification.
+
+Running the tests
+=================
+
+NOTE: You must have given the --enable-tap-tests argument to configure.
+
+Run
+ make check
+or
+ make installcheck
+You can use "make installcheck" if you previously did "make install".
+In that case, the code in the installation tree is tested. With
+"make check", a temporary installation tree is built from the current
+sources and then tested.
+
+Either way, this test initializes, starts, and stops a test Postgres
+cluster.
diff --git a/src/test/modules/check_relation/t/001_checksums_check.pl b/src/test/modules/check_relation/t/001_checksums_check.pl
new file mode 100644
index 0000000000..2a3f2880ea
--- /dev/null
+++ b/src/test/modules/check_relation/t/001_checksums_check.pl
@@ -0,0 +1,276 @@
+use strict;
+use warnings;
+
+use PostgresNode;
+use TestLib;
+use Test::More tests => 59;
+
+our $CHECKSUM_UINT16_OFFSET = 4;    # index of the checksum, in uint16 units
+our $PD_UPPER_UINT16_OFFSET = 7;    # index of pd_upper, in uint16 units
+our $BLOCKSIZE;                     # set from the server's block_size
+our $TOTAL_NB_ERR = 0;              # running total of expected errors
+
+# Return block number $blkno of $filename as a raw string (up to $BLOCKSIZE).
+sub get_block
+{
+	my ($filename, $blkno) = @_;
+	my $block;
+
+	open(my $infile, '<', $filename) or die("could not open $filename: $!");
+	binmode($infile);
+	# read()'s 4th argument is an offset into $block, not the file: seek first.
+	seek($infile, $blkno * $BLOCKSIZE, 0) or die("could not seek: $!");
+	my $success = read($infile, $block, $BLOCKSIZE);
+	die($!) if not defined $success;
+	close($infile);
+	return($block);
+}
+
+# Overwrite block number $blkno of $filename in place with $block.
+sub overwrite_block
+{
+	my ($filename, $block, $blkno) = @_;
+
+	# '+<' keeps the other blocks, whereas '>' would truncate the file first.
+	open(my $outfile, '+<', $filename) or die("could not open $filename: $!");
+	binmode($outfile);
+	# syswrite()'s 4th argument is an offset into $block, not the file.
+	sysseek($outfile, $blkno * $BLOCKSIZE, 0) or die("could not seek: $!");
+	my $nb = syswrite($outfile, $block, $BLOCKSIZE);
+	die($!) if not defined $nb;
+	die("Write error") if ($nb != $BLOCKSIZE);
+
+	close($outfile) or die("could not close $filename: $!");
+}
+
+sub get_uint16_from_page
+{
+	my ($block, $offset) = @_;    # $offset counts uint16s, not bytes
+	my @words = unpack("S*", $block);
+	return $words[$offset];
+}
+
+# Return a copy of $block whose uint16 at index $offset is set to $data.
+sub set_uint16_to_page
+{
+	my ($block, $data, $offset) = @_;
+	my @bytes = unpack('C*', pack("S", $data));
+
+	# A vec() of 16 bits or more always stores big-endian, so copy the
+	# natively packed value one byte at a time instead.
+	vec($block, 2 * $offset + $_, 8) = $bytes[$_] for (0, 1);
+
+	return $block;
+}
+
+# Return true if pg_check_relation('$relname') wrote nothing to stderr.
+sub check_checksums_call
+{
+	my ($node, $relname) = @_;
+	my $query = "SELECT COUNT(*)"
+	  . " FROM pg_catalog.pg_check_relation('$relname')";
+
+	my (undef, undef, $errors) = $node->psql('postgres', $query);
+	return ($errors eq '');
+}
+
+# Run pg_check_relation() on the main fork of every relation of relkind 'r',
+# 'i' or 'm'; expect $nb reported rows and stderr matching $pattern.
+sub check_checksums_nb_error
+{
+	my ($node, $nb, $pattern) = @_;
+	my $query = "SELECT COUNT(*)"
+	  . " FROM (SELECT pg_catalog.pg_check_relation(oid, 'main')"
+	  . " FROM pg_class WHERE relkind in ('r', 'i', 'm')) AS s";
+
+	my ($rc, $out, $err) = $node->psql('postgres', $query);
+	is($rc, 0, 'Function should run successfully');
+	like($err, $pattern, 'Error output should match expectations');
+	is($out, $nb, "Should have $nb error");
+	$TOTAL_NB_ERR += $nb;    # running total of expected errors
+}
+
+# Check that pg_stat_database reports $TOTAL_NB_ERR checksum failures overall.
+sub check_pg_stat_database_nb_error
+{
+	my ($node) = @_;
+	my $query = "SELECT "
+	  . " sum(checksum_failures)"
+	  . " FROM pg_catalog.pg_stat_database";
+
+	my ($rc, $out, $err) = $node->psql('postgres', $query);
+	is($rc, 0, 'Function should run successfully');
+	is($err, '', 'Function should run successfully');
+	is($out, $TOTAL_NB_ERR, "Should have $TOTAL_NB_ERR error");
+}
+
+# Return the rows reported by pg_check_relation(oid) for every relation of
+# relkind 'r', 'i' or 'm'.  The call must succeed with stderr matching
+# $pattern; $nb is added to the running total of expected errors.
+sub get_checksums_errors
+{
+	my ($node, $nb, $pattern) = @_;
+	my $query = "SELECT"
+	  . " relid::regclass::text, forknum, failed_blocknum,"
+	  . " expected_checksum, found_checksum"
+	  . " FROM (SELECT (pg_catalog.pg_check_relation(oid)).*"
+	  . " FROM pg_class WHERE relkind in ('r','i', 'm')) AS s";
+
+	my ($rc, $out, $err) = $node->psql('postgres', $query);
+	is($rc, '0', 'Function should run successfully');
+	like($err, $pattern, 'Error output should match expectations');
+	$TOTAL_NB_ERR += $nb;
+	return $out;
+}
+
+# Corrupt block $blkno of $filename by storing $fake_data at uint16 offset
+# $offset, check that the corruption is detected and reported, and finally
+# restore the block; the node is stopped whenever the file is modified.
+sub corrupt_and_test_block
+{
+	my ($node, $filename, $blkno, $offset, $fake_data) = @_;
+
+	check_checksums_nb_error($node, 0, qr/^$/);
+	check_pg_stat_database_nb_error($node);
+
+	$node->stop();
+
+	my $original_block = get_block($filename, $blkno);
+	my $original_data = get_uint16_from_page($original_block, $offset);
+
+	isnt($original_data, $fake_data,
+		"The fake data at offset $offset should be different"
+		. " from the existing one");
+
+	my $new_block = set_uint16_to_page($original_block, $fake_data, $offset);
+	isnt($original_data, get_uint16_from_page($new_block, $offset),
+		"The fake data at offset $offset should have been changed in memory");
+
+	overwrite_block($filename, $new_block, $blkno);
+
+	my $written_data = get_uint16_from_page(get_block($filename, $blkno),
+		$offset);
+	isnt($original_data, $written_data,
+		"The data written at offset $offset should be different"
+		. " from the original one");
+	is(get_uint16_from_page($new_block, $offset), $written_data,
+		"The data written at offset $offset should be the same"
+		. " as the one in memory");
+	is($written_data, $fake_data,
+		"The data written at offset $offset should be the one"
+		. " we wanted to write");
+
+	$node->start();
+
+	check_checksums_nb_error($node, 1, qr/invalid page in block $blkno/);
+
+	my $expected_checksum;
+	my $found_checksum = get_uint16_from_page($new_block,
+		$CHECKSUM_UINT16_OFFSET);
+	if ($offset == $PD_UPPER_UINT16_OFFSET)
+	{
+		# A checksum can't be computed if it's detected as PageIsNew(), so the
+		# function returns NULL for the computed checksum
+		$expected_checksum = '';
+	}
+	else
+	{
+		$expected_checksum = get_uint16_from_page($original_block,
+			$CHECKSUM_UINT16_OFFSET);
+	}
+
+	my $det = get_checksums_errors($node, 1, qr/invalid page in block $blkno/);
+	is($det, "t1|0|$blkno|$expected_checksum|$found_checksum",
+		"The checksums error for modification at offset $offset"
+		. " should be detected");
+
+	$node->stop();
+
+	$new_block = set_uint16_to_page($original_block, $original_data, $offset);
+	is($original_data, get_uint16_from_page($new_block, $offset),
+		"The data at offset $offset should have been restored in memory");
+
+	overwrite_block($filename, $new_block, $blkno);
+	is($original_data, get_uint16_from_page(get_block($filename, $blkno),
+			$offset),
+		"The data at offset $offset should have been restored on disk");
+
+	$node->start();
+
+	check_checksums_nb_error($node, 0, qr/^$/);
+}
+
+if (exists $ENV{MY_PG_REGRESS})
+{
+	$ENV{PG_REGRESS} = $ENV{MY_PG_REGRESS};
+}
+
+my $node = get_new_node('main');
+# Initialize the cluster with data checksums enabled.
+my %params;
+$params{'extra'} = ['--data-checksums'];
+$node->init(%params);
+
+$node->start();
+
+$ENV{PGOPTIONS} = '--client-min-messages=WARNING';
+
+my ($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT"
+	. " current_setting('data_checksums')");
+
+is($stdout, 'on', 'Data checksums should be enabled');
+# Remember the block size, the block helpers use it to read and write pages.
+($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT"
+	. " current_setting('block_size')");
+
+$BLOCKSIZE = $stdout;
+
+$node->safe_psql(
+ 'postgres', q|
+ CREATE TABLE public.t1(id integer);
+ CREATE INDEX t1_id_idx ON public.t1 (id);
+ INSERT INTO public.t1 SELECT generate_series(1, 100);
+ CREATE VIEW public.v1 AS SELECT * FROM t1;
+ CREATE MATERIALIZED VIEW public.mv1 AS SELECT * FROM t1;
+ CREATE SEQUENCE public.s1;
+ CREATE UNLOGGED TABLE public.u_t1(id integer);
+ CREATE INDEX u_t1_id_idx ON public.u_t1 (id);
+ INSERT INTO public.u_t1 SELECT generate_series(1, 100);
+ CHECKPOINT;
+|);
+
+# Check sane behavior on various object types, including those that don't have
+# any storage.
+is(check_checksums_call($node, 't1'), '1', 'Can check a table');
+is(check_checksums_call($node, 't1_id_idx'), '1', 'Can check an index');
+is(check_checksums_call($node, 'v1'), '', 'Cannot check a view');
+is(check_checksums_call($node, 'mv1'), '1', 'Can check a materialized view');
+is(check_checksums_call($node, 's1'), '1', 'Can check a sequence');
+is(check_checksums_call($node, 'u_t1'), '1', 'Can check an unlogged table');
+is(check_checksums_call($node, 'u_t1_id_idx'), '1', 'Can check an unlogged index');
+
+# Get the absolute path of the file underlying table t1.
+($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT"
+	. " current_setting('data_directory') || '/' || pg_relation_filepath('t1')"
+);
+
+isnt($stdout, '', 'A relfilenode should be returned');
+
+my $filename = $stdout;
+
+check_checksums_nb_error($node, 0, qr/^$/);
+
+check_pg_stat_database_nb_error($node);
+
+my $fake_uint16 = hex '0x0000';
+
+# Test with a modified checksum.  We use a zero checksum here as it's the only
+# one that cannot exist on a checksummed page.  We also don't have an easy way
+# to compute what the checksum would be after a modification in a random place
+# in the block.
+corrupt_and_test_block($node, $filename, 0, $CHECKSUM_UINT16_OFFSET,
+	$fake_uint16);
+
+# Test corruption making the block look like it's PageIsNew().
+corrupt_and_test_block($node, $filename, 0, $PD_UPPER_UINT16_OFFSET,
+	$fake_uint16);
--
2.20.1