Cost-based Checksum Verification Delay

From 84d0a4c5ec5d6b6f832fad02d421c204f1bee98b Mon Sep 17 00:00:00 2001 From: Julien Rouhaud Date: Mon, 4 Nov 2019 08:40:23 +0100 Subject: [PATCH v12] Add a pg_check_relation() function. This functions checks the validity of the checksums for all non-dirty blocks of a given relation, and optionally a given fork, and returns the list of all blocks that don't match, along with the expected and found checksums. Author: Julien Rouhaud Reviewed-by: Michael Paquier, Masahiko Sawada, Justin Pryzby Discussion: https://postgr.es/m/CAOBaU_aVvMjQn%3Dge5qPiJOPMmOj5%3Dii3st5Q0Y%2BWuLML5sR17w%40mail.gmail.com --- doc/src/sgml/config.sgml | 85 +++++ doc/src/sgml/func.sgml | 51 +++ src/backend/postmaster/pgstat.c | 3 + src/backend/storage/page/checksum.c | 322 +++++++++++++++++- src/backend/utils/adt/Makefile | 1 + src/backend/utils/adt/checksumfuncs.c | 218 ++++++++++++ src/backend/utils/init/globals.c | 7 + src/backend/utils/misc/guc.c | 33 ++ src/backend/utils/misc/postgresql.conf.sample | 6 + src/include/catalog/pg_proc.dat | 16 + src/include/miscadmin.h | 7 + src/include/pgstat.h | 3 +- src/include/utils/checksumfuncs.h | 31 ++ src/include/utils/guc_tables.h | 1 + src/test/modules/Makefile | 1 + src/test/modules/check_relation/.gitignore | 2 + src/test/modules/check_relation/Makefile | 14 + src/test/modules/check_relation/README | 23 ++ .../check_relation/t/001_checksums_check.pl | 276 +++++++++++++++ 19 files changed, 1096 insertions(+), 4 deletions(-) create mode 100644 src/backend/utils/adt/checksumfuncs.c create mode 100644 src/include/utils/checksumfuncs.h create mode 100644 src/test/modules/check_relation/.gitignore create mode 100644 src/test/modules/check_relation/Makefile create mode 100644 src/test/modules/check_relation/README create mode 100644 src/test/modules/check_relation/t/001_checksums_check.pl diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index c4ba49ffaf..b7629fde60 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ 
-2137,6 +2137,91 @@ include_dir 'conf.d' + + Cost-based Checksum Verification Delay + + + During the execution of + function, the system maintains an internal counter that keeps track of + the estimated cost of the various I/O operations that are performed. + When the accumulated cost reaches a limit (specified by + checksum_cost_limit), the process performing the + operation will sleep for a short period of time, as specified by + checksum_cost_delay. Then it will reset the counter + and continue execution. + + + + This feature is disabled by default. To enable it, set the + checksum_cost_delay variable to a nonzero + value. + + + + + checksum_cost_delay (floating point) + + checksum_cost_delay configuration parameter + + + + + The amount of time that the process will sleep + when the cost limit has been exceeded. + If this value is specified without units, it is taken as milliseconds. + The default value is zero, which disables the cost-based checksum + verification delay feature. Positive values enable cost-based + checksum verification. + + + + + + checksum_cost_page (integer) + + checksum_cost_page configuration parameter + + + + + The estimated cost for verifying a buffer, whether it's found in the + shared buffer cache or not. It represents the cost to lock the buffer + pool, lookup the shared hash table, read the content of the page from + disk and compute its checksum. The default value is 10. + + + + + + checksum_cost_limit (integer) + + checksum_cost_limit configuration parameter + + + + + The accumulated cost that will cause the verification process to sleep. + The default value is 200. + + + + + + + + There are certain operations that hold critical locks and should + therefore complete as quickly as possible. Cost-based checksum + verification delays do not occur during such operations. Therefore it + is possible that the cost accumulates far higher than the specified + limit. 
To avoid uselessly long delays in such cases, the actual delay + is calculated as checksum_cost_delay * + accumulated_balance / + checksum_cost_limit with a maximum of + checksum_cost_delay * 4. + + + + Background Writer diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index e2e618791e..bcc97d1306 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -26221,6 +26221,57 @@ SELECT convert_from(pg_read_binary_file('file_in_utf8.txt'), 'UTF8'); + + Data Sanity Functions + + + The functions shown in + provide a means to check for health of a data file in a cluster. + + + + Data Sanity Functions + + + Name Return Type Description + + + + + + + pg_check_relation(relation regclass [, fork text]) + + setof record + Validate the checksum for all blocks of a relation. + + + + +

+ + + pg_check_relation + + + pg_check_relation iterates over all blocks of a + given relation and verifies their checksums. If passed, + fork specifies that only checksums of the given + fork are to be verified. Fork should be 'main' for the + main data fork, 'fsm' for the free space map, + 'vm' for the visibility map, or + 'init' for the initialization fork. + The function returns a list of blocks for which the computed and stored + checksums don't match. See the Cost-based Checksum Verification Delay section for + information on how to configure cost-based verification delay. You must be + a member of the pg_stat_scan_tables role to use this + function. It can only be used if data checksums are enabled. See for more information. + + + + diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 5f4b168fd1..64664c0bc6 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3912,6 +3912,9 @@ pgstat_get_wait_timeout(WaitEventTimeout w) case WAIT_EVENT_VACUUM_DELAY: event_name = "VacuumDelay"; break; + case WAIT_EVENT_CHECK_DELAY: + event_name = "CheckDelay"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c index e010691c9f..dbdc105060 100644 --- a/src/backend/storage/page/checksum.c +++ b/src/backend/storage/page/checksum.c @@ -15,8 +15,324 @@ #include "storage/checksum.h" /* - * The actual code is in storage/checksum_impl.h. This is done so that - * external programs can incorporate the checksum code by #include'ing - * that file from the exported Postgres headers. (Compare our CRC code.) + * The actual checksum computation code is in storage/checksum_impl.h. This + * is done so that external programs can incorporate the checksum code by + * #include'ing that file from the exported Postgres headers. (Compare our + * CRC code.)
*/ #include "storage/checksum_impl.h" + +#include "funcapi.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/buf_internals.h" +#include "storage/lmgr.h" +#include "utils/checksumfuncs.h" + +/* ---------------- + * The rest of this module provides a set of functions that can be used to + * safely check all checksums on a running cluster. + * + * Please note that those only perform standard buffered reads, and don't try + * to bypass or discard the operating system cache. If you want to check the + * actual storage, you have to discard the operating system cache before + * running those functions. + * + * To avoid torn pages and possible false positives when reading data, and to + * keep overhead as low as possible, the following heuristics are used: + * + * - a shared LWLock is taken on the target buffer pool partition mapping, and + * we detect if a block is in shared_buffers or not. See check_get_buffer() + * comments for more details about the locking strategy. + * + * - if a block is dirty in shared_buffers, it's ignored as it'll be flushed to + * disk either before the end of the next checkpoint or during recovery in + * case of unsafe shutdown + * + * - if a block is otherwise found in shared_buffers, an IO lock is taken on + * the block and the block is then read from storage, ignoring the block in + * shared_buffers + * + * - if a block is not found in shared_buffers, the LWLock is released and the + * block is read from disk without taking any lock. If an error is detected, + * the read block will be discarded and retrieved again while holding the + * LWLock. This is because an error due to concurrent write is possible but + * very unlikely, so it's better to have an optimistic approach to limit + * locking overhead + * + * The check can be performed using an SQL function, returning the list of + * problematic blocks. 
+ * ---------------- + */ + +static bool check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected, + uint16 *chk_found); +static void check_delay_point(void); +static bool check_get_buffer(Relation relation, ForkNumber forknum, + BlockNumber blkno, char *buffer, bool needlock, + bool *found_in_sb); + +/* + * Check data sanity for a specific block in the given fork of the given + * relation, always retrieved locally with smgrread even if a version exists in + * shared_buffers. Returns false if the block appears to be corrupted, true + * otherwise. Note that dirty and invalid blocks won't be checked. Caller + * must hold at least an AccessShareLock on the relation. + */ +bool +check_one_block(Relation relation, ForkNumber forknum, BlockNumber blkno, + uint16 *chk_expected, uint16 *chk_found) +{ + char buffer[BLCKSZ]; + bool force_lock = false; + bool found_in_sb; + + Assert(CheckRelationLockedByMe(relation, AccessShareLock, true)); + Assert(blkno < RelationGetNumberOfBlocksInFork(relation, forknum)); + Assert(smgrexists(relation->rd_smgr, forknum)); + + *chk_expected = *chk_found = NoComputedChecksum; + + /* + * To avoid excessive overhead, the buffer will be first read without + * the locks that would prevent false positives, as such + * events should be quite rare. + */ +Retry: + if (!check_get_buffer(relation, forknum, blkno, buffer, force_lock, + &found_in_sb)) + return true; + + if (check_buffer(buffer, blkno, chk_expected, chk_found)) + return true; + + /* + * If we get a failure and the buffer wasn't found in shared buffers, + * reread the buffer with suitable lock to avoid false positive. See + * check_get_buffer for more details. + */ + if (!found_in_sb && !force_lock) + { + force_lock = true; + goto Retry; + } + + /* A corruption is detected. */ + return false; +} + +/* + * Perform a checksum check on the passed page. 
Return true iff the page is + * valid, and assign the expected and found checksum in chk_expected and + * chk_found, respectively. Note that a page can look like new but could be + * the result of corruption. We still check for this case, but we can't + * compute its checksum as pg_checksum_page() is explicitly checking for + * non-new pages, so NoComputedChecksum will be set in chk_expected. + */ +static bool +check_buffer(char *buffer, uint32 blkno, uint16 *chk_expected, + uint16 *chk_found) +{ + Page page = (Page) buffer; + PageHeader hdr = (PageHeader) page; + + Assert(chk_expected && chk_found); + + if (PageIsNew(page)) + { + /* + * Check if the page is really new or if there's corruption that + * affected PageIsNew detection. Note that PageIsVerified won't try to + * detect checksum corruption in this case, so there's no risk of + * duplicated corruption report. + */ + if (PageIsVerified(page, blkno)) + { + /* No corruption. */ + return true; + } + + /* + * There's corruption, but since this affects PageIsNew, we + * can't compute a checksum, so set NoComputedChecksum for the + * expected checksum. + */ + *chk_expected = NoComputedChecksum; + *chk_found = hdr->pd_checksum; + return false; + } + + *chk_expected = pg_checksum_page(buffer, blkno); + *chk_found = hdr->pd_checksum; + + return (*chk_expected == *chk_found); +} + +/* + * Check for interrupts and cost-based delay.
+ */ +static void +check_delay_point(void) +{ + /* Always check for interrupts */ + CHECK_FOR_INTERRUPTS(); + + if (!ChecksumCostActive || InterruptPending) + return; + + /* Nap if appropriate */ + if (ChecksumCostBalance >= ChecksumCostLimit) + { + double msec; /* ChecksumCostDelay is a double: keep fractional ms */ + + msec = ChecksumCostDelay * ChecksumCostBalance / ChecksumCostLimit; + if (msec > ChecksumCostDelay * 4) + msec = ChecksumCostDelay * 4; + + pgstat_report_wait_start(WAIT_EVENT_CHECK_DELAY); + pg_usleep((long) (msec * 1000)); /* convert to whole microseconds */ + pgstat_report_wait_end(); + + ChecksumCostBalance = 0; + + /* Might have gotten an interrupt while sleeping */ + CHECK_FOR_INTERRUPTS(); + } +} + +/* + *------------------------- + * Safely read the wanted buffer from disk, dealing with possible concurrency + * issue. Note that if a buffer is found dirty in shared_buffers, no read will + * be performed and the caller will be informed that no check should be done. + * We can safely ignore such buffers as they'll be written before next + * checkpoint's completion. + * + * The following locks can be used in this function: + * + * - shared LWLock on the target buffer pool partition mapping. + * - IOLock on the buffer + * + * The IOLock is taken when reading the buffer from disk if it exists in + * shared_buffers, to avoid torn pages. + * + * If the buffer isn't in shared_buffers, it'll be read from disk without any + * lock unless caller asked otherwise, setting needlock. In this case, the + * read will be done while the buffer mapping partition LWLock is still being + * held. Reading with this lock is to avoid the unlikely but possible case + * that a buffer wasn't present in shared buffers when we checked but it then + * alloc'ed in shared_buffers, modified and flushed concurrently when we + * later try to read it, leading to false positives due to a torn page.
Caller + * can first read the buffer without holding the target buffer mapping + * partition LWLock to have an optimistic approach, and reread the buffer + * from disk in case of error. + * + * Caller should hold an AccessShareLock on the Relation + *------------------------- + */ +static bool +check_get_buffer(Relation relation, ForkNumber forknum, + BlockNumber blkno, char *buffer, bool needlock, + bool *found_in_sb) +{ + bool checkit = true; + BufferTag buf_tag; /* identity of requested block */ + uint32 buf_hash; /* hash value for buf_tag */ + LWLock *partLock; /* buffer partition lock for the buffer */ + BufferDesc *bufdesc; + int buf_id; + + *found_in_sb = false; + + /* Check for interrupts and take throttling into account. */ + check_delay_point(); + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(buf_tag, relation->rd_smgr->smgr_rnode.node, forknum, blkno); + + /* determine its hash code and partition lock ID */ + buf_hash = BufTableHashCode(&buf_tag); + partLock = BufMappingPartitionLock(buf_hash); + + /* see if the block is in the buffer pool already */ + LWLockAcquire(partLock, LW_SHARED); + buf_id = BufTableLookup(&buf_tag, buf_hash); + if (buf_id >= 0) + { + uint32 buf_state; + + *found_in_sb = true; + + /* + * Found it. Now, retrieve its state to know what to do with it, and + * release the pin immediately. We do so to limit overhead as much + * as possible. We'll keep the shared lightweight lock on the target + * buffer mapping partition, so this buffer can't be evicted, and + * we'll acquire an IOLock on the buffer if we need to read the + * content on disk. + */ + bufdesc = GetBufferDescriptor(buf_id); + + buf_state = LockBufHdr(bufdesc); + UnlockBufHdr(bufdesc, buf_state); + + /* + * Dirty pages are ignored as they'll be flushed soon. Invalid buffers + * are also skipped. 
+ */ + if ((buf_state & BM_DIRTY) || !(buf_state & BM_TAG_VALID)) + checkit = false; + + /* + * Read the buffer from disk, taking an IO lock to prevent torn-page + * reads, in the unlikely event that it was concurrently dirtied and + * flushed. + */ + if (checkit) + { + LWLockAcquire(BufferDescriptorGetIOLock(bufdesc), LW_SHARED); + smgrread(relation->rd_smgr, forknum, blkno, buffer); + LWLockRelease(BufferDescriptorGetIOLock(bufdesc)); + + /* Add a page cost. */ + ChecksumCostBalance += ChecksumCostPage; + } + } + else if (needlock) + { + /* + * Caller asked to read the buffer while we have a lock on the target + * partition. + */ + smgrread(relation->rd_smgr, forknum, blkno, buffer); + + /* The buffer will have to be checked. */ + Assert(checkit); + + /* Add a page cost. */ + ChecksumCostBalance += ChecksumCostPage; + } + + LWLockRelease(partLock); + + if (*found_in_sb || needlock) + return checkit; + + /* After this point the buffer will always be checked. */ + Assert(checkit); + + /* + * Didn't find it in the buffer pool and didn't read it while holding the + * buffer mapping partition lock. We'll have to try to read it from + * disk, after releasing the target partition lock to avoid excessive + * overhead. It means that it's possible to get a torn page later, so + * we'll have to retry with a suitable lock in case of error to avoid + * false positive. + */ + smgrread(relation->rd_smgr, forknum, blkno, buffer); + + /* Add a page cost. 
*/ + ChecksumCostBalance += ChecksumCostPage; + + return checkit; +} diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index b4d55e849b..603f63afb6 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -22,6 +22,7 @@ OBJS = \ bool.o \ cash.o \ char.o \ + checksumfuncs.o \ cryptohashes.o \ date.o \ datetime.o \ diff --git a/src/backend/utils/adt/checksumfuncs.c b/src/backend/utils/adt/checksumfuncs.c new file mode 100644 index 0000000000..700831cbc7 --- /dev/null +++ b/src/backend/utils/adt/checksumfuncs.c @@ -0,0 +1,218 @@ +/*------------------------------------------------------------------------- + * + * checksumfuncs.c + * Functions for checksum related feature such as online verification + * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/utils/adt/checksumfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relation.h" +#include "catalog/pg_authid_d.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/checksum.h" +#include "storage/smgr.h" +#include "utils/acl.h" +#include "utils/checksumfuncs.h" +#include "utils/rel.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + +static void check_one_relation(TupleDesc tupdesc, Tuplestorestate *tupstore, + Oid relid, ForkNumber single_forknum); +static void check_relation_fork(TupleDesc tupdesc, Tuplestorestate *tupstore, + Relation relation, ForkNumber forknum); +static void pg_check_relation_internal(FunctionCallInfo fcinfo, Oid relid, + Oid forknum); + + +Datum +pg_check_relation(PG_FUNCTION_ARGS) +{ + Oid relid = InvalidOid; + + pg_check_relation_internal(fcinfo, relid, InvalidForkNumber); + + return (Datum) 0; +} + +Datum 
+pg_check_relation_fork(PG_FUNCTION_ARGS) +{ + Oid relid = InvalidOid; + const char *forkname; + ForkNumber forknum; + + forkname = TextDatumGetCString(PG_GETARG_TEXT_PP(1)); + forknum = forkname_to_number(forkname); + + pg_check_relation_internal(fcinfo, relid, forknum); + + return (Datum) 0; +} + +/* Common code for all versions of pg_check_relation() */ +static void +pg_check_relation_internal(FunctionCallInfo fcinfo, Oid relid, Oid forknum) +{ + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + + if (!DataChecksumsEnabled()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("data checksums are not enabled in cluster"))); + + if (!is_member_of_role(GetUserId(), DEFAULT_ROLE_STAT_SCAN_TABLES)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("only superuser or a member of the pg_stat_scan_tables role may use this function"))); + + /* Switch into long-lived context to construct returned data structures */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + relid = PG_GETARG_OID(0); + + /* Set cost-based checksum verification delay */ + ChecksumCostActive = (ChecksumCostDelay > 0); + ChecksumCostBalance = 0; + + check_one_relation(tupdesc, tupstore, relid, forknum); + + tuplestore_donestoring(tupstore); +} + +/* + * Perform the check on a single relation, possibly filtered with a single + * fork. 
The relation is opened with relation_open() under an + * AccessShareLock; an ERROR is raised if it has no storage or is a temporary + * table of another session. Forks that do not exist are silently skipped. + */ +static void +check_one_relation(TupleDesc tupdesc, Tuplestorestate *tupstore, + Oid relid, ForkNumber single_forknum) +{ + Relation relation = NULL; + ForkNumber forknum; + + relation = relation_open(relid, AccessShareLock); + + /* sanity checks */ + if (!RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("relation \"%s\" does not have storage to be checked", + RelationGetRelationName(relation)))); + + if (RELATION_IS_OTHER_TEMP(relation)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot verify temporary tables of other sessions"))); + + RelationOpenSmgr(relation); + + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + if (single_forknum != InvalidForkNumber && single_forknum != forknum) + continue; + + if (smgrexists(relation->rd_smgr, forknum)) + check_relation_fork(tupdesc, tupstore, relation, forknum); + } + + relation_close(relation, AccessShareLock); /* release the lock */ +} + +/* + * For a given relation and fork, do the real work of iterating over all pages + * and doing the check. Caller must hold an AccessShareLock on the given + * relation. + */ +static void +check_relation_fork(TupleDesc tupdesc, Tuplestorestate *tupstore, + Relation relation, ForkNumber forknum) +{ + BlockNumber blkno, + nblocks; + + /* + * We remember the number of blocks here. Since caller must hold a lock on + * the relation, we know that it won't be truncated while we're iterating + * over the blocks. Any block added after this function started won't be + * checked, but this is out of scope as such pages will be flushed before + * the next checkpoint's completion.
+ */ + nblocks = RelationGetNumberOfBlocksInFork(relation, forknum); + +#define PG_CHECK_RELATION_COLS 5 /* Number of output arguments in the SRF */ + for (blkno = 0; blkno < nblocks; blkno++) + { + uint16 chk_expected, + chk_found; + Datum values[PG_CHECK_RELATION_COLS]; + bool nulls[PG_CHECK_RELATION_COLS]; + int i = 0; + + /* Check the given buffer */ + if (check_one_block(relation, forknum, blkno, &chk_expected, + &chk_found)) + { + /* Buffer not corrupted or not worth checking, continue */ + continue; + } + + memset(values, 0, sizeof(values)); + memset(nulls, 0, sizeof(nulls)); + + values[i++] = ObjectIdGetDatum(relation->rd_id); + values[i++] = Int32GetDatum(forknum); + values[i++] = UInt32GetDatum(blkno); + /* + * This can happen if corruption makes the block appears as + * PageIsNew() but isn't a new page. + */ + if (chk_expected == NoComputedChecksum) + nulls[i++] = true; + else + values[i++] = UInt16GetDatum(chk_expected); + values[i++] = UInt16GetDatum(chk_found); + + Assert(i == PG_CHECK_RELATION_COLS); + + /* Report the failure to the stat collector and the logs. */ + pgstat_report_checksum_failure(); + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid page in block %u of relation %s", + blkno, + relpath(relation->rd_smgr->smgr_rnode, forknum)))); + + /* Save the corrupted blocks in the tuplestore. 
*/ + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } +} diff --git a/src/backend/utils/init/globals.c b/src/backend/utils/init/globals.c index 6ab8216839..68e6e0cfcb 100644 --- a/src/backend/utils/init/globals.c +++ b/src/backend/utils/init/globals.c @@ -135,6 +135,13 @@ int max_worker_processes = 8; int max_parallel_workers = 8; int MaxBackends = 0; +int ChecksumCostPage = 10; /* GUC parameters for checksum check */ +int ChecksumCostLimit = 200; +double ChecksumCostDelay = 0; + +int ChecksumCostBalance = 0; /* working state for checksums check */ +bool ChecksumCostActive = false; + int VacuumCostPageHit = 1; /* GUC parameters for vacuum */ int VacuumCostPageMiss = 10; int VacuumCostPageDirty = 20; diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index de87ad6ef7..a18f85181d 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -691,6 +691,8 @@ const char *const config_group_names[] = gettext_noop("Resource Usage / Disk"), /* RESOURCES_KERNEL */ gettext_noop("Resource Usage / Kernel Resources"), + /* RESOURCES_CHECKSUM_DELAY */ + gettext_noop("Resource Usage / Cost-Based Checksum Verification Delay"), /* RESOURCES_VACUUM_DELAY */ gettext_noop("Resource Usage / Cost-Based Vacuum Delay"), /* RESOURCES_BGWRITER */ @@ -2385,6 +2387,26 @@ static struct config_int ConfigureNamesInt[] = NULL, NULL, NULL }, + { + {"checksum_cost_page", PGC_USERSET, RESOURCES_CHECKSUM_DELAY, + gettext_noop("Checksum cost for verifying a page."), + NULL + }, + &ChecksumCostPage, + 10, 0, 10000, + NULL, NULL, NULL + }, + + { + {"checksum_cost_limit", PGC_USERSET, RESOURCES_CHECKSUM_DELAY, + gettext_noop("Checksum cost amount available before napping."), + NULL + }, + &ChecksumCostLimit, + 200, 1, 10000, + NULL, NULL, NULL + }, + { {"vacuum_cost_page_hit", PGC_USERSET, RESOURCES_VACUUM_DELAY, gettext_noop("Vacuum cost for a page found in the buffer cache."), @@ -3585,6 +3607,17 @@ static struct config_real ConfigureNamesReal[] = 
check_random_seed, assign_random_seed, show_random_seed }, + { + {"checksum_cost_delay", PGC_USERSET, RESOURCES_CHECKSUM_DELAY, + gettext_noop("Checksum cost delay in milliseconds."), + NULL, + GUC_UNIT_MS + }, + &ChecksumCostDelay, + 0, 0, 100, + NULL, NULL, NULL + }, + { {"vacuum_cost_delay", PGC_USERSET, RESOURCES_VACUUM_DELAY, gettext_noop("Vacuum cost delay in milliseconds."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 9cb571f7cc..0b45a3a9eb 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -160,6 +160,12 @@ #max_files_per_process = 1000 # min 64 # (change requires restart) +# - Cost-Based Checksum Verification Delay - + +#checksum_cost_delay = 0 # 0-100 milliseconds (0 disables) +#checksum_cost_page = 10 # 0-10000 credits +#checksum_cost_limit = 200 # 1-10000 credits + # - Cost-Based Vacuum Delay - #vacuum_cost_delay = 0 # 0-100 milliseconds (0 disables) diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 687509ba92..e0afc7a1f0 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -10938,6 +10938,22 @@ proallargtypes => '{oid,text,int8,timestamptz}', proargmodes => '{i,o,o,o}', proargnames => '{tablespace,name,size,modification}', prosrc => 'pg_ls_tmpdir_1arg' }, +{ oid => '9147', descr => 'check data integrity for all forks of a relation', + proname => 'pg_check_relation', procost => '10000', + prorows => '20', proretset => 't', proparallel => 'r', + provolatile => 'v', prorettype => 'record', proargtypes => 'regclass', + proallargtypes => '{regclass,oid,int4,int8,int4,int4}', + proargmodes => '{i,o,o,o,o,o}', + proargnames => '{relation,relid,forknum,failed_blocknum,expected_checksum,found_checksum}', + prosrc => 'pg_check_relation' }, +{ oid => '9148', descr => 'check data integrity for one fork of a relation', + proname => 'pg_check_relation', procost => 
'10000', + prorows => '20', proretset => 't', proparallel => 'r', + provolatile => 'v', prorettype => 'record', proargtypes => 'regclass text', + proallargtypes => '{regclass,text,oid,int4,int8,int4,int4}', + proargmodes => '{i,i,o,o,o,o,o}', + proargnames => '{relation,fork,relid,forknum,failed_blocknum,expected_checksum,found_checksum}', + prosrc => 'pg_check_relation_fork' }, # hash partitioning constraint function { oid => '5028', descr => 'hash partition CHECK constraint', diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index 72e3352398..44c473995f 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -247,6 +247,13 @@ extern PGDLLIMPORT double hash_mem_multiplier; extern PGDLLIMPORT int maintenance_work_mem; extern PGDLLIMPORT int max_parallel_maintenance_workers; +extern int ChecksumCostPage; +extern int ChecksumCostLimit; +extern double ChecksumCostDelay; + +extern int ChecksumCostBalance; +extern bool ChecksumCostActive; + extern int VacuumCostPageHit; extern int VacuumCostPageMiss; extern int VacuumCostPageDirty; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 0dfbac46b4..b9ff8392b8 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -902,7 +902,8 @@ typedef enum WAIT_EVENT_PG_SLEEP, WAIT_EVENT_RECOVERY_APPLY_DELAY, WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL, - WAIT_EVENT_VACUUM_DELAY + WAIT_EVENT_VACUUM_DELAY, + WAIT_EVENT_CHECK_DELAY } WaitEventTimeout; /* ---------- diff --git a/src/include/utils/checksumfuncs.h b/src/include/utils/checksumfuncs.h new file mode 100644 index 0000000000..f3e13b64b1 --- /dev/null +++ b/src/include/utils/checksumfuncs.h @@ -0,0 +1,31 @@ +/*------------------------------------------------------------------------- + * + * checksumfunc.h + * Checksum verification implementation for data pages. 
+ * + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/checksumfuncs.h + * + *------------------------------------------------------------------------- + */ +#ifndef CHECKSUMFUNC_H +#define CHECKSUMFUNC_H + +#include "postgres.h" + +#include "access/tupdesc.h" +#include "common/relpath.h" +#include "utils/relcache.h" +#include "utils/tuplestore.h" + +/* + * A zero checksum can never be computed, see pg_checksum_page() */ +#define NoComputedChecksum 0 + +extern bool check_one_block(Relation relation, ForkNumber forknum, + BlockNumber blkno, uint16 *chk_expected, + uint16 *chk_found); + +#endif /* CHECKSUMFUNC_H */ diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h index 04431d0eb2..4ed2a09600 100644 --- a/src/include/utils/guc_tables.h +++ b/src/include/utils/guc_tables.h @@ -62,6 +62,7 @@ enum config_group RESOURCES_MEM, RESOURCES_DISK, RESOURCES_KERNEL, + RESOURCES_CHECKSUM_DELAY, RESOURCES_VACUUM_DELAY, RESOURCES_BGWRITER, RESOURCES_ASYNCHRONOUS, diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index a6d2ffbf9e..a845af71fd 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -7,6 +7,7 @@ include $(top_builddir)/src/Makefile.global SUBDIRS = \ brin \ commit_ts \ + check_relation \ delay_execution \ dummy_index_am \ dummy_seclabel \ diff --git a/src/test/modules/check_relation/.gitignore b/src/test/modules/check_relation/.gitignore new file mode 100644 index 0000000000..871e943d50 --- /dev/null +++ b/src/test/modules/check_relation/.gitignore @@ -0,0 +1,2 @@ +# Generated by test suite +/tmp_check/ diff --git a/src/test/modules/check_relation/Makefile b/src/test/modules/check_relation/Makefile new file mode 100644 index 0000000000..a540cdece2 --- /dev/null +++ b/src/test/modules/check_relation/Makefile @@ -0,0 +1,14 @@ +# src/test/modules/check_relation/Makefile + +TAP_TESTS = 1 +
+ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/check_relation +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/check_relation/README b/src/test/modules/check_relation/README new file mode 100644 index 0000000000..415c4b21ad --- /dev/null +++ b/src/test/modules/check_relation/README @@ -0,0 +1,23 @@ +src/test/modules/check_relation/README + +Regression tests for online checksums verification +================================================== + +This directory contains a test suite for online checksums verification. + +Running the tests +================= + +NOTE: You must have given the --enable-tap-tests argument to configure. + +Run + make check +or + make installcheck +You can use "make installcheck" if you previously did "make install". +In that case, the code in the installation tree is tested. With +"make check", a temporary installation tree is built from the current +sources and then tested. + +Either way, this test initializes, starts, and stops a test Postgres +cluster. diff --git a/src/test/modules/check_relation/t/001_checksums_check.pl b/src/test/modules/check_relation/t/001_checksums_check.pl new file mode 100644 index 0000000000..2a3f2880ea --- /dev/null +++ b/src/test/modules/check_relation/t/001_checksums_check.pl @@ -0,0 +1,276 @@ +use strict; +use warnings; + +use PostgresNode; +use TestLib; +use Test::More tests => 59; + +our $CHECKSUM_UINT16_OFFSET = 4; +our $PD_UPPER_UINT16_OFFSET = 7; +our $BLOCKSIZE; +our $TOTAL_NB_ERR = 0; + +sub get_block +{ + my ($filename, $blkno) = @_; + my $block; + + open(my $infile, '<', $filename) or die; + binmode($infile); + seek($infile, $blkno * $BLOCKSIZE, 0) or die($!); # read()'s 4th arg is a buffer offset, not a file position + my $success = read($infile, $block, $BLOCKSIZE); + die($!)
if not defined $success; + + close($infile); + + return($block); +} + +sub overwrite_block +{ + my ($filename, $block, $blkno) = @_; + + open(my $outfile, '>', $filename) or die; + binmode ($outfile); + + my $nb = syswrite($outfile, $block, $BLOCKSIZE, ($blkno * $BLOCKSIZE)); + + die($!) if not defined $nb; + die("Write error") if ($nb != $BLOCKSIZE); + + $outfile->flush(); + + close($outfile); +} + +sub get_uint16_from_page +{ + my ($block, $offset) = @_; + + return (unpack("S*", $block))[$offset]; +} + +sub set_uint16_to_page +{ + my ($block, $data, $offset) = @_; + + my $pack = pack("S", $data); + + # vec with 16B or more won't preserve endianness + vec($block, 2*$offset, 8) = (unpack('C*', $pack))[0]; + vec($block, (2*$offset) + 1, 8) = (unpack('C*', $pack))[1]; + + return $block; +} + +sub check_checksums_call +{ + my ($node, $relname) = @_; + + my ($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT COUNT(*)" + . " FROM pg_catalog.pg_check_relation('$relname')" + ); + + return ($stderr eq ''); +} + +sub check_checksums_nb_error +{ + my ($node, $nb, $pattern) = @_; + + my ($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT COUNT(*)" + . " FROM (SELECT pg_catalog.pg_check_relation(oid, 'main')" + . " FROM pg_class WHERE relkind in ('r', 'i', 'm')) AS s" + ); + + is($cmdret, 0, 'Function should run successfully'); + like($stderr, $pattern, 'Error output should match expectations'); + is($stdout, $nb, "Should have $nb error"); + + $TOTAL_NB_ERR += $nb; +} + +sub check_pg_stat_database_nb_error +{ + my ($node) = @_; + + my ($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT " + . " sum(checksum_failures)" + . 
" FROM pg_catalog.pg_stat_database" + ); + + is($cmdret, 0, 'Function should run successfully'); + is($stderr, '', 'Function should run successfully'); + is($stdout, $TOTAL_NB_ERR, "Should have $TOTAL_NB_ERR error"); +} + +sub get_checksums_errors +{ + my ($node, $nb, $pattern) = @_; + + my ($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT" + . " relid::regclass::text, forknum, failed_blocknum," + . " expected_checksum, found_checksum" + . " FROM (SELECT (pg_catalog.pg_check_relation(oid)).*" + . " FROM pg_class WHERE relkind in ('r','i', 'm')) AS s" + ); + + is($cmdret, '0', 'Function should run successfully'); + like($stderr, $pattern, 'Error output should match expectations'); + + $TOTAL_NB_ERR += $nb; + + return $stdout; +} + +# This function will perform various test by modifying the specified block at +# the specified uint16 offset, checking that the corruption is correctly +# detected, and finally restore the specified block to its original content. +sub corrupt_and_test_block +{ + my ($node, $filename, $blkno, $offset, $fake_data) = @_; + + check_checksums_nb_error($node, 0, qr/^$/); + + check_pg_stat_database_nb_error($node); + + $node->stop(); + + my $original_block = get_block($filename, 0); + my $original_data = get_uint16_from_page($original_block, $offset); + + isnt($original_data, $fake_data, + "The fake data at offset $offset should be different" + . " from the existing one"); + + my $new_block = set_uint16_to_page($original_block, $fake_data, $offset); + isnt($original_data, get_uint16_from_page($new_block, $offset), + "The fake data at offset $offset should have been changed in memory"); + + overwrite_block($filename, $new_block, 0); + + my $written_data = get_uint16_from_page(get_block($filename, 0), $offset); + isnt($original_data, $written_data, + "The data written at offset $offset should be different" + . 
" from the original one"); + is(get_uint16_from_page($new_block, $offset), $written_data, + "The data written at offset $offset should be the same" + . " as the one in memory"); + is($written_data, $fake_data, + "The data written at offset $offset should be the one" + . " we wanted to write"); + + $node->start(); + + check_checksums_nb_error($node, 1, qr/invalid page in block $blkno/); + + my $expected_checksum; + my $found_checksum = get_uint16_from_page($new_block, + $CHECKSUM_UINT16_OFFSET); + if ($offset == $PD_UPPER_UINT16_OFFSET) + { + # A checksum can't be computed if it's detected as PageIsNew(), so the + # function returns NULL for the computed checksum + $expected_checksum = ''; + } + else + { + $expected_checksum = get_uint16_from_page($original_block, + $CHECKSUM_UINT16_OFFSET); + } + + my $det = get_checksums_errors($node, 1, qr/invalid page in block $blkno/); + is($det, "t1|0|0|$expected_checksum|$found_checksum", + "The checksums error for modification at offset $offset" + . " should be detected"); + + $node->stop(); + + $new_block = set_uint16_to_page($original_block, $original_data, $offset); + is($original_data, get_uint16_from_page($new_block, $offset), + "The data at offset $offset should have been restored in memory"); + + overwrite_block($filename, $new_block, 0); + is($original_data, get_uint16_from_page(get_block($filename, $blkno), + $offset), + "The data at offset $offset should have been restored on disk"); + + $node->start(); + + check_checksums_nb_error($node, 0, qr/^$/); +} + +if (exists $ENV{MY_PG_REGRESS}) +{ + $ENV{PG_REGRESS} = $ENV{MY_PG_REGRESS}; +} + +my $node = get_new_node('main'); + +my %params; +$params{'extra'} = ['--data-checksums']; +$node->init(%params); + +$node->start(); + +$ENV{PGOPTIONS} = '--client-min-messages=WARNING'; + +my ($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT" + . 
" current_setting('data_checksums')"); + +is($stdout, 'on', 'Data checksums should be enabled'); + +($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT" + . " current_setting('block_size')"); + +$BLOCKSIZE = $stdout; + +$node->safe_psql( + 'postgres', q| + CREATE TABLE public.t1(id integer); + CREATE INDEX t1_id_idx ON public.t1 (id); + INSERT INTO public.t1 SELECT generate_series(1, 100); + CREATE VIEW public.v1 AS SELECT * FROM t1; + CREATE MATERIALIZED VIEW public.mv1 AS SELECT * FROM t1; + CREATE SEQUENCE public.s1; + CREATE UNLOGGED TABLE public.u_t1(id integer); + CREATE INDEX u_t1_id_idx ON public.u_t1 (id); + INSERT INTO public.u_t1 SELECT generate_series(1, 100); + CHECKPOINT; +|); + +# Check sane behavior on various objects type, including those that don't have +# a storage. +is(check_checksums_call($node, 't1'), '1', 'Can check a table'); +is(check_checksums_call($node, 't1_id_idx'), '1', 'Can check an index'); +is(check_checksums_call($node, 'v1'), '', 'Cannot check a view'); +is(check_checksums_call($node, 'mv1'), '1', 'Can check a materialized view'); +is(check_checksums_call($node, 's1'), '1', 'Can check a sequence'); +is(check_checksums_call($node, 'u_t1'), '1', 'Can check an unlogged table'); +is(check_checksums_call($node, 'u_t1_id_idx'), '1', 'Can check an unlogged index'); + +# get the underlying heap absolute path +($cmdret, $stdout, $stderr) = $node->psql('postgres', "SELECT" + . " current_setting('data_directory') || '/' || pg_relation_filepath('t1')" +); + +isnt($stdout, '', 'A relfilenode should be returned'); + +my $filename = $stdout; + +check_checksums_nb_error($node, 0, qr/^$/); + +check_pg_stat_database_nb_error($node); + +my $fake_uint16 = hex '0x0000'; + +# Test with a modified checksum. We use a zero checksum here as it's the only +# one that cannot exist on a checksummed page. We also don't have an easy way +# to compute what the checksum would be after a modification in a random place +# in the block. 
+corrupt_and_test_block($node, $filename, 0, $CHECKSUM_UINT16_OFFSET, + $fake_uint16); + +# Test corruption making the block looks like it's PageIsNew(). +corrupt_and_test_block($node, $filename, 0, $PD_UPPER_UINT16_OFFSET, + $fake_uint16); -- 2.20.1