diff --git a/doc/src/sgml/maintenance.sgml b/doc/src/sgml/maintenance.sgml index 8764e00..dcd9b78 100644 --- a/doc/src/sgml/maintenance.sgml +++ b/doc/src/sgml/maintenance.sgml @@ -628,6 +628,9 @@ HINT: Stop the postmaster and vacuum that database in single-user mode. Like transaction IDs, multixact IDs are implemented as a 32-bit counter and corresponding storage, all of which requires careful aging management, storage cleanup, and wraparound handling. + There is a separate storage area which holds the list of members in + each multixact, which also uses a 32-bit counter and which must also + be managed. @@ -655,8 +658,11 @@ HINT: Stop the postmaster and vacuum that database in single-user mode. As a safety device, a whole-table vacuum scan will occur for any table whose multixact-age is greater than - . - This will occur even if autovacuum is nominally disabled. + . Whole-table + vacuum scans will also occur progressively for all tables, starting with + those that have the oldest multixact-age, if the amount of used member + storage space exceeds 25% of the addressable storage space. + These will occur even if autovacuum is nominally disabled. diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 928f9fe..5ce841d 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -168,6 +168,11 @@ (MXOffsetToFlagsOffset(xid) + MULTIXACT_FLAGBYTES_PER_GROUP + \ ((xid) % MULTIXACT_MEMBERS_PER_MEMBERGROUP) * sizeof(TransactionId)) +/* Multixact members wraparound thresholds. 
*/ +#define MULTIXACT_MEMBER_SAFE_THRESHOLD (MaxMultiXactOffset / 4) +#define MULTIXACT_MEMBER_DANGER_THRESHOLD \ + (MaxMultiXactOffset - MaxMultiXactOffset / 4) + /* * Links to shared-memory data structures for MultiXact control @@ -199,6 +204,7 @@ typedef struct MultiXactStateData */ MultiXactId oldestMultiXactId; Oid oldestMultiXactDB; + MultiXactOffset oldestOffset; /* * This is what the previous checkpoint stored as the truncate position. @@ -949,14 +955,16 @@ GetNewMultiXactId(int nmembers, MultiXactOffset *offset) * against catastrophic data loss due to multixact wraparound. The basic * rules are: * - * If we're past multiVacLimit, start trying to force autovacuum cycles. + * If we're past multiVacLimit or the safe threshold for member storage space, + * start trying to force autovacuum cycles. * If we're past multiWarnLimit, start issuing warnings. * If we're past multiStopLimit, refuse to create new MultiXactIds. * * Note these are pretty much the same protections in GetNewTransactionId. 
*---------- */ - if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit)) + if (!MultiXactIdPrecedes(result, MultiXactState->multiVacLimit) || + (MultiXactState->nextOffset - MultiXactState->oldestOffset > MULTIXACT_MEMBER_SAFE_THRESHOLD)) { /* * For safety's sake, we release MultiXactGenLock while sending @@ -2477,6 +2485,7 @@ static void DetermineSafeOldestOffset(MultiXactId oldestMXact) { MultiXactOffset oldestOffset; + MultiXactOffset startOfOldestSegment; /* * Can't do this while initdb'ing or in the startup process while @@ -2495,12 +2504,14 @@ DetermineSafeOldestOffset(MultiXactId oldestMXact) */ oldestOffset = find_multixact_start(oldestMXact); /* move back to start of the corresponding segment */ - oldestOffset -= oldestOffset / MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT; + startOfOldestSegment = oldestOffset - + (oldestOffset % (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT)); LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); /* always leave one segment before the wraparound point */ - MultiXactState->offsetStopLimit = oldestOffset - + MultiXactState->offsetStopLimit = startOfOldestSegment - (MULTIXACT_MEMBERS_PER_PAGE * SLRU_PAGES_PER_SEGMENT); + MultiXactState->oldestOffset = oldestOffset; LWLockRelease(MultiXactGenLock); } @@ -2578,6 +2589,87 @@ find_multixact_start(MultiXactId multi) } /* + * Determine how many multixacts, and how many multixact members, currently + * exist. 
+ */ +static void +ReadMultiXactCounts(uint32 *multixacts, MultiXactOffset *members) +{ + MultiXactOffset nextOffset; + MultiXactOffset oldestOffset; + MultiXactId oldestMultiXactId; + MultiXactId nextMultiXactId; + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + nextOffset = MultiXactState->nextOffset; + oldestMultiXactId = MultiXactState->oldestMultiXactId; + nextMultiXactId = MultiXactState->nextMXact; + oldestOffset = MultiXactState->oldestOffset; + LWLockRelease(MultiXactGenLock); + + *members = nextOffset - oldestOffset; + *multixacts = nextMultiXactId - oldestMultiXactId; +} + +/* + * Multixact members can be removed once the multixacts that refer to them + * are older than every datminmxid. autovacuum_multixact_freeze_max_age and + * vacuum_multixact_freeze_table_age work together to make sure we never have + * too many multixacts; we hope that, at least under normal circumstances, + * this will also be sufficient to keep us from using too many offsets. + * However, if the average multixact has many members, we might exhaust the + * members space while still using few enough multixacts that these limits fail + * to trigger full table scans for relminmxid advancement. At that point, + * we'd have no choice but to start failing multixact-creating operations + * with an error. + * + * To prevent that, if more than a threshold portion of the members space is + * used, we effectively reduce autovacuum_multixact_freeze_max_age and + * vacuum_multixact_freeze_table_age to a value just less than the number of + * multixacts in use. We hope that this will quickly trigger autovacuuming on + * the table or tables with the oldest relminmxid, thus allowing datminmxid + * values to advance and removing some members. 
+ * + * As the fraction of the member space currently in use grows, we become + * more aggressive in clamping these values, so that autovacuum will begin + * working on more and more tables, and any manual vacuums the user issues + * will also become more aggressive. When it gets high enough, we assume + * the situation is desperate and treat both values as zero, essentially + * freezing every possible multixact. + * + * It's possible that these thresholds should be user-tunable, but for now + * we keep it simple. + */ +int +MultiXactMemberFreezeThreshold(void) +{ + MultiXactOffset members; + uint32 multixacts; + uint32 victim_multixacts; + double fraction; + + ReadMultiXactCounts(&multixacts, &members); + + /* If member space utilization is low, no special action is required. */ + if (members <= MULTIXACT_MEMBER_SAFE_THRESHOLD) + return -1; + + /* + * Compute a target for relminmxid advancement. The number of multixacts + * we try to eliminate from the system is based on how far we are past + * MULTIXACT_MEMBER_SAFE_THRESHOLD. + */ + fraction = (double) (members - MULTIXACT_MEMBER_SAFE_THRESHOLD) / + (MULTIXACT_MEMBER_DANGER_THRESHOLD - MULTIXACT_MEMBER_SAFE_THRESHOLD); + victim_multixacts = multixacts * fraction; + + /* fraction could be > 1.0, but lowest possible freeze age is zero */ + if (victim_multixacts > multixacts) + return 0; + return multixacts - victim_multixacts; +} + +/* * SlruScanDirectory callback. * This callback deletes segments that are outside the range determined by * the given page numbers. 
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 7ead161..0b5ab40 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -92,10 +92,7 @@ ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel) Assert((vacstmt->options & VACOPT_ANALYZE) || vacstmt->va_cols == NIL); Assert(!(vacstmt->options & VACOPT_SKIPTOAST)); - /* - * All freeze ages are zero if the FREEZE option is given; otherwise pass - * them as -1 which means to use the default values. - */ + /* All freeze ages are zero if the FREEZE option is given. */ if (vacstmt->options & VACOPT_FREEZE) { params.freeze_min_age = 0; @@ -105,10 +102,26 @@ ExecVacuum(VacuumStmt *vacstmt, bool isTopLevel) } else { + int multixact_member_max_age; + + /* Use defaults. */ params.freeze_min_age = -1; params.freeze_table_age = -1; params.multixact_freeze_min_age = -1; params.multixact_freeze_table_age = -1; + + /* + * Override the multixact freeze settings if we're using too much + * member address space. It's a bit of a kludge to do this here, + * but if we pushed it down into vacuum(), autovacuum workers would + * repeat this calculation for every table. 
+ */ + multixact_member_max_age = MultiXactMemberFreezeThreshold(); + if (multixact_member_max_age >= 0) + { + params.multixact_freeze_table_age = multixact_member_max_age; + params.multixact_freeze_min_age = multixact_member_max_age / 2; + } } /* user-invoked vacuum is never "for wraparound" */ diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index be4cd1d..db73886 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -297,10 +297,12 @@ static void do_autovacuum(void); static void FreeWorkerInfo(int code, Datum arg); static autovac_table *table_recheck_autovac(Oid relid, HTAB *table_toast_map, - TupleDesc pg_class_desc); + TupleDesc pg_class_desc, + int multixact_member_max_age); static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts, Form_pg_class classForm, PgStat_StatTabEntry *tabentry, + int multixact_member_max_age, bool *dovacuum, bool *doanalyze, bool *wraparound); static void autovacuum_do_vac_analyze(autovac_table *tab, @@ -1077,6 +1079,7 @@ do_start_worker(void) Oid retval = InvalidOid; MemoryContext tmpcxt, oldcxt; + int multixact_member_max_age; /* return quickly when there are no free workers */ LWLockAcquire(AutovacuumLock, LW_SHARED); @@ -1118,7 +1121,12 @@ do_start_worker(void) /* Also determine the oldest datminmxid we will consider. 
*/ recentMulti = ReadNextMultiXactId(); - multiForceLimit = recentMulti - autovacuum_multixact_freeze_max_age; + multixact_member_max_age = MultiXactMemberFreezeThreshold(); + if (multixact_member_max_age >= 0) + multiForceLimit = recentMulti - Min(autovacuum_freeze_max_age, + multixact_member_max_age); + else + multiForceLimit = recentMulti - autovacuum_multixact_freeze_max_age; if (multiForceLimit < FirstMultiXactId) multiForceLimit -= FirstMultiXactId; @@ -1881,6 +1889,7 @@ do_autovacuum(void) BufferAccessStrategy bstrategy; ScanKeyData key; TupleDesc pg_class_desc; + int multixact_member_max_age; /* * StartTransactionCommand and CommitTransactionCommand will automatically @@ -1959,6 +1968,12 @@ do_autovacuum(void) HASH_ELEM | HASH_BLOBS); /* + * Determine whether we need to freeze multixacts more aggressively due + * to high multixact member utilization. + */ + multixact_member_max_age = MultiXactMemberFreezeThreshold(); + + /* * Scan pg_class to determine which tables to vacuum. * * We do this in two passes: on the first one we collect the list of plain @@ -2001,6 +2016,7 @@ do_autovacuum(void) /* Check if it needs vacuum or analyze */ relation_needs_vacanalyze(relid, relopts, classForm, tabentry, + multixact_member_max_age, &dovacuum, &doanalyze, &wraparound); /* @@ -2129,6 +2145,7 @@ do_autovacuum(void) shared, dbentry); relation_needs_vacanalyze(relid, relopts, classForm, tabentry, + multixact_member_max_age, &dovacuum, &doanalyze, &wraparound); /* ignore analyze for toast tables */ @@ -2235,7 +2252,8 @@ do_autovacuum(void) * the race condition is not closed but it is very small. 
*/ MemoryContextSwitchTo(AutovacMemCxt); - tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc); + tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc, + multixact_member_max_age); if (tab == NULL) { /* someone else vacuumed the table, or it went away */ @@ -2442,7 +2460,8 @@ get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared, */ static autovac_table * table_recheck_autovac(Oid relid, HTAB *table_toast_map, - TupleDesc pg_class_desc) + TupleDesc pg_class_desc, + int multixact_member_max_age) { Form_pg_class classForm; HeapTuple classTup; @@ -2488,6 +2507,7 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, shared, dbentry); relation_needs_vacanalyze(relid, avopts, classForm, tabentry, + multixact_member_max_age, &dovacuum, &doanalyze, &wraparound); /* ignore ANALYZE for toast tables */ @@ -2550,6 +2570,18 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, ? avopts->multixact_freeze_table_age : default_multixact_freeze_table_age; + /* + * Override the multixact freeze settings if we're using too much + * member address space. + */ + if (multixact_member_max_age >= 0) + { + multixact_freeze_table_age = Min(multixact_member_max_age, + multixact_freeze_table_age); + multixact_freeze_min_age = Min(multixact_member_max_age / 2, + multixact_freeze_min_age); + } + tab = palloc(sizeof(autovac_table)); tab->at_relid = relid; tab->at_vacoptions = VACOPT_SKIPTOAST | @@ -2606,8 +2638,8 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map, * analyze. This is asymmetric to the VACUUM case. * * We also force vacuum if the table's relfrozenxid is more than freeze_max_age - * transactions back, and if its relminmxid is more than - * multixact_freeze_max_age multixacts back. + * transactions back, or if its relminmxid is more than + * multixact_freeze_max_age or multixact_member_max_age multixacts back. 
* * A table whose autovacuum_enabled option is false is * automatically skipped (unless we have to vacuum it due to freeze_max_age). @@ -2624,6 +2656,7 @@ relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts, Form_pg_class classForm, PgStat_StatTabEntry *tabentry, + int multixact_member_max_age, /* output params below */ bool *dovacuum, bool *doanalyze, @@ -2687,6 +2720,11 @@ relation_needs_vacanalyze(Oid relid, ? Min(relopts->multixact_freeze_max_age, autovacuum_multixact_freeze_max_age) : autovacuum_multixact_freeze_max_age; + /* Special settings if too much multixact member space is in use. */ + if (multixact_member_max_age >= 0) + multixact_freeze_max_age = Min(multixact_freeze_max_age, + multixact_member_max_age); + av_enabled = (relopts ? relopts->enabled : true); /* Force vacuum if table is at risk of wraparound */ diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 640b198..9353289 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -126,6 +126,7 @@ extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset); extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); extern void MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti); +extern int MultiXactMemberFreezeThreshold(void); extern void multixact_twophase_recover(TransactionId xid, uint16 info, void *recdata, uint32 len);