From 6089fa0107af4fcc8a2e94798d27a324de748607 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Mon, 6 Apr 2020 21:28:55 -0700 Subject: [PATCH v7 05/11] Change the way backends perform tuple-is-invisible-to-everyone tests. Instead of using RecentGlobal[Data]Xmin, the tests are now done via InvisibleToEveryone* APIs. A following commit will take advantage of that to make GetSnapshotData() more scalable. Note: This contains a workaround in heap_page_prune_opt() to keep the snapshot_too_old tests working. While that workaround is ugly, the tests currently are not meaningful, and it seems best to address them separately. --- src/include/access/ginblock.h | 4 +- src/include/access/heapam.h | 11 +- src/include/access/transam.h | 78 +- src/include/storage/bufpage.h | 6 - src/include/storage/proc.h | 8 - src/include/storage/procarray.h | 39 +- src/include/utils/snapmgr.h | 37 +- src/include/utils/snapshot.h | 6 + src/backend/access/gin/ginvacuum.c | 19 + src/backend/access/gist/gistutil.c | 8 +- src/backend/access/gist/gistxlog.c | 10 +- src/backend/access/heap/heapam.c | 12 +- src/backend/access/heap/heapam_handler.c | 24 +- src/backend/access/heap/heapam_visibility.c | 78 +- src/backend/access/heap/pruneheap.c | 199 +++- src/backend/access/heap/vacuumlazy.c | 24 +- src/backend/access/nbtree/README | 10 +- src/backend/access/nbtree/nbtpage.c | 4 +- src/backend/access/nbtree/nbtree.c | 28 +- src/backend/access/nbtree/nbtxlog.c | 10 +- src/backend/access/spgist/spgvacuum.c | 6 +- src/backend/access/transam/README | 96 +- src/backend/access/transam/varsup.c | 48 + src/backend/access/transam/xlog.c | 11 +- src/backend/commands/analyze.c | 2 +- src/backend/commands/vacuum.c | 37 +- src/backend/postmaster/autovacuum.c | 4 + src/backend/replication/logical/launcher.c | 6 + src/backend/replication/walreceiver.c | 17 +- src/backend/replication/walsender.c | 15 +- src/backend/storage/ipc/procarray.c | 949 ++++++++++++++++---- src/backend/utils/adt/selfuncs.c | 20 +- src/backend/utils/init/postinit.c | 4 + src/backend/utils/time/snapmgr.c | 252 +++--- contrib/amcheck/verify_nbtree.c | 4 +- contrib/pg_visibility/pg_visibility.c | 18 +- contrib/pgstattuple/pgstatapprox.c | 2 +- 37 files changed, 1498 insertions(+), 608 deletions(-) diff --git a/src/include/access/ginblock.h b/src/include/access/ginblock.h index 3f64fd572e3..fe66a95226b 100644 --- a/src/include/access/ginblock.h +++ b/src/include/access/ginblock.h @@ -12,6 +12,7 @@ #include "access/transam.h" #include "storage/block.h" +#include "storage/bufpage.h" #include "storage/itemptr.h" #include "storage/off.h" @@ -134,8 +135,7 @@ typedef struct GinMetaPageData */ #define GinPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid ) #define GinPageSetDeleteXid(page, xid) ( ((PageHeader) (page))->pd_prune_xid = xid) -#define GinPageIsRecyclable(page) ( PageIsNew(page) || (GinPageIsDeleted(page) \ && TransactionIdPrecedes(GinPageGetDeleteXid(page), RecentGlobalXmin))) +extern bool GinPageIsRecyclable(Page page); /* * We use our own ItemPointerGet(BlockNumber|OffsetNumber) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index f279edc4734..db9e0b48a08 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -172,9 +172,12 @@ extern TransactionId heap_compute_xid_horizon_for_tuples(Relation rel, int nitems); /* in heap/pruneheap.c */ +struct InvisibleToEveryoneState; extern void heap_page_prune_opt(Relation relation, Buffer buffer); extern int heap_page_prune(Relation relation, Buffer buffer, - TransactionId
OldestXmin, + struct InvisibleToEveryoneState *horizon, + TransactionId limited_oldest_xmin, + TimestampTz limited_oldest_ts, bool report_stats, TransactionId *latestRemovedXid); extern void heap_page_prune_execute(Buffer buffer, OffsetNumber *redirected, int nredirected, @@ -201,11 +204,15 @@ extern TM_Result HeapTupleSatisfiesUpdate(HeapTuple stup, CommandId curcid, Buffer buffer); extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple stup, TransactionId OldestXmin, Buffer buffer); +extern HTSV_Result HeapTupleSatisfiesVacuumHorizon(HeapTuple stup, Buffer buffer, + TransactionId *dead_after); extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid); extern bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple); extern bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); -extern bool HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin); +struct InvisibleToEveryoneState; +extern bool HeapTupleIsSurelyDead(struct InvisibleToEveryoneState *invstate, + HeapTuple htup); /* * To avoid leaking too much knowledge about reorderbuffer implementation diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 9a808f64ebe..924e5fa724e 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -54,6 +54,8 @@ #define FullTransactionIdFollowsOrEquals(a, b) ((a).value >= (b).value) #define FullTransactionIdIsValid(x) TransactionIdIsValid(XidFromFullTransactionId(x)) #define InvalidFullTransactionId FullTransactionIdFromEpochAndXid(0, InvalidTransactionId) +#define FirstNormalFullTransactionId FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId) +#define FullTransactionIdIsNormal(x) FullTransactionIdFollowsOrEquals(x, FirstNormalFullTransactionId) /* * A 64 bit value that contains an epoch and a TransactionId. This is @@ -102,6 +104,19 @@ FullTransactionIdAdvance(FullTransactionId *dest) dest->value++; } +/* retreat a FullTransactionId variable, stepping over special XIDs */ +static inline void +FullTransactionIdRetreat(FullTransactionId *dest) +{ + dest->value--; + + if (FullTransactionIdPrecedes(*dest, FirstNormalFullTransactionId)) + return; + + while (XidFromFullTransactionId(*dest) < FirstNormalTransactionId) + dest->value--; +} + /* back up a transaction ID variable, handling wraparound correctly */ #define TransactionIdRetreat(dest) \ do { \ @@ -193,8 +208,8 @@ typedef struct VariableCacheData /* * These fields are protected by ProcArrayLock. */ - TransactionId latestCompletedXid; /* newest XID that has committed or - * aborted */ + FullTransactionId latestCompletedFullXid; /* newest full XID that has + * committed or aborted */ /* * These fields are protected by CLogTruncationLock @@ -244,6 +259,12 @@ extern void AdvanceOldestClogXid(TransactionId oldest_datfrozenxid); extern bool ForceTransactionIdLimitUpdate(void); extern Oid GetNewObjectId(void); +#ifdef USE_ASSERT_CHECKING +extern void AssertTransactionIdMayBeOnDisk(TransactionId xid); +#else +#define AssertTransactionIdMayBeOnDisk(xid) ((void)true) +#endif + /* * Some frontend programs include this header. 
For compilers that emit static * inline functions even when they're unused, that leads to unsatisfied @@ -260,6 +281,59 @@ ReadNewTransactionId(void) return XidFromFullTransactionId(ReadNextFullTransactionId()); } +/* return transaction ID backed up by amount, handling wraparound correctly */ +static inline TransactionId +TransactionIdRetreatedBy(TransactionId xid, uint32 amount) +{ + xid -= amount; + + while (xid < FirstNormalTransactionId) + xid--; + + return xid; +} + +/* return the older of the two IDs */ +static inline TransactionId +TransactionIdOlder(TransactionId a, TransactionId b) +{ + if (!TransactionIdIsValid(a)) + return b; + + if (!TransactionIdIsValid(b)) + return a; + + if (TransactionIdPrecedes(a, b)) + return a; + return b; +} + +/* return the older of the two IDs, assuming they're both normal */ +static inline TransactionId +NormalTransactionIdOlder(TransactionId a, TransactionId b) +{ + Assert(TransactionIdIsNormal(a)); + Assert(TransactionIdIsNormal(b)); + if (NormalTransactionIdPrecedes(a, b)) + return a; + return b; +} + +/* return the newer of the two IDs */ +static inline FullTransactionId +FullTransactionIdNewer(FullTransactionId a, FullTransactionId b) +{ + if (!FullTransactionIdIsValid(a)) + return b; + + if (!FullTransactionIdIsValid(b)) + return a; + + if (FullTransactionIdFollows(a, b)) + return a; + return b; +} + #endif /* FRONTEND */ #endif /* TRANSAM_H */ diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index 3f88683a059..51b8f994ac0 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -389,12 +389,6 @@ PageValidateSpecialPointer(Page page) #define PageClearAllVisible(page) \ (((PageHeader) (page))->pd_flags &= ~PD_ALL_VISIBLE) -#define PageIsPrunable(page, oldestxmin) \ -( \ - AssertMacro(TransactionIdIsNormal(oldestxmin)), \ - TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) && \ - TransactionIdPrecedes(((PageHeader) (page))->pd_prune_xid, oldestxmin) \ -) #define PageSetPrunable(page, xid) \ do { \ Assert(TransactionIdIsNormal(xid)); \ diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index ae4f573ab46..23d12c1f72f 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -42,13 +42,6 @@ struct XidCache /* * Flags for PGXACT->vacuumFlags - * - * Note: If you modify these flags, you need to modify PROCARRAY_XXX flags - * in src/include/storage/procarray.h. - * - * PROC_RESERVED may later be assigned for use in vacuumFlags, but its value is - * used for PROCARRAY_SLOTS_XMIN in procarray.h, so GetOldestXmin won't be able - * to match and ignore processes with this flag set. */ #define PROC_IS_AUTOVACUUM 0x01 /* is it an autovac worker? */ #define PROC_IN_VACUUM 0x02 /* currently running lazy vacuum */ @@ -56,7 +49,6 @@ struct XidCache #define PROC_VACUUM_FOR_WRAPAROUND 0x08 /* set by autovac only */ #define PROC_IN_LOGICAL_DECODING 0x10 /* currently doing logical * decoding outside xact */ -#define PROC_RESERVED 0x20 /* reserved for procarray */ /* flags reset at EOXact */ #define PROC_VACUUM_STATE_MASK \ diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index a5c7d0c0644..0f3c151fdb2 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -20,41 +20,6 @@ #include "utils/snapshot.h" -/* - * These are to implement PROCARRAY_FLAGS_XXX - * - * Note: These flags are cloned from PROC_XXX flags in src/include/storage/proc.h - * to avoid forcing to include proc.h when including procarray.h. 
So if you modify - * PROC_XXX flags, you need to modify these flags. - */ -#define PROCARRAY_VACUUM_FLAG 0x02 /* currently running lazy - * vacuum */ -#define PROCARRAY_ANALYZE_FLAG 0x04 /* currently running - * analyze */ -#define PROCARRAY_LOGICAL_DECODING_FLAG 0x10 /* currently doing logical - * decoding outside xact */ - -#define PROCARRAY_SLOTS_XMIN 0x20 /* replication slot xmin, - * catalog_xmin */ -/* - * Only flags in PROCARRAY_PROC_FLAGS_MASK are considered when matching - * PGXACT->vacuumFlags. Other flags are used for different purposes and - * have no corresponding PROC flag equivalent. - */ -#define PROCARRAY_PROC_FLAGS_MASK (PROCARRAY_VACUUM_FLAG | \ - PROCARRAY_ANALYZE_FLAG | \ - PROCARRAY_LOGICAL_DECODING_FLAG) - -/* Use the following flags as an input "flags" to GetOldestXmin function */ -/* Consider all backends except for logical decoding ones which manage xmin separately */ -#define PROCARRAY_FLAGS_DEFAULT PROCARRAY_LOGICAL_DECODING_FLAG -/* Ignore vacuum backends */ -#define PROCARRAY_FLAGS_VACUUM PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG -/* Ignore analyze backends */ -#define PROCARRAY_FLAGS_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_ANALYZE_FLAG -/* Ignore both vacuum and analyze backends */ -#define PROCARRAY_FLAGS_VACUUM_ANALYZE PROCARRAY_FLAGS_DEFAULT | PROCARRAY_VACUUM_FLAG | PROCARRAY_ANALYZE_FLAG - extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); extern void ProcArrayAdd(PGPROC *proc); @@ -88,7 +53,9 @@ extern RunningTransactions GetRunningTransactionData(void); extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsActive(TransactionId xid); -extern TransactionId GetOldestXmin(Relation rel, int flags); +extern TransactionId GetOldestVisibleTransactionId(Relation rel); +extern void GetReplicationHorizons(TransactionId *slot_xmin, TransactionId *catalog_xmin); +extern TransactionId GetOldestTransactionIdConsideredRunning(void); extern TransactionId GetOldestActiveTransactionId(void); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index 7738d6a8e01..a47eb7406cf 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -52,13 +52,12 @@ extern Size SnapMgrShmemSize(void); extern void SnapMgrInit(void); extern TimestampTz GetSnapshotCurrentTimestamp(void); extern TimestampTz GetOldSnapshotThresholdTimestamp(void); +extern void SnapshotTooOldMagicForTest(void); extern bool FirstSnapshotSet; extern PGDLLIMPORT TransactionId TransactionXmin; extern PGDLLIMPORT TransactionId RecentXmin; -extern PGDLLIMPORT TransactionId RecentGlobalXmin; -extern PGDLLIMPORT TransactionId RecentGlobalDataXmin; /* Variables representing various special snapshot semantics */ extern PGDLLIMPORT SnapshotData SnapshotSelfData; @@ -78,11 +77,12 @@ extern PGDLLIMPORT SnapshotData CatalogSnapshotData; /* * Similarly, some initialization is required for a NonVacuumable snapshot. - * The caller must supply the xmin horizon to use (e.g., RecentGlobalXmin). + * The caller must supply the visibility cutoff state to use (c.f. + * InvisibleToEveryoneTestInit()). */ -#define InitNonVacuumableSnapshot(snapshotdata, xmin_horizon) \ +#define InitNonVacuumableSnapshot(snapshotdata, statep) \ ((snapshotdata).snapshot_type = SNAPSHOT_NON_VACUUMABLE, \ - (snapshotdata).xmin = (xmin_horizon)) + (snapshotdata).invstate = (statep)) /* * Similarly, some initialization is required for SnapshotToast. 
We need @@ -98,6 +98,10 @@ extern PGDLLIMPORT SnapshotData CatalogSnapshotData; ((snapshot)->snapshot_type == SNAPSHOT_MVCC || \ (snapshot)->snapshot_type == SNAPSHOT_HISTORIC_MVCC) +static inline bool OldSnapshotThresholdActive(void) +{ + return old_snapshot_threshold >= 0; +} extern Snapshot GetTransactionSnapshot(void); extern Snapshot GetLatestSnapshot(void); @@ -123,8 +127,6 @@ extern void UnregisterSnapshot(Snapshot snapshot); extern Snapshot RegisterSnapshotOnOwner(Snapshot snapshot, ResourceOwner owner); extern void UnregisterSnapshotFromOwner(Snapshot snapshot, ResourceOwner owner); -extern FullTransactionId GetFullRecentGlobalXmin(void); - extern void AtSubCommit_Snapshot(int level); extern void AtSubAbort_Snapshot(int level); extern void AtEOXact_Snapshot(bool isCommit, bool resetXmin); @@ -133,13 +135,30 @@ extern void ImportSnapshot(const char *idstr); extern bool XactHasExportedSnapshots(void); extern void DeleteAllExportedSnapshotFiles(void); extern bool ThereAreNoPriorRegisteredSnapshots(void); -extern TransactionId TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, - Relation relation); +extern bool TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, + Relation relation, + TransactionId *limit_xid, + TimestampTz *limit_ts); +extern void SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit); extern void MaintainOldSnapshotTimeMapping(TimestampTz whenTaken, TransactionId xmin); extern char *ExportSnapshot(Snapshot snapshot); +/* + * These live in procarray.c because they're intimately linked to the + * procarray contents, but thematically they better fit into snapmgr.h + */ +typedef struct InvisibleToEveryoneState InvisibleToEveryoneState; +extern InvisibleToEveryoneState *InvisibleToEveryoneTestInit(Relation rel); +extern bool InvisibleToEveryoneTestXid(InvisibleToEveryoneState *state, TransactionId xid); +extern bool InvisibleToEveryoneTestFullXid(InvisibleToEveryoneState *state, FullTransactionId fxid); +extern FullTransactionId InvisibleToEveryoneTestFullCutoff(InvisibleToEveryoneState *state); +extern TransactionId InvisibleToEveryoneTestCutoff(InvisibleToEveryoneState *state); +extern bool InvisibleToEveryoneCheckXid(Relation rel, TransactionId xid); +extern bool InvisibleToEveryoneCheckFullXid(Relation rel, FullTransactionId fxid); + + /* * Utility functions for implementing visibility routines in table AMs. */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 4796edb63aa..2bc415376ac 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -192,6 +192,12 @@ typedef struct SnapshotData */ uint32 speculativeToken; + /* + * For SNAPSHOT_NON_VACUUMABLE (and hopefully more in the future) this + * contains the visibility cutoff state. 
+ */ + struct InvisibleToEveryoneState *invstate; + /* * Book-keeping information, used by the snapshot manager */ diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 8ae4fd95a7b..1b0e04ee0fa 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -793,3 +793,22 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) return stats; } + +bool +GinPageIsRecyclable(Page page) +{ + TransactionId delete_xid; + + if (PageIsNew(page)) + return true; + + if (!GinPageIsDeleted(page)) + return false; + + delete_xid = GinPageGetDeleteXid(page); + + if (!TransactionIdIsValid(delete_xid)) + return true; + + return InvisibleToEveryoneCheckXid(NULL, delete_xid); +} diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index 765329bbcd4..195491e2766 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -891,15 +891,13 @@ gistPageRecyclable(Page page) * As long as that can happen, we must keep the deleted page around as * a tombstone. * - * Compare the deletion XID with RecentGlobalXmin. If deleteXid < - * RecentGlobalXmin, then no scan that's still in progress could have + * For that, check if the deletion XID could still be visible to + * anyone. If not, then no scan that's still in progress could have * seen its downlink, and we can recycle it. */ FullTransactionId deletexid_full = GistPageGetDeleteXid(page); - FullTransactionId recentxmin_full = GetFullRecentGlobalXmin(); - if (FullTransactionIdPrecedes(deletexid_full, recentxmin_full)) - return true; + return InvisibleToEveryoneCheckFullXid(NULL, deletexid_full); } return false; } diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index b60dba052fa..66ddbaa5c4a 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -387,11 +387,11 @@ gistRedoPageReuse(XLogReaderState *record) * PAGE_REUSE records exist to provide a conflict point when we reuse * pages in the index via the FSM. That's all they do though. * - * latestRemovedXid was the page's deleteXid. The deleteXid < - * RecentGlobalXmin test in gistPageRecyclable() conceptually mirrors the - * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs(). - * Consequently, one XID value achieves the same exclusion effect on - * master and standby. + * latestRemovedXid was the page's deleteXid. The + * InvisibleToEveryoneCheckFullXid(deleteXid) test in gistPageRecyclable() + * conceptually mirrors the pgxact->xmin > limitXmin test in + * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the + * same exclusion effect on master and standby. */ if (InHotStandby) { diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 0af51880ccc..f7caae2c081 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -1537,6 +1537,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, bool at_chain_start; bool valid; bool skip; + InvisibleToEveryoneState *invstate = NULL; /* If this is not the first call, previous call returned a (live!) tuple */ if (all_dead) @@ -1636,9 +1637,14 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * Note: if you change the criterion here for what is "dead", fix the * planner's get_actual_variable_range() function to match.
*/ - if (all_dead && *all_dead && - !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin)) - *all_dead = false; + if (all_dead && *all_dead) + { + if (!invstate) + invstate = InvisibleToEveryoneTestInit(relation); + + if (!HeapTupleIsSurelyDead(invstate, heapTuple)) + *all_dead = false; + } /* * Check to see if HOT chain continues past this tuple; if so fetch diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index 56b35622f1a..854176a0e2f 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -1201,7 +1201,7 @@ heapam_index_build_range_scan(Relation heapRelation, /* okay to ignore lazy VACUUMs here */ if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent) - OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM); + OldestXmin = GetOldestVisibleTransactionId(heapRelation); if (!scan) { @@ -1242,6 +1242,17 @@ heapam_index_build_range_scan(Relation heapRelation, hscan = (HeapScanDesc) scan; + /* + * Must have called GetOldestVisibleTransactionId() if using SnapshotAny. + * Shouldn't have for an MVCC snapshot. (It's especially worth checking + * this for parallel builds, since ambuild routines that support parallel + * builds must work these details out for themselves.) + */ + Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); + Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : + !TransactionIdIsValid(OldestXmin)); + Assert(snapshot == SnapshotAny || !anyvisible); + /* Publish number of blocks to scan */ if (progress) { @@ -1261,17 +1272,6 @@ heapam_index_build_range_scan(Relation heapRelation, nblocks); } - /* - * Must call GetOldestXmin() with SnapshotAny. Should never call - * GetOldestXmin() with MVCC snapshot. (It's especially worth checking - * this for parallel builds, since ambuild routines that support parallel - * builds must work these details out for themselves.) - */ - Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot)); - Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) : - !TransactionIdIsValid(OldestXmin)); - Assert(snapshot == SnapshotAny || !anyvisible); - /* set our scan endpoints */ if (!allow_sync) heap_setscanlimits(scan, start_blockno, numblocks); diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index dba10890aab..793a8036331 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -1154,19 +1154,55 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, * we mainly want to know is if a tuple is potentially visible to *any* * running transaction. If so, it can't be removed yet by VACUUM. * - * OldestXmin is a cutoff XID (obtained from GetOldestXmin()). Tuples - * deleted by XIDs >= OldestXmin are deemed "recently dead"; they might - * still be visible to some open transaction, so we can't remove them, - * even if we see that the deleting transaction has committed. + * OldestXmin is a cutoff XID (obtained from GetOldestVisibleTransactionId()). + * Tuples deleted by XIDs >= OldestXmin are deemed "recently dead"; they might + * still be visible to some open transaction, so we can't remove them, even if + * we see that the deleting transaction has committed. 
*/ HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer) +{ + TransactionId dead_after = InvalidTransactionId; + HTSV_Result res; + + res = HeapTupleSatisfiesVacuumHorizon(htup, buffer, &dead_after); + + if (res == HEAPTUPLE_RECENTLY_DEAD) + { + Assert(TransactionIdIsValid(dead_after)); + + if (TransactionIdPrecedes(dead_after, OldestXmin)) + res = HEAPTUPLE_DEAD; + } + else + Assert(!TransactionIdIsValid(dead_after)); + + return res; +} + +/* + * Workhorse for HeapTupleSatisfiesVacuum and similar routines. + * + * In contrast to HeapTupleSatisfiesVacuum this routine, when encountering a + * tuple that could still be visible to some backend, stores the xid that + * needs to be compared with the horizon in *dead_after, and returns + * HEAPTUPLE_RECENTLY_DEAD. The caller can then perform the comparison with + * the horizon. This is useful, e.g., when comparing with different horizons. + * + * Note: HEAPTUPLE_DEAD can still be returned here, e.g. if the inserting + * transaction aborted. + */ +HTSV_Result +HeapTupleSatisfiesVacuumHorizon(HeapTuple htup, Buffer buffer, TransactionId *dead_after) { HeapTupleHeader tuple = htup->t_data; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); + Assert(dead_after != NULL); + + *dead_after = InvalidTransactionId; /* * Has inserting transaction committed? @@ -1323,17 +1359,15 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, else if (TransactionIdDidCommit(xmax)) { /* - * The multixact might still be running due to lockers. If the - * updater is below the xid horizon, we have to return DEAD - * regardless -- otherwise we could end up with a tuple where the - * updater has to be removed due to the horizon, but is not pruned - * away. It's not a problem to prune that tuple, because any - * remaining lockers will also be present in newer tuple versions. + * The multixact might still be running due to lockers. Need to + * allow for pruning if below the xid horizon regardless -- + * otherwise we could end up with a tuple where the updater has to + * be removed due to the horizon, but is not pruned away. It's + * not a problem to prune that tuple, because any remaining + * lockers will also be present in newer tuple versions. */ - if (!TransactionIdPrecedes(xmax, OldestXmin)) - return HEAPTUPLE_RECENTLY_DEAD; - - return HEAPTUPLE_DEAD; + *dead_after = xmax; + return HEAPTUPLE_RECENTLY_DEAD; } else if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) { @@ -1372,14 +1406,11 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, } /* - * Deleter committed, but perhaps it was recent enough that some open - * transactions could still see the tuple. + * Deleter committed; allow the caller to check if it was recent enough + * that some open transactions could still see the tuple. */ - if (!TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin)) - return HEAPTUPLE_RECENTLY_DEAD; - - /* Otherwise, it's dead and removable */ - return HEAPTUPLE_DEAD; + *dead_after = HeapTupleHeaderGetRawXmax(tuple); + return HEAPTUPLE_RECENTLY_DEAD; } @@ -1418,7 +1449,8 @@ HeapTupleSatisfiesNonVacuumable(HeapTuple htup, Snapshot snapshot, * if the tuple is removable.
*/ bool -HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin) +HeapTupleIsSurelyDead(InvisibleToEveryoneState *invstate, + HeapTuple htup) { HeapTupleHeader tuple = htup->t_data; @@ -1459,7 +1491,7 @@ HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin) return false; /* Deleter committed, so tuple is dead if the XID is old enough. */ - return TransactionIdPrecedes(HeapTupleHeaderGetRawXmax(tuple), OldestXmin); + return InvisibleToEveryoneTestXid(invstate, HeapTupleHeaderGetRawXmax(tuple)); } /* diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 1794cfd8d9a..e36ca648cef 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -23,12 +23,24 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "utils/snapmgr.h" #include "utils/rel.h" #include "utils/snapmgr.h" /* Working data for heap_page_prune and subroutines */ typedef struct { + Relation rel; + + /* + * State related to determining whether a dead tuple is still needed. + */ + InvisibleToEveryoneState *vistest; + TimestampTz limited_oldest_ts; + TransactionId limited_oldest_xmin; + /* have we made a removal decision based on old_snapshot_threshold */ + bool limited_oldest_committed; + TransactionId new_prune_xid; /* new prune hint value for page */ TransactionId latestRemovedXid; /* latest xid to be removed by this prune */ int nredirected; /* numbers of entries in arrays below */ @@ -43,9 +55,8 @@ typedef struct } PruneState; /* Local functions */ -static int heap_prune_chain(Relation relation, Buffer buffer, +static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, - TransactionId OldestXmin, PruneState *prstate); static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); static void heap_prune_record_redirect(PruneState *prstate, @@ -65,16 +76,16 @@ static void heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum); * if there's not any use in pruning. * * Caller must have pin on the buffer, and must *not* have a lock on it. - * - * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD - * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). */ void heap_page_prune_opt(Relation relation, Buffer buffer) { Page page = BufferGetPage(buffer); + TransactionId prune_xid; + InvisibleToEveryoneState *vistest; + TransactionId limited_xmin = InvalidTransactionId; + TimestampTz limited_ts = 0; Size minfree; - TransactionId OldestXmin; /* * We can't write WAL in recovery mode, so there's no point trying to @@ -85,37 +96,53 @@ heap_page_prune_opt(Relation relation, Buffer buffer) return; /* - * Use the appropriate xmin horizon for this relation. If it's a proper - * catalog relation or a user defined, additional, catalog relation, we - * need to use the horizon that includes slots, otherwise the data-only - * horizon can be used. Note that the toast relation of user defined - * relations are *not* considered catalog relations. + * XXX: pointless call to make old_snapshot_threshold tests work. They're + * broken, and discussion of what to do about them is ongoing. + */ + if (old_snapshot_threshold == 0) + SnapshotTooOldMagicForTest(); + + /* + * First check whether there's any chance there's something to prune; + * determining the appropriate horizon is a waste if there's no prune_xid + * (i.e. no updates/deletes left potentially dead tuples around).
*/ + prune_xid = ((PageHeader) page)->pd_prune_xid; + if (!TransactionIdIsValid(prune_xid)) + return; + + /* + * Check whether prune_xid indicates that there may be dead rows that can + * be cleaned up. * - * It is OK to apply the old snapshot limit before acquiring the cleanup + * It is OK to check the old snapshot limit before acquiring the cleanup * lock because the worst that can happen is that we are not quite as * aggressive about the cleanup (by however many transaction IDs are * consumed between this point and acquiring the lock). This allows us to * save significant overhead in the case where the page is found not to be * prunable. - */ - if (IsCatalogRelation(relation) || - RelationIsAccessibleInLogicalDecoding(relation)) - OldestXmin = RecentGlobalXmin; - else - OldestXmin = - TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin, - relation); - - Assert(TransactionIdIsValid(OldestXmin)); - - /* - * Let's see if we really need pruning. * - * Forget it if page is not hinted to contain something prunable that's - * older than OldestXmin. + * Even if old_snapshot_threshold is set, we first check whether the page + * can be pruned without it, both because + * TransactionIdLimitedForOldSnapshots() is not cheap, and because not + * unnecessarily relying on old_snapshot_threshold avoids causing + * conflicts. */ - if (!PageIsPrunable(page, OldestXmin)) - return; + vistest = InvisibleToEveryoneTestInit(relation); + + if (!InvisibleToEveryoneTestXid(vistest, prune_xid)) + { + if (!OldSnapshotThresholdActive()) + return; + + if (!TransactionIdLimitedForOldSnapshots(InvisibleToEveryoneTestCutoff(vistest), + relation, + &limited_xmin, &limited_ts)) + return; + + if (!TransactionIdPrecedes(prune_xid, limited_xmin)) + return; + } /* * We prune when a previous UPDATE failed to find enough space on the page @@ -151,7 +178,9 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * needed */ /* OK to prune */ - (void) heap_page_prune(relation, buffer, OldestXmin, true, &ignore); + (void) heap_page_prune(relation, buffer, vistest, + limited_xmin, limited_ts, + true, &ignore); } /* And release buffer lock */ @@ -165,8 +194,11 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * * Caller must have pin and buffer cleanup lock on the page. * - * OldestXmin is the cutoff XID used to distinguish whether tuples are DEAD - * or RECENTLY_DEAD (see HeapTupleSatisfiesVacuum). + * vistest is used to distinguish whether tuples are DEAD or RECENTLY_DEAD + * (see heap_prune_satisfies_vacuum and + * HeapTupleSatisfiesVacuum). limited_oldest_xmin / limited_oldest_ts must + * either have been set by TransactionIdLimitedForOldSnapshots, or be + * InvalidTransactionId/0, respectively. * * If report_stats is true then we send the number of reclaimed heap-only * tuples to pgstats. (This must be false during vacuum, since vacuum will @@ -177,7 +209,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * latestRemovedXid. */ int -heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, +heap_page_prune(Relation relation, Buffer buffer, + InvisibleToEveryoneState *vistest, + TransactionId limited_oldest_xmin, + TimestampTz limited_oldest_ts, bool report_stats, TransactionId *latestRemovedXid) { int ndeleted = 0; @@ -198,6 +233,11 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, * initialize the rest of our working state.
*/ prstate.new_prune_xid = InvalidTransactionId; + prstate.rel = relation; + prstate.vistest = vistest; + prstate.limited_oldest_xmin = limited_oldest_xmin; + prstate.limited_oldest_ts = limited_oldest_ts; + prstate.limited_oldest_committed = false; prstate.latestRemovedXid = *latestRemovedXid; prstate.nredirected = prstate.ndead = prstate.nunused = 0; memset(prstate.marked, 0, sizeof(prstate.marked)); @@ -220,9 +260,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, continue; /* Process this item or chain of items */ - ndeleted += heap_prune_chain(relation, buffer, offnum, - OldestXmin, - &prstate); + ndeleted += heap_prune_chain(buffer, offnum, &prstate); } /* Any error while applying the changes is critical */ @@ -323,6 +361,85 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, } +/* + * Perform visibility checks for heap pruning. + * + * This is more complicated than just calling InvisibleToEveryoneTestXid() + * because of old_snapshot_threshold. We only want to increase the threshold + * that triggers errors for old snapshots when we actually decide to remove a + * row based on the limited horizon. + * + * Due to its cost, we also only want to call + * TransactionIdLimitedForOldSnapshots() if necessary, i.e. we might not have + * done so in heap_page_prune_opt() if pd_prune_xid was old enough. But we + * still want to be able to remove rows that are too new to be removed + * according to prstate->vistest, but that can be removed based on + * old_snapshot_threshold. So we call TransactionIdLimitedForOldSnapshots() on + * demand in here, if appropriate. + */ +static HTSV_Result +heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) +{ + HTSV_Result res; + TransactionId dead_after; + + res = HeapTupleSatisfiesVacuumHorizon(tup, buffer, &dead_after); + + if (res != HEAPTUPLE_RECENTLY_DEAD) + return res; + + /* + * If we are already relying on the limited xmin, there is no need to + * delay doing so anymore. + */ + if (prstate->limited_oldest_committed) + { + Assert(TransactionIdIsValid(prstate->limited_oldest_xmin)); + + if (TransactionIdPrecedes(dead_after, prstate->limited_oldest_xmin)) + res = HEAPTUPLE_DEAD; + return res; + } + + /* + * First check if InvisibleToEveryoneTestXid() is sufficient to find the + * row dead. If not, and old_snapshot_threshold is enabled, try to use the + * lowered horizon. + */ + if (InvisibleToEveryoneTestXid(prstate->vistest, dead_after)) + res = HEAPTUPLE_DEAD; + else if (OldSnapshotThresholdActive()) + { + /* haven't determined the limited horizon yet, do so now */ + if (!TransactionIdIsValid(prstate->limited_oldest_xmin)) + { + TransactionId horizon = + InvisibleToEveryoneTestCutoff(prstate->vistest); + + TransactionIdLimitedForOldSnapshots(horizon, prstate->rel, + &prstate->limited_oldest_xmin, + &prstate->limited_oldest_ts); + } + + if (TransactionIdIsValid(prstate->limited_oldest_xmin) && + TransactionIdPrecedes(dead_after, prstate->limited_oldest_xmin)) + { + /* + * About to remove a row based on snapshot_too_old. Need to raise + * the threshold so that problematic accesses will error. + */ + Assert(!prstate->limited_oldest_committed); + SetOldSnapshotThresholdTimestamp(prstate->limited_oldest_ts, + prstate->limited_oldest_xmin); + prstate->limited_oldest_committed = true; + res = HEAPTUPLE_DEAD; + } + } + + return res; +} + + /* * Prune specified line pointer or a HOT chain originating at line pointer.
* @@ -349,9 +466,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin, * Returns the number of tuples (to be) deleted from the page. */ static int -heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, - TransactionId OldestXmin, - PruneState *prstate) +heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) { int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); @@ -366,7 +481,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, i; HeapTupleData tup; - tup.t_tableOid = RelationGetRelid(relation); + tup.t_tableOid = RelationGetRelid(prstate->rel); rootlp = PageGetItemId(dp, rootoffnum); @@ -401,7 +516,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. */ - if (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer) + if (heap_prune_satisfies_vacuum(prstate, &tup, buffer) == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); @@ -485,7 +600,7 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum, */ tupdead = recent_dead = false; - switch (HeapTupleSatisfiesVacuum(&tup, OldestXmin, buffer)) + switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer)) { case HEAPTUPLE_DEAD: tupdead = true; diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index f3382d37a40..5799795b877 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -780,6 +780,7 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, PROGRESS_VACUUM_MAX_DEAD_TUPLES }; int64 initprog_val[3]; + InvisibleToEveryoneState *vistest; pg_rusage_init(&ru0); @@ -808,6 +809,8 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, vacrelstats->nonempty_pages = 0; vacrelstats->latestRemovedXid = InvalidTransactionId; + vistest = InvisibleToEveryoneTestInit(onerel); + /* * Initialize the state for a parallel vacuum. As of now, only one worker * can be used for an index, so we invoke parallelism only if there are at @@ -1231,7 +1234,8 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, * * We count tuples removed by the pruning step as removed by VACUUM. */ - tups_vacuumed += heap_page_prune(onerel, buf, OldestXmin, false, + tups_vacuumed += heap_page_prune(onerel, buf, vistest, + InvalidTransactionId, 0, false, &vacrelstats->latestRemovedXid); /* @@ -1588,14 +1592,16 @@ lazy_scan_heap(Relation onerel, VacuumParams *params, LVRelStats *vacrelstats, } /* - * It's possible for the value returned by GetOldestXmin() to move - * backwards, so it's not wrong for us to see tuples that appear to - * not be visible to everyone yet, while PD_ALL_VISIBLE is already - * set. The real safe xmin value never moves backwards, but - * GetOldestXmin() is conservative and sometimes returns a value - * that's unnecessarily small, so if we see that contradiction it just - * means that the tuples that we think are not visible to everyone yet - * actually are, and the PD_ALL_VISIBLE flag is correct. + * It's possible for the value returned by + * GetOldestVisibleTransactionId() to move backwards, so it's not + * wrong for us to see tuples that appear to not be visible to + * everyone yet, while PD_ALL_VISIBLE is already set.
The real safe + * xmin value never moves backwards, but + * GetOldestVisibleTransactionId() is conservative and sometimes + * returns a value that's unnecessarily small, so if we see that + * contradiction it just means that the tuples that we think are not + * visible to everyone yet actually are, and the PD_ALL_VISIBLE flag + * is correct. * * There should never be dead tuples on a page with PD_ALL_VISIBLE * set, however. diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 2d0f8f4b79a..46adc5ee9a2 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -336,9 +336,9 @@ snapshots and registered snapshots as of the deletion are gone; which is overly strong, but is simple to implement within Postgres. When marked dead, a deleted page is labeled with the next-transaction counter value. VACUUM can reclaim the page for re-use when this transaction number is -older than RecentGlobalXmin. As collateral damage, this implementation -also waits for running XIDs with no snapshots and for snapshots taken -until the next transaction to allocate an XID commits. +guaranteed to be "invisible to everyone". As collateral damage, this +implementation also waits for running XIDs with no snapshots and for +snapshots taken until the next transaction to allocate an XID commits. Reclaiming a page doesn't actually change its state on disk --- we simply record it in the shared-memory free space map, from which it will be @@ -405,8 +405,8 @@ page and also the correct place to hold the current value. We can avoid the cost of walking down the tree in such common cases. The optimization works on the assumption that there can only be one -non-ignorable leaf rightmost page, and so even a RecentGlobalXmin style -interlock isn't required. We cannot fail to detect that our hint was +non-ignorable leaf rightmost page, and so not even an invisible-to-everyone +style interlock is required. We cannot fail to detect that our hint was invalidated, because there can only be one such page in the B-Tree at any time. It's possible that the page will be deleted and recycled without a backend's cached page also being detected as invalidated, but diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 39b8f17f4b5..6e5ee3b443e 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -983,7 +983,7 @@ _bt_page_recyclable(Page page) */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISDELETED(opaque) && - TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin)) + InvisibleToEveryoneCheckXid(NULL, opaque->btpo.xact)) return true; return false; } @@ -2186,7 +2186,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, bool *rightsib_empty) * updated links to the target, ReadNewTransactionId() suffices as an * upper bound. Any scan having retained a now-stale link is advertising * in its PGXACT an xmin less than or equal to the value we read here. It - * will continue to do so, holding back RecentGlobalXmin, for the duration + * will continue to do so, holding back the xmin horizon, for the duration
*/ page = BufferGetPage(buf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 36294789f3f..fc81d719093 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -802,6 +802,12 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) metapg = BufferGetPage(metabuf); metad = BTPageGetMeta(metapg); + /* + * XXX: If IndexVacuumInfo contained the heap relation, we could be more + * aggressive about vacuuming non catalog relations by passing the table + * to InvisibleToEveryoneCheckXid(). + */ + if (metad->btm_version < BTREE_NOVAC_VERSION) { /* @@ -811,12 +817,11 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info) result = true; } else if (TransactionIdIsValid(metad->btm_oldest_btpo_xact) && - TransactionIdPrecedes(metad->btm_oldest_btpo_xact, - RecentGlobalXmin)) + InvisibleToEveryoneCheckXid(NULL, metad->btm_oldest_btpo_xact)) { /* - * If oldest btpo.xact in the deleted pages is older than - * RecentGlobalXmin, then at least one deleted page can be recycled. + * If oldest btpo.xact in the deleted pages is invisible, then at + * least one deleted page can be recycled. */ result = true; } @@ -1227,14 +1232,13 @@ restart: * own conflict now.) * * Backends with snapshots acquired after a VACUUM starts but - * before it finishes could have a RecentGlobalXmin with a - * later xid than the VACUUM's OldestXmin cutoff. These - * backends might happen to opportunistically mark some index - * tuples LP_DEAD before we reach them, even though they may - * be after our cutoff. We don't try to kill these "extra" - * index tuples in _bt_delitems_vacuum(). This keep things - * simple, and allows us to always avoid generating our own - * conflicts. + * before it finishes could have visibility cutoff with a + * later xid than VACUUM's OldestXmin cutoff. These backends + * might happen to opportunistically mark some index tuples + * LP_DEAD before we reach them, even though they may be after + * our cutoff. We don't try to kill these "extra" index + * tuples in _bt_delitems_vacuum(). This keep things simple, + * and allows us to always avoid generating our own conflicts. */ Assert(!BTreeTupleIsPivot(itup)); if (!BTreeTupleIsPosting(itup)) diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index 99d0914e724..431d7c3d709 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -926,11 +926,11 @@ btree_xlog_reuse_page(XLogReaderState *record) * Btree reuse_page records exist to provide a conflict point when we * reuse pages in the index via the FSM. That's all they do though. * - * latestRemovedXid was the page's btpo.xact. The btpo.xact < - * RecentGlobalXmin test in _bt_page_recyclable() conceptually mirrors the - * pgxact->xmin > limitXmin test in GetConflictingVirtualXIDs(). - * Consequently, one XID value achieves the same exclusion effect on - * master and standby. + * latestRemovedXid was the page's btpo.xact. The + * InvisibleToEveryoneCheckXid test in _bt_page_recyclable() conceptually + * mirrors the pgxact->xmin > limitXmin test in + * GetConflictingVirtualXIDs(). Consequently, one XID value achieves the + * same exclusion effect on master and standby. 
*/ if (InHotStandby) { diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index bd98707f3c0..0414382f34e 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -501,10 +501,14 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) OffsetNumber itemToPlaceholder[MaxIndexTuplesPerPage]; OffsetNumber itemnos[MaxIndexTuplesPerPage]; spgxlogVacuumRedirect xlrec; + InvisibleToEveryoneState *invstate; xlrec.nToPlaceholder = 0; xlrec.newestRedirectXid = InvalidTransactionId; + /* XXX: providing heap relation would allow more pruning */ + invstate = InvisibleToEveryoneTestInit(NULL); + START_CRIT_SECTION(); /* @@ -521,7 +525,7 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); if (dt->tupstate == SPGIST_REDIRECT && - TransactionIdPrecedes(dt->xid, RecentGlobalXmin)) + InvisibleToEveryoneTestXid(invstate, dt->xid)) { dt->tupstate = SPGIST_PLACEHOLDER; Assert(opaque->nRedirection > 0); diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index eb9aac5fd39..be805a5660b 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -257,31 +257,31 @@ simultaneously, we have one backend take ProcArrayLock and clear the XIDs of multiple processes at once.) ProcArrayEndTransaction also holds the lock while advancing the shared -latestCompletedXid variable. This allows GetSnapshotData to use -latestCompletedXid + 1 as xmax for its snapshot: there can be no +latestCompletedFullXid variable. This allows GetSnapshotData to use +latestCompletedFullXid + 1 as xmax for its snapshot: there can be no transaction >= this xid value that the snapshot needs to consider as completed. In short, then, the rule is that no transaction may exit the set of -currently-running transactions between the time we fetch latestCompletedXid +currently-running transactions between the time we fetch latestCompletedFullXid and the time we finish building our snapshot. However, this restriction only applies to transactions that have an XID --- read-only transactions can end without acquiring ProcArrayLock, since they don't affect anyone -else's snapshot nor latestCompletedXid. +else's snapshot nor latestCompletedFullXid. Transaction start, per se, doesn't have any interlocking with these considerations, since we no longer assign an XID immediately at transaction start. But when we do decide to allocate an XID, GetNewTransactionId must store the new XID into the shared ProcArray before releasing XidGenLock. -This ensures that all top-level XIDs <= latestCompletedXid are either +This ensures that all top-level XIDs <= latestCompletedFullXid are either present in the ProcArray, or not running anymore. (This guarantee doesn't apply to subtransaction XIDs, because of the possibility that there's not room for them in the subxid array; instead we guarantee that they are present or the overflow flag is set.) If a backend released XidGenLock before storing its XID into MyPgXact, then it would be possible for another -backend to allocate and commit a later XID, causing latestCompletedXid to +backend to allocate and commit a later XID, causing latestCompletedFullXid to pass the first backend's XID, before that value became visible in the -ProcArray. That would break GetOldestXmin, as discussed below. +ProcArray. That would break ComputeTransactionHorizons, as discussed below. 
We allow GetNewTransactionId to store the XID into MyPgXact->xid (or the subxid array) without taking ProcArrayLock. This was once necessary to @@ -293,42 +293,54 @@ once, rather than assume they can read it multiple times and get the same answer each time. (Use volatile-qualified pointers when doing this, to ensure that the C compiler does exactly what you tell it to.) -Another important activity that uses the shared ProcArray is GetOldestXmin, -which must determine a lower bound for the oldest xmin of any active MVCC -snapshot, system-wide. Each individual backend advertises the smallest -xmin of its own snapshots in MyPgXact->xmin, or zero if it currently has no -live snapshots (eg, if it's between transactions or hasn't yet set a -snapshot for a new transaction). GetOldestXmin takes the MIN() of the -valid xmin fields. It does this with only shared lock on ProcArrayLock, -which means there is a potential race condition against other backends -doing GetSnapshotData concurrently: we must be certain that a concurrent -backend that is about to set its xmin does not compute an xmin less than -what GetOldestXmin returns. We ensure that by including all the active -XIDs into the MIN() calculation, along with the valid xmins. The rule that -transactions can't exit without taking exclusive ProcArrayLock ensures that -concurrent holders of shared ProcArrayLock will compute the same minimum of -currently-active XIDs: no xact, in particular not the oldest, can exit -while we hold shared ProcArrayLock. So GetOldestXmin's view of the minimum -active XID will be the same as that of any concurrent GetSnapshotData, and -so it can't produce an overestimate. If there is no active transaction at -all, GetOldestXmin returns latestCompletedXid + 1, which is a lower bound -for the xmin that might be computed by concurrent or later GetSnapshotData -calls. (We know that no XID less than this could be about to appear in -the ProcArray, because of the XidGenLock interlock discussed above.) +Another important activity that uses the shared ProcArray is +ComputeTransactionHorizons, which must determine a lower bound for the oldest +xmin of any active MVCC snapshot, system-wide. Each individual backend +advertises the smallest xmin of its own snapshots in MyPgXact->xmin, or zero +if it currently has no live snapshots (eg, if it's between transactions or +hasn't yet set a snapshot for a new transaction). +ComputeTransactionHorizons takes the MIN() of the valid xmin fields. It +does this with only shared lock on ProcArrayLock, which means there is a +potential race condition against other backends doing GetSnapshotData +concurrently: we must be certain that a concurrent backend that is about to +set its xmin does not compute an xmin less than what +ComputeTransactionHorizons determines. We ensure that by including all the +active XIDs into the MIN() calculation, along with the valid xmins. The +rule that transactions can't exit without taking exclusive ProcArrayLock +ensures that concurrent holders of shared ProcArrayLock will compute the +same minimum of currently-active XIDs: no xact, in particular not the +oldest, can exit while we hold shared ProcArrayLock. So +ComputeTransactionHorizons's view of the minimum active XID will be the same +as that of any concurrent GetSnapshotData, and so it can't produce an +overestimate.
If there is no active transaction at all, +ComputeTransactionHorizons uses latestCompletedFullXid + 1, which is a lower +bound for the xmin that might be computed by concurrent or later +GetSnapshotData calls. (We know that no XID less than this could be about +to appear in the ProcArray, because of the XidGenLock interlock discussed +above.) -GetSnapshotData also performs an oldest-xmin calculation (which had better -match GetOldestXmin's) and stores that into RecentGlobalXmin, which is used -for some tuple age cutoff checks where a fresh call of GetOldestXmin seems -too expensive. Note that while it is certain that two concurrent -executions of GetSnapshotData will compute the same xmin for their own -snapshots, as argued above, it is not certain that they will arrive at the -same estimate of RecentGlobalXmin. This is because we allow XID-less -transactions to clear their MyPgXact->xmin asynchronously (without taking -ProcArrayLock), so one execution might see what had been the oldest xmin, -and another not. This is OK since RecentGlobalXmin need only be a valid -lower bound. As noted above, we are already assuming that fetch/store -of the xid fields is atomic, so assuming it for xmin as well is no extra -risk. +As GetSnapshotData is performance critical, it does not perform an +accurate oldest-xmin calculation (it used to, until v13). The contents +of a snapshot only depend on the xids of other backends, not their +xmin. As a backend's xmin changes much more often than its xid, having +GetSnapshotData look at xmins can lead to a lot of unnecessary +cacheline ping-pong. Instead GetSnapshotData updates approximate +thresholds (one that guarantees that all deleted rows older than it +can be removed, another determining that deleted rows newer than it +cannot be removed). InvisibleToEveryoneTest* uses those thresholds to +make invisibility decisions, falling back to ComputeTransactionHorizons +if necessary. + +Note that while it is certain that two concurrent executions of +GetSnapshotData will compute the same xmin for their own snapshots, +there is no such guarantee for the horizons computed by +ComputeTransactionHorizons. This is because we allow XID-less +transactions to clear their MyPgXact->xmin asynchronously (without +taking ProcArrayLock), so one execution might see what had been the +oldest xmin, and another not. This is OK since the thresholds need +only be a valid lower bound. As noted above, we are already assuming +that fetch/store of the xid fields is atomic, so assuming it for xmin +as well is no extra risk. pg_xact and pg_subtrans diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 2570e7086a7..43973130b7c 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -566,3 +566,51 @@ GetNewObjectId(void) return result; } + + +#ifdef USE_ASSERT_CHECKING + +/* + * Assert that xid is one that we could actually see on disk. + * + * As ShmemVariableCache->oldestXid could change just after this call + * without further precautions, and as xid could just fall between the bounds + * due to xid wraparound, this can only detect if something is definitely + * wrong, but cannot establish correctness. + * + * This intentionally does not expose a return value, to avoid code being + * introduced that depends on the return value.
+ */
+void
+AssertTransactionIdMayBeOnDisk(TransactionId xid)
+{
+	TransactionId oldest_xid;
+	TransactionId next_xid;
+
+	Assert(TransactionIdIsValid(xid));
+
+	/* we may see bootstrap / frozen */
+	if (!TransactionIdIsNormal(xid))
+		return;
+
+	/*
+	 * We can't acquire XidGenLock, as this may be called with XidGenLock
+	 * already held (or with other locks that don't allow XidGenLock to be
+	 * nested).  That's ok for our purposes though, since we already rely on
+	 * 32bit reads to be atomic.  While nextFullXid is 64 bit, we only look
+	 * at the lower 32 bits, so a skewed read doesn't hurt.
+	 *
+	 * There's no increased danger in accessing oldest / next without a
+	 * lock.  xid needs to have been created with GetNewTransactionId() in
+	 * the originating session, and the locks there pair with the memory
+	 * barrier below.  We do however accept xid to be <= next_xid, instead
+	 * of just <, as xid could be from the procarray, before we see the
+	 * updated nextFullXid value.
+	 */
+	pg_memory_barrier();
+	oldest_xid = ShmemVariableCache->oldestXid;
+	next_xid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
+
+	Assert(TransactionIdFollowsOrEquals(xid, oldest_xid) ||
+		   TransactionIdPrecedesOrEquals(xid, next_xid));
+}
+#endif
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index abf954ba392..8ce853c81d4 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7810,10 +7810,11 @@ StartupXLOG(void)
 	XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
 	XLogCtl->lastSegSwitchLSN = EndOfLog;
 
-	/* also initialize latestCompletedXid, to nextXid - 1 */
+	/* also initialize latestCompletedFullXid, to nextFullXid - 1 */
 	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
-	ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
-	TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
+	ShmemVariableCache->latestCompletedFullXid =
+		ShmemVariableCache->nextFullXid;
+	FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedFullXid);
 	LWLockRelease(ProcArrayLock);
 
 	/*
@@ -9023,7 +9024,7 @@ CreateCheckPoint(int flags)
 	 * StartupSUBTRANS hasn't been called yet.
 	 */
 	if (!RecoveryInProgress())
-		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
 
 	/* Real work is done, but log and update stats before releasing lock. */
 	LogCheckpointEnd(false);
@@ -9382,7 +9383,7 @@ CreateRestartPoint(int flags)
 	 * this because StartupSUBTRANS hasn't been called yet.
 	 */
 	if (EnableHotStandby)
-		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
+		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
 
 	/* Real work is done, but log and update before releasing lock. */
 	LogCheckpointEnd(true);
diff --git a/src/backend/commands/analyze.c b/src/backend/commands/analyze.c
index 924ef37c816..7b75945c4a9 100644
--- a/src/backend/commands/analyze.c
+++ b/src/backend/commands/analyze.c
@@ -1056,7 +1056,7 @@ acquire_sample_rows(Relation onerel, int elevel,
 	totalblocks = RelationGetNumberOfBlocks(onerel);
 
 	/* Need a cutoff xmin for HeapTupleSatisfiesVacuum */
-	OldestXmin = GetOldestXmin(onerel, PROCARRAY_FLAGS_VACUUM);
+	OldestXmin = GetOldestVisibleTransactionId(onerel);
 
 	/* Prepare for sampling block numbers */
 	nblocks = BlockSampler_Init(&bs, totalblocks, targrows, random());
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 3a89f8fe1e2..7055b237337 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -957,8 +957,25 @@ vacuum_set_xid_limits(Relation rel,
 	 * working on a particular table at any time, and that each vacuum is
 	 * always an independent transaction.
 	 */
-	*oldestXmin =
-		TransactionIdLimitedForOldSnapshots(GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM), rel);
+	*oldestXmin = GetOldestVisibleTransactionId(rel);
+
+	if (OldSnapshotThresholdActive())
+	{
+		TransactionId limit_xmin;
+		TimestampTz limit_ts;
+
+		if (TransactionIdLimitedForOldSnapshots(*oldestXmin, rel, &limit_xmin, &limit_ts))
+		{
+			/*
+			 * TODO: We should only set the threshold if we are pruning on
+			 * the basis of the increased limits.  Not as crucial here as it
+			 * is for opportunistic pruning (which often happens at a much
+			 * higher frequency), but would still be a significant
+			 * improvement.
+			 */
+			SetOldSnapshotThresholdTimestamp(limit_ts, limit_xmin);
+			*oldestXmin = limit_xmin;
+		}
+	}
 
 	Assert(TransactionIdIsNormal(*oldestXmin));
 
@@ -1347,12 +1364,13 @@ vac_update_datfrozenxid(void)
 	bool		dirty = false;
 
 	/*
-	 * Initialize the "min" calculation with GetOldestXmin, which is a
-	 * reasonable approximation to the minimum relfrozenxid for not-yet-
-	 * committed pg_class entries for new tables; see AddNewRelationTuple().
-	 * So we cannot produce a wrong minimum by starting with this.
+	 * Initialize the "min" calculation with GetOldestVisibleTransactionId(),
+	 * which is a reasonable approximation to the minimum relfrozenxid for
+	 * not-yet-committed pg_class entries for new tables; see
+	 * AddNewRelationTuple().  So we cannot produce a wrong minimum by
+	 * starting with this.
 	 */
-	newFrozenXid = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM);
+	newFrozenXid = GetOldestVisibleTransactionId(NULL);
 
 	/*
 	 * Similarly, initialize the MultiXact "min" with the value that would be
@@ -1683,8 +1701,9 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params)
 	StartTransactionCommand();
 
 	/*
-	 * Functions in indexes may want a snapshot set.  Also, setting a snapshot
-	 * ensures that RecentGlobalXmin is kept truly recent.
+	 * Need to acquire a snapshot to prevent pg_subtrans from being
+	 * truncated, to prevent cutoff xids in local memory from wrapping
+	 * around, and to have up-to-date xmin horizons.
 	 */
 	PushActiveSnapshot(GetTransactionSnapshot());
 
diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c
index 7e97ffab27d..df1af9354ce 100644
--- a/src/backend/postmaster/autovacuum.c
+++ b/src/backend/postmaster/autovacuum.c
@@ -1878,6 +1878,10 @@ get_database_list(void)
 	 * the secondary effect that it sets RecentGlobalXmin.  (This is critical
 	 * for anything that reads heap pages, because HOT may decide to prune
 	 * them even if the process doesn't attempt to modify any tuples.)
+	 *
+	 * FIXME: This comment is inaccurate / the code buggy.
A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). */ StartTransactionCommand(); (void) GetTransactionSnapshot(); diff --git a/src/backend/replication/logical/launcher.c b/src/backend/replication/logical/launcher.c index aec885e9871..eb9b1c87caf 100644 --- a/src/backend/replication/logical/launcher.c +++ b/src/backend/replication/logical/launcher.c @@ -122,6 +122,12 @@ get_subscription_list(void) * the secondary effect that it sets RecentGlobalXmin. (This is critical * for anything that reads heap pages, because HOT may decide to prune * them even if the process doesn't attempt to modify any tuples.) + * + * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). Also, this is + * not reading pg_database. */ StartTransactionCommand(); (void) GetTransactionSnapshot(); diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index aee67c61aa6..2975242b5b3 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -1176,22 +1176,7 @@ XLogWalRcvSendHSFeedback(bool immed) */ if (hot_standby_feedback) { - TransactionId slot_xmin; - - /* - * Usually GetOldestXmin() would include both global replication slot - * xmin and catalog_xmin in its calculations, but we want to derive - * separate values for each of those. So we ask for an xmin that - * excludes the catalog_xmin. - */ - xmin = GetOldestXmin(NULL, - PROCARRAY_FLAGS_DEFAULT | PROCARRAY_SLOTS_XMIN); - - ProcArrayGetReplicationSlotXmin(&slot_xmin, &catalog_xmin); - - if (TransactionIdIsValid(slot_xmin) && - TransactionIdPrecedes(slot_xmin, xmin)) - xmin = slot_xmin; + GetReplicationHorizons(&xmin, &catalog_xmin); } else { diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 9e5611574cc..d7088d19fd6 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -2097,9 +2097,10 @@ ProcessStandbyHSFeedbackMessage(void) /* * Set the WalSender's xmin equal to the standby's requested xmin, so that - * the xmin will be taken into account by GetOldestXmin. This will hold - * back the removal of dead rows and thereby prevent the generation of - * cleanup conflicts on the standby server. + * the xmin will be taken into account by GetSnapshotData() / + * ComputeTransactionHorizons(). This will hold back the removal of dead + * rows and thereby prevent the generation of cleanup conflicts on the + * standby server. * * There is a small window for a race condition here: although we just * checked that feedbackXmin precedes nextXid, the nextXid could have @@ -2112,10 +2113,10 @@ ProcessStandbyHSFeedbackMessage(void) * own xmin would prevent nextXid from advancing so far. * * We don't bother taking the ProcArrayLock here. Setting the xmin field - * is assumed atomic, and there's no real need to prevent a concurrent - * GetOldestXmin. (If we're moving our xmin forward, this is obviously - * safe, and if we're moving it backwards, well, the data is at risk - * already since a VACUUM could have just finished calling GetOldestXmin.) + * is assumed atomic, and there's no real need to prevent concurrent + * horizon determinations. 
(If we're moving our xmin forward, this is
+	 * obviously safe, and if we're moving it backwards, well, the data is at
+	 * risk already since a VACUUM could already have determined the horizon.)
 	 *
 	 * If we're using a replication slot we reserve the xmin via that,
 	 * otherwise via the walsender's PGXACT entry.  We can only track the
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 363000670b2..a1823caf632 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -99,6 +99,98 @@ typedef struct ProcArrayStruct
 	int			pgprocnos[FLEXIBLE_ARRAY_MEMBER];
 } ProcArrayStruct;
 
+/*
+ * State for testing whether tuple versions may be removed.  To improve
+ * GetSnapshotData() performance we don't compute an accurate value whenever
+ * acquiring a snapshot.  Instead we compute boundaries above/below which we
+ * know that row versions are [not] needed anymore.  If at test time a value
+ * falls in between the two, the boundaries can be recomputed (unless that
+ * just happened).
+ *
+ * The thresholds are FullTransactionIds instead of TransactionIds, as
+ * otherwise activity in the system since the time the values were last
+ * computed could cause the xid counter to wrap far enough for the values to
+ * be considered to lie in the future.  There is no procarray state
+ * preventing that from happening.
+ *
+ * The typedef is in the header.
+ */
+struct InvisibleToEveryoneState
+{
+	/*
+	 * Xids above definitely_needed_bound are considered as definitely not
+	 * removable.  Xids below may be old enough to be removed, but unless
+	 * they're older than maybe_needed_bound, the procarray needs to be
+	 * consulted to be sure.
+	 */
+	FullTransactionId definitely_needed_bound;
+
+	/*
+	 * Xids below maybe_needed_bound are definitely removable.
+	 */
+	FullTransactionId maybe_needed_bound;
+};
+
+/* state for ComputeTransactionHorizons() */
+typedef struct ComputedHorizons
+{
+	/*
+	 * The value of ShmemVariableCache->latestCompletedFullXid when
+	 * ComputeTransactionHorizons() held ProcArrayLock.
+	 */
+	FullTransactionId latest_completed;
+
+	/*
+	 * The same for procArray->replication_slot_xmin and
+	 * procArray->replication_slot_catalog_xmin.
+	 */
+	TransactionId slot_xmin;
+	TransactionId slot_catalog_xmin;
+
+	/*
+	 * Oldest xid that any backend might think is still running.  This needs
+	 * to include processes running VACUUM, in contrast to the normal
+	 * visibility cutoffs, as vacuum needs to be able to perform pg_subtrans
+	 * lookups when determining visibility, but doesn't care about rows above
+	 * its xmin being removed.
+	 *
+	 * This likely should only be needed to determine whether pg_subtrans can
+	 * be truncated.  It currently includes the effects of replication slots,
+	 * for historical reasons.  But that could likely be changed.
+	 */
+	TransactionId oldest_considered_running;
+
+	/*
+	 * Oldest xid that may be necessary to retain for shared tables.
+	 *
+	 * This includes the effects of replication slots.  If that's not
+	 * desired, look at shared_oldest_visible_raw.
+	 */
+	TransactionId shared_oldest_visible;
+
+	/*
+	 * Oldest xid that may be necessary to retain for shared tables, but not
+	 * affected by replication slots' catalog_xmin.
+	 *
+	 * This is mainly useful to be able to send the catalog_xmin to upstream
+	 * streaming replication servers via hot_standby_feedback, so they can
+	 * apply the limit only when accessing catalog tables.
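+	 *
+	 * (GetReplicationHorizons() below returns this value, together with
+	 * slot_catalog_xmin, for exactly that purpose.)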
+	 */
+	TransactionId shared_oldest_visible_raw;
+
+	/*
+	 * Oldest xid that may be necessary to retain for non-shared catalog
+	 * tables.
+	 */
+	TransactionId catalog_oldest_visible;
+
+	/*
+	 * Oldest xid that may be necessary to retain for normal user defined
+	 * tables.
+	 */
+	TransactionId data_oldest_visible;
+} ComputedHorizons;
+
+
 static ProcArrayStruct *procArray;
 
 static PGPROC *allProcs;
@@ -118,6 +210,23 @@ static TransactionId latestObservedXid = InvalidTransactionId;
  */
 static TransactionId standbySnapshotPendingXmin;
 
+/*
+ * State for visibility checks on different types of relations.  See struct
+ * InvisibleToEveryoneState for details.  As shared, catalog, and user defined
+ * relations can have different horizons, one such state exists for each.
+ */
+static InvisibleToEveryoneState InvisibleShared;
+static InvisibleToEveryoneState InvisibleCatalog;
+static InvisibleToEveryoneState InvisibleData;
+
+/*
+ * This backend's RecentXmin at the last time the accurate xmin horizon was
+ * recomputed, or InvalidTransactionId if that has not happened yet.  Used to
+ * limit how many times accurate horizons are recomputed in
+ * InvisibleToEveryoneShouldUpdateHorizons().
+ */
+static TransactionId ComputedHorizonsLastXmin;
+
 #ifdef XIDCACHE_DEBUG
 
 /* counters for XidCache measurement */
@@ -175,6 +284,9 @@ static void KnownAssignedXidsReset(void);
 static inline void ProcArrayEndTransactionInternal(PGPROC *proc,
 												   PGXACT *pgxact, TransactionId latestXid);
 static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
+static void MaintainLatestCompletedXid(TransactionId latestXid);
+
+static inline FullTransactionId FullXidViaRelative(FullTransactionId rel, TransactionId xid);
 
 /*
  * Report shared-memory space needed by CreateSharedProcArray.
@@ -351,9 +463,7 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
 		Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid));
 
 		/* Advance global latestCompletedXid while holding the lock */
-		if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
-								  latestXid))
-			ShmemVariableCache->latestCompletedXid = latestXid;
+		MaintainLatestCompletedXid(latestXid);
 	}
 	else
 	{
@@ -466,9 +576,7 @@ ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact,
 	pgxact->overflowed = false;
 
 	/* Also advance global latestCompletedXid while holding the lock */
-	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
-							  latestXid))
-		ShmemVariableCache->latestCompletedXid = latestXid;
+	MaintainLatestCompletedXid(latestXid);
 }
 
 /*
@@ -623,6 +731,29 @@ ProcArrayClearTransaction(PGPROC *proc)
 	pgxact->overflowed = false;
 }
 
+/*
+ * Update ShmemVariableCache->latestCompletedFullXid to point to latestXid if
+ * currently older.
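+ *
+ * Note that latestXid is a plain 32-bit xid: it is widened relative to the
+ * current 64-bit value via FullXidViaRelative() below, which is safe because
+ * the two values cannot be more than MaxTransactionId / 2 apart.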
+ */ +static void +MaintainLatestCompletedXid(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedFullXid; + + Assert(LWLockHeldByMe(ProcArrayLock)); + Assert(FullTransactionIdIsValid(cur_latest)); + + if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + FullTransactionId fxid = FullXidViaRelative(cur_latest, latestXid); + + ShmemVariableCache->latestCompletedFullXid = fxid; + } + + Assert(IsBootstrapProcessingMode() || + FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedFullXid)); +} + /* * ProcArrayInitRecovery -- initialize recovery xid mgmt environment * @@ -667,6 +798,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) TransactionId *xids; int nxids; int i; + FullTransactionId fxid; Assert(standbyState >= STANDBY_INITIALIZED); Assert(TransactionIdIsValid(running->nextXid)); @@ -843,7 +975,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) * Now we've got the running xids we need to set the global values that * are used to track snapshots as they evolve further. * - * - latestCompletedXid which will be the xmax for snapshots + * - latestCompletedFullXid which will be the xmax for snapshots * - lastOverflowedXid which shows whether snapshots overflow * - nextXid * @@ -867,24 +999,26 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) standbySnapshotPendingXmin = InvalidTransactionId; } - /* - * If a transaction wrote a commit record in the gap between taking and - * logging the snapshot then latestCompletedXid may already be higher than - * the value from the snapshot, so check before we use the incoming value. - */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - running->latestCompletedXid)) - ShmemVariableCache->latestCompletedXid = running->latestCompletedXid; - - Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); - - LWLockRelease(ProcArrayLock); /* ShmemVariableCache->nextFullXid must be beyond any observed xid. */ AdvanceNextFullTransactionIdPastXid(latestObservedXid); Assert(FullTransactionIdIsValid(ShmemVariableCache->nextFullXid)); + /* + * If a transaction wrote a commit record in the gap between taking and + * logging the snapshot then latestCompletedFullXid may already be higher + * than the value from the snapshot, so check before we use the incoming + * value. It also might not yet be set at all. + */ + fxid = FullXidViaRelative(ShmemVariableCache->nextFullXid, + running->latestCompletedXid); + if (!FullTransactionIdIsValid(ShmemVariableCache->latestCompletedFullXid) || + FullTransactionIdFollows(fxid, ShmemVariableCache->latestCompletedFullXid)) + ShmemVariableCache->latestCompletedFullXid = fxid; + + LWLockRelease(ProcArrayLock); + KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); if (standbyState == STANDBY_SNAPSHOT_READY) elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled"); @@ -1050,10 +1184,11 @@ TransactionIdIsInProgress(TransactionId xid) LWLockAcquire(ProcArrayLock, LW_SHARED); /* - * Now that we have the lock, we can check latestCompletedXid; if the + * Now that we have the lock, we can check latestCompletedFullXid; if the * target Xid is after that, it's surely still running. 
*/ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid)) + if (TransactionIdPrecedes(XidFromFullTransactionId(ShmemVariableCache->latestCompletedFullXid), + xid)) { LWLockRelease(ProcArrayLock); xc_by_latest_xid_inc(); @@ -1250,159 +1385,166 @@ TransactionIdIsActive(TransactionId xid) /* - * GetOldestXmin -- returns oldest transaction that was running - * when any current transaction was started. + * Determine horizons due to concurrently running transactions. * - * If rel is NULL or a shared relation, all backends are considered, otherwise - * only backends running in this database are considered. - * - * The flags are used to ignore the backends in calculation when any of the - * corresponding flags is set. Typically, if you want to ignore ones with - * PROC_IN_VACUUM flag, you can use PROCARRAY_FLAGS_VACUUM. - * - * PROCARRAY_SLOTS_XMIN causes GetOldestXmin to ignore the xmin and - * catalog_xmin of any replication slots that exist in the system when - * calculating the oldest xmin. - * - * This is used by VACUUM to decide which deleted tuples must be preserved in - * the passed in table. For shared relations backends in all databases must be - * considered, but for non-shared relations that's not required, since only - * backends in my own database could ever see the tuples in them. Also, we can - * ignore concurrently running lazy VACUUMs because (a) they must be working - * on other tables, and (b) they don't need to do snapshot-based lookups. - * - * This is also used to determine where to truncate pg_subtrans. For that - * backends in all databases have to be considered, so rel = NULL has to be - * passed in. + * This is used by wrapper functions for more specific use cases like hot + * pruning, vacuuming and pg_subtrans truncations. * * Note: we include all currently running xids in the set of considered xids. * This ensures that if a just-started xact has not yet set its snapshot, * when it does set the snapshot it cannot set xmin less than what we compute. * See notes in src/backend/access/transam/README. * - * Note: despite the above, it's possible for the calculated value to move - * backwards on repeated calls. The calculated value is conservative, so that - * anything older is definitely not considered as running by anyone anymore, - * but the exact value calculated depends on a number of things. For example, - * if rel = NULL and there are no transactions running in the current - * database, GetOldestXmin() returns latestCompletedXid. If a transaction + * Note: despite the above, it's possible for the calculated values to move + * backwards on repeated calls. The calculated values are conservative, so + * that anything older is definitely not considered as running by anyone + * anymore, but the exact values calculated depend on a number of things. For + * example, if there are no transactions running in the current database, the + * horizon for normal tables will be latestCompletedFullXid. If a transaction * begins after that, its xmin will include in-progress transactions in other * databases that started earlier, so another call will return a lower value. * Nonetheless it is safe to vacuum a table in the current database with the * first result. There are also replication-related effects: a walsender * process can set its xmin based on transactions that are no longer running * in the master but are still being replayed on the standby, thus possibly - * making the GetOldestXmin reading go backwards. 
In this case there is a - * possibility that we lose data that the standby would like to have, but - * unless the standby uses a replication slot to make its xmin persistent - * there is little we can do about that --- data is only protected if the - * walsender runs continuously while queries are executed on the standby. - * (The Hot Standby code deals with such cases by failing standby queries - * that needed to access already-removed data, so there's no integrity bug.) - * The return value is also adjusted with vacuum_defer_cleanup_age, so - * increasing that setting on the fly is another easy way to make - * GetOldestXmin() move backwards, with no consequences for data integrity. + * making the values go backwards. In this case there is a possibility that + * we lose data that the standby would like to have, but unless the standby + * uses a replication slot to make its xmin persistent there is little we can + * do about that --- data is only protected if the walsender runs continuously + * while queries are executed on the standby. (The Hot Standby code deals + * with such cases by failing standby queries that needed to access + * already-removed data, so there's no integrity bug.) The computed values + * are also adjusted with vacuum_defer_cleanup_age, so increasing that setting + * on the fly is another easy way to make horizons move backwards, with no + * consequences for data integrity. */ -TransactionId -GetOldestXmin(Relation rel, int flags) +static void +ComputeTransactionHorizons(ComputedHorizons *h) { ProcArrayStruct *arrayP = procArray; - TransactionId result; - int index; - bool allDbs; + TransactionId kaxmin; + bool in_recovery = RecoveryInProgress(); - TransactionId replication_slot_xmin = InvalidTransactionId; - TransactionId replication_slot_catalog_xmin = InvalidTransactionId; - - /* - * If we're not computing a relation specific limit, or if a shared - * relation has been passed in, backends in all databases have to be - * considered. - */ - allDbs = rel == NULL || rel->rd_rel->relisshared; - - /* Cannot look for individual databases during recovery */ - Assert(allDbs || !RecoveryInProgress()); + /* inferred after ProcArrayLock is released */ + h->catalog_oldest_visible = InvalidTransactionId; LWLockAcquire(ProcArrayLock, LW_SHARED); - /* - * We initialize the MIN() calculation with latestCompletedXid + 1. This - * is a lower bound for the XIDs that might appear in the ProcArray later, - * and so protects us against overestimating the result due to future - * additions. - */ - result = ShmemVariableCache->latestCompletedXid; - Assert(TransactionIdIsNormal(result)); - TransactionIdAdvance(result); + h->latest_completed = ShmemVariableCache->latestCompletedFullXid; - for (index = 0; index < arrayP->numProcs; index++) + /* + * We initialize the MIN() calculation with latestCompletedFullXid + + * 1. This is a lower bound for the XIDs that might appear in the + * ProcArray later, and so protects us against overestimating the result + * due to future additions. + */ + { + TransactionId initial; + + initial = XidFromFullTransactionId(h->latest_completed); + Assert(TransactionIdIsValid(initial)); + TransactionIdAdvance(initial); + + h->oldest_considered_running = initial; + h->shared_oldest_visible = initial; + h->data_oldest_visible = initial; + } + + /* + * Fetch slot horizons while ProcArrayLock is held - the + * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside + * the lock. 
+	 */
+	h->slot_xmin = procArray->replication_slot_xmin;
+	h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+	for (int index = 0; index < arrayP->numProcs; index++)
 	{
 		int			pgprocno = arrayP->pgprocnos[index];
 		PGPROC	   *proc = &allProcs[pgprocno];
 		PGXACT	   *pgxact = &allPgXact[pgprocno];
+		TransactionId xid;
+		TransactionId xmin;
 
-		if (pgxact->vacuumFlags & (flags & PROCARRAY_PROC_FLAGS_MASK))
+		/* Fetch xid just once - see GetNewTransactionId */
+		xid = UINT32_ACCESS_ONCE(pgxact->xid);
+		xmin = UINT32_ACCESS_ONCE(pgxact->xmin);
+
+		/*
+		 * Consider both the transaction's Xmin, and its Xid.
+		 *
+		 * We must check both because a transaction might have an Xmin but
+		 * not (yet) an Xid; conversely, if it has an Xid, that could
+		 * determine some not-yet-set Xmin.
+		 */
+		xmin = TransactionIdOlder(xmin, xid);
+
+		/* if neither is set, this proc doesn't influence the horizon */
+		if (!TransactionIdIsValid(xmin))
 			continue;
 
-		if (allDbs ||
+		/*
+		 * Don't ignore any procs when determining which transactions might
+		 * be considered running.  While slots should ensure logical decoding
+		 * backends are protected even without this check, it can't hurt to
+		 * include them here as well.
+		 */
+		h->oldest_considered_running =
+			TransactionIdOlder(h->oldest_considered_running, xmin);
+
+		/*
+		 * Skip over backends either vacuuming (which is ok with rows being
+		 * removed, as long as pg_subtrans is not truncated) or doing logical
+		 * decoding (which manages xmin separately, check below).
+		 */
+		if (pgxact->vacuumFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING))
+			continue;
+
+		/* shared tables need to take backends in all databases into account */
+		h->shared_oldest_visible =
+			TransactionIdOlder(h->shared_oldest_visible, xmin);
+
+		/*
+		 * Normally queries in other databases are ignored for anything but
+		 * the shared horizon.  But in recovery we cannot compute an accurate
+		 * per-database horizon as all xids are managed via the
+		 * KnownAssignedXids machinery.
+		 */
+		if (in_recovery ||
 			proc->databaseId == MyDatabaseId ||
 			proc->databaseId == 0)	/* always include WalSender */
 		{
-			/* Fetch xid just once - see GetNewTransactionId */
-			TransactionId xid = UINT32_ACCESS_ONCE(pgxact->xid);
-
-			/* First consider the transaction's own Xid, if any */
-			if (TransactionIdIsNormal(xid) &&
-				TransactionIdPrecedes(xid, result))
-				result = xid;
-
-			/*
-			 * Also consider the transaction's Xmin, if set.
-			 *
-			 * We must check both Xid and Xmin because a transaction might
-			 * have an Xmin but not (yet) an Xid; conversely, if it has an
-			 * Xid, that could determine some not-yet-set Xmin.
-			 */
-			xid = UINT32_ACCESS_ONCE(pgxact->xmin);
-			if (TransactionIdIsNormal(xid) &&
-				TransactionIdPrecedes(xid, result))
-				result = xid;
+			h->data_oldest_visible =
+				TransactionIdOlder(h->data_oldest_visible, xmin);
 		}
 	}
 
 	/*
-	 * Fetch into local variable while ProcArrayLock is held - the
-	 * LWLockRelease below is a barrier, ensuring this happens inside the
-	 * lock.
+	 * If in recovery, fetch the oldest xid in KnownAssignedXids; it will be
+	 * applied after the lock is released.
 	 */
-	replication_slot_xmin = procArray->replication_slot_xmin;
-	replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+	if (in_recovery)
+		kaxmin = KnownAssignedXidsGetOldestXmin();
 
-	if (RecoveryInProgress())
+	/*
+	 * No other information needed, so release the lock immediately.  The
+	 * rest of the computations can be done without a lock.
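+	 * Working on the copied values is safe because every adjustment below
+	 * only moves a horizon backwards (via TransactionIdOlder() /
+	 * TransactionIdRetreatedBy()), i.e. makes it more conservative.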
+ */ + LWLockRelease(ProcArrayLock); + + if (in_recovery) { - /* - * Check to see whether KnownAssignedXids contains an xid value older - * than the main procarray. - */ - TransactionId kaxmin = KnownAssignedXidsGetOldestXmin(); - - LWLockRelease(ProcArrayLock); - - if (TransactionIdIsNormal(kaxmin) && - TransactionIdPrecedes(kaxmin, result)) - result = kaxmin; + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, kaxmin); + h->shared_oldest_visible = + TransactionIdOlder(h->shared_oldest_visible, kaxmin); + h->data_oldest_visible = + TransactionIdOlder(h->data_oldest_visible, kaxmin); } else { /* - * No other information needed, so release the lock immediately. - */ - LWLockRelease(ProcArrayLock); - - /* - * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age, - * being careful not to generate a "permanent" XID. + * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age. * * vacuum_defer_cleanup_age provides some additional "slop" for the * benefit of hot standby queries on standby servers. This is quick @@ -1414,34 +1556,141 @@ GetOldestXmin(Relation rel, int flags) * in varsup.c. Also note that we intentionally don't apply * vacuum_defer_cleanup_age on standby servers. */ - result -= vacuum_defer_cleanup_age; - if (!TransactionIdIsNormal(result)) - result = FirstNormalTransactionId; + h->oldest_considered_running = + TransactionIdRetreatedBy(h->oldest_considered_running, + vacuum_defer_cleanup_age); + h->shared_oldest_visible = + TransactionIdRetreatedBy(h->shared_oldest_visible, + vacuum_defer_cleanup_age); + h->data_oldest_visible = + TransactionIdRetreatedBy(h->data_oldest_visible, + vacuum_defer_cleanup_age); } /* * Check whether there are replication slots requiring an older xmin. */ - if (!(flags & PROCARRAY_SLOTS_XMIN) && - TransactionIdIsValid(replication_slot_xmin) && - NormalTransactionIdPrecedes(replication_slot_xmin, result)) - result = replication_slot_xmin; + h->shared_oldest_visible = + TransactionIdOlder(h->shared_oldest_visible, h->slot_xmin); + h->data_oldest_visible = + TransactionIdOlder(h->data_oldest_visible, h->slot_xmin); /* - * After locks have been released and vacuum_defer_cleanup_age has been - * applied, check whether we need to back up further to make logical - * decoding possible. We need to do so if we're computing the global limit - * (rel = NULL) or if the passed relation is a catalog relation of some - * kind. + * The only difference between catalog / data horizons is that the slot's + * catalog xmin is applied to the catalog one (so catalogs can be accessed + * for logical decoding). Initialize with data horizon, and then back up + * further if necessary. Have to back up the shared horizon as well, since + * that also can contain catalogs. */ - if (!(flags & PROCARRAY_SLOTS_XMIN) && - (rel == NULL || - RelationIsAccessibleInLogicalDecoding(rel)) && - TransactionIdIsValid(replication_slot_catalog_xmin) && - NormalTransactionIdPrecedes(replication_slot_catalog_xmin, result)) - result = replication_slot_catalog_xmin; + h->shared_oldest_visible_raw = h->shared_oldest_visible; + h->shared_oldest_visible = + TransactionIdOlder(h->shared_oldest_visible, + h->slot_catalog_xmin); + h->catalog_oldest_visible = h->data_oldest_visible; + h->catalog_oldest_visible = + TransactionIdOlder(h->catalog_oldest_visible, + h->slot_catalog_xmin); - return result; + /* + * It's possible that slots / vacuum_defer_cleanup_age backed up the + * horizons further than oldest_considered_running. Fix. 
+ */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->shared_oldest_visible); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->catalog_oldest_visible); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->data_oldest_visible); + + /* shared horizons have to be at least as old as the oldest visible in current db */ + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_visible, h->data_oldest_visible)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_visible, h->catalog_oldest_visible)); + + /* + * Horizons need to ensure that pg_subtrans access is still possible for + * the relevant backends. + */ + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_visible)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->catalog_oldest_visible)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->data_oldest_visible)); + Assert(!TransactionIdIsValid(h->slot_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_xmin)); + Assert(!TransactionIdIsValid(h->slot_catalog_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_catalog_xmin)); +} + +/* + * Return the oldest transaction id that might still be considered as visible + * by any backend. Rows that are only visible to transactions before the + * returned xid can safely be removed. + * + * If rel is not NULL the horizon may be considerably more recent than if NULL + * were passed. In the NULL case a horizon that is correct (but not optimal) + * for all relations will be returned. + */ +TransactionId +GetOldestVisibleTransactionId(Relation rel) +{ + ComputedHorizons horizons; + + ComputeTransactionHorizons(&horizons); + + /* + * If we're not computing a relation specific limit, or if a shared + * relation has been passed in, backends in all databases have to be + * considered. + */ + if (rel == NULL || rel->rd_rel->relisshared) + return horizons.shared_oldest_visible; + + if (RelationIsAccessibleInLogicalDecoding(rel)) + return horizons.catalog_oldest_visible; + + return horizons.data_oldest_visible; +} + +/* + * Return the oldest transaction id any currently running backend might still + * think is running. This should not be used for visibility / pruning + * determinations (see GetOldestVisibleTransactionId()), but for decisions + * like up to where pg_subtrans can be truncated. + */ +TransactionId +GetOldestTransactionIdConsideredRunning(void) +{ + ComputedHorizons horizons; + + ComputeTransactionHorizons(&horizons); + + return horizons.oldest_considered_running; +} + +/* + * Return the visibility horizons for a hot standby feedback message. + */ +void +GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin) +{ + ComputedHorizons horizons; + + ComputeTransactionHorizons(&horizons); + + /* + * Don't want to use shared_oldest_visible here, as that contains the + * effect of replication slot's catalog_xmin. We want to send a separate + * feedback for the catalog horizon, so the primary can remove data table + * contents more aggressively. + */ + *xmin = horizons.shared_oldest_visible_raw; + *catalog_xmin = horizons.slot_catalog_xmin; } /* @@ -1492,12 +1741,9 @@ GetMaxSnapshotSubxidCount(void) * current transaction (this is the same as MyPgXact->xmin). * RecentXmin: the xmin computed for the most recent snapshot. XIDs * older than this are known not running any more. 
- * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all
- *		running transactions, except those running LAZY VACUUM).  This is
- *		the same computation done by
- *		GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM).
- * RecentGlobalDataXmin: the global xmin for non-catalog tables
- *		>= RecentGlobalXmin
+ *
+ * And update the state in InvisibleShared, InvisibleCatalog, InvisibleData
+ * for the benefit of InvisibleToEveryone*.
 *
 * Note: this function should probably not be called with an argument that's
 * not statically allocated (see xip allocation below).
@@ -1508,11 +1754,12 @@ GetSnapshotData(Snapshot snapshot)
 	ProcArrayStruct *arrayP = procArray;
 	TransactionId xmin;
 	TransactionId xmax;
-	TransactionId globalxmin;
 	int			index;
 	int			count = 0;
 	int			subcount = 0;
 	bool		suboverflowed = false;
+	FullTransactionId latest_completed;
+	TransactionId oldestxid;
 	TransactionId replication_slot_xmin = InvalidTransactionId;
 	TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
@@ -1556,13 +1803,16 @@ GetSnapshotData(Snapshot snapshot)
 	 */
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
+	latest_completed = ShmemVariableCache->latestCompletedFullXid;
+	oldestxid = ShmemVariableCache->oldestXid;
+
 	/* xmax is always latestCompletedXid + 1 */
-	xmax = ShmemVariableCache->latestCompletedXid;
-	Assert(TransactionIdIsNormal(xmax));
+	xmax = XidFromFullTransactionId(latest_completed);
 	TransactionIdAdvance(xmax);
+	Assert(TransactionIdIsNormal(xmax));
 
 	/* initialize xmin calculation with xmax */
-	globalxmin = xmin = xmax;
+	xmin = xmax;
 
 	snapshot->takenDuringRecovery = RecoveryInProgress();
@@ -1591,12 +1841,6 @@ GetSnapshotData(Snapshot snapshot)
 				(PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM))
 				continue;
 
-			/* Update globalxmin to be the smallest valid xmin */
-			xid = UINT32_ACCESS_ONCE(pgxact->xmin);
-			if (TransactionIdIsNormal(xid) &&
-				NormalTransactionIdPrecedes(xid, globalxmin))
-				globalxmin = xid;
-
 			/* Fetch xid just once - see GetNewTransactionId */
 			xid = UINT32_ACCESS_ONCE(pgxact->xid);
@@ -1712,34 +1956,78 @@ GetSnapshotData(Snapshot snapshot)
 
 	LWLockRelease(ProcArrayLock);
 
-	/*
-	 * Update globalxmin to include actual process xids.  This is a slightly
-	 * different way of computing it than GetOldestXmin uses, but should give
-	 * the same result.
-	 */
-	if (TransactionIdPrecedes(xmin, globalxmin))
-		globalxmin = xmin;
+	/* maintain state for invisible-to-everyone tests */
+	{
+		TransactionId def_vis_xid;
+		TransactionId def_vis_xid_data;
+		FullTransactionId def_vis_fxid;
+		FullTransactionId def_vis_fxid_data;
+		FullTransactionId oldestfxid;
 
-	/* Update global variables too */
-	RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age;
-	if (!TransactionIdIsNormal(RecentGlobalXmin))
-		RecentGlobalXmin = FirstNormalTransactionId;
+		/*
+		 * Converting oldestXid is only safe while the xid horizon cannot
+		 * advance, i.e. while holding locks.  While we don't hold the lock
+		 * anymore, all the necessary data has been gathered with the lock
+		 * held.
+		 */
+		oldestfxid = FullXidViaRelative(latest_completed, oldestxid);
 
-	/* Check whether there's a replication slot requiring an older xmin.
*/ - if (TransactionIdIsValid(replication_slot_xmin) && - NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin)) - RecentGlobalXmin = replication_slot_xmin; + /* apply vacuum_defer_cleanup_age */ + def_vis_xid_data = + TransactionIdRetreatedBy(xmin, vacuum_defer_cleanup_age); - /* Non-catalog tables can be vacuumed if older than this xid */ - RecentGlobalDataXmin = RecentGlobalXmin; + /* Check whether there's a replication slot requiring an older xmin. */ + def_vis_xid_data = + TransactionIdOlder(def_vis_xid_data, replication_slot_xmin); - /* - * Check whether there's a replication slot requiring an older catalog - * xmin. - */ - if (TransactionIdIsNormal(replication_slot_catalog_xmin) && - NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin)) - RecentGlobalXmin = replication_slot_catalog_xmin; + /* + * Rows in non-shared, non-catalog tables possibly could be vacuumed + * if older than this xid. + */ + def_vis_xid = def_vis_xid_data; + + /* + * Check whether there's a replication slot requiring an older catalog + * xmin. + */ + def_vis_xid = + TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid); + + def_vis_fxid = FullXidViaRelative(latest_completed, def_vis_xid); + def_vis_fxid_data = FullXidViaRelative(latest_completed, def_vis_xid_data); + + /* + * Check if we can increase upper bound. As a previous + * InvisibleToEveryoneUpdateHorizons() might have computed more + * aggressive values, don't overwrite them if so. + */ + InvisibleShared.definitely_needed_bound = + FullTransactionIdNewer(def_vis_fxid, + InvisibleShared.definitely_needed_bound); + InvisibleCatalog.definitely_needed_bound = + FullTransactionIdNewer(def_vis_fxid, + InvisibleCatalog.definitely_needed_bound); + InvisibleData.definitely_needed_bound = + FullTransactionIdNewer(def_vis_fxid_data, + InvisibleData.definitely_needed_bound); + + /* + * Check if we know that we can initialize or increase the lower + * bound. Currently the only cheap way to do so is to use + * ShmemVariableCache->oldestXid as input. + * + * We should definitely be able to do better. We could e.g. put a + * global lower bound value into ShmemVariableCache. 
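+		 *
+		 * (One hypothetical scheme: have ComputeTransactionHorizons()
+		 * advance, say, an atomic 64-bit lower-bound value in
+		 * ShmemVariableCache, and initialize maybe_needed_bound from that
+		 * instead of from oldestXid.)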
+		 */
+		InvisibleShared.maybe_needed_bound =
+			FullTransactionIdNewer(InvisibleShared.maybe_needed_bound,
+								   oldestfxid);
+		InvisibleCatalog.maybe_needed_bound =
+			FullTransactionIdNewer(InvisibleCatalog.maybe_needed_bound,
+								   oldestfxid);
+		InvisibleData.maybe_needed_bound =
+			FullTransactionIdNewer(InvisibleData.maybe_needed_bound,
+								   oldestfxid);
+	}
 
 	RecentXmin = xmin;
 
@@ -1986,7 +2274,7 @@ GetRunningTransactionData(void)
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 	LWLockAcquire(XidGenLock, LW_SHARED);
 
-	latestCompletedXid = ShmemVariableCache->latestCompletedXid;
+	latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->latestCompletedFullXid);
 	oldestRunningXid =
 		XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
 
@@ -3209,9 +3497,11 @@ XidCacheRemoveRunningXids(TransactionId xid,
 		elog(WARNING, "did not find subXID %u in MyProc", xid);
 
 	/* Also advance global latestCompletedXid while holding the lock */
-	if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid,
+	if (TransactionIdPrecedes(XidFromFullTransactionId(ShmemVariableCache->latestCompletedFullXid),
 							  latestXid))
-		ShmemVariableCache->latestCompletedXid = latestXid;
+		ShmemVariableCache->latestCompletedFullXid =
+			FullXidViaRelative(ShmemVariableCache->latestCompletedFullXid,
+							   latestXid);
 
 	LWLockRelease(ProcArrayLock);
 }
@@ -3238,6 +3528,273 @@ DisplayXidCache(void)
 }
 #endif							/* XIDCACHE_DEBUG */
 
+/*
+ * Initialize a test allowing us to determine whether rows with xids are
+ * still needed by any backend that can access rel.  If rel is NULL, the test
+ * state will be appropriate to test if there's any table in the system that
+ * may still need a row with such an xid.
+ *
+ * This needs to be called while holding a snapshot, otherwise there are
+ * wraparound and other dangers.
+ */
+InvisibleToEveryoneState *
+InvisibleToEveryoneTestInit(Relation rel)
+{
+	bool		need_shared;
+	bool		need_catalog;
+	InvisibleToEveryoneState *state;
+
+	/* cannot safely be used without holding a snapshot */
+	Assert(SnapshotSet());
+
+	if (!rel)
+		need_shared = need_catalog = true;
+	else
+	{
+		/*
+		 * Other relkinds currently don't contain xids, nor do they always
+		 * have the necessary logical decoding markers.
+		 */
+		Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
+			   rel->rd_rel->relkind == RELKIND_MATVIEW ||
+			   rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+
+		need_shared = rel->rd_rel->relisshared || RecoveryInProgress();
+		need_catalog = IsCatalogRelation(rel) || RelationIsAccessibleInLogicalDecoding(rel);
+	}
+
+	if (need_shared)
+		state = &InvisibleShared;
+	else if (need_catalog)
+		state = &InvisibleCatalog;
+	else
+		state = &InvisibleData;
+
+	Assert(FullTransactionIdIsValid(state->definitely_needed_bound) &&
+		   FullTransactionIdIsValid(state->maybe_needed_bound));
+
+	return state;
+}
+
+/*
+ * Return true if it's worth updating the accurate maybe_needed_bound
+ * visibility boundary.
+ *
+ * As it is somewhat expensive to determine xmin horizons, we don't want to
+ * repeatedly do so when there is a low likelihood of it being beneficial.
+ *
+ * The current heuristic is to update at most once per computed snapshot; on
+ * subsequent tests we only recompute if the xmin horizon has changed since
+ * then, which indicates that transactions have completed in the meantime.
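+ *
+ * In pseudo-code, the policy implemented below is (illustrative only):
+ *
+ *		if (horizons never computed in this transaction)
+ *			recompute;
+ *		else if (maybe_needed_bound >= definitely_needed_bound)
+ *			don't recompute (there is no gap left to close);
+ *		else
+ *			recompute iff RecentXmin changed since the last recomputation;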
+ */
+static bool
+InvisibleToEveryoneShouldUpdateHorizons(InvisibleToEveryoneState *state)
+{
+	/* hasn't been computed yet in this transaction */
+	if (!TransactionIdIsValid(ComputedHorizonsLastXmin))
+		return true;
+
+	/*
+	 * If the maybe_needed_bound/definitely_needed_bound boundaries are the
+	 * same, it's unlikely to be beneficial to recompute boundaries.
+	 */
+	if (FullTransactionIdFollowsOrEquals(state->maybe_needed_bound,
+										 state->definitely_needed_bound))
+		return false;
+
+	/* snapshot computation has yielded a different xmin since the last update */
+	return RecentXmin != ComputedHorizonsLastXmin;
+}
+
+/*
+ * Update the boundaries in Invisible{Shared,Catalog,Data} with accurate
+ * values.
+ */
+static void
+InvisibleToEveryoneUpdateHorizons(void)
+{
+	ComputedHorizons horizons;
+
+	ComputeTransactionHorizons(&horizons);
+
+	InvisibleShared.maybe_needed_bound =
+		FullXidViaRelative(horizons.latest_completed,
+						   horizons.shared_oldest_visible);
+	InvisibleCatalog.maybe_needed_bound =
+		FullXidViaRelative(horizons.latest_completed,
+						   horizons.catalog_oldest_visible);
+	InvisibleData.maybe_needed_bound =
+		FullXidViaRelative(horizons.latest_completed,
+						   horizons.data_oldest_visible);
+
+	/*
+	 * In longer-running transactions it's possible that transactions we
+	 * previously needed to treat as running aren't around anymore.  So
+	 * update definitely_needed_bound to not be earlier than
+	 * maybe_needed_bound.
+	 */
+	InvisibleShared.definitely_needed_bound =
+		FullTransactionIdNewer(InvisibleShared.maybe_needed_bound,
+							   InvisibleShared.definitely_needed_bound);
+	InvisibleCatalog.definitely_needed_bound =
+		FullTransactionIdNewer(InvisibleCatalog.maybe_needed_bound,
+							   InvisibleCatalog.definitely_needed_bound);
+	InvisibleData.definitely_needed_bound =
+		FullTransactionIdNewer(InvisibleData.maybe_needed_bound,
+							   InvisibleData.definitely_needed_bound);
+
+	ComputedHorizonsLastXmin = RecentXmin;
+}
+
+/*
+ * Return true if rows that have become invisible at fxid are not visible to
+ * any backend anymore, false otherwise.
+ *
+ * The state passed needs to have been initialized for the relation fxid
+ * originates from (NULL is also OK), otherwise the result may not be correct.
+ */
+bool
+InvisibleToEveryoneTestFullXid(InvisibleToEveryoneState *state, FullTransactionId fxid)
+{
+	/*
+	 * If the xid is older than the maybe_needed_bound, it definitely can be
+	 * removed (even though maybe_needed_bound is approximate, it can only be
+	 * older than the accurate bound).
+	 */
+	if (FullTransactionIdPrecedes(fxid, state->maybe_needed_bound))
+		return true;
+
+	/*
+	 * If the xid is >= the definitely_needed_bound, it can't be removed, and
+	 * updating our horizons would not help (or at least be fairly unlikely
+	 * to).
+	 */
+	if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed_bound))
+		return false;
+
+	/*
+	 * The value is between maybe_needed_bound and definitely_needed_bound,
+	 * i.e. the affected rows may or may not still be visible to some
+	 * backend.  If we haven't already done so, recompute bounds, and
+	 * recheck.
+	 */
+	if (InvisibleToEveryoneShouldUpdateHorizons(state))
+	{
+		InvisibleToEveryoneUpdateHorizons();
+
+		Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed_bound));
+
+		return FullTransactionIdPrecedes(fxid, state->maybe_needed_bound);
+	}
+	else
+		return false;
+}
+
+/*
+ * Wrapper around InvisibleToEveryoneTestFullXid() that accepts 32bit xids.
+ *
+ * It is crucial that this only gets called for xids from a source that
+ * protects against xid wraparounds (e.g. from a table and thus protected by
+ * relfrozenxid).
+ */
+bool
+InvisibleToEveryoneTestXid(InvisibleToEveryoneState *state, TransactionId xid)
+{
+	FullTransactionId fxid;
+
+	/*
+	 * Convert the 32 bit argument to a FullTransactionId.  We can do so
+	 * safely because we know the xid has to, at the very least, be between
+	 * [oldestXid, nextFullXid), i.e. within 2 billion xids.  To avoid taking
+	 * a lock to determine either, we can just compare with
+	 * state->definitely_needed_bound, which was based on those values at the
+	 * time the current snapshot was built.
+	 */
+	fxid = FullXidViaRelative(state->definitely_needed_bound, xid);
+
+	return InvisibleToEveryoneTestFullXid(state, fxid);
+}
+
+/*
+ * Return the FullTransactionId below which rows that have become invisible
+ * are not visible to any backend anymore.
+ *
+ * Note: This is less efficient than testing with
+ * InvisibleToEveryoneTestFullXid because it will require computing an
+ * accurate value, even if all the values compared with the return value
+ * would be determined invisible due to being < state->maybe_needed_bound.
+ */
+FullTransactionId
+InvisibleToEveryoneTestFullCutoff(InvisibleToEveryoneState *state)
+{
+	/* acquire accurate horizon if not already done */
+	if (InvisibleToEveryoneShouldUpdateHorizons(state))
+		InvisibleToEveryoneUpdateHorizons();
+
+	return state->maybe_needed_bound;
+}
+
+/* wrapper around InvisibleToEveryoneTestFullCutoff */
+TransactionId
+InvisibleToEveryoneTestCutoff(InvisibleToEveryoneState *state)
+{
+	return XidFromFullTransactionId(InvisibleToEveryoneTestFullCutoff(state));
+}
+
+/*
+ * Convenience wrapper around InvisibleToEveryoneTestInit() and
+ * InvisibleToEveryoneTestFullXid(), see their comments.
+ */
+bool
+InvisibleToEveryoneCheckFullXid(Relation rel, FullTransactionId fxid)
+{
+	InvisibleToEveryoneState *state;
+
+	state = InvisibleToEveryoneTestInit(rel);
+
+	return InvisibleToEveryoneTestFullXid(state, fxid);
+}
+
+/*
+ * Convenience wrapper around InvisibleToEveryoneTestInit() and
+ * InvisibleToEveryoneTestXid(), see their comments.
+ */
+bool
+InvisibleToEveryoneCheckXid(Relation rel, TransactionId xid)
+{
+	InvisibleToEveryoneState *state;
+
+	state = InvisibleToEveryoneTestInit(rel);
+
+	return InvisibleToEveryoneTestXid(state, xid);
+}
+
+/*
+ * Convert a 32 bit transaction id into a 64 bit transaction id, by assuming
+ * it is within MaxTransactionId / 2 of XidFromFullTransactionId(rel).
+ *
+ * Be very careful about when to use this function.  It can only safely be
+ * used when there is a guarantee that xid is within MaxTransactionId / 2
+ * xids of rel.  That e.g. can be guaranteed if the caller assures a snapshot
+ * is held by the backend and xid is from a table (where vacuum/freezing
+ * ensures the xid has to be within that range), or if xid is from the
+ * procarray, whose interlocks prevent the xid counter from wrapping that far.
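+ *
+ * As a worked example of the arithmetic: with rel = {epoch 5, xid 10} and
+ * xid = 4294967290, (int32) (xid - rel_xid) is -16, so the result is
+ * 5 * 2^32 + 10 - 16 = {epoch 4, xid 4294967290}, i.e. xid is correctly
+ * interpreted as belonging to the previous epoch.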
+ */ +static inline FullTransactionId +FullXidViaRelative(FullTransactionId rel, TransactionId xid) +{ + TransactionId rel_xid = XidFromFullTransactionId(rel); + + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(rel_xid)); + + /* not guaranteed to find issues, but likely to catch mistakes */ + AssertTransactionIdMayBeOnDisk(xid); + + return FullTransactionIdFromU64( + U64FromFullTransactionId(rel) + (int32)(xid - rel_xid)); +} + /* ---------------------------------------------- * KnownAssignedTransactionIds sub-module @@ -3390,9 +3947,7 @@ ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids, KnownAssignedXidsRemoveTree(xid, nsubxids, subxids); /* As in ProcArrayEndTransaction, advance latestCompletedXid */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - max_xid)) - ShmemVariableCache->latestCompletedXid = max_xid; + MaintainLatestCompletedXid(max_xid); LWLockRelease(ProcArrayLock); } diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 4fdcb07d97b..fb94c114a50 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -5591,14 +5591,15 @@ get_actual_variable_endpoint(Relation heapRel, * recent); that case motivates not using SnapshotAny here. * * A crucial point here is that SnapshotNonVacuumable, with - * RecentGlobalXmin as horizon, yields the inverse of the condition that - * the indexscan will use to decide that index entries are killable (see - * heap_hot_search_buffer()). Therefore, if the snapshot rejects a tuple - * (or more precisely, all tuples of a HOT chain) and we have to continue - * scanning past it, we know that the indexscan will mark that index entry - * killed. That means that the next get_actual_variable_endpoint() call - * will not have to re-consider that index entry. In this way we avoid - * repetitive work when this function is used a lot during planning. + * InvisibleToEveryoneTestInit(heapRel) as horizon, yields the inverse of + * the condition that the indexscan will use to decide that index entries + * are killable (see heap_hot_search_buffer()). Therefore, if the + * snapshot rejects a tuple (or more precisely, all tuples of a HOT chain) + * and we have to continue scanning past it, we know that the indexscan + * will mark that index entry killed. That means that the next + * get_actual_variable_endpoint() call will not have to re-consider that + * index entry. In this way we avoid repetitive work when this function + * is used a lot during planning. * * But using SnapshotNonVacuumable creates a hazard of its own. In a * recently-created index, some index entries may point at "broken" HOT @@ -5610,7 +5611,8 @@ get_actual_variable_endpoint(Relation heapRel, * or could even be NULL. We avoid this hazard because we take the data * from the index entry not the heap. */ - InitNonVacuumableSnapshot(SnapshotNonVacuumable, RecentGlobalXmin); + InitNonVacuumableSnapshot(SnapshotNonVacuumable, + InvisibleToEveryoneTestInit(heapRel)); index_scan = index_beginscan(heapRel, indexRel, &SnapshotNonVacuumable, diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index f4247ea70d5..893be2f3ddb 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -722,6 +722,10 @@ InitPostgres(const char *in_dbname, Oid dboid, const char *username, * is critical for anything that reads heap pages, because HOT may decide * to prune them even if the process doesn't attempt to modify any * tuples.) 
+ * + * FIXME: This comment is inaccurate / the code buggy. A snapshot that is + * not pushed/active does not reliably prevent HOT pruning (->xmin could + * e.g. be cleared when cache invalidations are processed). */ if (!bootstrap) { diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index 3b148ae30a6..1182233bf43 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -157,16 +157,9 @@ static Snapshot HistoricSnapshot = NULL; * These are updated by GetSnapshotData. We initialize them this way * for the convenience of TransactionIdIsInProgress: even in bootstrap * mode, we don't want it to say that BootstrapTransactionId is in progress. - * - * RecentGlobalXmin and RecentGlobalDataXmin are initialized to - * InvalidTransactionId, to ensure that no one tries to use a stale - * value. Readers should ensure that it has been set to something else - * before using it. */ TransactionId TransactionXmin = FirstNormalTransactionId; TransactionId RecentXmin = FirstNormalTransactionId; -TransactionId RecentGlobalXmin = InvalidTransactionId; -TransactionId RecentGlobalDataXmin = InvalidTransactionId; /* (table, ctid) => (cmin, cmax) mapping during timetravel */ static HTAB *tuplecid_data = NULL; @@ -583,9 +576,7 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, * Even though we are not going to use the snapshot it computes, we must * call GetSnapshotData, for two reasons: (1) to be sure that * CurrentSnapshotData's XID arrays have been allocated, and (2) to update - * RecentXmin and RecentGlobalXmin. (We could alternatively include those - * two variables in exported snapshot files, but it seems better to have - * snapshot importers compute reasonably up-to-date values for them.) + * the state for InvisibleToEveryone*. */ CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); @@ -977,36 +968,6 @@ xmin_cmp(const pairingheap_node *a, const pairingheap_node *b, void *arg) return 0; } -/* - * Get current RecentGlobalXmin value, as a FullTransactionId. - */ -FullTransactionId -GetFullRecentGlobalXmin(void) -{ - FullTransactionId nextxid_full; - uint32 nextxid_epoch; - TransactionId nextxid_xid; - uint32 epoch; - - Assert(TransactionIdIsNormal(RecentGlobalXmin)); - - /* - * Compute the epoch from the next XID's epoch. This relies on the fact - * that RecentGlobalXmin must be within the 2 billion XID horizon from the - * next XID. 
- */ - nextxid_full = ReadNextFullTransactionId(); - nextxid_epoch = EpochFromFullTransactionId(nextxid_full); - nextxid_xid = XidFromFullTransactionId(nextxid_full); - - if (RecentGlobalXmin > nextxid_xid) - epoch = nextxid_epoch - 1; - else - epoch = nextxid_epoch; - - return FullTransactionIdFromEpochAndXid(epoch, RecentGlobalXmin); -} - /* * SnapshotResetXmin * @@ -1776,106 +1737,151 @@ GetOldSnapshotThresholdTimestamp(void) return threshold_timestamp; } -static void +void SetOldSnapshotThresholdTimestamp(TimestampTz ts, TransactionId xlimit) { SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + Assert(oldSnapshotControl->threshold_timestamp <= ts); + Assert(TransactionIdPrecedesOrEquals(oldSnapshotControl->threshold_xid, xlimit)); oldSnapshotControl->threshold_timestamp = ts; oldSnapshotControl->threshold_xid = xlimit; SpinLockRelease(&oldSnapshotControl->mutex_threshold); } +void +SnapshotTooOldMagicForTest(void) +{ + TimestampTz ts = GetSnapshotCurrentTimestamp(); + + Assert(old_snapshot_threshold == 0); + + ts -= 5 * USECS_PER_SEC; + + SpinLockAcquire(&oldSnapshotControl->mutex_threshold); + oldSnapshotControl->threshold_timestamp = ts; + SpinLockRelease(&oldSnapshotControl->mutex_threshold); +} + +/* + * If there is a valid mapping for the timestamp, set *xlimitp to + * that. Returns whether there is such a mapping. + */ +static bool +GetOldSnapshotFromTimeMapping(TimestampTz ts, TransactionId *xlimitp) +{ + bool in_mapping = false; + + Assert(ts == AlignTimestampToMinuteBoundary(ts)); + + LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED); + + if (oldSnapshotControl->count_used > 0 + && ts >= oldSnapshotControl->head_timestamp) + { + int offset; + + offset = ((ts - oldSnapshotControl->head_timestamp) + / USECS_PER_MINUTE); + if (offset > oldSnapshotControl->count_used - 1) + offset = oldSnapshotControl->count_used - 1; + offset = (oldSnapshotControl->head_offset + offset) + % OLD_SNAPSHOT_TIME_MAP_ENTRIES; + + *xlimitp = oldSnapshotControl->xid_by_minute[offset]; + + in_mapping = true; + } + + LWLockRelease(OldSnapshotTimeMapLock); + + return in_mapping; +} + /* * TransactionIdLimitedForOldSnapshots * - * Apply old snapshot limit, if any. This is intended to be called for page - * pruning and table vacuuming, to allow old_snapshot_threshold to override - * the normal global xmin value. Actual testing for snapshot too old will be - * based on whether a snapshot timestamp is prior to the threshold timestamp - * set in this function. + * Apply old snapshot limit. This is intended to be called for page pruning + * and table vacuuming, to allow old_snapshot_threshold to override the normal + * global xmin value. Actual testing for snapshot too old will be based on + * whether a snapshot timestamp is prior to the threshold timestamp set in + * this function. + * + * If the limited horizon allows a cleanup action that otherwise would not be + * possible, SetOldSnapshotThresholdTimestamp(*limit_ts, *limit_xid) needs to + * be called before that cleanup action. 
 */
-TransactionId
+bool
 TransactionIdLimitedForOldSnapshots(TransactionId recentXmin,
-									Relation relation)
+									Relation relation,
+									TransactionId *limit_xid,
+									TimestampTz *limit_ts)
 {
-	if (TransactionIdIsNormal(recentXmin)
-		&& old_snapshot_threshold >= 0
-		&& RelationAllowsEarlyPruning(relation))
+	TimestampTz ts;
+	TransactionId xlimit = recentXmin;
+	TransactionId latest_xmin;
+	TimestampTz next_map_update_ts;
+	TimestampTz threshold_timestamp;
+	TransactionId threshold_xid;
+
+	Assert(TransactionIdIsNormal(recentXmin));
+	Assert(OldSnapshotThresholdActive());
+	Assert(limit_ts != NULL && limit_xid != NULL);
+
+	if (!RelationAllowsEarlyPruning(relation))
+		return false;
+
+	ts = GetSnapshotCurrentTimestamp();
+
+	SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin);
+	latest_xmin = oldSnapshotControl->latest_xmin;
+	next_map_update_ts = oldSnapshotControl->next_map_update;
+	SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin);
+
+	/*
+	 * Zero threshold always overrides to latest xmin, if valid. Without
+	 * some heuristic it will find its own snapshot too old on, for
+	 * example, a simple UPDATE -- which would make it useless for most
+	 * testing, but there is no principled way to ensure that it doesn't
+	 * fail in this way. Use a five-second delay to try to get useful
+	 * testing behavior, but this may need adjustment.
+	 */
+	if (old_snapshot_threshold == 0)
 	{
-		TimestampTz ts = GetSnapshotCurrentTimestamp();
-		TransactionId xlimit = recentXmin;
-		TransactionId latest_xmin;
-		TimestampTz update_ts;
-		bool same_ts_as_threshold = false;
-
-		SpinLockAcquire(&oldSnapshotControl->mutex_latest_xmin);
-		latest_xmin = oldSnapshotControl->latest_xmin;
-		update_ts = oldSnapshotControl->next_map_update;
-		SpinLockRelease(&oldSnapshotControl->mutex_latest_xmin);
-
-		/*
-		 * Zero threshold always overrides to latest xmin, if valid. Without
-		 * some heuristic it will find its own snapshot too old on, for
-		 * example, a simple UPDATE -- which would make it useless for most
-		 * testing, but there is no principled way to ensure that it doesn't
-		 * fail in this way. Use a five-second delay to try to get useful
-		 * testing behavior, but this may need adjustment.
-		 */
-		if (old_snapshot_threshold == 0)
-		{
-			if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin)
-				&& TransactionIdFollows(latest_xmin, xlimit))
-				xlimit = latest_xmin;
-
-			ts -= 5 * USECS_PER_SEC;
-			SetOldSnapshotThresholdTimestamp(ts, xlimit);
-
-			return xlimit;
-		}
+		if (TransactionIdPrecedes(latest_xmin, MyPgXact->xmin)
+			&& TransactionIdFollows(latest_xmin, xlimit))
+			xlimit = latest_xmin;
 
+		ts -= 5 * USECS_PER_SEC;
+	}
+	else
+	{
 		ts = AlignTimestampToMinuteBoundary(ts)
 			- (old_snapshot_threshold * USECS_PER_MINUTE);
 
 		/* Check for fast exit without LW locking. */
 		SpinLockAcquire(&oldSnapshotControl->mutex_threshold);
-		if (ts == oldSnapshotControl->threshold_timestamp)
-		{
-			xlimit = oldSnapshotControl->threshold_xid;
-			same_ts_as_threshold = true;
-		}
+		threshold_timestamp = oldSnapshotControl->threshold_timestamp;
+		threshold_xid = oldSnapshotControl->threshold_xid;
 		SpinLockRelease(&oldSnapshotControl->mutex_threshold);
 
-		if (!same_ts_as_threshold)
+		if (ts == threshold_timestamp)
+		{
+			/*
+			 * Current timestamp is in the same bucket as the last limit that
+			 * was applied. Reuse.
+			 */
+			xlimit = threshold_xid;
+		}
+		else if (ts == next_map_update_ts)
+		{
+			/*
+			 * FIXME: This branch is super iffy - but that should probably be
+			 * fixed separately. 
+ */ + xlimit = latest_xmin; + } + else if (GetOldSnapshotFromTimeMapping(ts, &xlimit)) { - if (ts == update_ts) - { - xlimit = latest_xmin; - if (NormalTransactionIdFollows(xlimit, recentXmin)) - SetOldSnapshotThresholdTimestamp(ts, xlimit); - } - else - { - LWLockAcquire(OldSnapshotTimeMapLock, LW_SHARED); - - if (oldSnapshotControl->count_used > 0 - && ts >= oldSnapshotControl->head_timestamp) - { - int offset; - - offset = ((ts - oldSnapshotControl->head_timestamp) - / USECS_PER_MINUTE); - if (offset > oldSnapshotControl->count_used - 1) - offset = oldSnapshotControl->count_used - 1; - offset = (oldSnapshotControl->head_offset + offset) - % OLD_SNAPSHOT_TIME_MAP_ENTRIES; - xlimit = oldSnapshotControl->xid_by_minute[offset]; - - if (NormalTransactionIdFollows(xlimit, recentXmin)) - SetOldSnapshotThresholdTimestamp(ts, xlimit); - } - - LWLockRelease(OldSnapshotTimeMapLock); - } } /* @@ -1890,12 +1896,18 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, if (TransactionIdIsNormal(latest_xmin) && TransactionIdPrecedes(latest_xmin, xlimit)) xlimit = latest_xmin; - - if (NormalTransactionIdFollows(xlimit, recentXmin)) - return xlimit; } - return recentXmin; + if (TransactionIdIsValid(xlimit) && + TransactionIdFollowsOrEquals(xlimit, recentXmin)) + { + *limit_ts = ts; + *limit_xid = xlimit; + + return true; + } + + return false; } /* diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 8f43f3e9dfb..b16facad70c 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -413,7 +413,7 @@ bt_check_every_level(Relation rel, Relation heaprel, bool heapkeyspace, /* * This assertion matches the one in index_getnext_tid(). See page - * recycling/RecentGlobalXmin notes in nbtree README. + * recycling/"invisible to everyone" notes in nbtree README. */ Assert(SnapshotSet()); @@ -1437,7 +1437,7 @@ bt_right_page_check_scankey(BtreeCheckState *state) * does not occur until no possible index scan could land on the page. * Index scans can follow links with nothing more than their snapshot as * an interlock and be sure of at least that much. (See page - * recycling/RecentGlobalXmin notes in nbtree README.) + * recycling/"invisible to everyone" notes in nbtree README.) * * Furthermore, it's okay if we follow a rightlink and find a half-dead or * dead (ignorable) page one or more times. There will either be a diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c index 0cd1160ceb2..ee1fb208e07 100644 --- a/contrib/pg_visibility/pg_visibility.c +++ b/contrib/pg_visibility/pg_visibility.c @@ -563,17 +563,14 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD); TransactionId OldestXmin = InvalidTransactionId; - if (all_visible) - { - /* Don't pass rel; that will fail in recovery. */ - OldestXmin = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM); - } - rel = relation_open(relid, AccessShareLock); /* Only some relkinds have a visibility map */ check_relation_relkind(rel); + if (all_visible) + OldestXmin = GetOldestVisibleTransactionId(rel); + nblocks = RelationGetNumberOfBlocks(rel); /* @@ -679,11 +676,12 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen) * From a concurrency point of view, it sort of sucks to * retake ProcArrayLock here while we're holding the buffer * exclusively locked, but it should be safe against - * deadlocks, because surely GetOldestXmin() should never take - * a buffer lock. 
And this shouldn't happen often, so it's - * worth being careful so as to avoid false positives. + * deadlocks, because surely GetOldestVisibleTransactionId() + * should never take a buffer lock. And this shouldn't happen + * often, so it's worth being careful so as to avoid false + * positives. */ - RecomputedOldestXmin = GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM); + RecomputedOldestXmin = GetOldestVisibleTransactionId(rel); if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin)) record_corrupt_item(items, &tuple.t_self); diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index 96d837485fa..b664f95e865 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ b/contrib/pgstattuple/pgstatapprox.c @@ -71,7 +71,7 @@ statapprox_heap(Relation rel, output_type *stat) BufferAccessStrategy bstrategy; TransactionId OldestXmin; - OldestXmin = GetOldestXmin(rel, PROCARRAY_FLAGS_VACUUM); + OldestXmin = GetOldestVisibleTransactionId(rel); bstrategy = GetAccessStrategy(BAS_BULKREAD); nblocks = RelationGetNumberOfBlocks(rel); -- 2.25.0.114.g5b0ca878e0
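Reviewer note, not part of the patch: the two standalone sketches below
illustrate logic touched above. All types, helper names, and values in them
are simplified stand-ins for the real definitions in
src/include/access/transam.h and src/backend/utils/time/snapmgr.c, and the
locking is omitted; they are sketches under those assumptions, not the
patch's code.

The first sketch shows the idea behind FullXidViaRelative(): the unsigned
32 bit difference between the xid and a "nearby" FullTransactionId is cast
to a signed value before being widened, so xids up to roughly two billion on
either side of the reference are mapped into the correct epoch.

#include <assert.h>
#include <stdint.h>

typedef uint32_t TransactionId;
typedef struct { uint64_t value; } FullTransactionId;

static FullTransactionId
FullXidViaRelative(FullTransactionId rel, TransactionId xid)
{
	/* the signed difference sign-extends, selecting the right epoch */
	FullTransactionId result;

	result.value = rel.value + (int32_t) (xid - (TransactionId) rel.value);
	return result;
}

int
main(void)
{
	/* reference point: epoch 2, xid 100 */
	FullTransactionId rel = {(2ULL << 32) + 100};

	/* an xid numerically behind rel resolves to the previous epoch */
	assert(FullXidViaRelative(rel, 0xFFFFFFF0).value == (2ULL << 32) - 16);
	/* an xid slightly ahead of rel stays in the current epoch */
	assert(FullXidViaRelative(rel, 200).value == (2ULL << 32) + 200);
	return 0;
}

The second sketch mimics the bucket arithmetic of
GetOldSnapshotFromTimeMapping(): the time map is a circular array holding
one xmin per minute; the minute-aligned timestamp is turned into an offset
from the head entry, clamped to the newest valid entry, and wrapped around
the array. The four-entry map and its contents are invented for the
example.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define OLD_SNAPSHOT_TIME_MAP_ENTRIES 4 /* tiny map, for illustration */
#define USECS_PER_MINUTE 60000000LL

typedef int64_t TimestampTz;
typedef uint32_t TransactionId;

static struct
{
	int			head_offset;	/* array slot holding the oldest minute */
	TimestampTz head_timestamp; /* minute covered by that slot */
	int			count_used;		/* number of valid slots */
	TransactionId xid_by_minute[OLD_SNAPSHOT_TIME_MAP_ENTRIES];
} map = {2, 0, 4, {130, 140, 110, 120}};	/* circular: head is slot 2 */

static bool
lookup(TimestampTz ts, TransactionId *xlimitp)
{
	if (map.count_used > 0 && ts >= map.head_timestamp)
	{
		int			offset = (int) ((ts - map.head_timestamp) / USECS_PER_MINUTE);

		if (offset > map.count_used - 1)
			offset = map.count_used - 1;	/* clamp to the newest minute */
		offset = (map.head_offset + offset) % OLD_SNAPSHOT_TIME_MAP_ENTRIES;
		*xlimitp = map.xid_by_minute[offset];
		return true;
	}
	return false;
}

int
main(void)
{
	TransactionId xlimit;

	/* one minute past the head maps to slot (2 + 1) % 4 = 3, i.e. xid 120 */
	if (lookup(1 * USECS_PER_MINUTE, &xlimit))
		printf("xlimit = %u\n", xlimit);
	return 0;
}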