From b2aeed6d4b83b46e818423e674b182256843ccf4 Mon Sep 17 00:00:00 2001 From: Melanie Plageman Date: Tue, 18 Jun 2024 16:06:48 -0400 Subject: [PATCH v1 2/2] Fix infinite loop in vacuum caused by visibility mismatch lazy_scan_prune() may loop infinitely if a tuple deemed non-removable by heap_prune_satisfies_vacuum() in heap_page_prune() is later deemed HEAPTUPLE_DEAD in lazy_scan_prune() by HeapTupleSatisfiesVacuum(). Both determine what transaction ID a tuple would be dead_after using HeapTupleSatisfiesVacuumHorizon() and then compare that transaction ID to a horizon. HeapTupleSatisfiesVacuum() uses OldestXmin from VacuumCutoffs as that horizon and heap_prune_satisfies_vacuum() uses GlobalVisTestIsRemovableXid() -- which checks the GlobalVisState that was passed in to heap_page_prune(). OldestXmin is determined at the start of vacuuming the relation, but GlobalVisState's maybe_needed value can change if a tuple's xmax falls within GlobalVisState's fuzzy window and vacuum has had to take a new snapshot since it started vacuuming the relation. In rare cases, a backend's GlobalVisState->maybe_needed may move backward -- for example if a standby with an older horizon which was not connected at the start of the vacuum later connects. When this happens, a tuple with an xmax older than OldestXmin but newer than the new value of maybe_needed after GlobalVisState was updated could cause lazy_scan_prune() to loop infinitely. By only computing the tuple's HTSV_Result once, in heap_page_prune(), and saving it to be passed back to lazy_scan_prune(), we can avoid the problem altogether. This also happens to be cheaper, since HeapTupleSatisfiesVacuumHorizon() accesses the clog. As a consequence, OldestXmin is now only used for determining whether or not to freeze tuples and whether or not the whole page is visible -- not for determining whether or not tuples are removable. 
--- src/backend/access/heap/pruneheap.c | 47 +++++++++++++++----------- src/backend/access/heap/vacuumlazy.c | 50 ++++++++++------------------ src/include/access/heapam.h | 15 +++++++++ 3 files changed, 59 insertions(+), 53 deletions(-) diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 47b9e209154..404cdd13080 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -65,16 +65,6 @@ typedef struct * 1. Otherwise every access would need to subtract 1. */ bool marked[MaxHeapTuplesPerPage + 1]; - - /* - * Tuple visibility is only computed once for each tuple, for correctness - * and efficiency reasons; see comment in heap_page_prune() for details. - * This is of type int8[], instead of HTSV_Result[], so we can use -1 to - * indicate no visibility has been computed, e.g. for LP_DEAD items. - * - * Same indexing as ->marked. - */ - int8 htsv[MaxHeapTuplesPerPage + 1]; } PruneState; /* Local functions */ @@ -83,7 +73,7 @@ static HTSV_Result heap_prune_satisfies_vacuum(PruneState *prstate, Buffer buffer); static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, - PruneState *prstate); + PruneState *prstate, int8 *htsv); static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); static void heap_prune_record_redirect(PruneState *prstate, OffsetNumber offnum, OffsetNumber rdoffnum); @@ -204,9 +194,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer) { int ndeleted, nnewlpdead; + int8 htsv[MaxHeapTuplesPerPage + 1]; ndeleted = heap_page_prune(relation, buffer, vistest, limited_xmin, - limited_ts, &nnewlpdead, NULL); + limited_ts, htsv, &nnewlpdead, NULL); /* * Report the number of tuples reclaimed to pgstats. This is @@ -253,6 +244,18 @@ heap_page_prune_opt(Relation relation, Buffer buffer) * either have been set by TransactionIdLimitedForOldSnapshots, or * InvalidTransactionId/0 respectively. 
* + * htsv is an in/out parameter filled in by heap_page_prune() with tuple + * visibility, computed once for each tuple for correctness and efficiency + * reasons. Some callers want access to htsv for later decisions and metrics. + * htsv is of type int8[], instead of HTSV_Result[], so we can use -1 to + * indicate no visibility has been computed, e.g. for LP_DEAD items. It can be + * passed in uninitialized because all spots will be set either to a valid + * HTSV_Result value or -1. + * + * It is the caller's responsibility to ensure that htsv is + * MaxHeapTuplesPerPage + 1 in size. By making it MaxHeapTuplesPerPage + 1, we + * can avoid subtracting 1 everywhere even though FirstOffsetNumber is 1. + * * Sets *nnewlpdead for caller, indicating the number of items that were * newly set LP_DEAD during prune operation. * @@ -266,6 +269,7 @@ heap_page_prune(Relation relation, Buffer buffer, GlobalVisState *vistest, TransactionId old_snap_xmin, TimestampTz old_snap_ts, + int8 htsv[MaxHeapTuplesPerPage + 1], int *nnewlpdead, OffsetNumber *off_loc) { @@ -331,7 +335,7 @@ heap_page_prune(Relation relation, Buffer buffer, /* Nothing to do if slot doesn't contain a tuple */ if (!ItemIdIsNormal(itemid)) { - prstate.htsv[offnum] = -1; + htsv[offnum] = -1; continue; } @@ -347,7 +351,7 @@ heap_page_prune(Relation relation, Buffer buffer, if (off_loc) *off_loc = offnum; - prstate.htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, + htsv[offnum] = heap_prune_satisfies_vacuum(&prstate, &tup, buffer); } @@ -372,7 +376,7 @@ heap_page_prune(Relation relation, Buffer buffer, continue; /* Process this item or chain of items */ - ndeleted += heap_prune_chain(buffer, offnum, &prstate); + ndeleted += heap_prune_chain(buffer, offnum, &prstate, htsv); } /* Clear the offset information once we have processed the given page. 
*/ @@ -561,6 +565,8 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) /* * Prune specified line pointer or a HOT chain originating at line pointer. * + * Tuple visibility information is provided in htsv. + * * If the item is an index-referenced tuple (i.e. not a heap-only tuple), * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. @@ -588,7 +594,8 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) * Returns the number of tuples (to be) deleted from the page. */ static int -heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) +heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate, + int8 *htsv) { int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); @@ -609,7 +616,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) */ if (ItemIdIsNormal(rootlp)) { - Assert(prstate->htsv[rootoffnum] != -1); + Assert(htsv[rootoffnum] != -1); htup = (HeapTupleHeader) PageGetItem(dp, rootlp); if (HeapTupleHeaderIsHeapOnly(htup)) @@ -632,7 +639,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) * either here or while following a chain below. Whichever path * gets there first will mark the tuple unused. 
*/ - if (prstate->htsv[rootoffnum] == HEAPTUPLE_DEAD && + if (htsv[rootoffnum] == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) { heap_prune_record_unused(prstate, rootoffnum); @@ -700,7 +707,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; Assert(ItemIdIsNormal(lp)); - Assert(prstate->htsv[offnum] != -1); + Assert(htsv[offnum] != -1); htup = (HeapTupleHeader) PageGetItem(dp, lp); /* @@ -720,7 +727,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) */ tupdead = recent_dead = false; - switch ((HTSV_Result) prstate->htsv[offnum]) + switch ((HTSV_Result) htsv[offnum]) { case HEAPTUPLE_DEAD: tupdead = true; diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c index c474b061146..2e2af40b5e0 100644 --- a/src/backend/access/heap/vacuumlazy.c +++ b/src/backend/access/heap/vacuumlazy.c @@ -1518,12 +1518,14 @@ lazy_scan_new_or_empty(LVRelState *vacrel, Buffer buf, BlockNumber blkno, * of complexity just so we could deal with tuples that were DEAD to VACUUM, * but nevertheless were left with storage after pruning. * - * The approach we take now is to restart pruning when the race condition is - * detected. This allows heap_page_prune() to prune the tuples inserted by - * the now-aborted transaction. This is a little crude, but it guarantees - * that any items that make it into the dead_items array are simple LP_DEAD - * line pointers, and that every remaining item with tuple storage is - * considered as a candidate for freezing. + * As of this change, we circumvent this problem altogether by reusing + * the result of heap_page_prune()'s visibility check. Without the second call + * to HeapTupleSatisfiesVacuum(), there is no new HTSV_Result and there can be + * no disagreement. We'll just handle such tuples as if they had become fully + * dead right after this operation completes instead of in the middle of it. 
+ * Note that any tuple that becomes dead after the call to heap_page_prune() + * can't need to be frozen, because it was visible to another session when + * vacuum started. */ static void lazy_scan_prune(LVRelState *vacrel, @@ -1536,8 +1538,6 @@ lazy_scan_prune(LVRelState *vacrel, OffsetNumber offnum, maxoff; ItemId itemid; - HeapTupleData tuple; - HTSV_Result res; int tuples_deleted, tuples_frozen, lpdead_items, @@ -1548,6 +1548,7 @@ lazy_scan_prune(LVRelState *vacrel, int64 fpi_before = pgWalUsage.wal_fpi; OffsetNumber deadoffsets[MaxHeapTuplesPerPage]; HeapTupleFreeze frozen[MaxHeapTuplesPerPage]; + int8 htsv[MaxHeapTuplesPerPage + 1]; Assert(BufferGetBlockNumber(buf) == blkno); @@ -1558,8 +1559,6 @@ lazy_scan_prune(LVRelState *vacrel, */ maxoff = PageGetMaxOffsetNumber(page); -retry: - /* Initialize (or reset) page-level state */ pagefrz.freeze_required = false; pagefrz.FreezePageRelfrozenXid = vacrel->NewRelfrozenXid; @@ -1582,7 +1581,7 @@ retry: * that were deleted from indexes. */ tuples_deleted = heap_page_prune(rel, buf, vacrel->vistest, - InvalidTransactionId, 0, &nnewlpdead, + InvalidTransactionId, 0, htsv, &nnewlpdead, &vacrel->offnum); /* @@ -1599,6 +1598,7 @@ retry: offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { + HeapTupleHeader htup; bool totally_frozen; /* @@ -1641,22 +1641,7 @@ retry: Assert(ItemIdIsNormal(itemid)); - ItemPointerSet(&(tuple.t_self), blkno, offnum); - tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); - tuple.t_len = ItemIdGetLength(itemid); - tuple.t_tableOid = RelationGetRelid(rel); - - /* - * DEAD tuples are almost always pruned into LP_DEAD line pointers by - * heap_page_prune(), but it's possible that the tuple state changed - * since heap_page_prune() looked. Handle that here by restarting. - * (See comments at the top of function for a full explanation.) 
- */ - res = HeapTupleSatisfiesVacuum(&tuple, vacrel->cutoffs.OldestXmin, - buf); - - if (unlikely(res == HEAPTUPLE_DEAD)) - goto retry; + htup = (HeapTupleHeader) PageGetItem(page, itemid); /* * The criteria for counting a tuple as live in this block need to @@ -1677,7 +1662,7 @@ retry: * (Cases where we bypass index vacuuming will violate this optimistic * assumption, but the overall impact of that should be negligible.) */ - switch (res) + switch (htsv_get_valid_status(htsv[offnum])) { case HEAPTUPLE_LIVE: @@ -1699,7 +1684,7 @@ retry: { TransactionId xmin; - if (!HeapTupleHeaderXminCommitted(tuple.t_data)) + if (!HeapTupleHeaderXminCommitted(htup)) { prunestate->all_visible = false; break; @@ -1709,9 +1694,8 @@ retry: * The inserter definitely committed. But is it old enough * that everyone sees it as committed? */ - xmin = HeapTupleHeaderGetXmin(tuple.t_data); - if (!TransactionIdPrecedes(xmin, - vacrel->cutoffs.OldestXmin)) + xmin = HeapTupleHeaderGetXmin(htup); + if (!TransactionIdPrecedes(xmin, vacrel->cutoffs.OldestXmin)) { prunestate->all_visible = false; break; @@ -1763,7 +1747,7 @@ retry: prunestate->hastup = true; /* page makes rel truncation unsafe */ /* Tuple with storage -- consider need to freeze */ - if (heap_prepare_freeze_tuple(tuple.t_data, &vacrel->cutoffs, &pagefrz, + if (heap_prepare_freeze_tuple(htup, &vacrel->cutoffs, &pagefrz, &frozen[tuples_frozen], &totally_frozen)) { /* Save prepared freeze plan for later */ diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index faf50265191..c0d0592163b 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -288,6 +288,7 @@ extern int heap_page_prune(Relation relation, Buffer buffer, struct GlobalVisState *vistest, TransactionId old_snap_xmin, TimestampTz old_snap_ts, + int8 htsv[MaxHeapTuplesPerPage + 1], int *nnewlpdead, OffsetNumber *off_loc); extern void heap_page_prune_execute(Buffer buffer, @@ -329,4 +330,18 @@ extern bool 
ResolveCminCmaxDuringDecoding(struct HTAB *tuplecid_data, extern void HeapCheckForSerializableConflictOut(bool visible, Relation relation, HeapTuple tuple, Buffer buffer, Snapshot snapshot); +/* + * Pruning calculates tuple visibility once and saves the results in an array + * of int8; see the htsv parameter of heap_page_prune(). This helper function + * is meant to guard against examining visibility status array members which + * have not yet been computed. + */ +static inline HTSV_Result +htsv_get_valid_status(int status) +{ + Assert(status >= HEAPTUPLE_DEAD && + status <= HEAPTUPLE_DELETE_IN_PROGRESS); + return (HTSV_Result) status; +} + #endif /* HEAPAM_H */ -- 2.34.1