From 3f7fcf89c81eb38982dcac3ada038e6f3a539090 Mon Sep 17 00:00:00 2001 From: Peter Geoghegan Date: Mon, 8 Nov 2021 22:58:14 -0800 Subject: [PATCH v4] Fix aborted HOT update bug in heap pruning. Author: Peter Geoghegan Reported-By: Alexander Lakhin Diagnosed-By: Andres Freund Reviewed-By: Andres Freund Bug: #17255 Discussion: https://postgr.es/m/17255-14c0ac58d0f9b583@postgresql.org Backpatch: 14, where snapshot scalability improvements first appear --- src/backend/access/heap/pruneheap.c | 361 +++++++++++++++++----------- 1 file changed, 218 insertions(+), 143 deletions(-) diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 50ed76198..0bbef4647 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -31,6 +31,7 @@ typedef struct { Relation rel; + BlockNumber targetblkno; /* tuple visibility test, initialized for the relation */ GlobalVisState *vistest; @@ -56,14 +57,16 @@ typedef struct OffsetNumber redirected[MaxHeapTuplesPerPage * 2]; OffsetNumber nowdead[MaxHeapTuplesPerPage]; OffsetNumber nowunused[MaxHeapTuplesPerPage]; - /* marked[i] is true if item i is entered in one of the above arrays */ - bool marked[MaxHeapTuplesPerPage + 1]; + /* visited[i] is true if item i is from a known valid HOT chain */ + bool visited[MaxHeapTuplesPerPage + 1]; } PruneState; /* Local functions */ static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate); +static int heap_prune_disconnected(Buffer buffer, OffsetNumber offnum, + PruneState *prstate); static void heap_prune_record_prunable(PruneState *prstate, TransactionId xid); static void heap_prune_record_redirect(PruneState *prstate, OffsetNumber offnum, OffsetNumber rdoffnum); @@ -249,24 +252,29 @@ heap_page_prune(Relation relation, Buffer buffer, */ prstate.new_prune_xid = InvalidTransactionId; prstate.rel = relation; + prstate.targetblkno = BufferGetBlockNumber(buffer); prstate.vistest = vistest; prstate.old_snap_xmin = old_snap_xmin; prstate.old_snap_ts = old_snap_ts; prstate.old_snap_used = false; prstate.latestRemovedXid = InvalidTransactionId; prstate.nredirected = prstate.ndead = prstate.nunused = 0; - memset(prstate.marked, 0, sizeof(prstate.marked)); + memset(prstate.visited, 0, sizeof(prstate.visited)); - /* Scan the page */ + /* + * Scan the page to try to visit all items. + * + * It's possible that a few heap-only tuples will not get visited during + * our initial scan over the page. This happens when the tuples cannot be + * located by following a valid HOT chain. + */ maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { - ItemId itemid; - - /* Ignore items already processed as part of an earlier chain */ - if (prstate.marked[offnum]) + /* Ignore items already visited as part of an earlier chain */ + if (prstate.visited[offnum]) continue; /* @@ -276,15 +284,33 @@ heap_page_prune(Relation relation, Buffer buffer, if (off_loc) *off_loc = offnum; - /* Nothing to do if slot is empty or already dead */ - itemid = PageGetItemId(page, offnum); - if (!ItemIdIsUsed(itemid) || ItemIdIsDead(itemid)) - continue; - /* Process this item or chain of items */ ndeleted += heap_prune_chain(buffer, offnum, &prstate); } + /* + * Scan the page again, processing heap-only tuples missed by first scan. + * + * These disconnected heap-only tuples (which always originate in aborted + * transactions) are always considered DEAD (and always become LP_UNUSED). + * Delaying processing of these tuples until our second pass allows our + * first pass to cleanly process entire HOT chains as whole units. + */ + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + /* Ignore items already visited (only want disconnected tuples) */ + if (prstate.visited[offnum]) + continue; + + if (off_loc) + *off_loc = offnum; + + /* Process this disconnected heap-only tuple */ + ndeleted += heap_prune_disconnected(buffer, offnum, &prstate); + } + /* Clear the offset information once we have processed the given page. */ if (off_loc) *off_loc = InvalidOffsetNumber; @@ -473,20 +499,21 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) /* - * Prune specified line pointer or a HOT chain originating at line pointer. + * Prune specified HOT chain originating at line pointer, or simple tuple. * - * If the item is an index-referenced tuple (i.e. not a heap-only tuple), - * the HOT chain is pruned by removing all DEAD tuples at the start of the HOT - * chain. We also prune any RECENTLY_DEAD tuples preceding a DEAD tuple. - * This is OK because a RECENTLY_DEAD tuple preceding a DEAD tuple is really - * DEAD, the heap_prune_satisfies_vacuum test is just too coarse to detect it. + * Used during initial pass over the heap page. Won't process any heap-only + * tuples that cannot be found by traversing a HOT chain whose root item is at + * offset rootoffnum. Remaining "disconnected" heap-only tuples are dealt + * with in caller's second pass over the page instead. This is how we make + * sure that no DEAD tuples (or whole HOT chains) are missed during pruning. * * In general, pruning must never leave behind a DEAD tuple that still has * tuple storage. VACUUM isn't prepared to deal with that case. That's why * VACUUM prunes the same heap page a second time (without dropping its lock * in the interim) when it sees a newly DEAD tuple that we initially saw as - * in-progress. Retrying pruning like this can only happen when an inserting - * transaction concurrently aborts. + * in-progress. Retrying pruning like this can only happen due to certain + * edge-cases, like the case where an inserting transaction concurrently + * aborts. * * The root line pointer is redirected to the tuple immediately after the * latest DEAD tuple. If all tuples in the chain are DEAD, the root line @@ -508,77 +535,59 @@ heap_prune_satisfies_vacuum(PruneState *prstate, HeapTuple tup, Buffer buffer) static int heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) { - int ndeleted = 0; Page dp = (Page) BufferGetPage(buffer); TransactionId priorXmax = InvalidTransactionId; ItemId rootlp; - HeapTupleHeader htup; OffsetNumber latestdead = InvalidOffsetNumber, maxoff = PageGetMaxOffsetNumber(dp), offnum; + bool pastlatestdead = false; OffsetNumber chainitems[MaxHeapTuplesPerPage]; - int nchain = 0, - i; - HeapTupleData tup; + int nchain; - tup.t_tableOid = RelationGetRelid(prstate->rel); + Assert(!prstate->visited[rootoffnum]); rootlp = PageGetItemId(dp, rootoffnum); - /* - * If it's a heap-only tuple, then it is not the start of a HOT chain. - */ if (ItemIdIsNormal(rootlp)) { - htup = (HeapTupleHeader) PageGetItem(dp, rootlp); - - tup.t_data = htup; - tup.t_len = ItemIdGetLength(rootlp); - ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), rootoffnum); + HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(dp, rootlp); + /* + * If it's a heap-only tuple, then it is not the start of a HOT chain. + * We'll process it later, either by traversing its HOT chain + * (starting from the root item), or in heap_prune_disconnected() call + * made during our second pass over page. + */ if (HeapTupleHeaderIsHeapOnly(htup)) - { - /* - * If the tuple is DEAD and doesn't chain to anything else, mark - * it unused immediately. (If it does chain, we can only remove - * it as part of pruning its chain.) - * - * We need this primarily to handle aborted HOT updates, that is, - * XMIN_INVALID heap-only tuples. Those might not be linked to by - * any chain, since the parent tuple might be re-updated before - * any pruning occurs. So we have to be able to reap them - * separately from chain-pruning. (Note that - * HeapTupleHeaderIsHotUpdated will never return true for an - * XMIN_INVALID tuple, so this code will work even when there were - * sequential updates within the aborted transaction.) - * - * Note that we might first arrive at a dead heap-only tuple - * either here or while following a chain below. Whichever path - * gets there first will mark the tuple unused. - */ - if (heap_prune_satisfies_vacuum(prstate, &tup, buffer) - == HEAPTUPLE_DEAD && !HeapTupleHeaderIsHotUpdated(htup)) - { - heap_prune_record_unused(prstate, rootoffnum); - HeapTupleHeaderAdvanceLatestRemovedXid(htup, - &prstate->latestRemovedXid); - ndeleted++; - } + return 0; + } + else if (!ItemIdIsRedirected(rootlp)) + { + /* + * Nothing to do if slot cannot possibly be valid root item of HOT + * chain or a simple heap tuple + */ + Assert(ItemIdIsDead(rootlp) || !ItemIdIsUsed(rootlp)); + prstate->visited[rootoffnum] = true; - /* Nothing more to do */ - return ndeleted; - } + return 0; } - /* Start from the root tuple */ + /* + * Start from the root item. Mark it as valid up front, since root items + * are always processed up front, in first pass over page. + */ + prstate->visited[rootoffnum] = true; offnum = rootoffnum; + nchain = 0; /* while not end of the chain */ for (;;) { ItemId lp; - bool tupdead, - recent_dead; + HeapTupleHeader htup; + HeapTupleData tup; /* Sanity check (pure paranoia) */ if (offnum < FirstOffsetNumber) @@ -592,15 +601,11 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; /* If item is already processed, stop --- it must not be same chain */ - if (prstate->marked[offnum]) + if (nchain != 0 && prstate->visited[offnum]) break; lp = PageGetItemId(dp, offnum); - /* Unused item obviously isn't part of the chain */ - if (!ItemIdIsUsed(lp)) - break; - /* * If we are looking at the redirected root line pointer, jump to the * first normal tuple in the chain. If we find a redirect somewhere @@ -615,20 +620,24 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) continue; } - /* - * Likewise, a dead line pointer can't be part of the chain. (We - * already eliminated the case of dead root tuple outside this - * function.) - */ - if (ItemIdIsDead(lp)) + /* LP_UNUSED or LP_DEAD items obviously not part of the chain */ + if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) + { + /* + * Can consider LP_UNUSED/LP_DEAD items visited when we arrive + * here by following a heap-only tuple's t_ctid link + */ + prstate->visited[offnum] = true; break; + } Assert(ItemIdIsNormal(lp)); htup = (HeapTupleHeader) PageGetItem(dp, lp); - tup.t_data = htup; tup.t_len = ItemIdGetLength(lp); - ItemPointerSet(&(tup.t_self), BufferGetBlockNumber(buffer), offnum); + tup.t_tableOid = RelationGetRelid(prstate->rel); + tup.t_data = htup; + ItemPointerSet(&(tup.t_self), prstate->targetblkno, offnum); /* * Check the tuple XMIN against prior XMAX, if any @@ -638,33 +647,46 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; /* - * OK, this tuple is indeed a member of the chain. + * OK, this tuple is indeed a member of the chain. Stick with this + * interpretation for the entire prune operation by marking the item + * as visited now. */ + Assert((nchain == 0 && ItemIdIsNormal(rootlp)) || + HeapTupleHeaderIsHeapOnly(htup)); chainitems[nchain++] = offnum; + prstate->visited[offnum] = true; /* * Check tuple's visibility status. */ - tupdead = recent_dead = false; - switch (heap_prune_satisfies_vacuum(prstate, &tup, buffer)) { case HEAPTUPLE_DEAD: - tupdead = true; - break; - - case HEAPTUPLE_RECENTLY_DEAD: - recent_dead = true; /* - * This tuple may soon become DEAD. Update the hint field so - * that the page is reconsidered for pruning in future. + * Remember the offnum of the last DEAD tuple in this HOT + * chain. To keep things simple, don't treat heap-only tuples + * from a HOT chain as DEAD unless they're only preceded by + * other DEAD tuples (in addition to actually being DEAD). + * Remaining tuples that appear DEAD (but don't get treated as + * such by us) are from concurrently aborting updaters. + * + * VACUUM will ask us to prune the heap page a second time + * when it sees that there is a DEAD tuple left behind, but + * that would be necessary regardless of our approach here. */ - heap_prune_record_prunable(prstate, - HeapTupleHeaderGetUpdateXid(htup)); - break; + if (!pastlatestdead) + { + latestdead = offnum; + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + break; + } + /* FALL THRU */ + case HEAPTUPLE_RECENTLY_DEAD: case HEAPTUPLE_DELETE_IN_PROGRESS: + pastlatestdead = true; /* no further DEAD tuples in CHAIN */ /* * This tuple may soon become DEAD. Update the hint field so @@ -676,6 +698,7 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) case HEAPTUPLE_LIVE: case HEAPTUPLE_INSERT_IN_PROGRESS: + pastlatestdead = true; /* no further DEAD tuples in CHAIN */ /* * If we wanted to optimize for aborts, we might consider @@ -690,25 +713,12 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) break; } - /* - * Remember the last DEAD tuple seen. We will advance past - * RECENTLY_DEAD tuples just in case there's a DEAD one after them; - * but we can't advance past anything else. We have to make sure that - * we don't miss any DEAD tuples, since DEAD tuples that still have - * tuple storage after pruning will confuse VACUUM. - */ - if (tupdead) - { - latestdead = offnum; - HeapTupleHeaderAdvanceLatestRemovedXid(htup, - &prstate->latestRemovedXid); - } - else if (!recent_dead) - break; - /* * If the tuple is not HOT-updated, then we are at the end of this - * HOT-update chain. + * HOT-update chain. There might actually be more tuples that were + * considered part of the same HOT chain in the past, before the + * updater's xact aborted. We'll process any such tuples later on + * instead, inside heap_prune_disconnected(). */ if (!HeapTupleHeaderIsHotUpdated(htup)) break; @@ -720,23 +730,35 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) * Advance to next chain member. */ Assert(ItemPointerGetBlockNumber(&htup->t_ctid) == - BufferGetBlockNumber(buffer)); + prstate->targetblkno); offnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetUpdateXid(htup); } /* - * If we found a DEAD tuple in the chain, adjust the HOT chain so that all - * the DEAD tuples at the start of the chain are removed and the root line - * pointer is appropriately redirected. + * Should never find an LP_REDIRECT root item that didn't already point to + * a valid item. While a heap-only tuple's t_ctid link can legitimately + * point to almost anything, the rules for LP_REDIRECT links are far + * stricter: LP_REDIRECTs must _always_ point to a valid heap-only tuple. */ + Assert(ItemIdIsNormal(rootlp) || + (ItemIdIsRedirected(rootlp) && nchain >= 2)); + if (OffsetNumberIsValid(latestdead)) { + int i, + ndeleted = 0; + /* - * Mark as unused each intermediate item that we are able to remove - * from the chain. + * Okay, at least one tuple from the chain (or the single plain heap + * tuple) is considered DEAD. Record what to do with items in the + * chain now. * - * When the previous item is the last dead tuple seen, we are at the + * First deal with the non-root items from HOT chain. Mark earlier + * items we consider DEAD as LP_UNUSED (since they're heap-only + * tuples). + * + * When the previous item is the last DEAD tuple seen, we are at the * right candidate for redirection. */ for (i = 1; (i < nchain) && (chainitems[i - 1] != latestdead); i++) @@ -746,36 +768,97 @@ heap_prune_chain(Buffer buffer, OffsetNumber rootoffnum, PruneState *prstate) } /* - * If the root entry had been a normal tuple, we are deleting it, so - * count it in the result. But changing a redirect (even to DEAD - * state) doesn't count. + * If the root item is a normal tuple, we are logically deleting it, + * so count it in the result. But changing an LP_REDIRECT (even to + * make it LP_DEAD) doesn't get counted in ndeleted -- that would + * amount to double-counting DEAD tuples (with tuple storage) in + * ndeleted. */ if (ItemIdIsNormal(rootlp)) ndeleted++; /* + * Finally, consider what to do with the root item itself. + * * If the DEAD tuple is at the end of the chain, the entire chain is - * dead and the root line pointer can be marked dead. Otherwise just - * redirect the root to the correct chain member. + * considered DEAD. The root item must therefore become LP_DEAD. + * Otherwise just redirect the root to the correct chain member. */ if (i >= nchain) heap_prune_record_dead(prstate, rootoffnum); else heap_prune_record_redirect(prstate, rootoffnum, chainitems[i]); - } - else if (nchain < 2 && ItemIdIsRedirected(rootlp)) - { - /* - * We found a redirect item that doesn't point to a valid follow-on - * item. This can happen if the loop in heap_page_prune caused us to - * visit the dead successor of a redirect item before visiting the - * redirect item. We can clean up by setting the redirect item to - * DEAD state. - */ - heap_prune_record_dead(prstate, rootoffnum); + + return ndeleted; } - return ndeleted; + return 0; +} + +/* + * Handle disconnected heap-only tuples during second pass over page. We + * always expect to process these tuples as DEAD tuples here. Since they're + * heap-only tuples it follows that they'll always be set LP_UNUSED. + * + * This is how we handle aborted heap-only tuples that were not visited in our + * first pass (via HOT chain traversal with the usual cross-checks). These + * tuples occur when a parent tuple is updated, the updater aborts, and some + * unrelated updater re-updates the original parent tuple a second time. The + * parent's t_ctid link won't continue to point to the aborted tuple. (Even + * when it does, we won't consider the parent to have been HOT updated, just + * because its XMAX aborted -- so we still end up here for the aborted tuple). + * + * Like heap_prune_chain, we don't actually change the page here. + * + * Returns the number of tuples (to be) deleted from the page, though this + * should always be 1 in practice. +*/ +static int +heap_prune_disconnected(Buffer buffer, OffsetNumber offnum, + PruneState *prstate) +{ + Page dp = (Page) BufferGetPage(buffer); + ItemId lp; + HeapTupleHeader htup; + HeapTupleData tup; + HTSV_Result res; + + lp = PageGetItemId(dp, offnum); + Assert(ItemIdIsNormal(lp)); + htup = (HeapTupleHeader) PageGetItem(dp, lp); + + /* + * Caller must make sure that the tuple at 'offnum' is in fact a heap-only + * tuple that is disconnected from its HOT chain + */ + Assert(!prstate->visited[offnum]); + Assert(HeapTupleHeaderIsHeapOnly(htup)); + + /* + * We expect that disconnected heap-only tuples must be from aborted + * transactions. They must already be DEAD, or something is amiss. + */ + tup.t_len = ItemIdGetLength(lp); + tup.t_tableOid = RelationGetRelid(prstate->rel); + tup.t_data = htup; + ItemPointerSet(&(tup.t_self), prstate->targetblkno, offnum); + res = heap_prune_satisfies_vacuum(prstate, &tup, buffer); + if (res == HEAPTUPLE_DEAD) + { + heap_prune_record_unused(prstate, offnum); + + /* Unnecessary, but be conservative here */ + HeapTupleHeaderAdvanceLatestRemovedXid(htup, + &prstate->latestRemovedXid); + return 1; + } + + /* + * Should always be DEAD. A DEAD heap-only tuple is always counted in + * top-level ndeleted counter for pruning operation. + */ + Assert(false); + return 0; } /* Record lowest soon-prunable XID */ @@ -801,10 +884,6 @@ heap_prune_record_redirect(PruneState *prstate, prstate->redirected[prstate->nredirected * 2] = offnum; prstate->redirected[prstate->nredirected * 2 + 1] = rdoffnum; prstate->nredirected++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; - Assert(!prstate->marked[rdoffnum]); - prstate->marked[rdoffnum] = true; } /* Record line pointer to be marked dead */ @@ -814,8 +893,6 @@ heap_prune_record_dead(PruneState *prstate, OffsetNumber offnum) Assert(prstate->ndead < MaxHeapTuplesPerPage); prstate->nowdead[prstate->ndead] = offnum; prstate->ndead++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; } /* Record line pointer to be marked unused */ @@ -825,8 +902,6 @@ heap_prune_record_unused(PruneState *prstate, OffsetNumber offnum) Assert(prstate->nunused < MaxHeapTuplesPerPage); prstate->nowunused[prstate->nunused] = offnum; prstate->nunused++; - Assert(!prstate->marked[offnum]); - prstate->marked[offnum] = true; } -- 2.30.2