From 21e5d4b629cca1ad3416efe6a3e978cca244b368 Mon Sep 17 00:00:00 2001 From: Andrey Borodin Date: Tue, 17 Jul 2018 22:34:58 +0400 Subject: [PATCH 2/2] Physical GiST scan during VACUUM v10 --- src/backend/access/gist/gistvacuum.c | 366 +++++++++++++++++++++++++++++++---- 1 file changed, 326 insertions(+), 40 deletions(-) diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 8d97c44..778c806 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -103,8 +103,9 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) typedef struct GistBDItem { - GistNSN parentlsn; - BlockNumber blkno; + GistNSN parentlsn; + BlockNumber blkno; + OffsetNumber parentoffset; struct GistBDItem *next; } GistBDItem; @@ -129,30 +130,232 @@ pushStackIfSplited(Page page, GistBDItem *stack) } /* - * Bulk deletion of all index entries pointing to a set of heap tuples and - * check invalid tuples left after upgrade. - * The set of target tuples is specified via a callback routine that tells - * whether any given heap tuple (identified by ItemPointer) is being deleted. - * - * Result: a palloc'd struct containing statistical info for VACUUM displays. + * During physical scan for every pair parent-child we can either find parent + * first or child first. Every time we open internal page - we mark parent + * block no for every child and set GIST_PS_HAS_PARENT. When scan will get to + * child page, if this page turns out to be empty - we will get back by + * parent link. If we find child first (still without parent link), we mark + * the page as GIST_PS_EMPTY_LEAF if it is ready to be deleted. When we will + * scan it's parent - we will pick it to rescan list. 
*/ -IndexBulkDeleteResult * -gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state) +#define GIST_PS_HAS_PARENT 1 +#define GIST_PS_EMPTY_LEAF 2 + + +/* Physiscal scan item */ +typedef struct GistPSItem { - Relation rel = info->index; - GistBDItem *stack, - *ptr; - BlockNumber recentParent = InvalidBlockNumber; - List *rescanList = NULL; - ListCell *cell; + BlockNumber parent; + List* emptyLeafOffsets; + OffsetNumber parentOffset; + uint16 flags; +} GistPSItem; + +/* Blocknumber of internal pages with offsets to rescan for deletion */ +typedef struct GistRescanItem +{ + BlockNumber blkno; + List* emptyLeafOffsets; + struct GistRescanItem* next; +} GistRescanItem; - /* first time through? */ - if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); - /* we'll re-count the tuples each time */ - stats->estimated_count = false; - stats->num_index_tuples = 0; +static void +gistbulkdeletephysicalcanpage(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, + IndexBulkDeleteCallback callback, void* callback_state, + BlockNumber blkno, GistNSN startNSN, GistPSItem *graph) +{ + Relation rel = info->index; + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + IndexTuple idxtuple; + ItemId iid; + + /* + * This is recursive call, should almost never be deeper than + * GIST_MAX_SPLIT_PAGES, but check anyway. + */ + check_stack_depth(); + + vacuum_delay_point(); + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, + info->strategy); + /* + * We are not going to stay here for a long time, calling recursive algorithms. + * Especially for an internal page. So, agressivly grab an exclusive lock. + */ + LockBuffer(buffer, GIST_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page) || GistPageIsDeleted(page)) + { + UnlockReleaseBuffer(buffer); + /* TODO: Should not we record free page here? 
*/ + return; + } + + maxoff = PageGetMaxOffsetNumber(page); + + if (GistPageIsLeaf(page)) + { + OffsetNumber todelete[MaxOffsetNumber]; + int ntodelete = 0; + GISTPageOpaque opaque = GistPageGetOpaque(page); + + /* + * If this page was splitted after start of the VACUUM we have to + * revisit rightlink, if it points to block we already scanned. + * This is recursive revisit, should not be deep, but we check + * the possibility of stack overflow anyway. + */ + if ((GistFollowRight(page) || startNSN < GistPageGetNSN(page)) && + (opaque->rightlink != InvalidBlockNumber) && (opaque->rightlink < blkno)) + { + gistbulkdeletephysicalcanpage(info, stats, callback, callback_state, opaque->rightlink, startNSN, graph); + } + + /* + * Remove deletable tuples from page + */ + + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + + if (callback(&(idxtuple->t_tid), callback_state)) + todelete[ntodelete++] = i; + else + stats->num_index_tuples += 1; + } + + stats->tuples_removed += ntodelete; + + /* We have dead tuples on the page */ + if (ntodelete) + { + START_CRIT_SECTION(); + + MarkBufferDirty(buffer); + + PageIndexMultiDelete(page, todelete, ntodelete); + GistMarkTuplesDeleted(page); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + recptr = gistXLogUpdate(buffer, + todelete, ntodelete, + NULL, 0, InvalidBuffer); + PageSetLSN(page, recptr); + } + else + PageSetLSN(page, gistGetFakeLSN(rel)); + + END_CRIT_SECTION(); + } + + /* The page is completely empty */ + if (ntodelete == maxoff) + { + /* This page is a candidate to be deleted. 
Remember its parent to rescan it later with xlock */ + if (graph[blkno].flags & GIST_PS_HAS_PARENT) + { + /* Go to parent and append myself */ + BlockNumber parentblockno = graph[blkno].parent; + graph[parentblockno].emptyLeafOffsets = lappend_int(graph[parentblockno].emptyLeafOffsets, (int)graph[blkno].parentOffset); + } + else + { + /* Parent will collect me later */ + graph[blkno].flags |= GIST_PS_EMPTY_LEAF; + } + } + } + else + { + /* For internal pages we remember structure of the tree */ + for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i)) + { + BlockNumber childblkno; + iid = PageGetItemId(page, i); + idxtuple = (IndexTuple) PageGetItem(page, iid); + childblkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); + + if (graph[childblkno].flags & GIST_PS_EMPTY_LEAF) + { + /* Child has been scanned earlier and is ready to be picked up */ + graph[blkno].emptyLeafOffsets = lappend_int(graph[blkno].emptyLeafOffsets, i); + } + else + { + /* Collect leaf when scan will come close */ + graph[childblkno].parent = blkno; + graph[childblkno].parentOffset = i; + graph[childblkno].flags |= GIST_PS_HAS_PARENT; + } + + if (GistTupleIsInvalid(idxtuple)) + ereport(LOG, + (errmsg("index \"%s\" contains an inner tuple marked as invalid", + RelationGetRelationName(rel)), + errdetail("This is caused by an incomplete page split at crash recovery before upgrading to PostgreSQL 9.1."), + errhint("Please REINDEX it."))); + } + } + UnlockReleaseBuffer(buffer); +} + +/* Read all pages sequentially populating array of GistPSItem */ +static GistRescanItem* +gistbulkdeletephysicalcan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state, BlockNumber npages) +{ + GistRescanItem *result = NULL; + BlockNumber blkno; + GistNSN startNSN = GetInsertRecPtr(); + + /* Here we will store whole graph of the index */ + GistPSItem *graph = palloc0(npages * sizeof(GistPSItem)); + + + for (blkno = GIST_ROOT_BLKNO; blkno < npages; 
blkno++) + { + gistbulkdeletephysicalcanpage(info, stats, callback, callback_state, blkno, startNSN, graph); + } + + /* Search for internal pages pointing to empty leafs */ + for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++) + { + if (graph[blkno].emptyLeafOffsets) + { + GistRescanItem *next = palloc(sizeof(GistRescanItem)); + next->blkno = blkno; + next->emptyLeafOffsets = graph[blkno].emptyLeafOffsets; + next->next = result; + result = next; + } + } + + pfree(graph); + + return result; +} + +/* Logical scan descends from root to leafs in DFS search */ +static GistRescanItem* +gistbulkdeletelogicalscan(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state) +{ + Relation rel = info->index; + BlockNumber recentParent = InvalidBlockNumber; + GistBDItem *stack, + *ptr; + GistRescanItem *result = NULL; + + /* This stack is used to organize DFS */ stack = (GistBDItem *) palloc0(sizeof(GistBDItem)); stack->blkno = GIST_ROOT_BLKNO; @@ -237,11 +440,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD END_CRIT_SECTION(); } - if (ntodelete == maxoff && recentParent!=InvalidBlockNumber && - (rescanList == NULL || (BlockNumber)llast_int(rescanList) != recentParent)) + if (ntodelete == maxoff && recentParent!=InvalidBlockNumber) { /* This page is a candidate to be deleted. 
Remember it's parent to rescan it later with xlock */ - rescanList = lappend_int(rescanList, recentParent); + if (result == NULL || result->blkno != recentParent) + { + GistRescanItem *next = palloc(sizeof(GistRescanItem)); + next->blkno = recentParent; + next->emptyLeafOffsets = NULL; + next->next = result; + result = next; + } + result->emptyLeafOffsets = lappend_int(result->emptyLeafOffsets, stack->parentoffset); } } else @@ -261,6 +471,7 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD ptr->blkno = ItemPointerGetBlockNumber(&(idxtuple->t_tid)); ptr->parentlsn = BufferGetLSNAtomic(buffer); ptr->next = stack->next; + ptr->parentoffset = i; stack->next = ptr; if (GistTupleIsInvalid(idxtuple)) @@ -281,20 +492,82 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD vacuum_delay_point(); } - /* rescan inner pages that had empty child pages */ - foreach(cell,rescanList) + return result; +} + +/* + * This function is used to sort offsets + * When employing physical scan rescan offsets are not ordered. + */ +static int +compare_offsetnumber(const void *x, const void *y) +{ + OffsetNumber a = *((OffsetNumber *)x); + OffsetNumber b = *((OffsetNumber *)y); + return a - b; +} + +/* + * Bulk deletion of all index entries pointing to a set of heap tuples and + * check invalid tuples left after upgrade. + * The set of target tuples is specified via a callback routine that tells + * whether any given heap tuple (identified by ItemPointer) is being deleted. + * + * Result: a palloc'd struct containing statistical info for VACUUM displays. + */ +IndexBulkDeleteResult * +gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkDeleteCallback callback, void* callback_state) +{ + Relation rel = info->index; + GistRescanItem *rescan; + BlockNumber npages; + bool needLock; + + /* first time through? 
 */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + /* we'll re-count the tuples each time */ + stats->estimated_count = false; + stats->num_index_tuples = 0; + + /* + * Need lock unless it's local to this backend. + */ + needLock = !RELATION_IS_LOCAL(rel); + + /* try to find deleted pages */ + if (needLock) + LockRelationForExtension(rel, ExclusiveLock); + npages = RelationGetNumberOfBlocks(rel); + if (needLock) + UnlockRelationForExtension(rel, ExclusiveLock); + + /* If the map of the whole graph does not fit into maintenance_work_mem, fall back to the logical (DFS) scan; otherwise read the whole index sequentially */ + if (npages * (sizeof(GistPSItem)) > maintenance_work_mem * 1024) { - Buffer buffer; - Page page; - OffsetNumber i, - maxoff; - IndexTuple idxtuple; - ItemId iid; - OffsetNumber todelete[MaxOffsetNumber]; - Buffer buftodelete[MaxOffsetNumber]; - int ntodelete = 0; + rescan = gistbulkdeletelogicalscan(info, stats, callback, callback_state); + } + else + { + rescan = gistbulkdeletephysicalcan(info, stats, callback, callback_state, npages); + } - buffer = ReadBufferExtended(rel, MAIN_FORKNUM, (BlockNumber)lfirst_int(cell), + /* rescan inner pages that had empty child pages */ + while (rescan) + { + Buffer buffer; + Page page; + OffsetNumber i, + maxoff; + IndexTuple idxtuple; + ItemId iid; + OffsetNumber todelete[MaxOffsetNumber]; + Buffer buftodelete[MaxOffsetNumber]; + int ntodelete = 0; + ListCell *cell; + GistRescanItem *oldRescan; + + buffer = ReadBufferExtended(rel, MAIN_FORKNUM, rescan->blkno, RBM_NORMAL, info->strategy); LockBuffer(buffer, GIST_EXCLUSIVE); gistcheckpage(rel, buffer); @@ -304,11 +577,18 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD maxoff = PageGetMaxOffsetNumber(page); - for (i = OffsetNumberNext(FirstOffsetNumber); i <= maxoff; i = OffsetNumberNext(i)) + /* Check that leaves are still empty and decide what to delete */ + foreach(cell, rescan->emptyLeafOffsets) { Buffer leafBuffer; Page 
leafPage; + i = (OffsetNumber)lfirst_int(cell); + if(i > maxoff) + { + continue; + } + iid = PageGetItemId(page, i); idxtuple = (IndexTuple) PageGetItem(page, iid); @@ -333,7 +613,10 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD if (ntodelete) { - /* + /* Prepare possibly unordered offsets */ + qsort(todelete, ntodelete, sizeof(OffsetNumber), compare_offsetnumber); + + /* * Like in _bt_unlink_halfdead_page we need a upper bound on xid * that could hold downlinks to this page. We use * ReadNewTransactionId() to instead of GetCurrentTransactionId @@ -378,11 +661,14 @@ gistbulkdelete(IndexVacuumInfo * info, IndexBulkDeleteResult * stats, IndexBulkD } UnlockReleaseBuffer(buffer); + oldRescan = rescan; + rescan = rescan->next; + list_free(oldRescan->emptyLeafOffsets); + pfree(oldRescan); vacuum_delay_point(); } - list_free(rescanList); return stats; } \ No newline at end of file -- 2.15.2 (Apple Git-101.1)