diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index b4e8d44..c2da15e 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -299,6 +299,7 @@ bool btgettuple(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanState state = &so->state; bool res; /* btree indexes are never lossy */ @@ -309,7 +310,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * scan. We can't do this in btrescan because we don't know the scan * direction at that time. */ - if (so->numArrayKeys && !BTScanPosIsValid(so->currPos)) + if (so->numArrayKeys && !BTScanPosIsValid(state->currPos)) { /* punt if we have any unsatisfiable array keys */ if (so->numArrayKeys < 0) @@ -326,7 +327,7 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * the appropriate direction. If we haven't done so yet, we call * _bt_first() to get the first item in the scan. */ - if (!BTScanPosIsValid(so->currPos)) + if (!BTScanPosIsValid(state->currPos)) res = _bt_first(scan, dir); else { @@ -344,11 +345,11 @@ btgettuple(IndexScanDesc scan, ScanDirection dir) * trying to optimize that, so we don't detect it, but instead * just forget any excess entries. */ - if (so->killedItems == NULL) - so->killedItems = (int *) + if (state->killedItems == NULL) + state->killedItems = (int *) palloc(MaxIndexTuplesPerPage * sizeof(int)); - if (so->numKilled < MaxIndexTuplesPerPage) - so->killedItems[so->numKilled++] = so->currPos.itemIndex; + if (state->numKilled < MaxIndexTuplesPerPage) + state->killedItems[so->state.numKilled++] = state->currPos.itemIndex; } /* @@ -373,6 +374,7 @@ int64 btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos currPos = &so->state.currPos; int64 ntids = 0; ItemPointer heapTid; @@ -405,7 +407,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) * Advance to next tuple within page. This is the same as the * easy case in _bt_next(). */ - if (++so->currPos.itemIndex > so->currPos.lastItem) + if (++currPos->itemIndex > currPos->lastItem) { /* let _bt_next do the heavy lifting */ if (!_bt_next(scan, ForwardScanDirection)) @@ -413,7 +415,7 @@ btgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) } /* Save tuple ID, and continue scanning */ - heapTid = &so->currPos.items[so->currPos.itemIndex].heapTid; + heapTid = &currPos->items[currPos->itemIndex].heapTid; tbm_add_tuples(tbm, heapTid, 1, false); ntids++; } @@ -441,8 +443,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys) /* allocate private workspace */ so = (BTScanOpaque) palloc(sizeof(BTScanOpaqueData)); - BTScanPosInvalidate(so->currPos); - BTScanPosInvalidate(so->markPos); + BTScanPosInvalidate(so->state.currPos); + BTScanPosInvalidate(so->state.markPos); if (scan->numberOfKeys > 0) so->keyData = (ScanKey) palloc(scan->numberOfKeys * sizeof(ScanKeyData)); else @@ -453,15 +455,15 @@ btbeginscan(Relation rel, int nkeys, int norderbys) so->arrayKeys = NULL; so->arrayContext = NULL; - so->killedItems = NULL; /* until needed */ - so->numKilled = 0; + so->state.killedItems = NULL; /* until needed */ + so->state.numKilled = 0; /* * We don't know yet whether the scan will be index-only, so we do not * allocate the tuple workspace arrays until btrescan. However, we set up * scan->xs_itupdesc whether we'll need it or not, since that's so cheap. */ - so->currTuples = so->markTuples = NULL; + so->state.currTuples = so->state.markTuples = NULL; scan->xs_itupdesc = RelationGetDescr(rel); @@ -470,6 +472,45 @@ btbeginscan(Relation rel, int nkeys, int norderbys) return scan; } +static void +_bt_release_current_position(BTScanState state, Relation indexRelation, + bool invalidate) +{ + /* we aren't holding any read locks, but gotta drop the pins */ + if (BTScanPosIsValid(state->currPos)) + { + /* Before leaving current page, deal with any killed items */ + if (state->numKilled > 0) + _bt_killitems(state, indexRelation); + + BTScanPosUnpinIfPinned(state->currPos); + + if (invalidate) + BTScanPosInvalidate(state->currPos); + } +} + +static void +_bt_release_scan_state(IndexScanDesc scan, BTScanState state, bool free) +{ + /* No need to invalidate positions, if the RAM is about to be freed. */ + _bt_release_current_position(state, scan->indexRelation, !free); + + state->markItemIndex = -1; + BTScanPosUnpinIfPinned(state->markPos); + + if (free) + { + if (state->killedItems != NULL) + pfree(state->killedItems); + if (state->currTuples != NULL) + pfree(state->currTuples); + /* markTuples should not be pfree'd (_bt_allocate_tuple_workspaces) */ + } + else + BTScanPosInvalidate(state->markPos); +} + /* * btrescan() -- rescan an index relation */ @@ -478,20 +519,9 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, ScanKey orderbys, int norderbys) { BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanState state = &so->state; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - BTScanPosInvalidate(so->currPos); - } - - so->markItemIndex = -1; - BTScanPosUnpinIfPinned(so->markPos); - BTScanPosInvalidate(so->markPos); + _bt_release_scan_state(scan, state, false); /* * Allocate tuple workspace arrays, if needed for an index-only scan and @@ -509,11 +539,8 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, * a SIGSEGV is not possible. Yeah, this is ugly as sin, but it beats * adding special-case treatment for name_ops elsewhere. */ - if (scan->xs_want_itup && so->currTuples == NULL) - { - so->currTuples = (char *) palloc(BLCKSZ * 2); - so->markTuples = so->currTuples + BLCKSZ; - } + if (scan->xs_want_itup && state->currTuples == NULL) + _bt_allocate_tuple_workspaces(state); /* * Reset the scan keys. Note that keys ordering stuff moved to _bt_first. @@ -537,19 +564,7 @@ btendscan(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* we aren't holding any read locks, but gotta drop the pins */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - } - - so->markItemIndex = -1; - BTScanPosUnpinIfPinned(so->markPos); - - /* No need to invalidate positions, the RAM is about to be freed. */ + _bt_release_scan_state(scan, &so->state, true); /* Release storage */ if (so->keyData != NULL) @@ -557,24 +572,15 @@ btendscan(IndexScanDesc scan) /* so->arrayKeyData and so->arrayKeys are in arrayContext */ if (so->arrayContext != NULL) MemoryContextDelete(so->arrayContext); - if (so->killedItems != NULL) - pfree(so->killedItems); - if (so->currTuples != NULL) - pfree(so->currTuples); - /* so->markTuples should not be pfree'd, see btrescan */ + pfree(so); } -/* - * btmarkpos() -- save current scan position - */ -void -btmarkpos(IndexScanDesc scan) +static void +_bt_mark_current_position(BTScanState state) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* There may be an old mark with a pin (but no lock). */ - BTScanPosUnpinIfPinned(so->markPos); + BTScanPosUnpinIfPinned(state->markPos); /* * Just record the current itemIndex. If we later step to next page @@ -582,32 +588,34 @@ btmarkpos(IndexScanDesc scan) * the currPos struct in markPos. If (as often happens) the mark is moved * before we leave the page, we don't have to do that work. */ - if (BTScanPosIsValid(so->currPos)) - so->markItemIndex = so->currPos.itemIndex; + if (BTScanPosIsValid(state->currPos)) + state->markItemIndex = state->currPos.itemIndex; else { - BTScanPosInvalidate(so->markPos); - so->markItemIndex = -1; + BTScanPosInvalidate(state->markPos); + state->markItemIndex = -1; } - - /* Also record the current positions of any array keys */ - if (so->numArrayKeys) - _bt_mark_array_keys(scan); } /* - * btrestrpos() -- restore scan to last saved position + * btmarkpos() -- save current scan position */ void -btrestrpos(IndexScanDesc scan) +btmarkpos(IndexScanDesc scan) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - /* Restore the marked positions of any array keys */ + _bt_mark_current_position(&so->state); + + /* Also record the current positions of any array keys */ if (so->numArrayKeys) - _bt_restore_array_keys(scan); + _bt_mark_array_keys(scan); +} - if (so->markItemIndex >= 0) +static void +_bt_restore_marked_position(IndexScanDesc scan, BTScanState state) +{ + if (state->markItemIndex >= 0) { /* * The scan has never moved to a new page since the last mark. Just @@ -616,7 +624,7 @@ btrestrpos(IndexScanDesc scan) * NB: In this case we can't count on anything in so->markPos to be * accurate. */ - so->currPos.itemIndex = so->markItemIndex; + state->currPos.itemIndex = state->markItemIndex; } else { @@ -626,32 +634,40 @@ btrestrpos(IndexScanDesc scan) * locks, but if we're still holding the pin for the current position, * we must drop it. */ - if (BTScanPosIsValid(so->currPos)) - { - /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); - BTScanPosUnpinIfPinned(so->currPos); - } + _bt_release_current_position(state, scan->indexRelation, + !BTScanPosIsValid(state->markPos)); - if (BTScanPosIsValid(so->markPos)) + if (BTScanPosIsValid(state->markPos)) { /* bump pin on mark buffer for assignment to current buffer */ - if (BTScanPosIsPinned(so->markPos)) - IncrBufferRefCount(so->markPos.buf); - memcpy(&so->currPos, &so->markPos, + if (BTScanPosIsPinned(state->markPos)) + IncrBufferRefCount(state->markPos.buf); + memcpy(&state->currPos, &state->markPos, offsetof(BTScanPosData, items[1]) + - so->markPos.lastItem * sizeof(BTScanPosItem)); - if (so->currTuples) - memcpy(so->currTuples, so->markTuples, - so->markPos.nextTupleOffset); + state->markPos.lastItem * sizeof(BTScanPosItem)); + if (state->currTuples) + memcpy(state->currTuples, state->markTuples, + state->markPos.nextTupleOffset); } - else - BTScanPosInvalidate(so->currPos); } } /* + * btrestrpos() -- restore scan to last saved position + */ +void +btrestrpos(IndexScanDesc scan) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + + /* Restore the marked positions of any array keys */ + if (so->numArrayKeys) + _bt_restore_array_keys(scan); + + _bt_restore_marked_position(scan, &so->state); +} + +/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index b6459d2..c041056 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -25,11 +25,11 @@ #include "utils/tqual.h" -static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, +static bool _bt_readpage(IndexScanDesc scan, BTScanState state, ScanDirection dir, OffsetNumber offnum); -static void _bt_saveitem(BTScanOpaque so, int itemIndex, +static void _bt_saveitem(BTScanState state, int itemIndex, OffsetNumber offnum, IndexTuple itup); -static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir); +static bool _bt_steppage(IndexScanDesc scan, BTScanState state, ScanDirection dir); static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); @@ -509,6 +509,58 @@ _bt_compare(Relation rel, } /* + * _bt_return_current_item() -- Prepare current scan state item for return. + * + * This function is used only in "return _bt_return_current_item();" statements + * and always returns true. + */ +static inline bool +_bt_return_current_item(IndexScanDesc scan, BTScanState state) +{ + BTScanPosItem *currItem = &state->currPos.items[state->currPos.itemIndex]; + + scan->xs_ctup.t_self = currItem->heapTid; + + if (scan->xs_want_itup) + scan->xs_itup = (IndexTuple) (state->currTuples + currItem->tupleOffset); + + return true; +} + +/* + * _bt_load_first_page() -- Load data from the first page of the scan. + * + * Caller must have pinned and read-locked state->currPos.buf. + * + * On success exit, state->currPos is updated to contain data from the next + * interesting page. For success on a scan using a non-MVCC snapshot we hold + * a pin, but not a read lock, on that page. If we do not hold the pin, we + * set state->currPos.buf to InvalidBuffer. We return true to indicate success. + * + * If there are no more matching records in the given direction at all, + * we drop all locks and pins, set state->currPos.buf to InvalidBuffer, + * and return false. + */ +static bool +_bt_load_first_page(IndexScanDesc scan, BTScanState state, ScanDirection dir, + OffsetNumber offnum) +{ + if (!_bt_readpage(scan, state, dir, offnum)) + { + /* + * There's no actually-matching data on this page. Try to advance to + * the next page. Return false if there's no matching data at all. + */ + LockBuffer(state->currPos.buf, BUFFER_LOCK_UNLOCK); + return _bt_steppage(scan, state, dir); + } + + /* Drop the lock, and maybe the pin, on the current page */ + _bt_drop_lock_and_maybe_pin(scan, &state->currPos); + return true; +} + +/* * _bt_first() -- Find the first item in a scan. * * We need to be clever about the direction of scan, the search @@ -533,6 +585,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos currPos = &so->state.currPos; Buffer buf; BTStack stack; OffsetNumber offnum; @@ -545,9 +598,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) int keysCount = 0; int i; StrategyNumber strat_total; - BTScanPosItem *currItem; - Assert(!BTScanPosIsValid(so->currPos)); + Assert(!BTScanPosIsValid(*currPos)); pgstat_count_index_scan(rel); @@ -1002,16 +1054,16 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) /* initialize moreLeft/moreRight appropriately for scan direction */ if (ScanDirectionIsForward(dir)) { - so->currPos.moreLeft = false; - so->currPos.moreRight = true; + currPos->moreLeft = false; + currPos->moreRight = true; } else { - so->currPos.moreLeft = true; - so->currPos.moreRight = false; + currPos->moreLeft = true; + currPos->moreRight = false; } - so->numKilled = 0; /* just paranoia */ - Assert(so->markItemIndex == -1); + so->state.numKilled = 0; /* just paranoia */ + Assert(so->state.markItemIndex == -1); /* position to the precise item on the page */ offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey); @@ -1038,35 +1090,35 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) offnum = OffsetNumberPrev(offnum); /* remember which buffer we have pinned, if any */ - Assert(!BTScanPosIsValid(so->currPos)); - so->currPos.buf = buf; + Assert(!BTScanPosIsValid(*currPos)); + currPos->buf = buf; - /* - * Now load data from the first page of the scan. - */ - if (!_bt_readpage(scan, dir, offnum)) + if (!_bt_load_first_page(scan, &so->state, dir, offnum)) + return false; + + /* OK, currPos->itemIndex says what to return */ + return _bt_return_current_item(scan, &so->state); +} + +/* + * Advance to next tuple on current page; or if there's no more, + * try to step to the next page with data. + */ +static bool +_bt_next_item(IndexScanDesc scan, BTScanState state, ScanDirection dir) +{ + if (ScanDirectionIsForward(dir)) { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. - */ - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); - if (!_bt_steppage(scan, dir)) - return false; + if (++state->currPos.itemIndex <= state->currPos.lastItem) + return true; } else { - /* Drop the lock, and maybe the pin, on the current page */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + if (--state->currPos.itemIndex >= state->currPos.firstItem) + return true; } - /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_ctup.t_self = currItem->heapTid; - if (scan->xs_want_itup) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); - - return true; + return _bt_steppage(scan, state, dir); } /* @@ -1087,44 +1139,20 @@ bool _bt_next(IndexScanDesc scan, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - BTScanPosItem *currItem; - /* - * Advance to next tuple on current page; or if there's no more, try to - * step to the next page with data. - */ - if (ScanDirectionIsForward(dir)) - { - if (++so->currPos.itemIndex > so->currPos.lastItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } - else - { - if (--so->currPos.itemIndex < so->currPos.firstItem) - { - if (!_bt_steppage(scan, dir)) - return false; - } - } + if (!_bt_next_item(scan, &so->state, dir)) + return false; /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_ctup.t_self = currItem->heapTid; - if (scan->xs_want_itup) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); - - return true; + return _bt_return_current_item(scan, &so->state); } /* * _bt_readpage() -- Load data from current index page into so->currPos * - * Caller must have pinned and read-locked so->currPos.buf; the buffer's state - * is not changed here. Also, currPos.moreLeft and moreRight must be valid; - * they are updated as appropriate. All other fields of so->currPos are + * Caller must have pinned and read-locked pos->buf; the buffer's state + * is not changed here. Also, pos->moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of pos are * initialized from scratch here. * * We scan the current page starting at offnum and moving in the indicated @@ -1135,9 +1163,10 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) * Returns true if any matching items found on the page, false if none. */ static bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +_bt_readpage(IndexScanDesc scan, BTScanState state, ScanDirection dir, + OffsetNumber offnum) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos pos = &state->currPos; Page page; BTPageOpaque opaque; OffsetNumber minoff; @@ -1150,9 +1179,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * We must have the buffer pinned and locked, but the usual macro can't be * used here; this function is what makes it good for currPos. */ - Assert(BufferIsValid(so->currPos.buf)); + Assert(BufferIsValid(pos->buf)); - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(pos->buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); minoff = P_FIRSTDATAKEY(opaque); maxoff = PageGetMaxOffsetNumber(page); @@ -1161,30 +1190,30 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) * We note the buffer's block number so that we can release the pin later. * This allows us to re-read the buffer if it is needed again for hinting. */ - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + pos->currPage = BufferGetBlockNumber(pos->buf); /* * We save the LSN of the page as we read it, so that we know whether it * safe to apply LP_DEAD hints to the page later. This allows us to drop * the pin for MVCC scans, which allows vacuum to avoid blocking. */ - so->currPos.lsn = PageGetLSN(page); + pos->lsn = PageGetLSN(page); /* * we must save the page's right-link while scanning it; this tells us * where to step right to after we're done with these items. There is no * corresponding need for the left-link, since splits always go right. */ - so->currPos.nextPage = opaque->btpo_next; + pos->nextPage = opaque->btpo_next; /* initialize tuple workspace to empty */ - so->currPos.nextTupleOffset = 0; + pos->nextTupleOffset = 0; /* * Now that the current page has been made consistent, the macro should be * good. */ - Assert(BTScanPosIsPinned(so->currPos)); + Assert(BTScanPosIsPinned(*pos)); if (ScanDirectionIsForward(dir)) { @@ -1199,13 +1228,13 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) if (itup != NULL) { /* tuple passes all scan key conditions, so remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(state, itemIndex, offnum, itup); itemIndex++; } if (!continuescan) { /* there can't be any more matches, so stop */ - so->currPos.moreRight = false; + pos->moreRight = false; break; } @@ -1213,9 +1242,9 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) } Assert(itemIndex <= MaxIndexTuplesPerPage); - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; + pos->firstItem = 0; + pos->lastItem = itemIndex - 1; + pos->itemIndex = 0; } else { @@ -1231,12 +1260,12 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) { /* tuple passes all scan key conditions, so remember it */ itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); + _bt_saveitem(state, itemIndex, offnum, itup); } if (!continuescan) { /* there can't be any more matches, so stop */ - so->currPos.moreLeft = false; + pos->moreLeft = false; break; } @@ -1244,30 +1273,31 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) } Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxIndexTuplesPerPage - 1; - so->currPos.itemIndex = MaxIndexTuplesPerPage - 1; + pos->firstItem = itemIndex; + pos->lastItem = MaxIndexTuplesPerPage - 1; + pos->itemIndex = MaxIndexTuplesPerPage - 1; } - return (so->currPos.firstItem <= so->currPos.lastItem); + return (pos->firstItem <= pos->lastItem); } /* Save an index item into so->currPos.items[itemIndex] */ static void -_bt_saveitem(BTScanOpaque so, int itemIndex, +_bt_saveitem(BTScanState state, int itemIndex, OffsetNumber offnum, IndexTuple itup) { - BTScanPosItem *currItem = &so->currPos.items[itemIndex]; + BTScanPosItem *currItem = &state->currPos.items[itemIndex]; currItem->heapTid = itup->t_tid; currItem->indexOffset = offnum; - if (so->currTuples) + if (state->currTuples) { Size itupsz = IndexTupleSize(itup); - currItem->tupleOffset = so->currPos.nextTupleOffset; - memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz); - so->currPos.nextTupleOffset += MAXALIGN(itupsz); + currItem->tupleOffset = state->currPos.nextTupleOffset; + memcpy(state->currTuples + state->currPos.nextTupleOffset, + itup, itupsz); + state->currPos.nextTupleOffset += MAXALIGN(itupsz); } } @@ -1287,65 +1317,63 @@ _bt_saveitem(BTScanOpaque so, int itemIndex, * locks and pins, set so->currPos.buf to InvalidBuffer, and return FALSE. */ static bool -_bt_steppage(IndexScanDesc scan, ScanDirection dir) +_bt_steppage(IndexScanDesc scan, BTScanState state, ScanDirection dir) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; - Relation rel; + BTScanPos currPos = &state->currPos; + Relation rel = scan->indexRelation; Page page; BTPageOpaque opaque; - Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsValid(*currPos)); /* Before leaving current page, deal with any killed items */ - if (so->numKilled > 0) - _bt_killitems(scan); + if (state->numKilled > 0) + _bt_killitems(state, rel); /* * Before we modify currPos, make a copy of the page data if there was a * mark position that needs it. */ - if (so->markItemIndex >= 0) + if (state->markItemIndex >= 0) { /* bump pin on current buffer for assignment to mark buffer */ - if (BTScanPosIsPinned(so->currPos)) - IncrBufferRefCount(so->currPos.buf); - memcpy(&so->markPos, &so->currPos, + if (BTScanPosIsPinned(*currPos)) + IncrBufferRefCount(currPos->buf); + memcpy(&state->markPos, currPos, offsetof(BTScanPosData, items[1]) + - so->currPos.lastItem * sizeof(BTScanPosItem)); - if (so->markTuples) - memcpy(so->markTuples, so->currTuples, - so->currPos.nextTupleOffset); - so->markPos.itemIndex = so->markItemIndex; - so->markItemIndex = -1; + currPos->lastItem * sizeof(BTScanPosItem)); + if (state->markTuples) + memcpy(state->markTuples, state->currTuples, + currPos->nextTupleOffset); + state->markPos.itemIndex = state->markItemIndex; + state->markItemIndex = -1; } - rel = scan->indexRelation; - if (ScanDirectionIsForward(dir)) { /* Walk right to the next page with data */ /* We must rely on the previously saved nextPage link! */ - BlockNumber blkno = so->currPos.nextPage; + BlockNumber blkno = currPos->nextPage; /* Remember we left a page with data */ - so->currPos.moreLeft = true; + currPos->moreLeft = true; /* release the previous buffer, if pinned */ - BTScanPosUnpinIfPinned(so->currPos); + BTScanPosUnpinIfPinned(*currPos); for (;;) { /* if we're at end of scan, give up */ - if (blkno == P_NONE || !so->currPos.moreRight) + if (blkno == P_NONE || !currPos->moreRight) { - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } /* check for interrupts while we're not holding any buffer lock */ CHECK_FOR_INTERRUPTS(); /* step right one page */ - so->currPos.buf = _bt_getbuf(rel, blkno, BT_READ); - page = BufferGetPage(so->currPos.buf); + currPos->buf = _bt_getbuf(rel, blkno, BT_READ); + page = BufferGetPage(currPos->buf); TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* check for deleted page */ @@ -1354,19 +1382,19 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) PredicateLockPage(rel, blkno, scan->xs_snapshot); /* see if there are any matches on this page */ /* note that this will clear moreRight if we can stop */ - if (_bt_readpage(scan, dir, P_FIRSTDATAKEY(opaque))) + if (_bt_readpage(scan, state, dir, P_FIRSTDATAKEY(opaque))) break; } /* nope, keep going */ blkno = opaque->btpo_next; - _bt_relbuf(rel, so->currPos.buf); + _bt_relbuf(rel, currPos->buf); } } else { /* Remember we left a page with data */ - so->currPos.moreRight = true; + currPos->moreRight = true; /* * Walk left to the next page with data. This is much more complex @@ -1390,29 +1418,28 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * is MVCC the page cannot move past the half-dead state to fully * deleted. */ - if (BTScanPosIsPinned(so->currPos)) - LockBuffer(so->currPos.buf, BT_READ); + if (BTScanPosIsPinned(*currPos)) + LockBuffer(currPos->buf, BT_READ); else - so->currPos.buf = _bt_getbuf(rel, so->currPos.currPage, BT_READ); + currPos->buf = _bt_getbuf(rel, currPos->currPage, BT_READ); for (;;) { /* Done if we know there are no matching keys to the left */ - if (!so->currPos.moreLeft) + if (!currPos->moreLeft) { - _bt_relbuf(rel, so->currPos.buf); - BTScanPosInvalidate(so->currPos); + _bt_relbuf(rel, currPos->buf); + BTScanPosInvalidate(*currPos); return false; } /* Step to next physical page */ - so->currPos.buf = _bt_walk_left(rel, so->currPos.buf, - scan->xs_snapshot); + currPos->buf = _bt_walk_left(rel, currPos->buf, scan->xs_snapshot); /* if we're physically at end of index, return failure */ - if (so->currPos.buf == InvalidBuffer) + if (currPos->buf == InvalidBuffer) { - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } @@ -1421,22 +1448,22 @@ _bt_steppage(IndexScanDesc scan, ScanDirection dir) * it's not half-dead and contains matching tuples. Else loop back * and do it all again. */ - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(currPos->buf); TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (!P_IGNORE(opaque)) { - PredicateLockPage(rel, BufferGetBlockNumber(so->currPos.buf), scan->xs_snapshot); + PredicateLockPage(rel, BufferGetBlockNumber(currPos->buf), scan->xs_snapshot); /* see if there are any matches on this page */ /* note that this will clear moreLeft if we can stop */ - if (_bt_readpage(scan, dir, PageGetMaxOffsetNumber(page))) + if (_bt_readpage(scan, state, dir, PageGetMaxOffsetNumber(page))) break; } } } /* Drop the lock, and maybe the pin, on the current page */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); + _bt_drop_lock_and_maybe_pin(scan, currPos); return true; } @@ -1661,11 +1688,11 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) { Relation rel = scan->indexRelation; BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos currPos = &so->state.currPos; Buffer buf; Page page; BTPageOpaque opaque; OffsetNumber start; - BTScanPosItem *currItem; /* * Scan down to the leftmost or rightmost leaf page. This is a simplified @@ -1681,7 +1708,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) * exists. */ PredicateLockRelation(rel, scan->xs_snapshot); - BTScanPosInvalidate(so->currPos); + BTScanPosInvalidate(*currPos); return false; } @@ -1710,46 +1737,25 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) } /* remember which buffer we have pinned */ - so->currPos.buf = buf; + currPos->buf = buf; /* initialize moreLeft/moreRight appropriately for scan direction */ if (ScanDirectionIsForward(dir)) { - so->currPos.moreLeft = false; - so->currPos.moreRight = true; + currPos->moreLeft = false; + currPos->moreRight = true; } else { - so->currPos.moreLeft = true; - so->currPos.moreRight = false; + currPos->moreLeft = true; + currPos->moreRight = false; } - so->numKilled = 0; /* just paranoia */ - so->markItemIndex = -1; /* ditto */ + so->state.numKilled = 0; /* just paranoia */ + so->state.markItemIndex = -1; /* ditto */ - /* - * Now load data from the first page of the scan. - */ - if (!_bt_readpage(scan, dir, start)) - { - /* - * There's no actually-matching data on this page. Try to advance to - * the next page. Return false if there's no matching data at all. - */ - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); - if (!_bt_steppage(scan, dir)) - return false; - } - else - { - /* Drop the lock, and maybe the pin, on the current page */ - _bt_drop_lock_and_maybe_pin(scan, &so->currPos); - } - - /* OK, itemIndex says what to return */ - currItem = &so->currPos.items[so->currPos.itemIndex]; - scan->xs_ctup.t_self = currItem->heapTid; - if (scan->xs_want_itup) - scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset); + if (!_bt_load_first_page(scan, &so->state, dir, start)) + return false; - return true; + /* OK, currPos->itemIndex says what to return */ + return _bt_return_current_item(scan, &so->state); } diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index da0f330..ebcba7e 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -1725,26 +1725,26 @@ _bt_check_rowcompare(ScanKey skey, IndexTuple tuple, TupleDesc tupdesc, * away and the TID was re-used by a completely different heap tuple. */ void -_bt_killitems(IndexScanDesc scan) +_bt_killitems(BTScanState state, Relation indexRelation) { - BTScanOpaque so = (BTScanOpaque) scan->opaque; + BTScanPos pos = &state->currPos; Page page; BTPageOpaque opaque; OffsetNumber minoff; OffsetNumber maxoff; int i; - int numKilled = so->numKilled; + int numKilled = state->numKilled; bool killedsomething = false; - Assert(BTScanPosIsValid(so->currPos)); + Assert(BTScanPosIsValid(state->currPos)); /* * Always reset the scan state, so we don't look for same items on other * pages. */ - so->numKilled = 0; + state->numKilled = 0; - if (BTScanPosIsPinned(so->currPos)) + if (BTScanPosIsPinned(*pos)) { /* * We have held the pin on this page since we read the index tuples, @@ -1752,28 +1752,28 @@ _bt_killitems(IndexScanDesc scan) * re-use of any TID on the page, so there is no need to check the * LSN. */ - LockBuffer(so->currPos.buf, BT_READ); + LockBuffer(pos->buf, BT_READ); - page = BufferGetPage(so->currPos.buf); + page = BufferGetPage(pos->buf); } else { Buffer buf; /* Attempt to re-read the buffer, getting pin and lock. */ - buf = _bt_getbuf(scan->indexRelation, so->currPos.currPage, BT_READ); + buf = _bt_getbuf(indexRelation, pos->currPage, BT_READ); /* It might not exist anymore; in which case we can't hint it. */ if (!BufferIsValid(buf)) return; page = BufferGetPage(buf); - if (PageGetLSN(page) == so->currPos.lsn) - so->currPos.buf = buf; + if (PageGetLSN(page) == pos->lsn) + pos->buf = buf; else { /* Modified while not pinned means hinting is not safe. */ - _bt_relbuf(scan->indexRelation, buf); + _bt_relbuf(indexRelation, buf); return; } } @@ -1784,12 +1784,12 @@ _bt_killitems(IndexScanDesc scan) for (i = 0; i < numKilled; i++) { - int itemIndex = so->killedItems[i]; - BTScanPosItem *kitem = &so->currPos.items[itemIndex]; + int itemIndex = state->killedItems[i]; + BTScanPosItem *kitem = &pos->items[itemIndex]; OffsetNumber offnum = kitem->indexOffset; - Assert(itemIndex >= so->currPos.firstItem && - itemIndex <= so->currPos.lastItem); + Assert(itemIndex >= pos->firstItem && + itemIndex <= pos->lastItem); if (offnum < minoff) continue; /* pure paranoia */ while (offnum <= maxoff) @@ -1817,10 +1817,10 @@ _bt_killitems(IndexScanDesc scan) if (killedsomething) { opaque->btpo_flags |= BTP_HAS_GARBAGE; - MarkBufferDirtyHint(so->currPos.buf, true); + MarkBufferDirtyHint(pos->buf, true); } - LockBuffer(so->currPos.buf, BUFFER_LOCK_UNLOCK); + LockBuffer(pos->buf, BUFFER_LOCK_UNLOCK); } @@ -2065,3 +2065,14 @@ btproperty(Oid index_oid, int attno, return false; /* punt to generic code */ } } + +/* + * _bt_allocate_tuple_workspaces() -- Allocate buffers for saving index tuples + * in index-only scans. + */ +void +_bt_allocate_tuple_workspaces(BTScanState state) +{ + state->currTuples = (char *) palloc(BLCKSZ * 2); + state->markTuples = state->currTuples + BLCKSZ; +} diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 011a72e..4124010 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -598,20 +598,8 @@ typedef struct BTArrayKeyInfo Datum *elem_values; /* array of num_elems Datums */ } BTArrayKeyInfo; -typedef struct BTScanOpaqueData +typedef struct BTScanStateData { - /* these fields are set by _bt_preprocess_keys(): */ - bool qual_ok; /* false if qual can never be satisfied */ - int numberOfKeys; /* number of preprocessed scan keys */ - ScanKey keyData; /* array of preprocessed scan keys */ - - /* workspace for SK_SEARCHARRAY support */ - ScanKey arrayKeyData; /* modified copy of scan->keyData */ - int numArrayKeys; /* number of equality-type array keys (-1 if - * there are any unsatisfiable array keys) */ - BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ - MemoryContext arrayContext; /* scan-lifespan context for array data */ - /* info about killed items if any (killedItems is NULL if never used) */ int *killedItems; /* currPos.items indexes of killed items */ int numKilled; /* number of currently stored items */ @@ -636,6 +624,23 @@ typedef struct BTScanOpaqueData /* keep these last in struct for efficiency */ BTScanPosData currPos; /* current position data */ BTScanPosData markPos; /* marked position, if any */ +} BTScanStateData, *BTScanState; + +typedef struct BTScanOpaqueData +{ + /* these fields are set by _bt_preprocess_keys(): */ + bool qual_ok; /* false if qual can never be satisfied */ + int numberOfKeys; /* number of preprocessed scan keys */ + ScanKey keyData; /* array of preprocessed scan keys */ + + /* workspace for SK_SEARCHARRAY support */ + ScanKey arrayKeyData; /* modified copy of scan->keyData */ + int numArrayKeys; /* number of equality-type array keys (-1 if + * there are any unsatisfiable array keys) */ + BTArrayKeyInfo *arrayKeys; /* info about each equality-type array key */ + MemoryContext arrayContext; /* scan-lifespan context for array data */ + + BTScanStateData state; } BTScanOpaqueData; typedef BTScanOpaqueData *BTScanOpaque; @@ -739,7 +744,7 @@ extern void _bt_preprocess_keys(IndexScanDesc scan); extern IndexTuple _bt_checkkeys(IndexScanDesc scan, Page page, OffsetNumber offnum, ScanDirection dir, bool *continuescan); -extern void _bt_killitems(IndexScanDesc scan); +extern void _bt_killitems(BTScanState state, Relation indexRelation); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); extern void _bt_end_vacuum(Relation rel); @@ -750,6 +755,7 @@ extern bytea *btoptions(Datum reloptions, bool validate); extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); +extern void _bt_allocate_tuple_workspaces(BTScanState state); /* * prototypes for functions in nbtvalidate.c