From bbc222f8ce4d06f16606ab5ea52f5dc420ba3cb1 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Wed, 11 Jan 2023 02:13:04 +0100 Subject: [PATCH v12 2/6] Specialize nbtree functions on btree key shape. nbtree keys are not all made the same, so a significant amount of time is spent on code that exists only to deal with other key's shape. By specializing function calls based on the key shape, we can remove or reduce these causes of overhead. This commit adds the basic infrastructure for specializing specific hot code in the nbtree AM to certain shapes of keys, and splits the code that can benefit from attribute offset optimizations into separate files. This does NOT yet update the code itself - it just makes the code compile cleanly. The performance should be comparable if not the same. --- contrib/amcheck/verify_nbtree.c | 6 + src/backend/access/nbtree/README | 28 + src/backend/access/nbtree/nbtdedup.c | 300 +---- src/backend/access/nbtree/nbtdedup_spec.c | 317 +++++ src/backend/access/nbtree/nbtinsert.c | 579 +-------- src/backend/access/nbtree/nbtinsert_spec.c | 584 +++++++++ src/backend/access/nbtree/nbtpage.c | 1 + src/backend/access/nbtree/nbtree.c | 37 +- src/backend/access/nbtree/nbtree_spec.c | 69 + src/backend/access/nbtree/nbtsearch.c | 1111 +--------------- src/backend/access/nbtree/nbtsearch_spec.c | 1123 +++++++++++++++++ src/backend/access/nbtree/nbtsort.c | 264 +--- src/backend/access/nbtree/nbtsort_spec.c | 280 ++++ src/backend/access/nbtree/nbtsplitloc.c | 3 + src/backend/access/nbtree/nbtutils.c | 754 +---------- src/backend/access/nbtree/nbtutils_spec.c | 775 ++++++++++++ src/backend/utils/sort/tuplesortvariants.c | 156 +-- .../utils/sort/tuplesortvariants_spec.c | 175 +++ src/include/access/nbtree.h | 44 +- src/include/access/nbtree_spec.h | 183 +++ src/include/access/nbtree_specfuncs.h | 65 + src/tools/pginclude/cpluspluscheck | 2 + src/tools/pginclude/headerscheck | 2 + 23 files changed, 3669 insertions(+), 3189 deletions(-) create mode 100644 src/backend/access/nbtree/nbtdedup_spec.c create mode 100644 src/backend/access/nbtree/nbtinsert_spec.c create mode 100644 src/backend/access/nbtree/nbtree_spec.c create mode 100644 src/backend/access/nbtree/nbtsearch_spec.c create mode 100644 src/backend/access/nbtree/nbtsort_spec.c create mode 100644 src/backend/access/nbtree/nbtutils_spec.c create mode 100644 src/backend/utils/sort/tuplesortvariants_spec.c create mode 100644 src/include/access/nbtree_spec.h create mode 100644 src/include/access/nbtree_specfuncs.h diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index e57625b75c..10ed67bffe 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -2680,6 +2680,7 @@ bt_rootdescend(BtreeCheckState *state, IndexTuple itup) BTStack stack; Buffer lbuf; bool exists; + nbts_prep_ctx(NULL); key = _bt_mkscankey(state->rel, itup); Assert(key->heapkeyspace && key->scantid != NULL); @@ -2780,6 +2781,7 @@ invariant_l_offset(BtreeCheckState *state, BTScanInsert key, ItemId itemid; int32 cmp; AttrNumber cmpcol = 1; + nbts_prep_ctx(NULL); Assert(key->pivotsearch); @@ -2843,6 +2845,7 @@ invariant_leq_offset(BtreeCheckState *state, BTScanInsert key, { int32 cmp; AttrNumber cmpcol = 1; + nbts_prep_ctx(NULL); Assert(key->pivotsearch); @@ -2867,6 +2870,7 @@ invariant_g_offset(BtreeCheckState *state, BTScanInsert key, { int32 cmp; AttrNumber cmpcol = 1; + nbts_prep_ctx(NULL); Assert(key->pivotsearch); @@ -2906,6 +2910,7 @@ invariant_l_nontarget_offset(BtreeCheckState *state, 
BTScanInsert key, ItemId itemid; int32 cmp; AttrNumber cmpcol = 1; + nbts_prep_ctx(NULL); Assert(key->pivotsearch); @@ -3141,6 +3146,7 @@ static inline BTScanInsert bt_mkscankey_pivotsearch(Relation rel, IndexTuple itup) { BTScanInsert skey; + nbts_prep_ctx(NULL); skey = _bt_mkscankey(rel, itup); skey->pivotsearch = true; diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index 0f10141a2f..e9d0cf6ac1 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -1084,6 +1084,34 @@ that need a page split anyway. Besides, supporting variable "split points" while splitting posting lists won't actually improve overall space utilization. +Notes about nbtree specialization +--------------------------------- + +Attribute iteration is a significant overhead for multi-column indexes +with variable length attributes, due to our inability to cache the offset +of each attribute into an on-disk tuple. To combat this, we'd have to either +fully deserialize the tuple, or maintain our offset into the tuple as we +iterate over the tuple's fields. + +Keeping track of this offset also has a non-negligible overhead too, so we'd +prefer to not have to keep track of these offsets when we can use the cache. +By specializing performance-sensitive search functions for these specific +index tuple shapes and calling those selectively, we can keep the performance +of cacheable attribute offsets where that is applicable, while improving +performance where we currently would see O(n_atts^2) time iterating on +variable-length attributes. Additionally, we update the entry points +in the index AM to call the specialized functions, increasing the +performance of those hot paths. + +Optimized code paths exist for the following cases, in order of preference: + - multi-column indexes that could benefit from the attcacheoff optimization + NB: This is also the default path, and is comparatively slow for uncachable + attribute offsets. + +Future work will optimize for multi-column indexes that don't benefit +from the attcacheoff optimization by improving on the O(n^2) nature of +index_getattr through storing attribute offsets. + Notes About Data Representation ------------------------------- diff --git a/src/backend/access/nbtree/nbtdedup.c b/src/backend/access/nbtree/nbtdedup.c index d4db0b28f2..4589ade267 100644 --- a/src/backend/access/nbtree/nbtdedup.c +++ b/src/backend/access/nbtree/nbtdedup.c @@ -22,260 +22,14 @@ static void _bt_bottomupdel_finish_pending(Page page, BTDedupState state, TM_IndexDeleteOp *delstate); -static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, - OffsetNumber minoff, IndexTuple newitem); static void _bt_singleval_fillfactor(Page page, BTDedupState state, Size newitemsz); #ifdef USE_ASSERT_CHECKING static bool _bt_posting_valid(IndexTuple posting); #endif -/* - * Perform a deduplication pass. - * - * The general approach taken here is to perform as much deduplication as - * possible to free as much space as possible. Note, however, that "single - * value" strategy is used for !bottomupdedup callers when the page is full of - * tuples of a single value. Deduplication passes that apply the strategy - * will leave behind a few untouched tuples at the end of the page, preparing - * the page for an anticipated page split that uses nbtsplitloc.c's own single - * value strategy. Our high level goal is to delay merging the untouched - * tuples until after the page splits. 
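
The new *_spec.c files listed in the diffstat are not compiled standalone: each existing nbtree file defines NBT_SPECIALIZE_FILE to point at its companion _spec.c file and then includes access/nbtree_spec.h, which instantiates that code (see, for example, the nbtdedup.c hunk further down). This is the same template-include idea already used by lib/simplehash.h and lib/sort_template.h. The header itself is not shown in these hunks; the sketch below only illustrates the general shape of such a mechanism, and the "_default" suffix, the single instantiation, and the nbts_prep_ctx() expansion are assumptions made for illustration, not the patch's actual definitions.

/*
 * Rough sketch of a template-include header (illustration only; the real
 * access/nbtree_spec.h added by this patch may be laid out differently).
 */
#ifdef NBT_SPECIALIZE_FILE

/*
 * Instantiate the hot functions for one key shape.  The *_spec.c files wrap
 * their static helpers in NBTS_FUNCTION() so each instantiation gets
 * uniquely named copies; the suffix used here is an assumption.
 */
#define NBTS_FUNCTION(name) name##_default
#include NBT_SPECIALIZE_FILE
#undef NBTS_FUNCTION

/* Additional key shapes would be instantiated here by later patches. */

#undef NBT_SPECIALIZE_FILE
#endif

/*
 * nbts_prep_ctx() marks call sites that will eventually select a
 * specialization for the given index; with only one shape it can expand to
 * a no-op (again, an assumed expansion).
 */
#define nbts_prep_ctx(rel)	((void) (rel))

Under this sketch a call such as nbts_prep_ctx(rel) in _bt_bottomupdel_pass() costs nothing, but it gives later patches a single place to choose between per-shape variants of the hot functions.

The README text above motivates the specialization with the cost of index_getattr() on multi-column indexes that contain variable-width attributes. The following simplified illustration shows that cost; index_getattr() and the attcacheoff rules are real PostgreSQL behavior, while the helper function itself is hypothetical and not part of the patch.

/* Simplified illustration (requires access/itup.h); not PostgreSQL source. */
static void
fetch_all_key_atts(IndexTuple itup, TupleDesc tupdesc,
				   Datum *values, bool *isnull)
{
	for (int attno = 1; attno <= tupdesc->natts; attno++)
	{
		/*
		 * The attcacheoff fast path inside index_getattr() only applies
		 * when the tuple has no NULLs and every attribute before attno has
		 * a fixed width.  Past the first variable-width attribute, each
		 * call re-walks the tuple from the start to locate attno, so this
		 * loop performs on the order of 1 + 2 + ... + n steps.
		 */
		values[attno - 1] = index_getattr(itup, attno, tupdesc,
										  &isnull[attno - 1]);
	}
}

Specializing the comparison and search routines per key shape lets the cacheable case keep its fast path, while leaving room for a future shape that tracks offsets incrementally instead of re-walking the tuple, as the README notes.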
- * - * When a call to _bt_bottomupdel_pass() just took place (and failed), our - * high level goal is to prevent a page split entirely by buying more time. - * We still hope that a page split can be avoided altogether. That's why - * single value strategy is not even considered for bottomupdedup callers. - * - * The page will have to be split if we cannot successfully free at least - * newitemsz (we also need space for newitem's line pointer, which isn't - * included in caller's newitemsz). - * - * Note: Caller should have already deleted all existing items with their - * LP_DEAD bits set. - */ -void -_bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, - bool bottomupdedup) -{ - OffsetNumber offnum, - minoff, - maxoff; - Page page = BufferGetPage(buf); - BTPageOpaque opaque = BTPageGetOpaque(page); - Page newpage; - BTDedupState state; - Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0; - bool singlevalstrat = false; - int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - - /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ - newitemsz += sizeof(ItemIdData); - - /* - * Initialize deduplication state. - * - * It would be possible for maxpostingsize (limit on posting list tuple - * size) to be set to one third of the page. However, it seems like a - * good idea to limit the size of posting lists to one sixth of a page. - * That ought to leave us with a good split point when pages full of - * duplicates can be split several times. - */ - state = (BTDedupState) palloc(sizeof(BTDedupStateData)); - state->deduplicate = true; - state->nmaxitems = 0; - state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); - /* Metadata about base tuple of current pending posting list */ - state->base = NULL; - state->baseoff = InvalidOffsetNumber; - state->basetupsize = 0; - /* Metadata about current pending posting list TIDs */ - state->htids = palloc(state->maxpostingsize); - state->nhtids = 0; - state->nitems = 0; - /* Size of all physical tuples to be replaced by pending posting list */ - state->phystupsize = 0; - /* nintervals should be initialized to zero */ - state->nintervals = 0; - - minoff = P_FIRSTDATAKEY(opaque); - maxoff = PageGetMaxOffsetNumber(page); - - /* - * Consider applying "single value" strategy, though only if the page - * seems likely to be split in the near future - */ - if (!bottomupdedup) - singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem); - - /* - * Deduplicate items from page, and write them to newpage. - * - * Copy the original page's LSN into newpage copy. This will become the - * updated version of the page. We need this because XLogInsert will - * examine the LSN and possibly dump it in a page image. 
- */ - newpage = PageGetTempPageCopySpecial(page); - PageSetLSN(newpage, PageGetLSN(page)); - - /* Copy high key, if any */ - if (!P_RIGHTMOST(opaque)) - { - ItemId hitemid = PageGetItemId(page, P_HIKEY); - Size hitemsz = ItemIdGetLength(hitemid); - IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid); - - if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY, - false, false) == InvalidOffsetNumber) - elog(ERROR, "deduplication failed to add highkey"); - } - - for (offnum = minoff; - offnum <= maxoff; - offnum = OffsetNumberNext(offnum)) - { - ItemId itemid = PageGetItemId(page, offnum); - IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); - - Assert(!ItemIdIsDead(itemid)); - - if (offnum == minoff) - { - /* - * No previous/base tuple for the data item -- use the data item - * as base tuple of pending posting list - */ - _bt_dedup_start_pending(state, itup, offnum); - } - else if (state->deduplicate && - _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && - _bt_dedup_save_htid(state, itup)) - { - /* - * Tuple is equal to base tuple of pending posting list. Heap - * TID(s) for itup have been saved in state. - */ - } - else - { - /* - * Tuple is not equal to pending posting list tuple, or - * _bt_dedup_save_htid() opted to not merge current item into - * pending posting list for some other reason (e.g., adding more - * TIDs would have caused posting list to exceed current - * maxpostingsize). - * - * If state contains pending posting list with more than one item, - * form new posting tuple and add it to our temp page (newpage). - * Else add pending interval's base tuple to the temp page as-is. - */ - pagesaving += _bt_dedup_finish_pending(newpage, state); - - if (singlevalstrat) - { - /* - * Single value strategy's extra steps. - * - * Lower maxpostingsize for sixth and final large posting list - * tuple at the point where 5 maxpostingsize-capped tuples - * have either been formed or observed. - * - * When a sixth maxpostingsize-capped item is formed/observed, - * stop merging together tuples altogether. The few tuples - * that remain at the end of the page won't be merged together - * at all (at least not until after a future page split takes - * place, when this page's newly allocated right sibling page - * gets its first deduplication pass). - */ - if (state->nmaxitems == 5) - _bt_singleval_fillfactor(page, state, newitemsz); - else if (state->nmaxitems == 6) - { - state->deduplicate = false; - singlevalstrat = false; /* won't be back here */ - } - } - - /* itup starts new pending posting list */ - _bt_dedup_start_pending(state, itup, offnum); - } - } - - /* Handle the last item */ - pagesaving += _bt_dedup_finish_pending(newpage, state); - - /* - * If no items suitable for deduplication were found, newpage must be - * exactly the same as the original page, so just return from function. - * - * We could determine whether or not to proceed on the basis the space - * savings being sufficient to avoid an immediate page split instead. We - * don't do that because there is some small value in nbtsplitloc.c always - * operating against a page that is fully deduplicated (apart from - * newitem). Besides, most of the cost has already been paid. - */ - if (state->nintervals == 0) - { - /* cannot leak memory here */ - pfree(newpage); - pfree(state->htids); - pfree(state); - return; - } - - /* - * By here, it's clear that deduplication will definitely go ahead. - * - * Clear the BTP_HAS_GARBAGE page flag. 
The index must be a heapkeyspace - * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway. - * But keep things tidy. - */ - if (P_HAS_GARBAGE(opaque)) - { - BTPageOpaque nopaque = BTPageGetOpaque(newpage); - - nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; - } - - START_CRIT_SECTION(); - - PageRestoreTempPage(newpage, page); - MarkBufferDirty(buf); - - /* XLOG stuff */ - if (RelationNeedsWAL(rel)) - { - XLogRecPtr recptr; - xl_btree_dedup xlrec_dedup; - - xlrec_dedup.nintervals = state->nintervals; - - XLogBeginInsert(); - XLogRegisterBuffer(0, buf, REGBUF_STANDARD); - XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); - - /* - * The intervals array is not in the buffer, but pretend that it is. - * When XLogInsert stores the whole buffer, the array need not be - * stored too. - */ - XLogRegisterBufData(0, (char *) state->intervals, - state->nintervals * sizeof(BTDedupInterval)); - - recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP); - - PageSetLSN(page, recptr); - } - - END_CRIT_SECTION(); - - /* Local space accounting should agree with page accounting */ - Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz); - - /* cannot leak memory here */ - pfree(state->htids); - pfree(state); -} +#define NBT_SPECIALIZE_FILE "../../backend/access/nbtree/nbtdedup_spec.c" +#include "access/nbtree_spec.h" /* * Perform bottom-up index deletion pass. @@ -316,6 +70,7 @@ _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, TM_IndexDeleteOp delstate; bool neverdedup; int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + nbts_prep_ctx(rel); /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ newitemsz += sizeof(ItemIdData); @@ -752,55 +507,6 @@ _bt_bottomupdel_finish_pending(Page page, BTDedupState state, state->phystupsize = 0; } -/* - * Determine if page non-pivot tuples (data items) are all duplicates of the - * same value -- if they are, deduplication's "single value" strategy should - * be applied. The general goal of this strategy is to ensure that - * nbtsplitloc.c (which uses its own single value strategy) will find a useful - * split point as further duplicates are inserted, and successive rightmost - * page splits occur among pages that store the same duplicate value. When - * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full, - * just like it would if deduplication were disabled. - * - * We expect that affected workloads will require _several_ single value - * strategy deduplication passes (over a page that only stores duplicates) - * before the page is finally split. The first deduplication pass should only - * find regular non-pivot tuples. Later deduplication passes will find - * existing maxpostingsize-capped posting list tuples, which must be skipped - * over. The penultimate pass is generally the first pass that actually - * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a - * few untouched non-pivot tuples. The final deduplication pass won't free - * any space -- it will skip over everything without merging anything (it - * retraces the steps of the penultimate pass). - * - * Fortunately, having several passes isn't too expensive. Each pass (after - * the first pass) won't spend many cycles on the large posting list tuples - * left by previous passes. Each pass will find a large contiguous group of - * smaller duplicate tuples to merge together at the end of the page. 
- */ -static bool -_bt_do_singleval(Relation rel, Page page, BTDedupState state, - OffsetNumber minoff, IndexTuple newitem) -{ - int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - ItemId itemid; - IndexTuple itup; - - itemid = PageGetItemId(page, minoff); - itup = (IndexTuple) PageGetItem(page, itemid); - - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) - { - itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); - itup = (IndexTuple) PageGetItem(page, itemid); - - if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) - return true; - } - - return false; -} - /* * Lower maxpostingsize when using "single value" strategy, to avoid a sixth * and final maxpostingsize-capped tuple. The sixth and final posting list diff --git a/src/backend/access/nbtree/nbtdedup_spec.c b/src/backend/access/nbtree/nbtdedup_spec.c new file mode 100644 index 0000000000..4b280de980 --- /dev/null +++ b/src/backend/access/nbtree/nbtdedup_spec.c @@ -0,0 +1,317 @@ +/*------------------------------------------------------------------------- + * + * nbtdedup_spec.c + * Index shape-specialized functions for nbtdedup.c + * + * NOTES + * See also: access/nbtree/README section "nbtree specialization" + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtdedup_spec.c + * + *------------------------------------------------------------------------- + */ + +#define _bt_do_singleval NBTS_FUNCTION(_bt_do_singleval) + +static bool _bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem); + +/* + * Perform a deduplication pass. + * + * The general approach taken here is to perform as much deduplication as + * possible to free as much space as possible. Note, however, that "single + * value" strategy is used for !bottomupdedup callers when the page is full of + * tuples of a single value. Deduplication passes that apply the strategy + * will leave behind a few untouched tuples at the end of the page, preparing + * the page for an anticipated page split that uses nbtsplitloc.c's own single + * value strategy. Our high level goal is to delay merging the untouched + * tuples until after the page splits. + * + * When a call to _bt_bottomupdel_pass() just took place (and failed), our + * high level goal is to prevent a page split entirely by buying more time. + * We still hope that a page split can be avoided altogether. That's why + * single value strategy is not even considered for bottomupdedup callers. + * + * The page will have to be split if we cannot successfully free at least + * newitemsz (we also need space for newitem's line pointer, which isn't + * included in caller's newitemsz). + * + * Note: Caller should have already deleted all existing items with their + * LP_DEAD bits set. + */ +void +_bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, Size newitemsz, + bool bottomupdedup) +{ + OffsetNumber offnum, + minoff, + maxoff; + Page page = BufferGetPage(buf); + BTPageOpaque opaque = BTPageGetOpaque(page); + Page newpage; + BTDedupState state; + Size pagesaving PG_USED_FOR_ASSERTS_ONLY = 0; + bool singlevalstrat = false; + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + + /* Passed-in newitemsz is MAXALIGNED but does not include line pointer */ + newitemsz += sizeof(ItemIdData); + + /* + * Initialize deduplication state. 
+ * + * It would be possible for maxpostingsize (limit on posting list tuple + * size) to be set to one third of the page. However, it seems like a + * good idea to limit the size of posting lists to one sixth of a page. + * That ought to leave us with a good split point when pages full of + * duplicates can be split several times. + */ + state = (BTDedupState) palloc(sizeof(BTDedupStateData)); + state->deduplicate = true; + state->nmaxitems = 0; + state->maxpostingsize = Min(BTMaxItemSize(page) / 2, INDEX_SIZE_MASK); + /* Metadata about base tuple of current pending posting list */ + state->base = NULL; + state->baseoff = InvalidOffsetNumber; + state->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + state->htids = palloc(state->maxpostingsize); + state->nhtids = 0; + state->nitems = 0; + /* Size of all physical tuples to be replaced by pending posting list */ + state->phystupsize = 0; + /* nintervals should be initialized to zero */ + state->nintervals = 0; + + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * Consider applying "single value" strategy, though only if the page + * seems likely to be split in the near future + */ + if (!bottomupdedup) + singlevalstrat = _bt_do_singleval(rel, page, state, minoff, newitem); + + /* + * Deduplicate items from page, and write them to newpage. + * + * Copy the original page's LSN into newpage copy. This will become the + * updated version of the page. We need this because XLogInsert will + * examine the LSN and possibly dump it in a page image. + */ + newpage = PageGetTempPageCopySpecial(page); + PageSetLSN(newpage, PageGetLSN(page)); + + /* Copy high key, if any */ + if (!P_RIGHTMOST(opaque)) + { + ItemId hitemid = PageGetItemId(page, P_HIKEY); + Size hitemsz = ItemIdGetLength(hitemid); + IndexTuple hitem = (IndexTuple) PageGetItem(page, hitemid); + + if (PageAddItem(newpage, (Item) hitem, hitemsz, P_HIKEY, + false, false) == InvalidOffsetNumber) + elog(ERROR, "deduplication failed to add highkey"); + } + + for (offnum = minoff; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + IndexTuple itup = (IndexTuple) PageGetItem(page, itemid); + + Assert(!ItemIdIsDead(itemid)); + + if (offnum == minoff) + { + /* + * No previous/base tuple for the data item -- use the data item + * as base tuple of pending posting list + */ + _bt_dedup_start_pending(state, itup, offnum); + } + else if (state->deduplicate && + _bt_keep_natts_fast(rel, state->base, itup) > nkeyatts && + _bt_dedup_save_htid(state, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID(s) for itup have been saved in state. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list for some other reason (e.g., adding more + * TIDs would have caused posting list to exceed current + * maxpostingsize). + * + * If state contains pending posting list with more than one item, + * form new posting tuple and add it to our temp page (newpage). + * Else add pending interval's base tuple to the temp page as-is. + */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + + if (singlevalstrat) + { + /* + * Single value strategy's extra steps. + * + * Lower maxpostingsize for sixth and final large posting list + * tuple at the point where 5 maxpostingsize-capped tuples + * have either been formed or observed. 
+ * + * When a sixth maxpostingsize-capped item is formed/observed, + * stop merging together tuples altogether. The few tuples + * that remain at the end of the page won't be merged together + * at all (at least not until after a future page split takes + * place, when this page's newly allocated right sibling page + * gets its first deduplication pass). + */ + if (state->nmaxitems == 5) + _bt_singleval_fillfactor(page, state, newitemsz); + else if (state->nmaxitems == 6) + { + state->deduplicate = false; + singlevalstrat = false; /* won't be back here */ + } + } + + /* itup starts new pending posting list */ + _bt_dedup_start_pending(state, itup, offnum); + } + } + + /* Handle the last item */ + pagesaving += _bt_dedup_finish_pending(newpage, state); + + /* + * If no items suitable for deduplication were found, newpage must be + * exactly the same as the original page, so just return from function. + * + * We could determine whether or not to proceed on the basis the space + * savings being sufficient to avoid an immediate page split instead. We + * don't do that because there is some small value in nbtsplitloc.c always + * operating against a page that is fully deduplicated (apart from + * newitem). Besides, most of the cost has already been paid. + */ + if (state->nintervals == 0) + { + /* cannot leak memory here */ + pfree(newpage); + pfree(state->htids); + pfree(state); + return; + } + + /* + * By here, it's clear that deduplication will definitely go ahead. + * + * Clear the BTP_HAS_GARBAGE page flag. The index must be a heapkeyspace + * index, and as such we'll never pay attention to BTP_HAS_GARBAGE anyway. + * But keep things tidy. + */ + if (P_HAS_GARBAGE(opaque)) + { + BTPageOpaque nopaque = BTPageGetOpaque(newpage); + + nopaque->btpo_flags &= ~BTP_HAS_GARBAGE; + } + + START_CRIT_SECTION(); + + PageRestoreTempPage(newpage, page); + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_btree_dedup xlrec_dedup; + + xlrec_dedup.nintervals = state->nintervals; + + XLogBeginInsert(); + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterData((char *) &xlrec_dedup, SizeOfBtreeDedup); + + /* + * The intervals array is not in the buffer, but pretend that it is. + * When XLogInsert stores the whole buffer, the array need not be + * stored too. + */ + XLogRegisterBufData(0, (char *) state->intervals, + state->nintervals * sizeof(BTDedupInterval)); + + recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DEDUP); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + /* Local space accounting should agree with page accounting */ + Assert(pagesaving < newitemsz || PageGetExactFreeSpace(page) >= newitemsz); + + /* cannot leak memory here */ + pfree(state->htids); + pfree(state); +} + +/* + * Determine if page non-pivot tuples (data items) are all duplicates of the + * same value -- if they are, deduplication's "single value" strategy should + * be applied. The general goal of this strategy is to ensure that + * nbtsplitloc.c (which uses its own single value strategy) will find a useful + * split point as further duplicates are inserted, and successive rightmost + * page splits occur among pages that store the same duplicate value. When + * the page finally splits, it should end up BTREE_SINGLEVAL_FILLFACTOR% full, + * just like it would if deduplication were disabled. 
+ * + * We expect that affected workloads will require _several_ single value + * strategy deduplication passes (over a page that only stores duplicates) + * before the page is finally split. The first deduplication pass should only + * find regular non-pivot tuples. Later deduplication passes will find + * existing maxpostingsize-capped posting list tuples, which must be skipped + * over. The penultimate pass is generally the first pass that actually + * reaches _bt_singleval_fillfactor(), and so will deliberately leave behind a + * few untouched non-pivot tuples. The final deduplication pass won't free + * any space -- it will skip over everything without merging anything (it + * retraces the steps of the penultimate pass). + * + * Fortunately, having several passes isn't too expensive. Each pass (after + * the first pass) won't spend many cycles on the large posting list tuples + * left by previous passes. Each pass will find a large contiguous group of + * smaller duplicate tuples to merge together at the end of the page. + */ +static bool +_bt_do_singleval(Relation rel, Page page, BTDedupState state, + OffsetNumber minoff, IndexTuple newitem) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + ItemId itemid; + IndexTuple itup; + + itemid = PageGetItemId(page, minoff); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + { + itemid = PageGetItemId(page, PageGetMaxOffsetNumber(page)); + itup = (IndexTuple) PageGetItem(page, itemid); + + if (_bt_keep_natts_fast(rel, newitem, itup) > nkeyatts) + return true; + } + + return false; +} diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 39e7e9b731..3607bd418e 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -30,28 +30,16 @@ #define BTREE_FASTPATH_MIN_LEVEL 2 -static BTStack _bt_search_insert(Relation rel, Relation heaprel, - BTInsertState insertstate); static TransactionId _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, IndexUniqueCheck checkUnique, bool *is_unique, uint32 *speculativeToken); -static OffsetNumber _bt_findinsertloc(Relation rel, - BTInsertState insertstate, - bool checkingunique, - bool indexUnchanged, - BTStack stack, - Relation heapRel); static void _bt_stepright(Relation rel, Relation heaprel, BTInsertState insertstate, BTStack stack); -static void _bt_insertonpg(Relation rel, Relation heaprel, BTScanInsert itup_key, - Buffer buf, - Buffer cbuf, - BTStack stack, - IndexTuple itup, - Size itemsz, - OffsetNumber newitemoff, - int postingoff, +static void _bt_insertonpg(Relation rel, Relation heaprel, + BTScanInsert itup_key, Buffer buf, Buffer cbuf, + BTStack stack, IndexTuple itup, Size itemsz, + OffsetNumber newitemoff, int postingoff, bool split_only_page); static Buffer _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, Buffer cbuf, OffsetNumber newitemoff, @@ -75,313 +63,8 @@ static BlockNumber *_bt_deadblocks(Page page, OffsetNumber *deletable, int *nblocks); static inline int _bt_blk_cmp(const void *arg1, const void *arg2); -/* - * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. - * - * This routine is called by the public interface routine, btinsert. - * By here, itup is filled in, including the TID. - * - * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this - * will allow duplicates. 
Otherwise (UNIQUE_CHECK_YES or - * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate. - * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and - * don't actually insert. - * - * indexUnchanged executor hint indicates if itup is from an - * UPDATE that didn't logically change the indexed value, but - * must nevertheless have a new entry to point to a successor - * version. - * - * The result value is only significant for UNIQUE_CHECK_PARTIAL: - * it must be true if the entry is known unique, else false. - * (In the current implementation we'll also return true after a - * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but - * that's just a coding artifact.) - */ -bool -_bt_doinsert(Relation rel, IndexTuple itup, - IndexUniqueCheck checkUnique, bool indexUnchanged, - Relation heapRel) -{ - bool is_unique = false; - BTInsertStateData insertstate; - BTScanInsert itup_key; - BTStack stack; - bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); - - /* we need an insertion scan key to do our search, so build one */ - itup_key = _bt_mkscankey(rel, itup); - - if (checkingunique) - { - if (!itup_key->anynullkeys) - { - /* No (heapkeyspace) scantid until uniqueness established */ - itup_key->scantid = NULL; - } - else - { - /* - * Scan key for new tuple contains NULL key values. Bypass - * checkingunique steps. They are unnecessary because core code - * considers NULL unequal to every value, including NULL. - * - * This optimization avoids O(N^2) behavior within the - * _bt_findinsertloc() heapkeyspace path when a unique index has a - * large number of "duplicates" with NULL key values. - */ - checkingunique = false; - /* Tuple is unique in the sense that core code cares about */ - Assert(checkUnique != UNIQUE_CHECK_EXISTING); - is_unique = true; - } - } - - /* - * Fill in the BTInsertState working area, to track the current page and - * position within the page to insert on. - * - * Note that itemsz is passed down to lower level code that deals with - * inserting the item. It must be MAXALIGN()'d. This ensures that space - * accounting code consistently considers the alignment overhead that we - * expect PageAddItem() will add later. (Actually, index_form_tuple() is - * already conservative about alignment, but we don't rely on that from - * this distance. Besides, preserving the "true" tuple size in index - * tuple headers for the benefit of nbtsplitloc.c might happen someday. - * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.) - */ - insertstate.itup = itup; - insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); - insertstate.itup_key = itup_key; - insertstate.bounds_valid = false; - insertstate.buf = InvalidBuffer; - insertstate.postingoff = 0; - -search: - - /* - * Find and lock the leaf page that the tuple should be added to by - * searching from the root page. insertstate.buf will hold a buffer that - * is locked in exclusive mode afterwards. - */ - stack = _bt_search_insert(rel, heapRel, &insertstate); - - /* - * checkingunique inserts are not allowed to go ahead when two tuples with - * equal key attribute values would be visible to new MVCC snapshots once - * the xact commits. Check for conflicts in the locked page/buffer (if - * needed) here. - * - * It might be necessary to check a page to the right in _bt_check_unique, - * though that should be very rare. In practice the first page the value - * could be on (with scantid omitted) is almost always also the only page - * that a matching tuple might be found on. 
This is due to the behavior - * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can - * only be allowed to cross a page boundary when there is no candidate - * leaf page split point that avoids it. Also, _bt_check_unique can use - * the leaf page high key to determine that there will be no duplicates on - * the right sibling without actually visiting it (it uses the high key in - * cases where the new item happens to belong at the far right of the leaf - * page). - * - * NOTE: obviously, _bt_check_unique can only detect keys that are already - * in the index; so it cannot defend against concurrent insertions of the - * same key. We protect against that by means of holding a write lock on - * the first page the value could be on, with omitted/-inf value for the - * implicit heap TID tiebreaker attribute. Any other would-be inserter of - * the same key must acquire a write lock on the same page, so only one - * would-be inserter can be making the check at one time. Furthermore, - * once we are past the check we hold write locks continuously until we - * have performed our insertion, so no later inserter can fail to see our - * insertion. (This requires some care in _bt_findinsertloc.) - * - * If we must wait for another xact, we release the lock while waiting, - * and then must perform a new search. - * - * For a partial uniqueness check, we don't wait for the other xact. Just - * let the tuple in and return false for possibly non-unique, or true for - * definitely unique. - */ - if (checkingunique) - { - TransactionId xwait; - uint32 speculativeToken; - - xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique, - &is_unique, &speculativeToken); - - if (unlikely(TransactionIdIsValid(xwait))) - { - /* Have to wait for the other guy ... */ - _bt_relbuf(rel, insertstate.buf); - insertstate.buf = InvalidBuffer; - - /* - * If it's a speculative insertion, wait for it to finish (ie. to - * go ahead with the insertion, or kill the tuple). Otherwise - * wait for the transaction to finish as usual. - */ - if (speculativeToken) - SpeculativeInsertionWait(xwait, speculativeToken); - else - XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); - - /* start over... */ - if (stack) - _bt_freestack(stack); - goto search; - } - - /* Uniqueness is established -- restore heap tid as scantid */ - if (itup_key->heapkeyspace) - itup_key->scantid = &itup->t_tid; - } - - if (checkUnique != UNIQUE_CHECK_EXISTING) - { - OffsetNumber newitemoff; - - /* - * The only conflict predicate locking cares about for indexes is when - * an index tuple insert conflicts with an existing lock. We don't - * know the actual page we're going to insert on for sure just yet in - * checkingunique and !heapkeyspace cases, but it's okay to use the - * first page the value could be on (with scantid omitted) instead. - */ - CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf)); - - /* - * Do the insertion. Note that insertstate contains cached binary - * search bounds established within _bt_check_unique when insertion is - * checkingunique. 
- */ - newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, - indexUnchanged, stack, heapRel); - _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer, - stack, itup, insertstate.itemsz, newitemoff, - insertstate.postingoff, false); - } - else - { - /* just release the buffer */ - _bt_relbuf(rel, insertstate.buf); - } - - /* be tidy */ - if (stack) - _bt_freestack(stack); - pfree(itup_key); - - return is_unique; -} - -/* - * _bt_search_insert() -- _bt_search() wrapper for inserts - * - * Search the tree for a particular scankey, or more precisely for the first - * leaf page it could be on. Try to make use of the fastpath optimization's - * rightmost leaf page cache before actually searching the tree from the root - * page, though. - * - * Return value is a stack of parent-page pointers (though see notes about - * fastpath optimization and page splits below). insertstate->buf is set to - * the address of the leaf-page buffer, which is write-locked and pinned in - * all cases (if necessary by creating a new empty root page for caller). - * - * The fastpath optimization avoids most of the work of searching the tree - * repeatedly when a single backend inserts successive new tuples on the - * rightmost leaf page of an index. A backend cache of the rightmost leaf - * page is maintained within _bt_insertonpg(), and used here. The cache is - * invalidated here when an insert of a non-pivot tuple must take place on a - * non-rightmost leaf page. - * - * The optimization helps with indexes on an auto-incremented field. It also - * helps with indexes on datetime columns, as well as indexes with lots of - * NULL values. (NULLs usually get inserted in the rightmost page for single - * column indexes, since they usually get treated as coming after everything - * else in the key space. Individual NULL tuples will generally be placed on - * the rightmost leaf page due to the influence of the heap TID column.) - * - * Note that we avoid applying the optimization when there is insufficient - * space on the rightmost page to fit caller's new item. This is necessary - * because we'll need to return a real descent stack when a page split is - * expected (actually, caller can cope with a leaf page split that uses a NULL - * stack, but that's very slow and so must be avoided). Note also that the - * fastpath optimization acquires the lock on the page conditionally as a way - * of reducing extra contention when there are concurrent insertions into the - * rightmost page (we give up if we'd have to wait for the lock). We assume - * that it isn't useful to apply the optimization when there is contention, - * since each per-backend cache won't stay valid for long. - */ -static BTStack -_bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate) -{ - Assert(insertstate->buf == InvalidBuffer); - Assert(!insertstate->bounds_valid); - Assert(insertstate->postingoff == 0); - - if (RelationGetTargetBlock(rel) != InvalidBlockNumber) - { - /* Simulate a _bt_getbuf() call with conditional locking */ - insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); - if (_bt_conditionallockbuf(rel, insertstate->buf)) - { - Page page; - BTPageOpaque opaque; - AttrNumber cmpcol = 1; - - _bt_checkpage(rel, insertstate->buf); - page = BufferGetPage(insertstate->buf); - opaque = BTPageGetOpaque(page); - - /* - * Check if the page is still the rightmost leaf page and has - * enough free space to accommodate the new tuple. 
Also check - * that the insertion scan key is strictly greater than the first - * non-pivot tuple on the page. (Note that we expect itup_key's - * scantid to be unset when our caller is a checkingunique - * inserter.) - */ - if (P_RIGHTMOST(opaque) && - P_ISLEAF(opaque) && - !P_IGNORE(opaque) && - PageGetFreeSpace(page) > insertstate->itemsz && - PageGetMaxOffsetNumber(page) >= P_HIKEY && - _bt_compare(rel, insertstate->itup_key, page, P_HIKEY, - &cmpcol) > 0) - { - /* - * Caller can use the fastpath optimization because cached - * block is still rightmost leaf page, which can fit caller's - * new tuple without splitting. Keep block in local cache for - * next insert, and have caller use NULL stack. - * - * Note that _bt_insert_parent() has an assertion that catches - * leaf page splits that somehow follow from a fastpath insert - * (it should only be passed a NULL stack when it must deal - * with a concurrent root page split, and never because a NULL - * stack was returned here). - */ - return NULL; - } - - /* Page unsuitable for caller, drop lock and pin */ - _bt_relbuf(rel, insertstate->buf); - } - else - { - /* Lock unavailable, drop pin */ - ReleaseBuffer(insertstate->buf); - } - - /* Forget block, since cache doesn't appear to be useful */ - RelationSetTargetBlock(rel, InvalidBlockNumber); - } - - /* Cannot use optimization -- descend tree, return proper descent stack */ - return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf, - BT_WRITE, NULL); -} +#define NBT_SPECIALIZE_FILE "../../backend/access/nbtree/nbtinsert_spec.c" +#include "access/nbtree_spec.h" /* * _bt_check_unique() -- Check for violation of unique index constraint @@ -425,6 +108,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, bool inposting = false; bool prevalldead = true; int curposti = 0; + nbts_prep_ctx(rel); /* Assume unique until we find a duplicate */ *is_unique = true; @@ -776,253 +460,6 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel, return InvalidTransactionId; } - -/* - * _bt_findinsertloc() -- Finds an insert location for a tuple - * - * On entry, insertstate buffer contains the page the new tuple belongs - * on. It is exclusive-locked and pinned by the caller. - * - * If 'checkingunique' is true, the buffer on entry is the first page - * that contains duplicates of the new key. If there are duplicates on - * multiple pages, the correct insertion position might be some page to - * the right, rather than the first page. In that case, this function - * moves right to the correct target page. - * - * (In a !heapkeyspace index, there can be multiple pages with the same - * high key, where the new tuple could legitimately be placed on. In - * that case, the caller passes the first page containing duplicates, - * just like when checkingunique=true. If that page doesn't have enough - * room for the new tuple, this function moves right, trying to find a - * legal page that does.) - * - * If 'indexUnchanged' is true, this is for an UPDATE that didn't - * logically change the indexed value, but must nevertheless have a new - * entry to point to a successor version. This hint from the executor - * will influence our behavior when the page might have to be split and - * we must consider our options. Bottom-up index deletion can avoid - * pathological version-driven page splits, but we only want to go to the - * trouble of trying it when we already have moderate confidence that - * it's appropriate. 
The hint should not significantly affect our - * behavior over time unless practically all inserts on to the leaf page - * get the hint. - * - * On exit, insertstate buffer contains the chosen insertion page, and - * the offset within that page is returned. If _bt_findinsertloc needed - * to move right, the lock and pin on the original page are released, and - * the new buffer is exclusively locked and pinned instead. - * - * If insertstate contains cached binary search bounds, we will take - * advantage of them. This avoids repeating comparisons that we made in - * _bt_check_unique() already. - */ -static OffsetNumber -_bt_findinsertloc(Relation rel, - BTInsertState insertstate, - bool checkingunique, - bool indexUnchanged, - BTStack stack, - Relation heapRel) -{ - BTScanInsert itup_key = insertstate->itup_key; - Page page = BufferGetPage(insertstate->buf); - BTPageOpaque opaque; - OffsetNumber newitemoff; - - opaque = BTPageGetOpaque(page); - - /* Check 1/3 of a page restriction */ - if (unlikely(insertstate->itemsz > BTMaxItemSize(page))) - _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page, - insertstate->itup); - - Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque)); - Assert(!insertstate->bounds_valid || checkingunique); - Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL); - Assert(itup_key->heapkeyspace || itup_key->scantid == NULL); - Assert(!itup_key->allequalimage || itup_key->heapkeyspace); - - if (itup_key->heapkeyspace) - { - /* Keep track of whether checkingunique duplicate seen */ - bool uniquedup = indexUnchanged; - - /* - * If we're inserting into a unique index, we may have to walk right - * through leaf pages to find the one leaf page that we must insert on - * to. - * - * This is needed for checkingunique callers because a scantid was not - * used when we called _bt_search(). scantid can only be set after - * _bt_check_unique() has checked for duplicates. The buffer - * initially stored in insertstate->buf has the page where the first - * duplicate key might be found, which isn't always the page that new - * tuple belongs on. The heap TID attribute for new tuple (scantid) - * could force us to insert on a sibling page, though that should be - * very rare in practice. - */ - if (checkingunique) - { - if (insertstate->low < insertstate->stricthigh) - { - /* Encountered a duplicate in _bt_check_unique() */ - Assert(insertstate->bounds_valid); - uniquedup = true; - } - - for (;;) - { - AttrNumber cmpcol = 1; - - /* - * Does the new tuple belong on this page? - * - * The earlier _bt_check_unique() call may well have - * established a strict upper bound on the offset for the new - * item. If it's not the last item of the page (i.e. if there - * is at least one tuple on the page that goes after the tuple - * we're inserting) then we know that the tuple belongs on - * this page. We can skip the high key check. 
- */ - if (insertstate->bounds_valid && - insertstate->low <= insertstate->stricthigh && - insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) - break; - - /* Test '<=', not '!=', since scantid is set now */ - if (P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0) - break; - - _bt_stepright(rel, heapRel, insertstate, stack); - /* Update local state after stepping right */ - page = BufferGetPage(insertstate->buf); - opaque = BTPageGetOpaque(page); - /* Assume duplicates (if checkingunique) */ - uniquedup = true; - } - } - - /* - * If the target page cannot fit newitem, try to avoid splitting the - * page on insert by performing deletion or deduplication now - */ - if (PageGetFreeSpace(page) < insertstate->itemsz) - _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false, - checkingunique, uniquedup, - indexUnchanged); - } - else - { - /*---------- - * This is a !heapkeyspace (version 2 or 3) index. The current page - * is the first page that we could insert the new tuple to, but there - * may be other pages to the right that we could opt to use instead. - * - * If the new key is equal to one or more existing keys, we can - * legitimately place it anywhere in the series of equal keys. In - * fact, if the new key is equal to the page's "high key" we can place - * it on the next page. If it is equal to the high key, and there's - * not room to insert the new tuple on the current page without - * splitting, then we move right hoping to find more free space and - * avoid a split. - * - * Keep scanning right until we - * (a) find a page with enough free space, - * (b) reach the last page where the tuple can legally go, or - * (c) get tired of searching. - * (c) is not flippant; it is important because if there are many - * pages' worth of equal keys, it's better to split one of the early - * pages than to scan all the way to the end of the run of equal keys - * on every insert. We implement "get tired" as a random choice, - * since stopping after scanning a fixed number of pages wouldn't work - * well (we'd never reach the right-hand side of previously split - * pages). The probability of moving right is set at 0.99, which may - * seem too high to change the behavior much, but it does an excellent - * job of preventing O(N^2) behavior with many equal keys. - *---------- - */ - while (PageGetFreeSpace(page) < insertstate->itemsz) - { - AttrNumber cmpcol = 1; - - /* - * Before considering moving right, see if we can obtain enough - * space by erasing LP_DEAD items - */ - if (P_HAS_GARBAGE(opaque)) - { - /* Perform simple deletion */ - _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, - false, false, false); - - if (PageGetFreeSpace(page) >= insertstate->itemsz) - break; /* OK, now we have enough space */ - } - - /* - * Nope, so check conditions (b) and (c) enumerated above - * - * The earlier _bt_check_unique() call may well have established a - * strict upper bound on the offset for the new item. If it's not - * the last item of the page (i.e. if there is at least one tuple - * on the page that's greater than the tuple we're inserting to) - * then we know that the tuple belongs on this page. We can skip - * the high key check. 
- */ - if (insertstate->bounds_valid && - insertstate->low <= insertstate->stricthigh && - insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) - break; - - if (P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) != 0 || - pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100)) - break; - - _bt_stepright(rel, heapRel, insertstate, stack); - /* Update local state after stepping right */ - page = BufferGetPage(insertstate->buf); - opaque = BTPageGetOpaque(page); - } - } - - /* - * We should now be on the correct page. Find the offset within the page - * for the new tuple. (Possibly reusing earlier search bounds.) - */ - { - AttrNumber cmpcol PG_USED_FOR_ASSERTS_ONLY = 1; - Assert(P_RIGHTMOST(opaque) || - _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0); - } - - newitemoff = _bt_binsrch_insert(rel, insertstate, 1); - - if (insertstate->postingoff == -1) - { - /* - * There is an overlapping posting list tuple with its LP_DEAD bit - * set. We don't want to unnecessarily unset its LP_DEAD bit while - * performing a posting list split, so perform simple index tuple - * deletion early. - */ - _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, - false, false, false); - - /* - * Do new binary search. New insert location cannot overlap with any - * posting list now. - */ - Assert(!insertstate->bounds_valid); - insertstate->postingoff = 0; - newitemoff = _bt_binsrch_insert(rel, insertstate, 1); - Assert(insertstate->postingoff == 0); - } - - return newitemoff; -} - /* * Step right to next non-dead page, during insertion. * @@ -1506,6 +943,7 @@ _bt_split(Relation rel, Relation heaprel, BTScanInsert itup_key, Buffer buf, bool newitemonleft, isleaf, isrightmost; + nbts_prep_ctx(rel); /* * origpage is the original page to be split. leftpage is a temporary @@ -2706,6 +2144,7 @@ _bt_delete_or_dedup_one_page(Relation rel, Relation heapRel, BTScanInsert itup_key = insertstate->itup_key; Page page = BufferGetPage(buffer); BTPageOpaque opaque = BTPageGetOpaque(page); + nbts_prep_ctx(rel); Assert(P_ISLEAF(opaque)); Assert(simpleonly || itup_key->heapkeyspace); diff --git a/src/backend/access/nbtree/nbtinsert_spec.c b/src/backend/access/nbtree/nbtinsert_spec.c new file mode 100644 index 0000000000..6915f22839 --- /dev/null +++ b/src/backend/access/nbtree/nbtinsert_spec.c @@ -0,0 +1,584 @@ +/*------------------------------------------------------------------------- + * + * nbtinsert_spec.c + * Index shape-specialized functions for nbtinsert.c + * + * NOTES + * See also: access/nbtree/README section "nbtree specialization" + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtinsert_spec.c + * + *------------------------------------------------------------------------- + */ + +#define _bt_search_insert NBTS_FUNCTION(_bt_search_insert) +#define _bt_findinsertloc NBTS_FUNCTION(_bt_findinsertloc) + +static BTStack _bt_search_insert(Relation rel, Relation heaprel, + BTInsertState insertstate); +static OffsetNumber _bt_findinsertloc(Relation rel, + BTInsertState insertstate, + bool checkingunique, + bool indexUnchanged, + BTStack stack, + Relation heapRel); + + +/* + * _bt_doinsert() -- Handle insertion of a single index tuple in the tree. + * + * This routine is called by the public interface routine, btinsert. + * By here, itup is filled in, including the TID. 
+ * + * If checkUnique is UNIQUE_CHECK_NO or UNIQUE_CHECK_PARTIAL, this + * will allow duplicates. Otherwise (UNIQUE_CHECK_YES or + * UNIQUE_CHECK_EXISTING) it will throw error for a duplicate. + * For UNIQUE_CHECK_EXISTING we merely run the duplicate check, and + * don't actually insert. + * + * indexUnchanged executor hint indicates if itup is from an + * UPDATE that didn't logically change the indexed value, but + * must nevertheless have a new entry to point to a successor + * version. + * + * The result value is only significant for UNIQUE_CHECK_PARTIAL: + * it must be true if the entry is known unique, else false. + * (In the current implementation we'll also return true after a + * successful UNIQUE_CHECK_YES or UNIQUE_CHECK_EXISTING call, but + * that's just a coding artifact.) + */ +bool +_bt_doinsert(Relation rel, IndexTuple itup, + IndexUniqueCheck checkUnique, bool indexUnchanged, + Relation heapRel) +{ + bool is_unique = false; + BTInsertStateData insertstate; + BTScanInsert itup_key; + BTStack stack; + bool checkingunique = (checkUnique != UNIQUE_CHECK_NO); + + /* we need an insertion scan key to do our search, so build one */ + itup_key = _bt_mkscankey(rel, itup); + + if (checkingunique) + { + if (!itup_key->anynullkeys) + { + /* No (heapkeyspace) scantid until uniqueness established */ + itup_key->scantid = NULL; + } + else + { + /* + * Scan key for new tuple contains NULL key values. Bypass + * checkingunique steps. They are unnecessary because core code + * considers NULL unequal to every value, including NULL. + * + * This optimization avoids O(N^2) behavior within the + * _bt_findinsertloc() heapkeyspace path when a unique index has a + * large number of "duplicates" with NULL key values. + */ + checkingunique = false; + /* Tuple is unique in the sense that core code cares about */ + Assert(checkUnique != UNIQUE_CHECK_EXISTING); + is_unique = true; + } + } + + /* + * Fill in the BTInsertState working area, to track the current page and + * position within the page to insert on. + * + * Note that itemsz is passed down to lower level code that deals with + * inserting the item. It must be MAXALIGN()'d. This ensures that space + * accounting code consistently considers the alignment overhead that we + * expect PageAddItem() will add later. (Actually, index_form_tuple() is + * already conservative about alignment, but we don't rely on that from + * this distance. Besides, preserving the "true" tuple size in index + * tuple headers for the benefit of nbtsplitloc.c might happen someday. + * Note that heapam does not MAXALIGN() each heap tuple's lp_len field.) + */ + insertstate.itup = itup; + insertstate.itemsz = MAXALIGN(IndexTupleSize(itup)); + insertstate.itup_key = itup_key; + insertstate.bounds_valid = false; + insertstate.buf = InvalidBuffer; + insertstate.postingoff = 0; + + search: + + /* + * Find and lock the leaf page that the tuple should be added to by + * searching from the root page. insertstate.buf will hold a buffer that + * is locked in exclusive mode afterwards. + */ + stack = _bt_search_insert(rel, heapRel, &insertstate); + + /* + * checkingunique inserts are not allowed to go ahead when two tuples with + * equal key attribute values would be visible to new MVCC snapshots once + * the xact commits. Check for conflicts in the locked page/buffer (if + * needed) here. + * + * It might be necessary to check a page to the right in _bt_check_unique, + * though that should be very rare. 
In practice the first page the value + * could be on (with scantid omitted) is almost always also the only page + * that a matching tuple might be found on. This is due to the behavior + * of _bt_findsplitloc with duplicate tuples -- a group of duplicates can + * only be allowed to cross a page boundary when there is no candidate + * leaf page split point that avoids it. Also, _bt_check_unique can use + * the leaf page high key to determine that there will be no duplicates on + * the right sibling without actually visiting it (it uses the high key in + * cases where the new item happens to belong at the far right of the leaf + * page). + * + * NOTE: obviously, _bt_check_unique can only detect keys that are already + * in the index; so it cannot defend against concurrent insertions of the + * same key. We protect against that by means of holding a write lock on + * the first page the value could be on, with omitted/-inf value for the + * implicit heap TID tiebreaker attribute. Any other would-be inserter of + * the same key must acquire a write lock on the same page, so only one + * would-be inserter can be making the check at one time. Furthermore, + * once we are past the check we hold write locks continuously until we + * have performed our insertion, so no later inserter can fail to see our + * insertion. (This requires some care in _bt_findinsertloc.) + * + * If we must wait for another xact, we release the lock while waiting, + * and then must perform a new search. + * + * For a partial uniqueness check, we don't wait for the other xact. Just + * let the tuple in and return false for possibly non-unique, or true for + * definitely unique. + */ + if (checkingunique) + { + TransactionId xwait; + uint32 speculativeToken; + + xwait = _bt_check_unique(rel, &insertstate, heapRel, checkUnique, + &is_unique, &speculativeToken); + + if (unlikely(TransactionIdIsValid(xwait))) + { + /* Have to wait for the other guy ... */ + _bt_relbuf(rel, insertstate.buf); + insertstate.buf = InvalidBuffer; + + /* + * If it's a speculative insertion, wait for it to finish (ie. to + * go ahead with the insertion, or kill the tuple). Otherwise + * wait for the transaction to finish as usual. + */ + if (speculativeToken) + SpeculativeInsertionWait(xwait, speculativeToken); + else + XactLockTableWait(xwait, rel, &itup->t_tid, XLTW_InsertIndex); + + /* start over... */ + if (stack) + _bt_freestack(stack); + goto search; + } + + /* Uniqueness is established -- restore heap tid as scantid */ + if (itup_key->heapkeyspace) + itup_key->scantid = &itup->t_tid; + } + + if (checkUnique != UNIQUE_CHECK_EXISTING) + { + OffsetNumber newitemoff; + + /* + * The only conflict predicate locking cares about for indexes is when + * an index tuple insert conflicts with an existing lock. We don't + * know the actual page we're going to insert on for sure just yet in + * checkingunique and !heapkeyspace cases, but it's okay to use the + * first page the value could be on (with scantid omitted) instead. + */ + CheckForSerializableConflictIn(rel, NULL, BufferGetBlockNumber(insertstate.buf)); + + /* + * Do the insertion. Note that insertstate contains cached binary + * search bounds established within _bt_check_unique when insertion is + * checkingunique. 
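+		 *
+		 * (Illustrative note: with bounds_valid set, _bt_binsrch_insert()
+		 * can resume its binary search from the cached low/stricthigh
+		 * offsets instead of rescanning the whole leaf page.)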
+ */ + newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique, + indexUnchanged, stack, heapRel); + _bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer, + stack, itup, insertstate.itemsz, newitemoff, + insertstate.postingoff, false); + } + else + { + /* just release the buffer */ + _bt_relbuf(rel, insertstate.buf); + } + + /* be tidy */ + if (stack) + _bt_freestack(stack); + pfree(itup_key); + + return is_unique; +} + +/* + * _bt_search_insert() -- _bt_search() wrapper for inserts + * + * Search the tree for a particular scankey, or more precisely for the first + * leaf page it could be on. Try to make use of the fastpath optimization's + * rightmost leaf page cache before actually searching the tree from the root + * page, though. + * + * Return value is a stack of parent-page pointers (though see notes about + * fastpath optimization and page splits below). insertstate->buf is set to + * the address of the leaf-page buffer, which is write-locked and pinned in + * all cases (if necessary by creating a new empty root page for caller). + * + * The fastpath optimization avoids most of the work of searching the tree + * repeatedly when a single backend inserts successive new tuples on the + * rightmost leaf page of an index. A backend cache of the rightmost leaf + * page is maintained within _bt_insertonpg(), and used here. The cache is + * invalidated here when an insert of a non-pivot tuple must take place on a + * non-rightmost leaf page. + * + * The optimization helps with indexes on an auto-incremented field. It also + * helps with indexes on datetime columns, as well as indexes with lots of + * NULL values. (NULLs usually get inserted in the rightmost page for single + * column indexes, since they usually get treated as coming after everything + * else in the key space. Individual NULL tuples will generally be placed on + * the rightmost leaf page due to the influence of the heap TID column.) + * + * Note that we avoid applying the optimization when there is insufficient + * space on the rightmost page to fit caller's new item. This is necessary + * because we'll need to return a real descent stack when a page split is + * expected (actually, caller can cope with a leaf page split that uses a NULL + * stack, but that's very slow and so must be avoided). Note also that the + * fastpath optimization acquires the lock on the page conditionally as a way + * of reducing extra contention when there are concurrent insertions into the + * rightmost page (we give up if we'd have to wait for the lock). We assume + * that it isn't useful to apply the optimization when there is contention, + * since each per-backend cache won't stay valid for long. + */ +static BTStack +_bt_search_insert(Relation rel, Relation heaprel, BTInsertState insertstate) +{ + Assert(insertstate->buf == InvalidBuffer); + Assert(!insertstate->bounds_valid); + Assert(insertstate->postingoff == 0); + + if (RelationGetTargetBlock(rel) != InvalidBlockNumber) + { + /* Simulate a _bt_getbuf() call with conditional locking */ + insertstate->buf = ReadBuffer(rel, RelationGetTargetBlock(rel)); + if (_bt_conditionallockbuf(rel, insertstate->buf)) + { + Page page; + BTPageOpaque opaque; + AttrNumber cmpcol = 1; + + _bt_checkpage(rel, insertstate->buf); + page = BufferGetPage(insertstate->buf); + opaque = BTPageGetOpaque(page); + + /* + * Check if the page is still the rightmost leaf page and has + * enough free space to accommodate the new tuple. 
Also check + * that the insertion scan key is strictly greater than the first + * non-pivot tuple on the page. (Note that we expect itup_key's + * scantid to be unset when our caller is a checkingunique + * inserter.) + */ + if (P_RIGHTMOST(opaque) && + P_ISLEAF(opaque) && + !P_IGNORE(opaque) && + PageGetFreeSpace(page) > insertstate->itemsz && + PageGetMaxOffsetNumber(page) >= P_HIKEY && + _bt_compare(rel, insertstate->itup_key, page, P_HIKEY, + &cmpcol) > 0) + { + /* + * Caller can use the fastpath optimization because cached + * block is still rightmost leaf page, which can fit caller's + * new tuple without splitting. Keep block in local cache for + * next insert, and have caller use NULL stack. + * + * Note that _bt_insert_parent() has an assertion that catches + * leaf page splits that somehow follow from a fastpath insert + * (it should only be passed a NULL stack when it must deal + * with a concurrent root page split, and never because a NULL + * stack was returned here). + */ + return NULL; + } + + /* Page unsuitable for caller, drop lock and pin */ + _bt_relbuf(rel, insertstate->buf); + } + else + { + /* Lock unavailable, drop pin */ + ReleaseBuffer(insertstate->buf); + } + + /* Forget block, since cache doesn't appear to be useful */ + RelationSetTargetBlock(rel, InvalidBlockNumber); + } + + /* Cannot use optimization -- descend tree, return proper descent stack */ + return _bt_search(rel, heaprel, insertstate->itup_key, &insertstate->buf, + BT_WRITE, NULL); +} + +/* + * _bt_findinsertloc() -- Finds an insert location for a tuple + * + * On entry, insertstate buffer contains the page the new tuple belongs + * on. It is exclusive-locked and pinned by the caller. + * + * If 'checkingunique' is true, the buffer on entry is the first page + * that contains duplicates of the new key. If there are duplicates on + * multiple pages, the correct insertion position might be some page to + * the right, rather than the first page. In that case, this function + * moves right to the correct target page. + * + * (In a !heapkeyspace index, there can be multiple pages with the same + * high key, where the new tuple could legitimately be placed on. In + * that case, the caller passes the first page containing duplicates, + * just like when checkingunique=true. If that page doesn't have enough + * room for the new tuple, this function moves right, trying to find a + * legal page that does.) + * + * If 'indexUnchanged' is true, this is for an UPDATE that didn't + * logically change the indexed value, but must nevertheless have a new + * entry to point to a successor version. This hint from the executor + * will influence our behavior when the page might have to be split and + * we must consider our options. Bottom-up index deletion can avoid + * pathological version-driven page splits, but we only want to go to the + * trouble of trying it when we already have moderate confidence that + * it's appropriate. The hint should not significantly affect our + * behavior over time unless practically all inserts on to the leaf page + * get the hint. + * + * On exit, insertstate buffer contains the chosen insertion page, and + * the offset within that page is returned. If _bt_findinsertloc needed + * to move right, the lock and pin on the original page are released, and + * the new buffer is exclusively locked and pinned instead. + * + * If insertstate contains cached binary search bounds, we will take + * advantage of them. This avoids repeating comparisons that we made in + * _bt_check_unique() already. 
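+ *
+ * Typical caller flow, abridged from _bt_doinsert() above for
+ * illustration:
+ *
+ *		newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
+ *									   indexUnchanged, stack, heapRel);
+ *		_bt_insertonpg(rel, heapRel, itup_key, insertstate.buf, InvalidBuffer,
+ *					   stack, itup, insertstate.itemsz, newitemoff,
+ *					   insertstate.postingoff, false);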
+ */ +static OffsetNumber +_bt_findinsertloc(Relation rel, + BTInsertState insertstate, + bool checkingunique, + bool indexUnchanged, + BTStack stack, + Relation heapRel) +{ + BTScanInsert itup_key = insertstate->itup_key; + Page page = BufferGetPage(insertstate->buf); + BTPageOpaque opaque; + OffsetNumber newitemoff; + + opaque = BTPageGetOpaque(page); + + /* Check 1/3 of a page restriction */ + if (unlikely(insertstate->itemsz > BTMaxItemSize(page))) + _bt_check_third_page(rel, heapRel, itup_key->heapkeyspace, page, + insertstate->itup); + + Assert(P_ISLEAF(opaque) && !P_INCOMPLETE_SPLIT(opaque)); + Assert(!insertstate->bounds_valid || checkingunique); + Assert(!itup_key->heapkeyspace || itup_key->scantid != NULL); + Assert(itup_key->heapkeyspace || itup_key->scantid == NULL); + Assert(!itup_key->allequalimage || itup_key->heapkeyspace); + + if (itup_key->heapkeyspace) + { + /* Keep track of whether checkingunique duplicate seen */ + bool uniquedup = indexUnchanged; + + /* + * If we're inserting into a unique index, we may have to walk right + * through leaf pages to find the one leaf page that we must insert on + * to. + * + * This is needed for checkingunique callers because a scantid was not + * used when we called _bt_search(). scantid can only be set after + * _bt_check_unique() has checked for duplicates. The buffer + * initially stored in insertstate->buf has the page where the first + * duplicate key might be found, which isn't always the page that new + * tuple belongs on. The heap TID attribute for new tuple (scantid) + * could force us to insert on a sibling page, though that should be + * very rare in practice. + */ + if (checkingunique) + { + if (insertstate->low < insertstate->stricthigh) + { + /* Encountered a duplicate in _bt_check_unique() */ + Assert(insertstate->bounds_valid); + uniquedup = true; + } + + for (;;) + { + AttrNumber cmpcol = 1; + + /* + * Does the new tuple belong on this page? + * + * The earlier _bt_check_unique() call may well have + * established a strict upper bound on the offset for the new + * item. If it's not the last item of the page (i.e. if there + * is at least one tuple on the page that goes after the tuple + * we're inserting) then we know that the tuple belongs on + * this page. We can skip the high key check. + */ + if (insertstate->bounds_valid && + insertstate->low <= insertstate->stricthigh && + insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) + break; + + /* Test '<=', not '!=', since scantid is set now */ + if (P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0) + break; + + _bt_stepright(rel, heapRel, insertstate, stack); + /* Update local state after stepping right */ + page = BufferGetPage(insertstate->buf); + opaque = BTPageGetOpaque(page); + /* Assume duplicates (if checkingunique) */ + uniquedup = true; + } + } + + /* + * If the target page cannot fit newitem, try to avoid splitting the + * page on insert by performing deletion or deduplication now + */ + if (PageGetFreeSpace(page) < insertstate->itemsz) + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, false, + checkingunique, uniquedup, + indexUnchanged); + } + else + { + /*---------- + * This is a !heapkeyspace (version 2 or 3) index. The current page + * is the first page that we could insert the new tuple to, but there + * may be other pages to the right that we could opt to use instead. + * + * If the new key is equal to one or more existing keys, we can + * legitimately place it anywhere in the series of equal keys. 
In + * fact, if the new key is equal to the page's "high key" we can place + * it on the next page. If it is equal to the high key, and there's + * not room to insert the new tuple on the current page without + * splitting, then we move right hoping to find more free space and + * avoid a split. + * + * Keep scanning right until we + * (a) find a page with enough free space, + * (b) reach the last page where the tuple can legally go, or + * (c) get tired of searching. + * (c) is not flippant; it is important because if there are many + * pages' worth of equal keys, it's better to split one of the early + * pages than to scan all the way to the end of the run of equal keys + * on every insert. We implement "get tired" as a random choice, + * since stopping after scanning a fixed number of pages wouldn't work + * well (we'd never reach the right-hand side of previously split + * pages). The probability of moving right is set at 0.99, which may + * seem too high to change the behavior much, but it does an excellent + * job of preventing O(N^2) behavior with many equal keys. + *---------- + */ + while (PageGetFreeSpace(page) < insertstate->itemsz) + { + AttrNumber cmpcol = 1; + + /* + * Before considering moving right, see if we can obtain enough + * space by erasing LP_DEAD items + */ + if (P_HAS_GARBAGE(opaque)) + { + /* Perform simple deletion */ + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, + false, false, false); + + if (PageGetFreeSpace(page) >= insertstate->itemsz) + break; /* OK, now we have enough space */ + } + + /* + * Nope, so check conditions (b) and (c) enumerated above + * + * The earlier _bt_check_unique() call may well have established a + * strict upper bound on the offset for the new item. If it's not + * the last item of the page (i.e. if there is at least one tuple + * on the page that's greater than the tuple we're inserting to) + * then we know that the tuple belongs on this page. We can skip + * the high key check. + */ + if (insertstate->bounds_valid && + insertstate->low <= insertstate->stricthigh && + insertstate->stricthigh <= PageGetMaxOffsetNumber(page)) + break; + + if (P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) != 0 || + pg_prng_uint32(&pg_global_prng_state) <= (PG_UINT32_MAX / 100)) + break; + + _bt_stepright(rel, heapRel, insertstate, stack); + /* Update local state after stepping right */ + page = BufferGetPage(insertstate->buf); + opaque = BTPageGetOpaque(page); + } + } + + /* + * We should now be on the correct page. Find the offset within the page + * for the new tuple. (Possibly reusing earlier search bounds.) + */ + { + AttrNumber cmpcol PG_USED_FOR_ASSERTS_ONLY = 1; + Assert(P_RIGHTMOST(opaque) || + _bt_compare(rel, itup_key, page, P_HIKEY, &cmpcol) <= 0); + } + + newitemoff = _bt_binsrch_insert(rel, insertstate, 1); + + if (insertstate->postingoff == -1) + { + /* + * There is an overlapping posting list tuple with its LP_DEAD bit + * set. We don't want to unnecessarily unset its LP_DEAD bit while + * performing a posting list split, so perform simple index tuple + * deletion early. + */ + _bt_delete_or_dedup_one_page(rel, heapRel, insertstate, true, + false, false, false); + + /* + * Do new binary search. New insert location cannot overlap with any + * posting list now. 
+ */ + Assert(!insertstate->bounds_valid); + insertstate->postingoff = 0; + newitemoff = _bt_binsrch_insert(rel, insertstate, 1); + Assert(insertstate->postingoff == 0); + } + + return newitemoff; +} diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 6558aea42b..7e8e4409c1 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -1810,6 +1810,7 @@ _bt_pagedel(Relation rel, Buffer leafbuf, BTVacState *vstate) bool rightsib_empty; Page page; BTPageOpaque opaque; + nbts_prep_ctx(rel); /* * Save original leafbuf block number from caller. Only deleted blocks diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 62bc9917f1..58f2fdba18 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -87,6 +87,8 @@ static BTVacuumPosting btreevacuumposting(BTVacState *vstate, OffsetNumber updatedoffset, int *nremaining); +#define NBT_SPECIALIZE_FILE "../../backend/access/nbtree/nbtree_spec.c" +#include "access/nbtree_spec.h" /* * Btree handler function: return IndexAmRoutine with access method parameters @@ -121,7 +123,7 @@ bthandler(PG_FUNCTION_ARGS) amroutine->ambuild = btbuild; amroutine->ambuildempty = btbuildempty; - amroutine->aminsert = btinsert; + amroutine->aminsert = btinsert_default; amroutine->ambulkdelete = btbulkdelete; amroutine->amvacuumcleanup = btvacuumcleanup; amroutine->amcanreturn = btcanreturn; @@ -155,6 +157,8 @@ btbuildempty(Relation index) Buffer metabuf; Page metapage; + nbt_opt_specialize(index); + /* * Initalize the metapage. * @@ -180,33 +184,6 @@ btbuildempty(Relation index) ReleaseBuffer(metabuf); } -/* - * btinsert() -- insert an index tuple into a btree. - * - * Descend the tree recursively, find the appropriate location for our - * new tuple, and put it there. - */ -bool -btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, - IndexUniqueCheck checkUnique, - bool indexUnchanged, - IndexInfo *indexInfo) -{ - bool result; - IndexTuple itup; - - /* generate an index tuple */ - itup = index_form_tuple(RelationGetDescr(rel), values, isnull); - itup->t_tid = *ht_ctid; - - result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel); - - pfree(itup); - - return result; -} - /* * btgettuple() -- Get the next tuple in the scan. 
*/ @@ -348,6 +325,8 @@ btbeginscan(Relation rel, int nkeys, int norderbys) IndexScanDesc scan; BTScanOpaque so; + nbt_opt_specialize(rel); + /* no order by operators allowed */ Assert(norderbys == 0); @@ -791,6 +770,8 @@ btbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Relation rel = info->index; BTCycleId cycleid; + nbt_opt_specialize(rel); + /* allocate stats if first time through, else re-use existing struct */ if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); diff --git a/src/backend/access/nbtree/nbtree_spec.c b/src/backend/access/nbtree/nbtree_spec.c new file mode 100644 index 0000000000..6b766581ab --- /dev/null +++ b/src/backend/access/nbtree/nbtree_spec.c @@ -0,0 +1,69 @@ +/*------------------------------------------------------------------------- + * + * nbtree_spec.c + * Index shape-specialized functions for nbtree.c + * + * NOTES + * See also: access/nbtree/README section "nbtree specialization" + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtree_spec.c + * + *------------------------------------------------------------------------- + */ + + +/* + * _bt_specialize() -- Specialize this index relation for its index key. + */ +void +_bt_specialize(Relation rel) +{ +#ifdef NBTS_SPECIALIZING_DEFAULT + NBTS_MAKE_CTX(rel); + /* + * We can't directly address _bt_specialize here because it'd be macro- + * expanded, nor can we utilize NBTS_SPECIALIZE_NAME here because it'd + * try to call _bt_specialize, which would be an infinite recursive call. + */ + switch (__nbts_ctx) { + case NBTS_CTX_CACHED: + _bt_specialize_cached(rel); + break; + case NBTS_CTX_DEFAULT: + break; + } +#else + rel->rd_indam->aminsert = btinsert; +#endif +} + +/* + * btinsert() -- insert an index tuple into a btree. + * + * Descend the tree recursively, find the appropriate location for our + * new tuple, and put it there. 
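+ *
+ *		(Dispatch sketch, for illustration; the exact wiring depends on the
+ *		NBTS_* macros: bthandler() initially installs btinsert_default as
+ *		amroutine->aminsert, and once an AM entry point has run
+ *		nbt_opt_specialize()/_bt_specialize() for the relation, the variant
+ *		specialized for this index's key shape is installed via
+ *		rel->rd_indam->aminsert, so later inserts skip the generic path.)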
+ */ +bool +btinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + IndexInfo *indexInfo) +{ + bool result; + IndexTuple itup; + + /* generate an index tuple */ + itup = index_form_tuple(RelationGetDescr(rel), values, isnull); + itup->t_tid = *ht_ctid; + + result = _bt_doinsert(rel, itup, checkUnique, indexUnchanged, heapRel); + + pfree(itup); + + return result; +} diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index a6998e48d8..d31bb8abdf 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -26,12 +26,8 @@ static void _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp); -static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf, - AttrNumber *highkeycmpcol); static int _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum); -static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, - OffsetNumber offnum); static void _bt_saveitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum, IndexTuple itup); static int _bt_setuppostingitems(BTScanOpaque so, int itemIndex, @@ -48,6 +44,8 @@ static Buffer _bt_walk_left(Relation rel, Buffer buf, Snapshot snapshot); static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); static inline void _bt_initialize_more_data(BTScanOpaque so, ScanDirection dir); +#define NBT_SPECIALIZE_FILE "../../backend/access/nbtree/nbtsearch_spec.c" +#include "access/nbtree_spec.h" /* * _bt_drop_lock_and_maybe_pin() @@ -72,601 +70,6 @@ _bt_drop_lock_and_maybe_pin(IndexScanDesc scan, BTScanPos sp) } } -/* - * _bt_search() -- Search the tree for a particular scankey, - * or more precisely for the first leaf page it could be on. - * - * The passed scankey is an insertion-type scankey (see nbtree/README), - * but it can omit the rightmost column(s) of the index. - * - * Return value is a stack of parent-page pointers (i.e. there is no entry for - * the leaf level/page). *bufP is set to the address of the leaf-page buffer, - * which is locked and pinned. No locks are held on the parent pages, - * however! - * - * If the snapshot parameter is not NULL, "old snapshot" checking will take - * place during the descent through the tree. This is not needed when - * positioning for an insert or delete, so NULL is used for those cases. - * - * The returned buffer is locked according to access parameter. Additionally, - * access = BT_WRITE will allow an empty root page to be created and returned. - * When access = BT_READ, an empty index will result in *bufP being set to - * InvalidBuffer. Also, in BT_WRITE mode, any incomplete splits encountered - * during the search will be finished. - * - * heaprel must be provided by callers that pass access = BT_WRITE, since we - * might need to allocate a new root page for caller -- see _bt_allocbuf. - */ -BTStack -_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, - int access, Snapshot snapshot) -{ - BTStack stack_in = NULL; - int page_access = BT_READ; - char tupdatabuf[BLCKSZ / 3]; - AttrNumber highkeycmpcol = 1; - - /* heaprel must be set whenever _bt_allocbuf is reachable */ - Assert(access == BT_READ || access == BT_WRITE); - Assert(access == BT_READ || heaprel != NULL); - - /* Get the root page to start with */ - *bufP = _bt_getroot(rel, heaprel, access); - - /* If index is empty and access = BT_READ, no root page is created. 
*/ - if (!BufferIsValid(*bufP)) - return (BTStack) NULL; - - /* Loop iterates once per level descended in the tree */ - for (;;) - { - Page page; - BTPageOpaque opaque; - OffsetNumber offnum; - ItemId itemid; - IndexTuple itup; - BlockNumber child; - BTStack new_stack; - - /* - * Race -- the page we just grabbed may have split since we read its - * downlink in its parent page (or the metapage). If it has, we may - * need to move right to its new sibling. Do that. - * - * In write-mode, allow _bt_moveright to finish any incomplete splits - * along the way. Strictly speaking, we'd only need to finish an - * incomplete split on the leaf page we're about to insert to, not on - * any of the upper levels (internal pages with incomplete splits are - * also taken care of in _bt_getstackbuf). But this is a good - * opportunity to finish splits of internal pages too. - */ - *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE), - stack_in, page_access, snapshot, &highkeycmpcol, - (char *) tupdatabuf); - - /* if this is a leaf page, we're done */ - page = BufferGetPage(*bufP); - opaque = BTPageGetOpaque(page); - if (P_ISLEAF(opaque)) - break; - - /* - * Find the appropriate pivot tuple on this page. Its downlink points - * to the child page that we're about to descend to. - */ - offnum = _bt_binsrch(rel, key, *bufP, &highkeycmpcol); - itemid = PageGetItemId(page, offnum); - itup = (IndexTuple) PageGetItem(page, itemid); - Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); - child = BTreeTupleGetDownLink(itup); - - Assert(IndexTupleSize(itup) < sizeof(tupdatabuf)); - memcpy((char *) tupdatabuf, (char *) itup, IndexTupleSize(itup)); - - /* - * We need to save the location of the pivot tuple we chose in a new - * stack entry for this page/level. If caller ends up splitting a - * page one level down, it usually ends up inserting a new pivot - * tuple/downlink immediately after the location recorded here. - */ - new_stack = (BTStack) palloc(sizeof(BTStackData)); - new_stack->bts_blkno = BufferGetBlockNumber(*bufP); - new_stack->bts_offset = offnum; - new_stack->bts_parent = stack_in; - - /* - * Page level 1 is lowest non-leaf page level prior to leaves. So, if - * we're on the level 1 and asked to lock leaf page in write mode, - * then lock next page in write mode, because it must be a leaf. - */ - if (opaque->btpo_level == 1 && access == BT_WRITE) - page_access = BT_WRITE; - - /* drop the read lock on the page, then acquire one on its child */ - *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access); - - /* okay, all set to move down a level */ - stack_in = new_stack; - } - - /* - * If we're asked to lock leaf in write mode, but didn't manage to, then - * relock. This should only happen when the root page is a leaf page (and - * the only page in the index other than the metapage). - */ - if (access == BT_WRITE && page_access == BT_READ) - { - highkeycmpcol = 1; - - /* trade in our read lock for a write lock */ - _bt_unlockbuf(rel, *bufP); - _bt_lockbuf(rel, *bufP, BT_WRITE); - - /* - * Race -- the leaf page may have split after we dropped the read lock - * but before we acquired a write lock. If it has, we may need to - * move right to its new sibling. Do that. - */ - *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE, - snapshot, &highkeycmpcol, (char *) tupdatabuf); - } - - return stack_in; -} - -/* - * _bt_moveright() -- move right in the btree if necessary. 
- * - * When we follow a pointer to reach a page, it is possible that - * the page has changed in the meanwhile. If this happens, we're - * guaranteed that the page has "split right" -- that is, that any - * data that appeared on the page originally is either on the page - * or strictly to the right of it. - * - * This routine decides whether or not we need to move right in the - * tree by examining the high key entry on the page. If that entry is - * strictly less than the scankey, or <= the scankey in the - * key.nextkey=true case, then we followed the wrong link and we need - * to move right. - * - * The passed insertion-type scankey can omit the rightmost column(s) of the - * index. (see nbtree/README) - * - * When key.nextkey is false (the usual case), we are looking for the first - * item >= key. When key.nextkey is true, we are looking for the first item - * strictly greater than key. - * - * If forupdate is true, we will attempt to finish any incomplete splits - * that we encounter. This is required when locking a target page for an - * insertion, because we don't allow inserting on a page before the split is - * completed. 'heaprel' and 'stack' are only used if forupdate is true. - * - * On entry, we have the buffer pinned and a lock of the type specified by - * 'access'. If we move right, we release the buffer and lock and acquire - * the same on the right sibling. Return value is the buffer we stop at. - * - * If the snapshot parameter is not NULL, "old snapshot" checking will take - * place during the descent through the tree. This is not needed when - * positioning for an insert or delete, so NULL is used for those cases. - */ -Buffer -_bt_moveright(Relation rel, - Relation heaprel, - BTScanInsert key, - Buffer buf, - bool forupdate, - BTStack stack, - int access, - Snapshot snapshot, - AttrNumber *comparecol, - char *tupdatabuf) -{ - Page page; - BTPageOpaque opaque; - int32 cmpval; - - Assert(!forupdate || heaprel != NULL); - Assert(PointerIsValid(comparecol) && PointerIsValid(tupdatabuf)); - - /* - * When nextkey = false (normal case): if the scan key that brought us to - * this page is > the high key stored on the page, then the page has split - * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could - * have some duplicates to the right as well as the left, but that's - * something that's only ever dealt with on the leaf level, after - * _bt_search has found an initial leaf page.) - * - * When nextkey = true: move right if the scan key is >= page's high key. - * (Note that key.scantid cannot be set in this case.) - * - * The page could even have split more than once, so scan as far as - * needed. - * - * We also have to move right if we followed a link that brought us to a - * dead page. - */ - cmpval = key->nextkey ? 0 : 1; - - for (;;) - { - AttrNumber cmpcol = 1; - - page = BufferGetPage(buf); - TestForOldSnapshot(snapshot, rel, page); - opaque = BTPageGetOpaque(page); - - if (P_RIGHTMOST(opaque)) - { - *comparecol = 1; - break; - } - - /* - * Finish any incomplete splits we encounter along the way. 
- */ - if (forupdate && P_INCOMPLETE_SPLIT(opaque)) - { - BlockNumber blkno = BufferGetBlockNumber(buf); - - /* upgrade our lock if necessary */ - if (access == BT_READ) - { - _bt_unlockbuf(rel, buf); - _bt_lockbuf(rel, buf, BT_WRITE); - } - - if (P_INCOMPLETE_SPLIT(opaque)) - _bt_finish_split(rel, heaprel, buf, stack); - else - _bt_relbuf(rel, buf); - - /* re-acquire the lock in the right mode, and re-check */ - buf = _bt_getbuf(rel, blkno, access); - continue; - } - - /* - * tupdatabuf is filled with the right seperator of the parent node. - * This allows us to do a binary equality check between the parent - * node's right seperator (which is < key) and this page's P_HIKEY. - * If they equal, we can reuse the result of the parent node's - * rightkey compare, which means we can potentially save a full key - * compare (which includes indirect calls to attribute comparison - * functions). - * - * Without this, we'd on average use 3 full key compares per page before - * we achieve full dynamic prefix bounds, but with this optimization - * that is only 2. - * - * 3 compares: 1 for the highkey (rightmost), and on average 2 before - * we move right in the binary search on the page, this average equals - * SUM (1/2 ^ x) for x from 0 to log(n items)), which tends to 2. - */ - if (!P_IGNORE(opaque) && *comparecol > 1) - { - IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY)); - IndexTuple buftuple = (IndexTuple) tupdatabuf; - if (IndexTupleSize(itup) == IndexTupleSize(buftuple)) - { - char *dataptr = (char *) itup; - - if (memcmp(dataptr + sizeof(IndexTupleData), - tupdatabuf + sizeof(IndexTupleData), - IndexTupleSize(itup) - sizeof(IndexTupleData)) == 0) - break; - } else { - *comparecol = 1; - } - } else { - *comparecol = 1; - } - - if (P_IGNORE(opaque) || - _bt_compare(rel, key, page, P_HIKEY, &cmpcol) >= cmpval) - { - *comparecol = 1; - /* step right one page */ - buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access); - continue; - } - else - { - *comparecol = cmpcol; - break; - } - } - - if (P_IGNORE(opaque)) - elog(ERROR, "fell off the end of index \"%s\"", - RelationGetRelationName(rel)); - - return buf; -} - -/* - * _bt_binsrch() -- Do a binary search for a key on a particular page. - * - * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first - * key >= given scankey, or > scankey if nextkey is true. (NOTE: in - * particular, this means it is possible to return a value 1 greater than the - * number of keys on the page, if the scankey is > all keys on the page.) - * - * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber - * of the last key < given scankey, or last key <= given scankey if nextkey - * is true. (Since _bt_compare treats the first data key of such a page as - * minus infinity, there will be at least one key < scankey, so the result - * always points at one of the keys on the page.) This key indicates the - * right place to descend to be sure we find all leaf keys >= given scankey - * (or leaf keys > given scankey when nextkey is true). - * - * When called, the "highkeycmpcol" pointer argument is expected to contain the - * AttrNumber of the first attribute that is not shared between scan key and - * this page's high key, i.e. the first attribute that we have to compare - * against the scan key. The value will be updated by _bt_binsrch to contain - * this same first column we'll need to compare against the scan key, but now - * for the index tuple at the returned offset. 
Valid values range from 1 - * (no shared prefix) to the number of key attributes + 1 (all index key - * attributes are equal to the scan key). See also _bt_compare, and - * backend/access/nbtree/README for more info. - * - * This procedure is not responsible for walking right, it just examines - * the given page. _bt_binsrch() has no lock or refcount side effects - * on the buffer. - */ -static OffsetNumber -_bt_binsrch(Relation rel, - BTScanInsert key, - Buffer buf, - AttrNumber *highkeycmpcol) -{ - Page page; - BTPageOpaque opaque; - OffsetNumber low, - high; - int32 result, - cmpval; - /* - * Prefix bounds, for the high/low offset's compare columns. - * "highkeycmpcol" is the value for this page's high key (if any) or 1 - * (no established shared prefix) - */ - AttrNumber highcmpcol = *highkeycmpcol, - lowcmpcol = 1; - - page = BufferGetPage(buf); - opaque = BTPageGetOpaque(page); - - /* Requesting nextkey semantics while using scantid seems nonsensical */ - Assert(!key->nextkey || key->scantid == NULL); - /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */ - Assert(!P_ISLEAF(opaque) || key->scantid == NULL); - - low = P_FIRSTDATAKEY(opaque); - high = PageGetMaxOffsetNumber(page); - - /* - * If there are no keys on the page, return the first available slot. Note - * this covers two cases: the page is really empty (no keys), or it - * contains only a high key. The latter case is possible after vacuuming. - * This can never happen on an internal page, however, since they are - * never empty (an internal page must have children). - */ - if (unlikely(high < low)) - return low; - - /* - * Binary search to find the first key on the page >= scan key, or first - * key > scankey when nextkey is true. - * - * For nextkey=false (cmpval=1), the loop invariant is: all slots before - * 'low' are < scan key, all slots at or after 'high' are >= scan key. - * - * For nextkey=true (cmpval=0), the loop invariant is: all slots before - * 'low' are <= scan key, all slots at or after 'high' are > scan key. - * - * We maintain highcmpcol and lowcmpcol to keep track of prefixes that - * tuples share with the scan key, potentially allowing us to skip a - * prefix in the midpoint comparison. - * - * We can fall out when high == low. - */ - high++; /* establish the loop invariant for high */ - - cmpval = key->nextkey ? 0 : 1; /* select comparison value */ - - while (high > low) - { - OffsetNumber mid = low + ((high - low) / 2); - AttrNumber cmpcol = Min(highcmpcol, lowcmpcol); /* update prefix bounds */ - - /* We have low <= mid < high, so mid points at a real slot */ - - result = _bt_compare(rel, key, page, mid, &cmpcol); - - if (result >= cmpval) - { - low = mid + 1; - lowcmpcol = cmpcol; - } - else - { - high = mid; - highcmpcol = cmpcol; - } - } - - /* update the bounds at the caller */ - *highkeycmpcol = highcmpcol; - - /* - * At this point we have high == low, but be careful: they could point - * past the last slot on the page. - * - * On a leaf page, we always return the first key >= scan key (resp. > - * scan key), which could be the last slot + 1. - */ - if (P_ISLEAF(opaque)) - return low; - - /* - * On a non-leaf page, return the last key < scan key (resp. <= scan key). - * There must be one if _bt_compare() is playing by the rules. - */ - Assert(low > P_FIRSTDATAKEY(opaque)); - - return OffsetNumberPrev(low); -} - -/* - * - * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search. - * - * Like _bt_binsrch(), but with support for caching the binary search - * bounds. 
Only used during insertion, and only on the leaf page that it - * looks like caller will insert tuple on. Exclusive-locked and pinned - * leaf page is contained within insertstate. - * - * Caches the bounds fields in insertstate so that a subsequent call can - * reuse the low and strict high bounds of original binary search. Callers - * that use these fields directly must be prepared for the case where low - * and/or stricthigh are not on the same page (one or both exceed maxoff - * for the page). The case where there are no items on the page (high < - * low) makes bounds invalid. - * - * Caller is responsible for invalidating bounds when it modifies the page - * before calling here a second time, and for dealing with posting list - * tuple matches (callers can use insertstate's postingoff field to - * determine which existing heap TID will need to be replaced by a posting - * list split). - */ -OffsetNumber -_bt_binsrch_insert(Relation rel, BTInsertState insertstate, - AttrNumber highcmpcol) -{ - BTScanInsert key = insertstate->itup_key; - Page page; - BTPageOpaque opaque; - OffsetNumber low, - high, - stricthigh; - int32 result, - cmpval; - AttrNumber lowcmpcol = 1; - - page = BufferGetPage(insertstate->buf); - opaque = BTPageGetOpaque(page); - - Assert(P_ISLEAF(opaque)); - Assert(!key->nextkey); - Assert(insertstate->postingoff == 0); - - if (!insertstate->bounds_valid) - { - /* Start new binary search */ - low = P_FIRSTDATAKEY(opaque); - high = PageGetMaxOffsetNumber(page); - } - else - { - /* Restore result of previous binary search against same page */ - low = insertstate->low; - high = insertstate->stricthigh; - } - - /* If there are no keys on the page, return the first available slot */ - if (unlikely(high < low)) - { - /* Caller can't reuse bounds */ - insertstate->low = InvalidOffsetNumber; - insertstate->stricthigh = InvalidOffsetNumber; - insertstate->bounds_valid = false; - return low; - } - - /* - * Binary search to find the first key on the page >= scan key. (nextkey - * is always false when inserting). - * - * The loop invariant is: all slots before 'low' are < scan key, all slots - * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is - * maintained to save additional search effort for caller. - * - * We can fall out when high == low. - */ - if (!insertstate->bounds_valid) - high++; /* establish the loop invariant for high */ - stricthigh = high; /* high initially strictly higher */ - - cmpval = 1; /* !nextkey comparison value */ - - while (high > low) - { - OffsetNumber mid = low + ((high - low) / 2); - AttrNumber cmpcol = Min(highcmpcol, lowcmpcol); - - /* We have low <= mid < high, so mid points at a real slot */ - - result = _bt_compare(rel, key, page, mid, &cmpcol); - - if (result >= cmpval) - { - low = mid + 1; - lowcmpcol = cmpcol; - } - else - { - high = mid; - highcmpcol = cmpcol; - - if (result != 0) - stricthigh = high; - } - - /* - * If tuple at offset located by binary search is a posting list whose - * TID range overlaps with caller's scantid, perform posting list - * binary search to set postingoff for caller. Caller must split the - * posting list when postingoff is set. This should happen - * infrequently. - */ - if (unlikely(result == 0 && key->scantid != NULL)) - { - /* - * postingoff should never be set more than once per leaf page - * binary search. That would mean that there are duplicate table - * TIDs in the index, which is never okay. Check for that here. 
- */ - if (insertstate->postingoff != 0) - ereport(ERROR, - (errcode(ERRCODE_INDEX_CORRUPTED), - errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"", - ItemPointerGetBlockNumber(key->scantid), - ItemPointerGetOffsetNumber(key->scantid), - low, stricthigh, - BufferGetBlockNumber(insertstate->buf), - RelationGetRelationName(rel)))); - - insertstate->postingoff = _bt_binsrch_posting(key, page, mid); - } - } - - /* - * On a leaf page, a binary search always returns the first key >= scan - * key (at least in !nextkey case), which could be the last slot + 1. This - * is also the lower bound of cached search. - * - * stricthigh may also be the last slot + 1, which prevents caller from - * using bounds directly, but is still useful to us if we're called a - * second time with cached bounds (cached low will be < stricthigh when - * that happens). - */ - insertstate->low = low; - insertstate->stricthigh = stricthigh; - insertstate->bounds_valid = true; - - return low; -} - /*---------- * _bt_binsrch_posting() -- posting list binary search. * @@ -734,235 +137,6 @@ _bt_binsrch_posting(BTScanInsert key, Page page, OffsetNumber offnum) return low; } -/*---------- - * _bt_compare() -- Compare insertion-type scankey to tuple on a page. - * - * page/offnum: location of btree item to be compared to. - * - * This routine returns: - * <0 if scankey < tuple at offnum; - * 0 if scankey == tuple at offnum; - * >0 if scankey > tuple at offnum. - * - * NULLs in the keys are treated as sortable values. Therefore - * "equality" does not necessarily mean that the item should be returned - * to the caller as a matching key. Similarly, an insertion scankey - * with its scantid set is treated as equal to a posting tuple whose TID - * range overlaps with their scantid. There generally won't be a - * matching TID in the posting tuple, which caller must handle - * themselves (e.g., by splitting the posting list tuple). - * - * NOTE: The "comparecol" argument must refer to the first attribute of the - * index tuple of which the caller knows that it does not match the scan key: - * this means 1 for "no known matching attributes", up to the number of key - * attributes + 1 if the caller knows that all key attributes of the index - * tuple match those of the scan key. See backend/access/nbtree/README for - * details. - * - * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be - * "minus infinity": this routine will always claim it is less than the - * scankey. The actual key value stored is explicitly truncated to 0 - * attributes (explicitly minus infinity) with version 3+ indexes, but - * that isn't relied upon. This allows us to implement the Lehman and - * Yao convention that the first down-link pointer is before the first - * key. See backend/access/nbtree/README for details. - *---------- - */ -int32 -_bt_compare(Relation rel, - BTScanInsert key, - Page page, - OffsetNumber offnum, - AttrNumber *comparecol) -{ - TupleDesc itupdesc = RelationGetDescr(rel); - BTPageOpaque opaque = BTPageGetOpaque(page); - IndexTuple itup; - ItemPointer heapTid; - ScanKey scankey; - int ncmpkey; - int ntupatts; - int32 result; - - Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); - Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); - Assert(key->heapkeyspace || key->scantid == NULL); - - /* - * Force result ">" if target item is first data item on an internal page - * --- see NOTE above. 
- */ - if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) - return 1; - - itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); - ntupatts = BTreeTupleGetNAtts(itup, rel); - - /* - * The scan key is set up with the attribute number associated with each - * term in the key. It is important that, if the index is multi-key, the - * scan contain the first k key attributes, and that they be in order. If - * you think about how multi-key ordering works, you'll understand why - * this is. - * - * We don't test for violation of this condition here, however. The - * initial setup for the index scan had better have gotten it right (see - * _bt_first). - */ - - ncmpkey = Min(ntupatts, key->keysz); - Assert(key->heapkeyspace || ncmpkey == key->keysz); - Assert(!BTreeTupleIsPosting(itup) || key->allequalimage); - - scankey = key->scankeys + ((*comparecol) - 1); - for (int i = *comparecol; i <= ncmpkey; i++) - { - Datum datum; - bool isNull; - - datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); - - if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ - { - if (isNull) - result = 0; /* NULL "=" NULL */ - else if (scankey->sk_flags & SK_BT_NULLS_FIRST) - result = -1; /* NULL "<" NOT_NULL */ - else - result = 1; /* NULL ">" NOT_NULL */ - } - else if (isNull) /* key is NOT_NULL and item is NULL */ - { - if (scankey->sk_flags & SK_BT_NULLS_FIRST) - result = 1; /* NOT_NULL ">" NULL */ - else - result = -1; /* NOT_NULL "<" NULL */ - } - else - { - /* - * The sk_func needs to be passed the index value as left arg and - * the sk_argument as right arg (they might be of different - * types). Since it is convenient for callers to think of - * _bt_compare as comparing the scankey to the index item, we have - * to flip the sign of the comparison result. (Unless it's a DESC - * column, in which case we *don't* flip the sign.) - */ - result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, - scankey->sk_collation, - datum, - scankey->sk_argument)); - - if (!(scankey->sk_flags & SK_BT_DESC)) - INVERT_COMPARE_RESULT(result); - } - - /* if the keys are unequal, return the difference */ - if (result != 0) - { - *comparecol = i; - return result; - } - - scankey++; - } - - /* - * All tuple attributes are equal to the scan key, only later attributes - * could potentially not equal the scan key. - */ - *comparecol = ntupatts + 1; - - /* - * All non-truncated attributes (other than heap TID) were found to be - * equal. Treat truncated attributes as minus infinity when scankey has a - * key attribute value that would otherwise be compared directly. - * - * Note: it doesn't matter if ntupatts includes non-key attributes; - * scankey won't, so explicitly excluding non-key attributes isn't - * necessary. - */ - if (key->keysz > ntupatts) - return 1; - - /* - * Use the heap TID attribute and scantid to try to break the tie. The - * rules are the same as any other key attribute -- only the - * representation differs. - */ - heapTid = BTreeTupleGetHeapTID(itup); - if (key->scantid == NULL) - { - /* - * Most searches have a scankey that is considered greater than a - * truncated pivot tuple if and when the scankey has equal values for - * attributes up to and including the least significant untruncated - * attribute in tuple. - * - * For example, if an index has the minimum two attributes (single - * user key attribute, plus heap TID attribute), and a page's high key - * is ('foo', -inf), and scankey is ('foo', ), the search - * will not descend to the page to the left. 
The search will descend - * right instead. The truncated attribute in pivot tuple means that - * all non-pivot tuples on the page to the left are strictly < 'foo', - * so it isn't necessary to descend left. In other words, search - * doesn't have to descend left because it isn't interested in a match - * that has a heap TID value of -inf. - * - * However, some searches (pivotsearch searches) actually require that - * we descend left when this happens. -inf is treated as a possible - * match for omitted scankey attribute(s). This is needed by page - * deletion, which must re-find leaf pages that are targets for - * deletion using their high keys. - * - * Note: the heap TID part of the test ensures that scankey is being - * compared to a pivot tuple with one or more truncated key - * attributes. - * - * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the - * left here, since they have no heap TID attribute (and cannot have - * any -inf key values in any case, since truncation can only remove - * non-key attributes). !heapkeyspace searches must always be - * prepared to deal with matches on both sides of the pivot once the - * leaf level is reached. - */ - if (key->heapkeyspace && !key->pivotsearch && - key->keysz == ntupatts && heapTid == NULL) - return 1; - - /* All provided scankey arguments found to be equal */ - return 0; - } - - /* - * Treat truncated heap TID as minus infinity, since scankey has a key - * attribute value (scantid) that would otherwise be compared directly - */ - Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel)); - if (heapTid == NULL) - return 1; - - /* - * Scankey must be treated as equal to a posting list tuple if its scantid - * value falls within the range of the posting list. In all other cases - * there can only be a single heap TID value, which is compared directly - * with scantid. - */ - Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); - result = ItemPointerCompare(key->scantid, heapTid); - if (result <= 0 || !BTreeTupleIsPosting(itup)) - return result; - else - { - result = ItemPointerCompare(key->scantid, - BTreeTupleGetMaxHeapTID(itup)); - if (result > 0) - return 1; - } - - return 0; -} - /* * _bt_first() -- Find the first item in a scan. * @@ -1004,6 +178,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) BTScanPosItem *currItem; BlockNumber blkno; AttrNumber cmpcol = 1; + nbts_prep_ctx(rel); Assert(!BTScanPosIsValid(so->currPos)); @@ -1638,280 +813,6 @@ _bt_next(IndexScanDesc scan, ScanDirection dir) return true; } -/* - * _bt_readpage() -- Load data from current index page into so->currPos - * - * Caller must have pinned and read-locked so->currPos.buf; the buffer's state - * is not changed here. Also, currPos.moreLeft and moreRight must be valid; - * they are updated as appropriate. All other fields of so->currPos are - * initialized from scratch here. - * - * We scan the current page starting at offnum and moving in the indicated - * direction. All items matching the scan keys are loaded into currPos.items. - * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports - * that there can be no more matching tuples in the current scan direction. - * - * In the case of a parallel scan, caller must have called _bt_parallel_seize - * prior to calling this function; this function will invoke - * _bt_parallel_release before returning. - * - * Returns true if any matching items found on the page, false if none. 
- */ -static bool -_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) -{ - BTScanOpaque so = (BTScanOpaque) scan->opaque; - Page page; - BTPageOpaque opaque; - OffsetNumber minoff; - OffsetNumber maxoff; - int itemIndex; - bool continuescan; - int indnatts; - - /* - * We must have the buffer pinned and locked, but the usual macro can't be - * used here; this function is what makes it good for currPos. - */ - Assert(BufferIsValid(so->currPos.buf)); - - page = BufferGetPage(so->currPos.buf); - opaque = BTPageGetOpaque(page); - - /* allow next page be processed by parallel worker */ - if (scan->parallel_scan) - { - if (ScanDirectionIsForward(dir)) - _bt_parallel_release(scan, opaque->btpo_next); - else - _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); - } - - continuescan = true; /* default assumption */ - indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); - minoff = P_FIRSTDATAKEY(opaque); - maxoff = PageGetMaxOffsetNumber(page); - - /* - * We note the buffer's block number so that we can release the pin later. - * This allows us to re-read the buffer if it is needed again for hinting. - */ - so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); - - /* - * We save the LSN of the page as we read it, so that we know whether it - * safe to apply LP_DEAD hints to the page later. This allows us to drop - * the pin for MVCC scans, which allows vacuum to avoid blocking. - */ - so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); - - /* - * we must save the page's right-link while scanning it; this tells us - * where to step right to after we're done with these items. There is no - * corresponding need for the left-link, since splits always go right. - */ - so->currPos.nextPage = opaque->btpo_next; - - /* initialize tuple workspace to empty */ - so->currPos.nextTupleOffset = 0; - - /* - * Now that the current page has been made consistent, the macro should be - * good. - */ - Assert(BTScanPosIsPinned(so->currPos)); - - if (ScanDirectionIsForward(dir)) - { - /* load items[] in ascending order */ - itemIndex = 0; - - offnum = Max(offnum, minoff); - - while (offnum <= maxoff) - { - ItemId iid = PageGetItemId(page, offnum); - IndexTuple itup; - - /* - * If the scan specifies not to return killed tuples, then we - * treat a killed tuple as not passing the qual - */ - if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) - { - offnum = OffsetNumberNext(offnum); - continue; - } - - itup = (IndexTuple) PageGetItem(page, iid); - - if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) - { - /* tuple passes all scan key conditions */ - if (!BTreeTupleIsPosting(itup)) - { - /* Remember it */ - _bt_saveitem(so, itemIndex, offnum, itup); - itemIndex++; - } - else - { - int tupleOffset; - - /* - * Set up state to return posting list, and remember first - * TID - */ - tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, 0), - itup); - itemIndex++; - /* Remember additional TIDs */ - for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) - { - _bt_savepostingitem(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, i), - tupleOffset); - itemIndex++; - } - } - } - /* When !continuescan, there can't be any more matches, so stop */ - if (!continuescan) - break; - - offnum = OffsetNumberNext(offnum); - } - - /* - * We don't need to visit page to the right when the high key - * indicates that no more matches will be found there. 
- * - * Checking the high key like this works out more often than you might - * think. Leaf page splits pick a split point between the two most - * dissimilar tuples (this is weighed against the need to evenly share - * free space). Leaf pages with high key attribute values that can - * only appear on non-pivot tuples on the right sibling page are - * common. - */ - if (continuescan && !P_RIGHTMOST(opaque)) - { - ItemId iid = PageGetItemId(page, P_HIKEY); - IndexTuple itup = (IndexTuple) PageGetItem(page, iid); - int truncatt; - - truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); - _bt_checkkeys(scan, itup, truncatt, dir, &continuescan); - } - - if (!continuescan) - so->currPos.moreRight = false; - - Assert(itemIndex <= MaxTIDsPerBTreePage); - so->currPos.firstItem = 0; - so->currPos.lastItem = itemIndex - 1; - so->currPos.itemIndex = 0; - } - else - { - /* load items[] in descending order */ - itemIndex = MaxTIDsPerBTreePage; - - offnum = Min(offnum, maxoff); - - while (offnum >= minoff) - { - ItemId iid = PageGetItemId(page, offnum); - IndexTuple itup; - bool tuple_alive; - bool passes_quals; - - /* - * If the scan specifies not to return killed tuples, then we - * treat a killed tuple as not passing the qual. Most of the - * time, it's a win to not bother examining the tuple's index - * keys, but just skip to the next tuple (previous, actually, - * since we're scanning backwards). However, if this is the first - * tuple on the page, we do check the index keys, to prevent - * uselessly advancing to the page to the left. This is similar - * to the high key optimization used by forward scans. - */ - if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) - { - Assert(offnum >= P_FIRSTDATAKEY(opaque)); - if (offnum > P_FIRSTDATAKEY(opaque)) - { - offnum = OffsetNumberPrev(offnum); - continue; - } - - tuple_alive = false; - } - else - tuple_alive = true; - - itup = (IndexTuple) PageGetItem(page, iid); - - passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, - &continuescan); - if (passes_quals && tuple_alive) - { - /* tuple passes all scan key conditions */ - if (!BTreeTupleIsPosting(itup)) - { - /* Remember it */ - itemIndex--; - _bt_saveitem(so, itemIndex, offnum, itup); - } - else - { - int tupleOffset; - - /* - * Set up state to return posting list, and remember first - * TID. - * - * Note that we deliberately save/return items from - * posting lists in ascending heap TID order for backwards - * scans. This allows _bt_killitems() to make a - * consistent assumption about the order of items - * associated with the same posting list tuple. 
- */ - itemIndex--; - tupleOffset = - _bt_setuppostingitems(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, 0), - itup); - /* Remember additional TIDs */ - for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) - { - itemIndex--; - _bt_savepostingitem(so, itemIndex, offnum, - BTreeTupleGetPostingN(itup, i), - tupleOffset); - } - } - } - if (!continuescan) - { - /* there can't be any more matches, so stop */ - so->currPos.moreLeft = false; - break; - } - - offnum = OffsetNumberPrev(offnum); - } - - Assert(itemIndex >= 0); - so->currPos.firstItem = itemIndex; - so->currPos.lastItem = MaxTIDsPerBTreePage - 1; - so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; - } - - return (so->currPos.firstItem <= so->currPos.lastItem); -} - /* Save an index item into so->currPos.items[itemIndex] */ static void _bt_saveitem(BTScanOpaque so, int itemIndex, @@ -2120,12 +1021,11 @@ static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir) { BTScanOpaque so = (BTScanOpaque) scan->opaque; - Relation rel; + Relation rel = scan->indexRelation; Page page; BTPageOpaque opaque; bool status; - - rel = scan->indexRelation; + nbts_prep_ctx(rel); if (ScanDirectionIsForward(dir)) { @@ -2537,6 +1437,7 @@ _bt_endpoint(IndexScanDesc scan, ScanDirection dir) BTPageOpaque opaque; OffsetNumber start; BTScanPosItem *currItem; + nbts_prep_ctx(rel); /* * Scan down to the leftmost or rightmost leaf page. This is a simplified diff --git a/src/backend/access/nbtree/nbtsearch_spec.c b/src/backend/access/nbtree/nbtsearch_spec.c new file mode 100644 index 0000000000..5f1ead2400 --- /dev/null +++ b/src/backend/access/nbtree/nbtsearch_spec.c @@ -0,0 +1,1123 @@ +/*------------------------------------------------------------------------- + * + * nbtsearch_spec.c + * Index shape-specialized functions for nbtsearch.c + * + * NOTES + * See also: access/nbtree/README section "nbtree specialization" + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsearch_spec.c + * + *------------------------------------------------------------------------- + */ + +#define _bt_binsrch NBTS_FUNCTION(_bt_binsrch) +#define _bt_readpage NBTS_FUNCTION(_bt_readpage) + +static OffsetNumber _bt_binsrch(Relation rel, BTScanInsert key, Buffer buf, + AttrNumber *highkeycmpcol); +static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir, + OffsetNumber offnum); + +/* + * _bt_search() -- Search the tree for a particular scankey, + * or more precisely for the first leaf page it could be on. + * + * The passed scankey is an insertion-type scankey (see nbtree/README), + * but it can omit the rightmost column(s) of the index. + * + * Return value is a stack of parent-page pointers (i.e. there is no entry for + * the leaf level/page). *bufP is set to the address of the leaf-page buffer, + * which is locked and pinned. No locks are held on the parent pages, + * however! + * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + * + * The returned buffer is locked according to access parameter. Additionally, + * access = BT_WRITE will allow an empty root page to be created and returned. + * When access = BT_READ, an empty index will result in *bufP being set to + * InvalidBuffer. 
Also, in BT_WRITE mode, any incomplete splits encountered + * during the search will be finished. + * + * heaprel must be provided by callers that pass access = BT_WRITE, since we + * might need to allocate a new root page for caller -- see _bt_allocbuf. + */ +BTStack +_bt_search(Relation rel, Relation heaprel, BTScanInsert key, Buffer *bufP, + int access, Snapshot snapshot) +{ + BTStack stack_in = NULL; + int page_access = BT_READ; + char tupdatabuf[BLCKSZ / 3]; + AttrNumber highkeycmpcol = 1; + + /* heaprel must be set whenever _bt_allocbuf is reachable */ + Assert(access == BT_READ || access == BT_WRITE); + Assert(access == BT_READ || heaprel != NULL); + + /* Get the root page to start with */ + *bufP = _bt_getroot(rel, heaprel, access); + + /* If index is empty and access = BT_READ, no root page is created. */ + if (!BufferIsValid(*bufP)) + return (BTStack) NULL; + + /* Loop iterates once per level descended in the tree */ + for (;;) + { + Page page; + BTPageOpaque opaque; + OffsetNumber offnum; + ItemId itemid; + IndexTuple itup; + BlockNumber child; + BTStack new_stack; + + /* + * Race -- the page we just grabbed may have split since we read its + * downlink in its parent page (or the metapage). If it has, we may + * need to move right to its new sibling. Do that. + * + * In write-mode, allow _bt_moveright to finish any incomplete splits + * along the way. Strictly speaking, we'd only need to finish an + * incomplete split on the leaf page we're about to insert to, not on + * any of the upper levels (internal pages with incomplete splits are + * also taken care of in _bt_getstackbuf). But this is a good + * opportunity to finish splits of internal pages too. + */ + *bufP = _bt_moveright(rel, heaprel, key, *bufP, (access == BT_WRITE), + stack_in, page_access, snapshot, &highkeycmpcol, + (char *) tupdatabuf); + + /* if this is a leaf page, we're done */ + page = BufferGetPage(*bufP); + opaque = BTPageGetOpaque(page); + if (P_ISLEAF(opaque)) + break; + + /* + * Find the appropriate pivot tuple on this page. Its downlink points + * to the child page that we're about to descend to. + */ + offnum = _bt_binsrch(rel, key, *bufP, &highkeycmpcol); + itemid = PageGetItemId(page, offnum); + itup = (IndexTuple) PageGetItem(page, itemid); + Assert(BTreeTupleIsPivot(itup) || !key->heapkeyspace); + child = BTreeTupleGetDownLink(itup); + + Assert(IndexTupleSize(itup) < sizeof(tupdatabuf)); + memcpy((char *) tupdatabuf, (char *) itup, IndexTupleSize(itup)); + + /* + * We need to save the location of the pivot tuple we chose in a new + * stack entry for this page/level. If caller ends up splitting a + * page one level down, it usually ends up inserting a new pivot + * tuple/downlink immediately after the location recorded here. + */ + new_stack = (BTStack) palloc(sizeof(BTStackData)); + new_stack->bts_blkno = BufferGetBlockNumber(*bufP); + new_stack->bts_offset = offnum; + new_stack->bts_parent = stack_in; + + /* + * Page level 1 is lowest non-leaf page level prior to leaves. So, if + * we're on the level 1 and asked to lock leaf page in write mode, + * then lock next page in write mode, because it must be a leaf. + */ + if (opaque->btpo_level == 1 && access == BT_WRITE) + page_access = BT_WRITE; + + /* drop the read lock on the page, then acquire one on its child */ + *bufP = _bt_relandgetbuf(rel, *bufP, child, page_access); + + /* okay, all set to move down a level */ + stack_in = new_stack; + } + + /* + * If we're asked to lock leaf in write mode, but didn't manage to, then + * relock. 
This should only happen when the root page is a leaf page (and + * the only page in the index other than the metapage). + */ + if (access == BT_WRITE && page_access == BT_READ) + { + highkeycmpcol = 1; + + /* trade in our read lock for a write lock */ + _bt_unlockbuf(rel, *bufP); + _bt_lockbuf(rel, *bufP, BT_WRITE); + + /* + * Race -- the leaf page may have split after we dropped the read lock + * but before we acquired a write lock. If it has, we may need to + * move right to its new sibling. Do that. + */ + *bufP = _bt_moveright(rel, heaprel, key, *bufP, true, stack_in, BT_WRITE, + snapshot, &highkeycmpcol, (char *) tupdatabuf); + } + + return stack_in; +} + +/* + * _bt_moveright() -- move right in the btree if necessary. + * + * When we follow a pointer to reach a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. + * + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry is + * strictly less than the scankey, or <= the scankey in the + * key.nextkey=true case, then we followed the wrong link and we need + * to move right. + * + * The passed insertion-type scankey can omit the rightmost column(s) of the + * index. (see nbtree/README) + * + * When key.nextkey is false (the usual case), we are looking for the first + * item >= key. When key.nextkey is true, we are looking for the first item + * strictly greater than key. + * + * If forupdate is true, we will attempt to finish any incomplete splits + * that we encounter. This is required when locking a target page for an + * insertion, because we don't allow inserting on a page before the split is + * completed. 'heaprel' and 'stack' are only used if forupdate is true. + * + * On entry, we have the buffer pinned and a lock of the type specified by + * 'access'. If we move right, we release the buffer and lock and acquire + * the same on the right sibling. Return value is the buffer we stop at. + * + * If the snapshot parameter is not NULL, "old snapshot" checking will take + * place during the descent through the tree. This is not needed when + * positioning for an insert or delete, so NULL is used for those cases. + */ +Buffer +_bt_moveright(Relation rel, + Relation heaprel, + BTScanInsert key, + Buffer buf, + bool forupdate, + BTStack stack, + int access, + Snapshot snapshot, + AttrNumber *comparecol, + char *tupdatabuf) +{ + Page page; + BTPageOpaque opaque; + int32 cmpval; + + Assert(!forupdate || heaprel != NULL); + Assert(PointerIsValid(comparecol) && PointerIsValid(tupdatabuf)); + + /* + * When nextkey = false (normal case): if the scan key that brought us to + * this page is > the high key stored on the page, then the page has split + * and we need to move right. (pg_upgrade'd !heapkeyspace indexes could + * have some duplicates to the right as well as the left, but that's + * something that's only ever dealt with on the leaf level, after + * _bt_search has found an initial leaf page.) + * + * When nextkey = true: move right if the scan key is >= page's high key. + * (Note that key.scantid cannot be set in this case.) + * + * The page could even have split more than once, so scan as far as + * needed. + * + * We also have to move right if we followed a link that brought us to a + * dead page. + */ + cmpval = key->nextkey ? 
0 : 1;
+
+    for (;;)
+    {
+        AttrNumber  cmpcol = 1;
+
+        page = BufferGetPage(buf);
+        TestForOldSnapshot(snapshot, rel, page);
+        opaque = BTPageGetOpaque(page);
+
+        if (P_RIGHTMOST(opaque))
+        {
+            *comparecol = 1;
+            break;
+        }
+
+        /*
+         * Finish any incomplete splits we encounter along the way.
+         */
+        if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+        {
+            BlockNumber blkno = BufferGetBlockNumber(buf);
+
+            /* upgrade our lock if necessary */
+            if (access == BT_READ)
+            {
+                _bt_unlockbuf(rel, buf);
+                _bt_lockbuf(rel, buf, BT_WRITE);
+            }
+
+            if (P_INCOMPLETE_SPLIT(opaque))
+                _bt_finish_split(rel, heaprel, buf, stack);
+            else
+                _bt_relbuf(rel, buf);
+
+            /* re-acquire the lock in the right mode, and re-check */
+            buf = _bt_getbuf(rel, blkno, access);
+            continue;
+        }
+
+        /*
+         * tupdatabuf is filled with the right separator of the parent node.
+         * This allows us to do a binary equality check between the parent
+         * node's right separator (which is < key) and this page's P_HIKEY.
+         * If they are equal, we can reuse the result of the parent node's
+         * rightkey compare, which means we can potentially save a full key
+         * compare (which includes indirect calls to attribute comparison
+         * functions).
+         *
+         * Without this, we'd on average use 3 full key compares per page
+         * before we achieve full dynamic prefix bounds, but with this
+         * optimization that is only 2.
+         *
+         * 3 compares: 1 for the highkey (rightmost), and on average 2 before
+         * we move right in the binary search on the page; this average equals
+         * SUM (1/2 ^ x) for x from 0 to log(n items), which tends to 2.
+         */
+        if (!P_IGNORE(opaque) && *comparecol > 1)
+        {
+            IndexTuple  itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY));
+            IndexTuple  buftuple = (IndexTuple) tupdatabuf;
+            if (IndexTupleSize(itup) == IndexTupleSize(buftuple))
+            {
+                char       *dataptr = (char *) itup;
+
+                if (memcmp(dataptr + sizeof(IndexTupleData),
+                           tupdatabuf + sizeof(IndexTupleData),
+                           IndexTupleSize(itup) - sizeof(IndexTupleData)) == 0)
+                    break;
+            }
+            else
+            {
+                *comparecol = 1;
+            }
+        }
+        else
+        {
+            *comparecol = 1;
+        }
+
+        if (P_IGNORE(opaque) ||
+            _bt_compare(rel, key, page, P_HIKEY, &cmpcol) >= cmpval)
+        {
+            *comparecol = 1;
+            /* step right one page */
+            buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+            continue;
+        }
+        else
+        {
+            *comparecol = cmpcol;
+            break;
+        }
+    }
+
+    if (P_IGNORE(opaque))
+        elog(ERROR, "fell off the end of index \"%s\"",
+             RelationGetRelationName(rel));
+
+    return buf;
+}
+
+/*
+ * _bt_binsrch() -- Do a binary search for a key on a particular page.
+ *
+ * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first
+ * key >= given scankey, or > scankey if nextkey is true. (NOTE: in
+ * particular, this means it is possible to return a value 1 greater than the
+ * number of keys on the page, if the scankey is > all keys on the page.)
+ *
+ * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber
+ * of the last key < given scankey, or last key <= given scankey if nextkey
+ * is true. (Since _bt_compare treats the first data key of such a page as
+ * minus infinity, there will be at least one key < scankey, so the result
+ * always points at one of the keys on the page.) This key indicates the
+ * right place to descend to be sure we find all leaf keys >= given scankey
+ * (or leaf keys > given scankey when nextkey is true).
+ *
+ * When called, the "highkeycmpcol" pointer argument is expected to contain the
+ * AttrNumber of the first attribute that is not shared between scan key and
+ * this page's high key, i.e.
the first attribute that we have to compare + * against the scan key. The value will be updated by _bt_binsrch to contain + * this same first column we'll need to compare against the scan key, but now + * for the index tuple at the returned offset. Valid values range from 1 + * (no shared prefix) to the number of key attributes + 1 (all index key + * attributes are equal to the scan key). See also _bt_compare, and + * backend/access/nbtree/README for more info. + * + * This procedure is not responsible for walking right, it just examines + * the given page. _bt_binsrch() has no lock or refcount side effects + * on the buffer. + */ +static OffsetNumber +_bt_binsrch(Relation rel, + BTScanInsert key, + Buffer buf, + AttrNumber *highkeycmpcol) +{ + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high; + int32 result, + cmpval; + /* + * Prefix bounds, for the high/low offset's compare columns. + * "highkeycmpcol" is the value for this page's high key (if any) or 1 + * (no established shared prefix) + */ + AttrNumber highcmpcol = *highkeycmpcol, + lowcmpcol = 1; + + page = BufferGetPage(buf); + opaque = BTPageGetOpaque(page); + + /* Requesting nextkey semantics while using scantid seems nonsensical */ + Assert(!key->nextkey || key->scantid == NULL); + /* scantid-set callers must use _bt_binsrch_insert() on leaf pages */ + Assert(!P_ISLEAF(opaque) || key->scantid == NULL); + + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + + /* + * If there are no keys on the page, return the first available slot. Note + * this covers two cases: the page is really empty (no keys), or it + * contains only a high key. The latter case is possible after vacuuming. + * This can never happen on an internal page, however, since they are + * never empty (an internal page must have children). + */ + if (unlikely(high < low)) + return low; + + /* + * Binary search to find the first key on the page >= scan key, or first + * key > scankey when nextkey is true. + * + * For nextkey=false (cmpval=1), the loop invariant is: all slots before + * 'low' are < scan key, all slots at or after 'high' are >= scan key. + * + * For nextkey=true (cmpval=0), the loop invariant is: all slots before + * 'low' are <= scan key, all slots at or after 'high' are > scan key. + * + * We maintain highcmpcol and lowcmpcol to keep track of prefixes that + * tuples share with the scan key, potentially allowing us to skip a + * prefix in the midpoint comparison. + * + * We can fall out when high == low. + */ + high++; /* establish the loop invariant for high */ + + cmpval = key->nextkey ? 0 : 1; /* select comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + AttrNumber cmpcol = Min(highcmpcol, lowcmpcol); /* update prefix bounds */ + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid, &cmpcol); + + if (result >= cmpval) + { + low = mid + 1; + lowcmpcol = cmpcol; + } + else + { + high = mid; + highcmpcol = cmpcol; + } + } + + /* update the bounds at the caller */ + *highkeycmpcol = highcmpcol; + + /* + * At this point we have high == low, but be careful: they could point + * past the last slot on the page. + * + * On a leaf page, we always return the first key >= scan key (resp. > + * scan key), which could be the last slot + 1. + */ + if (P_ISLEAF(opaque)) + return low; + + /* + * On a non-leaf page, return the last key < scan key (resp. <= scan key). + * There must be one if _bt_compare() is playing by the rules. 
+ */ + Assert(low > P_FIRSTDATAKEY(opaque)); + + return OffsetNumberPrev(low); +} + +/* + * + * _bt_binsrch_insert() -- Cacheable, incremental leaf page binary search. + * + * Like _bt_binsrch(), but with support for caching the binary search + * bounds. Only used during insertion, and only on the leaf page that it + * looks like caller will insert tuple on. Exclusive-locked and pinned + * leaf page is contained within insertstate. + * + * Caches the bounds fields in insertstate so that a subsequent call can + * reuse the low and strict high bounds of original binary search. Callers + * that use these fields directly must be prepared for the case where low + * and/or stricthigh are not on the same page (one or both exceed maxoff + * for the page). The case where there are no items on the page (high < + * low) makes bounds invalid. + * + * Caller is responsible for invalidating bounds when it modifies the page + * before calling here a second time, and for dealing with posting list + * tuple matches (callers can use insertstate's postingoff field to + * determine which existing heap TID will need to be replaced by a posting + * list split). + */ +OffsetNumber +_bt_binsrch_insert(Relation rel, BTInsertState insertstate, + AttrNumber highcmpcol) +{ + BTScanInsert key = insertstate->itup_key; + Page page; + BTPageOpaque opaque; + OffsetNumber low, + high, + stricthigh; + int32 result, + cmpval; + AttrNumber lowcmpcol = 1; + + page = BufferGetPage(insertstate->buf); + opaque = BTPageGetOpaque(page); + + Assert(P_ISLEAF(opaque)); + Assert(!key->nextkey); + Assert(insertstate->postingoff == 0); + + if (!insertstate->bounds_valid) + { + /* Start new binary search */ + low = P_FIRSTDATAKEY(opaque); + high = PageGetMaxOffsetNumber(page); + } + else + { + /* Restore result of previous binary search against same page */ + low = insertstate->low; + high = insertstate->stricthigh; + } + + /* If there are no keys on the page, return the first available slot */ + if (unlikely(high < low)) + { + /* Caller can't reuse bounds */ + insertstate->low = InvalidOffsetNumber; + insertstate->stricthigh = InvalidOffsetNumber; + insertstate->bounds_valid = false; + return low; + } + + /* + * Binary search to find the first key on the page >= scan key. (nextkey + * is always false when inserting). + * + * The loop invariant is: all slots before 'low' are < scan key, all slots + * at or after 'high' are >= scan key. 'stricthigh' is > scan key, and is + * maintained to save additional search effort for caller. + * + * We can fall out when high == low. + */ + if (!insertstate->bounds_valid) + high++; /* establish the loop invariant for high */ + stricthigh = high; /* high initially strictly higher */ + + cmpval = 1; /* !nextkey comparison value */ + + while (high > low) + { + OffsetNumber mid = low + ((high - low) / 2); + AttrNumber cmpcol = Min(highcmpcol, lowcmpcol); + + /* We have low <= mid < high, so mid points at a real slot */ + + result = _bt_compare(rel, key, page, mid, &cmpcol); + + if (result >= cmpval) + { + low = mid + 1; + lowcmpcol = cmpcol; + } + else + { + high = mid; + highcmpcol = cmpcol; + + if (result != 0) + stricthigh = high; + } + + /* + * If tuple at offset located by binary search is a posting list whose + * TID range overlaps with caller's scantid, perform posting list + * binary search to set postingoff for caller. Caller must split the + * posting list when postingoff is set. This should happen + * infrequently. 
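+         * (For example: if the tuple at 'mid' is a posting list whose heap
+         * TIDs span (42,1) through (42,9) and key->scantid is (42,5), the
+         * _bt_binsrch_posting() call below computes postingoff for the
+         * caller, which must then split that posting list around (42,5)
+         * before it can insert.)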
+ */ + if (unlikely(result == 0 && key->scantid != NULL)) + { + /* + * postingoff should never be set more than once per leaf page + * binary search. That would mean that there are duplicate table + * TIDs in the index, which is never okay. Check for that here. + */ + if (insertstate->postingoff != 0) + ereport(ERROR, + (errcode(ERRCODE_INDEX_CORRUPTED), + errmsg_internal("table tid from new index tuple (%u,%u) cannot find insert offset between offsets %u and %u of block %u in index \"%s\"", + ItemPointerGetBlockNumber(key->scantid), + ItemPointerGetOffsetNumber(key->scantid), + low, stricthigh, + BufferGetBlockNumber(insertstate->buf), + RelationGetRelationName(rel)))); + + insertstate->postingoff = _bt_binsrch_posting(key, page, mid); + } + } + + /* + * On a leaf page, a binary search always returns the first key >= scan + * key (at least in !nextkey case), which could be the last slot + 1. This + * is also the lower bound of cached search. + * + * stricthigh may also be the last slot + 1, which prevents caller from + * using bounds directly, but is still useful to us if we're called a + * second time with cached bounds (cached low will be < stricthigh when + * that happens). + */ + insertstate->low = low; + insertstate->stricthigh = stricthigh; + insertstate->bounds_valid = true; + + return low; +} + +/*---------- + * _bt_compare() -- Compare insertion-type scankey to tuple on a page. + * + * page/offnum: location of btree item to be compared to. + * + * This routine returns: + * <0 if scankey < tuple at offnum; + * 0 if scankey == tuple at offnum; + * >0 if scankey > tuple at offnum. + * + * NULLs in the keys are treated as sortable values. Therefore + * "equality" does not necessarily mean that the item should be returned + * to the caller as a matching key. Similarly, an insertion scankey + * with its scantid set is treated as equal to a posting tuple whose TID + * range overlaps with their scantid. There generally won't be a + * matching TID in the posting tuple, which caller must handle + * themselves (e.g., by splitting the posting list tuple). + * + * NOTE: The "comparecol" argument must refer to the first attribute of the + * index tuple of which the caller knows that it does not match the scan key: + * this means 1 for "no known matching attributes", up to the number of key + * attributes + 1 if the caller knows that all key attributes of the index + * tuple match those of the scan key. See backend/access/nbtree/README for + * details. + * + * CRUCIAL NOTE: on a non-leaf page, the first data key is assumed to be + * "minus infinity": this routine will always claim it is less than the + * scankey. The actual key value stored is explicitly truncated to 0 + * attributes (explicitly minus infinity) with version 3+ indexes, but + * that isn't relied upon. This allows us to implement the Lehman and + * Yao convention that the first down-link pointer is before the first + * key. See backend/access/nbtree/README for details. 
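+ *
+ * For example, a caller passing *comparecol = 2 has already established that
+ * attribute 1 of the tuple at offnum equals the scan key, so the comparison
+ * loop below starts at attribute 2.  On return, *comparecol holds the
+ * attribute number at which the first difference was found, or one more than
+ * the number of tuple attributes when every compared attribute was equal.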
+ *---------- + */ +int32 +_bt_compare(Relation rel, + BTScanInsert key, + Page page, + OffsetNumber offnum, + AttrNumber *comparecol) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + BTPageOpaque opaque = BTPageGetOpaque(page); + IndexTuple itup; + ItemPointer heapTid; + ScanKey scankey; + int ncmpkey; + int ntupatts; + int32 result; + + Assert(_bt_check_natts(rel, key->heapkeyspace, page, offnum)); + Assert(key->keysz <= IndexRelationGetNumberOfKeyAttributes(rel)); + Assert(key->heapkeyspace || key->scantid == NULL); + + /* + * Force result ">" if target item is first data item on an internal page + * --- see NOTE above. + */ + if (!P_ISLEAF(opaque) && offnum == P_FIRSTDATAKEY(opaque)) + return 1; + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum)); + ntupatts = BTreeTupleGetNAtts(itup, rel); + + /* + * The scan key is set up with the attribute number associated with each + * term in the key. It is important that, if the index is multi-key, the + * scan contain the first k key attributes, and that they be in order. If + * you think about how multi-key ordering works, you'll understand why + * this is. + * + * We don't test for violation of this condition here, however. The + * initial setup for the index scan had better have gotten it right (see + * _bt_first). + */ + + ncmpkey = Min(ntupatts, key->keysz); + Assert(key->heapkeyspace || ncmpkey == key->keysz); + Assert(!BTreeTupleIsPosting(itup) || key->allequalimage); + + scankey = key->scankeys + ((*comparecol) - 1); + for (int i = *comparecol; i <= ncmpkey; i++) + { + Datum datum; + bool isNull; + + datum = index_getattr(itup, scankey->sk_attno, itupdesc, &isNull); + + if (scankey->sk_flags & SK_ISNULL) /* key is NULL */ + { + if (isNull) + result = 0; /* NULL "=" NULL */ + else if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = -1; /* NULL "<" NOT_NULL */ + else + result = 1; /* NULL ">" NOT_NULL */ + } + else if (isNull) /* key is NOT_NULL and item is NULL */ + { + if (scankey->sk_flags & SK_BT_NULLS_FIRST) + result = 1; /* NOT_NULL ">" NULL */ + else + result = -1; /* NOT_NULL "<" NULL */ + } + else + { + /* + * The sk_func needs to be passed the index value as left arg and + * the sk_argument as right arg (they might be of different + * types). Since it is convenient for callers to think of + * _bt_compare as comparing the scankey to the index item, we have + * to flip the sign of the comparison result. (Unless it's a DESC + * column, in which case we *don't* flip the sign.) + */ + result = DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum, + scankey->sk_argument)); + + if (!(scankey->sk_flags & SK_BT_DESC)) + INVERT_COMPARE_RESULT(result); + } + + /* if the keys are unequal, return the difference */ + if (result != 0) + { + *comparecol = i; + return result; + } + + scankey++; + } + + /* + * All tuple attributes are equal to the scan key, only later attributes + * could potentially not equal the scan key. + */ + *comparecol = ntupatts + 1; + + /* + * All non-truncated attributes (other than heap TID) were found to be + * equal. Treat truncated attributes as minus infinity when scankey has a + * key attribute value that would otherwise be compared directly. + * + * Note: it doesn't matter if ntupatts includes non-key attributes; + * scankey won't, so explicitly excluding non-key attributes isn't + * necessary. + */ + if (key->keysz > ntupatts) + return 1; + + /* + * Use the heap TID attribute and scantid to try to break the tie. 
The + * rules are the same as any other key attribute -- only the + * representation differs. + */ + heapTid = BTreeTupleGetHeapTID(itup); + if (key->scantid == NULL) + { + /* + * Most searches have a scankey that is considered greater than a + * truncated pivot tuple if and when the scankey has equal values for + * attributes up to and including the least significant untruncated + * attribute in tuple. + * + * For example, if an index has the minimum two attributes (single + * user key attribute, plus heap TID attribute), and a page's high key + * is ('foo', -inf), and scankey is ('foo', ), the search + * will not descend to the page to the left. The search will descend + * right instead. The truncated attribute in pivot tuple means that + * all non-pivot tuples on the page to the left are strictly < 'foo', + * so it isn't necessary to descend left. In other words, search + * doesn't have to descend left because it isn't interested in a match + * that has a heap TID value of -inf. + * + * However, some searches (pivotsearch searches) actually require that + * we descend left when this happens. -inf is treated as a possible + * match for omitted scankey attribute(s). This is needed by page + * deletion, which must re-find leaf pages that are targets for + * deletion using their high keys. + * + * Note: the heap TID part of the test ensures that scankey is being + * compared to a pivot tuple with one or more truncated key + * attributes. + * + * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the + * left here, since they have no heap TID attribute (and cannot have + * any -inf key values in any case, since truncation can only remove + * non-key attributes). !heapkeyspace searches must always be + * prepared to deal with matches on both sides of the pivot once the + * leaf level is reached. + */ + if (key->heapkeyspace && !key->pivotsearch && + key->keysz == ntupatts && heapTid == NULL) + return 1; + + /* All provided scankey arguments found to be equal */ + return 0; + } + + /* + * Treat truncated heap TID as minus infinity, since scankey has a key + * attribute value (scantid) that would otherwise be compared directly + */ + Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel)); + if (heapTid == NULL) + return 1; + + /* + * Scankey must be treated as equal to a posting list tuple if its scantid + * value falls within the range of the posting list. In all other cases + * there can only be a single heap TID value, which is compared directly + * with scantid. + */ + Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel)); + result = ItemPointerCompare(key->scantid, heapTid); + if (result <= 0 || !BTreeTupleIsPosting(itup)) + return result; + else + { + result = ItemPointerCompare(key->scantid, + BTreeTupleGetMaxHeapTID(itup)); + if (result > 0) + return 1; + } + + return 0; +} + +/* + * _bt_readpage() -- Load data from current index page into so->currPos + * + * Caller must have pinned and read-locked so->currPos.buf; the buffer's state + * is not changed here. Also, currPos.moreLeft and moreRight must be valid; + * they are updated as appropriate. All other fields of so->currPos are + * initialized from scratch here. + * + * We scan the current page starting at offnum and moving in the indicated + * direction. All items matching the scan keys are loaded into currPos.items. + * moreLeft or moreRight (as appropriate) is cleared if _bt_checkkeys reports + * that there can be no more matching tuples in the current scan direction. 
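+ * (For example, in a forward scan where the page's high key already fails
+ * the scan keys, moreRight is cleared so that the scan does not bother
+ * stepping right to the sibling page.)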
+ * + * In the case of a parallel scan, caller must have called _bt_parallel_seize + * prior to calling this function; this function will invoke + * _bt_parallel_release before returning. + * + * Returns true if any matching items found on the page, false if none. + */ +static bool +_bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum) +{ + BTScanOpaque so = (BTScanOpaque) scan->opaque; + Page page; + BTPageOpaque opaque; + OffsetNumber minoff; + OffsetNumber maxoff; + int itemIndex; + bool continuescan; + int indnatts; + + /* + * We must have the buffer pinned and locked, but the usual macro can't be + * used here; this function is what makes it good for currPos. + */ + Assert(BufferIsValid(so->currPos.buf)); + + page = BufferGetPage(so->currPos.buf); + opaque = BTPageGetOpaque(page); + + /* allow next page be processed by parallel worker */ + if (scan->parallel_scan) + { + if (ScanDirectionIsForward(dir)) + _bt_parallel_release(scan, opaque->btpo_next); + else + _bt_parallel_release(scan, BufferGetBlockNumber(so->currPos.buf)); + } + + continuescan = true; /* default assumption */ + indnatts = IndexRelationGetNumberOfAttributes(scan->indexRelation); + minoff = P_FIRSTDATAKEY(opaque); + maxoff = PageGetMaxOffsetNumber(page); + + /* + * We note the buffer's block number so that we can release the pin later. + * This allows us to re-read the buffer if it is needed again for hinting. + */ + so->currPos.currPage = BufferGetBlockNumber(so->currPos.buf); + + /* + * We save the LSN of the page as we read it, so that we know whether it + * safe to apply LP_DEAD hints to the page later. This allows us to drop + * the pin for MVCC scans, which allows vacuum to avoid blocking. + */ + so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf); + + /* + * we must save the page's right-link while scanning it; this tells us + * where to step right to after we're done with these items. There is no + * corresponding need for the left-link, since splits always go right. + */ + so->currPos.nextPage = opaque->btpo_next; + + /* initialize tuple workspace to empty */ + so->currPos.nextTupleOffset = 0; + + /* + * Now that the current page has been made consistent, the macro should be + * good. 
+ */ + Assert(BTScanPosIsPinned(so->currPos)); + + if (ScanDirectionIsForward(dir)) + { + /* load items[] in ascending order */ + itemIndex = 0; + + offnum = Max(offnum, minoff); + + while (offnum <= maxoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual + */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + offnum = OffsetNumberNext(offnum); + continue; + } + + itup = (IndexTuple) PageGetItem(page, iid); + + if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan)) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + _bt_saveitem(so, itemIndex, offnum, itup); + itemIndex++; + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID + */ + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + itemIndex++; + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + itemIndex++; + } + } + } + /* When !continuescan, there can't be any more matches, so stop */ + if (!continuescan) + break; + + offnum = OffsetNumberNext(offnum); + } + + /* + * We don't need to visit page to the right when the high key + * indicates that no more matches will be found there. + * + * Checking the high key like this works out more often than you might + * think. Leaf page splits pick a split point between the two most + * dissimilar tuples (this is weighed against the need to evenly share + * free space). Leaf pages with high key attribute values that can + * only appear on non-pivot tuples on the right sibling page are + * common. + */ + if (continuescan && !P_RIGHTMOST(opaque)) + { + ItemId iid = PageGetItemId(page, P_HIKEY); + IndexTuple itup = (IndexTuple) PageGetItem(page, iid); + int truncatt; + + truncatt = BTreeTupleGetNAtts(itup, scan->indexRelation); + _bt_checkkeys(scan, itup, truncatt, dir, &continuescan); + } + + if (!continuescan) + so->currPos.moreRight = false; + + Assert(itemIndex <= MaxTIDsPerBTreePage); + so->currPos.firstItem = 0; + so->currPos.lastItem = itemIndex - 1; + so->currPos.itemIndex = 0; + } + else + { + /* load items[] in descending order */ + itemIndex = MaxTIDsPerBTreePage; + + offnum = Min(offnum, maxoff); + + while (offnum >= minoff) + { + ItemId iid = PageGetItemId(page, offnum); + IndexTuple itup; + bool tuple_alive; + bool passes_quals; + + /* + * If the scan specifies not to return killed tuples, then we + * treat a killed tuple as not passing the qual. Most of the + * time, it's a win to not bother examining the tuple's index + * keys, but just skip to the next tuple (previous, actually, + * since we're scanning backwards). However, if this is the first + * tuple on the page, we do check the index keys, to prevent + * uselessly advancing to the page to the left. This is similar + * to the high key optimization used by forward scans. 
+ */ + if (scan->ignore_killed_tuples && ItemIdIsDead(iid)) + { + Assert(offnum >= P_FIRSTDATAKEY(opaque)); + if (offnum > P_FIRSTDATAKEY(opaque)) + { + offnum = OffsetNumberPrev(offnum); + continue; + } + + tuple_alive = false; + } + else + tuple_alive = true; + + itup = (IndexTuple) PageGetItem(page, iid); + + passes_quals = _bt_checkkeys(scan, itup, indnatts, dir, + &continuescan); + if (passes_quals && tuple_alive) + { + /* tuple passes all scan key conditions */ + if (!BTreeTupleIsPosting(itup)) + { + /* Remember it */ + itemIndex--; + _bt_saveitem(so, itemIndex, offnum, itup); + } + else + { + int tupleOffset; + + /* + * Set up state to return posting list, and remember first + * TID. + * + * Note that we deliberately save/return items from + * posting lists in ascending heap TID order for backwards + * scans. This allows _bt_killitems() to make a + * consistent assumption about the order of items + * associated with the same posting list tuple. + */ + itemIndex--; + tupleOffset = + _bt_setuppostingitems(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, 0), + itup); + /* Remember additional TIDs */ + for (int i = 1; i < BTreeTupleGetNPosting(itup); i++) + { + itemIndex--; + _bt_savepostingitem(so, itemIndex, offnum, + BTreeTupleGetPostingN(itup, i), + tupleOffset); + } + } + } + if (!continuescan) + { + /* there can't be any more matches, so stop */ + so->currPos.moreLeft = false; + break; + } + + offnum = OffsetNumberPrev(offnum); + } + + Assert(itemIndex >= 0); + so->currPos.firstItem = itemIndex; + so->currPos.lastItem = MaxTIDsPerBTreePage - 1; + so->currPos.itemIndex = MaxTIDsPerBTreePage - 1; + } + + return (so->currPos.firstItem <= so->currPos.lastItem); +} diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index c2665fce41..8742716383 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -279,8 +279,6 @@ static void _bt_sort_dedup_finish_pending(BTWriteState *wstate, BTPageState *state, BTDedupState dstate); static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state); -static void _bt_load(BTWriteState *wstate, - BTSpool *btspool, BTSpool *btspool2); static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent, int request); static void _bt_end_parallel(BTLeader *btleader); @@ -293,6 +291,8 @@ static void _bt_parallel_scan_and_sort(BTSpool *btspool, BTSpool *btspool2, Sharedsort *sharedsort2, int sortmem, bool progress); +#define NBT_SPECIALIZE_FILE "../../backend/access/nbtree/nbtsort_spec.c" +#include "access/nbtree_spec.h" /* * btbuild() -- build a new btree index. @@ -544,6 +544,7 @@ static void _bt_leafbuild(BTSpool *btspool, BTSpool *btspool2) { BTWriteState wstate; + nbts_prep_ctx(btspool->index); #ifdef BTREE_BUILD_STATS if (log_btree_build_stats) @@ -846,6 +847,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup, Size pgspc; Size itupsz; bool isleaf; + nbts_prep_ctx(wstate->index); /* * This is a handy place to check for cancel interrupts during the btree @@ -1178,264 +1180,6 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state) _bt_blwritepage(wstate, metapage, BTREE_METAPAGE); } -/* - * Read tuples in correct sort order from tuplesort, and load them into - * btree leaves. 
- */ -static void -_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) -{ - BTPageState *state = NULL; - bool merge = (btspool2 != NULL); - IndexTuple itup, - itup2 = NULL; - bool load1; - TupleDesc tupdes = RelationGetDescr(wstate->index); - int i, - keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); - SortSupport sortKeys; - int64 tuples_done = 0; - bool deduplicate; - - deduplicate = wstate->inskey->allequalimage && !btspool->isunique && - BTGetDeduplicateItems(wstate->index); - - if (merge) - { - /* - * Another BTSpool for dead tuples exists. Now we have to merge - * btspool and btspool2. - */ - - /* the preparation of merge */ - itup = tuplesort_getindextuple(btspool->sortstate, true); - itup2 = tuplesort_getindextuple(btspool2->sortstate, true); - - /* Prepare SortSupport data for each column */ - sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); - - for (i = 0; i < keysz; i++) - { - SortSupport sortKey = sortKeys + i; - ScanKey scanKey = wstate->inskey->scankeys + i; - int16 strategy; - - sortKey->ssup_cxt = CurrentMemoryContext; - sortKey->ssup_collation = scanKey->sk_collation; - sortKey->ssup_nulls_first = - (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; - sortKey->ssup_attno = scanKey->sk_attno; - /* Abbreviation is not supported here */ - sortKey->abbreviate = false; - - Assert(sortKey->ssup_attno != 0); - - strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? - BTGreaterStrategyNumber : BTLessStrategyNumber; - - PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); - } - - for (;;) - { - load1 = true; /* load BTSpool next ? */ - if (itup2 == NULL) - { - if (itup == NULL) - break; - } - else if (itup != NULL) - { - int32 compare = 0; - - for (i = 1; i <= keysz; i++) - { - SortSupport entry; - Datum attrDatum1, - attrDatum2; - bool isNull1, - isNull2; - - entry = sortKeys + i - 1; - attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); - attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); - - compare = ApplySortComparator(attrDatum1, isNull1, - attrDatum2, isNull2, - entry); - if (compare > 0) - { - load1 = false; - break; - } - else if (compare < 0) - break; - } - - /* - * If key values are equal, we sort on ItemPointer. This is - * required for btree indexes, since heap TID is treated as an - * implicit last key attribute in order to ensure that all - * keys in the index are physically unique. 
- */ - if (compare == 0) - { - compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); - Assert(compare != 0); - if (compare > 0) - load1 = false; - } - } - else - load1 = false; - - /* When we see first tuple, create first index page */ - if (state == NULL) - state = _bt_pagestate(wstate, 0); - - if (load1) - { - _bt_buildadd(wstate, state, itup, 0); - itup = tuplesort_getindextuple(btspool->sortstate, true); - } - else - { - _bt_buildadd(wstate, state, itup2, 0); - itup2 = tuplesort_getindextuple(btspool2->sortstate, true); - } - - /* Report progress */ - pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, - ++tuples_done); - } - pfree(sortKeys); - } - else if (deduplicate) - { - /* merge is unnecessary, deduplicate into posting lists */ - BTDedupState dstate; - - dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); - dstate->deduplicate = true; /* unused */ - dstate->nmaxitems = 0; /* unused */ - dstate->maxpostingsize = 0; /* set later */ - /* Metadata about base tuple of current pending posting list */ - dstate->base = NULL; - dstate->baseoff = InvalidOffsetNumber; /* unused */ - dstate->basetupsize = 0; - /* Metadata about current pending posting list TIDs */ - dstate->htids = NULL; - dstate->nhtids = 0; - dstate->nitems = 0; - dstate->phystupsize = 0; /* unused */ - dstate->nintervals = 0; /* unused */ - - while ((itup = tuplesort_getindextuple(btspool->sortstate, - true)) != NULL) - { - /* When we see first tuple, create first index page */ - if (state == NULL) - { - state = _bt_pagestate(wstate, 0); - - /* - * Limit size of posting list tuples to 1/10 space we want to - * leave behind on the page, plus space for final item's line - * pointer. This is equal to the space that we'd like to - * leave behind on each leaf page when fillfactor is 90, - * allowing us to get close to fillfactor% space utilization - * when there happen to be a great many duplicates. (This - * makes higher leaf fillfactor settings ineffective when - * building indexes that have many duplicates, but packing - * leaf pages full with few very large tuples doesn't seem - * like a useful goal.) - */ - dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - - sizeof(ItemIdData); - Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && - dstate->maxpostingsize <= INDEX_SIZE_MASK); - dstate->htids = palloc(dstate->maxpostingsize); - - /* start new pending posting list with itup copy */ - _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), - InvalidOffsetNumber); - } - else if (_bt_keep_natts_fast(wstate->index, dstate->base, - itup) > keysz && - _bt_dedup_save_htid(dstate, itup)) - { - /* - * Tuple is equal to base tuple of pending posting list. Heap - * TID from itup has been saved in state. - */ - } - else - { - /* - * Tuple is not equal to pending posting list tuple, or - * _bt_dedup_save_htid() opted to not merge current item into - * pending posting list. 
- */ - _bt_sort_dedup_finish_pending(wstate, state, dstate); - pfree(dstate->base); - - /* start new pending posting list with itup copy */ - _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), - InvalidOffsetNumber); - } - - /* Report progress */ - pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, - ++tuples_done); - } - - if (state) - { - /* - * Handle the last item (there must be a last item when the - * tuplesort returned one or more tuples) - */ - _bt_sort_dedup_finish_pending(wstate, state, dstate); - pfree(dstate->base); - pfree(dstate->htids); - } - - pfree(dstate); - } - else - { - /* merging and deduplication are both unnecessary */ - while ((itup = tuplesort_getindextuple(btspool->sortstate, - true)) != NULL) - { - /* When we see first tuple, create first index page */ - if (state == NULL) - state = _bt_pagestate(wstate, 0); - - _bt_buildadd(wstate, state, itup, 0); - - /* Report progress */ - pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, - ++tuples_done); - } - } - - /* Close down final pages and write the metapage */ - _bt_uppershutdown(wstate, state); - - /* - * When we WAL-logged index pages, we must nonetheless fsync index files. - * Since we're building outside shared buffers, a CHECKPOINT occurring - * during the build has no way to flush the previously written data to - * disk (indeed it won't know the index even exists). A crash later on - * would replay WAL from the checkpoint, therefore it wouldn't replay our - * earlier WAL entries. If we do not fsync those pages here, they might - * still not be on disk when the crash occurs. - */ - if (wstate->btws_use_wal) - smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM); -} - /* * Create parallel context, and launch workers for leader. * diff --git a/src/backend/access/nbtree/nbtsort_spec.c b/src/backend/access/nbtree/nbtsort_spec.c new file mode 100644 index 0000000000..368d6f244c --- /dev/null +++ b/src/backend/access/nbtree/nbtsort_spec.c @@ -0,0 +1,280 @@ +/*------------------------------------------------------------------------- + * + * nbtsort_spec.c + * Index shape-specialized functions for nbtsort.c + * + * NOTES + * See also: access/nbtree/README section "nbtree specialization" + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtsort_spec.c + * + *------------------------------------------------------------------------- + */ + +#define _bt_load NBTS_FUNCTION(_bt_load) + +static void _bt_load(BTWriteState *wstate, + BTSpool *btspool, BTSpool *btspool2); + +/* + * Read tuples in correct sort order from tuplesort, and load them into + * btree leaves. + */ +static void +_bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) +{ + BTPageState *state = NULL; + bool merge = (btspool2 != NULL); + IndexTuple itup, + itup2 = NULL; + bool load1; + TupleDesc tupdes = RelationGetDescr(wstate->index); + int i, + keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index); + SortSupport sortKeys; + int64 tuples_done = 0; + bool deduplicate; + + deduplicate = wstate->inskey->allequalimage && !btspool->isunique && + BTGetDeduplicateItems(wstate->index); + + if (merge) + { + /* + * Another BTSpool for dead tuples exists. Now we have to merge + * btspool and btspool2. 
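+         * (btspool2 only exists for unique index builds, where tuples that
+         * may be dead are spooled separately so that they are still written
+         * to the index without taking part in the uniqueness check.)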
+ */ + + /* the preparation of merge */ + itup = tuplesort_getindextuple(btspool->sortstate, true); + itup2 = tuplesort_getindextuple(btspool2->sortstate, true); + + /* Prepare SortSupport data for each column */ + sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); + + for (i = 0; i < keysz; i++) + { + SortSupport sortKey = sortKeys + i; + ScanKey scanKey = wstate->inskey->scankeys + i; + int16 strategy; + + sortKey->ssup_cxt = CurrentMemoryContext; + sortKey->ssup_collation = scanKey->sk_collation; + sortKey->ssup_nulls_first = + (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; + sortKey->ssup_attno = scanKey->sk_attno; + /* Abbreviation is not supported here */ + sortKey->abbreviate = false; + + Assert(sortKey->ssup_attno != 0); + + strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? + BTGreaterStrategyNumber : BTLessStrategyNumber; + + PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); + } + + for (;;) + { + load1 = true; /* load BTSpool next ? */ + if (itup2 == NULL) + { + if (itup == NULL) + break; + } + else if (itup != NULL) + { + int32 compare = 0; + + for (i = 1; i <= keysz; i++) + { + SortSupport entry; + Datum attrDatum1, + attrDatum2; + bool isNull1, + isNull2; + + entry = sortKeys + i - 1; + attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); + attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); + + compare = ApplySortComparator(attrDatum1, isNull1, + attrDatum2, isNull2, + entry); + if (compare > 0) + { + load1 = false; + break; + } + else if (compare < 0) + break; + } + + /* + * If key values are equal, we sort on ItemPointer. This is + * required for btree indexes, since heap TID is treated as an + * implicit last key attribute in order to ensure that all + * keys in the index are physically unique. + */ + if (compare == 0) + { + compare = ItemPointerCompare(&itup->t_tid, &itup2->t_tid); + Assert(compare != 0); + if (compare > 0) + load1 = false; + } + } + else + load1 = false; + + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + if (load1) + { + _bt_buildadd(wstate, state, itup, 0); + itup = tuplesort_getindextuple(btspool->sortstate, true); + } + else + { + _bt_buildadd(wstate, state, itup2, 0); + itup2 = tuplesort_getindextuple(btspool2->sortstate, true); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + pfree(sortKeys); + } + else if (deduplicate) + { + /* merge is unnecessary, deduplicate into posting lists */ + BTDedupState dstate; + + dstate = (BTDedupState) palloc(sizeof(BTDedupStateData)); + dstate->deduplicate = true; /* unused */ + dstate->nmaxitems = 0; /* unused */ + dstate->maxpostingsize = 0; /* set later */ + /* Metadata about base tuple of current pending posting list */ + dstate->base = NULL; + dstate->baseoff = InvalidOffsetNumber; /* unused */ + dstate->basetupsize = 0; + /* Metadata about current pending posting list TIDs */ + dstate->htids = NULL; + dstate->nhtids = 0; + dstate->nitems = 0; + dstate->phystupsize = 0; /* unused */ + dstate->nintervals = 0; /* unused */ + + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + { + state = _bt_pagestate(wstate, 0); + + /* + * Limit size of posting list tuples to 1/10 space we want to + * leave behind on the page, plus space for final item's line + * pointer. 
This is equal to the space that we'd like to + * leave behind on each leaf page when fillfactor is 90, + * allowing us to get close to fillfactor% space utilization + * when there happen to be a great many duplicates. (This + * makes higher leaf fillfactor settings ineffective when + * building indexes that have many duplicates, but packing + * leaf pages full with few very large tuples doesn't seem + * like a useful goal.) + */ + dstate->maxpostingsize = MAXALIGN_DOWN((BLCKSZ * 10 / 100)) - + sizeof(ItemIdData); + Assert(dstate->maxpostingsize <= BTMaxItemSize(state->btps_page) && + dstate->maxpostingsize <= INDEX_SIZE_MASK); + dstate->htids = palloc(dstate->maxpostingsize); + + /* start new pending posting list with itup copy */ + _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), + InvalidOffsetNumber); + } + else if (_bt_keep_natts_fast(wstate->index, dstate->base, + itup) > keysz && + _bt_dedup_save_htid(dstate, itup)) + { + /* + * Tuple is equal to base tuple of pending posting list. Heap + * TID from itup has been saved in state. + */ + } + else + { + /* + * Tuple is not equal to pending posting list tuple, or + * _bt_dedup_save_htid() opted to not merge current item into + * pending posting list. + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + pfree(dstate->base); + + /* start new pending posting list with itup copy */ + _bt_dedup_start_pending(dstate, CopyIndexTuple(itup), + InvalidOffsetNumber); + } + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + + if (state) + { + /* + * Handle the last item (there must be a last item when the + * tuplesort returned one or more tuples) + */ + _bt_sort_dedup_finish_pending(wstate, state, dstate); + pfree(dstate->base); + pfree(dstate->htids); + } + + pfree(dstate); + } + else + { + /* merging and deduplication are both unnecessary */ + while ((itup = tuplesort_getindextuple(btspool->sortstate, + true)) != NULL) + { + /* When we see first tuple, create first index page */ + if (state == NULL) + state = _bt_pagestate(wstate, 0); + + _bt_buildadd(wstate, state, itup, 0); + + /* Report progress */ + pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE, + ++tuples_done); + } + } + + /* Close down final pages and write the metapage */ + _bt_uppershutdown(wstate, state); + + /* + * When we WAL-logged index pages, we must nonetheless fsync index files. + * Since we're building outside shared buffers, a CHECKPOINT occurring + * during the build has no way to flush the previously written data to + * disk (indeed it won't know the index even exists). A crash later on + * would replay WAL from the checkpoint, therefore it wouldn't replay our + * earlier WAL entries. If we do not fsync those pages here, they might + * still not be on disk when the crash occurs. 
+ */ + if (wstate->btws_use_wal) + smgrimmedsync(RelationGetSmgr(wstate->index), MAIN_FORKNUM); +} diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c index 43b67893d9..db2da1e303 100644 --- a/src/backend/access/nbtree/nbtsplitloc.c +++ b/src/backend/access/nbtree/nbtsplitloc.c @@ -639,6 +639,7 @@ _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff, ItemId itemid; IndexTuple tup; int keepnatts; + nbts_prep_ctx(state->rel); Assert(state->is_leaf && !state->is_rightmost); @@ -945,6 +946,7 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage, *rightinterval; int perfectpenalty; int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(state->rel); + nbts_prep_ctx(state->rel); /* Assume that alternative strategy won't be used for now */ *strategy = SPLIT_DEFAULT; @@ -1137,6 +1139,7 @@ _bt_split_penalty(FindSplitData *state, SplitPoint *split) { IndexTuple lastleft; IndexTuple firstright; + nbts_prep_ctx(state->rel); if (!state->is_leaf) { diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c index 7da499c4dd..37d644e9f3 100644 --- a/src/backend/access/nbtree/nbtutils.c +++ b/src/backend/access/nbtree/nbtutils.c @@ -50,130 +50,10 @@ static bool _bt_compare_scankey_args(IndexScanDesc scan, ScanKey op, bool *result); static bool _bt_fix_scankey_strategy(ScanKey skey, int16 *indoption); static void _bt_mark_scankey_required(ScanKey skey); -static bool _bt_check_rowcompare(ScanKey skey, - IndexTuple tuple, int tupnatts, TupleDesc tupdesc, - ScanDirection dir, bool *continuescan); -static int _bt_keep_natts(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); +#define NBT_SPECIALIZE_FILE "../../backend/access/nbtree/nbtutils_spec.c" +#include "access/nbtree_spec.h" -/* - * _bt_mkscankey - * Build an insertion scan key that contains comparison data from itup - * as well as comparator routines appropriate to the key datatypes. - * - * When itup is a non-pivot tuple, the returned insertion scan key is - * suitable for finding a place for it to go on the leaf level. Pivot - * tuples can be used to re-find leaf page with matching high key, but - * then caller needs to set scan key's pivotsearch field to true. This - * allows caller to search for a leaf page with a matching high key, - * which is usually to the left of the first leaf page a non-pivot match - * might appear on. - * - * The result is intended for use with _bt_compare() and _bt_truncate(). - * Callers that don't need to fill out the insertion scankey arguments - * (e.g. they use an ad-hoc comparison routine, or only need a scankey - * for _bt_truncate()) can pass a NULL index tuple. The scankey will - * be initialized as if an "all truncated" pivot tuple was passed - * instead. - * - * Note that we may occasionally have to share lock the metapage to - * determine whether or not the keys in the index are expected to be - * unique (i.e. if this is a "heapkeyspace" index). We assume a - * heapkeyspace index when caller passes a NULL tuple, allowing index - * build callers to avoid accessing the non-existent metapage. We - * also assume that the index is _not_ allequalimage when a NULL tuple - * is passed; CREATE INDEX callers call _bt_allequalimage() to set the - * field themselves. 
- */ -BTScanInsert -_bt_mkscankey(Relation rel, IndexTuple itup) -{ - BTScanInsert key; - ScanKey skey; - TupleDesc itupdesc; - int indnkeyatts; - int16 *indoption; - int tupnatts; - int i; - - itupdesc = RelationGetDescr(rel); - indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - indoption = rel->rd_indoption; - tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; - - Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); - - /* - * We'll execute search using scan key constructed on key columns. - * Truncated attributes and non-key attributes are omitted from the final - * scan key. - */ - key = palloc(offsetof(BTScanInsertData, scankeys) + - sizeof(ScanKeyData) * indnkeyatts); - if (itup) - _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); - else - { - /* Utility statement callers can set these fields themselves */ - key->heapkeyspace = true; - key->allequalimage = false; - } - key->anynullkeys = false; /* initial assumption */ - key->nextkey = false; - key->pivotsearch = false; - key->keysz = Min(indnkeyatts, tupnatts); - key->scantid = key->heapkeyspace && itup ? - BTreeTupleGetHeapTID(itup) : NULL; - skey = key->scankeys; - for (i = 0; i < indnkeyatts; i++) - { - FmgrInfo *procinfo; - Datum arg; - bool null; - int flags; - - /* - * We can use the cached (default) support procs since no cross-type - * comparison can be needed. - */ - procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); - - /* - * Key arguments built from truncated attributes (or when caller - * provides no tuple) are defensively represented as NULL values. They - * should never be used. - */ - if (i < tupnatts) - arg = index_getattr(itup, i + 1, itupdesc, &null); - else - { - arg = (Datum) 0; - null = true; - } - flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); - ScanKeyEntryInitializeWithInfo(&skey[i], - flags, - (AttrNumber) (i + 1), - InvalidStrategy, - InvalidOid, - rel->rd_indcollation[i], - procinfo, - arg); - /* Record if any key attribute is NULL (or truncated) */ - if (null) - key->anynullkeys = true; - } - - /* - * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so - * that full uniqueness check is done. - */ - if (rel->rd_index->indnullsnotdistinct) - key->anynullkeys = false; - - return key; -} /* * free a retracement stack made by _bt_search. @@ -1340,356 +1220,6 @@ _bt_mark_scankey_required(ScanKey skey) } } -/* - * Test whether an indextuple satisfies all the scankey conditions. - * - * Return true if so, false if not. If the tuple fails to pass the qual, - * we also determine whether there's any need to continue the scan beyond - * this tuple, and set *continuescan accordingly. See comments for - * _bt_preprocess_keys(), above, about how this is done. - * - * Forward scan callers can pass a high key tuple in the hopes of having - * us set *continuescan to false, and avoiding an unnecessary visit to - * the page to the right. 
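Editorial sketch of how a page-scanning caller is expected to consume the two outputs (the real consumers live in nbtsearch.c); shown only to make the continuescan contract concrete:

	bool		continuescan;

	if (_bt_checkkeys(scan, itup, tupnatts, dir, &continuescan))
	{
		/* tuple satisfies every qual: hand it back to the caller */
	}
	else if (!continuescan)
	{
		/*
		 * A required qual can no longer be satisfied in this scan
		 * direction, so stop looking at further tuples entirely.
		 */
	}
	else
	{
		/* only this tuple fails: keep scanning the page */
	}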
- * - * scan: index scan descriptor (containing a search-type scankey) - * tuple: index tuple to test - * tupnatts: number of attributes in tupnatts (high key may be truncated) - * dir: direction we are scanning in - * continuescan: output parameter (will be set correctly in all cases) - */ -bool -_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, - ScanDirection dir, bool *continuescan) -{ - TupleDesc tupdesc; - BTScanOpaque so; - int keysz; - int ikey; - ScanKey key; - - Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); - - *continuescan = true; /* default assumption */ - - tupdesc = RelationGetDescr(scan->indexRelation); - so = (BTScanOpaque) scan->opaque; - keysz = so->numberOfKeys; - - for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) - { - Datum datum; - bool isNull; - Datum test; - - if (key->sk_attno > tupnatts) - { - /* - * This attribute is truncated (must be high key). The value for - * this attribute in the first non-pivot tuple on the page to the - * right could be any possible value. Assume that truncated - * attribute passes the qual. - */ - Assert(ScanDirectionIsForward(dir)); - Assert(BTreeTupleIsPivot(tuple)); - continue; - } - - /* row-comparison keys need special processing */ - if (key->sk_flags & SK_ROW_HEADER) - { - if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, - continuescan)) - continue; - return false; - } - - datum = index_getattr(tuple, - key->sk_attno, - tupdesc, - &isNull); - - if (key->sk_flags & SK_ISNULL) - { - /* Handle IS NULL/NOT NULL tests */ - if (key->sk_flags & SK_SEARCHNULL) - { - if (isNull) - continue; /* tuple satisfies this qual */ - } - else - { - Assert(key->sk_flags & SK_SEARCHNOTNULL); - if (!isNull) - continue; /* tuple satisfies this qual */ - } - - /* - * Tuple fails this qual. If it's a required qual for the current - * scan direction, then we can conclude no further tuples will - * pass, either. - */ - if ((key->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((key->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - - /* - * In any case, this indextuple doesn't match the qual. - */ - return false; - } - - if (isNull) - { - if (key->sk_flags & SK_BT_NULLS_FIRST) - { - /* - * Since NULLs are sorted before non-NULLs, we know we have - * reached the lower limit of the range of values for this - * index attr. On a backward scan, we can stop if this qual - * is one of the "must match" subset. We can stop regardless - * of whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a forward scan, however, we must keep going, because we may - * have initially positioned to the start of the index. - */ - if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - } - else - { - /* - * Since NULLs are sorted after non-NULLs, we know we have - * reached the upper limit of the range of values for this - * index attr. On a forward scan, we can stop if this qual is - * one of the "must match" subset. We can stop regardless of - * whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a backward scan, however, we must keep going, because we - * may have initially positioned to the end of the index. 
- */ - if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && - ScanDirectionIsForward(dir)) - *continuescan = false; - } - - /* - * In any case, this indextuple doesn't match the qual. - */ - return false; - } - - test = FunctionCall2Coll(&key->sk_func, key->sk_collation, - datum, key->sk_argument); - - if (!DatumGetBool(test)) - { - /* - * Tuple fails this qual. If it's a required qual for the current - * scan direction, then we can conclude no further tuples will - * pass, either. - * - * Note: because we stop the scan as soon as any required equality - * qual fails, it is critical that equality quals be used for the - * initial positioning in _bt_first() when they are available. See - * comments in _bt_first(). - */ - if ((key->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((key->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - - /* - * In any case, this indextuple doesn't match the qual. - */ - return false; - } - } - - /* If we get here, the tuple passes all index quals. */ - return true; -} - -/* - * Test whether an indextuple satisfies a row-comparison scan condition. - * - * Return true if so, false if not. If not, also clear *continuescan if - * it's not possible for any future tuples in the current scan direction - * to pass the qual. - * - * This is a subroutine for _bt_checkkeys, which see for more info. - */ -static bool -_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, - TupleDesc tupdesc, ScanDirection dir, bool *continuescan) -{ - ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); - int32 cmpresult = 0; - bool result; - - /* First subkey should be same as the header says */ - Assert(subkey->sk_attno == skey->sk_attno); - - /* Loop over columns of the row condition */ - for (;;) - { - Datum datum; - bool isNull; - - Assert(subkey->sk_flags & SK_ROW_MEMBER); - - if (subkey->sk_attno > tupnatts) - { - /* - * This attribute is truncated (must be high key). The value for - * this attribute in the first non-pivot tuple on the page to the - * right could be any possible value. Assume that truncated - * attribute passes the qual. - */ - Assert(ScanDirectionIsForward(dir)); - Assert(BTreeTupleIsPivot(tuple)); - cmpresult = 0; - if (subkey->sk_flags & SK_ROW_END) - break; - subkey++; - continue; - } - - datum = index_getattr(tuple, - subkey->sk_attno, - tupdesc, - &isNull); - - if (isNull) - { - if (subkey->sk_flags & SK_BT_NULLS_FIRST) - { - /* - * Since NULLs are sorted before non-NULLs, we know we have - * reached the lower limit of the range of values for this - * index attr. On a backward scan, we can stop if this qual - * is one of the "must match" subset. We can stop regardless - * of whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. On - * a forward scan, however, we must keep going, because we may - * have initially positioned to the start of the index. - */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - } - else - { - /* - * Since NULLs are sorted after non-NULLs, we know we have - * reached the upper limit of the range of values for this - * index attr. On a forward scan, we can stop if this qual is - * one of the "must match" subset. We can stop regardless of - * whether the qual is > or <, so long as it's required, - * because it's not possible for any future tuples to pass. 
On - * a backward scan, however, we must keep going, because we - * may have initially positioned to the end of the index. - */ - if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && - ScanDirectionIsForward(dir)) - *continuescan = false; - } - - /* - * In any case, this indextuple doesn't match the qual. - */ - return false; - } - - if (subkey->sk_flags & SK_ISNULL) - { - /* - * Unlike the simple-scankey case, this isn't a disallowed case. - * But it can never match. If all the earlier row comparison - * columns are required for the scan direction, we can stop the - * scan, because there can't be another tuple that will succeed. - */ - if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument)) - subkey--; - if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((subkey->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - return false; - } - - /* Perform the test --- three-way comparison not bool operator */ - cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, - subkey->sk_collation, - datum, - subkey->sk_argument)); - - if (subkey->sk_flags & SK_BT_DESC) - INVERT_COMPARE_RESULT(cmpresult); - - /* Done comparing if unequal, else advance to next column */ - if (cmpresult != 0) - break; - - if (subkey->sk_flags & SK_ROW_END) - break; - subkey++; - } - - /* - * At this point cmpresult indicates the overall result of the row - * comparison, and subkey points to the deciding column (or the last - * column if the result is "="). - */ - switch (subkey->sk_strategy) - { - /* EQ and NE cases aren't allowed here */ - case BTLessStrategyNumber: - result = (cmpresult < 0); - break; - case BTLessEqualStrategyNumber: - result = (cmpresult <= 0); - break; - case BTGreaterEqualStrategyNumber: - result = (cmpresult >= 0); - break; - case BTGreaterStrategyNumber: - result = (cmpresult > 0); - break; - default: - elog(ERROR, "unrecognized RowCompareType: %d", - (int) subkey->sk_strategy); - result = 0; /* keep compiler quiet */ - break; - } - - if (!result) - { - /* - * Tuple fails this qual. If it's a required qual for the current - * scan direction, then we can conclude no further tuples will pass, - * either. Note we have to look at the deciding column, not - * necessarily the first or last column of the row condition. - */ - if ((subkey->sk_flags & SK_BT_REQFWD) && - ScanDirectionIsForward(dir)) - *continuescan = false; - else if ((subkey->sk_flags & SK_BT_REQBKWD) && - ScanDirectionIsBackward(dir)) - *continuescan = false; - } - - return result; -} - /* * _bt_killitems - set LP_DEAD state for items an indexscan caller has * told us were killed @@ -2173,286 +1703,6 @@ btbuildphasename(int64 phasenum) } } -/* - * _bt_truncate() -- create tuple without unneeded suffix attributes. - * - * Returns truncated pivot index tuple allocated in caller's memory context, - * with key attributes copied from caller's firstright argument. If rel is - * an INCLUDE index, non-key attributes will definitely be truncated away, - * since they're not part of the key space. More aggressive suffix - * truncation can take place when it's clear that the returned tuple does not - * need one or more suffix key attributes. We only need to keep firstright - * attributes up to and including the first non-lastleft-equal attribute. - * Caller's insertion scankey is used to compare the tuples; the scankey's - * argument values are not considered here. 
- * - * Note that returned tuple's t_tid offset will hold the number of attributes - * present, so the original item pointer offset is not represented. Caller - * should only change truncated tuple's downlink. Note also that truncated - * key attributes are treated as containing "minus infinity" values by - * _bt_compare(). - * - * In the worst case (when a heap TID must be appended to distinguish lastleft - * from firstright), the size of the returned tuple is the size of firstright - * plus the size of an additional MAXALIGN()'d item pointer. This guarantee - * is important, since callers need to stay under the 1/3 of a page - * restriction on tuple size. If this routine is ever taught to truncate - * within an attribute/datum, it will need to avoid returning an enlarged - * tuple to caller when truncation + TOAST compression ends up enlarging the - * final datum. - */ -IndexTuple -_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) -{ - TupleDesc itupdesc = RelationGetDescr(rel); - int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - int keepnatts; - IndexTuple pivot; - IndexTuple tidpivot; - ItemPointer pivotheaptid; - Size newsize; - - /* - * We should only ever truncate non-pivot tuples from leaf pages. It's - * never okay to truncate when splitting an internal page. - */ - Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); - - /* Determine how many attributes must be kept in truncated tuple */ - keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); - -#ifdef DEBUG_NO_TRUNCATE - /* Force truncation to be ineffective for testing purposes */ - keepnatts = nkeyatts + 1; -#endif - - pivot = index_truncate_tuple(itupdesc, firstright, - Min(keepnatts, nkeyatts)); - - if (BTreeTupleIsPosting(pivot)) - { - /* - * index_truncate_tuple() just returns a straight copy of firstright - * when it has no attributes to truncate. When that happens, we may - * need to truncate away a posting list here instead. - */ - Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1); - Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts); - pivot->t_info &= ~INDEX_SIZE_MASK; - pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); - } - - /* - * If there is a distinguishing key attribute within pivot tuple, we're - * done - */ - if (keepnatts <= nkeyatts) - { - BTreeTupleSetNAtts(pivot, keepnatts, false); - return pivot; - } - - /* - * We have to store a heap TID in the new pivot tuple, since no non-TID - * key attribute value in firstright distinguishes the right side of the - * split from the left side. nbtree conceptualizes this case as an - * inability to truncate away any key attributes, since heap TID is - * treated as just another key attribute (despite lacking a pg_attribute - * entry). - * - * Use enlarged space that holds a copy of pivot. We need the extra space - * to store a heap TID at the end (using the special pivot tuple - * representation). Note that the original pivot already has firstright's - * possible posting list/non-key attribute values removed at this point. 
- */ - newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData)); - tidpivot = palloc0(newsize); - memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot))); - /* Cannot leak memory here */ - pfree(pivot); - - /* - * Store all of firstright's key attribute values plus a tiebreaker heap - * TID value in enlarged pivot tuple - */ - tidpivot->t_info &= ~INDEX_SIZE_MASK; - tidpivot->t_info |= newsize; - BTreeTupleSetNAtts(tidpivot, nkeyatts, true); - pivotheaptid = BTreeTupleGetHeapTID(tidpivot); - - /* - * Lehman & Yao use lastleft as the leaf high key in all cases, but don't - * consider suffix truncation. It seems like a good idea to follow that - * example in cases where no truncation takes place -- use lastleft's heap - * TID. (This is also the closest value to negative infinity that's - * legally usable.) - */ - ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); - - /* - * We're done. Assert() that heap TID invariants hold before returning. - * - * Lehman and Yao require that the downlink to the right page, which is to - * be inserted into the parent page in the second phase of a page split be - * a strict lower bound on items on the right page, and a non-strict upper - * bound for items on the left page. Assert that heap TIDs follow these - * invariants, since a heap TID value is apparently needed as a - * tiebreaker. - */ -#ifndef DEBUG_NO_TRUNCATE - Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), - BTreeTupleGetHeapTID(firstright)) < 0); - Assert(ItemPointerCompare(pivotheaptid, - BTreeTupleGetHeapTID(lastleft)) >= 0); - Assert(ItemPointerCompare(pivotheaptid, - BTreeTupleGetHeapTID(firstright)) < 0); -#else - - /* - * Those invariants aren't guaranteed to hold for lastleft + firstright - * heap TID attribute values when they're considered here only because - * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually - * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap - * TID value that always works as a strict lower bound for items to the - * right. In particular, it must avoid using firstright's leading key - * attribute values along with lastleft's heap TID value when lastleft's - * TID happens to be greater than firstright's TID. - */ - ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); - - /* - * Pivot heap TID should never be fully equal to firstright. Note that - * the pivot heap TID will still end up equal to lastleft's heap TID when - * that's the only usable value. - */ - ItemPointerSetOffsetNumber(pivotheaptid, - OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); - Assert(ItemPointerCompare(pivotheaptid, - BTreeTupleGetHeapTID(firstright)) < 0); -#endif - - return tidpivot; -} - -/* - * _bt_keep_natts - how many key attributes to keep when truncating. - * - * Caller provides two tuples that enclose a split point. Caller's insertion - * scankey is used to compare the tuples; the scankey's argument values are - * not considered here. - * - * This can return a number of attributes that is one greater than the - * number of key attributes for the index relation. This indicates that the - * caller must use a heap TID as a unique-ifier in new pivot tuple. 
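A worked example of that return convention (editorial; assumes a hypothetical three-column key (a, b, c), so nkeyatts = 3):

	/*
	 * lastleft = (1, 5, 10), firstright = (1, 7, 2):
	 *		a is equal, b differs  ->  keepnatts = 2, so the new pivot
	 *		keeps firstright's (1, 7) and c is truncated away.
	 *
	 * lastleft = (1, 5, 10), firstright = (1, 5, 10):
	 *		all key attributes equal  ->  keepnatts = 4 (nkeyatts + 1),
	 *		so the caller must add a heap TID tiebreaker to the pivot.
	 */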
- */ -static int -_bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, - BTScanInsert itup_key) -{ - int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); - TupleDesc itupdesc = RelationGetDescr(rel); - int keepnatts; - ScanKey scankey; - - /* - * _bt_compare() treats truncated key attributes as having the value minus - * infinity, which would break searches within !heapkeyspace indexes. We - * must still truncate away non-key attribute values, though. - */ - if (!itup_key->heapkeyspace) - return nkeyatts; - - scankey = itup_key->scankeys; - keepnatts = 1; - for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++) - { - Datum datum1, - datum2; - bool isNull1, - isNull2; - - datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); - datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); - - if (isNull1 != isNull2) - break; - - if (!isNull1 && - DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, - scankey->sk_collation, - datum1, - datum2)) != 0) - break; - - keepnatts++; - } - - /* - * Assert that _bt_keep_natts_fast() agrees with us in passing. This is - * expected in an allequalimage index. - */ - Assert(!itup_key->allequalimage || - keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); - - return keepnatts; -} - -/* - * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts. - * - * This is exported so that a candidate split point can have its effect on - * suffix truncation inexpensively evaluated ahead of time when finding a - * split location. A naive bitwise approach to datum comparisons is used to - * save cycles. - * - * The approach taken here usually provides the same answer as _bt_keep_natts - * will (for the same pair of tuples from a heapkeyspace index), since the - * majority of btree opclasses can never indicate that two datums are equal - * unless they're bitwise equal after detoasting. When an index only has - * "equal image" columns, routine is guaranteed to give the same result as - * _bt_keep_natts would. - * - * Callers can rely on the fact that attributes considered equal here are - * definitely also equal according to _bt_keep_natts, even when the index uses - * an opclass or collation that is not "allequalimage"/deduplication-safe. - * This weaker guarantee is good enough for nbtsplitloc.c caller, since false - * negatives generally only have the effect of making leaf page splits use a - * more balanced split point. - */ -int -_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) -{ - TupleDesc itupdesc = RelationGetDescr(rel); - int keysz = IndexRelationGetNumberOfKeyAttributes(rel); - int keepnatts; - - keepnatts = 1; - for (int attnum = 1; attnum <= keysz; attnum++) - { - Datum datum1, - datum2; - bool isNull1, - isNull2; - Form_pg_attribute att; - - datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); - datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); - att = TupleDescAttr(itupdesc, attnum - 1); - - if (isNull1 != isNull2) - break; - - if (!isNull1 && - !datum_image_eq(datum1, datum2, att->attbyval, att->attlen)) - break; - - keepnatts++; - } - - return keepnatts; -} - /* * _bt_check_natts() -- Verify tuple has expected number of attributes. 
* diff --git a/src/backend/access/nbtree/nbtutils_spec.c b/src/backend/access/nbtree/nbtutils_spec.c new file mode 100644 index 0000000000..0288da22d6 --- /dev/null +++ b/src/backend/access/nbtree/nbtutils_spec.c @@ -0,0 +1,775 @@ +/*------------------------------------------------------------------------- + * + * nbtutils_spec.c + * Index shape-specialized functions for nbtutils.c + * + * NOTES + * See also: access/nbtree/README section "nbtree specialization" + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/nbtutils_spec.c + * + *------------------------------------------------------------------------- + */ + +#define _bt_check_rowcompare NBTS_FUNCTION(_bt_check_rowcompare) +#define _bt_keep_natts NBTS_FUNCTION(_bt_keep_natts) + +static bool _bt_check_rowcompare(ScanKey skey, + IndexTuple tuple, int tupnatts, TupleDesc tupdesc, + ScanDirection dir, bool *continuescan); +static int _bt_keep_natts(Relation rel, IndexTuple lastleft, + IndexTuple firstright, BTScanInsert itup_key); + + +/* + * _bt_mkscankey + * Build an insertion scan key that contains comparison data from itup + * as well as comparator routines appropriate to the key datatypes. + * + * When itup is a non-pivot tuple, the returned insertion scan key is + * suitable for finding a place for it to go on the leaf level. Pivot + * tuples can be used to re-find leaf page with matching high key, but + * then caller needs to set scan key's pivotsearch field to true. This + * allows caller to search for a leaf page with a matching high key, + * which is usually to the left of the first leaf page a non-pivot match + * might appear on. + * + * The result is intended for use with _bt_compare() and _bt_truncate(). + * Callers that don't need to fill out the insertion scankey arguments + * (e.g. they use an ad-hoc comparison routine, or only need a scankey + * for _bt_truncate()) can pass a NULL index tuple. The scankey will + * be initialized as if an "all truncated" pivot tuple was passed + * instead. + * + * Note that we may occasionally have to share lock the metapage to + * determine whether or not the keys in the index are expected to be + * unique (i.e. if this is a "heapkeyspace" index). We assume a + * heapkeyspace index when caller passes a NULL tuple, allowing index + * build callers to avoid accessing the non-existent metapage. We + * also assume that the index is _not_ allequalimage when a NULL tuple + * is passed; CREATE INDEX callers call _bt_allequalimage() to set the + * field themselves. + */ +BTScanInsert +_bt_mkscankey(Relation rel, IndexTuple itup) +{ + BTScanInsert key; + ScanKey skey; + TupleDesc itupdesc; + int indnkeyatts; + int16 *indoption; + int tupnatts; + int i; + + itupdesc = RelationGetDescr(rel); + indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + indoption = rel->rd_indoption; + tupnatts = itup ? BTreeTupleGetNAtts(itup, rel) : 0; + + Assert(tupnatts <= IndexRelationGetNumberOfAttributes(rel)); + + /* + * We'll execute search using scan key constructed on key columns. + * Truncated attributes and non-key attributes are omitted from the final + * scan key. 
+ */ + key = palloc(offsetof(BTScanInsertData, scankeys) + + sizeof(ScanKeyData) * indnkeyatts); + if (itup) + _bt_metaversion(rel, &key->heapkeyspace, &key->allequalimage); + else + { + /* Utility statement callers can set these fields themselves */ + key->heapkeyspace = true; + key->allequalimage = false; + } + key->anynullkeys = false; /* initial assumption */ + key->nextkey = false; + key->pivotsearch = false; + key->keysz = Min(indnkeyatts, tupnatts); + key->scantid = key->heapkeyspace && itup ? + BTreeTupleGetHeapTID(itup) : NULL; + skey = key->scankeys; + for (i = 0; i < indnkeyatts; i++) + { + FmgrInfo *procinfo; + Datum arg; + bool null; + int flags; + + /* + * We can use the cached (default) support procs since no cross-type + * comparison can be needed. + */ + procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); + + /* + * Key arguments built from truncated attributes (or when caller + * provides no tuple) are defensively represented as NULL values. They + * should never be used. + */ + if (i < tupnatts) + arg = index_getattr(itup, i + 1, itupdesc, &null); + else + { + arg = (Datum) 0; + null = true; + } + flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); + ScanKeyEntryInitializeWithInfo(&skey[i], + flags, + (AttrNumber) (i + 1), + InvalidStrategy, + InvalidOid, + rel->rd_indcollation[i], + procinfo, + arg); + /* Record if any key attribute is NULL (or truncated) */ + if (null) + key->anynullkeys = true; + } + + /* + * In NULLS NOT DISTINCT mode, we pretend that there are no null keys, so + * that full uniqueness check is done. + */ + if (rel->rd_index->indnullsnotdistinct) + key->anynullkeys = false; + + return key; +} + +/* + * Test whether an indextuple satisfies all the scankey conditions. + * + * Return true if so, false if not. If the tuple fails to pass the qual, + * we also determine whether there's any need to continue the scan beyond + * this tuple, and set *continuescan accordingly. See comments for + * _bt_preprocess_keys(), above, about how this is done. + * + * Forward scan callers can pass a high key tuple in the hopes of having + * us set *continuescan to false, and avoiding an unnecessary visit to + * the page to the right. + * + * scan: index scan descriptor (containing a search-type scankey) + * tuple: index tuple to test + * tupnatts: number of attributes in tupnatts (high key may be truncated) + * dir: direction we are scanning in + * continuescan: output parameter (will be set correctly in all cases) + */ +bool +_bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool *continuescan) +{ + TupleDesc tupdesc; + BTScanOpaque so; + int keysz; + int ikey; + ScanKey key; + + Assert(BTreeTupleGetNAtts(tuple, scan->indexRelation) == tupnatts); + + *continuescan = true; /* default assumption */ + + tupdesc = RelationGetDescr(scan->indexRelation); + so = (BTScanOpaque) scan->opaque; + keysz = so->numberOfKeys; + + for (key = so->keyData, ikey = 0; ikey < keysz; key++, ikey++) + { + Datum datum; + bool isNull; + Datum test; + + if (key->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. 
+ */ + Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); + continue; + } + + /* row-comparison keys need special processing */ + if (key->sk_flags & SK_ROW_HEADER) + { + if (_bt_check_rowcompare(key, tuple, tupnatts, tupdesc, dir, + continuescan)) + continue; + return false; + } + + datum = index_getattr(tuple, + key->sk_attno, + tupdesc, + &isNull); + + if (key->sk_flags & SK_ISNULL) + { + /* Handle IS NULL/NOT NULL tests */ + if (key->sk_flags & SK_SEARCHNULL) + { + if (isNull) + continue; /* tuple satisfies this qual */ + } + else + { + Assert(key->sk_flags & SK_SEARCHNOTNULL); + if (!isNull) + continue; /* tuple satisfies this qual */ + } + + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. + */ + if ((key->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((key->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + if (isNull) + { + if (key->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. We can stop regardless + * of whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a forward scan, however, we must keep going, because we may + * have initially positioned to the start of the index. + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. We can stop regardless of + * whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a backward scan, however, we must keep going, because we + * may have initially positioned to the end of the index. + */ + if ((key->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsForward(dir)) + *continuescan = false; + } + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + test = FunctionCall2Coll(&key->sk_func, key->sk_collation, + datum, key->sk_argument); + + if (!DatumGetBool(test)) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will + * pass, either. + * + * Note: because we stop the scan as soon as any required equality + * qual fails, it is critical that equality quals be used for the + * initial positioning in _bt_first() when they are available. See + * comments in _bt_first(). + */ + if ((key->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((key->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + } + + /* If we get here, the tuple passes all index quals. */ + return true; +} + +/* + * Test whether an indextuple satisfies a row-comparison scan condition. + * + * Return true if so, false if not. 
If not, also clear *continuescan if + * it's not possible for any future tuples in the current scan direction + * to pass the qual. + * + * This is a subroutine for _bt_checkkeys, which see for more info. + */ +static bool +_bt_check_rowcompare(ScanKey skey, IndexTuple tuple, int tupnatts, + TupleDesc tupdesc, ScanDirection dir, bool *continuescan) +{ + ScanKey subkey = (ScanKey) DatumGetPointer(skey->sk_argument); + int32 cmpresult = 0; + bool result; + + /* First subkey should be same as the header says */ + Assert(subkey->sk_attno == skey->sk_attno); + + /* Loop over columns of the row condition */ + for (;;) + { + Datum datum; + bool isNull; + + Assert(subkey->sk_flags & SK_ROW_MEMBER); + + if (subkey->sk_attno > tupnatts) + { + /* + * This attribute is truncated (must be high key). The value for + * this attribute in the first non-pivot tuple on the page to the + * right could be any possible value. Assume that truncated + * attribute passes the qual. + */ + Assert(ScanDirectionIsForward(dir)); + Assert(BTreeTupleIsPivot(tuple)); + cmpresult = 0; + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + continue; + } + + datum = index_getattr(tuple, + subkey->sk_attno, + tupdesc, + &isNull); + + if (isNull) + { + if (subkey->sk_flags & SK_BT_NULLS_FIRST) + { + /* + * Since NULLs are sorted before non-NULLs, we know we have + * reached the lower limit of the range of values for this + * index attr. On a backward scan, we can stop if this qual + * is one of the "must match" subset. We can stop regardless + * of whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a forward scan, however, we must keep going, because we may + * have initially positioned to the start of the index. + */ + if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + else + { + /* + * Since NULLs are sorted after non-NULLs, we know we have + * reached the upper limit of the range of values for this + * index attr. On a forward scan, we can stop if this qual is + * one of the "must match" subset. We can stop regardless of + * whether the qual is > or <, so long as it's required, + * because it's not possible for any future tuples to pass. On + * a backward scan, however, we must keep going, because we + * may have initially positioned to the end of the index. + */ + if ((subkey->sk_flags & (SK_BT_REQFWD | SK_BT_REQBKWD)) && + ScanDirectionIsForward(dir)) + *continuescan = false; + } + + /* + * In any case, this indextuple doesn't match the qual. + */ + return false; + } + + if (subkey->sk_flags & SK_ISNULL) + { + /* + * Unlike the simple-scankey case, this isn't a disallowed case. + * But it can never match. If all the earlier row comparison + * columns are required for the scan direction, we can stop the + * scan, because there can't be another tuple that will succeed. 
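A worked example of this row-comparison logic (editorial illustration), for the qual ROW(a, b) < ROW(5, 10) on a forward scan:

	/*
	 * index tuple (a, b) = (5, 7):
	 *		a: 5 vs 5 -> equal, advance;  b: 7 vs 10 -> cmpresult < 0.
	 *		The deciding column is b with BTLessStrategyNumber, so
	 *		result = (cmpresult < 0) = true.
	 *
	 * index tuple (a, b) = (6, 1):
	 *		a: 6 vs 5 -> cmpresult > 0, stop comparing.
	 *		result = false; and if a is marked as required for a forward
	 *		scan, *continuescan is also cleared, ending the scan early.
	 */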
+ */ + if (subkey != (ScanKey) DatumGetPointer(skey->sk_argument)) + subkey--; + if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + return false; + } + + /* Perform the test --- three-way comparison not bool operator */ + cmpresult = DatumGetInt32(FunctionCall2Coll(&subkey->sk_func, + subkey->sk_collation, + datum, + subkey->sk_argument)); + + if (subkey->sk_flags & SK_BT_DESC) + INVERT_COMPARE_RESULT(cmpresult); + + /* Done comparing if unequal, else advance to next column */ + if (cmpresult != 0) + break; + + if (subkey->sk_flags & SK_ROW_END) + break; + subkey++; + } + + /* + * At this point cmpresult indicates the overall result of the row + * comparison, and subkey points to the deciding column (or the last + * column if the result is "="). + */ + switch (subkey->sk_strategy) + { + /* EQ and NE cases aren't allowed here */ + case BTLessStrategyNumber: + result = (cmpresult < 0); + break; + case BTLessEqualStrategyNumber: + result = (cmpresult <= 0); + break; + case BTGreaterEqualStrategyNumber: + result = (cmpresult >= 0); + break; + case BTGreaterStrategyNumber: + result = (cmpresult > 0); + break; + default: + elog(ERROR, "unrecognized RowCompareType: %d", + (int) subkey->sk_strategy); + result = 0; /* keep compiler quiet */ + break; + } + + if (!result) + { + /* + * Tuple fails this qual. If it's a required qual for the current + * scan direction, then we can conclude no further tuples will pass, + * either. Note we have to look at the deciding column, not + * necessarily the first or last column of the row condition. + */ + if ((subkey->sk_flags & SK_BT_REQFWD) && + ScanDirectionIsForward(dir)) + *continuescan = false; + else if ((subkey->sk_flags & SK_BT_REQBKWD) && + ScanDirectionIsBackward(dir)) + *continuescan = false; + } + + return result; +} + +/* + * _bt_truncate() -- create tuple without unneeded suffix attributes. + * + * Returns truncated pivot index tuple allocated in caller's memory context, + * with key attributes copied from caller's firstright argument. If rel is + * an INCLUDE index, non-key attributes will definitely be truncated away, + * since they're not part of the key space. More aggressive suffix + * truncation can take place when it's clear that the returned tuple does not + * need one or more suffix key attributes. We only need to keep firstright + * attributes up to and including the first non-lastleft-equal attribute. + * Caller's insertion scankey is used to compare the tuples; the scankey's + * argument values are not considered here. + * + * Note that returned tuple's t_tid offset will hold the number of attributes + * present, so the original item pointer offset is not represented. Caller + * should only change truncated tuple's downlink. Note also that truncated + * key attributes are treated as containing "minus infinity" values by + * _bt_compare(). + * + * In the worst case (when a heap TID must be appended to distinguish lastleft + * from firstright), the size of the returned tuple is the size of firstright + * plus the size of an additional MAXALIGN()'d item pointer. This guarantee + * is important, since callers need to stay under the 1/3 of a page + * restriction on tuple size. If this routine is ever taught to truncate + * within an attribute/datum, it will need to avoid returning an enlarged + * tuple to caller when truncation + TOAST compression ends up enlarging the + * final datum. 
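To make the worst-case growth concrete (editorial note; assumes 8-byte MAXALIGN, as on common 64-bit builds):

	/*
	 * newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData))
	 *         = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(6)
	 *         = MAXALIGN(IndexTupleSize(pivot)) + 8
	 *
	 * i.e. appending the heap TID tiebreaker costs at most 8 extra bytes
	 * beyond firstright's own MAXALIGN'd size.
	 */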
+ */ +IndexTuple +_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int16 nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + int keepnatts; + IndexTuple pivot; + IndexTuple tidpivot; + ItemPointer pivotheaptid; + Size newsize; + + /* + * We should only ever truncate non-pivot tuples from leaf pages. It's + * never okay to truncate when splitting an internal page. + */ + Assert(!BTreeTupleIsPivot(lastleft) && !BTreeTupleIsPivot(firstright)); + + /* Determine how many attributes must be kept in truncated tuple */ + keepnatts = _bt_keep_natts(rel, lastleft, firstright, itup_key); + +#ifdef DEBUG_NO_TRUNCATE + /* Force truncation to be ineffective for testing purposes */ + keepnatts = nkeyatts + 1; +#endif + + pivot = index_truncate_tuple(itupdesc, firstright, + Min(keepnatts, nkeyatts)); + + if (BTreeTupleIsPosting(pivot)) + { + /* + * index_truncate_tuple() just returns a straight copy of firstright + * when it has no attributes to truncate. When that happens, we may + * need to truncate away a posting list here instead. + */ + Assert(keepnatts == nkeyatts || keepnatts == nkeyatts + 1); + Assert(IndexRelationGetNumberOfAttributes(rel) == nkeyatts); + pivot->t_info &= ~INDEX_SIZE_MASK; + pivot->t_info |= MAXALIGN(BTreeTupleGetPostingOffset(firstright)); + } + + /* + * If there is a distinguishing key attribute within pivot tuple, we're + * done + */ + if (keepnatts <= nkeyatts) + { + BTreeTupleSetNAtts(pivot, keepnatts, false); + return pivot; + } + + /* + * We have to store a heap TID in the new pivot tuple, since no non-TID + * key attribute value in firstright distinguishes the right side of the + * split from the left side. nbtree conceptualizes this case as an + * inability to truncate away any key attributes, since heap TID is + * treated as just another key attribute (despite lacking a pg_attribute + * entry). + * + * Use enlarged space that holds a copy of pivot. We need the extra space + * to store a heap TID at the end (using the special pivot tuple + * representation). Note that the original pivot already has firstright's + * possible posting list/non-key attribute values removed at this point. + */ + newsize = MAXALIGN(IndexTupleSize(pivot)) + MAXALIGN(sizeof(ItemPointerData)); + tidpivot = palloc0(newsize); + memcpy(tidpivot, pivot, MAXALIGN(IndexTupleSize(pivot))); + /* Cannot leak memory here */ + pfree(pivot); + + /* + * Store all of firstright's key attribute values plus a tiebreaker heap + * TID value in enlarged pivot tuple + */ + tidpivot->t_info &= ~INDEX_SIZE_MASK; + tidpivot->t_info |= newsize; + BTreeTupleSetNAtts(tidpivot, nkeyatts, true); + pivotheaptid = BTreeTupleGetHeapTID(tidpivot); + + /* + * Lehman & Yao use lastleft as the leaf high key in all cases, but don't + * consider suffix truncation. It seems like a good idea to follow that + * example in cases where no truncation takes place -- use lastleft's heap + * TID. (This is also the closest value to negative infinity that's + * legally usable.) + */ + ItemPointerCopy(BTreeTupleGetMaxHeapTID(lastleft), pivotheaptid); + + /* + * We're done. Assert() that heap TID invariants hold before returning. + * + * Lehman and Yao require that the downlink to the right page, which is to + * be inserted into the parent page in the second phase of a page split be + * a strict lower bound on items on the right page, and a non-strict upper + * bound for items on the left page. 
Assert that heap TIDs follow these + * invariants, since a heap TID value is apparently needed as a + * tiebreaker. + */ +#ifndef DEBUG_NO_TRUNCATE + Assert(ItemPointerCompare(BTreeTupleGetMaxHeapTID(lastleft), + BTreeTupleGetHeapTID(firstright)) < 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(lastleft)) >= 0); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); +#else + + /* + * Those invariants aren't guaranteed to hold for lastleft + firstright + * heap TID attribute values when they're considered here only because + * DEBUG_NO_TRUNCATE is defined (a heap TID is probably not actually + * needed as a tiebreaker). DEBUG_NO_TRUNCATE must therefore use a heap + * TID value that always works as a strict lower bound for items to the + * right. In particular, it must avoid using firstright's leading key + * attribute values along with lastleft's heap TID value when lastleft's + * TID happens to be greater than firstright's TID. + */ + ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid); + + /* + * Pivot heap TID should never be fully equal to firstright. Note that + * the pivot heap TID will still end up equal to lastleft's heap TID when + * that's the only usable value. + */ + ItemPointerSetOffsetNumber(pivotheaptid, + OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid))); + Assert(ItemPointerCompare(pivotheaptid, + BTreeTupleGetHeapTID(firstright)) < 0); +#endif + + return tidpivot; +} + +/* + * _bt_keep_natts - how many key attributes to keep when truncating. + * + * Caller provides two tuples that enclose a split point. Caller's insertion + * scankey is used to compare the tuples; the scankey's argument values are + * not considered here. + * + * This can return a number of attributes that is one greater than the + * number of key attributes for the index relation. This indicates that the + * caller must use a heap TID as a unique-ifier in new pivot tuple. + */ +static int +_bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright, + BTScanInsert itup_key) +{ + int nkeyatts = IndexRelationGetNumberOfKeyAttributes(rel); + TupleDesc itupdesc = RelationGetDescr(rel); + int keepnatts; + ScanKey scankey; + + /* + * _bt_compare() treats truncated key attributes as having the value minus + * infinity, which would break searches within !heapkeyspace indexes. We + * must still truncate away non-key attribute values, though. + */ + if (!itup_key->heapkeyspace) + return nkeyatts; + + scankey = itup_key->scankeys; + keepnatts = 1; + for (int attnum = 1; attnum <= nkeyatts; attnum++, scankey++) + { + Datum datum1, + datum2; + bool isNull1, + isNull2; + + datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); + datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + + if (isNull1 != isNull2) + break; + + if (!isNull1 && + DatumGetInt32(FunctionCall2Coll(&scankey->sk_func, + scankey->sk_collation, + datum1, + datum2)) != 0) + break; + + keepnatts++; + } + + /* + * Assert that _bt_keep_natts_fast() agrees with us in passing. This is + * expected in an allequalimage index. + */ + Assert(!itup_key->allequalimage || + keepnatts == _bt_keep_natts_fast(rel, lastleft, firstright)); + + return keepnatts; +} + +/* + * _bt_keep_natts_fast - fast bitwise variant of _bt_keep_natts. + * + * This is exported so that a candidate split point can have its effect on + * suffix truncation inexpensively evaluated ahead of time when finding a + * split location. 
A naive bitwise approach to datum comparisons is used to + * save cycles. + * + * The approach taken here usually provides the same answer as _bt_keep_natts + * will (for the same pair of tuples from a heapkeyspace index), since the + * majority of btree opclasses can never indicate that two datums are equal + * unless they're bitwise equal after detoasting. When an index only has + * "equal image" columns, routine is guaranteed to give the same result as + * _bt_keep_natts would. + * + * Callers can rely on the fact that attributes considered equal here are + * definitely also equal according to _bt_keep_natts, even when the index uses + * an opclass or collation that is not "allequalimage"/deduplication-safe. + * This weaker guarantee is good enough for nbtsplitloc.c caller, since false + * negatives generally only have the effect of making leaf page splits use a + * more balanced split point. + */ +int +_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright) +{ + TupleDesc itupdesc = RelationGetDescr(rel); + int keysz = IndexRelationGetNumberOfKeyAttributes(rel); + int keepnatts; + + keepnatts = 1; + for (int attnum = 1; attnum <= keysz; attnum++) + { + Datum datum1, + datum2; + bool isNull1, + isNull2; + Form_pg_attribute att; + + datum1 = index_getattr(lastleft, attnum, itupdesc, &isNull1); + datum2 = index_getattr(firstright, attnum, itupdesc, &isNull2); + att = TupleDescAttr(itupdesc, attnum - 1); + + if (isNull1 != isNull2) + break; + + if (!isNull1 && + !datum_image_eq(datum1, datum2, att->attbyval, att->attlen)) + break; + + keepnatts++; + } + + return keepnatts; +} diff --git a/src/backend/utils/sort/tuplesortvariants.c b/src/backend/utils/sort/tuplesortvariants.c index 84442a93c5..d93839620d 100644 --- a/src/backend/utils/sort/tuplesortvariants.c +++ b/src/backend/utils/sort/tuplesortvariants.c @@ -61,10 +61,6 @@ static void writetup_cluster(Tuplesortstate *state, LogicalTape *tape, SortTuple *stup); static void readtup_cluster(Tuplesortstate *state, SortTuple *stup, LogicalTape *tape, unsigned int tuplen); -static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); -static int comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state); static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static int comparetup_index_hash_tiebreak(const SortTuple *a, const SortTuple *b, @@ -140,6 +136,9 @@ typedef struct int datumTypeLen; } TuplesortDatumArg; +#define NBT_SPECIALIZE_FILE "../../backend/utils/sort/tuplesortvariants_spec.c" +#include "access/nbtree_spec.h" + Tuplesortstate * tuplesort_begin_heap(TupleDesc tupDesc, int nkeys, AttrNumber *attNums, @@ -228,6 +227,7 @@ tuplesort_begin_cluster(TupleDesc tupDesc, MemoryContext oldcontext; TuplesortClusterArg *arg; int i; + nbts_prep_ctx(indexRel); Assert(indexRel->rd_rel->relam == BTREE_AM_OID); @@ -340,6 +340,7 @@ tuplesort_begin_index_btree(Relation heapRel, TuplesortIndexBTreeArg *arg; MemoryContext oldcontext; int i; + nbts_prep_ctx(indexRel); oldcontext = MemoryContextSwitchTo(base->maincontext); arg = (TuplesortIndexBTreeArg *) palloc(sizeof(TuplesortIndexBTreeArg)); @@ -475,6 +476,7 @@ tuplesort_begin_index_gist(Relation heapRel, MemoryContext oldcontext; TuplesortIndexBTreeArg *arg; int i; + nbts_prep_ctx(indexRel); oldcontext = MemoryContextSwitchTo(base->maincontext); arg = (TuplesortIndexBTreeArg *) palloc(sizeof(TuplesortIndexBTreeArg)); @@ -1299,152 +1301,6 @@ 
removeabbrev_index(Tuplesortstate *state, SortTuple *stups, int count) } } -static int -comparetup_index_btree(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state) -{ - /* - * This is similar to comparetup_heap(), but expects index tuples. There - * is also special handling for enforcing uniqueness, and special - * treatment for equal keys at the end. - */ - TuplesortPublic *base = TuplesortstateGetPublic(state); - SortSupport sortKey = base->sortKeys; - int32 compare; - - /* Compare the leading sort key */ - compare = ApplySortComparator(a->datum1, a->isnull1, - b->datum1, b->isnull1, - sortKey); - if (compare != 0) - return compare; - - /* Compare additional sort keys */ - return comparetup_index_btree_tiebreak(a, b, state); -} - -static int -comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, - Tuplesortstate *state) -{ - TuplesortPublic *base = TuplesortstateGetPublic(state); - TuplesortIndexBTreeArg *arg = (TuplesortIndexBTreeArg *) base->arg; - SortSupport sortKey = base->sortKeys; - IndexTuple tuple1; - IndexTuple tuple2; - int keysz; - TupleDesc tupDes; - bool equal_hasnull = false; - int nkey; - int32 compare; - Datum datum1, - datum2; - bool isnull1, - isnull2; - - tuple1 = (IndexTuple) a->tuple; - tuple2 = (IndexTuple) b->tuple; - keysz = base->nKeys; - tupDes = RelationGetDescr(arg->index.indexRel); - - if (sortKey->abbrev_converter) - { - datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); - datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); - - compare = ApplySortAbbrevFullComparator(datum1, isnull1, - datum2, isnull2, - sortKey); - if (compare != 0) - return compare; - } - - /* they are equal, so we only need to examine one null flag */ - if (a->isnull1) - equal_hasnull = true; - - sortKey++; - for (nkey = 2; nkey <= keysz; nkey++, sortKey++) - { - datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); - datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); - - compare = ApplySortComparator(datum1, isnull1, - datum2, isnull2, - sortKey); - if (compare != 0) - return compare; /* done when we find unequal attributes */ - - /* they are equal, so we only need to examine one null flag */ - if (isnull1) - equal_hasnull = true; - } - - /* - * If btree has asked us to enforce uniqueness, complain if two equal - * tuples are detected (unless there was at least one NULL field and NULLS - * NOT DISTINCT was not set). - * - * It is sufficient to make the test here, because if two tuples are equal - * they *must* get compared at some stage of the sort --- otherwise the - * sort algorithm wouldn't have checked whether one must appear before the - * other. - */ - if (arg->enforceUnique && !(!arg->uniqueNullsNotDistinct && equal_hasnull)) - { - Datum values[INDEX_MAX_KEYS]; - bool isnull[INDEX_MAX_KEYS]; - char *key_desc; - - /* - * Some rather brain-dead implementations of qsort (such as the one in - * QNX 4) will sometimes call the comparison routine to compare a - * value to itself, but we always use our own implementation, which - * does not. - */ - Assert(tuple1 != tuple2); - - index_deform_tuple(tuple1, tupDes, values, isnull); - - key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); - - ereport(ERROR, - (errcode(ERRCODE_UNIQUE_VIOLATION), - errmsg("could not create unique index \"%s\"", - RelationGetRelationName(arg->index.indexRel)), - key_desc ? 
errdetail("Key %s is duplicated.", key_desc) : - errdetail("Duplicate keys exist."), - errtableconstraint(arg->index.heapRel, - RelationGetRelationName(arg->index.indexRel)))); - } - - /* - * If key values are equal, we sort on ItemPointer. This is required for - * btree indexes, since heap TID is treated as an implicit last key - * attribute in order to ensure that all keys in the index are physically - * unique. - */ - { - BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); - BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); - - if (blk1 != blk2) - return (blk1 < blk2) ? -1 : 1; - } - { - OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); - OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); - - if (pos1 != pos2) - return (pos1 < pos2) ? -1 : 1; - } - - /* ItemPointer values should never be equal */ - Assert(false); - - return 0; -} - static int comparetup_index_hash(const SortTuple *a, const SortTuple *b, Tuplesortstate *state) diff --git a/src/backend/utils/sort/tuplesortvariants_spec.c b/src/backend/utils/sort/tuplesortvariants_spec.c new file mode 100644 index 0000000000..705da09329 --- /dev/null +++ b/src/backend/utils/sort/tuplesortvariants_spec.c @@ -0,0 +1,175 @@ +/*------------------------------------------------------------------------- + * + * tuplesortvariants_spec.c + * Index shape-specialized functions for tuplesortvariants.c + * + * NOTES + * See also: access/nbtree/README section "nbtree specialization" + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/access/nbtree/tuplesortvariants_spec.c + * + *------------------------------------------------------------------------- + */ + +#define comparetup_index_btree NBTS_FUNCTION(comparetup_index_btree) +#define comparetup_index_btree_tiebreak NBTS_FUNCTION(comparetup_index_btree_tiebreak) + +static int comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); +static int comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state); + +static int +comparetup_index_btree(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ + /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. + */ + TuplesortPublic *base = TuplesortstateGetPublic(state); + SortSupport sortKey = base->sortKeys; + int32 compare; + + /* Compare the leading sort key */ + compare = ApplySortComparator(a->datum1, a->isnull1, + b->datum1, b->isnull1, + sortKey); + if (compare != 0) + return compare; + + /* Compare additional sort keys */ + return comparetup_index_btree_tiebreak(a, b, state); +} + +static int +comparetup_index_btree_tiebreak(const SortTuple *a, const SortTuple *b, + Tuplesortstate *state) +{ /* + * This is similar to comparetup_heap(), but expects index tuples. There + * is also special handling for enforcing uniqueness, and special + * treatment for equal keys at the end. 
+ */ + TuplesortPublic *base = TuplesortstateGetPublic(state); + TuplesortIndexBTreeArg *arg = (TuplesortIndexBTreeArg *) base->arg; + SortSupport sortKey = base->sortKeys; + IndexTuple tuple1; + IndexTuple tuple2; + int keysz; + TupleDesc tupDes; + bool equal_hasnull = false; + int nkey; + int32 compare; + Datum datum1, + datum2; + bool isnull1, + isnull2; + + tuple1 = (IndexTuple) a->tuple; + tuple2 = (IndexTuple) b->tuple; + keysz = base->nKeys; + tupDes = RelationGetDescr(arg->index.indexRel); + + if (sortKey->abbrev_converter) + { + datum1 = index_getattr(tuple1, 1, tupDes, &isnull1); + datum2 = index_getattr(tuple2, 1, tupDes, &isnull2); + + compare = ApplySortAbbrevFullComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; + } + + /* they are equal, so we only need to examine one null flag */ + if (a->isnull1) + equal_hasnull = true; + + sortKey++; + for (nkey = 2; nkey <= keysz; nkey++, sortKey++) + { + datum1 = index_getattr(tuple1, nkey, tupDes, &isnull1); + datum2 = index_getattr(tuple2, nkey, tupDes, &isnull2); + + compare = ApplySortComparator(datum1, isnull1, + datum2, isnull2, + sortKey); + if (compare != 0) + return compare; /* done when we find unequal attributes */ + + /* they are equal, so we only need to examine one null flag */ + if (isnull1) + equal_hasnull = true; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field and NULLS + * NOT DISTINCT was not set). + * + * It is sufficient to make the test here, because if two tuples are equal + * they *must* get compared at some stage of the sort --- otherwise the + * sort algorithm wouldn't have checked whether one must appear before the + * other. + */ + if (arg->enforceUnique && !(!arg->uniqueNullsNotDistinct && equal_hasnull)) + { + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + char *key_desc; + + /* + * Some rather brain-dead implementations of qsort (such as the one in + * QNX 4) will sometimes call the comparison routine to compare a + * value to itself, but we always use our own implementation, which + * does not. + */ + Assert(tuple1 != tuple2); + + index_deform_tuple(tuple1, tupDes, values, isnull); + + key_desc = BuildIndexValueDescription(arg->index.indexRel, values, isnull); + + ereport(ERROR, + (errcode(ERRCODE_UNIQUE_VIOLATION), + errmsg("could not create unique index \"%s\"", + RelationGetRelationName(arg->index.indexRel)), + key_desc ? errdetail("Key %s is duplicated.", key_desc) : + errdetail("Duplicate keys exist."), + errtableconstraint(arg->index.heapRel, + RelationGetRelationName(arg->index.indexRel)))); + } + + /* + * If key values are equal, we sort on ItemPointer. This is required for + * btree indexes, since heap TID is treated as an implicit last key + * attribute in order to ensure that all keys in the index are physically + * unique. + */ + { + BlockNumber blk1 = ItemPointerGetBlockNumber(&tuple1->t_tid); + BlockNumber blk2 = ItemPointerGetBlockNumber(&tuple2->t_tid); + + if (blk1 != blk2) + return (blk1 < blk2) ? -1 : 1; + } + { + OffsetNumber pos1 = ItemPointerGetOffsetNumber(&tuple1->t_tid); + OffsetNumber pos2 = ItemPointerGetOffsetNumber(&tuple2->t_tid); + + if (pos1 != pos2) + return (pos1 < pos2) ? 
-1 : 1; + } + + /* ItemPointer values should never be equal */ + Assert(false); + + return 0; +} diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 11f4184107..d1bbc4d2a8 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -1121,15 +1121,27 @@ typedef struct BTOptions #define PROGRESS_BTREE_PHASE_PERFORMSORT_2 4 #define PROGRESS_BTREE_PHASE_LEAF_LOAD 5 +typedef enum NBTS_CTX { + NBTS_CTX_CACHED, + NBTS_CTX_DEFAULT, /* fallback */ +} NBTS_CTX; + +static inline NBTS_CTX _nbt_spec_context(Relation irel) +{ + if (!PointerIsValid(irel)) + return NBTS_CTX_DEFAULT; + + return NBTS_CTX_CACHED; +} + + +#define NBT_SPECIALIZE_FILE "access/nbtree_specfuncs.h" +#include "nbtree_spec.h" + /* * external entry points for btree, in nbtree.c */ extern void btbuildempty(Relation index); -extern bool btinsert(Relation rel, Datum *values, bool *isnull, - ItemPointer ht_ctid, Relation heapRel, - IndexUniqueCheck checkUnique, - bool indexUnchanged, - struct IndexInfo *indexInfo); extern IndexScanDesc btbeginscan(Relation rel, int nkeys, int norderbys); extern Size btestimateparallelscan(void); extern void btinitparallelscan(void *target); @@ -1160,8 +1172,6 @@ extern void _bt_parallel_advance_array_keys(IndexScanDesc scan); /* * prototypes for functions in nbtdedup.c */ -extern void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, - Size newitemsz, bool bottomupdedup); extern bool _bt_bottomupdel_pass(Relation rel, Buffer buf, Relation heapRel, Size newitemsz); extern void _bt_dedup_start_pending(BTDedupState state, IndexTuple base, @@ -1177,9 +1187,6 @@ extern IndexTuple _bt_swap_posting(IndexTuple newitem, IndexTuple oposting, /* * prototypes for functions in nbtinsert.c */ -extern bool _bt_doinsert(Relation rel, IndexTuple itup, - IndexUniqueCheck checkUnique, bool indexUnchanged, - Relation heapRel); extern void _bt_finish_split(Relation rel, Relation heaprel, Buffer lbuf, BTStack stack); extern Buffer _bt_getstackbuf(Relation rel, Relation heaprel, BTStack stack, @@ -1230,16 +1237,6 @@ extern void _bt_pendingfsm_finalize(Relation rel, BTVacState *vstate); /* * prototypes for functions in nbtsearch.c */ -extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, - Buffer *bufP, int access, Snapshot snapshot); -extern Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, - Buffer buf, bool forupdate, BTStack stack, - int access, Snapshot snapshot, - AttrNumber *comparecol, char *tupdatabuf); -extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate, - AttrNumber highcmpcol); -extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, - OffsetNumber offnum, AttrNumber *comparecol); extern bool _bt_first(IndexScanDesc scan, ScanDirection dir); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir); extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, @@ -1248,7 +1245,6 @@ extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost, /* * prototypes for functions in nbtutils.c */ -extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); extern void _bt_freestack(BTStack stack); extern void _bt_preprocess_array_keys(IndexScanDesc scan); extern void _bt_start_array_keys(IndexScanDesc scan, ScanDirection dir); @@ -1256,8 +1252,6 @@ extern bool _bt_advance_array_keys(IndexScanDesc scan, ScanDirection dir); extern void _bt_mark_array_keys(IndexScanDesc scan); extern void _bt_restore_array_keys(IndexScanDesc scan); extern void 
_bt_preprocess_keys(IndexScanDesc scan); -extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, - int tupnatts, ScanDirection dir, bool *continuescan); extern void _bt_killitems(IndexScanDesc scan); extern BTCycleId _bt_vacuum_cycleid(Relation rel); extern BTCycleId _bt_start_vacuum(Relation rel); @@ -1270,10 +1264,6 @@ extern bool btproperty(Oid index_oid, int attno, IndexAMProperty prop, const char *propname, bool *res, bool *isnull); extern char *btbuildphasename(int64 phasenum); -extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, - IndexTuple firstright, BTScanInsert itup_key); -extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, - IndexTuple firstright); extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum); extern void _bt_check_third_page(Relation rel, Relation heap, diff --git a/src/include/access/nbtree_spec.h b/src/include/access/nbtree_spec.h new file mode 100644 index 0000000000..fa38b09c6e --- /dev/null +++ b/src/include/access/nbtree_spec.h @@ -0,0 +1,183 @@ +/*------------------------------------------------------------------------- + * + * nbtree_spec.h + * template header for specializing nbtree functions on btree key shape. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/nbtree_spec.h + * + *------------------------------------------------------------------------- + * + * Specialize key-accessing functions and the hot code around those. + * + * Key attribute iteration is specialized through the use of the following + * macros: + * + * - nbts_attiterdeclare(itup) + * Declare the variables required to iterate over the provided IndexTuple's + * key attributes. Many tuples may have their attributes iterated over at the + * same time. + * - nbts_attiterinit(itup, initAttNum, tupDesc) + * Initialize the attribute iterator for the provided IndexTuple at + * the provided AttributeNumber. + * - nbts_foreachattr(initAttNum, endAttNum) + * Start a loop over the attributes, starting at initAttNum and ending at + * endAttNum, inclusive. It also takes care of truncated attributes. + * - nbts_attiter_attnum + * The current attribute number. + * - nbts_attiter_nextattdatum(itup, tupDesc) + * Updates the attribute iterator state to the next attribute. Returns the + * datum of the next attribute, which might be null (see below). + * - nbts_attiter_curattisnull(itup) + * Returns whether the result from the last nbts_attiter_nextattdatum is + * null. + * - nbts_prep_ctx(rel) + * Constructs a context that is used to call specialized functions. + * Note that this is not needed in code that is itself included through + * nbtree_spec.h (the templated code), because that code always calls the + * specialized functions directly. + */ + +/* + * Macros used in the nbtree specialization code. + */ +#define NBTS_TYPE_CACHED cached +#define NBTS_TYPE_DEFAULT default +#define NBTS_CTX_NAME __nbts_ctx + +/* contextual specializations */ +#define NBTS_MAKE_CTX(rel) const NBTS_CTX NBTS_CTX_NAME = _nbt_spec_context(rel) +#define NBTS_SPECIALIZE_NAME(name) ( \ + (NBTS_CTX_NAME) == NBTS_CTX_CACHED ? (NBTS_MAKE_NAME(name, NBTS_TYPE_CACHED)) : ( \ + NBTS_MAKE_NAME(name, NBTS_TYPE_DEFAULT) \ + ) \ +) + +/* how do we make names? */
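+/*
+ * For illustration only (using _bt_compare, one of the functions this patch
+ * specializes), the name-mangling macros below expand like this:
+ *
+ *   NBTS_MAKE_NAME(_bt_compare, NBTS_TYPE_CACHED)  -> _bt_compare_cached
+ *   NBTS_MAKE_NAME(_bt_compare, NBTS_TYPE_DEFAULT) -> _bt_compare_default
+ *
+ * and NBTS_SPECIALIZE_NAME(_bt_compare) above expands to a conditional
+ * expression that evaluates to _bt_compare_cached when __nbts_ctx is
+ * NBTS_CTX_CACHED, and to _bt_compare_default otherwise.
+ */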
+#define NBTS_MAKE_PREFIX(a) CppConcat(a,_) +#define NBTS_MAKE_NAME_(a,b) CppConcat(a,b) +#define NBTS_MAKE_NAME(a,b) NBTS_MAKE_NAME_(NBTS_MAKE_PREFIX(a),b) + +#define nbt_opt_specialize(rel) \ +do { \ + Assert(PointerIsValid(rel)); \ + if (unlikely((rel)->rd_indam->aminsert == btinsert_default)) \ + { \ + nbts_prep_ctx(rel); \ + _bt_specialize(rel); \ + } \ +} while (false) + +/* + * Protection against multiple inclusions: the definitions of these macros + * differ for files included through the templating mechanism vs. the users + * of this template, so redefine them at the top and at the bottom. + */ +#ifdef NBTS_FUNCTION +#undef NBTS_FUNCTION +#endif +#define NBTS_FUNCTION(name) NBTS_MAKE_NAME(name, NBTS_TYPE) + +/* While specializing, the context is the local context */ +#ifdef nbts_prep_ctx +#undef nbts_prep_ctx +#endif +#define nbts_prep_ctx(rel) + +/* + * Specialization 1: CACHED + * + * Multiple key columns, with optimized access for attcacheoff-cacheable offsets. + */ +#define NBTS_SPECIALIZING_CACHED +#define NBTS_TYPE NBTS_TYPE_CACHED + +#define nbts_attiterdeclare(itup) \ + bool NBTS_MAKE_NAME(itup, isNull) + +#define nbts_attiterinit(itup, initAttNum, tupDesc) do {} while (false) + +#define nbts_foreachattr(initAttNum, endAttNum) \ + for (int spec_i = (initAttNum); spec_i <= (endAttNum); spec_i++) + +#define nbts_attiter_attnum spec_i + +#define nbts_attiter_nextattdatum(itup, tupDesc) \ + index_getattr((itup), spec_i, (tupDesc), &(NBTS_MAKE_NAME(itup, isNull))) + +#define nbts_attiter_curattisnull(itup) \ + NBTS_MAKE_NAME(itup, isNull) + +#include NBT_SPECIALIZE_FILE + +#undef NBTS_SPECIALIZING_CACHED +#undef NBTS_TYPE +#undef nbts_attiterdeclare +#undef nbts_attiterinit +#undef nbts_foreachattr +#undef nbts_attiter_attnum +#undef nbts_attiter_nextattdatum +#undef nbts_attiter_curattisnull + +/* + * Specialization 2: DEFAULT + * + * "Default": externally accessible, less optimized functions. + */ + +/* Only the default specialization may need to set up a specialization context, so provide one here */ +#undef nbts_prep_ctx +#define nbts_prep_ctx(rel) NBTS_MAKE_CTX(rel) + +#define NBTS_SPECIALIZING_DEFAULT +#define NBTS_TYPE NBTS_TYPE_DEFAULT + +#define nbts_attiterdeclare(itup) \ + bool NBTS_MAKE_NAME(itup, isNull) + +#define nbts_attiterinit(itup, initAttNum, tupDesc) + +#define nbts_foreachattr(initAttNum, endAttNum) \ + for (int spec_i = (initAttNum); spec_i <= (endAttNum); spec_i++) + +#define nbts_attiter_attnum spec_i + +#define nbts_attiter_nextattdatum(itup, tupDesc) \ + index_getattr((itup), spec_i, (tupDesc), &(NBTS_MAKE_NAME(itup, isNull))) + +#define nbts_attiter_curattisnull(itup) \ + NBTS_MAKE_NAME(itup, isNull) + +#include NBT_SPECIALIZE_FILE + +#undef NBTS_TYPE +#undef NBTS_SPECIALIZING_DEFAULT + +/* un-define the optimization macros */ +#undef nbts_attiterdeclare +#undef nbts_attiterinit +#undef nbts_foreachattr +#undef nbts_attiter_attnum +#undef nbts_attiter_nextattdatum +#undef nbts_attiter_curattisnull + +/* + * All subsequent uses of nbts_prep_ctx are in non-templated code, so from + * here on we make sure we actually create the context. + */ +#undef nbts_prep_ctx +#define nbts_prep_ctx(rel) NBTS_MAKE_CTX(rel) + +/* + * From here on, all uses of NBTS_FUNCTION refer to specialized function + * names at call sites. Change the result of that macro from a direct call + * into a conditional call to the right specialization, depending on the + * established context. + */
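+/*
+ * For example (an illustrative sketch, not a call site taken from this
+ * patch), unspecialized code that does
+ *
+ *   nbts_prep_ctx(rel);
+ *   cmp = _bt_compare(rel, key, page, offnum, &cmpcol);
+ *
+ * resolves, through the _bt_compare define in NBT_SPECIALIZE_FILE and the
+ * NBTS_FUNCTION / NBTS_SPECIALIZE_NAME macros, to a call of either
+ * _bt_compare_cached() or _bt_compare_default(), depending on the context
+ * that nbts_prep_ctx() established for rel.
+ */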
+#undef NBTS_FUNCTION +#define NBTS_FUNCTION(name) NBTS_SPECIALIZE_NAME(name) + +#undef NBT_SPECIALIZE_FILE diff --git a/src/include/access/nbtree_specfuncs.h b/src/include/access/nbtree_specfuncs.h new file mode 100644 index 0000000000..b87f5bf802 --- /dev/null +++ b/src/include/access/nbtree_specfuncs.h @@ -0,0 +1,65 @@ +/* + * prototypes for specialized functions; this file is included from nbtree.h + * by way of nbtree_spec.h + */ + +#define _bt_specialize NBTS_FUNCTION(_bt_specialize) +#define btinsert NBTS_FUNCTION(btinsert) +#define _bt_dedup_pass NBTS_FUNCTION(_bt_dedup_pass) +#define _bt_doinsert NBTS_FUNCTION(_bt_doinsert) +#define _bt_search NBTS_FUNCTION(_bt_search) +#define _bt_moveright NBTS_FUNCTION(_bt_moveright) +#define _bt_binsrch_insert NBTS_FUNCTION(_bt_binsrch_insert) +#define _bt_compare NBTS_FUNCTION(_bt_compare) +#define _bt_mkscankey NBTS_FUNCTION(_bt_mkscankey) +#define _bt_checkkeys NBTS_FUNCTION(_bt_checkkeys) +#define _bt_truncate NBTS_FUNCTION(_bt_truncate) +#define _bt_keep_natts_fast NBTS_FUNCTION(_bt_keep_natts_fast) + +/* + * prototypes for functions in nbtree_spec.c + */ +extern void _bt_specialize(Relation rel); + +extern bool btinsert(Relation rel, Datum *values, bool *isnull, + ItemPointer ht_ctid, Relation heapRel, + IndexUniqueCheck checkUnique, bool indexUnchanged, + struct IndexInfo *indexInfo); + +/* + * prototypes for functions in nbtdedup_spec.c + */ +extern void _bt_dedup_pass(Relation rel, Buffer buf, IndexTuple newitem, + Size newitemsz, bool bottomupdedup); + + +/* + * prototypes for functions in nbtinsert_spec.c + */ + +extern bool _bt_doinsert(Relation rel, IndexTuple itup, + IndexUniqueCheck checkUnique, bool indexUnchanged, + Relation heapRel); + +/* + * prototypes for functions in nbtsearch_spec.c + */ +extern BTStack _bt_search(Relation rel, Relation heaprel, BTScanInsert key, + Buffer *bufP, int access, Snapshot snapshot); +extern Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key, + Buffer buf, bool forupdate, BTStack stack, + int access, Snapshot snapshot, + AttrNumber *comparecol, char *tupdatabuf); +extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate, + AttrNumber highcmpcol); +extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, + OffsetNumber offnum, AttrNumber *comparecol); + +/* + * prototypes for functions in nbtutils_spec.c + */ +extern BTScanInsert _bt_mkscankey(Relation rel, IndexTuple itup); +extern bool _bt_checkkeys(IndexScanDesc scan, IndexTuple tuple, int tupnatts, + ScanDirection dir, bool *continuescan); +extern IndexTuple _bt_truncate(Relation rel, IndexTuple lastleft, + IndexTuple firstright, BTScanInsert itup_key); +extern int _bt_keep_natts_fast(Relation rel, IndexTuple lastleft, + IndexTuple firstright); diff --git a/src/tools/pginclude/cpluspluscheck b/src/tools/pginclude/cpluspluscheck index 4e09c4686b..e504a2f114 100755 --- a/src/tools/pginclude/cpluspluscheck +++ b/src/tools/pginclude/cpluspluscheck @@ -116,6 +116,8 @@ do test "$f" = src/pl/tcl/pltclerrcodes.h && continue # Also not meant to be included standalone.
+ test "$f" = src/include/access/nbtree_spec.h && continue + test "$f" = src/include/access/nbtree_specfuncs.h && continue test "$f" = src/include/common/unicode_nonspacing_table.h && continue test "$f" = src/include/common/unicode_east_asian_fw_table.h && continue diff --git a/src/tools/pginclude/headerscheck b/src/tools/pginclude/headerscheck index 8dee1b5670..101888c806 100755 --- a/src/tools/pginclude/headerscheck +++ b/src/tools/pginclude/headerscheck @@ -111,6 +111,8 @@ do test "$f" = src/pl/tcl/pltclerrcodes.h && continue # Also not meant to be included standalone. + test "$f" = src/include/access/nbtree_spec.h && continue + test "$f" = src/include/access/nbtree_specfuncs.h && continue test "$f" = src/include/common/unicode_nonspacing_table.h && continue test "$f" = src/include/common/unicode_east_asian_fw_table.h && continue -- 2.40.1