diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 60f005c..87cd9ea 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -120,7 +120,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
 	Form_pg_attribute attr;
 
 	/* Initialize empty bitmap result */
-	scanEntry->matchBitmap = tbm_create(work_mem * 1024L);
+	scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
 
 	/* Null query cannot partial-match anything */
 	if (scanEntry->isPartialMatch &&
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 4274e9a..94bb289 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -78,7 +78,7 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
 	else
 	{
 		/* XXX should we use less than work_mem for this? */
-		tbm = tbm_create(work_mem * 1024L);
+		tbm = tbm_create(work_mem * 1024L, NULL);
 	}
 
 	/*
diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c
index 3c6155b..1d280be 100644
--- a/src/backend/executor/nodeBitmapOr.c
+++ b/src/backend/executor/nodeBitmapOr.c
@@ -129,7 +129,7 @@ MultiExecBitmapOr(BitmapOrState *node)
 			if (result == NULL) /* first subplan */
 			{
 				/* XXX should we use less than work_mem for this? */
-				result = tbm_create(work_mem * 1024L);
+				result = tbm_create(work_mem * 1024L, NULL);
 			}
 
 			((BitmapIndexScanState *) subnode)->biss_result = result;
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index 0885812..488c641 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -43,6 +43,9 @@
 #include "access/htup_details.h"
 #include "nodes/bitmapset.h"
 #include "nodes/tidbitmap.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+#include "utils/relptr.h"
 
 /*
  * The maximum number of tuples per page is not large (typically 256 with
@@ -102,6 +105,9 @@ typedef struct PagetableEntry
 	bitmapword	words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
 } PagetableEntry;
 
+/* Relative pointer of PagetableEntry to share across workers. */
+relptr_declare(PagetableEntry, RelptrPagetableEntry);
+
 /*
  * We want to avoid the overhead of creating the hashtable, which is
 * comparatively large, when not necessary.  Particularly when we are using a
@@ -139,6 +145,13 @@ struct TIDBitmap
 	/* these are valid when iterating is true: */
 	PagetableEntry **spages;	/* sorted exact-page list, or NULL */
 	PagetableEntry **schunks;	/* sorted lossy-chunk list, or NULL */
+	dsa_pointer dsa_data;		/* dsa_pointer to the element array */
+	dsa_area   *dsa;			/* reference to per-query dsa area */
+	char	   *base;			/* pointer to the element array */
+	dsa_pointer dsapages;		/* dsa_pointer to the page array */
+	dsa_pointer dsachunks;		/* dsa_pointer to the chunk array */
+	RelptrPagetableEntry *relpages;		/* page array of relptrs */
+	RelptrPagetableEntry *relchunks;	/* chunk array of relptrs */
 };
 
 /*
@@ -156,6 +169,45 @@ struct TBMIterator
 	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
 };
 
+/* Shared TBM information, to be shared across multiple workers */
+typedef struct TBMSharedInfo
+{
+	dsa_pointer dsa_data;		/* dsa pointer to head of the pagetable data */
+	dsa_pointer spages;			/* dsa pointer to page array */
+	dsa_pointer schunks;		/* dsa pointer to chunk array */
+	PagetableEntry entry1;		/* used when status == TBM_ONE_PAGE */
+	TBMStatus	status;			/* status of TIDBitmap */
+	int			nentries;		/* number of entries in pagetable */
+	int			maxentries;		/* limit on same to meet maxbytes */
+	int			npages;			/* number of exact entries in pagetable */
+	int			nchunks;		/* number of lossy entries in pagetable */
+} TBMSharedInfo;
+
+/*
+ * This stores the shared state of a TBMSharedIterator so that multiple
+ * workers can operate on the same iteration state.  It also embeds a
+ * TBMSharedInfo, which shares the relptrs of the page and chunk arrays and
+ * other TBM-related information with the other workers.
+ */
+typedef struct TBMSharedIteratorState
+{
+	int			spageptr;		/* next spages index */
+	int			schunkptr;		/* next schunks index */
+	int			schunkbit;		/* next bit to check in current schunk */
+	LWLock		lock;			/* lock to protect above members */
+	TBMSharedInfo tbminfo;		/* TBM info to be shared across workers */
+} TBMSharedIteratorState;
+
+/*
+ * Same as TBMIterator, except that it holds a reference to the shared-memory
+ * iterator state so that multiple workers can operate on the same state.
+ */
+struct TBMSharedIterator
+{
+	TIDBitmap  *tbm;			/* TIDBitmap we're iterating over */
+	TBMSharedIteratorState *state;		/* shared state living in DSA */
+	TBMIterateResult output;	/* MUST BE LAST (because variable-size) */
+};
+
 /* Local function prototypes */
 static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
@@ -168,6 +220,8 @@ static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
 static void tbm_lossify(TIDBitmap *tbm);
 static int	tbm_comparator(const void *left, const void *right);
+static int	tbm_shared_comparator(const void *left, const void *right,
+					  void *arg);
 
 /*
  * Simple inline murmur hash implementation for the exact width required, for
@@ -187,6 +241,7 @@ hash_blockno(BlockNumber b)
 }
 
 /* define hashtable mapping block numbers to PagetableEntry's */
+#define SH_USE_NONDEFAULT_ALLOCATOR
 #define SH_PREFIX pagetable
 #define SH_ELEMENT_TYPE PagetableEntry
 #define SH_KEY_TYPE BlockNumber
@@ -204,10 +259,12 @@ hash_blockno(BlockNumber b)
  *
  * The bitmap will live in the memory context that is CurrentMemoryContext
  * at the time of this call.  It will be limited to (approximately) maxbytes
- * total memory consumption.
+ * total memory consumption.  If the dsa passed to this function is not NULL,
+ * then the memory for storing the elements of the underlying page table will
+ * be allocated from that DSA area.
  */
 TIDBitmap *
-tbm_create(long maxbytes)
+tbm_create(long maxbytes, dsa_area *dsa)
 {
 	TIDBitmap  *tbm;
 	long		nbuckets;
@@ -230,6 +287,7 @@ tbm_create(long maxbytes)
 	nbuckets = Max(nbuckets, 16);		/* sanity limit */
 	tbm->maxentries = (int) nbuckets;
 	tbm->lossify_start = 0;
+	tbm->dsa = dsa;
 
 	return tbm;
 }
@@ -244,7 +302,7 @@ tbm_create_pagetable(TIDBitmap *tbm)
 	Assert(tbm->status != TBM_HASH);
 	Assert(tbm->pagetable == NULL);
 
-	tbm->pagetable = pagetable_create(tbm->mcxt, 128, NULL);
+	tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
 
 	/* If entry1 is valid, push it into the hashtable */
 	if (tbm->status == TBM_ONE_PAGE)
@@ -777,6 +835,129 @@ tbm_iterate(TBMIterator *iterator)
 }
 
 /*
+ * tbm_shared_iterate - scan through next page of a TIDBitmap
+ *
+ * As above, but this iterates using a shared iterator that can be shared
+ * across multiple workers.  We need to acquire the iterator's LWLock before
+ * accessing the shared members.
+ */
+TBMIterateResult *
+tbm_shared_iterate(TBMSharedIterator *iterator)
+{
+	TIDBitmap  *tbm = iterator->tbm;
+	TBMIterateResult *output = &iterator->output;
+	TBMSharedIteratorState *state = iterator->state;
+
+	Assert(tbm->iterating);
+
+	/* Acquire the LWLock before accessing the shared members */
+	LWLockAcquire(&state->lock, LW_EXCLUSIVE);
+
+	/*
+	 * If lossy chunk pages remain, make sure we've advanced schunkptr/
+	 * schunkbit to the next set bit.
+	 */
+	while (state->schunkptr < tbm->nchunks)
+	{
+		PagetableEntry *chunk = relptr_access(tbm->base,
+										tbm->relchunks[state->schunkptr]);
+		int			schunkbit = state->schunkbit;
+
+		while (schunkbit < PAGES_PER_CHUNK)
+		{
+			int			wordnum = WORDNUM(schunkbit);
+			int			bitnum = BITNUM(schunkbit);
+
+			if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
+				break;
+			schunkbit++;
+		}
+		if (schunkbit < PAGES_PER_CHUNK)
+		{
+			state->schunkbit = schunkbit;
+			break;
+		}
+		/* advance to next chunk */
+		state->schunkptr++;
+		state->schunkbit = 0;
+	}
+
+	/*
+	 * If both chunk and per-page data remain, must output the numerically
+	 * earlier page.
+	 */
+	if (state->schunkptr < tbm->nchunks)
+	{
+		PagetableEntry *chunk;
+		BlockNumber chunk_blockno;
+
+		chunk = relptr_access(tbm->base, tbm->relchunks[state->schunkptr]);
+		chunk_blockno = chunk->blockno + state->schunkbit;
+
+		/* don't touch relpages[] unless spageptr is known to be in range */
+		if (state->spageptr >= tbm->npages ||
+			chunk_blockno < relptr_access(tbm->base,
+							   tbm->relpages[state->spageptr])->blockno)
+		{
+			/* Return a lossy page indicator from the chunk */
+			output->blockno = chunk_blockno;
+			output->ntuples = -1;
+			output->recheck = true;
+			state->schunkbit++;
+
+			LWLockRelease(&state->lock);
+			return output;
+		}
+	}
+
+	if (state->spageptr < tbm->npages)
+	{
+		PagetableEntry *page;
+		int			ntuples;
+		int			wordnum;
+
+		/* In TBM_ONE_PAGE state, we don't allocate a relpages[] array */
+		if (tbm->status == TBM_ONE_PAGE)
+			page = &tbm->entry1;
+		else
+			page = relptr_access(tbm->base, tbm->relpages[state->spageptr]);
+
+		/* scan bitmap to extract individual offset numbers */
+		ntuples = 0;
+		for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+		{
+			bitmapword	w = page->words[wordnum];
+
+			if (w != 0)
+			{
+				int			off = wordnum * BITS_PER_BITMAPWORD + 1;
+
+				while (w != 0)
+				{
+					if (w & 1)
+						output->offsets[ntuples++] = (OffsetNumber) off;
+					off++;
+					w >>= 1;
+				}
+			}
+		}
+
+		output->blockno = page->blockno;
+		output->ntuples = ntuples;
+		output->recheck = page->recheck;
+		state->spageptr++;
+
+		LWLockRelease(&state->lock);
+
+		return output;
+	}
+
+	LWLockRelease(&state->lock);
+
+	/* Nothing more in the bitmap */
+	return NULL;
+}
+
+/*
  * tbm_end_iterate - finish an iteration over a TIDBitmap
  *
  * Currently this is just a pfree, but it might do more someday.  (For
@@ -790,6 +971,17 @@ tbm_end_iterate(TBMIterator *iterator)
 }
 
 /*
+ * tbm_end_shared_iterate - finish an iteration over a TIDBitmap
+ *
+ * As above, but frees a TBMSharedIterator.
+ */
+void
+tbm_end_shared_iterate(TBMSharedIterator *iterator)
+{
+	pfree(iterator);
+}
+
+/*
  * tbm_find_pageentry - find a PagetableEntry for the pageno
  *
  * Returns NULL if there is no non-lossy entry for the pageno.
@@ -1061,3 +1253,246 @@ tbm_comparator(const void *left, const void *right)
 		return 1;
 	return 0;
 }
+
+/*
+ * As above, but this receives relptrs (relative pointers) to PagetableEntrys
+ * and must convert them into actual PagetableEntry pointers before comparing
+ * the blocknos.
+ */
+static int
+tbm_shared_comparator(const void *left, const void *right, void *arg)
+{
+	PagetableEntry *lpage;
+	PagetableEntry *rpage;
+
+	lpage = relptr_access((char *) arg, *((RelptrPagetableEntry *) left));
+	rpage = relptr_access((char *) arg, *((RelptrPagetableEntry *) right));
+
+	if (lpage->blockno < rpage->blockno)
+		return -1;
+	else if (lpage->blockno > rpage->blockno)
+		return 1;
+	return 0;
+}
+
+/*
+ * tbm_prepare_shared_iterate - prepare to iterate jointly through a
+ * TIDBitmap from multiple workers, using a shared iterator.
+ *
+ * The TBMSharedIteratorState is allocated from the DSA so that multiple
+ * workers can attach to it and iterate jointly.
+ *
+ * This converts the pagetable hash into page and chunk arrays of relptrs
+ * (relative pointers) so that these arrays can be shared across multiple
+ * workers.
+ */
+dsa_pointer
+tbm_prepare_shared_iterate(TIDBitmap *tbm)
+{
+	dsa_pointer iterator;
+	TBMSharedIteratorState *iterator_state;
+	TBMSharedInfo *tbminfo;
+
+	/*
+	 * Allocate the TBMSharedIteratorState from the per-query DSA area so
+	 * that every participating worker can attach to it.
+	 */
+	iterator = dsa_allocate(tbm->dsa, sizeof(TBMSharedIteratorState));
+	iterator_state = dsa_get_address(tbm->dsa, iterator);
+	tbminfo = &iterator_state->tbminfo;
+
+	/*
+	 * If we have a hashtable, create and fill the sorted page lists, unless
+	 * we already did that for a previous iterator.  Note that the lists are
+	 * attached to the bitmap, not the iterator, so they can be used by more
+	 * than one iterator.  However, we keep dsa_pointers to them in the
+	 * shared iterator state so that other workers can access them and store
+	 * them in their local TBMs.
+	 */
+	if (tbm->status == TBM_HASH && !tbm->iterating)
+	{
+		pagetable_iterator i;
+		PagetableEntry *page;
+		int			npages;
+		int			nchunks;
+
+		/*
+		 * Create the page list and the chunk list using relptrs so that we
+		 * can share this information across multiple workers.
+		 */
+		if (tbm->npages)
+		{
+			tbm->dsapages = dsa_allocate(tbm->dsa,
+								tbm->npages * sizeof(RelptrPagetableEntry));
+			tbm->relpages = dsa_get_address(tbm->dsa, tbm->dsapages);
+		}
+		if (tbm->nchunks)
+		{
+			tbm->dsachunks = dsa_allocate(tbm->dsa,
+								tbm->nchunks * sizeof(RelptrPagetableEntry));
+			tbm->relchunks = dsa_get_address(tbm->dsa, tbm->dsachunks);
+		}
+
+		tbm->base = dsa_get_address(tbm->dsa, tbm->dsa_data);
+
+		npages = nchunks = 0;
+		pagetable_start_iterate(tbm->pagetable, &i);
+		while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
+		{
+			if (page->ischunk)
+				relptr_store(tbm->base, tbm->relchunks[nchunks++], page);
+			else
+				relptr_store(tbm->base, tbm->relpages[npages++], page);
+		}
+
+		Assert(npages == tbm->npages);
+		Assert(nchunks == tbm->nchunks);
+		if (npages > 1)
+			qsort_arg(tbm->relpages, npages, sizeof(RelptrPagetableEntry),
+					  tbm_shared_comparator, (void *) tbm->base);
+		if (nchunks > 1)
+			qsort_arg(tbm->relchunks, nchunks, sizeof(RelptrPagetableEntry),
+					  tbm_shared_comparator, (void *) tbm->base);
+	}
+
+	/*
+	 * Store the TBM members in the shared state so that they are visible to
+	 * all workers.
+	 */
+	tbminfo->maxentries = tbm->maxentries;
+	tbminfo->nchunks = tbm->nchunks;
+	tbminfo->nentries = tbm->nentries;
+	tbminfo->npages = tbm->npages;
+	tbminfo->dsa_data = tbm->dsa_data;
+	tbminfo->spages = tbm->dsapages;
+	tbminfo->schunks = tbm->dsachunks;
+	tbminfo->status = tbm->status;
+
+	if (tbm->status == TBM_ONE_PAGE)
+		memcpy(&tbminfo->entry1, &tbm->entry1, sizeof(PagetableEntry));
+
+	/* Initialize the shared iterator state. */
+	iterator_state->schunkbit = 0;
+	iterator_state->schunkptr = 0;
+	iterator_state->spageptr = 0;
+
+	LWLockInitialize(&iterator_state->lock, LWTRANCHE_PARALLEL_TBM_ITERATOR);
+
+	tbm->iterating = true;
+
+	return iterator;
+}
+
+/*
+ * tbm_attach_shared_iterate
+ *
+ * Allocate an iterator and attach the shared iterator state to it so that
+ * multiple workers can access the same memory.  We also need to copy some
+ * of the TBM-related information from the shared state into the TBM,
+ * because workers other than the leader create a local TBM and therefore
+ * have to obtain this information from the shared location.
+ */
+TBMSharedIterator *
+tbm_attach_shared_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer iterator)
+{
+	TBMSharedIterator *shared_iterator;
+	TBMSharedInfo *tbminfo;
+
+	shared_iterator = (TBMSharedIterator *)
+		palloc(sizeof(TBMSharedIterator) +
+			   MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+
+	shared_iterator->tbm = tbm;
+	shared_iterator->state =
+		(TBMSharedIteratorState *) dsa_get_address(dsa, iterator);
+	tbminfo = &shared_iterator->state->tbminfo;
+
+	/*
+	 * If we have come here from a worker other than the leader, we need to
+	 * copy the TBM-related information from the shared iterator state.
+	 * This needs to be done only once per worker.
+	 */
+	if (!tbm->iterating)
+	{
+		tbm->status = tbminfo->status;
+		tbm->nchunks = tbminfo->nchunks;
+		tbm->nentries = tbminfo->nentries;
+		tbm->npages = tbminfo->npages;
+		tbm->maxentries = tbminfo->maxentries;
+		if (tbm->status == TBM_HASH)
+		{
+			tbm->dsa_data = tbminfo->dsa_data;
+			tbm->base = dsa_get_address(dsa, tbm->dsa_data);
+
+			/* Convert the dsa pointers to local pointers */
+			if (tbm->npages)
+				tbm->relpages = dsa_get_address(dsa, tbminfo->spages);
+
+			if (tbm->nchunks)
+				tbm->relchunks = dsa_get_address(dsa, tbminfo->schunks);
+		}
+		else
+			memcpy(&tbm->entry1, &tbminfo->entry1, sizeof(PagetableEntry));
+
+		tbm->iterating = true;
+	}
+
+	return shared_iterator;
+}
+
+/*
+ * pagetable_allocate
+ *
+ * Callback function for allocating the memory for hashtable elements.
+ * It allocates memory from the DSA if the tbm holds a reference to a dsa.
+ */
+static inline void *
+pagetable_allocate(pagetable_hash *pagetable, Size size)
+{
+	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
+	dsa_pointer dsaptr;
+	char	   *ptr;
+
+	if (tbm->dsa == NULL)
+		return MemoryContextAllocExtended(pagetable->ctx, size,
+										  MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
+
+	/* Request extra space to remember the dsa_pointer of this allocation */
+	dsaptr = dsa_allocate(tbm->dsa, size + sizeof(dsa_pointer));
+
+	tbm->dsa_data = dsaptr;
+
+	ptr = dsa_get_address(tbm->dsa, dsaptr);
+	memset(ptr, 0, size + sizeof(dsa_pointer));
+
+	/* Store the dsa_pointer just before the element array */
+	*((dsa_pointer *) ptr) = dsaptr;
+
+	return (ptr + sizeof(dsa_pointer));
+}
+
+/*
+ * pagetable_free
+ *
+ * Callback function for freeing hash table elements.
+ */
+static inline void
+pagetable_free(pagetable_hash *pagetable, void *pointer)
+{
+	TIDBitmap  *tbm = (TIDBitmap *) pagetable->private_data;
+	dsa_pointer dsa_data;
+
+	if (tbm->dsa == NULL)
+	{
+		pfree(pointer);
+		return;
+	}
+
+	/*
+	 * If the TBM is in the iterating phase, that means the pagetable has
+	 * already been created and we have come here from tbm_free.  By this
+	 * time we are already detached from the DSA, because the Gather node
+	 * would have been shut down.
+	 */
+	if (tbm->iterating)
+		return;
+
+	dsa_data = *((dsa_pointer *) ((char *) pointer - sizeof(dsa_pointer)));
+	dsa_free(tbm->dsa, dsa_data);
+}
diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h
index 14992e0..17ed89d 100644
--- a/src/include/nodes/tidbitmap.h
+++ b/src/include/nodes/tidbitmap.h
@@ -23,6 +23,7 @@
 #define TIDBITMAP_H
 
 #include "storage/itemptr.h"
+#include "utils/dsa.h"
 
 
 /*
@@ -33,6 +34,7 @@ typedef struct TIDBitmap TIDBitmap;
 
 /* Likewise, TBMIterator is private */
 typedef struct TBMIterator TBMIterator;
+typedef struct TBMSharedIterator TBMSharedIterator;
 
 /* Result structure for tbm_iterate */
 typedef struct
@@ -46,7 +48,7 @@ typedef struct
 
 /* function prototypes in nodes/tidbitmap.c */
 
-extern TIDBitmap *tbm_create(long maxbytes);
+extern TIDBitmap *tbm_create(long maxbytes, dsa_area *dsa);
 extern void tbm_free(TIDBitmap *tbm);
 
 extern void tbm_add_tuples(TIDBitmap *tbm,
@@ -62,5 +64,10 @@ extern bool tbm_is_empty(const TIDBitmap *tbm);
 extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm);
 extern TBMIterateResult *tbm_iterate(TBMIterator *iterator);
 extern void tbm_end_iterate(TBMIterator *iterator);
+extern void tbm_end_shared_iterate(TBMSharedIterator *iterator);
+extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm);
+extern TBMSharedIterator *tbm_attach_shared_iterate(TIDBitmap *tbm,
+						  dsa_area *dsa, dsa_pointer iterator);
+extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator);
 
 #endif   /* TIDBITMAP_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 8bd93c3..4d392e0 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -212,6 +212,7 @@ typedef enum BuiltinTrancheIds
 	LWTRANCHE_LOCK_MANAGER,
 	LWTRANCHE_PREDICATE_LOCK_MANAGER,
 	LWTRANCHE_PARALLEL_QUERY_DSA,
+	LWTRANCHE_PARALLEL_TBM_ITERATOR,
 	LWTRANCHE_FIRST_USER_DEFINED
 } BuiltinTrancheIds;
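
Usage sketch (illustrative only, not part of the patch): the intended flow is that the leader builds the bitmap with its page table backed by the per-query DSA area, publishes the shared iterator via tbm_prepare_shared_iterate(), and each participant attaches with tbm_attach_shared_iterate() and drains pages with tbm_shared_iterate() under the iterator's LWLock. The helper names below (leader_build_bitmap, worker_consume_bitmap) and the way the dsa_pointer reaches the workers (in the executor it would normally travel through the parallel shared-memory setup) are assumptions for illustration.

/* Hypothetical usage sketch for the shared TIDBitmap iterator API above. */
#include "postgres.h"
#include "miscadmin.h"			/* for work_mem (assumed available) */
#include "nodes/tidbitmap.h"
#include "utils/dsa.h"

/* Leader: build the bitmap in DSA and publish the shared iterator. */
static dsa_pointer
leader_build_bitmap(dsa_area *dsa, ItemPointerData *tids, int ntids)
{
	/* Page table elements are allocated from the per-query DSA area. */
	TIDBitmap  *tbm = tbm_create(work_mem * 1024L, dsa);

	tbm_add_tuples(tbm, tids, ntids, false);

	/* Returns a dsa_pointer that the workers can attach to. */
	return tbm_prepare_shared_iterate(tbm);
}

/* Worker (or the leader acting as one): drain pages from the bitmap. */
static void
worker_consume_bitmap(dsa_area *dsa, dsa_pointer iterp)
{
	/* Each backend keeps a local TBM; shared info is copied on attach. */
	TIDBitmap  *tbm = tbm_create(work_mem * 1024L, dsa);
	TBMSharedIterator *it = tbm_attach_shared_iterate(tbm, dsa, iterp);
	TBMIterateResult *res;

	while ((res = tbm_shared_iterate(it)) != NULL)
	{
		if (res->ntuples >= 0)
		{
			/* exact page: res->offsets[0 .. res->ntuples - 1] are valid */
		}
		else
		{
			/* lossy page: every offset on res->blockno must be rechecked */
		}
	}

	tbm_end_shared_iterate(it);
}

The key design point is that each backend owns its TIDBitmap and TBMSharedIterator locally, while the page table, the sorted relptr arrays, and the iteration position (spageptr/schunkptr/schunkbit) live in DSA and are advanced under the shared iterator's LWLock, so pages are handed out to whichever worker asks next.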