diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 60f005c..87cd9ea 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -120,7 +120,7 @@ collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
Form_pg_attribute attr;
/* Initialize empty bitmap result */
- scanEntry->matchBitmap = tbm_create(work_mem * 1024L);
+ scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
/* Null query cannot partial-match anything */
if (scanEntry->isPartialMatch &&
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 4274e9a..94bb289 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -78,7 +78,7 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
else
{
/* XXX should we use less than work_mem for this? */
- tbm = tbm_create(work_mem * 1024L);
+ tbm = tbm_create(work_mem * 1024L, NULL);
}
/*
diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c
index 3c6155b..1d280be 100644
--- a/src/backend/executor/nodeBitmapOr.c
+++ b/src/backend/executor/nodeBitmapOr.c
@@ -129,7 +129,7 @@ MultiExecBitmapOr(BitmapOrState *node)
if (result == NULL) /* first subplan */
{
/* XXX should we use less than work_mem for this? */
- result = tbm_create(work_mem * 1024L);
+ result = tbm_create(work_mem * 1024L, NULL);
}
((BitmapIndexScanState *) subnode)->biss_result = result;
diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c
index 0885812..488c641 100644
--- a/src/backend/nodes/tidbitmap.c
+++ b/src/backend/nodes/tidbitmap.c
@@ -43,6 +43,9 @@
#include "access/htup_details.h"
#include "nodes/bitmapset.h"
#include "nodes/tidbitmap.h"
+#include "storage/lwlock.h"
+#include "utils/dsa.h"
+#include "utils/relptr.h"
/*
* The maximum number of tuples per page is not large (typically 256 with
@@ -102,6 +105,9 @@ typedef struct PagetableEntry
bitmapword words[Max(WORDS_PER_PAGE, WORDS_PER_CHUNK)];
} PagetableEntry;
+/* Relative pointer to a PagetableEntry, used to share entries across workers. */
+relptr_declare(PagetableEntry, RelptrPagetableEntry);
+
/*
* We want to avoid the overhead of creating the hashtable, which is
* comparatively large, when not necessary. Particularly when we are using a
@@ -139,6 +145,13 @@ struct TIDBitmap
/* these are valid when iterating is true: */
PagetableEntry **spages; /* sorted exact-page list, or NULL */
PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */
+ dsa_pointer dsa_data; /* dsa_pointer to the element array */
+ dsa_area *dsa; /* reference to per-query dsa area */
+ char *base; /* pointer to the element array */
+ dsa_pointer dsapages; /* dsa_pointer to the page array */
+ dsa_pointer dsachunks; /* dsa_pointer to the chunk array */
+ RelptrPagetableEntry *relpages; /* page array of relptr */
+ RelptrPagetableEntry *relchunks; /* chunk array of relptr */
};
/*
@@ -156,6 +169,45 @@ struct TBMIterator
TBMIterateResult output; /* MUST BE LAST (because variable-size) */
};
+/* TBM information that must be shared across multiple workers */
+typedef struct TBMSharedInfo
+{
+ dsa_pointer dsa_data; /* dsa pointers to head of pagetable data */
+ dsa_pointer spages; /* dsa pointer to page array */
+ dsa_pointer schunks; /* dsa pointer to chunk array */
+ PagetableEntry entry1; /* used when status == TBM_ONE_PAGE */
+ TBMStatus status; /* status of TIDBitmap */
+ int nentries; /* number of entries in pagetable */
+ int maxentries; /* limit on same to meet maxbytes */
+ int npages; /* number of exact entries in pagetable */
+ int nchunks; /* number of lossy entries in pagetable */
+} TBMSharedInfo;
+
+/*
+ * This stores the shared state of a TBMSharedIterator so that multiple
+ * workers can operate on the same iteration state. It also embeds the
+ * TBMSharedInfo, which shares the relptrs of the page and chunk arrays
+ * and other TBM-related information with the other workers.
+ */
+typedef struct TBMSharedIteratorState
+{
+ int spageptr; /* next spages index */
+ int schunkptr; /* next schunks index */
+ int schunkbit; /* next bit to check in current schunk */
+ LWLock lock; /* lock to protect above members */
+ TBMSharedInfo tbminfo; /* TBM info shared across workers */
+} TBMSharedIteratorState;
+
+/*
+ * Same as TBMIterator, except that it also holds a reference to the shared
+ * iterator state so that multiple workers can operate on the same state.
+ */
+struct TBMSharedIterator
+{
+ TIDBitmap *tbm; /* TIDBitmap we're iterating over */
+ TBMSharedIteratorState *state; /* shared iteration state, lives in DSA */
+ TBMIterateResult output; /* MUST BE LAST (because variable-size) */
+};
/* Local function prototypes */
static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage);
@@ -168,6 +220,8 @@ static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno);
static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno);
static void tbm_lossify(TIDBitmap *tbm);
static int tbm_comparator(const void *left, const void *right);
+static int tbm_shared_comparator(const void *left, const void *right,
+ void *arg);
/*
* Simple inline murmur hash implementation for the exact width required, for
@@ -187,6 +241,7 @@ hash_blockno(BlockNumber b)
}
/* define hashtable mapping block numbers to PagetableEntry's */
+#define SH_USE_NONDEFAULT_ALLOCATOR /* we supply pagetable_allocate/_free */
#define SH_PREFIX pagetable
#define SH_ELEMENT_TYPE PagetableEntry
#define SH_KEY_TYPE BlockNumber
@@ -204,10 +259,12 @@ hash_blockno(BlockNumber b)
*
* The bitmap will live in the memory context that is CurrentMemoryContext
* at the time of this call. It will be limited to (approximately) maxbytes
- * total memory consumption.
+ * total memory consumption. If the dsa_area passed to this function is not
+ * NULL, the memory for the elements of the underlying page table will be
+ * allocated from that DSA area rather than from the local memory context.
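+ *
+ * A sketch of both modes ("dsa" being whatever per-query DSA area the
+ * caller already has available):
+ *
+ * tbm = tbm_create(work_mem * 1024L, NULL); (backend-local, as before)
+ * tbm = tbm_create(work_mem * 1024L, dsa); (page table stored in DSA)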
*/
TIDBitmap *
-tbm_create(long maxbytes)
+tbm_create(long maxbytes, dsa_area *dsa)
{
TIDBitmap *tbm;
long nbuckets;
@@ -230,6 +287,7 @@ tbm_create(long maxbytes)
nbuckets = Max(nbuckets, 16); /* sanity limit */
tbm->maxentries = (int) nbuckets;
tbm->lossify_start = 0;
+ tbm->dsa = dsa;
return tbm;
}
@@ -244,7 +302,7 @@ tbm_create_pagetable(TIDBitmap *tbm)
Assert(tbm->status != TBM_HASH);
Assert(tbm->pagetable == NULL);
- tbm->pagetable = pagetable_create(tbm->mcxt, 128, NULL);
+ tbm->pagetable = pagetable_create(tbm->mcxt, 128, tbm);
/* If entry1 is valid, push it into the hashtable */
if (tbm->status == TBM_ONE_PAGE)
@@ -777,6 +835,129 @@ tbm_iterate(TBMIterator *iterator)
}
/*
+ * tbm_shared_iterate - scan through next page of a TIDBitmap
+ *
+ * As above, but this iterates using a shared iterator that can be used by
+ * multiple workers concurrently. We must acquire the iterator's LWLock
+ * before accessing the shared members.
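+ *
+ * A typical consumer loop (a sketch, assuming the caller already obtained
+ * "iterator" from tbm_attach_shared_iterate):
+ *
+ * while ((tbmres = tbm_shared_iterate(iterator)) != NULL)
+ * {
+ * ... fetch heap page tbmres->blockno, rechecking if tbmres->recheck ...
+ * }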
+ */
+TBMIterateResult *
+tbm_shared_iterate(TBMSharedIterator *iterator)
+{
+ TIDBitmap *tbm = iterator->tbm;
+ TBMIterateResult *output = &iterator->output;
+ TBMSharedIteratorState *state = iterator->state;
+
+ Assert(tbm->iterating);
+
+ /* Acquire the LWLock before accessing the shared members */
+ LWLockAcquire(&state->lock, LW_EXCLUSIVE);
+
+ /*
+ * If lossy chunk pages remain, make sure we've advanced schunkptr/
+ * schunkbit to the next set bit.
+ */
+ while (state->schunkptr < tbm->nchunks)
+ {
+ PagetableEntry *chunk = relptr_access(tbm->base,
+ tbm->relchunks[state->schunkptr]);
+ int schunkbit = state->schunkbit;
+
+ while (schunkbit < PAGES_PER_CHUNK)
+ {
+ int wordnum = WORDNUM(schunkbit);
+ int bitnum = BITNUM(schunkbit);
+
+ if ((chunk->words[wordnum] & ((bitmapword) 1 << bitnum)) != 0)
+ break;
+ schunkbit++;
+ }
+ if (schunkbit < PAGES_PER_CHUNK)
+ {
+ state->schunkbit = schunkbit;
+ break;
+ }
+ /* advance to next chunk */
+ state->schunkptr++;
+ state->schunkbit = 0;
+ }
+
+ /*
+ * If both chunk and per-page data remain, must output the numerically
+ * earlier page.
+ */
+ if (state->schunkptr < tbm->nchunks)
+ {
+ PagetableEntry *chunk;
+ BlockNumber chunk_blockno;
+
+ chunk = relptr_access(tbm->base, tbm->relchunks[state->schunkptr]);
+ chunk_blockno = chunk->blockno + state->schunkbit;
+
+ /* don't read relpages[] unless a per-page entry actually remains */
+ if (state->spageptr >= tbm->npages ||
+ chunk_blockno < relptr_access(tbm->base,
+ tbm->relpages[state->spageptr])->blockno)
+ {
+ /* Return a lossy page indicator from the chunk */
+ output->blockno = chunk_blockno;
+ output->ntuples = -1;
+ output->recheck = true;
+ state->schunkbit++;
+
+ LWLockRelease(&state->lock);
+ return output;
+ }
+ }
+
+ if (state->spageptr < tbm->npages)
+ {
+ PagetableEntry *page;
+ int ntuples;
+ int wordnum;
+
+ /* In TBM_ONE_PAGE state, we don't allocate a relpages[] array */
+ if (tbm->status == TBM_ONE_PAGE)
+ page = &tbm->entry1;
+ else
+ page = relptr_access(tbm->base, tbm->relpages[state->spageptr]);
+
+ /* scan bitmap to extract individual offset numbers */
+ ntuples = 0;
+ for (wordnum = 0; wordnum < WORDS_PER_PAGE; wordnum++)
+ {
+ bitmapword w = page->words[wordnum];
+
+ if (w != 0)
+ {
+ int off = wordnum * BITS_PER_BITMAPWORD + 1;
+
+ while (w != 0)
+ {
+ if (w & 1)
+ output->offsets[ntuples++] = (OffsetNumber) off;
+ off++;
+ w >>= 1;
+ }
+ }
+ }
+
+ output->blockno = page->blockno;
+ output->ntuples = ntuples;
+ output->recheck = page->recheck;
+ state->spageptr++;
+
+ LWLockRelease(&state->lock);
+
+ return output;
+ }
+
+ LWLockRelease(&state->lock);
+
+ /* Nothing more in the bitmap */
+ return NULL;
+}
+
+/*
* tbm_end_iterate - finish an iteration over a TIDBitmap
*
* Currently this is just a pfree, but it might do more someday. (For
@@ -790,6 +971,17 @@ tbm_end_iterate(TBMIterator *iterator)
}
/*
+ * tbm_end_shared_iterate - finish an iteration over a TIDBitmap
+ *
+ * As above, but this frees only the backend-local TBMSharedIterator; the
+ * shared iterator state lives in DSA and is not freed here.
+ */
+void
+tbm_end_shared_iterate(TBMSharedIterator *iterator)
+{
+ pfree(iterator);
+}
+
+/*
* tbm_find_pageentry - find a PagetableEntry for the pageno
*
* Returns NULL if there is no non-lossy entry for the pageno.
@@ -1061,3 +1253,246 @@ tbm_comparator(const void *left, const void *right)
return 1;
return 0;
}
+
+/*
+ * As above, but this receives relptrs (relative pointers) to PagetableEntry
+ * and must convert them to actual PagetableEntry pointers before comparing
+ * the block numbers.
+ */
+static int
+tbm_shared_comparator(const void *left, const void *right, void *arg)
+{
+ PagetableEntry *lpage;
+ PagetableEntry *rpage;
+
+ lpage = relptr_access((char *) arg, *((RelptrPagetableEntry *) left));
+ rpage = relptr_access((char *) arg, *((RelptrPagetableEntry *) right));
+
+ if (lpage->blockno < rpage->blockno)
+ return -1;
+ else if (lpage->blockno > rpage->blockno)
+ return 1;
+ return 0;
+}
+
+/*
+ * tbm_prepare_shared_iterate - prepare to iterate through a TIDBitmap
+ * from multiple workers using a shared iterator.
+ *
+ * The TBMSharedIteratorState will be allocated from DSA so that multiple
+ * workers can attach to it and iterate jointly.
+ *
+ * This converts the pagetable hash into page and chunk arrays of relptrs
+ * (relative pointers) so that those arrays can be shared across multiple
+ * workers.
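+ *
+ * A sketch of the intended hand-off ("pstate->tbmiterator" being a
+ * hypothetical field in the caller's shared parallel state):
+ *
+ * leader: pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
+ * workers: attach via tbm_attach_shared_iterate(), see below.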
+ */
+dsa_pointer
+tbm_prepare_shared_iterate(TIDBitmap *tbm)
+{
+ dsa_pointer iterator;
+ TBMSharedIteratorState *iterator_state;
+ TBMSharedInfo *tbminfo;
+
+ /*
+ * Allocate the TBMSharedIteratorState from the DSA so that the workers
+ * can attach to it and share the iteration state.
+ */
+ iterator = dsa_allocate(tbm->dsa, sizeof(TBMSharedIteratorState));
+ iterator_state = dsa_get_address(tbm->dsa, iterator);
+ tbminfo = &iterator_state->tbminfo;
+
+ /*
+ * If we have a hashtable, create and fill the sorted page lists, unless
+ * we already did that for a previous iterator. Note that the lists are
+ * attached to the bitmap not the iterator, so they can be used by more
+ * than one iterator. However, we keep dsa_pointers to these lists in the
+ * shared iterator state so that other workers can access them and store
+ * them in their local TBMs.
+ */
+ if (tbm->status == TBM_HASH && !tbm->iterating)
+ {
+ pagetable_iterator i;
+ PagetableEntry *page;
+ int npages;
+ int nchunks;
+
+ /*
+ * Create page list and chunk list using relptr so that we can share
+ * this information across multiple workers.
+ */
+ if (tbm->npages)
+ {
+ tbm->dsapages = dsa_allocate(tbm->dsa,
+ tbm->npages * (sizeof(RelptrPagetableEntry)));
+ tbm->relpages = dsa_get_address(tbm->dsa, tbm->dsapages);
+ }
+ if (tbm->nchunks)
+ {
+ tbm->dsachunks = dsa_allocate(tbm->dsa,
+ tbm->nchunks * (sizeof(RelptrPagetableEntry)));
+ tbm->relchunks = dsa_get_address(tbm->dsa, tbm->dsachunks);
+ }
+
+ tbm->base = dsa_get_address(tbm->dsa, tbm->dsa_data);
+
+ npages = nchunks = 0;
+ pagetable_start_iterate(tbm->pagetable, &i);
+ while ((page = pagetable_iterate(tbm->pagetable, &i)) != NULL)
+ {
+ if (page->ischunk)
+ relptr_store(tbm->base, tbm->relchunks[nchunks++], page);
+ else
+ relptr_store(tbm->base, tbm->relpages[npages++], page);
+ }
+
+ Assert(npages == tbm->npages);
+ Assert(nchunks == tbm->nchunks);
+ if (npages > 1)
+ qsort_arg(tbm->relpages, npages, sizeof(RelptrPagetableEntry),
+ tbm_shared_comparator, (void *) tbm->base);
+ if (nchunks > 1)
+ qsort_arg(tbm->relchunks, nchunks, sizeof(RelptrPagetableEntry),
+ tbm_shared_comparator, (void *) tbm->base);
+ }
+
+ /*
+ * Store the TBM members in the shared state so that they can be shared
+ * with the other workers.
+ */
+ tbminfo->maxentries = tbm->maxentries;
+ tbminfo->nchunks = tbm->nchunks;
+ tbminfo->nentries = tbm->nentries;
+ tbminfo->npages = tbm->npages;
+ tbminfo->dsa_data = tbm->dsa_data;
+ tbminfo->spages = tbm->dsapages;
+ tbminfo->schunks = tbm->dsachunks;
+ tbminfo->status = tbm->status;
+
+ if (tbm->status == TBM_ONE_PAGE)
+ memcpy(&tbminfo->entry1, &tbm->entry1, sizeof(PagetableEntry));
+
+ /* Initialize the shared iterator state. */
+ iterator_state->schunkbit = 0;
+ iterator_state->schunkptr = 0;
+ iterator_state->spageptr = 0;
+
+ LWLockInitialize(&iterator_state->lock, LWTRANCHE_PARALLEL_TBM_ITERATOR);
+
+ tbm->iterating = true;
+
+ return iterator;
+}
+
+/*
+ * tbm_attach_shared_iterate
+ *
+ * Allocate a local iterator and attach the shared iterator state to it so
+ * that multiple workers can access the same memory. We also need to copy
+ * some of the TBM-related information from the shared state into the TBM,
+ * because workers other than the leader have created their own local TBMs
+ * and therefore must get this information from the shared location.
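+ *
+ * A sketch of the worker-side sequence (assuming the worker received the
+ * dsa_pointer "it" from the leader):
+ *
+ * tbm = tbm_create(maxbytes, dsa);
+ * si = tbm_attach_shared_iterate(tbm, dsa, it);
+ * while ((tbmres = tbm_shared_iterate(si)) != NULL)
+ * ... process tbmres ...
+ * tbm_end_shared_iterate(si);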
+ */
+TBMSharedIterator *
+tbm_attach_shared_iterate(TIDBitmap *tbm, dsa_area *dsa, dsa_pointer iterator)
+{
+ TBMSharedIterator *shared_iterator;
+ TBMSharedInfo *tbminfo;
+
+ shared_iterator = (TBMSharedIterator *) palloc(sizeof(TBMSharedIterator) +
+ MAX_TUPLES_PER_PAGE * sizeof(OffsetNumber));
+
+ shared_iterator->tbm = tbm;
+ shared_iterator->state =
+ (TBMSharedIteratorState *) dsa_get_address(dsa, iterator);
+ tbminfo = &shared_iterator->state->tbminfo;
+
+ /*
+ * If we have come here from a worker other than the leader, we need to
+ * fetch the TBM-related information from the shared iterator state. This
+ * needs to be done only once per worker.
+ */
+ if (!tbm->iterating)
+ {
+ tbm->status = tbminfo->status;
+ tbm->nchunks = tbminfo->nchunks;
+ tbm->nentries = tbminfo->nentries;
+ tbm->npages = tbminfo->npages;
+ tbm->maxentries = tbminfo->maxentries;
+ if (tbm->status == TBM_HASH)
+ {
+ tbm->dsa_data = tbminfo->dsa_data;
+ tbm->base = dsa_get_address(dsa, tbm->dsa_data);
+
+ /* Convert dsa pointer to the local pointer */
+ if (tbm->npages)
+ tbm->relpages = dsa_get_address(dsa, tbminfo->spages);
+
+ if (tbm->nchunks)
+ tbm->relchunks = dsa_get_address(dsa, tbminfo->schunks);
+ }
+ else
+ memcpy(&tbm->entry1, &tbminfo->entry1, sizeof(PagetableEntry));
+
+ tbm->iterating = true;
+ }
+
+ return shared_iterator;
+}
+
+/*
+ * pagetable_allocate
+ *
+ * Callback function for allocating the memory for hashtable elements.
+ * It allocates the memory from the DSA area if the TBM holds a reference to one.
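+ *
+ * When allocating from DSA, the chunk is laid out as (sketch):
+ *
+ * [dsa_pointer to the chunk itself][element array returned to simplehash]
+ *
+ * so that pagetable_free() can recover the dsa_pointer from the bare
+ * pointer handed back to it.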
+ */
+static inline void *
+pagetable_allocate(pagetable_hash *pagetable, Size size)
+{
+ TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
+ dsa_pointer dsaptr;
+ char *ptr;
+
+ if (tbm->dsa == NULL)
+ return MemoryContextAllocExtended(pagetable->ctx, size,
+ MCXT_ALLOC_HUGE | MCXT_ALLOC_ZERO);
+
+ /* Request extra space at the front to remember our own dsa_pointer */
+ dsaptr = dsa_allocate(tbm->dsa, size + sizeof(dsa_pointer));
+
+ tbm->dsa_data = dsaptr;
+
+ ptr = dsa_get_address(tbm->dsa, dsaptr);
+ memset(ptr, 0, size + sizeof(dsa_pointer));
+
+ /* Store dsa_pointer */
+ *((dsa_pointer *) ptr) = dsaptr;
+
+ return (ptr + sizeof(dsa_pointer));
+}
+
+/*
+ * pagetable_free
+ *
+ * Callback function for freeing hash table elements.
+ */
+static inline void
+pagetable_free(pagetable_hash *pagetable, void *pointer)
+{
+ TIDBitmap *tbm = (TIDBitmap *) pagetable->private_data;
+ dsa_pointer dsa_data;
+
+ if (tbm->dsa == NULL)
+ {
+ pfree(pointer);
+ return;
+ }
+
+ /*
+ * If the TBM is in the iterating phase, the pagetable has already been
+ * built and we have come here via tbm_free. By this time we are already
+ * detached from the DSA, because the Gather node has been shut down.
+ */
+ if (tbm->iterating)
+ return;
+
+ dsa_data = *((dsa_pointer *) ((char *) pointer - sizeof(dsa_pointer)));
+ dsa_free(tbm->dsa, dsa_data);
+}
diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h
index 14992e0..17ed89d 100644
--- a/src/include/nodes/tidbitmap.h
+++ b/src/include/nodes/tidbitmap.h
@@ -23,6 +23,7 @@
#define TIDBITMAP_H
#include "storage/itemptr.h"
+#include "utils/dsa.h"
/*
@@ -33,6 +34,7 @@ typedef struct TIDBitmap TIDBitmap;
/* Likewise, TBMIterator is private */
typedef struct TBMIterator TBMIterator;
+typedef struct TBMSharedIterator TBMSharedIterator;
/* Result structure for tbm_iterate */
typedef struct
@@ -46,7 +48,7 @@ typedef struct
/* function prototypes in nodes/tidbitmap.c */
-extern TIDBitmap *tbm_create(long maxbytes);
+extern TIDBitmap *tbm_create(long maxbytes, dsa_area *dsa);
extern void tbm_free(TIDBitmap *tbm);
extern void tbm_add_tuples(TIDBitmap *tbm,
@@ -62,5 +64,10 @@ extern bool tbm_is_empty(const TIDBitmap *tbm);
extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm);
extern TBMIterateResult *tbm_iterate(TBMIterator *iterator);
extern void tbm_end_iterate(TBMIterator *iterator);
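+
+/*
+ * Shared-iteration support: the leader calls tbm_prepare_shared_iterate()
+ * and hands the returned dsa_pointer to the workers; each worker calls
+ * tbm_attach_shared_iterate(), loops over tbm_shared_iterate(), and finally
+ * calls tbm_end_shared_iterate().
+ */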
+extern dsa_pointer tbm_prepare_shared_iterate(TIDBitmap *tbm);
+extern TBMSharedIterator *tbm_attach_shared_iterate(TIDBitmap *tbm,
+ dsa_area *dsa, dsa_pointer iterator);
+extern TBMIterateResult *tbm_shared_iterate(TBMSharedIterator *iterator);
+extern void tbm_end_shared_iterate(TBMSharedIterator *iterator);
#endif /* TIDBITMAP_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 8bd93c3..4d392e0 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -212,6 +212,7 @@ typedef enum BuiltinTrancheIds
LWTRANCHE_LOCK_MANAGER,
LWTRANCHE_PREDICATE_LOCK_MANAGER,
LWTRANCHE_PARALLEL_QUERY_DSA,
+ LWTRANCHE_PARALLEL_TBM_ITERATOR,
LWTRANCHE_FIRST_USER_DEFINED
} BuiltinTrancheIds;