diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 9ed9fd2..c8034d0 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -362,7 +362,7 @@ restartScanEntry:
 
 		if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
 		{
-			entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
+			entry->matchIterator = tbm_begin_iterate(entry->matchBitmap, NULL);
 			entry->isFinished = FALSE;
 		}
 	}
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index b019bc1..1dfd492 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1753,6 +1753,22 @@ retry:
 }
 
 /* ----------------
+ *		heap_bm_update_snapshot
+ *
+ *		Update snapshot info in heap scan descriptor.
+ * ----------------
+ */
+void
+heap_bm_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
+{
+	Assert(IsMVCCSnapshot(snapshot));
+
+	RegisterSnapshot(snapshot);
+	scan->rs_snapshot = snapshot;
+	scan->rs_temp_snap = true;
+}
+
+/* ----------------
  *		heap_getnext	- retrieve next tuple in scan
  *
  *		Fix to work with index relations.
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 72bacd5..1e34f26 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -29,6 +29,7 @@
 #include "executor/nodeForeignscan.h"
 #include "executor/nodeSeqscan.h"
 #include "executor/tqueue.h"
+#include "executor/nodeBitmapHeapscan.h"
 #include "nodes/nodeFuncs.h"
 #include "optimizer/planmain.h"
 #include "optimizer/planner.h"
@@ -203,6 +204,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 			ExecCustomScanEstimate((CustomScanState *) planstate,
 									e->pcxt);
 			break;
+		case T_BitmapHeapScanState:
+			ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate,
+								   e->pcxt);
+			break;
 		default:
 			break;
 	}
@@ -255,6 +260,11 @@ ExecParallelInitializeDSM(PlanState *planstate,
 			ExecCustomScanInitializeDSM((CustomScanState *) planstate,
 										d->pcxt);
 			break;
+		case T_BitmapHeapScanState:
+			ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate,
+										d->pcxt);
+			break;
+
 		default:
 			break;
 	}
@@ -724,6 +734,10 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc)
 			ExecCustomScanInitializeWorker((CustomScanState *) planstate,
 										   toc);
 			break;
+		case T_BitmapHeapScanState:
+			ExecBitmapHeapInitializeWorker(
+					(BitmapHeapScanState *) planstate, toc);
+			break;
 		default:
 			break;
 	}
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index 449aacb..c3ad77b 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -47,16 +47,63 @@
 #include "utils/spccache.h"
 #include "utils/snapmgr.h"
 #include "utils/tqual.h"
-
+#include "miscadmin.h"
+#include "pgstat.h"
 
 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
-
+static bool pbms_is_leader(ParallelBitmapInfo *pbminfo);
+static void pbms_set_parallel(PlanState *node);
+static TBMIterateResult *pbms_parallel_iterate(TBMIterator *iterator,
+						ParallelIterator *parallel_iterator,
+						bool is_parallel);
+static void prefetch_pages(int *prefetch_pages, int prefetch_target,
+						BitmapHeapScanState *node, HeapScanDesc scan);
+static void update_prefetch_target(int *prefetch_target, int prefetch_maximum);
 
 /* ----------------------------------------------------------------
  *		BitmapHeapNext
  *
  *		Retrieve next tuple from the BitmapHeapScan node's currentRelation
+ *
+ *
+ * [PARALLEL BITMAP HEAP SCAN ALGORITHM]
+ *
+ * #1. Shared TIDBitmap creation and initialization
+ *     a) The first worker to see the parallel bitmap state as PBM_INITIAL
+ *        becomes the leader and sets the state to PBM_INPROGRESS.
+ *        All other workers that see the state as PBM_INPROGRESS wait for
+ *        the leader to complete the TIDBitmap.
+ *
+ *     Leader Processing:
+ *     (The leader is responsible for creating the shared TIDBitmap and
+ *     for building the shared page and chunk arrays from it.)
+ *        1) Create the TIDBitmap using DHT.
+ *        2) Begin iterate: convert the hash table into the shared page
+ *           and chunk arrays.
+ *        3) Copy the local TIDBitmap information into ParallelBitmapInfo
+ *           so that the other workers can see it.
+ *        4) Set the state to PBM_FINISHED.
+ *        5) Wake up the other workers.
+ *
+ *     Other Worker Processing:
+ *        1) Wait until the leader has created the shared TIDBitmap and
+ *           the shared page and chunk arrays.
+ *        2) Attach to the shared page table and copy the TIDBitmap
+ *           information from ParallelBitmapInfo into a local TIDBitmap.
+ *           Copying into a local TIDBitmap lets the next level of
+ *           processing read the information exactly as in the
+ *           non-parallel case and avoids further code changes.
+ *
+ *     # At this point the TIDBitmap is ready and all workers are awake #
+ *
+ * #2. Bitmap processing (iterate over and process the pages).
+ *     . In this phase each worker iterates over the page and chunk arrays
+ *       and selects heap pages one by one.  If prefetching is enabled,
+ *       there are two iterators.
+ *     . Since multiple workers iterate over the same page and chunk
+ *       arrays, we need a shared iterator, so we grab a spinlock and
+ *       iterate while holding it.
 * ----------------------------------------------------------------
 */
 static TupleTableSlot *
@@ -67,12 +114,19 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	TIDBitmap  *tbm;
 	TBMIterator *tbmiterator;
 	TBMIterateResult *tbmres;
-
+	ParallelBitmapInfo *pbminfo = node->parallel_bitmap;
+	bool		is_parallel = node->parallel_bitmap ? true : false;
 #ifdef USE_PREFETCH
 	TBMIterator *prefetch_iterator;
 #endif
 	OffsetNumber targoffset;
 	TupleTableSlot *slot;
+	ParallelTIDBitmap *parallel_tbm = NULL;
+
+	/* Get the parallel TBM address in shared memory using the offset */
+	if (is_parallel)
+		parallel_tbm = (ParallelTIDBitmap *) ((char *) pbminfo +
+						pbminfo->ptbm_offset);
 
 	/*
 	 * extract necessary information from index scan node
@@ -101,36 +155,106 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	 */
 	if (tbm == NULL)
 	{
-		tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
+		/*
+		 * Process the lower-level node only if we are running in
+		 * non-parallel mode or we are the leader worker.
+		 *
+		 * In parallel mode the leader returns from pbms_is_leader()
+		 * immediately, while all other workers block in it until the
+		 * leader wakes them up.
+		 */
+		if (!is_parallel || pbms_is_leader(pbminfo))
+		{
+			/*
+			 * In parallel mode, set the parallel flag in the lower-level
+			 * bitmap index scan before recursively processing the outer
+			 * node.  The bitmap index node later uses this flag to tell
+			 * the tidbitmap that it needs to create a shared page table.
+			 */
+			if (is_parallel)
+			{
+				node->is_leader = true;
+				pbms_set_parallel(outerPlanState(node));
+			}
+
+			tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
+		}
+		else
+		{
+			/*
+			 * By this time the leader has already created the shared TBM.
+			 * Create a local TBM, copy the information from the shared
+			 * location, and attach to the shared page table using the
+			 * hash table handle stored in parallel_tbm (shared memory).
+			 */
+			tbm = tbm_create(work_mem * 1024L);
+			tbm_set_parallel(tbm, node->ss.ps.state->es_query_area);
+			tbm_attach_to_pagetable(tbm, parallel_tbm);
+		}
 
 		if (!tbm || !IsA(tbm, TIDBitmap))
 			elog(ERROR, "unrecognized result from subplan");
 
 		node->tbm = tbm;
-		node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
+		node->tbmiterator = tbmiterator
+							= tbm_begin_iterate(tbm, parallel_tbm);
 		node->tbmres = tbmres = NULL;
 
 #ifdef USE_PREFETCH
 		if (node->prefetch_maximum > 0)
 		{
-			node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm);
-			node->prefetch_pages = 0;
-			node->prefetch_target = -1;
+			node->prefetch_iterator = prefetch_iterator =
+								tbm_begin_iterate(tbm, parallel_tbm);
+
+			/* These variables are used only in non-parallel mode */
+			if (!is_parallel)
+			{
+				node->prefetch_pages = 0;
+				node->prefetch_target = -1;
+			}
 		}
 #endif   /* USE_PREFETCH */
+
+		/*
+		 * If we are in parallel mode and we are the leader worker, copy
+		 * the local TBM information to the shared location and wake up
+		 * the other workers.
+		 */
+		if (node->is_leader)
+		{
+			/*
+			 * Copy the local TBM information into shared memory before
+			 * waking up the other workers.  Each worker will create its
+			 * own TBM, copy the information from shared memory, and use
+			 * the hash table handle stored there to attach to the shared
+			 * hash table.
+			 */
+			tbm_update_shared_members(tbm, parallel_tbm);
+
+			/* Change the state under a lock */
+			SpinLockAcquire(&pbminfo->state_mutex);
+			pbminfo->state = PBM_FINISHED;
+			SpinLockRelease(&pbminfo->state_mutex);
+
+			/* Wake up all other workers. */
+			ConditionVariableBroadcast(&pbminfo->cv);
+		}
 	}
 
 	for (;;)
 	{
 		Page		dp;
 		ItemId		lp;
+		bool		need_prefetch = false;
 
 		/*
 		 * Get next page of results if needed
 		 */
 		if (tbmres == NULL)
 		{
-			node->tbmres = tbmres = tbm_iterate(tbmiterator);
+			node->tbmres = tbmres = pbms_parallel_iterate(tbmiterator,
+												&pbminfo->tbmiterator,
+												is_parallel);
 			if (tbmres == NULL)
 			{
 				/* no more entries in the bitmap */
@@ -138,17 +262,44 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			}
 
 #ifdef USE_PREFETCH
-			if (node->prefetch_pages > 0)
+			if (is_parallel)
+			{
+				/*
+				 * In parallel mode, acquire prefetch_mutex and check
+				 * prefetch_pages at the shared location.
+				 */
+				SpinLockAcquire(&pbminfo->prefetch_mutex);
+				if (pbminfo->prefetch_pages > 0)
+					pbminfo->prefetch_pages--;
+				else
+					need_prefetch = true;
+				SpinLockRelease(&pbminfo->prefetch_mutex);
+			}
+			else if (node->prefetch_pages > 0)
 			{
 				/* The main iterator has closed the distance by one page */
 				node->prefetch_pages--;
 			}
-			else if (prefetch_iterator)
+			else
+				need_prefetch = true;
+
+			if (prefetch_iterator && need_prefetch)
 			{
-				/* Do not let the prefetch iterator get behind the main one */
-				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+				TBMIterateResult *tbmpre;
 
-				if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
+				/* Do not let the prefetch iterator get behind the main one */
+				tbmpre = pbms_parallel_iterate(prefetch_iterator,
+											   &pbminfo->prefetch_iterator,
+											   is_parallel);
+				/*
+				 * In parallel mode we can only ensure that the prefetch
+				 * iterator is not behind the main iterator; we cannot
+				 * ensure that the current blockno in the main iterator
+				 * and the prefetch iterator are the same, because another
+				 * worker may already be processing the block we are
+				 * prefetching.
+ */ + if (!is_parallel && + (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)) elog(ERROR, "prefetch and main iterators are out of sync"); } #endif /* USE_PREFETCH */ @@ -182,20 +333,25 @@ BitmapHeapNext(BitmapHeapScanState *node) #ifdef USE_PREFETCH - /* - * Increase prefetch target if it's not yet at the max. Note that - * we will increase it to zero after fetching the very first - * page/tuple, then to one after the second tuple is fetched, then - * it doubles as later pages are fetched. - */ - if (node->prefetch_target >= node->prefetch_maximum) - /* don't increase any further */ ; - else if (node->prefetch_target >= node->prefetch_maximum / 2) - node->prefetch_target = node->prefetch_maximum; - else if (node->prefetch_target > 0) - node->prefetch_target *= 2; - else - node->prefetch_target++; + /* Increase prefetch target if it's not yet at the max. */ + if (node->prefetch_target < node->prefetch_maximum) + { + if (!is_parallel) + update_prefetch_target(&node->prefetch_target, + node->prefetch_maximum); + else + { + /* + * If we are in parallel mode then grab prefetch_mutex + * before updating prefetch target. + */ + SpinLockAcquire(&pbminfo->prefetch_mutex); + update_prefetch_target(&pbminfo->prefetch_target, + node->prefetch_maximum); + SpinLockRelease(&pbminfo->prefetch_mutex); + } + } + #endif /* USE_PREFETCH */ } else @@ -211,8 +367,22 @@ BitmapHeapNext(BitmapHeapScanState *node) * Try to prefetch at least a few pages even before we get to the * second page if we don't stop reading after the first tuple. */ - if (node->prefetch_target < node->prefetch_maximum) - node->prefetch_target++; + if (!is_parallel) + { + if (node->prefetch_target < node->prefetch_maximum) + node->prefetch_target++; + } + else + { + /* + * If we are in parallel mode then grab prefetch_mutex + * before updating prefetch target. + */ + SpinLockAcquire(&pbminfo->prefetch_mutex); + if (pbminfo->prefetch_target < node->prefetch_maximum) + pbminfo->prefetch_target++; + SpinLockRelease(&pbminfo->prefetch_mutex); + } #endif /* USE_PREFETCH */ } @@ -236,21 +406,29 @@ BitmapHeapNext(BitmapHeapScanState *node) */ if (prefetch_iterator) { - while (node->prefetch_pages < node->prefetch_target) + if (pbminfo == NULL) { - TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + prefetch_pages(&node->prefetch_pages, + node->prefetch_target, node, scan); + } + else if(node->prefetch_pages < node->prefetch_target) + { + /* + * If we are in parallel mode then grab prefetch_mutex + * before going for prefetch. 
+ */ + SpinLockAcquire(&pbminfo->prefetch_mutex); - if (tbmpre == NULL) - { - /* No more pages to prefetch */ - tbm_end_iterate(prefetch_iterator); - node->prefetch_iterator = prefetch_iterator = NULL; - break; - } - node->prefetch_pages++; - PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + prefetch_pages(&pbminfo->prefetch_pages, + pbminfo->prefetch_target, node, scan); + + SpinLockRelease(&pbminfo->prefetch_mutex); } + + /* Restore the prefetch_iterator */ + prefetch_iterator = node->prefetch_iterator; } + #endif /* USE_PREFETCH */ /* @@ -465,6 +643,22 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node) node->tbmres = NULL; node->prefetch_iterator = NULL; + /* reset parallel bitmap scan info, if present */ + if (node->parallel_bitmap) + { + ParallelBitmapInfo *pbminfo = node->parallel_bitmap; + + pbminfo->state = PBM_INITIAL; + pbminfo->tbmiterator.schunkbit = 0; + pbminfo->tbmiterator.spageptr = 0; + pbminfo->tbmiterator.schunkptr = 0; + pbminfo->prefetch_iterator.schunkbit = 0; + pbminfo->prefetch_iterator.spageptr = 0; + pbminfo->prefetch_iterator.schunkptr = 0; + pbminfo->prefetch_pages = 0; + pbminfo->prefetch_target = -1; + } + ExecScanReScan(&node->ss); /* @@ -567,6 +761,8 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->prefetch_target = 0; /* may be updated below */ scanstate->prefetch_maximum = target_prefetch_pages; + scanstate->is_leader = false; + scanstate->parallel_bitmap = NULL; /* * Miscellaneous initialization @@ -653,3 +849,311 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) */ return scanstate; } + +/*---------------- + * pbms_is_leader + * + * Check if we are the first one to come here, if yes then + * we become leader, otherwise we need to wait until leader + * create the shared TID bitmap and wake us up. + * --------------- + */ +static bool +pbms_is_leader(ParallelBitmapInfo *pbminfo) +{ + bool needWait = false; + bool queuedSelf = false; + bool leader = false; + + for(;;) + { + SpinLockAcquire(&pbminfo->state_mutex); + + /* + * if state is initial then we are the first one to come here + * set the state to in progress and mark ourself as leader + */ + if (pbminfo->state == PBM_INITIAL) + { + pbminfo->state = PBM_INPROGRESS; + leader = true; + } + + /* bitmap create is in progress so we need to wait */ + else if (pbminfo->state == PBM_INPROGRESS) + needWait = true; + + SpinLockRelease(&pbminfo->state_mutex); + + /* + * If we are a leader or else leader has already created a + * tid bitmap. + */ + if (leader || !needWait) + break; + + /* We need to queue */ + if (queuedSelf) + { + /* Sleep until leader send wake up signal */ + ConditionVariableSleep(WAIT_EVENT_PARALLEL_BITMAP_SCAN); + queuedSelf = false; + needWait = false; + } + else if (needWait) + { + /* Add ourself to wait queue */ + ConditionVariablePrepareToSleep(&pbminfo->cv); + queuedSelf = true; + } + } + + /* Cancel the sleep before return */ + ConditionVariableCancelSleep(); + + return leader; +} + +/* ---------------- + * pbms_iterator_init - initialize parallel bitmap scan iterator + * + * ---------------- + */ +static void +pbms_iterator_init(ParallelIterator *target) +{ + SpinLockInit(&target->mutex); + + target->schunkbit = 0; + target->schunkptr = 0; + target->spageptr = 0; +} + +/*------------------- + * pbms_set_parallel + * + * Parallel bitmap heap scan set below index scan node as parallel + * so that it can create shared TIDBitmap. 
+ * ----------------- + */ +static void +pbms_set_parallel(PlanState *node) +{ + /* + * In case of BitmapOr or BitmapAnd set first bitmap index scan node + * as parallel, because only first node will create the main bitmap + * other bitmaps will be merged to the first bitmap so no need to + * create them in shared memory. + */ + switch(node->type) + { + case T_BitmapIndexScanState: + ((BitmapIndexScanState*)node)->biss_Parallel = true; + break; + case T_BitmapOrState: + pbms_set_parallel(((BitmapOrState*)node)->bitmapplans[0]); + break; + case T_BitmapAndState: + pbms_set_parallel(((BitmapAndState*)node)->bitmapplans[0]); + break; + default: + break; + } +} + +/* + * prefetch_pages + * + * Prefetch pages before going for actual processing of the page. + */ +static void +prefetch_pages(int *prefetch_pages, int prefetch_target, + BitmapHeapScanState *node, HeapScanDesc scan) +{ + TBMIterator *iterator = node->prefetch_iterator; + ParallelIterator *parallel_iteartor; + + /* + * If parallel bitmap info available means we are running + * in parallel mode. So use parallel iterator for prefetching. + */ + if (node->parallel_bitmap) + parallel_iteartor = &node->parallel_bitmap->prefetch_iterator; + + while (*prefetch_pages < prefetch_target) + { + TBMIterateResult *tbmpre = pbms_parallel_iterate(iterator, + parallel_iteartor, + node->parallel_bitmap ? true:false); + if (tbmpre == NULL) + { + /* No more pages to prefetch */ + tbm_end_iterate(iterator); + node->prefetch_iterator = NULL; + break; + } + + (*prefetch_pages)++; + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + } +} + +/* + * update_prefetch_target + * + * Update the value of prefetch target + */ +static void +update_prefetch_target(int *prefetch_target, int prefetch_maximum) +{ + /* + * Increase prefetch target if it's not yet at the max. Note that + * we will increase it to zero after fetching the very first + * page/tuple, then to one after the second tuple is fetched, then + * it doubles as later pages are fetched. + */ + if (*prefetch_target >= prefetch_maximum) + /* don't increase any further */ ; + else if (*prefetch_target >= prefetch_maximum / 2) + *prefetch_target = prefetch_maximum; + else if (*prefetch_target > 0) + *prefetch_target *= 2; + else + (*prefetch_target)++; +} + +/* + * pbms_parallel_iterate + * + * Acquire a iterator lock. + * copy iterator state from shared iterator to local iterator. + * Call tbm_iterate and restore the state back to shared iterator. + */ +static TBMIterateResult * +pbms_parallel_iterate(TBMIterator *iterator, + ParallelIterator *parallel_iterator, + bool is_parallel) +{ + TBMIterateResult *output; + + /* If not running in parallel mode then directly call tbm_iterate. */ + if (!is_parallel) + return tbm_iterate(iterator); + + /* + * We are in parallel mode so grab parallel iterator mutex + * before calling iterator. + */ + SpinLockAcquire(¶llel_iterator->mutex); + + /* + * Now we have got lock on iterator so copy information from + * shared location to our local iterator. + */ + iterator->spageptr = parallel_iterator->spageptr; + iterator->schunkptr = parallel_iterator->schunkptr; + iterator->schunkbit = parallel_iterator->schunkbit; + + output = tbm_iterate(iterator); + + /* + * tbm_iterate would have changed the iterator value + * in local iterator so copy them back to shared location + * before releasing the lock. 
+ */ + parallel_iterator->spageptr = iterator->spageptr; + parallel_iterator->schunkptr = iterator->schunkptr; + parallel_iterator->schunkbit = iterator->schunkbit; + + SpinLockRelease(¶llel_iterator->mutex); + + return output; +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapEstimate + * + * estimates the space required to serialize bitmap scan node. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapEstimate(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + EState *estate = node->ss.ps.state; + Size offset = add_size(offsetof(ParallelBitmapInfo, + phs_snapshot_data), + EstimateSnapshotSpace(estate->es_snapshot)); + + /* Estimate the size for sharing parallel TBM info. */ + node->pscan_len = tbm_estimate_parallel_tidbitmap(offset); + + shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len); + shm_toc_estimate_keys(&pcxt->estimator, 1); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapInitializeDSM + * + * Set up a parallel bitmap heap scan descriptor. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt) +{ + ParallelBitmapInfo *pbminfo; + EState *estate = node->ss.ps.state; + Snapshot snapshot; + Size offset = add_size(offsetof(ParallelBitmapInfo, + phs_snapshot_data), + EstimateSnapshotSpace(estate->es_snapshot)); + + pbminfo = shm_toc_allocate(pcxt->toc, node->pscan_len); + + /* Offset to parallel TBM info. */ + pbminfo->ptbm_offset = offset; + + /* Initialize shared tbmiterator and prefetch_iterator */ + pbms_iterator_init(&pbminfo->tbmiterator); + pbms_iterator_init(&pbminfo->prefetch_iterator); + + /* Initialize mutex to protect prefetch pages and target */ + SpinLockInit(&pbminfo->prefetch_mutex); + pbminfo->prefetch_pages = 0; + pbminfo->prefetch_target = -1; + + /* Initialize mutex to protect current state */ + SpinLockInit(&pbminfo->state_mutex); + pbminfo->state = PBM_INITIAL; + + ConditionVariableInit(&pbminfo->cv); + + SerializeSnapshot(estate->es_snapshot, pbminfo->phs_snapshot_data); + + shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pbminfo); + + node->parallel_bitmap = pbminfo; + snapshot = RestoreSnapshot(pbminfo->phs_snapshot_data); + + heap_bm_update_snapshot(node->ss.ss_currentScanDesc, snapshot); +} + +/* ---------------------------------------------------------------- + * ExecBitmapHeapInitializeWorker + * + * Copy relevant information from TOC into planstate. + * ---------------------------------------------------------------- + */ +void +ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, shm_toc *toc) +{ + ParallelBitmapInfo *pbminfo; + Snapshot snapshot; + + pbminfo = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id); + node->parallel_bitmap = pbminfo; + + snapshot = RestoreSnapshot(pbminfo->phs_snapshot_data); + heap_bm_update_snapshot(node->ss.ss_currentScanDesc, snapshot); +} diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c index a364098..9cc5088 100644 --- a/src/backend/executor/nodeBitmapIndexscan.c +++ b/src/backend/executor/nodeBitmapIndexscan.c @@ -82,6 +82,13 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node) } /* + * If parallel flag is set then set flag in TIDBitmap to indicate + * that we need a shared page table. 
+ */ + if (node->biss_Parallel) + tbm_set_parallel(tbm, node->ss.ps.state->es_query_area); + + /* * Get TIDs from index and insert into bitmap */ while (doscan) diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index dfeb7d5..9207741 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -44,6 +44,11 @@ #include "nodes/bitmapset.h" #include "nodes/tidbitmap.h" #include "utils/hsearch.h" +#include "storage/condition_variable.h" +#include "storage/lwlock.h" +#include "storage/shmem.h" +#include "storage/dht.h" +#include "storage/dsa.h" /* * The maximum number of tuples per page is not large (typically 256 with @@ -80,6 +85,8 @@ /* number of active words for a lossy chunk: */ #define WORDS_PER_CHUNK ((PAGES_PER_CHUNK - 1) / BITS_PER_BITMAPWORD + 1) +#define TBM_IS_SHARED(tbm) (tbm)->shared + /* * The hashtable entries are represented by this data structure. For * an exact page, blockno is the page number and bit k of the bitmap @@ -138,24 +145,28 @@ struct TIDBitmap /* these are valid when iterating is true: */ PagetableEntry **spages; /* sorted exact-page list, or NULL */ PagetableEntry **schunks; /* sorted lossy-chunk list, or NULL */ + bool shared; /* need to build shared tbm if set*/ + dht_hash_table_handle hash_handle; /* shared hash table handle */ + dht_hash_table *shared_pagetable; /* dynamic hash table */ + dsa_area *area; /* reference to per-query shared memory area */ }; /* - * When iterating over a bitmap in sorted order, a TBMIterator is used to - * track our progress. There can be several iterators scanning the same - * bitmap concurrently. Note that the bitmap becomes read-only as soon as - * any iterator is created. + * Holds shared members of TIDBitmap for parallel bitmap scan. */ -struct TBMIterator +struct ParallelTIDBitmap { - TIDBitmap *tbm; /* TIDBitmap we're iterating over */ - int spageptr; /* next spages index */ - int schunkptr; /* next schunks index */ - int schunkbit; /* next bit to check in current schunk */ - TBMIterateResult output; /* MUST BE LAST (because variable-size) */ + dht_hash_table_handle hash_handle; /* shared hash table handle */ + dsa_pointer dsa_pages; /* dsa pointers for all kind of pages */ + int nentries; /* number of entries in pagetable */ + int maxentries; /* limit on same to meet maxbytes */ + int npages; /* number of exact entries in pagetable */ + int nchunks; /* number of lossy entries in pagetable */ + int item_count; /* total item in dsa_pages */ + bool inited; /* set true after leader converts page */ + /* table to dsa_pointer's array. 
*/ }; - /* Local function prototypes */ static void tbm_union_page(TIDBitmap *a, const PagetableEntry *bpage); static bool tbm_intersect_page(TIDBitmap *a, PagetableEntry *apage, @@ -167,8 +178,11 @@ static bool tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno); static void tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno); static void tbm_lossify(TIDBitmap *tbm); static int tbm_comparator(const void *left, const void *right); - - +static bool tbm_delete_entry(TIDBitmap *tbm, BlockNumber pageno); +static PagetableEntry* tbm_find_or_insert(TIDBitmap *tbm, BlockNumber pageno, + bool *found); +static PagetableEntry* tbm_find(const TIDBitmap *tbm, BlockNumber pageno); +static void* tbm_seq_search(TIDBitmap *tbm, void *iterator); /* * tbm_create - create an initially-empty bitmap * @@ -244,6 +258,33 @@ tbm_create_pagetable(TIDBitmap *tbm) } /* + * tbm_create_shared_pagetable + * + * Creates shared hash table using DHT for parallel bitmap scan. + */ +static void +tbm_create_shared_pagetable(TIDBitmap *tbm) +{ + dht_parameters params = {0}; + + params.key_size = sizeof(BlockNumber); + params.entry_size = sizeof(PagetableEntry); + params.compare_function = memcmp; + params.hash_function = tag_hash; + + params.tranche_id = LWLockNewTrancheId(); + + /* Create a dynamic hash table */ + tbm->shared_pagetable = dht_create(tbm->area, ¶ms); + if (tbm->shared_pagetable == NULL) + elog(ERROR, "could not create hash table"); + + /* Get the handle so that other backend can attach using this handle */ + tbm->hash_handle = dht_get_hash_table_handle(tbm->shared_pagetable); + tbm->status = TBM_HASH; +} + +/* * tbm_free - free a TIDBitmap */ void @@ -255,6 +296,11 @@ tbm_free(TIDBitmap *tbm) pfree(tbm->spages); if (tbm->schunks) pfree(tbm->schunks); + + /* If we have shared page table then detach from it */ + if (tbm->shared_pagetable) + dht_detach(tbm->shared_pagetable); + pfree(tbm); } @@ -431,6 +477,14 @@ void tbm_intersect(TIDBitmap *a, const TIDBitmap *b) { Assert(!a->iterating); + + /* + * In case of BitmapAnd, only first node will create shared TBM + * and all other node will have local TBM which will be merged + * with first shared TBM. + */ + Assert(!TBM_IS_SHARED(b)); + /* Nothing to do if a is empty */ if (a->nentries == 0) return; @@ -449,12 +503,34 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b) } else { + void *iterator; + dht_iterator dhtiterator; HASH_SEQ_STATUS status; PagetableEntry *apage; Assert(a->status == TBM_HASH); - hash_seq_init(&status, a->pagetable); - while ((apage = (PagetableEntry *) hash_seq_search(&status)) != NULL) + + /* + * If we are using shared page table then use DHT iterator. + */ + if (TBM_IS_SHARED(a)) + { + iterator = &dhtiterator; + dht_iterate_begin(a->shared_pagetable, &dhtiterator, false); + } + else + { + iterator = &status; + hash_seq_init(&status, a->pagetable); + } + + /* + * scan complete hash table using unified tbm_seq_search function this + * function will take care to scan DHT if it's shared page table + * otherwise dynahash. + */ + while ((apage = + (PagetableEntry *) tbm_seq_search(a, iterator)) != NULL) { if (tbm_intersect_page(a, apage, b)) { @@ -464,12 +540,21 @@ tbm_intersect(TIDBitmap *a, const TIDBitmap *b) else a->npages--; a->nentries--; - if (hash_search(a->pagetable, - (void *) &apage->blockno, - HASH_REMOVE, NULL) == NULL) + + /* + * If we are using shared hash then we need to + * release the lock on element. 
+ */ + if (TBM_IS_SHARED(a)) + dht_iterate_release(&dhtiterator); + + if (!tbm_delete_entry(a, apage->blockno)) elog(ERROR, "hash table corrupted"); } } + + if (TBM_IS_SHARED(a)) + dht_iterate_end(&dhtiterator); } } @@ -579,7 +664,7 @@ tbm_is_empty(const TIDBitmap *tbm) * contents repeatedly, including parallel scans. */ TBMIterator * -tbm_begin_iterate(TIDBitmap *tbm) +tbm_begin_iterate(TIDBitmap *tbm, ParallelTIDBitmap *parallel_info) { TBMIterator *iterator; @@ -592,11 +677,16 @@ tbm_begin_iterate(TIDBitmap *tbm) iterator->tbm = tbm; /* - * Initialize iteration pointers. + * Initialize iteration pointers, only if it's not shared tbm. + * In case of shared tbm, we will copy these values from + * shared iterator before calling tbm_iterate. */ - iterator->spageptr = 0; - iterator->schunkptr = 0; - iterator->schunkbit = 0; + if (!TBM_IS_SHARED(tbm)) + { + iterator->spageptr = 0; + iterator->schunkptr = 0; + iterator->schunkbit = 0; + } /* * If we have a hashtable, create and fill the sorted page lists, unless @@ -606,7 +696,6 @@ tbm_begin_iterate(TIDBitmap *tbm) */ if (tbm->status == TBM_HASH && !tbm->iterating) { - HASH_SEQ_STATUS status; PagetableEntry *page; int npages; int nchunks; @@ -620,15 +709,84 @@ tbm_begin_iterate(TIDBitmap *tbm) MemoryContextAlloc(tbm->mcxt, tbm->nchunks * sizeof(PagetableEntry *)); - hash_seq_init(&status, tbm->pagetable); - npages = nchunks = 0; - while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL) + /* + * If we have shared TBM means we are running in parallel mode. + * So iterate over DHT and construct page and chunk array. + * + * First leader worker will create array of dsa pointers which + * will holds dsa pointers for both pages and chunks, later + * while converting to local pointers we will identify them + * and copy in their respective array. + */ + if (TBM_IS_SHARED(tbm)) { - if (page->ischunk) - tbm->schunks[nchunks++] = page; - else - tbm->spages[npages++] = page; + dsa_pointer *dsa_spages; + dht_hash_table_item *item; + int ncount = 0; + int i; + + /* + * Iterate over DHT and create array of dsa_pointers. + * Only leader will perform this step after that inited + * flag will be set. + */ + if (!(parallel_info->inited) && + (tbm->npages > 0 || tbm->nchunks > 0)) + { + dht_iterator dhtiterator; + dsa_pointer dsa_page; + + parallel_info->dsa_pages = dsa_allocate(tbm->area, + (tbm->nchunks + tbm->npages) * sizeof(dsa_pointer)); + + dsa_spages = + dsa_get_address(tbm->area, parallel_info->dsa_pages); + + dht_iterate_begin(tbm->shared_pagetable, &dhtiterator, false); + while ((dsa_page = dht_iterate_next_dsa(&dhtiterator)) != + InvalidDsaPointer) + dsa_spages[ncount++] = dsa_page; + + parallel_info->inited = true; + parallel_info->item_count = ncount; + } + + /* + * This step will be done by all the workers including leader. + * Here we need to convert array of dsa pointers to local + * page and chunk array. 
+ */ + dsa_spages = dsa_get_address(tbm->area, parallel_info->dsa_pages); + npages = nchunks = 0; + for (i = 0; i < parallel_info->item_count; i++) + { + item = dsa_get_address(tbm->area, dsa_spages[i]); + page = (PagetableEntry*)&(item->entry); + + if (page->ischunk) + tbm->schunks[nchunks++] = page; + else + tbm->spages[npages++] = page; + } + } + else + { + HASH_SEQ_STATUS status; + + /* Process local hash table, if we are not in parallel mode */ + npages = nchunks = 0; + + hash_seq_init(&status, tbm->pagetable); + while ((page = + (PagetableEntry *) hash_seq_search(&status)) != NULL) + { + if (page->ischunk) + tbm->schunks[nchunks++] = page; + else + tbm->spages[npages++] = page; + } } + Assert(npages == tbm->npages); Assert(nchunks == tbm->nchunks); if (npages > 1) @@ -791,11 +949,18 @@ tbm_find_pageentry(const TIDBitmap *tbm, BlockNumber pageno) return page; } - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &pageno, - HASH_FIND, NULL); + page = (PagetableEntry *) tbm_find(tbm, pageno); if (page == NULL) return NULL; + + /* + * If it's from shared hash table then release the entry. + * Only one worker is building hash table so it's okay to + * release it here. + */ + if (TBM_IS_SHARED(tbm)) + dht_release(tbm->shared_pagetable, (void*)page); + if (page->ischunk) return NULL; /* don't want a lossy chunk header */ return page; @@ -815,7 +980,11 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) PagetableEntry *page; bool found; - if (tbm->status == TBM_EMPTY) + /* + * Use fixed slot only if it's local tidbitmap, If shared bitmap + * then directly insert into shared hash table. + */ + if ((tbm->status == TBM_EMPTY) && !TBM_IS_SHARED(tbm)) { /* Use the fixed slot */ page = &tbm->entry1; @@ -824,6 +993,7 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) } else { + /* In case of shared TBM, status will never be TBM_ONE_PAGE */ if (tbm->status == TBM_ONE_PAGE) { page = &tbm->entry1; @@ -833,10 +1003,17 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) tbm_create_pagetable(tbm); } + /* + * In case of parallel bitmap scan, we don't switch from TBM_EMPTY + * to TBM_ONE_PAGE. So even if we are here we may not have hash + * table ready, so if tbm->status is not yet TBM_HASH then create + * shared page table. + */ + if (tbm->status != TBM_HASH && TBM_IS_SHARED(tbm)) + tbm_create_shared_pagetable(tbm); + /* Look up or create an entry */ - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &pageno, - HASH_ENTER, &found); + page = tbm_find_or_insert(tbm, pageno, &found); } /* Initialize it if not present before */ @@ -849,6 +1026,10 @@ tbm_get_pageentry(TIDBitmap *tbm, BlockNumber pageno) tbm->npages++; } + /* Release the entry lock if it's from shared hash table */ + if (page && TBM_IS_SHARED(tbm)) + dht_release(tbm->shared_pagetable, (void*)page); + return page; } @@ -869,9 +1050,17 @@ tbm_page_is_lossy(const TIDBitmap *tbm, BlockNumber pageno) bitno = pageno % PAGES_PER_CHUNK; chunk_pageno = pageno - bitno; - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &chunk_pageno, - HASH_FIND, NULL); + + page = tbm_find(tbm, chunk_pageno); + + /* + * If entry is from shared page table then release the entry. + * Currently only one worker is building the bitmap so it's + * fine to release it here. 
+ */ + if (page && TBM_IS_SHARED(tbm)) + dht_release(tbm->shared_pagetable, (void*)page); + if (page != NULL && page->ischunk) { int wordnum = WORDNUM(bitno); @@ -901,7 +1090,13 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) /* We force the bitmap into hashtable mode whenever it's lossy */ if (tbm->status != TBM_HASH) - tbm_create_pagetable(tbm); + { + /* Create shared page table if we are in parallel mode. */ + if (TBM_IS_SHARED(tbm)) + tbm_create_shared_pagetable(tbm); + else + tbm_create_pagetable(tbm); + } bitno = pageno % PAGES_PER_CHUNK; chunk_pageno = pageno - bitno; @@ -912,20 +1107,16 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) */ if (bitno != 0) { - if (hash_search(tbm->pagetable, - (void *) &pageno, - HASH_REMOVE, NULL) != NULL) + if (tbm_delete_entry(tbm, pageno)) { - /* It was present, so adjust counts */ - tbm->nentries--; - tbm->npages--; /* assume it must have been non-lossy */ + /* It was present, so adjust counts */ + tbm->nentries--; + tbm->npages--; /* assume it must have been non-lossy */ } } /* Look up or create entry for chunk-header page */ - page = (PagetableEntry *) hash_search(tbm->pagetable, - (void *) &chunk_pageno, - HASH_ENTER, &found); + page = (PagetableEntry *)tbm_find_or_insert(tbm, chunk_pageno, &found); /* Initialize it if not present before */ if (!found) @@ -954,6 +1145,10 @@ tbm_mark_page_lossy(TIDBitmap *tbm, BlockNumber pageno) wordnum = WORDNUM(bitno); bitnum = BITNUM(bitno); page->words[wordnum] |= ((bitmapword) 1 << bitnum); + + /* Unlock an entry which was locked by dht_find_or_insert. */ + if (TBM_IS_SHARED(tbm)) + dht_release(tbm->shared_pagetable, (void*)page); } /* @@ -964,21 +1159,28 @@ tbm_lossify(TIDBitmap *tbm) { HASH_SEQ_STATUS status; PagetableEntry *page; + dht_iterator dhtiterator; + void *iterator; - /* - * XXX Really stupid implementation: this just lossifies pages in - * essentially random order. We should be paying some attention to the - * number of bits set in each page, instead. - * - * Since we are called as soon as nentries exceeds maxentries, we should - * push nentries down to significantly less than maxentries, or else we'll - * just end up doing this again very soon. We shoot for maxentries/2. - */ Assert(!tbm->iterating); Assert(tbm->status == TBM_HASH); - hash_seq_init(&status, tbm->pagetable); - while ((page = (PagetableEntry *) hash_seq_search(&status)) != NULL) + /* + * If we are using shared page table then use DHT iterator otherwise + * dyna hash iterator + */ + if (TBM_IS_SHARED(tbm)) + { + iterator = &dhtiterator; + dht_iterate_begin(tbm->shared_pagetable, &dhtiterator, false); + } + else + { + iterator = &status; + hash_seq_init(&status, tbm->pagetable); + } + + while ((page = (PagetableEntry *) tbm_seq_search(tbm, iterator)) != NULL) { if (page->ischunk) continue; /* already a chunk header */ @@ -990,21 +1192,23 @@ tbm_lossify(TIDBitmap *tbm) if ((page->blockno % PAGES_PER_CHUNK) == 0) continue; + /* If we are using shared hash then release the lock on element. */ + if (TBM_IS_SHARED(tbm)) + dht_iterate_release(&dhtiterator); + /* This does the dirty work ... */ tbm_mark_page_lossy(tbm, page->blockno); if (tbm->nentries <= tbm->maxentries / 2) { /* we have done enough */ - hash_seq_term(&status); + if (TBM_IS_SHARED(tbm)) + dht_iterate_end(&dhtiterator); + else + hash_seq_term(&status); + break; } - - /* - * Note: tbm_mark_page_lossy may have inserted a lossy chunk into the - * hashtable. 
We can continue the same seq_search scan since we do - * not care whether we visit lossy chunks or not. - */ } /* @@ -1036,3 +1240,171 @@ tbm_comparator(const void *left, const void *right) return 1; return 0; } + +/* + * tbm_attach_to_pagetable + * + * Attach worker to shared TID bitmap created by leader worker. + */ +void +tbm_attach_to_pagetable(TIDBitmap *tbm, ParallelTIDBitmap *stbm) +{ + dht_parameters params = {0}; + + params.key_size = sizeof(BlockNumber); + params.entry_size = sizeof(PagetableEntry); + params.compare_function = memcmp; + params.hash_function = tag_hash; + + tbm->shared_pagetable = dht_attach(tbm->area, ¶ms, stbm->hash_handle); + + tbm->status = TBM_HASH; + tbm->nchunks = stbm->nchunks; + tbm->nentries = stbm->nentries; + tbm->npages = stbm->npages; + tbm->maxentries = stbm->maxentries; + tbm->shared = true; +} + +/* + * tbm_update_shared_members + * + * Restore leaders private tbm state to shared location. This must + * be called before waking up the other worker. + */ +void +tbm_update_shared_members(TIDBitmap *tbm, ParallelTIDBitmap *parallel_tbm) +{ + /* + * Copy private information to shared location before + * waking up the other workers. + */ + parallel_tbm->maxentries = tbm->maxentries; + parallel_tbm->nchunks = tbm->nchunks; + parallel_tbm->nentries = tbm->nentries; + parallel_tbm->npages = tbm->npages; + parallel_tbm->hash_handle = tbm->hash_handle; +} + +/* + * tbm_delete_entry + * + * Unified function to delete an entry from tbm page table + * if tbm is shared then it will operate on from shared hash + * table otherwise on local hash table. + */ +static bool +tbm_delete_entry(TIDBitmap *tbm, BlockNumber pageno) +{ + bool result = false; + if (TBM_IS_SHARED(tbm)) + { + /* Look up or create an entry */ + if (dht_delete_key(tbm->shared_pagetable, (void *) &pageno)) + result = true; + } + else + { + if (hash_search(tbm->pagetable, + (void *) &pageno, + HASH_REMOVE, NULL) != NULL) + result = true; + } + + return result; +} + +/* + * tbm_find_or_insert + * + * Unified function to find or insert an entry in page table. + * If tbm is shared then it will operate on from shared hash + * table otherwise on local hash table. + */ +static PagetableEntry* +tbm_find_or_insert(TIDBitmap *tbm, BlockNumber chunk_pageno, bool *found) +{ + PagetableEntry *page; + + if (TBM_IS_SHARED(tbm)) + page = (PagetableEntry *) dht_find_or_insert(tbm->shared_pagetable, + (void *) &chunk_pageno, + found); + else + page = (PagetableEntry *) hash_search(tbm->pagetable, + (void *) &chunk_pageno, + HASH_ENTER, found); + + return page; +} + +/* + * tbm_find + * + * Unified function to find an entry from tbm page table + * if tbm is shared then it will operate on shared hash + * table otherwise on local hash table. + */ +static PagetableEntry* +tbm_find(const TIDBitmap *tbm, BlockNumber pageno) +{ + PagetableEntry *page; + + if (TBM_IS_SHARED(tbm)) + page = (PagetableEntry *) dht_find(tbm->shared_pagetable, + (void *) &pageno, + false); + else + page = (PagetableEntry *) hash_search(tbm->pagetable, + (void *) &pageno, + HASH_FIND, NULL); + return page; +} + +/* + * tbm_set_parallel + * + * Mark tidbitmap as shared and also store DSA area in it. + * marking tidbitmap as shared is indication that this tidbitmap + * should be created in shared memory. DSA area will be used for + * creating bitmap using dynamic shared memory hash table. 
+ */ +void +tbm_set_parallel(TIDBitmap *tbm, void *area) +{ + tbm->shared = true; + tbm->area = (dsa_area*)area; +} + +/* + * tbm_seq_search + * + * Unified function to scan full tbm page table + * if tbm is shared then it will operate on shared hash + * table otherwise on local hash table. + */ +static void* +tbm_seq_search(TIDBitmap *tbm, void *iterator) +{ + if (TBM_IS_SHARED(tbm)) + { + dht_iterator *dhtiterator = (dht_iterator*)iterator; + + return dht_iterate_next(dhtiterator); + } + else + { + HASH_SEQ_STATUS *status = (HASH_SEQ_STATUS*)iterator; + + return hash_seq_search(status); + } +} + +/* + * tbm_estimate_parallel_tidbitmap + * Estimate size of shared TIDBitmap related info. + */ +Size tbm_estimate_parallel_tidbitmap(Size size) +{ + return add_size(size, sizeof(ParallelTIDBitmap)); +} diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index e42ef98..058e55a 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -126,6 +126,7 @@ static void subquery_push_qual(Query *subquery, static void recurse_push_qual(Node *setOp, Query *topquery, RangeTblEntry *rte, Index rti, Node *qual); static void remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel); +static int compute_parallel_worker(RelOptInfo *rel, BlockNumber pages); /* @@ -678,49 +679,7 @@ create_plain_partial_paths(PlannerInfo *root, RelOptInfo *rel) { int parallel_workers; - /* - * If the user has set the parallel_workers reloption, use that; otherwise - * select a default number of workers. - */ - if (rel->rel_parallel_workers != -1) - parallel_workers = rel->rel_parallel_workers; - else - { - int parallel_threshold; - - /* - * If this relation is too small to be worth a parallel scan, just - * return without doing anything ... unless it's an inheritance child. - * In that case, we want to generate a parallel path here anyway. It - * might not be worthwhile just for this relation, but when combined - * with all of its inheritance siblings it may well pay off. - */ - if (rel->pages < (BlockNumber) min_parallel_relation_size && - rel->reloptkind == RELOPT_BASEREL) - return; - - /* - * Select the number of workers based on the log of the size of the - * relation. This probably needs to be a good deal more - * sophisticated, but we need something here for now. Note that the - * upper limit of the min_parallel_relation_size GUC is chosen to - * prevent overflow here. - */ - parallel_workers = 1; - parallel_threshold = Max(min_parallel_relation_size, 1); - while (rel->pages >= (BlockNumber) (parallel_threshold * 3)) - { - parallel_workers++; - parallel_threshold *= 3; - if (parallel_threshold > INT_MAX / 3) - break; /* avoid overflow */ - } - } - - /* - * In no case use more than max_parallel_workers_per_gather workers. - */ - parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather); + parallel_workers = compute_parallel_worker(rel, rel->pages); /* If any limit was set to zero, the user doesn't want a parallel scan. 
*/ if (parallel_workers <= 0) @@ -2866,6 +2825,84 @@ remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel) } } +/* + * create_partial_bitmap_paths + * Build partial access paths for parallel scan of a plain relation + */ +void +create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, + Path *bitmapqual) +{ + int parallel_workers; + double pages_fetched; + + /* Compute heap pages for bitmap heap scan */ + pages_fetched = compute_bitmap_pages(root, rel, bitmapqual, 1.0, + NULL, NULL); + + parallel_workers = compute_parallel_worker(rel, pages_fetched); + + /* If any limit was set to zero, the user doesn't want a parallel scan. */ + if (parallel_workers <= 0) + return; + + add_partial_path(rel, (Path*)create_bitmap_heap_path(root, rel, + bitmapqual, rel->lateral_relids, 1.0, parallel_workers)); +} + +static int +compute_parallel_worker(RelOptInfo *rel, BlockNumber pages) +{ + int parallel_workers; + + /* + * If the user has set the parallel_workers reloption, use that; otherwise + * select a default number of workers. + */ + if (rel->rel_parallel_workers != -1) + parallel_workers = rel->rel_parallel_workers; + else + { + int parallel_threshold; + + /* + * If this relation is too small to be worth a parallel scan, just + * return without doing anything ... unless it's an inheritance child. + * In that case, we want to generate a parallel path here anyway. It + * might not be worthwhile just for this relation, but when combined + * with all of its inheritance siblings it may well pay off. + */ + if (pages < (BlockNumber) min_parallel_relation_size && + rel->reloptkind == RELOPT_BASEREL) + return 0; + + /* + * Select the number of workers based on the log of the size of the + * relation. This probably needs to be a good deal more + * sophisticated, but we need something here for now. Note that the + * upper limit of the min_parallel_relation_size GUC is chosen to + * prevent overflow here. + */ + parallel_workers = 1; + parallel_threshold = Max(min_parallel_relation_size, 1); + while (pages >= (BlockNumber) (parallel_threshold * 3)) + { + parallel_workers++; + parallel_threshold *= 3; + if (parallel_threshold > INT_MAX / 3) + break; /* avoid overflow */ + } + } + + /* + * In no case use more than max_parallel_workers_per_gather workers. + */ + parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather); + + return parallel_workers; +} + + /***************************************************************************** * DEBUG SUPPORT *****************************************************************************/ diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 2a49639..c08997a 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -161,7 +161,7 @@ static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root, static void set_rel_width(PlannerInfo *root, RelOptInfo *rel); static double relation_byte_size(double tuples, int width); static double page_size(double tuples, int width); - +static Cost adjust_cost_for_parallelism(Path *path, Cost cpu_run_cost); /* * clamp_row_est @@ -237,44 +237,7 @@ cost_seqscan(Path *path, PlannerInfo *root, /* Adjust costing for parallelism, if used. 
*/ if (path->parallel_workers > 0) - { - double parallel_divisor = path->parallel_workers; - double leader_contribution; - - /* - * Early experience with parallel query suggests that when there is - * only one worker, the leader often makes a very substantial - * contribution to executing the parallel portion of the plan, but as - * more workers are added, it does less and less, because it's busy - * reading tuples from the workers and doing whatever non-parallel - * post-processing is needed. By the time we reach 4 workers, the - * leader no longer makes a meaningful contribution. Thus, for now, - * estimate that the leader spends 30% of its time servicing each - * worker, and the remainder executing the parallel plan. - */ - leader_contribution = 1.0 - (0.3 * path->parallel_workers); - if (leader_contribution > 0) - parallel_divisor += leader_contribution; - - /* - * In the case of a parallel plan, the row count needs to represent - * the number of tuples processed per worker. Otherwise, higher-level - * plan nodes that appear below the gather will be costed incorrectly, - * because they'll anticipate receiving more rows than any given copy - * will actually get. - */ - path->rows = clamp_row_est(path->rows / parallel_divisor); - - /* The CPU cost is divided among all the workers. */ - cpu_run_cost /= parallel_divisor; - - /* - * It may be possible to amortize some of the I/O cost, but probably - * not very much, because most operating systems already do aggressive - * prefetching. For now, we assume that the disk run cost can't be - * amortized at all. - */ - } + cpu_run_cost = adjust_cost_for_parallelism(path, cpu_run_cost); path->startup_cost = startup_cost; path->total_cost = startup_cost + cpu_run_cost + disk_run_cost; @@ -831,10 +794,10 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, Cost startup_cost = 0; Cost run_cost = 0; Cost indexTotalCost; - Selectivity indexSelectivity; QualCost qpqual_cost; Cost cpu_per_tuple; Cost cost_per_page; + Cost cpu_run_cost; double tuples_fetched; double pages_fetched; double spc_seq_page_cost, @@ -855,13 +818,12 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, if (!enable_bitmapscan) startup_cost += disable_cost; - /* - * Fetch total cost of obtaining the bitmap, as well as its total - * selectivity. - */ - cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity); + pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual, + loop_count, &indexTotalCost, + &tuples_fetched); startup_cost += indexTotalCost; + T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; /* Fetch estimated page costs for tablespace containing table. */ get_tablespace_page_costs(baserel->reltablespace, @@ -869,41 +831,6 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, &spc_seq_page_cost); /* - * Estimate number of main-table pages fetched. - */ - tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); - - T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; - - if (loop_count > 1) - { - /* - * For repeated bitmap scans, scale up the number of tuples fetched in - * the Mackert and Lohman formula by the number of scans, so that we - * estimate the number of pages fetched by all the scans. Then - * pro-rate for one scan. 
- */ - pages_fetched = index_pages_fetched(tuples_fetched * loop_count, - baserel->pages, - get_indexpath_pages(bitmapqual), - root); - pages_fetched /= loop_count; - } - else - { - /* - * For a single scan, the number of heap pages that need to be fetched - * is the same as the Mackert and Lohman formula for the case T <= b - * (ie, no re-reads needed). - */ - pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); - } - if (pages_fetched >= T) - pages_fetched = T; - else - pages_fetched = ceil(pages_fetched); - - /* * For small numbers of pages we should charge spc_random_page_cost * apiece, while if nearly all the table's pages are being read, it's more * appropriate to charge spc_seq_page_cost apiece. The effect is @@ -932,8 +859,13 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, startup_cost += qpqual_cost.startup; cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple; + cpu_run_cost = cpu_per_tuple * tuples_fetched; - run_cost += cpu_per_tuple * tuples_fetched; + /* Adjust costing for parallelism, if used. */ + if (path->parallel_workers > 0) + cpu_run_cost = adjust_cost_for_parallelism(path, cpu_run_cost); + + run_cost += cpu_run_cost; /* tlist eval costs are paid per output row, not per tuple scanned */ startup_cost += path->pathtarget->cost.startup; @@ -944,6 +876,56 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel, } /* + * adjust_cost_for_parallelism + * + * Adjust the cpu cost based on number of parallel workers, also update + * the number of rows processed at each worker. + */ +static Cost +adjust_cost_for_parallelism(Path *path, Cost cpu_run_cost) +{ + double parallel_divisor = path->parallel_workers; + double leader_contribution; + Cost cpu_cost = cpu_run_cost; + + /* + * Early experience with parallel query suggests that when there is + * only one worker, the leader often makes a very substantial + * contribution to executing the parallel portion of the plan, but as + * more workers are added, it does less and less, because it's busy + * reading tuples from the workers and doing whatever non-parallel + * post-processing is needed. By the time we reach 4 workers, the + * leader no longer makes a meaningful contribution. Thus, for now, + * estimate that the leader spends 30% of its time servicing each + * worker, and the remainder executing the parallel plan. + */ + leader_contribution = 1.0 - (0.3 * path->parallel_workers); + if (leader_contribution > 0) + parallel_divisor += leader_contribution; + + /* + * In the case of a parallel plan, the row count needs to represent + * the number of tuples processed per worker. Otherwise, higher-level + * plan nodes that appear below the gather will be costed incorrectly, + * because they'll anticipate receiving more rows than any given copy + * will actually get. + */ + path->rows = clamp_row_est(path->rows / parallel_divisor); + + /* The CPU cost is divided among all the workers. */ + cpu_cost /= parallel_divisor; + + /* + * It may be possible to amortize some of the I/O cost, but probably + * not very much, because most operating systems already do aggressive + * prefetching. For now, we assume that the disk run cost can't be + * amortized at all. 
+ */ + + return cpu_cost; +} + +/* * cost_bitmap_tree_node * Extract cost and selectivity from a bitmap tree node (index/and/or) */ @@ -4765,3 +4747,69 @@ page_size(double tuples, int width) { return ceil(relation_byte_size(tuples, width) / BLCKSZ); } + +/* + * compute_bitmap_pages + * + * compute number of pages fetched from heap in bitmap heap scan. + */ +double +compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual, + int loop_count, Cost *cost, double *tuple) +{ + Cost indexTotalCost; + Selectivity indexSelectivity; + double T; + double pages_fetched; + double tuples_fetched; + + /* + * Fetch total cost of obtaining the bitmap, as well as its total + * selectivity. + */ + cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity); + + /* + * Estimate number of main-table pages fetched. + */ + tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples); + + T = (baserel->pages > 1) ? (double) baserel->pages : 1.0; + + if (loop_count > 1) + { + /* + * For repeated bitmap scans, scale up the number of tuples fetched in + * the Mackert and Lohman formula by the number of scans, so that we + * estimate the number of pages fetched by all the scans. Then + * pro-rate for one scan. + */ + pages_fetched = index_pages_fetched(tuples_fetched * loop_count, + baserel->pages, + get_indexpath_pages(bitmapqual), + root); + pages_fetched /= loop_count; + } + else + { + /* + * For a single scan, the number of heap pages that need to be fetched + * is the same as the Mackert and Lohman formula for the case T <= b + * (ie, no re-reads needed). + */ + pages_fetched = + (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched); + } + + if (pages_fetched >= T) + pages_fetched = T; + else + pages_fetched = ceil(pages_fetched); + + if (cost) + *cost = indexTotalCost; + if (tuple) + *tuple = tuples_fetched; + + return pages_fetched; +} diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 2952bfb..1a7bde0 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -337,8 +337,12 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) bitmapqual = choose_bitmap_and(root, rel, bitindexpaths); bpath = create_bitmap_heap_path(root, rel, bitmapqual, - rel->lateral_relids, 1.0); + rel->lateral_relids, 1.0, 0); add_path(rel, (Path *) bpath); + + /* create a partial bitmap heap path */ + if (rel->consider_parallel && rel->lateral_relids == NULL) + create_partial_bitmap_paths(root, rel, bitmapqual); } /* @@ -410,7 +414,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel) required_outer = get_bitmap_tree_required_outer(bitmapqual); loop_count = get_loop_count(root, rel->relid, required_outer); bpath = create_bitmap_heap_path(root, rel, bitmapqual, - required_outer, loop_count); + required_outer, loop_count, 0); add_path(rel, (Path *) bpath); } } diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 47158f6..4ffcf87 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -2803,7 +2803,7 @@ create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, plan->plan_rows = clamp_row_est(ipath->indexselectivity * ipath->path.parent->tuples); plan->plan_width = 0; /* meaningless */ - plan->parallel_aware = false; + plan->parallel_aware = bitmapqual->parallel_aware; *qual = get_actual_clauses(ipath->indexclauses); *indexqual = get_actual_clauses(ipath->indexquals); foreach(l, ipath->indexinfo->indpred) diff --git 
a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index abb7507..0801b6e 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -47,7 +47,6 @@ typedef enum static List *translate_sub_tlist(List *tlist, int relid); - /***************************************************************************** * MISC. PATH UTILITIES *****************************************************************************/ @@ -1071,7 +1070,8 @@ create_bitmap_heap_path(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual, Relids required_outer, - double loop_count) + double loop_count, + int parallel_degree) { BitmapHeapPath *pathnode = makeNode(BitmapHeapPath); @@ -1080,11 +1080,10 @@ create_bitmap_heap_path(PlannerInfo *root, pathnode->path.pathtarget = rel->reltarget; pathnode->path.param_info = get_baserel_parampathinfo(root, rel, required_outer); - pathnode->path.parallel_aware = false; + pathnode->path.parallel_aware = parallel_degree > 0 ? true : false; pathnode->path.parallel_safe = rel->consider_parallel; - pathnode->path.parallel_workers = 0; + pathnode->path.parallel_workers = parallel_degree; pathnode->path.pathkeys = NIL; /* always unordered */ - pathnode->bitmapqual = bitmapqual; cost_bitmap_heap_scan(&pathnode->path, root, rel, @@ -3192,7 +3191,7 @@ reparameterize_path(PlannerInfo *root, Path *path, rel, bpath->bitmapqual, required_outer, - loop_count); + loop_count, 0); } case T_SubqueryScan: { diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 5c6cb6b..3e48afe 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3382,6 +3382,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_PARALLEL_FINISH: event_name = "ParallelFinish"; break; + case WAIT_EVENT_PARALLEL_BITMAP_SCAN: + event_name = "ParallelBitmapScan"; + break; case WAIT_EVENT_SAFE_SNAPSHOT: event_name = "SafeSnapshot"; break; diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 0d12bbb..7cf0a3c 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -23,7 +23,6 @@ #include "utils/relcache.h" #include "utils/snapshot.h" - /* "options" flag bits for heap_insert */ #define HEAP_INSERT_SKIP_WAL 0x0001 #define HEAP_INSERT_SKIP_FSM 0x0002 @@ -178,6 +177,7 @@ extern void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup); extern void heap_sync(Relation relation); +extern void heap_bm_update_snapshot(HeapScanDesc scan, Snapshot snapshot); /* in heap/pruneheap.c */ extern void heap_page_prune_opt(Relation relation, Buffer buffer); diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h index 0ed9c78..8afecb7 100644 --- a/src/include/executor/nodeBitmapHeapscan.h +++ b/src/include/executor/nodeBitmapHeapscan.h @@ -15,10 +15,17 @@ #define NODEBITMAPHEAPSCAN_H #include "nodes/execnodes.h" +#include "access/parallel.h" extern BitmapHeapScanState *ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags); extern TupleTableSlot *ExecBitmapHeapScan(BitmapHeapScanState *node); extern void ExecEndBitmapHeapScan(BitmapHeapScanState *node); extern void ExecReScanBitmapHeapScan(BitmapHeapScanState *node); +extern void ExecBitmapHeapEstimate(BitmapHeapScanState *node, + ParallelContext *pcxt); +extern void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, + ParallelContext *pcxt); +extern void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, + shm_toc *toc); #endif /* NODEBITMAPHEAPSCAN_H */ 
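The new ExecBitmapHeapEstimate/ExecBitmapHeapInitializeDSM/ExecBitmapHeapInitializeWorker entry points declared above follow the usual pattern for parallel-aware scan nodes: reserve a DSM chunk while the parallel context is being sized, fill it in from the leader, and look it up from each worker. Their bodies are not part of this hunk; the sketch below is only an illustration of that wiring, assuming (hypothetically) that the chunk holds the ParallelBitmapInfo added to execnodes.h below, with the serialized snapshot in phs_snapshot_data followed by the ParallelTIDBitmap, that tbm_estimate_parallel_tidbitmap() adds the bitmap's shared-state size to the size passed in, and that the plan node id is used as the shm_toc key (two-argument shm_toc_lookup(), as in this tree).

#include "postgres.h"
#include "access/heapam.h"
#include "executor/nodeBitmapHeapscan.h"
#include "storage/condition_variable.h"
#include "storage/shmem.h"			/* add_size */
#include "storage/spin.h"
#include "utils/rel.h"
#include "utils/snapmgr.h"

/* Illustrative sketch only -- not the patch's actual implementation. */
void
ExecBitmapHeapEstimate(BitmapHeapScanState *node, ParallelContext *pcxt)
{
	EState	   *estate = node->ss.ps.state;
	Size		size;

	/* ParallelBitmapInfo header plus the serialized snapshot ... */
	size = MAXALIGN(add_size(offsetof(ParallelBitmapInfo, phs_snapshot_data),
							 EstimateSnapshotSpace(estate->es_snapshot)));
	/* ... followed by the shared TIDBitmap state */
	node->pscan_len = tbm_estimate_parallel_tidbitmap(size);

	shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
	shm_toc_estimate_keys(&pcxt->estimator, 1);
}

void
ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node, ParallelContext *pcxt)
{
	EState	   *estate = node->ss.ps.state;
	Size		snap_len = EstimateSnapshotSpace(estate->es_snapshot);
	ParallelBitmapInfo *pbminfo;

	pbminfo = shm_toc_allocate(pcxt->toc, node->pscan_len);

	pbminfo->relid = RelationGetRelid(node->ss.ss_currentRelation);
	pbminfo->state = PBM_INITIAL;
	pbminfo->prefetch_pages = 0;	/* illustrative defaults */
	pbminfo->prefetch_target = 0;
	pbminfo->ptbm_offset = MAXALIGN(add_size(offsetof(ParallelBitmapInfo,
													  phs_snapshot_data),
											 snap_len));

	SpinLockInit(&pbminfo->tbmiterator.mutex);
	SpinLockInit(&pbminfo->prefetch_iterator.mutex);
	SpinLockInit(&pbminfo->prefetch_mutex);
	SpinLockInit(&pbminfo->state_mutex);
	ConditionVariableInit(&pbminfo->cv);

	SerializeSnapshot(estate->es_snapshot, pbminfo->phs_snapshot_data);

	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pbminfo);
	node->parallel_bitmap = pbminfo;
}

void
ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, shm_toc *toc)
{
	ParallelBitmapInfo *pbminfo;
	Snapshot	snapshot;

	pbminfo = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id);
	node->parallel_bitmap = pbminfo;

	/* adopt the snapshot the leader serialized into phs_snapshot_data */
	snapshot = RestoreSnapshot(pbminfo->phs_snapshot_data);
	heap_bm_update_snapshot(node->ss.ss_currentScanDesc, snapshot);
}

Keeping the serialized snapshot inside the same chunk as ParallelBitmapInfo avoids a second shm_toc entry per scan node.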
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index bb1f56a..78f1ac8 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -26,6 +26,8 @@ #include "utils/sortsupport.h" #include "utils/tuplestore.h" #include "utils/tuplesort.h" +#include "nodes/tidbitmap.h" +#include "storage/condition_variable.h" /* ---------------- @@ -1393,6 +1395,72 @@ typedef struct IndexOnlyScanState long ioss_HeapFetches; } IndexOnlyScanState; +/* + * Stores the current position of the shared + * iterator used for a parallel bitmap heap scan. + */ +typedef struct +{ + slock_t mutex; /* mutual exclusion for the three fields below */ + int spageptr; /* next spages index */ + int schunkptr; /* next schunks index */ + int schunkbit; /* next bit to check in current schunk */ +} ParallelIterator; + +/* ---------------- + * PBMState information : Current status of the TIDBitmap creation during + * parallel bitmap heap scan. + * + * PBM_INITIAL TIDBitmap creation has not yet started; the + * first worker to see this state becomes the + * leader, sets the state to PBM_INPROGRESS and + * creates the TIDBitmap. + * PBM_INPROGRESS TIDBitmap creation is already in progress, + * so workers must sleep until the leader sets the + * state to PBM_FINISHED and wakes them up. + * PBM_FINISHED TIDBitmap creation is done, so all workers + * can now proceed to iterate over the TIDBitmap. + * ---------------- + */ +typedef enum +{ + PBM_INITIAL, + PBM_INPROGRESS, + PBM_FINISHED +} PBMState; + +/* ---------------- + * ParallelBitmapInfo information + * relid OID of relation to scan + * tbmiterator main iterator + * prefetch_mutex mutual exclusion for prefetch members + * (prefetch_iterator, prefetch_pages and + * prefetch_target) + * prefetch_iterator iterator for scanning ahead of current pages + * prefetch_pages # pages prefetch iterator is ahead of current + * prefetch_target current target prefetch distance + * state_mutex mutual exclusion for state field + * cv condition variable to wait for state change + * state current state of the TIDBitmap + * ptbm_offset offset in bytes of ParallelTIDBitmap.
+ * phs_snapshot_data snapshot data shared with workers + * ---------------- + */ +typedef struct ParallelBitmapInfo +{ + Oid relid; + ParallelIterator tbmiterator; + ParallelIterator prefetch_iterator; + slock_t prefetch_mutex; + int prefetch_pages; + int prefetch_target; + slock_t state_mutex; + ConditionVariable cv; + PBMState state; + Size ptbm_offset; + char phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER]; +} ParallelBitmapInfo; + /* ---------------- * BitmapIndexScanState information * @@ -1407,6 +1475,8 @@ typedef struct IndexOnlyScanState * RuntimeContext expr context for evaling runtime Skeys * RelationDesc index relation descriptor * ScanDesc index scan descriptor + * Parallel true if under a parallel bitmap heap scan, so a + * shared TIDBitmap must be created * ---------------- */ typedef struct BitmapIndexScanState @@ -1423,6 +1493,7 @@ typedef struct BitmapIndexScanState ExprContext *biss_RuntimeContext; Relation biss_RelationDesc; IndexScanDesc biss_ScanDesc; + bool biss_Parallel; } BitmapIndexScanState; /* ---------------- @@ -1438,7 +1509,11 @@ typedef struct BitmapIndexScanState * prefetch_pages # pages prefetch iterator is ahead of current * prefetch_target current target prefetch distance * prefetch_maximum maximum value for prefetch_target - * ---------------- + * pscan_len size of shared memory for parallel bitmap scan + * is_leader set to true if this worker becomes the + * leader of the parallel bitmap heap scan + * parallel_bitmap shared memory for parallel bitmap scan + * ---------------- */ typedef struct BitmapHeapScanState { @@ -1453,6 +1528,9 @@ typedef struct BitmapHeapScanState int prefetch_pages; int prefetch_target; int prefetch_maximum; + Size pscan_len; + bool is_leader; + ParallelBitmapInfo *parallel_bitmap; } BitmapHeapScanState; /* ---------------- diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index 9303299..e215256 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -31,9 +31,6 @@ */ typedef struct TIDBitmap TIDBitmap; -/* Likewise, TBMIterator is private */ -typedef struct TBMIterator TBMIterator; - /* Result structure for tbm_iterate */ typedef struct { @@ -44,6 +41,26 @@ typedef struct OffsetNumber offsets[FLEXIBLE_ARRAY_MEMBER]; } TBMIterateResult; +/* + * When iterating over a bitmap in sorted order, a TBMIterator is used to + * track our progress. There can be several iterators scanning the same + * bitmap concurrently. Note that the bitmap becomes read-only as soon as + * any iterator is created. + */ +typedef struct +{ + TIDBitmap *tbm; /* TIDBitmap we're iterating over */ + int spageptr; /* next spages index */ + int schunkptr; /* next schunks index */ + int schunkbit; /* next bit to check in current schunk */ + TBMIterateResult output; /* MUST BE LAST (because variable-size) */ +} TBMIterator; + +/* + * Holds shared members of TIDBitmap for parallel bitmap scan.
+ */ +typedef struct ParallelTIDBitmap ParallelTIDBitmap; + /* function prototypes in nodes/tidbitmap.c */ extern TIDBitmap *tbm_create(long maxbytes); @@ -58,9 +75,14 @@ extern void tbm_union(TIDBitmap *a, const TIDBitmap *b); extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b); extern bool tbm_is_empty(const TIDBitmap *tbm); - -extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm); +extern TBMIterator *tbm_begin_iterate(TIDBitmap *tbm, + ParallelTIDBitmap *parallel_info); extern TBMIterateResult *tbm_iterate(TBMIterator *iterator); extern void tbm_end_iterate(TBMIterator *iterator); +extern void tbm_attach_to_pagetable(TIDBitmap *tbm, ParallelTIDBitmap *stbm); +extern void tbm_update_shared_members(TIDBitmap *tbm, + ParallelTIDBitmap *parallel_tbm); +extern void tbm_set_parallel(TIDBitmap *tbm, void *area); +extern Size tbm_estimate_parallel_tidbitmap(Size size); #endif /* TIDBITMAP_H */ diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index 2a4df2f..e1baacc 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -183,6 +183,8 @@ extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel, double cte_rows); extern void set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel); extern PathTarget *set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target); +extern double compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, + Path *bitmapqual, int loop_count, Cost *cost, double *tuple); /* * prototypes for clausesel.c diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 71d9154..397d266 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -52,7 +52,8 @@ extern BitmapHeapPath *create_bitmap_heap_path(PlannerInfo *root, RelOptInfo *rel, Path *bitmapqual, Relids required_outer, - double loop_count); + double loop_count, + int parallel_degree); extern BitmapAndPath *create_bitmap_and_path(PlannerInfo *root, RelOptInfo *rel, List *bitmapquals); diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 44abe83..3264f44 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -53,6 +53,8 @@ extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed, List *initial_rels); extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel); +extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel, + Path *bitmapqual); #ifdef OPTIMIZER_DEBUG extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 27be549..7f7992a 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -784,6 +784,7 @@ typedef enum WAIT_EVENT_MQ_RECEIVE, WAIT_EVENT_MQ_SEND, WAIT_EVENT_PARALLEL_FINISH, + WAIT_EVENT_PARALLEL_BITMAP_SCAN, WAIT_EVENT_SAFE_SNAPSHOT, WAIT_EVENT_SYNC_REP } WaitEventIPC;
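The PBMState machine documented in execnodes.h above (PBM_INITIAL -> PBM_INPROGRESS -> PBM_FINISHED, protected by state_mutex and signalled through cv with the new WAIT_EVENT_PARALLEL_BITMAP_SCAN wait event) boils down to a small leader-election handshake. A minimal sketch, with hypothetical helper names and assuming the leader builds the shared TIDBitmap between the two calls:

#include "postgres.h"
#include "nodes/execnodes.h"
#include "pgstat.h"
#include "storage/condition_variable.h"
#include "storage/spin.h"

/* Returns true in exactly one participant, which must then build the bitmap. */
static bool
pbm_become_leader(ParallelBitmapInfo *pbminfo)
{
	bool		leader = false;

	for (;;)
	{
		bool		finished = false;

		SpinLockAcquire(&pbminfo->state_mutex);
		if (pbminfo->state == PBM_INITIAL)
		{
			/* first worker here claims leadership */
			pbminfo->state = PBM_INPROGRESS;
			leader = true;
		}
		else if (pbminfo->state == PBM_FINISHED)
			finished = true;
		SpinLockRelease(&pbminfo->state_mutex);

		if (leader || finished)
			break;

		/* someone else is building the TIDBitmap; sleep until woken */
		ConditionVariableSleep(&pbminfo->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
	}
	ConditionVariableCancelSleep();

	return leader;
}

/* Called by the leader once the shared TIDBitmap is ready. */
static void
pbm_mark_finished(ParallelBitmapInfo *pbminfo)
{
	SpinLockAcquire(&pbminfo->state_mutex);
	pbminfo->state = PBM_FINISHED;
	SpinLockRelease(&pbminfo->state_mutex);

	ConditionVariableBroadcast(&pbminfo->cv);
}

Because cv is broadcast only after the state has been set to PBM_FINISHED under state_mutex, a worker that re-checks the state after waking cannot miss the transition.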
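Exposing TBMIterator's position fields (spageptr, schunkptr, schunkbit) in tidbitmap.h is what lets several workers share one logical iterator: each backend keeps a private TBMIterator and synchronizes only those three fields through the ParallelIterator in shared memory. A minimal sketch of one advance, using a hypothetical helper name and performing the whole step under the spinlock, as the ParallelIterator comment implies:

#include "postgres.h"
#include "nodes/execnodes.h"
#include "nodes/tidbitmap.h"
#include "storage/spin.h"

/*
 * Hypothetical helper: advance the shared position by one step.  The shared
 * ParallelIterator stores only the position; the page and chunk data are
 * reached through the backend-private TBMIterator.
 */
static TBMIterateResult *
shared_tbm_iterate(TBMIterator *iterator, ParallelIterator *piterator)
{
	TBMIterateResult *output;

	SpinLockAcquire(&piterator->mutex);

	/* load the shared position into the private iterator ... */
	iterator->spageptr = piterator->spageptr;
	iterator->schunkptr = piterator->schunkptr;
	iterator->schunkbit = piterator->schunkbit;

	/* ... advance it over the (read-only) bitmap ... */
	output = tbm_iterate(iterator);

	/* ... and publish the new position for the other participants */
	piterator->spageptr = iterator->spageptr;
	piterator->schunkptr = iterator->schunkptr;
	piterator->schunkbit = iterator->schunkbit;

	SpinLockRelease(&piterator->mutex);

	return output;
}

Holding a spinlock across tbm_iterate() is heavier than spinlocks are normally asked to be; it is tolerable only because each step walks a few in-memory arrays, which is also why the shared state is kept this small.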
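For the single-scan branch of compute_bitmap_pages() added to costsize.c above, the Mackert-Lohman approximation is easiest to see with concrete (purely illustrative) numbers: with T = 1000 heap pages and tuples_fetched = 600,

    pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched)
                  = (2 * 1000 * 600) / (2 * 1000 + 600)
                  = 1200000 / 2600  ~= 461.5  -> ceil() -> 462

which stays below T, so the final clamp to T (each heap page fetched at most once) does not kick in.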