diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 46d7d06..3cf9417 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -126,6 +126,7 @@ static void subquery_push_qual(Query *subquery,
 static void recurse_push_qual(Node *setOp, Query *topquery,
 				  RangeTblEntry *rte, Index rti, Node *qual);
 static void remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel);
+static int compute_parallel_worker(RelOptInfo *rel, BlockNumber pages);
 
 
 /*
@@ -678,49 +679,7 @@ create_plain_partial_paths(PlannerInfo *root, RelOptInfo *rel)
 {
 	int			parallel_workers;
 
-	/*
-	 * If the user has set the parallel_workers reloption, use that; otherwise
-	 * select a default number of workers.
-	 */
-	if (rel->rel_parallel_workers != -1)
-		parallel_workers = rel->rel_parallel_workers;
-	else
-	{
-		int			parallel_threshold;
-
-		/*
-		 * If this relation is too small to be worth a parallel scan, just
-		 * return without doing anything ... unless it's an inheritance child.
-		 * In that case, we want to generate a parallel path here anyway. It
-		 * might not be worthwhile just for this relation, but when combined
-		 * with all of its inheritance siblings it may well pay off.
-		 */
-		if (rel->pages < (BlockNumber) min_parallel_relation_size &&
-			rel->reloptkind == RELOPT_BASEREL)
-			return;
-
-		/*
-		 * Select the number of workers based on the log of the size of the
-		 * relation. This probably needs to be a good deal more
-		 * sophisticated, but we need something here for now. Note that the
-		 * upper limit of the min_parallel_relation_size GUC is chosen to
-		 * prevent overflow here.
-		 */
-		parallel_workers = 1;
-		parallel_threshold = Max(min_parallel_relation_size, 1);
-		while (rel->pages >= (BlockNumber) (parallel_threshold * 3))
-		{
-			parallel_workers++;
-			parallel_threshold *= 3;
-			if (parallel_threshold > INT_MAX / 3)
-				break;			/* avoid overflow */
-		}
-	}
-
-	/*
-	 * In no case use more than max_parallel_workers_per_gather workers.
-	 */
-	parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather);
+	parallel_workers = compute_parallel_worker(rel, rel->pages);
 
 	/* If any limit was set to zero, the user doesn't want a parallel scan. */
 	if (parallel_workers <= 0)
@@ -2866,6 +2825,59 @@ remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel)
 	}
 }
 
+static int
+compute_parallel_worker(RelOptInfo *rel, BlockNumber pages)
+{
+	int			parallel_workers;
+
+	/*
+	 * If the user has set the parallel_workers reloption, use that; otherwise
+	 * select a default number of workers.
+	 */
+	if (rel->rel_parallel_workers != -1)
+		parallel_workers = rel->rel_parallel_workers;
+	else
+	{
+		int			parallel_threshold;
+
+		/*
+		 * If this relation is too small to be worth a parallel scan, just
+		 * return without doing anything ... unless it's an inheritance child.
+		 * In that case, we want to generate a parallel path here anyway. It
+		 * might not be worthwhile just for this relation, but when combined
+		 * with all of its inheritance siblings it may well pay off.
+		 */
+		if (pages < (BlockNumber) min_parallel_relation_size &&
+			rel->reloptkind == RELOPT_BASEREL)
+			return 0;
+
+		/*
+		 * Select the number of workers based on the log of the size of the
+		 * relation. This probably needs to be a good deal more
+		 * sophisticated, but we need something here for now. Note that the
+		 * upper limit of the min_parallel_relation_size GUC is chosen to
+		 * prevent overflow here.
+		 */
+		parallel_workers = 1;
+		parallel_threshold = Max(min_parallel_relation_size, 1);
+		while (pages >= (BlockNumber) (parallel_threshold * 3))
+		{
+			parallel_workers++;
+			parallel_threshold *= 3;
+			if (parallel_threshold > INT_MAX / 3)
+				break;			/* avoid overflow */
+		}
+	}
+
+	/*
+	 * In no case use more than max_parallel_workers_per_gather workers.
+	 */
+	parallel_workers = Min(parallel_workers, max_parallel_workers_per_gather);
+
+	return parallel_workers;
+}
+
+
 /*****************************************************************************
  *			DEBUG SUPPORT
  *****************************************************************************/
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index a52eb7e..deb973b 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -161,6 +161,7 @@ static Selectivity get_foreign_key_join_selectivity(PlannerInfo *root,
 static void set_rel_width(PlannerInfo *root, RelOptInfo *rel);
 static double relation_byte_size(double tuples, int width);
 static double page_size(double tuples, int width);
+static Cost update_cost_for_parallelism(Path *path, Cost cpu_run_cost);
 
 
 /*
@@ -237,44 +238,7 @@ cost_seqscan(Path *path, PlannerInfo *root,
 
 	/* Adjust costing for parallelism, if used. */
 	if (path->parallel_workers > 0)
-	{
-		double		parallel_divisor = path->parallel_workers;
-		double		leader_contribution;
-
-		/*
-		 * Early experience with parallel query suggests that when there is
-		 * only one worker, the leader often makes a very substantial
-		 * contribution to executing the parallel portion of the plan, but as
-		 * more workers are added, it does less and less, because it's busy
-		 * reading tuples from the workers and doing whatever non-parallel
-		 * post-processing is needed. By the time we reach 4 workers, the
-		 * leader no longer makes a meaningful contribution. Thus, for now,
-		 * estimate that the leader spends 30% of its time servicing each
-		 * worker, and the remainder executing the parallel plan.
-		 */
-		leader_contribution = 1.0 - (0.3 * path->parallel_workers);
-		if (leader_contribution > 0)
-			parallel_divisor += leader_contribution;
-
-		/*
-		 * In the case of a parallel plan, the row count needs to represent
-		 * the number of tuples processed per worker. Otherwise, higher-level
-		 * plan nodes that appear below the gather will be costed incorrectly,
-		 * because they'll anticipate receiving more rows than any given copy
-		 * will actually get.
-		 */
-		path->rows = clamp_row_est(path->rows / parallel_divisor);
-
-		/* The CPU cost is divided among all the workers. */
-		cpu_run_cost /= parallel_divisor;
-
-		/*
-		 * It may be possible to amortize some of the I/O cost, but probably
-		 * not very much, because most operating systems already do aggressive
-		 * prefetching. For now, we assume that the disk run cost can't be
-		 * amortized at all.
-		 */
-	}
+		cpu_run_cost = update_cost_for_parallelism(path, cpu_run_cost);
 
 	path->startup_cost = startup_cost;
 	path->total_cost = startup_cost + cpu_run_cost + disk_run_cost;
@@ -831,7 +795,6 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
 	Cost		startup_cost = 0;
 	Cost		run_cost = 0;
 	Cost		indexTotalCost;
-	Selectivity indexSelectivity;
 	QualCost	qpqual_cost;
 	Cost		cpu_per_tuple;
 	Cost		cost_per_page;
@@ -855,13 +818,12 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
 	if (!enable_bitmapscan)
 		startup_cost += disable_cost;
 
-	/*
-	 * Fetch total cost of obtaining the bitmap, as well as its total
-	 * selectivity.
-	 */
-	cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
+	pages_fetched = compute_bitmap_pages(root, baserel, bitmapqual,
+										 loop_count, &indexTotalCost,
+										 &tuples_fetched);
 
 	startup_cost += indexTotalCost;
+	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
 
 	/* Fetch estimated page costs for tablespace containing table. */
 	get_tablespace_page_costs(baserel->reltablespace,
@@ -869,41 +831,6 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
 							  &spc_seq_page_cost);
 
 	/*
-	 * Estimate number of main-table pages fetched.
-	 */
-	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
-
-	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
-
-	if (loop_count > 1)
-	{
-		/*
-		 * For repeated bitmap scans, scale up the number of tuples fetched in
-		 * the Mackert and Lohman formula by the number of scans, so that we
-		 * estimate the number of pages fetched by all the scans. Then
-		 * pro-rate for one scan.
-		 */
-		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
-											baserel->pages,
-											get_indexpath_pages(bitmapqual),
-											root);
-		pages_fetched /= loop_count;
-	}
-	else
-	{
-		/*
-		 * For a single scan, the number of heap pages that need to be fetched
-		 * is the same as the Mackert and Lohman formula for the case T <= b
-		 * (ie, no re-reads needed).
-		 */
-		pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
-	}
-	if (pages_fetched >= T)
-		pages_fetched = T;
-	else
-		pages_fetched = ceil(pages_fetched);
-
-	/*
 	 * For small numbers of pages we should charge spc_random_page_cost
 	 * apiece, while if nearly all the table's pages are being read, it's more
 	 * appropriate to charge spc_seq_page_cost apiece. The effect is
@@ -944,6 +871,56 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
 }
 
 /*
+ * update_cost_for_parallelism
+ *
+ * Adjust the CPU cost based on the number of parallel workers, and also
+ * update the number of rows processed by each worker.
+ */
+static Cost
+update_cost_for_parallelism(Path *path, Cost cpu_run_cost)
+{
+	double		parallel_divisor = path->parallel_workers;
+	double		leader_contribution;
+	Cost		cpu_cost = cpu_run_cost;
+
+	/*
+	 * Early experience with parallel query suggests that when there is only
+	 * one worker, the leader often makes a very substantial contribution to
+	 * executing the parallel portion of the plan, but as more workers are
+	 * added, it does less and less, because it's busy reading tuples from the
+	 * workers and doing whatever non-parallel post-processing is needed. By
+	 * the time we reach 4 workers, the leader no longer makes a meaningful
+	 * contribution. Thus, for now, estimate that the leader spends 30% of
+	 * its time servicing each worker, and the remainder executing the
+	 * parallel plan.
+	 */
+	leader_contribution = 1.0 - (0.3 * path->parallel_workers);
+	if (leader_contribution > 0)
+		parallel_divisor += leader_contribution;
+
+	/*
+	 * In the case of a parallel plan, the row count needs to represent the
+	 * number of tuples processed per worker. Otherwise, higher-level plan
+	 * nodes that appear below the gather will be costed incorrectly, because
+	 * they'll anticipate receiving more rows than any given copy will
+	 * actually get.
+	 */
+	path->rows = clamp_row_est(path->rows / parallel_divisor);
+
+	/* The CPU cost is divided among all the workers. */
+	cpu_cost /= parallel_divisor;
+
+	/*
+	 * It may be possible to amortize some of the I/O cost, but probably not
+	 * very much, because most operating systems already do aggressive
+	 * prefetching. For now, we assume that the disk run cost can't be
+	 * amortized at all.
+	 */
+
+	return cpu_cost;
+}
+
+/*
  * cost_bitmap_tree_node
  *		Extract cost and selectivity from a bitmap tree node (index/and/or)
  */
@@ -4798,3 +4775,69 @@ page_size(double tuples, int width)
 {
 	return ceil(relation_byte_size(tuples, width) / BLCKSZ);
 }
+
+/*
+ * compute_bitmap_pages
+ *
+ * Compute the number of heap pages fetched in a bitmap heap scan.
+ */
+double
+compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel, Path *bitmapqual,
+					 int loop_count, Cost *cost, double *tuple)
+{
+	Cost		indexTotalCost;
+	Selectivity indexSelectivity;
+	double		T;
+	double		pages_fetched;
+	double		tuples_fetched;
+
+	/*
+	 * Fetch total cost of obtaining the bitmap, as well as its total
+	 * selectivity.
+	 */
+	cost_bitmap_tree_node(bitmapqual, &indexTotalCost, &indexSelectivity);
+
+	/*
+	 * Estimate number of main-table pages fetched.
+	 */
+	tuples_fetched = clamp_row_est(indexSelectivity * baserel->tuples);
+
+	T = (baserel->pages > 1) ? (double) baserel->pages : 1.0;
+
+	if (loop_count > 1)
+	{
+		/*
+		 * For repeated bitmap scans, scale up the number of tuples fetched in
+		 * the Mackert and Lohman formula by the number of scans, so that we
+		 * estimate the number of pages fetched by all the scans. Then
+		 * pro-rate for one scan.
+		 */
+		pages_fetched = index_pages_fetched(tuples_fetched * loop_count,
+											baserel->pages,
+											get_indexpath_pages(bitmapqual),
+											root);
+		pages_fetched /= loop_count;
+	}
+	else
+	{
+		/*
+		 * For a single scan, the number of heap pages that need to be fetched
+		 * is the same as the Mackert and Lohman formula for the case T <= b
+		 * (ie, no re-reads needed).
+		 */
+		pages_fetched =
+			(2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
+	}
+
+	if (pages_fetched >= T)
+		pages_fetched = T;
+	else
+		pages_fetched = ceil(pages_fetched);
+
+	if (cost)
+		*cost = indexTotalCost;
+	if (tuple)
+		*tuple = tuples_fetched;
+
+	return pages_fetched;
+}
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 39376ec..0e68264 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -183,6 +183,8 @@ extern void set_cte_size_estimates(PlannerInfo *root, RelOptInfo *rel,
 					   double cte_rows);
 extern void set_foreign_size_estimates(PlannerInfo *root, RelOptInfo *rel);
 extern PathTarget *set_pathtarget_cost_width(PlannerInfo *root, PathTarget *target);
+extern double compute_bitmap_pages(PlannerInfo *root, RelOptInfo *baserel,
+					Path *bitmapqual, int loop_count, Cost *cost, double *tuple);
 
 /*
  * prototypes for clausesel.c
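
Illustration (not part of the patch): compute_parallel_worker() keeps the existing heuristic of scaling the worker count with the logarithm of the relation size: zero workers below min_parallel_relation_size pages (unless the rel is an inheritance child), then one worker plus one more each time the size triples, capped by max_parallel_workers_per_gather. The stand-alone sketch below reproduces just that arithmetic outside the planner; the hard-coded GUC values are assumed defaults used only for this example.

#include <stdio.h>
#include <limits.h>

/* Assumed GUC settings, for the illustration only. */
#define MIN_PARALLEL_RELATION_SIZE		1024	/* pages, i.e. 8MB at 8kB blocks */
#define MAX_PARALLEL_WORKERS_PER_GATHER	2

/* Mirror of the heuristic in compute_parallel_worker(), for non-child rels. */
static int
sketch_parallel_workers(unsigned long pages)
{
	int			parallel_workers;
	int			parallel_threshold;

	if (pages < MIN_PARALLEL_RELATION_SIZE)
		return 0;				/* too small to be worth a parallel scan */

	parallel_workers = 1;
	parallel_threshold = MIN_PARALLEL_RELATION_SIZE;
	while (pages >= (unsigned long) (parallel_threshold * 3))
	{
		parallel_workers++;
		parallel_threshold *= 3;
		if (parallel_threshold > INT_MAX / 3)
			break;				/* avoid overflow */
	}

	if (parallel_workers > MAX_PARALLEL_WORKERS_PER_GATHER)
		parallel_workers = MAX_PARALLEL_WORKERS_PER_GATHER;
	return parallel_workers;
}

int
main(void)
{
	unsigned long sizes[] = {512, 1024, 3072, 9216, 27648};
	int			i;

	for (i = 0; i < 5; i++)
		printf("%lu pages -> %d workers\n",
			   sizes[i], sketch_parallel_workers(sizes[i]));
	return 0;
}

With these assumed settings the output is 0, 1, 2, 2 and 2 workers; raising max_parallel_workers_per_gather lets the logarithmic growth show through for the larger sizes.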
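
Illustration (not part of the patch): update_cost_for_parallelism() is a straight code motion of the divisor logic out of cost_seqscan(): each of N workers counts as one, and the leader adds (1 - 0.3 * N) of a worker as long as that stays positive, i.e. through 3 workers. A minimal sketch of the resulting divisor and its effect on the per-worker CPU cost and row count, with arbitrary example numbers:

#include <stdio.h>

/* Divisor used to spread CPU cost and row count across workers. */
static double
sketch_parallel_divisor(int parallel_workers)
{
	double		parallel_divisor = parallel_workers;
	double		leader_contribution = 1.0 - (0.3 * parallel_workers);

	if (leader_contribution > 0)
		parallel_divisor += leader_contribution;
	return parallel_divisor;
}

int
main(void)
{
	double		cpu_run_cost = 1000.0;	/* arbitrary example cost */
	double		rows = 100000.0;		/* arbitrary example row count */
	int			n;

	for (n = 1; n <= 4; n++)
	{
		double		d = sketch_parallel_divisor(n);

		printf("%d workers: divisor %.1f, CPU cost %.1f, rows per worker %.0f\n",
			   n, d, cpu_run_cost / d, rows / d);
	}
	return 0;
}

The divisor comes out to 1.7, 2.4, 3.1 and 4.0 for one through four workers, which matches the moved comment: the leader's contribution fades out as workers are added.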
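
Illustration (not part of the patch): compute_bitmap_pages() centralises the heap-page estimate that cost_bitmap_heap_scan() previously computed inline. For the single-scan case it is the Mackert-Lohman formula for T <= b (no re-reads), pages = 2*T*t / (2*T + t), clamped to the relation size T and rounded up. A stand-alone sketch of that arithmetic follows; the repeated-scan branch, which goes through index_pages_fetched(), is omitted here.

#include <stdio.h>
#include <math.h>

/*
 * Single-scan branch of the estimate in compute_bitmap_pages(): the
 * Mackert-Lohman approximation for the no-reread case, clamped to the
 * number of pages in the relation and rounded up.
 */
static double
sketch_bitmap_heap_pages(double T, double tuples_fetched)
{
	double		pages_fetched;

	if (T < 1.0)
		T = 1.0;
	pages_fetched = (2.0 * T * tuples_fetched) / (2.0 * T + tuples_fetched);
	if (pages_fetched >= T)
		return T;
	return ceil(pages_fetched);
}

int
main(void)
{
	/* A 10000-page table with increasing numbers of tuples fetched. */
	double		tuples[] = {100.0, 10000.0, 1000000.0};
	int			i;

	for (i = 0; i < 3; i++)
		printf("%.0f tuples -> %.0f pages\n",
			   tuples[i], sketch_bitmap_heap_pages(10000.0, tuples[i]));
	return 0;
}

Fetching 100 tuples is expected to touch about 100 pages, 10000 tuples about 6667 pages, and beyond that the estimate saturates at the 10000 pages in the table (compile with -lm for ceil()).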