From 96046239014de8a7dec62e2f60b5210deb1bd32a Mon Sep 17 00:00:00 2001 From: David Fetter Date: Thu, 31 Dec 2020 16:42:07 -0800 Subject: [PATCH v10] first cut To: hackers MIME-Version: 1.0 Content-Type: multipart/mixed; boundary="------------2.29.2" This is a multi-part message in MIME format. --------------2.29.2 Content-Type: text/plain; charset=utf-8; format=fixed Content-Transfer-Encoding: 8bit create mode 100644 src/include/executor/nodeTidrangescan.h create mode 100644 src/backend/executor/nodeTidrangescan.c create mode 100644 src/test/regress/expected/tidrangescan.out create mode 100644 src/test/regress/sql/tidrangescan.sql --------------2.29.2 Content-Type: text/x-patch; name="v10-0001-first-cut.patch" Content-Transfer-Encoding: 8bit Content-Disposition: attachment; filename="v10-0001-first-cut.patch" diff --git src/include/access/tableam.h src/include/access/tableam.h index 387eb34a61..5776f8ba6e 100644 --- src/include/access/tableam.h +++ src/include/access/tableam.h @@ -218,6 +218,15 @@ typedef struct TableAmRoutine bool set_params, bool allow_strat, bool allow_sync, bool allow_pagemode); + /* + * Set the range of a scan. + * + * Optional callback: A table AM can implement this to enable TID range + * scans. + */ + void (*scan_setlimits) (TableScanDesc scan, + BlockNumber startBlk, BlockNumber numBlks); + /* * Return next tuple from `scan`, store in slot. */ @@ -875,6 +884,16 @@ table_rescan(TableScanDesc scan, scan->rs_rd->rd_tableam->scan_rescan(scan, key, false, false, false, false); } +/* + * Set the range of a scan. + */ +static inline void +table_scan_setlimits(TableScanDesc scan, + BlockNumber startBlk, BlockNumber numBlks) +{ + scan->rs_rd->rd_tableam->scan_setlimits(scan, startBlk, numBlks); +} + /* * Restart a relation scan after changing params. 
* diff --git src/include/catalog/pg_operator.dat src/include/catalog/pg_operator.dat index 9c6bf6c9d1..bb7193b9e7 100644 --- src/include/catalog/pg_operator.dat +++ src/include/catalog/pg_operator.dat @@ -237,15 +237,15 @@ oprname => '<', oprleft => 'tid', oprright => 'tid', oprresult => 'bool', oprcom => '>(tid,tid)', oprnegate => '>=(tid,tid)', oprcode => 'tidlt', oprrest => 'scalarltsel', oprjoin => 'scalarltjoinsel' }, -{ oid => '2800', descr => 'greater than', +{ oid => '2800', oid_symbol => 'TIDGreaterOperator', descr => 'greater than', oprname => '>', oprleft => 'tid', oprright => 'tid', oprresult => 'bool', oprcom => '<(tid,tid)', oprnegate => '<=(tid,tid)', oprcode => 'tidgt', oprrest => 'scalargtsel', oprjoin => 'scalargtjoinsel' }, -{ oid => '2801', descr => 'less than or equal', +{ oid => '2801', oid_symbol => 'TIDLessEqOperator', descr => 'less than or equal', oprname => '<=', oprleft => 'tid', oprright => 'tid', oprresult => 'bool', oprcom => '>=(tid,tid)', oprnegate => '>(tid,tid)', oprcode => 'tidle', oprrest => 'scalarlesel', oprjoin => 'scalarlejoinsel' }, -{ oid => '2802', descr => 'greater than or equal', +{ oid => '2802', oid_symbol => 'TIDGreaterEqOperator', descr => 'greater than or equal', oprname => '>=', oprleft => 'tid', oprright => 'tid', oprresult => 'bool', oprcom => '<=(tid,tid)', oprnegate => '<(tid,tid)', oprcode => 'tidge', oprrest => 'scalargesel', oprjoin => 'scalargejoinsel' }, diff --git src/include/executor/nodeTidrangescan.h src/include/executor/nodeTidrangescan.h new file mode 100644 index 0000000000..f0bbcc6a04 --- /dev/null +++ src/include/executor/nodeTidrangescan.h @@ -0,0 +1,24 @@ +/*------------------------------------------------------------------------- + * + * nodeTidrangescan.h + * + * + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/executor/nodeTidrangescan.h + * + 
*------------------------------------------------------------------------- + */ +#ifndef NODETIDRANGESCAN_H +#define NODETIDRANGESCAN_H + +#include "nodes/execnodes.h" + +extern TidRangeScanState *ExecInitTidRangeScan(TidRangeScan *node, + EState *estate, int eflags); +extern void ExecEndTidRangeScan(TidRangeScanState *node); +extern void ExecReScanTidRangeScan(TidRangeScanState *node); + +#endif /* NODETIDRANGESCAN_H */ diff --git src/include/nodes/execnodes.h src/include/nodes/execnodes.h index 61ba4c3666..ae58ea9eb6 100644 --- src/include/nodes/execnodes.h +++ src/include/nodes/execnodes.h @@ -1611,6 +1611,29 @@ typedef struct TidScanState HeapTupleData tss_htup; } TidScanState; +/* ---------------- + * TidRangeScanState information + * + * trss_tidexprs list of TidOpExpr structs (see nodeTidrangescan.c) + * trss_startBlock first block to scan, or InvalidBlockNumber + * when the range is not set + * trss_endBlock last block to scan (inclusive) + * trss_startOffset first offset in first block to scan (inclusive) + * trss_endOffset last offset in last block to scan (inclusive) + * trss_inScan is a scan currently in progress? 
+ * ---------------- + */ +typedef struct TidRangeScanState +{ + ScanState ss; /* its first field is NodeTag */ + List *trss_tidexprs; + BlockNumber trss_startBlock; + BlockNumber trss_endBlock; + OffsetNumber trss_startOffset; + OffsetNumber trss_endOffset; + bool trss_inScan; +} TidRangeScanState; + /* ---------------- * SubqueryScanState information * diff --git src/include/nodes/nodes.h src/include/nodes/nodes.h index 3684f87a88..46d8cddfee 100644 --- src/include/nodes/nodes.h +++ src/include/nodes/nodes.h @@ -59,6 +59,7 @@ typedef enum NodeTag T_BitmapIndexScan, T_BitmapHeapScan, T_TidScan, + T_TidRangeScan, T_SubqueryScan, T_FunctionScan, T_ValuesScan, @@ -116,6 +117,7 @@ typedef enum NodeTag T_BitmapIndexScanState, T_BitmapHeapScanState, T_TidScanState, + T_TidRangeScanState, T_SubqueryScanState, T_FunctionScanState, T_TableFuncScanState, @@ -229,6 +231,7 @@ typedef enum NodeTag T_BitmapAndPath, T_BitmapOrPath, T_TidPath, + T_TidRangePath, T_SubqueryScanPath, T_ForeignPath, T_CustomPath, diff --git src/include/nodes/pathnodes.h src/include/nodes/pathnodes.h index b4059895de..79c5f77c82 100644 --- src/include/nodes/pathnodes.h +++ src/include/nodes/pathnodes.h @@ -732,6 +732,7 @@ typedef struct RelOptInfo List *joininfo; /* RestrictInfo structures for join clauses * involving this rel */ bool has_eclass_joins; /* T means joininfo is incomplete */ + bool has_scan_setlimits; /* Rel's table AM has scan_setlimits */ /* used by partitionwise joins: */ bool consider_partitionwise_join; /* consider partitionwise join @@ -1323,6 +1324,18 @@ typedef struct TidPath List *tidquals; /* qual(s) involving CTID = something */ } TidPath; +/* + * TidRangePath represents a scan by a contiguous range of TIDs + * + * tidrangequals is an implicitly AND'ed list of qual expressions of the form + * "CTID relop pseudoconstant", where relop is one of >,>=,<,<=. 
+ */ +typedef struct TidRangePath +{ + Path path; + List *tidrangequals; +} TidRangePath; + /* * SubqueryScanPath represents a scan of an unflattened subquery-in-FROM * diff --git src/include/nodes/plannodes.h src/include/nodes/plannodes.h index 7e6b10f86b..011fad0ac7 100644 --- src/include/nodes/plannodes.h +++ src/include/nodes/plannodes.h @@ -485,6 +485,19 @@ typedef struct TidScan List *tidquals; /* qual(s) involving CTID = something */ } TidScan; +/* ---------------- + * tid range scan node + * + * tidrangequals is an implicitly AND'ed list of qual expressions of the form + * "CTID relop pseudoconstant", where relop is one of >,>=,<,<=. + * ---------------- + */ +typedef struct TidRangeScan +{ + Scan scan; + List *tidrangequals; /* qual(s) involving CTID op something */ +} TidRangeScan; + /* ---------------- * subquery scan node * diff --git src/include/optimizer/cost.h src/include/optimizer/cost.h index 8e621d2f76..be980ea6dc 100644 --- src/include/optimizer/cost.h +++ src/include/optimizer/cost.h @@ -83,6 +83,9 @@ extern void cost_bitmap_or_node(BitmapOrPath *path, PlannerInfo *root); extern void cost_bitmap_tree_node(Path *path, Cost *cost, Selectivity *selec); extern void cost_tidscan(Path *path, PlannerInfo *root, RelOptInfo *baserel, List *tidquals, ParamPathInfo *param_info); +extern void cost_tidrangescan(Path *path, PlannerInfo *root, + RelOptInfo *baserel, List *tidquals, + ParamPathInfo *param_info); extern void cost_subqueryscan(SubqueryScanPath *path, PlannerInfo *root, RelOptInfo *baserel, ParamPathInfo *param_info); extern void cost_functionscan(Path *path, PlannerInfo *root, diff --git src/include/optimizer/pathnode.h src/include/optimizer/pathnode.h index 3bd7072ae8..0105f1fac4 100644 --- src/include/optimizer/pathnode.h +++ src/include/optimizer/pathnode.h @@ -63,6 +63,10 @@ extern BitmapOrPath *create_bitmap_or_path(PlannerInfo *root, List *bitmapquals); extern TidPath *create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals, 
Relids required_outer); +extern TidRangePath *create_tidrangescan_path(PlannerInfo *root, + RelOptInfo *rel, + List *tidrangequals, + Relids required_outer); extern AppendPath *create_append_path(PlannerInfo *root, RelOptInfo *rel, List *subpaths, List *partial_subpaths, List *pathkeys, Relids required_outer, diff --git src/backend/access/heap/heapam_handler.c src/backend/access/heap/heapam_handler.c index 3eea215b85..df9e14234f 100644 --- src/backend/access/heap/heapam_handler.c +++ src/backend/access/heap/heapam_handler.c @@ -2539,6 +2539,7 @@ static const TableAmRoutine heapam_methods = { .scan_begin = heap_beginscan, .scan_end = heap_endscan, .scan_rescan = heap_rescan, + .scan_setlimits = heap_setscanlimits, .scan_getnextslot = heap_getnextslot, .parallelscan_estimate = table_block_parallelscan_estimate, diff --git src/backend/commands/explain.c src/backend/commands/explain.c index d797b5f53e..f4930ca8a5 100644 --- src/backend/commands/explain.c +++ src/backend/commands/explain.c @@ -1057,6 +1057,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) case T_IndexOnlyScan: case T_BitmapHeapScan: case T_TidScan: + case T_TidRangeScan: case T_SubqueryScan: case T_FunctionScan: case T_TableFuncScan: @@ -1223,6 +1224,9 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_TidScan: pname = sname = "Tid Scan"; break; + case T_TidRangeScan: + pname = sname = "Tid Range Scan"; + break; case T_SubqueryScan: pname = sname = "Subquery Scan"; break; @@ -1417,6 +1421,7 @@ ExplainNode(PlanState *planstate, List *ancestors, case T_SampleScan: case T_BitmapHeapScan: case T_TidScan: + case T_TidRangeScan: case T_SubqueryScan: case T_FunctionScan: case T_TableFuncScan: @@ -1871,6 +1876,23 @@ ExplainNode(PlanState *planstate, List *ancestors, planstate, es); } break; + case T_TidRangeScan: + { + /* + * The tidrangequals list has AND semantics, so be sure to + * show it as an AND condition. 
+ */ + List *tidquals = ((TidRangeScan *) plan)->tidrangequals; + + if (list_length(tidquals) > 1) + tidquals = list_make1(make_andclause(tidquals)); + show_scan_qual(tidquals, "TID Cond", planstate, ancestors, es); + show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); + if (plan->qual) + show_instrumentation_count("Rows Removed by Filter", 1, + planstate, es); + } + break; case T_ForeignScan: show_scan_qual(plan->qual, "Filter", planstate, ancestors, es); if (plan->qual) @@ -3558,6 +3580,7 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) case T_IndexOnlyScan: case T_BitmapHeapScan: case T_TidScan: + case T_TidRangeScan: case T_ForeignScan: case T_CustomScan: case T_ModifyTable: diff --git src/backend/executor/Makefile src/backend/executor/Makefile index f990c6473a..74ac59faa1 100644 --- src/backend/executor/Makefile +++ src/backend/executor/Makefile @@ -67,6 +67,7 @@ OBJS = \ nodeSubplan.o \ nodeSubqueryscan.o \ nodeTableFuncscan.o \ + nodeTidrangescan.o \ nodeTidscan.o \ nodeUnique.o \ nodeValuesscan.o \ diff --git src/backend/executor/execAmi.c src/backend/executor/execAmi.c index 0c10f1d35c..5de60a36ac 100644 --- src/backend/executor/execAmi.c +++ src/backend/executor/execAmi.c @@ -51,6 +51,7 @@ #include "executor/nodeSubplan.h" #include "executor/nodeSubqueryscan.h" #include "executor/nodeTableFuncscan.h" +#include "executor/nodeTidrangescan.h" #include "executor/nodeTidscan.h" #include "executor/nodeUnique.h" #include "executor/nodeValuesscan.h" @@ -197,6 +198,10 @@ ExecReScan(PlanState *node) ExecReScanTidScan((TidScanState *) node); break; + case T_TidRangeScanState: + ExecReScanTidRangeScan((TidRangeScanState *) node); + break; + case T_SubqueryScanState: ExecReScanSubqueryScan((SubqueryScanState *) node); break; diff --git src/backend/executor/execProcnode.c src/backend/executor/execProcnode.c index 01b7b926bf..a0576ac41a 100644 --- src/backend/executor/execProcnode.c +++ src/backend/executor/execProcnode.c @@ -109,6 +109,7 @@ 
#include "executor/nodeSubplan.h" #include "executor/nodeSubqueryscan.h" #include "executor/nodeTableFuncscan.h" +#include "executor/nodeTidrangescan.h" #include "executor/nodeTidscan.h" #include "executor/nodeUnique.h" #include "executor/nodeValuesscan.h" @@ -238,6 +239,11 @@ ExecInitNode(Plan *node, EState *estate, int eflags) estate, eflags); break; + case T_TidRangeScan: + result = (PlanState *) ExecInitTidRangeScan((TidRangeScan *) node, + estate, eflags); + break; + case T_SubqueryScan: result = (PlanState *) ExecInitSubqueryScan((SubqueryScan *) node, estate, eflags); @@ -637,6 +643,10 @@ ExecEndNode(PlanState *node) ExecEndTidScan((TidScanState *) node); break; + case T_TidRangeScanState: + ExecEndTidRangeScan((TidRangeScanState *) node); + break; + case T_SubqueryScanState: ExecEndSubqueryScan((SubqueryScanState *) node); break; diff --git src/backend/executor/nodeTidrangescan.c src/backend/executor/nodeTidrangescan.c new file mode 100644 index 0000000000..8a72f52074 --- /dev/null +++ src/backend/executor/nodeTidrangescan.c @@ -0,0 +1,580 @@ +/*------------------------------------------------------------------------- + * + * nodeTidrangescan.c + * Routines to support tid range scans of relations + * + * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/executor/nodeTidrangescan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "access/sysattr.h" +#include "access/tableam.h" +#include "catalog/pg_operator.h" +#include "executor/execdebug.h" +#include "executor/nodeTidrangescan.h" +#include "nodes/nodeFuncs.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" + + +#define IsCTIDVar(node) \ + ((node) != NULL && \ + IsA((node), Var) && \ + ((Var *) (node))->varattno == SelfItemPointerAttributeNumber && \ + ((Var *) 
(node))->varlevelsup == 0) + +typedef enum +{ + TIDEXPR_UPPER_BOUND, + TIDEXPR_LOWER_BOUND +} TidExprType; + +/* Upper or lower range bound for scan */ +typedef struct TidOpExpr +{ + TidExprType exprtype; /* type of op */ + ExprState *exprstate; /* ExprState for a TID-yielding subexpr */ + bool inclusive; /* whether op is inclusive */ +} TidOpExpr; + +/* + * For the given 'expr', build and return an appropriate TidOpExpr taking into + * account the expr's operator and operand order. + */ +static TidOpExpr * +MakeTidOpExpr(OpExpr *expr, TidRangeScanState *tidstate) +{ + Node *arg1 = get_leftop((Expr *) expr); + Node *arg2 = get_rightop((Expr *) expr); + ExprState *exprstate = NULL; + bool invert = false; + TidOpExpr *tidopexpr; + + if (IsCTIDVar(arg1)) + exprstate = ExecInitExpr((Expr *) arg2, &tidstate->ss.ps); + else if (IsCTIDVar(arg2)) + { + exprstate = ExecInitExpr((Expr *) arg1, &tidstate->ss.ps); + invert = true; + } + else + elog(ERROR, "could not identify CTID variable"); + + tidopexpr = (TidOpExpr *) palloc0(sizeof(TidOpExpr)); + + switch (expr->opno) + { + case TIDLessEqOperator: + tidopexpr->inclusive = true; + /* fall through */ + case TIDLessOperator: + tidopexpr->exprtype = invert ? TIDEXPR_LOWER_BOUND : TIDEXPR_UPPER_BOUND; + break; + case TIDGreaterEqOperator: + tidopexpr->inclusive = true; + /* fall through */ + case TIDGreaterOperator: + tidopexpr->exprtype = invert ? TIDEXPR_UPPER_BOUND : TIDEXPR_LOWER_BOUND; + break; + default: + elog(ERROR, "could not identify CTID operator"); + } + + tidopexpr->exprstate = exprstate; + + return tidopexpr; +} + +/* + * Extract the qual subexpressions that yield TIDs to search for, + * and compile them into ExprStates if they're ordinary expressions. 
+ */ +static void +TidExprListCreate(TidRangeScanState *tidrangestate) +{ + TidRangeScan *node = (TidRangeScan *) tidrangestate->ss.ps.plan; + List *tidexprs = NIL; + ListCell *l; + + foreach(l, node->tidrangequals) + { + OpExpr *opexpr = lfirst(l); + TidOpExpr *tidopexpr; + + if (!IsA(opexpr, OpExpr)) + elog(ERROR, "could not identify CTID expression"); + + tidopexpr = MakeTidOpExpr(opexpr, tidrangestate); + tidexprs = lappend(tidexprs, tidopexpr); + } + + tidrangestate->trss_tidexprs = tidexprs; +} + +/* + * Set 'lowerBound' based on 'tid'. If 'inclusive' is false then the + * lowerBound is incremented to the next tid value so that it becomes + * inclusive. If there is no valid next tid value then we return false, + * otherwise we return true. + */ +static bool +SetTidLowerBound(ItemPointer tid, bool inclusive, ItemPointer lowerBound) +{ + OffsetNumber offset; + + *lowerBound = *tid; + offset = ItemPointerGetOffsetNumberNoCheck(tid); + + if (!inclusive) + { + /* Check if the lower bound is actually in the next block. */ + if (offset >= MaxOffsetNumber) + { + BlockNumber block = ItemPointerGetBlockNumberNoCheck(lowerBound); + + /* + * If the lower bound was already at or above the maximum block + * number, then there is no valid value for it to be set to. + */ + if (block >= MaxBlockNumber) + return false; + + /* Set the lowerBound to the first offset in the next block */ + ItemPointerSet(lowerBound, block + 1, 1); + } + else + ItemPointerSetOffsetNumber(lowerBound, OffsetNumberNext(offset)); + } + else if (offset == 0) + ItemPointerSetOffsetNumber(lowerBound, 1); + + return true; +} + +/* + * Set 'upperBound' based on 'tid'. If 'inclusive' is false then the + * upperBound is decremented to the previous tid value so that it becomes + * inclusive. If there is no valid previous tid value then we return false, + * otherwise we return true. 
+ */ +static bool +SetTidUpperBound(ItemPointer tid, bool inclusive, ItemPointer upperBound) +{ + OffsetNumber offset; + + *upperBound = *tid; + offset = ItemPointerGetOffsetNumberNoCheck(tid); + + /* + * Since TID offsets start at 1, an inclusive upper bound with offset 0 + * can be treated as an exclusive bound. This has the benefit of + * eliminating that block from the scan range. + */ + if (inclusive && offset == 0) + inclusive = false; + + if (!inclusive) + { + /* Check if the upper bound is actually in the previous block. */ + if (offset == 0) + { + BlockNumber block = ItemPointerGetBlockNumberNoCheck(upperBound); + + /* + * If the upper bound was already in block 0, then there is no + * valid value for it to be set to. + */ + if (block == 0) + return false; + + ItemPointerSet(upperBound, block - 1, MaxOffsetNumber); + } + else + ItemPointerSetOffsetNumber(upperBound, OffsetNumberPrev(offset)); + } + + return true; +} + +/* ---------------------------------------------------------------- + * TidRangeEval + * + * Compute and set node's block and offset range to scan by evaluating + * the trss_tidexprs. If we detect an invalid range that cannot yield + * any rows, the range is left unset. + * ---------------------------------------------------------------- + */ +static void +TidRangeEval(TidRangeScanState *node) +{ + ExprContext *econtext = node->ss.ps.ps_ExprContext; + BlockNumber nblocks; + ItemPointerData lowerBound; + ItemPointerData upperBound; + ListCell *l; + + /* + * We silently discard any TIDs that are out of range at the time of scan + * start. (Since we hold at least AccessShareLock on the table, it won't + * be possible for someone to truncate away the blocks we intend to + * visit.) + */ + nblocks = RelationGetNumberOfBlocks(node->ss.ss_currentRelation); + + /* The biggest range on an empty table is empty; just skip it. */ + if (nblocks == 0) + return; + + /* Set the lower and upper bound to scan the whole table. 
*/ + ItemPointerSet(&lowerBound, 0, 1); + ItemPointerSet(&upperBound, nblocks - 1, MaxOffsetNumber); + + foreach(l, node->trss_tidexprs) + { + TidOpExpr *tidopexpr = (TidOpExpr *) lfirst(l); + ItemPointer itemptr; + bool isNull; + + /* Evaluate this bound. */ + itemptr = (ItemPointer) + DatumGetPointer(ExecEvalExprSwitchContext(tidopexpr->exprstate, + econtext, + &isNull)); + + /* If the bound is NULL, *nothing* matches the qual. */ + if (isNull) + return; + + if (tidopexpr->exprtype == TIDEXPR_LOWER_BOUND) + { + ItemPointerData lb; + + /* + * If the lower bound is beyond the maximum value for ctid, then + * just bail without setting the range. No rows can match. + */ + if (!SetTidLowerBound(itemptr, tidopexpr->inclusive, &lb)) + return; + + if (ItemPointerCompare(&lb, &lowerBound) > 0) + lowerBound = lb; + } + + if (tidopexpr->exprtype == TIDEXPR_UPPER_BOUND) + { + ItemPointerData ub; + + /* + * If the upper bound is below the minimum value for ctid, then + * just bail without setting the range. No rows can match. + */ + if (!SetTidUpperBound(itemptr, tidopexpr->inclusive, &ub)) + return; + + if (ItemPointerCompare(&ub, &upperBound) < 0) + upperBound = ub; + } + } + + /* If the resulting range is not empty, set it. */ + if (ItemPointerCompare(&lowerBound, &upperBound) <= 0) + { + node->trss_startBlock = ItemPointerGetBlockNumberNoCheck(&lowerBound); + node->trss_endBlock = ItemPointerGetBlockNumberNoCheck(&upperBound); + node->trss_startOffset = ItemPointerGetOffsetNumberNoCheck(&lowerBound); + node->trss_endOffset = ItemPointerGetOffsetNumberNoCheck(&upperBound); + } +} + +/* ---------------------------------------------------------------- + * NextInTidRange + * + * Fetch the next tuple when scanning a range of TIDs. + * + * Since the table access method may return tuples that are in the scan + * limit, but not within the required TID range, this function will + * check for such tuples and skip over them. 
+ * ---------------------------------------------------------------- + */ +static bool +NextInTidRange(TidRangeScanState *node, TableScanDesc scandesc, + TupleTableSlot *slot) +{ + for (;;) + { + BlockNumber block; + OffsetNumber offset; + + if (!table_scan_getnextslot(scandesc, ForwardScanDirection, slot)) + return false; + + /* Check that the tuple is within the required range. */ + block = ItemPointerGetBlockNumber(&slot->tts_tid); + offset = ItemPointerGetOffsetNumber(&slot->tts_tid); + + /* The tuple should never come from outside the scan limits. */ + Assert(block >= node->trss_startBlock && + block <= node->trss_endBlock); + + /* + * If the tuple is in the first block of the range and before the + * first requested offset, then we can skip it. + */ + if (block == node->trss_startBlock && offset < node->trss_startOffset) + { + ExecClearTuple(slot); + continue; + } + + /* + * Similarly, if the tuple is in the last block and after the last + * requested offset, we can end the scan. + */ + if (block == node->trss_endBlock && offset > node->trss_endOffset) + { + ExecClearTuple(slot); + return false; + } + + return true; + } +} + +/* ---------------------------------------------------------------- + * TidRangeNext + * + * Retrieve a tuple from the TidRangeScan node's currentRelation + * using the tids in the TidRangeScanState information. 
+ * + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +TidRangeNext(TidRangeScanState *node) +{ + TableScanDesc scandesc; + EState *estate; + TupleTableSlot *slot; + bool foundTuple; + + /* + * extract necessary information from tid range scan node + */ + scandesc = node->ss.ss_currentScanDesc; + estate = node->ss.ps.state; + slot = node->ss.ss_ScanTupleSlot; + + Assert(ScanDirectionIsForward(estate->es_direction)); + + if (!node->trss_inScan) + { + BlockNumber blocks_to_scan; + + /* First time through, compute the TID range to be visited */ + if (node->trss_startBlock == InvalidBlockNumber) + TidRangeEval(node); + + if (scandesc == NULL) + { + scandesc = table_beginscan_strat(node->ss.ss_currentRelation, + estate->es_snapshot, + 0, NULL, + false, false); + node->ss.ss_currentScanDesc = scandesc; + } + + /* Compute the number of blocks to scan and set the scan limits. */ + if (node->trss_startBlock == InvalidBlockNumber) + { + /* If the range is empty, set the scan limits to zero blocks. */ + node->trss_startBlock = 0; + blocks_to_scan = 0; + } + else + blocks_to_scan = node->trss_endBlock - node->trss_startBlock + 1; + + table_scan_setlimits(scandesc, node->trss_startBlock, blocks_to_scan); + node->trss_inScan = true; + } + + /* Fetch the next tuple. */ + foundTuple = NextInTidRange(node, scandesc, slot); + + /* + * If we've exhausted all the tuples in the range, reset the inScan flag. + * This will cause the heap to be rescanned for any subsequent fetches, + * which is important for some cursor operations: for instance, FETCH LAST + * fetches all the tuples in order and then fetches one tuple in reverse. + */ + if (!foundTuple) + node->trss_inScan = false; + + return slot; +} + +/* + * TidRangeRecheck -- access method routine to recheck a tuple in EvalPlanQual + */ +static bool +TidRangeRecheck(TidRangeScanState *node, TupleTableSlot *slot) +{ + /* + * XXX shouldn't we check here to make sure tuple is in TID range? 
In + * runtime-key case this is not certain, is it? + */ + return true; +} + +/* ---------------------------------------------------------------- + * ExecTidRangeScan(node) + * + * Scans the relation using tids and returns the next qualifying tuple. + * We call the ExecScan() routine and pass it the appropriate + * access method functions. + * + * Conditions: + * -- the "cursor" maintained by the AMI is positioned at the tuple + * returned previously. + * + * Initial States: + * -- the relation indicated is opened for scanning so that the + * "cursor" is positioned before the first qualifying tuple. + * -- trss_startBlock is InvalidBlockNumber + * ---------------------------------------------------------------- + */ +static TupleTableSlot * +ExecTidRangeScan(PlanState *pstate) +{ + TidRangeScanState *node = castNode(TidRangeScanState, pstate); + + return ExecScan(&node->ss, + (ExecScanAccessMtd) TidRangeNext, + (ExecScanRecheckMtd) TidRangeRecheck); +} + +/* ---------------------------------------------------------------- + * ExecReScanTidRangeScan(node) + * ---------------------------------------------------------------- + */ +void +ExecReScanTidRangeScan(TidRangeScanState *node) +{ + TableScanDesc scan = node->ss.ss_currentScanDesc; + + if (scan != NULL) + table_rescan(scan, NULL); + + /* mark scan as not in progress, and tid range as not computed yet */ + node->trss_inScan = false; + node->trss_startBlock = InvalidBlockNumber; + + ExecScanReScan(&node->ss); +} + +/* ---------------------------------------------------------------- + * ExecEndTidRangeScan + * + * Releases any storage allocated through C routines. + * Returns nothing. 
+ * ---------------------------------------------------------------- + */ +void +ExecEndTidRangeScan(TidRangeScanState *node) +{ + TableScanDesc scan = node->ss.ss_currentScanDesc; + + if (scan != NULL) + table_endscan(scan); + + /* + * Free the exprcontext + */ + ExecFreeExprContext(&node->ss.ps); + + /* + * clear out tuple table slots + */ + if (node->ss.ps.ps_ResultTupleSlot) + ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); + ExecClearTuple(node->ss.ss_ScanTupleSlot); +} + +/* ---------------------------------------------------------------- + * ExecInitTidRangeScan + * + * Initializes the tid range scan's state information, opens the + * scan relation, and sets up the TID range bound expressions. + * + * Parameters: + * node: TidRangeScan node produced by the planner. + * estate: the execution state initialized in InitPlan. + * ---------------------------------------------------------------- + */ +TidRangeScanState * +ExecInitTidRangeScan(TidRangeScan *node, EState *estate, int eflags) +{ + TidRangeScanState *tidrangestate; + Relation currentRelation; + + /* + * create state structure + */ + tidrangestate = makeNode(TidRangeScanState); + tidrangestate->ss.ps.plan = (Plan *) node; + tidrangestate->ss.ps.state = estate; + tidrangestate->ss.ps.ExecProcNode = ExecTidRangeScan; + + /* + * Miscellaneous initialization + * + * create expression context for node + */ + ExecAssignExprContext(estate, &tidrangestate->ss.ps); + + /* + * mark scan as not in progress, and tid range as not computed yet + */ + tidrangestate->trss_inScan = false; + tidrangestate->trss_startBlock = InvalidBlockNumber; + + /* + * open the scan relation + */ + currentRelation = ExecOpenScanRelation(estate, node->scan.scanrelid, eflags); + + tidrangestate->ss.ss_currentRelation = currentRelation; + tidrangestate->ss.ss_currentScanDesc = NULL; /* table scan is begun on first fetch by TidRangeNext */ + + /* + * get the scan type from the relation descriptor. 
+ */ + ExecInitScanTupleSlot(estate, &tidrangestate->ss, + RelationGetDescr(currentRelation), + table_slot_callbacks(currentRelation)); + + /* + * Initialize result type and projection. + */ + ExecInitResultTypeTL(&tidrangestate->ss.ps); + ExecAssignScanProjectionInfo(&tidrangestate->ss); + + /* + * initialize child expressions + */ + tidrangestate->ss.ps.qual = + ExecInitQual(node->scan.plan.qual, (PlanState *) tidrangestate); + + TidExprListCreate(tidrangestate); + + /* + * all done. + */ + return tidrangestate; +} diff --git src/backend/nodes/copyfuncs.c src/backend/nodes/copyfuncs.c index 70f8b718e0..2abc276e1c 100644 --- src/backend/nodes/copyfuncs.c +++ src/backend/nodes/copyfuncs.c @@ -585,6 +585,27 @@ _copyTidScan(const TidScan *from) return newnode; } +/* + * _copyTidRangeScan + */ +static TidRangeScan * +_copyTidRangeScan(const TidRangeScan *from) +{ + TidRangeScan *newnode = makeNode(TidRangeScan); + + /* + * copy node superclass fields + */ + CopyScanFields((const Scan *) from, (Scan *) newnode); + + /* + * copy remainder of node + */ + COPY_NODE_FIELD(tidrangequals); + + return newnode; +} + /* * _copySubqueryScan */ @@ -4889,6 +4910,9 @@ copyObjectImpl(const void *from) case T_TidScan: retval = _copyTidScan(from); break; + case T_TidRangeScan: + retval = _copyTidRangeScan(from); + break; case T_SubqueryScan: retval = _copySubqueryScan(from); break; diff --git src/backend/nodes/outfuncs.c src/backend/nodes/outfuncs.c index d78b16ed1d..93163e3a2f 100644 --- src/backend/nodes/outfuncs.c +++ src/backend/nodes/outfuncs.c @@ -608,6 +608,16 @@ _outTidScan(StringInfo str, const TidScan *node) WRITE_NODE_FIELD(tidquals); } +static void +_outTidRangeScan(StringInfo str, const TidRangeScan *node) +{ + WRITE_NODE_TYPE("TIDRANGESCAN"); + + _outScanInfo(str, (const Scan *) node); + + WRITE_NODE_FIELD(tidrangequals); +} + static void _outSubqueryScan(StringInfo str, const SubqueryScan *node) { @@ -3770,6 +3780,9 @@ outNode(StringInfo str, const void *obj) case 
T_TidScan: _outTidScan(str, obj); break; + case T_TidRangeScan: + _outTidRangeScan(str, obj); + break; case T_SubqueryScan: _outSubqueryScan(str, obj); break; diff --git src/backend/optimizer/README src/backend/optimizer/README index efb52858c8..4a6c348162 100644 --- src/backend/optimizer/README +++ src/backend/optimizer/README @@ -374,6 +374,7 @@ RelOptInfo - a relation or joined relations IndexPath - index scan BitmapHeapPath - top of a bitmapped index scan TidPath - scan by CTID + TidRangePath - scan a contiguous range of CTIDs SubqueryScanPath - scan a subquery-in-FROM ForeignPath - scan a foreign table, foreign join or foreign upper-relation CustomPath - for custom scan providers diff --git src/backend/optimizer/path/costsize.c src/backend/optimizer/path/costsize.c index 22d6935824..40cd6fe460 100644 --- src/backend/optimizer/path/costsize.c +++ src/backend/optimizer/path/costsize.c @@ -1283,6 +1283,101 @@ cost_tidscan(Path *path, PlannerInfo *root, path->total_cost = startup_cost + run_cost; } +/* + * cost_tidrangescan + * Determines and sets the costs of scanning a relation using a range of + * TIDs for 'path' + * + * 'baserel' is the relation to be scanned + * 'tidrangequals' is the list of TID-checkable range quals + * 'param_info' is the ParamPathInfo if this is a parameterized path, else NULL + */ +void +cost_tidrangescan(Path *path, PlannerInfo *root, + RelOptInfo *baserel, List *tidrangequals, + ParamPathInfo *param_info) +{ + Selectivity selectivity; + double pages; + Cost startup_cost = 0; + Cost run_cost = 0; + QualCost qpqual_cost; + Cost cpu_per_tuple; + QualCost tid_qual_cost; + double ntuples; + double nseqpages; + double spc_random_page_cost; + double spc_seq_page_cost; + + /* Should only be applied to base relations */ + Assert(baserel->relid > 0); + Assert(baserel->rtekind == RTE_RELATION); + + /* Mark the path with the correct row estimate */ + if (param_info) + path->rows = param_info->ppi_rows; + else + path->rows = baserel->rows; + + /* 
Count how many tuples and pages we expect to scan */ + selectivity = clauselist_selectivity(root, tidrangequals, baserel->relid, + JOIN_INNER, NULL); + pages = ceil(selectivity * baserel->pages); + + if (pages <= 0.0) + pages = 1.0; + + /* + * The first page in a range requires a random seek, but each subsequent + * page is just a normal sequential page read. NOTE: it's desirable for + * Tid Range Scans to cost more than the equivalent Sequential Scans, + * because Seq Scans have some performance advantages such as scan + * synchronization and parallelizability, and we'd prefer one of them to + * be picked unless a Tid Range Scan really is better. + */ + ntuples = selectivity * baserel->tuples; + nseqpages = pages - 1.0; + + if (!enable_tidscan) + startup_cost += disable_cost; + + /* + * The TID qual expressions will be computed once, any other baserestrict + * quals once per retrieved tuple. + */ + cost_qual_eval(&tid_qual_cost, tidrangequals, root); + + /* fetch estimated page cost for tablespace containing table */ + get_tablespace_page_costs(baserel->reltablespace, + &spc_random_page_cost, + &spc_seq_page_cost); + + /* disk costs; 1 random page and the remainder as seq pages */ + run_cost += spc_random_page_cost + spc_seq_page_cost * nseqpages; + + /* Add scanning CPU costs */ + get_restriction_qual_cost(root, baserel, param_info, &qpqual_cost); + + /* + * XXX currently we assume TID quals are a subset of qpquals at this + * point; they will be removed (if possible) when we create the plan, so + * we subtract their cost from the total qpqual cost. (If the TID quals + * can't be removed, this is a mistake and we're going to underestimate + * the CPU cost a bit.) 
+ */ + startup_cost += qpqual_cost.startup + tid_qual_cost.per_tuple; + cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple - + tid_qual_cost.per_tuple; + run_cost += cpu_per_tuple * ntuples; + + /* tlist eval costs are paid per output row, not per tuple scanned */ + startup_cost += path->pathtarget->cost.startup; + run_cost += path->pathtarget->cost.per_tuple * path->rows; + + path->startup_cost = startup_cost; + path->total_cost = startup_cost + run_cost; +} + /* * cost_subqueryscan * Determines and returns the cost of scanning a subquery RTE. diff --git src/backend/optimizer/path/tidpath.c src/backend/optimizer/path/tidpath.c index 1463a82be8..aa4d6aefad 100644 --- src/backend/optimizer/path/tidpath.c +++ src/backend/optimizer/path/tidpath.c @@ -2,9 +2,9 @@ * * tidpath.c * Routines to determine which TID conditions are usable for scanning - * a given relation, and create TidPaths accordingly. + * a given relation, and create TidPaths and TidRangePaths accordingly. * - * What we are looking for here is WHERE conditions of the form + * For TidPaths, we look for WHERE conditions of the form * "CTID = pseudoconstant", which can be implemented by just fetching * the tuple directly via heap_fetch(). We can also handle OR'd conditions * such as (CTID = const1) OR (CTID = const2), as well as ScalarArrayOpExpr @@ -23,6 +23,9 @@ * a function, but in practice it works better to keep the special node * representation all the way through to execution. * + * Additionally, TidRangePaths may be created for conditions of the form + * "CTID relop pseudoconstant", where relop is one of >,>=,<,<=, and + * AND-clauses composed of such conditions. 
* * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -63,14 +66,14 @@ IsCTIDVar(Var *var, RelOptInfo *rel) /* * Check to see if a RestrictInfo is of the form - * CTID = pseudoconstant + * CTID OP pseudoconstant * or - * pseudoconstant = CTID - * where the CTID Var belongs to relation "rel", and nothing on the - * other side of the clause does. + * pseudoconstant OP CTID + * where OP is a binary operation, the CTID Var belongs to relation "rel", + * and nothing on the other side of the clause does. */ static bool -IsTidEqualClause(RestrictInfo *rinfo, RelOptInfo *rel) +IsBinaryTidClause(RestrictInfo *rinfo, RelOptInfo *rel) { OpExpr *node; Node *arg1, @@ -83,10 +86,9 @@ IsTidEqualClause(RestrictInfo *rinfo, RelOptInfo *rel) return false; node = (OpExpr *) rinfo->clause; - /* Operator must be tideq */ - if (node->opno != TIDEqualOperator) + /* OpExpr must have two arguments */ + if (list_length(node->args) != 2) return false; - Assert(list_length(node->args) == 2); arg1 = linitial(node->args); arg2 = lsecond(node->args); @@ -116,6 +118,50 @@ IsTidEqualClause(RestrictInfo *rinfo, RelOptInfo *rel) return true; /* success */ } +/* + * Check to see if a RestrictInfo is of the form + * CTID = pseudoconstant + * or + * pseudoconstant = CTID + * where the CTID Var belongs to relation "rel", and nothing on the + * other side of the clause does. + */ +static bool +IsTidEqualClause(RestrictInfo *rinfo, RelOptInfo *rel) +{ + if (!IsBinaryTidClause(rinfo, rel)) + return false; + + if (((OpExpr *) rinfo->clause)->opno == TIDEqualOperator) + return true; + + return false; +} + +/* + * Check to see if a RestrictInfo is of the form + * CTID OP pseudoconstant + * or + * pseudoconstant OP CTID + * where OP is a range operator such as <, <=, >, or >=, the CTID Var belongs + * to relation "rel", and nothing on the other side of the clause does. 
+ */ +static bool +IsTidRangeClause(RestrictInfo *rinfo, RelOptInfo *rel) +{ + Oid opno; + + if (!IsBinaryTidClause(rinfo, rel)) + return false; + opno = ((OpExpr *) rinfo->clause)->opno; + + if (opno == TIDLessOperator || opno == TIDLessEqOperator || + opno == TIDGreaterOperator || opno == TIDGreaterEqOperator) + return true; + + return false; +} + /* * Check to see if a RestrictInfo is of the form * CTID = ANY (pseudoconstant_array) @@ -222,7 +268,7 @@ TidQualFromRestrictInfo(RestrictInfo *rinfo, RelOptInfo *rel) * * Returns a List of CTID qual RestrictInfos for the specified rel (with * implicit OR semantics across the list), or NIL if there are no usable - * conditions. + * equality conditions. * * This function is just concerned with handling AND/OR recursion. */ @@ -301,6 +347,33 @@ TidQualFromRestrictInfoList(List *rlist, RelOptInfo *rel) return rlst; } +/* + * Extract a set of CTID range conditions from implicit-AND List of RestrictInfos + * + * Returns a List of CTID range qual RestrictInfos for the specified rel + * (with implicit AND semantics across the list), or NIL if there are no + * usable range conditions. + */ +static List * +TidRangeQualFromRestrictInfoList(List *rlist, RelOptInfo *rel) +{ + List *rlst = NIL; + ListCell *l; + + if (!rel->has_scan_setlimits) + return NIL; + + foreach(l, rlist) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); + + if (IsTidRangeClause(rinfo, rel)) + rlst = lappend(rlst, rinfo); + } + + return rlst; +} + /* * Given a list of join clauses involving our rel, create a parameterized * TidPath for each one that is a suitable TidEqual clause. 
@@ -385,6 +458,7 @@ void create_tidscan_paths(PlannerInfo *root, RelOptInfo *rel) { List *tidquals; + List *tidrangequals; /* * If any suitable quals exist in the rel's baserestrict list, generate a @@ -404,6 +478,26 @@ create_tidscan_paths(PlannerInfo *root, RelOptInfo *rel) required_outer)); } + /* + * If there are range quals in the baserestrict list, generate a + * TidRangePath. + */ + tidrangequals = TidRangeQualFromRestrictInfoList(rel->baserestrictinfo, + rel); + + if (tidrangequals) + { + /* + * This path uses no join clauses, but it could still have required + * parameterization due to LATERAL refs in its tlist. + */ + Relids required_outer = rel->lateral_relids; + + add_path(rel, (Path *) create_tidrangescan_path(root, rel, + tidrangequals, + required_outer)); + } + /* * Try to generate parameterized TidPaths using equality clauses extracted * from EquivalenceClasses. (This is important since simple "t1.ctid = diff --git src/backend/optimizer/plan/createplan.c src/backend/optimizer/plan/createplan.c index f7a8dae3c6..bdfee9cc61 100644 --- src/backend/optimizer/plan/createplan.c +++ src/backend/optimizer/plan/createplan.c @@ -129,6 +129,10 @@ static Plan *create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual, static void bitmap_subplan_mark_shared(Plan *plan); static TidScan *create_tidscan_plan(PlannerInfo *root, TidPath *best_path, List *tlist, List *scan_clauses); +static TidRangeScan *create_tidrangescan_plan(PlannerInfo *root, + TidRangePath *best_path, + List *tlist, + List *scan_clauses); static SubqueryScan *create_subqueryscan_plan(PlannerInfo *root, SubqueryScanPath *best_path, List *tlist, List *scan_clauses); @@ -193,6 +197,8 @@ static BitmapHeapScan *make_bitmap_heapscan(List *qptlist, Index scanrelid); static TidScan *make_tidscan(List *qptlist, List *qpqual, Index scanrelid, List *tidquals); +static TidRangeScan *make_tidrangescan(List *qptlist, List *qpqual, + Index scanrelid, List *tidrangequals); static SubqueryScan 
*make_subqueryscan(List *qptlist, List *qpqual, Index scanrelid, @@ -384,6 +390,7 @@ create_plan_recurse(PlannerInfo *root, Path *best_path, int flags) case T_IndexOnlyScan: case T_BitmapHeapScan: case T_TidScan: + case T_TidRangeScan: case T_SubqueryScan: case T_FunctionScan: case T_TableFuncScan: @@ -679,6 +686,13 @@ create_scan_plan(PlannerInfo *root, Path *best_path, int flags) scan_clauses); break; + case T_TidRangeScan: + plan = (Plan *) create_tidrangescan_plan(root, + (TidRangePath *) best_path, + tlist, + scan_clauses); + break; + case T_SubqueryScan: plan = (Plan *) create_subqueryscan_plan(root, (SubqueryScanPath *) best_path, @@ -3440,6 +3454,71 @@ create_tidscan_plan(PlannerInfo *root, TidPath *best_path, return scan_plan; } +/* + * create_tidrangescan_plan + * Returns a tidrangescan plan for the base relation scanned by 'best_path' + * with restriction clauses 'scan_clauses' and targetlist 'tlist'. + */ +static TidRangeScan * +create_tidrangescan_plan(PlannerInfo *root, TidRangePath *best_path, + List *tlist, List *scan_clauses) +{ + TidRangeScan *scan_plan; + Index scan_relid = best_path->path.parent->relid; + List *tidrangequals = best_path->tidrangequals; + + /* it should be a base rel... */ + Assert(scan_relid > 0); + Assert(best_path->path.parent->rtekind == RTE_RELATION); + + /* + * The qpqual list must contain all restrictions not enforced by the + * tidrangequals list. tidrangequals has AND semantics, so we can simply + * remove any qual that appears in it. 
+ */ + { + List *qpqual = NIL; + ListCell *l; + + foreach(l, scan_clauses) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, l); + + if (rinfo->pseudoconstant) + continue; /* we may drop pseudoconstants here */ + if (list_member_ptr(tidrangequals, rinfo)) + continue; /* simple duplicate */ + qpqual = lappend(qpqual, rinfo); + } + scan_clauses = qpqual; + } + + /* Sort clauses into best execution order */ + scan_clauses = order_qual_clauses(root, scan_clauses); + + /* Reduce RestrictInfo lists to bare expressions; ignore pseudoconstants */ + tidrangequals = extract_actual_clauses(tidrangequals, false); + scan_clauses = extract_actual_clauses(scan_clauses, false); + + /* Replace any outer-relation variables with nestloop params */ + if (best_path->path.param_info) + { + tidrangequals = (List *) + replace_nestloop_params(root, (Node *) tidrangequals); + scan_clauses = (List *) + replace_nestloop_params(root, (Node *) scan_clauses); + } + + scan_plan = make_tidrangescan(tlist, + scan_clauses, + scan_relid, + tidrangequals); + + copy_generic_path_info(&scan_plan->scan.plan, &best_path->path); + + return scan_plan; +} + /* * create_subqueryscan_plan * Returns a subqueryscan plan for the base relation scanned by 'best_path' @@ -5373,6 +5452,25 @@ make_tidscan(List *qptlist, return node; } +static TidRangeScan * +make_tidrangescan(List *qptlist, + List *qpqual, + Index scanrelid, + List *tidrangequals) +{ + TidRangeScan *node = makeNode(TidRangeScan); + Plan *plan = &node->scan.plan; + + plan->targetlist = qptlist; + plan->qual = qpqual; + plan->lefttree = NULL; + plan->righttree = NULL; + node->scan.scanrelid = scanrelid; + node->tidrangequals = tidrangequals; + + return node; +} + static SubqueryScan * make_subqueryscan(List *qptlist, List *qpqual, diff --git src/backend/optimizer/plan/setrefs.c src/backend/optimizer/plan/setrefs.c index 127ea3d856..7ce2d00b2b 100644 --- src/backend/optimizer/plan/setrefs.c +++ src/backend/optimizer/plan/setrefs.c @@ -619,6 +619,22 
@@ set_plan_refs(PlannerInfo *root, Plan *plan, int rtoffset) rtoffset, 1); } break; + case T_TidRangeScan: + { + TidRangeScan *splan = (TidRangeScan *) plan; + + splan->scan.scanrelid += rtoffset; + splan->scan.plan.targetlist = + fix_scan_list(root, splan->scan.plan.targetlist, + rtoffset, NUM_EXEC_TLIST(plan)); + splan->scan.plan.qual = + fix_scan_list(root, splan->scan.plan.qual, + rtoffset, NUM_EXEC_QUAL(plan)); + splan->tidrangequals = + fix_scan_list(root, splan->tidrangequals, + rtoffset, 1); /* XXX assumes the TID range quals are evaluated only once per scan; confirm */ + } + break; case T_SubqueryScan: /* Needs special treatment, see comments below */ return set_subqueryscan_references(root, diff --git src/backend/optimizer/plan/subselect.c src/backend/optimizer/plan/subselect.c index fcce81926b..094d5b50d0 100644 --- src/backend/optimizer/plan/subselect.c +++ src/backend/optimizer/plan/subselect.c @@ -2367,6 +2367,12 @@ finalize_plan(PlannerInfo *root, Plan *plan, context.paramids = bms_add_members(context.paramids, scan_params); break; + case T_TidRangeScan: + finalize_primnode((Node *) ((TidRangeScan *) plan)->tidrangequals, + &context); + context.paramids = bms_add_members(context.paramids, scan_params); + break; + case T_SubqueryScan: { SubqueryScan *sscan = (SubqueryScan *) plan; diff --git src/backend/optimizer/util/pathnode.c src/backend/optimizer/util/pathnode.c index 51478957fb..e28d74afe9 100644 --- src/backend/optimizer/util/pathnode.c +++ src/backend/optimizer/util/pathnode.c @@ -1203,6 +1203,35 @@ create_tidscan_path(PlannerInfo *root, RelOptInfo *rel, List *tidquals, return pathnode; } +/* + * create_tidrangescan_path + * Creates a path corresponding to a scan by a range of TIDs, returning + * the pathnode.
+ */ +TidRangePath * +create_tidrangescan_path(PlannerInfo *root, RelOptInfo *rel, + List *tidrangequals, Relids required_outer) +{ + TidRangePath *pathnode = makeNode(TidRangePath); + + pathnode->path.pathtype = T_TidRangeScan; + pathnode->path.parent = rel; + pathnode->path.pathtarget = rel->reltarget; + pathnode->path.param_info = get_baserel_parampathinfo(root, rel, + required_outer); + pathnode->path.parallel_aware = false; + pathnode->path.parallel_safe = rel->consider_parallel; + pathnode->path.parallel_workers = 0; + pathnode->path.pathkeys = NIL; /* always unordered */ + + pathnode->tidrangequals = tidrangequals; + + cost_tidrangescan(&pathnode->path, root, rel, tidrangequals, + pathnode->path.param_info); + + return pathnode; +} + /* * create_append_path * Creates a path corresponding to an Append plan, returning the diff --git src/backend/optimizer/util/plancat.c src/backend/optimizer/util/plancat.c index daf1759623..4333f6c4c2 100644 --- src/backend/optimizer/util/plancat.c +++ src/backend/optimizer/util/plancat.c @@ -466,6 +466,10 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, /* Collect info about relation's foreign keys, if relevant */ get_relation_foreign_keys(root, rel, relation, inhparent); + /* Collect info about functions implemented by the rel's table AM. */ + rel->has_scan_setlimits = relation->rd_tableam && + relation->rd_tableam->scan_setlimits != NULL; + /* * Collect info about relation's partitioning scheme, if any. Only * inheritance parents may be partitioned. 
diff --git src/backend/optimizer/util/relnode.c src/backend/optimizer/util/relnode.c index 9c9a738c80..9536c238fb 100644 --- src/backend/optimizer/util/relnode.c +++ src/backend/optimizer/util/relnode.c @@ -247,6 +247,7 @@ build_simple_rel(PlannerInfo *root, int relid, RelOptInfo *parent) rel->baserestrict_min_security = UINT_MAX; rel->joininfo = NIL; rel->has_eclass_joins = false; + rel->has_scan_setlimits = false; rel->consider_partitionwise_join = false; /* might get changed later */ rel->part_scheme = NULL; rel->nparts = -1; @@ -659,6 +660,7 @@ build_join_rel(PlannerInfo *root, joinrel->baserestrict_min_security = UINT_MAX; joinrel->joininfo = NIL; joinrel->has_eclass_joins = false; + joinrel->has_scan_setlimits = false; joinrel->consider_partitionwise_join = false; /* might get changed later */ joinrel->top_parent_relids = NULL; joinrel->part_scheme = NULL; @@ -836,6 +838,7 @@ build_child_join_rel(PlannerInfo *root, RelOptInfo *outer_rel, joinrel->baserestrictcost.per_tuple = 0; joinrel->joininfo = NIL; joinrel->has_eclass_joins = false; + joinrel->has_scan_setlimits = false; joinrel->consider_partitionwise_join = false; /* might get changed later */ joinrel->top_parent_relids = NULL; joinrel->part_scheme = NULL; diff --git src/test/regress/expected/tidrangescan.out src/test/regress/expected/tidrangescan.out new file mode 100644 index 0000000000..fc11894c8e --- /dev/null +++ src/test/regress/expected/tidrangescan.out @@ -0,0 +1,245 @@ +-- tests for tidrangescans +SET enable_seqscan TO off; +CREATE TABLE tidrangescan(id integer, data text); +-- insert enough tuples to fill at least two pages +INSERT INTO tidrangescan SELECT i,repeat('x', 100) FROM generate_series(1,200) AS s(i); +-- remove all tuples after the 10th tuple on each page. Trying to ensure +-- we get the same layout with all CPU architectures and smaller than standard +-- page sizes. 
+DELETE FROM tidrangescan +WHERE substring(ctid::text from ',(\d+)\)')::integer > 10 OR substring(ctid::text from '\((\d+),')::integer > 2; +VACUUM tidrangescan; +-- range scans with upper bound +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid < '(1,0)'; + QUERY PLAN +----------------------------------- + Tid Range Scan on tidrangescan + TID Cond: (ctid < '(1,0)'::tid) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE ctid < '(1,0)'; + ctid +-------- + (0,1) + (0,2) + (0,3) + (0,4) + (0,5) + (0,6) + (0,7) + (0,8) + (0,9) + (0,10) +(10 rows) + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid <= '(1,5)'; + QUERY PLAN +------------------------------------ + Tid Range Scan on tidrangescan + TID Cond: (ctid <= '(1,5)'::tid) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE ctid <= '(1,5)'; + ctid +-------- + (0,1) + (0,2) + (0,3) + (0,4) + (0,5) + (0,6) + (0,7) + (0,8) + (0,9) + (0,10) + (1,1) + (1,2) + (1,3) + (1,4) + (1,5) +(15 rows) + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid < '(0,0)'; + QUERY PLAN +----------------------------------- + Tid Range Scan on tidrangescan + TID Cond: (ctid < '(0,0)'::tid) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE ctid < '(0,0)'; + ctid +------ +(0 rows) + +-- range scans with lower bound +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid > '(2,8)'; + QUERY PLAN +----------------------------------- + Tid Range Scan on tidrangescan + TID Cond: (ctid > '(2,8)'::tid) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE ctid > '(2,8)'; + ctid +-------- + (2,9) + (2,10) +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE '(2,8)' < ctid; + QUERY PLAN +----------------------------------- + Tid Range Scan on tidrangescan + TID Cond: ('(2,8)'::tid < ctid) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE '(2,8)' < ctid; + ctid +-------- + (2,9) + (2,10) +(2 rows) + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid >= '(2,8)'; + QUERY PLAN 
+------------------------------------ + Tid Range Scan on tidrangescan + TID Cond: (ctid >= '(2,8)'::tid) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE ctid >= '(2,8)'; + ctid +-------- + (2,8) + (2,9) + (2,10) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid >= '(100,0)'; + QUERY PLAN +-------------------------------------- + Tid Range Scan on tidrangescan + TID Cond: (ctid >= '(100,0)'::tid) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE ctid >= '(100,0)'; + ctid +------ +(0 rows) + +-- range scans with both bounds +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid > '(1,4)' AND '(1,7)' >= ctid; + QUERY PLAN +---------------------------------------------------------------- + Tid Range Scan on tidrangescan + TID Cond: ((ctid > '(1,4)'::tid) AND ('(1,7)'::tid >= ctid)) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE ctid > '(1,4)' AND '(1,7)' >= ctid; + ctid +------- + (1,5) + (1,6) + (1,7) +(3 rows) + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE '(1,7)' >= ctid AND ctid > '(1,4)'; + QUERY PLAN +---------------------------------------------------------------- + Tid Range Scan on tidrangescan + TID Cond: (('(1,7)'::tid >= ctid) AND (ctid > '(1,4)'::tid)) +(2 rows) + +SELECT ctid FROM tidrangescan WHERE '(1,7)' >= ctid AND ctid > '(1,4)'; + ctid +------- + (1,5) + (1,6) + (1,7) +(3 rows) + +-- extreme offsets +SELECT ctid FROM tidrangescan where ctid > '(0,65535)' AND ctid < '(1,0)' LIMIT 1; + ctid +------ +(0 rows) + +SELECT ctid FROM tidrangescan where ctid < '(0,0)' LIMIT 1; + ctid +------ +(0 rows) + +-- empty table +CREATE TABLE tidrangescan_empty(id integer, data text); +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan_empty WHERE ctid < '(1, 0)'; + QUERY PLAN +-------------------------------------- + Tid Range Scan on tidrangescan_empty + TID Cond: (ctid < '(1,0)'::tid) +(2 rows) + +SELECT ctid FROM tidrangescan_empty WHERE ctid < '(1, 0)'; + ctid +------ +(0 rows) + +EXPLAIN (COSTS OFF) +SELECT ctid 
FROM tidrangescan_empty WHERE ctid > '(9, 0)'; + QUERY PLAN +-------------------------------------- + Tid Range Scan on tidrangescan_empty + TID Cond: (ctid > '(9,0)'::tid) +(2 rows) + +SELECT ctid FROM tidrangescan_empty WHERE ctid > '(9, 0)'; + ctid +------ +(0 rows) + +-- cursors +BEGIN; +DECLARE c SCROLL CURSOR FOR SELECT ctid FROM tidrangescan WHERE ctid < '(1,0)'; +FETCH NEXT c; + ctid +------- + (0,1) +(1 row) + +FETCH NEXT c; + ctid +------- + (0,2) +(1 row) + +FETCH PRIOR c; + ctid +------- + (0,1) +(1 row) + +FETCH FIRST c; + ctid +------- + (0,1) +(1 row) + +FETCH LAST c; + ctid +-------- + (0,10) +(1 row) + +COMMIT; +DROP TABLE tidrangescan; +DROP TABLE tidrangescan_empty; +RESET enable_seqscan; diff --git src/test/regress/parallel_schedule src/test/regress/parallel_schedule index e0e1ef71dd..2b9763a869 100644 --- src/test/regress/parallel_schedule +++ src/test/regress/parallel_schedule @@ -80,7 +80,7 @@ test: brin gin gist spgist privileges init_privs security_label collate matview # ---------- # Another group of parallel tests # ---------- -test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tid tidscan collate.icu.utf8 incremental_sort +test: create_table_like alter_generic alter_operator misc async dbsize misc_functions sysviews tsrf tid tidscan tidrangescan collate.icu.utf8 incremental_sort # rules cannot run concurrently with any test that creates # a view or rule in the public schema diff --git src/test/regress/sql/tidrangescan.sql src/test/regress/sql/tidrangescan.sql new file mode 100644 index 0000000000..d60439d56c --- /dev/null +++ src/test/regress/sql/tidrangescan.sql @@ -0,0 +1,83 @@ +-- tests for tidrangescans + +SET enable_seqscan TO off; +CREATE TABLE tidrangescan(id integer, data text); + +-- insert enough tuples to fill at least two pages +INSERT INTO tidrangescan SELECT i,repeat('x', 100) FROM generate_series(1,200) AS s(i); + +-- remove all tuples after the 10th tuple on each page. 
Trying to ensure +-- we get the same layout with all CPU architectures and smaller than standard +-- page sizes. +DELETE FROM tidrangescan +WHERE substring(ctid::text from ',(\d+)\)')::integer > 10 OR substring(ctid::text from '\((\d+),')::integer > 2; +VACUUM tidrangescan; + +-- range scans with upper bound +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid < '(1,0)'; +SELECT ctid FROM tidrangescan WHERE ctid < '(1,0)'; + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid <= '(1,5)'; +SELECT ctid FROM tidrangescan WHERE ctid <= '(1,5)'; + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid < '(0,0)'; +SELECT ctid FROM tidrangescan WHERE ctid < '(0,0)'; + +-- range scans with lower bound +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid > '(2,8)'; +SELECT ctid FROM tidrangescan WHERE ctid > '(2,8)'; + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE '(2,8)' < ctid; +SELECT ctid FROM tidrangescan WHERE '(2,8)' < ctid; + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid >= '(2,8)'; +SELECT ctid FROM tidrangescan WHERE ctid >= '(2,8)'; + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid >= '(100,0)'; +SELECT ctid FROM tidrangescan WHERE ctid >= '(100,0)'; + +-- range scans with both bounds +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE ctid > '(1,4)' AND '(1,7)' >= ctid; +SELECT ctid FROM tidrangescan WHERE ctid > '(1,4)' AND '(1,7)' >= ctid; + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan WHERE '(1,7)' >= ctid AND ctid > '(1,4)'; +SELECT ctid FROM tidrangescan WHERE '(1,7)' >= ctid AND ctid > '(1,4)'; + +-- extreme offsets +SELECT ctid FROM tidrangescan where ctid > '(0,65535)' AND ctid < '(1,0)' LIMIT 1; +SELECT ctid FROM tidrangescan where ctid < '(0,0)' LIMIT 1; + +-- empty table +CREATE TABLE tidrangescan_empty(id integer, data text); + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan_empty WHERE ctid < '(1, 0)'; +SELECT ctid FROM tidrangescan_empty WHERE ctid < 
'(1, 0)'; + +EXPLAIN (COSTS OFF) +SELECT ctid FROM tidrangescan_empty WHERE ctid > '(9, 0)'; +SELECT ctid FROM tidrangescan_empty WHERE ctid > '(9, 0)'; + +-- cursors +BEGIN; +DECLARE c SCROLL CURSOR FOR SELECT ctid FROM tidrangescan WHERE ctid < '(1,0)'; +FETCH NEXT c; +FETCH NEXT c; +FETCH PRIOR c; +FETCH FIRST c; +FETCH LAST c; +COMMIT; + +DROP TABLE tidrangescan; +DROP TABLE tidrangescan_empty; + +RESET enable_seqscan; diff --git src/tools/pgindent/typedefs.list src/tools/pgindent/typedefs.list index 9cd047ba25..f12d60debf 100644 --- src/tools/pgindent/typedefs.list +++ src/tools/pgindent/typedefs.list @@ -2526,8 +2526,13 @@ TextPositionState TheLexeme TheSubstitute TidExpr +TidExprType TidHashKey +TidOpExpr TidPath +TidRangePath +TidRangeScan +TidRangeScanState TidScan TidScanState TimeADT --------------2.29.2--