From c40b4b59df27e24c2238f422d5c6fde6fe1a91d3 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 22 Mar 2019 17:25:01 +0300 Subject: [PATCH v13] Remove unique self joins. --- src/backend/nodes/outfuncs.c | 14 +- src/backend/optimizer/path/indxpath.c | 46 +- src/backend/optimizer/path/joinpath.c | 6 +- src/backend/optimizer/plan/analyzejoins.c | 998 ++++++++++++++++++++++++-- src/backend/optimizer/plan/planmain.c | 7 +- src/backend/optimizer/util/pathnode.c | 3 +- src/backend/optimizer/util/plancat.c | 7 +- src/backend/optimizer/util/relnode.c | 26 +- src/backend/utils/cache/relcache.c | 4 +- src/include/nodes/nodes.h | 1 + src/include/nodes/pathnodes.h | 25 +- src/include/optimizer/pathnode.h | 5 + src/include/optimizer/paths.h | 4 +- src/include/optimizer/planmain.h | 10 +- src/test/regress/expected/equivclass.out | 32 + src/test/regress/expected/insert_conflict.out | 2 +- src/test/regress/expected/join.out | 255 ++++++- src/test/regress/expected/stats_ext.out | 23 +- src/test/regress/sql/equivclass.sql | 17 + src/test/regress/sql/join.sql | 108 +++ 20 files changed, 1478 insertions(+), 115 deletions(-) diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 910a738..ba03fdf 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2270,7 +2270,8 @@ _outRelOptInfo(StringInfo str, const RelOptInfo *node) WRITE_OID_FIELD(userid); WRITE_BOOL_FIELD(useridiscurrent); /* we don't try to print fdwroutine or fdw_private */ - /* can't print unique_for_rels/non_unique_for_rels; BMSes aren't Nodes */ + WRITE_NODE_FIELD(unique_for_rels); + /* can't print non_unique_for_rels; BMSes aren't Nodes */ WRITE_NODE_FIELD(baserestrictinfo); WRITE_UINT_FIELD(baserestrict_min_security); WRITE_NODE_FIELD(joininfo); @@ -2343,6 +2344,14 @@ _outStatisticExtInfo(StringInfo str, const StatisticExtInfo *node) } static void +_outUniqueRelInfo(StringInfo str, const UniqueRelInfo *node) +{ + WRITE_NODE_TYPE("UNIQUERELINFO"); + + WRITE_BITMAPSET_FIELD(outerrelids); +} + +static void _outEquivalenceClass(StringInfo str, const EquivalenceClass *node) { /* @@ -4095,6 +4104,9 @@ outNode(StringInfo str, const void *obj) case T_StatisticExtInfo: _outStatisticExtInfo(str, obj); break; + case T_UniqueRelInfo: + _outUniqueRelInfo(str, obj); + break; case T_ExtensibleNode: _outExtensibleNode(str, obj); break; diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c index 3434219..096e786 100644 --- a/src/backend/optimizer/path/indxpath.c +++ b/src/backend/optimizer/path/indxpath.c @@ -3583,7 +3583,8 @@ ec_member_matches_indexcol(PlannerInfo *root, RelOptInfo *rel, * relation_has_unique_index_for * Determine whether the relation provably has at most one row satisfying * a set of equality conditions, because the conditions constrain all - * columns of some unique index. + * columns of some unique index. If index_info is not null, it is set to + * point to a new UniqueRelInfo containing the index and conditions. * * The conditions can be represented in either or both of two ways: * 1. A list of RestrictInfo nodes, where the caller has already determined @@ -3604,12 +3605,24 @@ ec_member_matches_indexcol(PlannerInfo *root, RelOptInfo *rel, bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, List *restrictlist, - List *exprlist, List *oprlist) + List *exprlist, List *oprlist, + UniqueRelInfo **unique_info) { ListCell *ic; + /* + * If we are asked to return the information about unique index, do not + * stop after finding one index that guarantees uniqueness, and try to find + * several of them. This info is useful for self-join removal. To save + * time, don't try to find all of them, and stop after we have found some. + */ + int indexesNeeded = unique_info ? URI_MAX_INDEXES : 1; + int indexesFound = 0; Assert(list_length(exprlist) == list_length(oprlist)); + if (unique_info) + *unique_info = NULL; + /* Short-circuit if no indexes... */ if (rel->indexlist == NIL) return false; @@ -3660,6 +3673,7 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, { IndexOptInfo *ind = (IndexOptInfo *) lfirst(ic); int c; + List *column_values = NIL; /* * If the index is not unique, or not immediately enforced, or if it's @@ -3708,6 +3722,9 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, if (match_index_to_operand(rexpr, c, ind)) { matched = true; /* column is unique */ + column_values = lappend(column_values, rinfo->outer_is_left + ? get_leftop(rinfo->clause) + : get_rightop(rinfo->clause)); break; } } @@ -3750,10 +3767,31 @@ relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, /* Matched all key columns of this index? */ if (c == ind->nkeycolumns) - return true; + { + indexesFound++; + + if (unique_info) + { + /* This may be called in GEQO memory context. */ + MemoryContext oldContext = MemoryContextSwitchTo(root->planner_cxt); + if (!*unique_info) + *unique_info = makeNode(UniqueRelInfo); + (*unique_info)->n_indexes = indexesFound; + (*unique_info)->unique_indexes[indexesFound - 1] = ind->indexoid; + (*unique_info)->column_values[indexesFound - 1] + = list_copy(column_values); + MemoryContextSwitchTo(oldContext); + } + } + + if (column_values) + list_free(column_values); + + if (indexesFound >= indexesNeeded) + break; } - return false; + return indexesFound > 0; } /* diff --git a/src/backend/optimizer/path/joinpath.c b/src/backend/optimizer/path/joinpath.c index d8ff4bf..89cd236 100644 --- a/src/backend/optimizer/path/joinpath.c +++ b/src/backend/optimizer/path/joinpath.c @@ -176,7 +176,8 @@ add_paths_to_joinrel(PlannerInfo *root, innerrel, JOIN_INNER, restrictlist, - false); + false, + NULL /*index_info*/); break; default: extra.inner_unique = innerrel_is_unique(root, @@ -185,7 +186,8 @@ add_paths_to_joinrel(PlannerInfo *root, innerrel, jointype, restrictlist, - false); + false, + NULL /*index_info*/); break; } diff --git a/src/backend/optimizer/plan/analyzejoins.c b/src/backend/optimizer/plan/analyzejoins.c index a4efa69..54220e9 100644 --- a/src/backend/optimizer/plan/analyzejoins.c +++ b/src/backend/optimizer/plan/analyzejoins.c @@ -22,6 +22,7 @@ */ #include "postgres.h" +#include "catalog/pg_class.h" #include "nodes/nodeFuncs.h" #include "optimizer/clauses.h" #include "optimizer/joininfo.h" @@ -39,14 +40,15 @@ static void remove_rel_from_query(PlannerInfo *root, int relid, static List *remove_rel_from_joinlist(List *joinlist, int relid, int *nremoved); static bool rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel); static bool rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, - List *clause_list); + List *clause_list, UniqueRelInfo **unique_info); static Oid distinct_col_search(int colno, List *colnos, List *opids); static bool is_innerrel_unique_for(PlannerInfo *root, Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel, JoinType jointype, - List *restrictlist); + List *restrictlist, + UniqueRelInfo **unique_info); /* @@ -58,7 +60,7 @@ static bool is_innerrel_unique_for(PlannerInfo *root, * data structures that have to be updated are accessible via "root". */ List * -remove_useless_joins(PlannerInfo *root, List *joinlist) +remove_useless_left_joins(PlannerInfo *root, List *joinlist) { ListCell *lc; @@ -162,7 +164,6 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) int innerrelid; RelOptInfo *innerrel; Relids joinrelids; - List *clause_list = NIL; ListCell *l; int attroff; @@ -238,67 +239,24 @@ join_is_removable(PlannerInfo *root, SpecialJoinInfo *sjinfo) } /* - * Search for mergejoinable clauses that constrain the inner rel against - * either the outer rel or a pseudoconstant. If an operator is - * mergejoinable then it behaves like equality for some btree opclass, so - * it's what we want. The mergejoinability test also eliminates clauses - * containing volatile functions, which we couldn't depend on. + * Check for pushed-down clauses referencing the inner rel. If there is + * such a clause then join removal has to be disallowed. We have to + * check this despite the previous attr_needed checks because of the + * possibility of pushed-down clauses referencing the rel. */ foreach(l, innerrel->joininfo) { RestrictInfo *restrictinfo = (RestrictInfo *) lfirst(l); - - /* - * If it's not a join clause for this outer join, we can't use it. - * Note that if the clause is pushed-down, then it is logically from - * above the outer join, even if it references no other rels (it might - * be from WHERE, for example). - */ - if (RINFO_IS_PUSHED_DOWN(restrictinfo, joinrelids)) - { - /* - * If such a clause actually references the inner rel then join - * removal has to be disallowed. We have to check this despite - * the previous attr_needed checks because of the possibility of - * pushed-down clauses referencing the rel. - */ - if (bms_is_member(innerrelid, restrictinfo->clause_relids)) + if (RINFO_IS_PUSHED_DOWN(restrictinfo, joinrelids) + && bms_is_member(innerrel->relid, restrictinfo->clause_relids)) return false; - continue; /* else, ignore; not useful here */ - } - - /* Ignore if it's not a mergejoinable clause */ - if (!restrictinfo->can_join || - restrictinfo->mergeopfamilies == NIL) - continue; /* not mergejoinable */ - - /* - * Check if clause has the form "outer op inner" or "inner op outer", - * and if so mark which side is inner. - */ - if (!clause_sides_match_join(restrictinfo, sjinfo->min_lefthand, - innerrel->relids)) - continue; /* no good for these input relations */ - - /* OK, add to list */ - clause_list = lappend(clause_list, restrictinfo); } - /* - * Now that we have the relevant equality join clauses, try to prove the - * innerrel distinct. - */ - if (rel_is_distinct_for(root, innerrel, clause_list)) - return true; - - /* - * Some day it would be nice to check for other methods of establishing - * distinctness. - */ - return false; + return is_innerrel_unique_for(root, joinrelids, sjinfo->min_lefthand, + innerrel, sjinfo->jointype, innerrel->joininfo, + NULL /*unique_index*/); } - /* * Remove the target relid from the planner's data structures, having * determined that there is no need to include it in the query. @@ -568,7 +526,7 @@ reduce_unique_semijoins(PlannerInfo *root) /* Test whether the innerrel is unique for those clauses. */ if (!innerrel_is_unique(root, joinrelids, sjinfo->min_lefthand, innerrel, - JOIN_SEMI, restrictlist, true)) + JOIN_SEMI, restrictlist, true, NULL /*index_info*/)) continue; /* OK, remove the SpecialJoinInfo from the list. */ @@ -643,10 +601,17 @@ rel_supports_distinctness(PlannerInfo *root, RelOptInfo *rel) * Note that the passed-in clause_list may be destructively modified! This * is OK for current uses, because the clause_list is built by the caller for * the sole purpose of passing to this function. + * + * If unique_index is not null, it is set to point to the index that guarantees + * uniqueness for a base relation. */ static bool -rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list) +rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list, + UniqueRelInfo **unique_info) { + if (unique_info) + *unique_info = NULL; + /* * We could skip a couple of tests here if we assume all callers checked * rel_supports_distinctness first, but it doesn't seem worth taking any @@ -661,8 +626,8 @@ rel_is_distinct_for(PlannerInfo *root, RelOptInfo *rel, List *clause_list) * relation_has_unique_index_for automatically adds any usable * restriction clauses for the rel, so we needn't do that here. */ - if (relation_has_unique_index_for(root, rel, clause_list, NIL, NIL)) - return true; + return relation_has_unique_index_for(root, rel, clause_list, NIL, NIL, + unique_info); } else if (rel->rtekind == RTE_SUBQUERY) { @@ -966,6 +931,10 @@ distinct_col_search(int colno, List *colnos, List *opids) * heuristic about whether to cache negative answers; it should be "true" * if making an inquiry that is not part of the normal bottom-up join search * sequence. + * + * If index_info_out is not null, it is set to point to a new UniqueRelInfo + * allocated in root memory context, that describes the index that guarantees + * uniqueness. */ bool innerrel_is_unique(PlannerInfo *root, @@ -974,12 +943,23 @@ innerrel_is_unique(PlannerInfo *root, RelOptInfo *innerrel, JoinType jointype, List *restrictlist, - bool force_cache) + bool force_cache, + UniqueRelInfo **unique_info_out) { MemoryContext old_context; ListCell *lc; + UniqueRelInfo *unique_info; - /* Certainly can't prove uniqueness when there are no joinclauses */ + if (unique_info_out) + *unique_info_out = NULL; + + /* + * It is possible to prove uniqueness even in the absence of joinclauses, + * just from baserestrictinfos alone. However, in these cases the inner + * relation returns one row at most, so join removal won't give much + * benefit. It seems better to save some planning time by ignoring these + * cases. + */ if (restrictlist == NIL) return false; @@ -999,10 +979,14 @@ innerrel_is_unique(PlannerInfo *root, */ foreach(lc, innerrel->unique_for_rels) { - Relids unique_for_rels = (Relids) lfirst(lc); + unique_info = (UniqueRelInfo *) lfirst(lc); - if (bms_is_subset(unique_for_rels, outerrelids)) + if (bms_is_subset(unique_info->outerrelids, outerrelids)) + { + if (unique_info_out) + *unique_info_out = unique_info; return true; /* Success! */ + } } /* @@ -1019,7 +1003,7 @@ innerrel_is_unique(PlannerInfo *root, /* No cached information, so try to make the proof. */ if (is_innerrel_unique_for(root, joinrelids, outerrelids, innerrel, - jointype, restrictlist)) + jointype, restrictlist, &unique_info)) { /* * Cache the positive result for future probes, being sure to keep it @@ -1032,10 +1016,15 @@ innerrel_is_unique(PlannerInfo *root, * supersets of them anyway. */ old_context = MemoryContextSwitchTo(root->planner_cxt); - innerrel->unique_for_rels = lappend(innerrel->unique_for_rels, - bms_copy(outerrelids)); + if (!unique_info) + unique_info = makeNode(UniqueRelInfo); + unique_info->outerrelids = bms_copy(outerrelids); + innerrel->unique_for_rels = lappend(innerrel->unique_for_rels, unique_info); MemoryContextSwitchTo(old_context); + if (unique_info_out) + *unique_info_out = unique_info; + return true; /* Success! */ } else @@ -1081,11 +1070,15 @@ is_innerrel_unique_for(PlannerInfo *root, Relids outerrelids, RelOptInfo *innerrel, JoinType jointype, - List *restrictlist) + List *restrictlist, + UniqueRelInfo **unique_info) { List *clause_list = NIL; ListCell *lc; + if (unique_info) + *unique_info = NULL; + /* * Search for mergejoinable clauses that constrain the inner rel against * the outer rel. If an operator is mergejoinable then it behaves like @@ -1123,5 +1116,872 @@ is_innerrel_unique_for(PlannerInfo *root, } /* Let rel_is_distinct_for() do the hard work */ - return rel_is_distinct_for(root, innerrel, clause_list); + return rel_is_distinct_for(root, innerrel, clause_list, unique_info); +} + +typedef struct +{ + Index oldRelid; + Index newRelid; +} ChangeVarnoContext; + +static bool +change_varno_walker(Node *node, ChangeVarnoContext *context) +{ + if (node == NULL) + return false; + + if (IsA(node, Var) && ((Var *) node)->varno == context->oldRelid) + { + ((Var *) node)->varno = context->newRelid; + ((Var *) node)->varnoold = context->newRelid; + return false; + } + + return expression_tree_walker(node, change_varno_walker, context); +} + +/* + * For all Vars in the expression that have varno = oldRelid, set + * varno = newRelid. + */ +static void +change_varno(Expr *expr, Index oldRelid, Index newRelid) +{ + ChangeVarnoContext context; + + context.oldRelid = oldRelid; + context.newRelid = newRelid; + change_varno_walker((Node *) expr, &context); +} + +/* + * Substitute newId for oldId in relids. + */ +static void +change_relid(Relids *relids, Index oldId, Index newId) +{ + if (bms_is_member(oldId, *relids)) + *relids = bms_add_member(bms_del_member(*relids, oldId), newId); +} + +/* + * Update EC members to point to the remaining relation instead of the removed + * one, removing duplicates. + */ +static void +update_ec_members(EquivalenceClass *ec, Index toRemove, Index toKeep) +{ + ListCell *prev = NULL; + ListCell *cell; + ListCell *next; + + for (cell = list_head(ec->ec_members); cell; cell = next) + { + ListCell *otherCell; + EquivalenceMember *em = lfirst(cell); + + next = lnext(cell); + + if (!bms_is_member(toRemove, em->em_relids)) + { + prev = cell; + continue; + } + + change_relid(&em->em_relids, toRemove, toKeep); + /* We only process inner joins */ + Assert(em->em_nullable_relids == NULL); + change_varno(em->em_expr, toRemove, toKeep); + + /* + * After we switched the equivalence member to the remaining relation, + * check that it is not the same as the existing member, and if it + * is, delete it. + */ + foreach (otherCell, ec->ec_members) + { + EquivalenceMember *other; + + if (otherCell == cell) + continue; + + other = castNode(EquivalenceMember, lfirst(otherCell)); + + if (equal(other->em_expr, em->em_expr)) + break; + } + + if (otherCell) + ec->ec_members = list_delete_cell(ec->ec_members, cell, prev); + else + prev = cell; + } +} + +/* + * Update EC sources to point to the remaining relation instead of the + * removed one. + */ +static void +update_ec_sources(List **sources, Index toRemove, Index toKeep) +{ + ListCell *prev = NULL; + ListCell *cell; + ListCell *next; + + for (cell = list_head(*sources); cell; cell = next) + { + ListCell *otherCell; + RestrictInfo *rinfo = castNode(RestrictInfo, lfirst(cell)); + + next = lnext(cell); + + if (!bms_is_member(toRemove, rinfo->required_relids)) + { + prev = cell; + continue; + } + + change_varno(rinfo->clause, toRemove, toKeep); + + /* + * After switching the clause to the remaining relation, check it for + * redundancy with existing ones. We don't have to check for + * redundancy with derived clauses, because we've just deleted them. + */ + foreach (otherCell, *sources) + { + RestrictInfo *other; + + if (otherCell == cell) + continue; + + other = castNode(RestrictInfo, lfirst(otherCell)); + if (equal(rinfo->clause, other->clause)) + break; + } + + if (otherCell) + *sources = list_delete_cell(*sources, cell, prev); + else + { + prev = cell; + + /* We will keep this RestrictInfo, correct its relids. */ + change_relid(&rinfo->required_relids, toRemove, toKeep); + change_relid(&rinfo->left_relids, toRemove, toKeep); + change_relid(&rinfo->right_relids, toRemove, toKeep); + change_relid(&rinfo->clause_relids, toRemove, toKeep); + } + } +} + +/* + * Scratch space for the unique self join removal code. + */ +typedef struct +{ + PlannerInfo *root; + + /* Temporary array for relation ids. */ + Index *relids; + + /* + * Array of Relids, one for each relation, indexed by relation id. Each + * element is a set of relation ids with which this relation has a special + * join. + */ + Relids *special_join_rels; + + /* Array of row marks indexed by relid. */ + PlanRowMark **row_marks; + + /* Bitmapset for join relids that is used to avoid reallocation. */ + Relids joinrelids; + + /* + * Top-level targetlist of the query. We have to update any references + * it has to the relations we remove. + */ + List *targetlist; +} UsjScratch; + +/* + * Remove a relation after we have proven that it participates only in an + * unneeded unique self join. + * + * The joinclauses list is destructively changed. + */ +static void +remove_self_join_rel(UsjScratch *scratch, Relids joinrelids, List *joinclauses, + RelOptInfo *toKeep, RelOptInfo *toRemove) +{ + PlannerInfo *root = scratch->root; + ListCell *cell; + List *toAppend; + + /* + * Transfer join and restriction clauses from the removed relation to the + * remaining one. We change the Vars of the clause to point to the + * remaining relation instead of the removed one. The clauses that require + * a subset of joinrelids become restriction clauses of the remaining + * relation, and others remain join clauses. We append them to + * baserestrictinfo and joininfo respectively, trying not to introduce + * duplicates. + * + * We also have to process the 'joinclauses' list here, because it + * contains EC-derived join clauses which must become filter clauses. It + * is not enough to just correct the ECs, because the EC-derived + * restrictions are generated before join removal (see + * generate_base_implied_equalities). + */ + toAppend = list_concat(joinclauses, toRemove->baserestrictinfo); + toAppend = list_concat(toAppend, toRemove->joininfo); + + foreach(cell, toAppend) + { + RestrictInfo *rinfo = lfirst_node(RestrictInfo, cell); + bool is_join_clause = !bms_is_subset(rinfo->required_relids, joinrelids); + List **target = is_join_clause ? &toKeep->joininfo : &toKeep->baserestrictinfo; + ListCell *otherCell; + + /* + * We can't have an EC-derived clause that joins to some third + * relation + */ + Assert(!(is_join_clause && rinfo->parent_ec != NULL)); + + /* + * Replace the references to the removed relation with references to + * the remaining one. We won't necessarily add this clause, because it + * may be already present in the joininfo or baserestrictinfo. Still, + * we have to switch it to point to the remaining relation. This is + * important for join clauses that reference both relations, because + * they are included in both joininfos. + */ + change_varno(rinfo->clause, toRemove->relid, toKeep->relid); + change_relid(&rinfo->required_relids, toRemove->relid, toKeep->relid); + change_relid(&rinfo->left_relids, toRemove->relid, toKeep->relid); + change_relid(&rinfo->right_relids, toRemove->relid, toKeep->relid); + change_relid(&rinfo->clause_relids, toRemove->relid, toKeep->relid); + + /* + * Don't add the clause if it is already present in the list, or + * derived from the same equivalence class, or is the same as another + * clause. + */ + foreach (otherCell, *target) + { + RestrictInfo *other = lfirst(otherCell); + + if (other == rinfo + || (rinfo->parent_ec != NULL + && other->parent_ec == rinfo->parent_ec) + || equal(rinfo->clause, other->clause)) + { + break; + } + } + if (otherCell != NULL) + continue; + + /* + * If the clause has the form of "X = X", replace it with null test. + */ + if (rinfo->mergeopfamilies) + { + Expr *leftOp = (Expr *) get_leftop(rinfo->clause); + Expr *rightOp = (Expr *) get_rightop(rinfo->clause); + + if (leftOp != NULL && equal(leftOp, rightOp)) + { + NullTest *test = makeNode(NullTest); + test->arg = leftOp; + test->nulltesttype = IS_NOT_NULL; + test->argisrow = false; + test->location = -1; + rinfo->clause = (Expr *) test; + } + } + + *target = lappend(*target, rinfo); + } + + /* + * Transfer the targetlist and attr_needed flags. + */ + Assert(toRemove->reltarget->sortgrouprefs == 0); + + foreach (cell, toRemove->reltarget->exprs) + { + Expr *node = lfirst(cell); + change_varno(node, toRemove->relid, toKeep->relid); + if (!list_member(toKeep->reltarget->exprs, node)) + toKeep->reltarget->exprs = lappend(toKeep->reltarget->exprs, + node); + } + + for (int i = toKeep->min_attr; i <= toKeep->max_attr; i++) + { + int attno = i - toKeep->min_attr; + toKeep->attr_needed[attno] = bms_add_members(toKeep->attr_needed[attno], + toRemove->attr_needed[attno]); + } + + /* + * If the removed relation has a row mark, transfer it to the remaining + * one. + * + * If both rels have row marks, just keep the one corresponding to the + * remaining relation, because we verified earlier that they have the same + * strength. + * + * Also make sure that the scratch->row_marks cache is up to date, because + * we are going to use it for further join removals. + */ + if (scratch->row_marks[toRemove->relid]) + { + PlanRowMark **markToRemove = &scratch->row_marks[toRemove->relid]; + PlanRowMark **markToKeep = &scratch->row_marks[toKeep->relid]; + + if (*markToKeep) + { + Assert((*markToKeep)->markType == (*markToRemove)->markType); + + root->rowMarks = list_delete_ptr(root->rowMarks, *markToKeep); + *markToKeep = NULL; + } + else + { + *markToKeep = *markToRemove; + *markToRemove = NULL; + + /* Shouldn't have inheritance children here. */ + Assert((*markToKeep)->rti == (*markToKeep)->prti); + + (*markToKeep)->rti = toKeep->relid; + (*markToKeep)->prti = toKeep->relid; + } + } + + /* + * Likewise replace references in SpecialJoinInfo data structures. + * + * This is relevant in case the join we're deleting is nested inside some + * special joins: the upper joins' relid sets have to be adjusted. + */ + foreach (cell, root->join_info_list) + { + SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) lfirst(cell); + + change_relid(&sjinfo->min_lefthand, toRemove->relid, toKeep->relid); + change_relid(&sjinfo->min_righthand, toRemove->relid, toKeep->relid); + change_relid(&sjinfo->syn_lefthand, toRemove->relid, toKeep->relid); + change_relid(&sjinfo->syn_righthand, toRemove->relid, toKeep->relid); + } + + /* + * Likewise update references in PlaceHolderVar data structures. + */ + foreach(cell, root->placeholder_list) + { + PlaceHolderInfo *phinfo = (PlaceHolderInfo *) lfirst(cell); + + /* + * We are at an inner join of two base relations. A placeholder can't + * be needed here or evaluated here. + */ + Assert(!bms_is_subset(phinfo->ph_eval_at, joinrelids)); + Assert(!bms_is_subset(phinfo->ph_needed, joinrelids)); + + change_relid(&phinfo->ph_eval_at, toRemove->relid, toKeep->relid); + change_relid(&phinfo->ph_needed, toRemove->relid, toKeep->relid); + change_relid(&phinfo->ph_lateral, toRemove->relid, toKeep->relid); + change_relid(&phinfo->ph_var->phrels, toRemove->relid, toKeep->relid); + } + + /* + * Update the equivalence classes that reference the removed relations. + */ + foreach(cell, root->eq_classes) + { + EquivalenceClass *ec = lfirst(cell); + + if (!bms_is_member(toRemove->relid, ec->ec_relids)) + { + /* + * This EC doesn't reference the removed relation, nothing to be + * done for it. + */ + continue; + } + + /* + * Update the EC members to reference the remaining relation instead + * of the removed one. + */ + update_ec_members(ec, toRemove->relid, toKeep->relid); + change_relid(&ec->ec_relids, toRemove->relid, toKeep->relid); + + /* + * We will now update source and derived clauses of the EC. + * + * Restriction clauses for base relations are already distributed to + * the respective baserestrictinfo lists (see + * generate_implied_equalities). The above code has already processed + * this list, and updated these clauses to reference the remaining + * relation, so we can skip them here based on their relids. + * + * Likewise, we have already processed the join clauses that join the + * removed relation to the remaining one. + * + * Finally, there are join clauses that join the removed relation to + * some third relation. We can't just delete the source clauses and + * regenerate them from the EC, because the corresponding equality + * operators might be missing (see the handling of ec_broken). + * Therefore, we will update the references in the source clauses. + * + * Derived clauses can be generated again, so it is simpler to just + * delete them. + */ + list_free(ec->ec_derives); + ec->ec_derives = NULL; + update_ec_sources(&ec->ec_sources, toRemove->relid, toKeep->relid); + } + + /* + * Mark the rel as "dead" to show it is no longer part of the join tree. + * (Removing it from the baserel array altogether seems too risky.) + */ + toRemove->reloptkind = RELOPT_DEADREL; + + /* + * Update references to the removed relation from other baserels. + */ + for (int i = 1; i < root->simple_rel_array_size; i++) + { + RelOptInfo *otherrel = root->simple_rel_array[i]; + int attroff; + + /* no point in processing target rel itself */ + if (i == toRemove->relid) + continue; + + /* there may be empty slots corresponding to non-baserel RTEs */ + if (otherrel == NULL) + continue; + + Assert(otherrel->relid == i); /* sanity check on array */ + + /* Update attr_needed arrays. */ + for (attroff = otherrel->max_attr - otherrel->min_attr; + attroff >= 0; attroff--) + { + change_relid(&otherrel->attr_needed[attroff], toRemove->relid, + toKeep->relid); + } + + /* Update lateral references. */ + foreach (cell, otherrel->lateral_vars) + change_varno(lfirst(cell), toRemove->relid, toKeep->relid); + } +} + +/* + * Test for unique self-join. + * + * We prove that for every pair of joined rows, we actually have the same row + * on inner and outer side, as identified by ctid. This happens when for both + * sides of join we can apply the same unique index with the same index + * conditions. + */ +static bool +is_unique_self_join(PlannerInfo *root, Relids joinrelids, RelOptInfo *outer, + RelOptInfo *inner, List *restrictlist) +{ + UniqueRelInfo *outerInfo = NULL; + UniqueRelInfo *innerInfo = NULL; + int outerPos = 0, innerPos = 0; + + innerrel_is_unique(root, joinrelids, inner->relids, + outer, JOIN_INNER, restrictlist, true, &outerInfo); + if (!outerInfo || !outerInfo->n_indexes) + return false; + + innerrel_is_unique(root, joinrelids, outer->relids, + inner, JOIN_INNER, restrictlist, true, &innerInfo); + if (!innerInfo || !innerInfo->n_indexes) + return false; + + /* Find the same unique index for both relations. */ + for (;;) + { + Oid outerOid, innerOid; + List *outerValues, *innerValues; + bool haveSameValues; + + if (outerPos >= outerInfo->n_indexes) + break; + + if (innerPos >= innerInfo->n_indexes) + break; + + /* + * The indexes are sorted in ascending order by Oid, as guaranteed by + * RelationGetIndexList. + */ + outerOid = outerInfo->unique_indexes[outerPos]; + innerOid = innerInfo->unique_indexes[innerPos]; + + if (outerOid > innerOid) + { + innerPos++; + continue; + } + if (outerOid < innerOid) + { + outerPos++; + continue; + } + + /* + * We have proven that for both relations, the same unique index + * guarantees that there is at most one row where columns equal given + * values. These values must be the same for both relations, or else we + * won't match the same row on each side of join. A value may be either + * Const or Var of some other relation. For the purposes of this proof, + * the Vars of the inner and outer relation are the same, so we replace + * outer varno with inner and compare the column values using equal(). + */ + innerValues = copyObject(innerInfo->column_values[innerPos]); + outerValues = copyObject(outerInfo->column_values[outerPos]); + change_varno((Expr *) innerValues, outer->relid, inner->relid); + change_varno((Expr *) outerValues, outer->relid, inner->relid); + haveSameValues = equal(outerValues, innerValues); + list_free_deep(outerValues); + list_free_deep(innerValues); + + if (haveSameValues) + return true; + + outerPos++; + innerPos++; + } + return false; +} + +/* + * Find and remove unique self joins in a group of base relations that have + * the same Oid. + * + * Returns a set of relids that were removed. + */ +static Relids +remove_self_joins_one_group(UsjScratch *scratch, Index *relids, int n) +{ + PlannerInfo *root = scratch->root; + Relids joinrelids = scratch->joinrelids; + Relids result = NULL; + int i, o; + + if (n < 2) + return NULL; + + for (o = 0; o < n; o++) + { + RelOptInfo *outer = root->simple_rel_array[relids[o]]; + + for (i = o + 1; i < n; i++) + { + RelOptInfo *inner = root->simple_rel_array[relids[i]]; + List *restrictlist; + + /* A sanity check: the relations have the same Oid. */ + Assert(root->simple_rte_array[relids[i]]->relid + == root->simple_rte_array[relids[o]]->relid); + + /* + * This optimization applies to inner joins only, so skip any + * relations that form a special join. + */ + if (bms_is_member(relids[i], scratch->special_join_rels[relids[o]])) + continue; + + /* Reuse joinrelids bitset to avoid reallocation. */ + joinrelids = bms_del_members(joinrelids, joinrelids); + + /* + * We only deal with base rels here, so their relids bitset + * contains only one member -- their relid. + */ + joinrelids = bms_add_member(joinrelids, relids[o]); + joinrelids = bms_add_member(joinrelids, relids[i]); + + /* Is it a unique self join? */ + restrictlist = build_joinrel_restrictlist(root, joinrelids, outer, + inner); + if (!is_unique_self_join(root, joinrelids, outer, inner, + restrictlist)) + continue; + + /* + * We can't remove the join if the relations have row marks of + * different strength (e.g. one is locked FOR UPDATE and another + * just has ROW_MARK_REFERENCE for EvalPlanQual rechecking). + */ + if (scratch->row_marks[relids[i]] && scratch->row_marks[relids[o]] + && scratch->row_marks[relids[i]]->markType + != scratch->row_marks[relids[o]]->markType) + { + continue; + } + + /* + * We can remove either relation, so remove the outer one, to + * simplify this loop. + */ + remove_self_join_rel(scratch, joinrelids, restrictlist, inner, outer); + result = bms_add_member(result, relids[o]); + + /* + * Replace varno in root targetlist and HAVING clause. + */ + change_varno((Expr *) scratch->targetlist, relids[o], relids[i]); + change_varno((Expr *) root->parse->havingQual, relids[o], relids[i]); + + /* We removed the outer relation, try the next one. */ + break; + } + } + + scratch->joinrelids = joinrelids; + return result; +} + +/* + * A qsort comparator to sort the relids by the relation Oid. + */ +static int +compare_rte(const Index *left, const Index *right, PlannerInfo *root) +{ + Oid l = root->simple_rte_array[*left]->relid; + Oid r = root->simple_rte_array[*right]->relid; + + return l < r ? 1 : (l == r ? 0 : -1); +} + +/* + * Find and remove unique self joins on a particular level of the join tree. + * + * We sort the relations by Oid and then examine each group with the same Oid. + * If we removed any relation, remove it from joinlist as well. + */ +static void +remove_self_joins_one_level(UsjScratch *scratch, List **joinlist) +{ + ListCell *prev, *cell, *next; + Relids relidsToRemove = NULL; + Oid groupOid; + int groupStart; + int i; + int n = 0; + Index *relid_ascending = scratch->relids; + PlannerInfo *root = scratch->root; + + /* + * Collect the ids of base relations at this level of the join tree. + */ + foreach (cell, *joinlist) + { + RangeTblEntry *rte; + RelOptInfo *rel; + RangeTblRef *ref = (RangeTblRef *) lfirst(cell); + if (!IsA(ref, RangeTblRef)) + continue; + + rte = root->simple_rte_array[ref->rtindex]; + rel = root->simple_rel_array[ref->rtindex]; + + /* We only care about base relations from which we select something. */ + if (rte->rtekind != RTE_RELATION || rte->relkind != RELKIND_RELATION + || rel == NULL) + { + continue; + } + + /* + * This optimization won't work for tables that have inheritance + * children. + */ + if (rte->inh) + continue; + + relid_ascending[n++] = ref->rtindex; + + /* + * Limit the number of joins we process to control the quadratic + * behavior. + */ + if (n > join_collapse_limit) + break; + } + + if (n < 2) + return; + + /* + * Find and process the groups of relations that have same Oid. + */ + qsort_arg(relid_ascending, n, sizeof(*relid_ascending), + (qsort_arg_comparator) compare_rte, root); + groupOid = root->simple_rte_array[relid_ascending[0]]->relid; + groupStart = 0; + for (i = 1; i < n; i++) + { + RangeTblEntry *rte = root->simple_rte_array[relid_ascending[i]]; + Assert(rte->relid != InvalidOid); + if (rte->relid != groupOid) + { + relidsToRemove = bms_add_members(relidsToRemove, + remove_self_joins_one_group(scratch, &relid_ascending[groupStart], + i - groupStart)); + groupOid = rte->relid; + groupStart = i; + } + } + Assert(groupOid != InvalidOid); + Assert(groupStart < n); + relidsToRemove = bms_add_members(relidsToRemove, + remove_self_joins_one_group(scratch, &relid_ascending[groupStart], + n - groupStart)); + + /* Delete the removed relations from joinlist. */ + prev = NULL; + for (cell = list_head(*joinlist); cell; cell = next) + { + Node *node = lfirst(cell); + + next = lnext(cell); + + if (IsA(node, RangeTblRef) + && bms_is_member(((RangeTblRef *) node)->rtindex, relidsToRemove)) + { + *joinlist = list_delete_cell(*joinlist, cell, prev); + } + else + prev = cell; + } +} + +/* + * Find and remove unique self joins on a single level of a join tree, and + * recurse to handle deeper levels. + */ +static void +remove_self_joins_recurse(UsjScratch *scratch, List **joinlist) +{ + ListCell *lc; + foreach (lc, *joinlist) + { + switch (((Node *) lfirst(lc))->type) + { + case T_List: + remove_self_joins_recurse(scratch, (List **) &lfirst(lc)); + break; + case T_RangeTblRef: + break; + default: + Assert(false); + } + } + remove_self_joins_one_level(scratch, joinlist); +} + +/* + * Find and remove useless self joins. + * + * We search for joins where the same relation is joined to itself on all + * columns of some unique index. If this condition holds, then, for + * each outer row, only one inner row matches, and it is the same row + * of the same relation. This allows us to remove the join and replace + * it with a scan that combines WHERE clauses from both sides. The join + * clauses themselves assume the form of X = X and can be replaced with + * NOT NULL clauses. + * + * For the sake of simplicity, we don't apply this optimization to special + * joins. Here is a list of what we could do in some particular cases: + * 'a a1 semi join a a2': is reduced to inner by reduce_unique_semijoins, + * and then removed normally. + * 'a a1 anti join a a2': could simplify to a scan with 'outer quals AND + * (IS NULL on join columns OR NOT inner quals)'. + * 'a a1 left join a a2': could simplify to a scan like inner, but without + * NOT NULL condtions on join columns. + * 'a a1 left join (a a2 join b)': can't simplify this, because join to b + * can both remove rows and introduce duplicates. + * + * To search for removable joins, we order all the relations on their Oid, + * go over each set with the same Oid, and consider each pair of relations + * in this set. We check that both relation are made unique by the same + * unique index with the same clauses. + * + * To remove the join, we mark one of the participating relation as + * dead, and rewrite all references to it to point to the remaining + * relation. This includes modifying RestrictInfos, EquivalenceClasses and + * EquivalenceMembers. We also have to modify the row marks. The join clauses + * of the removed relation become either restriction or join clauses, based on + * whether they reference any relations not participating in the removed join. + * + * 'targetlist' is the top-level targetlist of query. If it has any references + * to the removed relations, we update them to point to the remaining ones. + */ +void +remove_useless_self_joins(PlannerInfo *root, List **joinlist, List *targetlist) +{ + ListCell *lc; + UsjScratch scratch; + + scratch.root = root; + scratch.relids = palloc(root->simple_rel_array_size * sizeof(Index)); + scratch.special_join_rels = palloc0(root->simple_rel_array_size * sizeof(Relids)); + scratch.row_marks = palloc0(root->simple_rel_array_size * sizeof(PlanRowMark *)); + scratch.joinrelids = NULL; + scratch.targetlist = targetlist; + + /* Find out which relations have special joins to which. */ + foreach(lc, root->join_info_list) + { + SpecialJoinInfo *info = (SpecialJoinInfo *) lfirst(lc); + int bit = -1; + while ((bit = bms_next_member(info->min_lefthand, bit)) >= 0) + { + RelOptInfo *rel = find_base_rel(root, bit); + scratch.special_join_rels[rel->relid] = + bms_add_members(scratch.special_join_rels[rel->relid], + info->min_righthand); + } + + bit = -1; + while ((bit = bms_next_member(info->min_righthand, bit)) >= 0) + { + RelOptInfo *rel = find_base_rel(root, bit); + scratch.special_join_rels[rel->relid] = + bms_add_members(scratch.special_join_rels[rel->relid], + info->min_lefthand); + } + } + + /* Collect row marks. */ + foreach (lc, root->rowMarks) + { + PlanRowMark *rowMark = (PlanRowMark *) lfirst(lc); + + /* Can't have more than one row mark for a relation. */ + Assert(scratch.row_marks[rowMark->rti] == NULL); + + scratch.row_marks[rowMark->rti] = rowMark; + } + + /* Finally, remove the joins. */ + remove_self_joins_recurse(&scratch, joinlist); } diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index 3cedd01..8d6036c 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -224,7 +224,7 @@ query_planner(PlannerInfo *root, List *tlist, * jointree preprocessing, but the necessary information isn't available * until we've built baserel data structures and classified qual clauses. */ - joinlist = remove_useless_joins(root, joinlist); + joinlist = remove_useless_left_joins(root, joinlist); /* * Also, reduce any semijoins with unique inner rels to plain inner joins. @@ -233,6 +233,11 @@ query_planner(PlannerInfo *root, List *tlist, reduce_unique_semijoins(root); /* + * Remove self joins on a unique column. + */ + remove_useless_self_joins(root, &joinlist, tlist); + + /* * Now distribute "placeholders" to base rels as needed. This has to be * done after join removal because removal could change whether a * placeholder is evaluable at a base rel. diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 169e51e..493f425 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -1578,7 +1578,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath, if (rel->rtekind == RTE_RELATION && sjinfo->semi_can_btree && relation_has_unique_index_for(root, rel, NIL, sjinfo->semi_rhs_exprs, - sjinfo->semi_operators)) + sjinfo->semi_operators, + NULL /*index_info*/)) { pathnode->umethod = UNIQUE_PATH_NOOP; pathnode->path.rows = rel->rows; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 30f4dc1..3d50ea2 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -420,7 +420,12 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, index_close(indexRelation, NoLock); - indexinfos = lcons(info, indexinfos); + /* + * Keep the list of IndexOptInfo sorted in the same order as the + * Oids returned by RelationGetIndexList. This ordering is used + * to simplify index matching for self-join removal. + */ + indexinfos = lappend(indexinfos, info); } list_free(indexoidlist); diff --git a/src/backend/optimizer/util/relnode.c b/src/backend/optimizer/util/relnode.c index 4130514..fde4118 100644 --- a/src/backend/optimizer/util/relnode.c +++ b/src/backend/optimizer/util/relnode.c @@ -39,14 +39,10 @@ typedef struct JoinHashEntry static void build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, RelOptInfo *input_rel); -static List *build_joinrel_restrictlist(PlannerInfo *root, - RelOptInfo *joinrel, - RelOptInfo *outer_rel, - RelOptInfo *inner_rel); static void build_joinrel_joinlist(RelOptInfo *joinrel, RelOptInfo *outer_rel, RelOptInfo *inner_rel); -static List *subbuild_joinrel_restrictlist(RelOptInfo *joinrel, +static List *subbuild_joinrel_restrictlist(Relids joinrelids, List *joininfo_list, List *new_restrictlist); static List *subbuild_joinrel_joinlist(RelOptInfo *joinrel, @@ -556,7 +552,7 @@ build_join_rel(PlannerInfo *root, */ if (restrictlist_ptr) *restrictlist_ptr = build_joinrel_restrictlist(root, - joinrel, + joinrel->relids, outer_rel, inner_rel); return joinrel; @@ -659,7 +655,7 @@ build_join_rel(PlannerInfo *root, * caller might or might not need the restrictlist, but I need it anyway * for set_joinrel_size_estimates().) */ - restrictlist = build_joinrel_restrictlist(root, joinrel, + restrictlist = build_joinrel_restrictlist(root, joinrel->relids, outer_rel, inner_rel); if (restrictlist_ptr) *restrictlist_ptr = restrictlist; @@ -981,7 +977,7 @@ build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, * the various joinlist entries ultimately refer to RestrictInfos * pushed into them by distribute_restrictinfo_to_rels(). * - * 'joinrel' is a join relation node + * 'joinrelids' is a join relation id set * 'outer_rel' and 'inner_rel' are a pair of relations that can be joined * to form joinrel. * @@ -994,9 +990,9 @@ build_joinrel_tlist(PlannerInfo *root, RelOptInfo *joinrel, * RestrictInfo nodes are no longer context-dependent. Instead, just include * the original nodes in the lists made for the join relation. */ -static List * +List * build_joinrel_restrictlist(PlannerInfo *root, - RelOptInfo *joinrel, + Relids joinrelids, RelOptInfo *outer_rel, RelOptInfo *inner_rel) { @@ -1007,8 +1003,8 @@ build_joinrel_restrictlist(PlannerInfo *root, * eliminating any duplicates (important since we will see many of the * same clauses arriving from both input relations). */ - result = subbuild_joinrel_restrictlist(joinrel, outer_rel->joininfo, NIL); - result = subbuild_joinrel_restrictlist(joinrel, inner_rel->joininfo, result); + result = subbuild_joinrel_restrictlist(joinrelids, outer_rel->joininfo, NIL); + result = subbuild_joinrel_restrictlist(joinrelids, inner_rel->joininfo, result); /* * Add on any clauses derived from EquivalenceClasses. These cannot be @@ -1017,7 +1013,7 @@ build_joinrel_restrictlist(PlannerInfo *root, */ result = list_concat(result, generate_join_implied_equalities(root, - joinrel->relids, + joinrelids, outer_rel->relids, inner_rel)); @@ -1043,7 +1039,7 @@ build_joinrel_joinlist(RelOptInfo *joinrel, } static List * -subbuild_joinrel_restrictlist(RelOptInfo *joinrel, +subbuild_joinrel_restrictlist(Relids joinrelids, List *joininfo_list, List *new_restrictlist) { @@ -1053,7 +1049,7 @@ subbuild_joinrel_restrictlist(RelOptInfo *joinrel, { RestrictInfo *rinfo = (RestrictInfo *) lfirst(l); - if (bms_is_subset(rinfo->required_relids, joinrel->relids)) + if (bms_is_subset(rinfo->required_relids, joinrelids)) { /* * This clause becomes a restriction clause for the joinrel, since diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 84609e0..3bf5bff 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -4298,7 +4298,9 @@ RelationGetFKeyList(Relation relation) * needed by the executor, since for index types that we obtain exclusive * locks on when updating the index, all backends must lock the indexes in * the same order or we will get deadlocks (see ExecOpenIndices()). Any - * consistent ordering would do, but ordering by OID is easy. + * consistent ordering would do, but ordering by OID is easy. We also rely on + * the resulting ordering of IndexOptInfos to simplify index matching for + * self-join removal. * * Since shared cache inval causes the relcache's copy of the list to go away, * we return a copy of the list palloc'd in the caller's context. The caller diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index ffb4cd4..59ca1fe 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -272,6 +272,7 @@ typedef enum NodeTag T_RollupData, T_GroupingSetData, T_StatisticExtInfo, + T_UniqueRelInfo, /* * TAGS FOR MEMORY NODES (memnodes.h) diff --git a/src/include/nodes/pathnodes.h b/src/include/nodes/pathnodes.h index 253e0b7..f6c358e 100644 --- a/src/include/nodes/pathnodes.h +++ b/src/include/nodes/pathnodes.h @@ -522,8 +522,11 @@ typedef struct PartitionSchemeData *PartitionScheme; * populate these fields, for base rels; but someday they might be used for * join rels too: * - * unique_for_rels - list of Relid sets, each one being a set of other - * rels for which this one has been proven unique + * unique_for_rels - list of UniqueRelInfos, each of them recording + * a set of other rels for which this one has been proven + * unique. If this is a baserel that is made unique by an + * index, UniqueRelInfo also stores the information about + * that index. * non_unique_for_rels - list of Relid sets, each one being a set of * other rels for which we have tried and failed to prove * this one unique @@ -2471,4 +2474,22 @@ typedef struct JoinCostWorkspace double inner_rows_total; } JoinCostWorkspace; +/* + * UniqueRelInfo records a fact that a relation is unique when being joined + * to other relation(s) specified by outerrelids. If the uniqueness was proved + * using a unique index, this index is recorded along with the values that + * constrain index columns. If there are other unique indexes that can + * guarantee uniqueness, they are also saved, up to URI_MAX_INDEXES. The + * indexes are ordered by ascending Oid. + */ +#define URI_MAX_INDEXES 5 +typedef struct UniqueRelInfo +{ + NodeTag tag; + Relids outerrelids; + int n_indexes; + Oid unique_indexes[URI_MAX_INDEXES]; + List *column_values[URI_MAX_INDEXES]; +} UniqueRelInfo; + #endif /* PATHNODES_H */ diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h index 574bb85..7eb59c1 100644 --- a/src/include/optimizer/pathnode.h +++ b/src/include/optimizer/pathnode.h @@ -287,6 +287,11 @@ extern RelOptInfo *build_join_rel(PlannerInfo *root, RelOptInfo *inner_rel, SpecialJoinInfo *sjinfo, List **restrictlist_ptr); + +extern List *build_joinrel_restrictlist(PlannerInfo *root, + Relids joinrelids, + RelOptInfo *outer_rel, + RelOptInfo *inner_rel); extern Relids min_join_parameterization(PlannerInfo *root, Relids joinrelids, RelOptInfo *outer_rel, diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h index 36d12bc..4260227 100644 --- a/src/include/optimizer/paths.h +++ b/src/include/optimizer/paths.h @@ -70,9 +70,11 @@ extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel); * routines to generate index paths */ extern void create_index_paths(PlannerInfo *root, RelOptInfo *rel); + extern bool relation_has_unique_index_for(PlannerInfo *root, RelOptInfo *rel, List *restrictlist, - List *exprlist, List *oprlist); + List *exprlist, List *oprlist, + UniqueRelInfo **info); extern bool indexcol_is_bool_constant_for_query(IndexOptInfo *index, int indexcol); extern bool match_index_to_operand(Node *operand, int indexcol, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index b093a3c..252b8da 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -16,6 +16,7 @@ #include "nodes/pathnodes.h" #include "nodes/plannodes.h" +#include "optimizer/paths.h" /* GUC parameters */ #define DEFAULT_CURSOR_TUPLE_FRACTION 0.1 @@ -95,13 +96,18 @@ extern void match_foreign_keys_to_quals(PlannerInfo *root); /* * prototypes for plan/analyzejoins.c */ -extern List *remove_useless_joins(PlannerInfo *root, List *joinlist); +extern List *remove_useless_left_joins(PlannerInfo *root, List *joinlist); extern void reduce_unique_semijoins(PlannerInfo *root); extern bool query_supports_distinctness(Query *query); extern bool query_is_distinct_for(Query *query, List *colnos, List *opids); + extern bool innerrel_is_unique(PlannerInfo *root, Relids joinrelids, Relids outerrelids, RelOptInfo *innerrel, - JoinType jointype, List *restrictlist, bool force_cache); + JoinType jointype, List *restrictlist, bool force_cache, + UniqueRelInfo **unique_info); + +extern void remove_useless_self_joins(PlannerInfo *root, List **jointree, + List *tlist); /* * prototypes for plan/setrefs.c diff --git a/src/test/regress/expected/equivclass.out b/src/test/regress/expected/equivclass.out index c448d85..a57905f 100644 --- a/src/test/regress/expected/equivclass.out +++ b/src/test/regress/expected/equivclass.out @@ -439,3 +439,35 @@ explain (costs off) Filter: ((unique1 = unique1) OR (unique2 = unique2)) (2 rows) +-- Test that broken ECs are processed correctly during self join removal. +-- Disable merge joins so that we don't get an error about missing commutator. +-- Test both orientations of the join clause, because only one of them breaks +-- the EC. +set enable_mergejoin to off; +explain (costs off) + select * from ec0 m join ec0 n on m.ff = n.ff + join ec1 p on m.ff + n.ff = p.f1; + QUERY PLAN +---------------------------------------- + Nested Loop + Join Filter: ((n.ff + n.ff) = p.f1) + -> Seq Scan on ec1 p + -> Materialize + -> Seq Scan on ec0 n + Filter: (ff IS NOT NULL) +(6 rows) + +explain (costs off) + select * from ec0 m join ec0 n on m.ff = n.ff + join ec1 p on p.f1::int8 = (m.ff + n.ff)::int8alias1; + QUERY PLAN +--------------------------------------------------------------- + Nested Loop + Join Filter: ((p.f1)::bigint = ((n.ff + n.ff))::int8alias1) + -> Seq Scan on ec1 p + -> Materialize + -> Seq Scan on ec0 n + Filter: (ff IS NOT NULL) +(6 rows) + +reset enable_mergejoin; diff --git a/src/test/regress/expected/insert_conflict.out b/src/test/regress/expected/insert_conflict.out index 1338b2b..306dbd1 100644 --- a/src/test/regress/expected/insert_conflict.out +++ b/src/test/regress/expected/insert_conflict.out @@ -53,7 +53,7 @@ explain (costs off) insert into insertconflicttest values(0, 'Crowberry') on con Conflict Filter: (alternatives: SubPlan 1 or hashed SubPlan 2) -> Result SubPlan 1 - -> Index Only Scan using both_index_expr_key on insertconflicttest ii + -> Index Only Scan using op_index_key on insertconflicttest ii Index Cond: (key = excluded.key) SubPlan 2 -> Seq Scan on insertconflicttest ii_1 diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 88fcd52..13aa5ba 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -4365,11 +4365,13 @@ explain (costs off) select p.* from (parent p left join child c on (p.k = c.k)) join parent x on p.k = x.k where p.k = 1 and p.k = 2; - QUERY PLAN --------------------------- + QUERY PLAN +------------------------------------------------ Result One-Time Filter: false -(2 rows) + -> Index Scan using parent_pkey on parent x + Index Cond: (k = 1) +(4 rows) -- bug 5255: this is not optimizable by join removal begin; @@ -4486,6 +4488,253 @@ select * from (0 rows) -- +-- test that semi- or inner self-joins on a unique column are removed +-- +-- enable only nestloop to get more predictable plans +set enable_hashjoin to off; +set enable_mergejoin to off; +create table sj (a int unique, b int); +insert into sj values (1, null), (null, 2), (2, 1); +analyze sj; +select p.* from sj p, sj q where q.a = p.a and q.b = q.a - 1; + a | b +---+--- + 2 | 1 +(1 row) + +explain (costs off) +select p.* from sj p, sj q where q.a = p.a and q.b = q.a - 1; + QUERY PLAN +----------------------------------------------- + Seq Scan on sj q + Filter: ((a IS NOT NULL) AND (b = (a - 1))) +(2 rows) + +explain (costs off) +select * from sj p +where exists (select * from sj q + where q.a = p.a and q.b < 10); + QUERY PLAN +------------------------------------------ + Seq Scan on sj q + Filter: ((a IS NOT NULL) AND (b < 10)) +(2 rows) + +-- subselect that references the removed relation +explain (costs off) +select t1.a, (select a from sj where a = t2.a and a = t1.a) +from sj t1, sj t2 +where t1.a = t2.a; + QUERY PLAN +------------------------------------------ + Seq Scan on sj t2 + Filter: (a IS NOT NULL) + SubPlan 1 + -> Result + One-Time Filter: (t2.a = t2.a) + -> Seq Scan on sj + Filter: (a = t2.a) +(7 rows) + +-- self-join under outer join +explain (costs off) +select * from sj x join sj y on x.a = y.a +left join int8_tbl z on x.a = z.q1; + QUERY PLAN +------------------------------------ + Nested Loop Left Join + Join Filter: (y.a = z.q1) + -> Seq Scan on sj y + Filter: (a IS NOT NULL) + -> Materialize + -> Seq Scan on int8_tbl z +(6 rows) + +explain (costs off) +select * from sj x join sj y on x.a = y.a +left join int8_tbl z on y.a = z.q1; + QUERY PLAN +------------------------------------ + Nested Loop Left Join + Join Filter: (y.a = z.q1) + -> Seq Scan on sj y + Filter: (a IS NOT NULL) + -> Materialize + -> Seq Scan on int8_tbl z +(6 rows) + +-- Test that placeholders are updated correctly after join removal +explain (costs off) +select * from (values (1)) x +left join (select coalesce(y.q1, 1) from int8_tbl y + right join sj j1 inner join sj j2 on j1.a = j2.a + on true) z +on true; + QUERY PLAN +------------------------------------------ + Nested Loop Left Join + -> Result + -> Nested Loop Left Join + -> Seq Scan on sj j2 + Filter: (a IS NOT NULL) + -> Materialize + -> Seq Scan on int8_tbl y +(7 rows) + +-- HAVING clause +explain (costs off) +select p.b from sj p join sj q on p.a = q.a group by p.b having sum(p.a) = 1; + QUERY PLAN +--------------------------------- + HashAggregate + Group Key: q.b + Filter: (sum(q.a) = 1) + -> Seq Scan on sj q + Filter: (a IS NOT NULL) +(5 rows) + +-- update lateral references +explain (costs off) +select 1 from (select x.* from sj x, sj y where x.a = y.a) q, + lateral generate_series(1, q.a) gs(i); + QUERY PLAN +------------------------------------------- + Nested Loop + -> Seq Scan on sj y + Filter: (a IS NOT NULL) + -> Function Scan on generate_series gs +(4 rows) + +explain (costs off) +select 1 from (select y.* from sj x, sj y where x.a = y.a) q, + lateral generate_series(1, q.a) gs(i); + QUERY PLAN +------------------------------------------- + Nested Loop + -> Seq Scan on sj y + Filter: (a IS NOT NULL) + -> Function Scan on generate_series gs +(4 rows) + +-- Test that a non-EC-derived join clause is processed correctly. Use an +-- outer join so that we can't form an EC. +explain (costs off) select * from sj p join sj q on p.a = q.a + left join sj r on p.a + q.a = r.a; + QUERY PLAN +------------------------------------ + Nested Loop Left Join + Join Filter: ((q.a + q.a) = r.a) + -> Seq Scan on sj q + Filter: (a IS NOT NULL) + -> Materialize + -> Seq Scan on sj r +(6 rows) + +-- FIXME this constant false filter doesn't look good. Should we merge +-- equivalence classes? +explain (costs off) +select * from sj p, sj q where p.a = q.a and p.b = 1 and q.b = 2; + QUERY PLAN +----------------------------------------------------- + Seq Scan on sj q + Filter: ((a IS NOT NULL) AND (b = 2) AND (b = 1)) +(2 rows) + +-- Check that attr_needed is updated correctly after self-join removal. In this +-- test, the join of j1 with j2 is removed. k1.b is required at either j1 or j2. +-- If this info is lost, join targetlist for (k1, k2) will not contain k1.b. +-- Use index scan for k1 so that we don't get 'b' from physical tlist used for +-- seqscan. Also disable reordering of joins because this test depends on a +-- particular join tree. +create table sk (a int, b int); +create index on sk(a); +set join_collapse_limit to 1; +set enable_seqscan to off; +explain (costs off) select 1 from + (sk k1 join sk k2 on k1.a = k2.a) + join (sj j1 join sj j2 on j1.a = j2.a) on j1.b = k1.b; + QUERY PLAN +----------------------------------------------------- + Nested Loop + Join Filter: (k1.b = j2.b) + -> Nested Loop + -> Index Scan using sk_a_idx on sk k1 + -> Index Only Scan using sk_a_idx on sk k2 + Index Cond: (a = k1.a) + -> Materialize + -> Index Scan using sj_a_key on sj j2 + Index Cond: (a IS NOT NULL) +(9 rows) + +explain (costs off) select 1 from + (sk k1 join sk k2 on k1.a = k2.a) + join (sj j1 join sj j2 on j1.a = j2.a) on j2.b = k1.b; + QUERY PLAN +----------------------------------------------------- + Nested Loop + Join Filter: (k1.b = j2.b) + -> Nested Loop + -> Index Scan using sk_a_idx on sk k1 + -> Index Only Scan using sk_a_idx on sk k2 + Index Cond: (a = k1.a) + -> Materialize + -> Index Scan using sj_a_key on sj j2 + Index Cond: (a IS NOT NULL) +(9 rows) + +reset join_collapse_limit; +reset enable_seqscan; +-- If index conditions are different for each side, we won't select the same +-- row on both sides, so the join can't be removed. +create table sl(a int, b int); +create unique index sl_ab on sl(a, b); +explain (costs off) +select * from sl t1, sl t2 +where t1.a = t2.a and t1.b = 1 and t2.b = 2; + QUERY PLAN +---------------------------------------------- + Nested Loop + Join Filter: (t1.a = t2.a) + -> Bitmap Heap Scan on sl t1 + Recheck Cond: (b = 1) + -> Bitmap Index Scan on sl_ab + Index Cond: (b = 1) + -> Materialize + -> Bitmap Heap Scan on sl t2 + Recheck Cond: (b = 2) + -> Bitmap Index Scan on sl_ab + Index Cond: (b = 2) +(11 rows) + +-- Check that we can cope with multiple matching unique indexes on each side +create table sm(a int unique, b int unique, c int unique); +explain (costs off) +select * from sm m, sm n where m.a = n.b and m.c = n.c; + QUERY PLAN +----------------------------------------- + Seq Scan on sm n + Filter: ((c IS NOT NULL) AND (a = b)) +(2 rows) + +explain (costs off) +select * from sm m, sm n where m.a = n.c and m.b = n.b; + QUERY PLAN +----------------------------------------- + Seq Scan on sm n + Filter: ((b IS NOT NULL) AND (a = c)) +(2 rows) + +explain (costs off) +select * from sm m, sm n where m.c = n.b and m.a = n.a; + QUERY PLAN +----------------------------------------- + Seq Scan on sm n + Filter: ((a IS NOT NULL) AND (c = b)) +(2 rows) + +reset enable_hashjoin; +reset enable_mergejoin; +-- -- Test hints given on incorrect column references are useful -- select t1.uunique1 from diff --git a/src/test/regress/expected/stats_ext.out b/src/test/regress/expected/stats_ext.out index 054a381..6ba3b0a 100644 --- a/src/test/regress/expected/stats_ext.out +++ b/src/test/regress/expected/stats_ext.out @@ -404,7 +404,7 @@ EXPLAIN (COSTS OFF) --------------------------------------------------- Bitmap Heap Scan on functional_dependencies Recheck Cond: ((a = 1) AND (b = '1'::text)) - -> Bitmap Index Scan on fdeps_abc_idx + -> Bitmap Index Scan on fdeps_ab_idx Index Cond: ((a = 1) AND (b = '1'::text)) (4 rows) @@ -425,7 +425,7 @@ EXPLAIN (COSTS OFF) --------------------------------------------------- Bitmap Heap Scan on functional_dependencies Recheck Cond: ((a = 1) AND (b = '1'::text)) - -> Bitmap Index Scan on fdeps_abc_idx + -> Bitmap Index Scan on fdeps_ab_idx Index Cond: ((a = 1) AND (b = '1'::text)) (4 rows) @@ -445,19 +445,20 @@ INSERT INTO functional_dependencies (a, b, c, filler1) ANALYZE functional_dependencies; EXPLAIN (COSTS OFF) SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1'; - QUERY PLAN ------------------------------------------------------------ - Index Scan using fdeps_abc_idx on functional_dependencies + QUERY PLAN +---------------------------------------------------------- + Index Scan using fdeps_ab_idx on functional_dependencies Index Cond: ((a = 1) AND (b = '1'::text)) (2 rows) EXPLAIN (COSTS OFF) SELECT * FROM functional_dependencies WHERE a = 1 AND b = '1' AND c = 1; - QUERY PLAN ------------------------------------------------------------ - Index Scan using fdeps_abc_idx on functional_dependencies - Index Cond: ((a = 1) AND (b = '1'::text) AND (c = 1)) -(2 rows) + QUERY PLAN +---------------------------------------------------------- + Index Scan using fdeps_ab_idx on functional_dependencies + Index Cond: ((a = 1) AND (b = '1'::text)) + Filter: (c = 1) +(3 rows) -- create statistics CREATE STATISTICS func_deps_stat (dependencies) ON a, b, c FROM functional_dependencies; @@ -468,7 +469,7 @@ EXPLAIN (COSTS OFF) --------------------------------------------------- Bitmap Heap Scan on functional_dependencies Recheck Cond: ((a = 1) AND (b = '1'::text)) - -> Bitmap Index Scan on fdeps_abc_idx + -> Bitmap Index Scan on fdeps_ab_idx Index Cond: ((a = 1) AND (b = '1'::text)) (4 rows) diff --git a/src/test/regress/sql/equivclass.sql b/src/test/regress/sql/equivclass.sql index 85aa65d..b1e2483 100644 --- a/src/test/regress/sql/equivclass.sql +++ b/src/test/regress/sql/equivclass.sql @@ -262,3 +262,20 @@ explain (costs off) -- this could be converted, but isn't at present explain (costs off) select * from tenk1 where unique1 = unique1 or unique2 = unique2; + + +-- Test that broken ECs are processed correctly during self join removal. +-- Disable merge joins so that we don't get an error about missing commutator. +-- Test both orientations of the join clause, because only one of them breaks +-- the EC. +set enable_mergejoin to off; + +explain (costs off) + select * from ec0 m join ec0 n on m.ff = n.ff + join ec1 p on m.ff + n.ff = p.f1; + +explain (costs off) + select * from ec0 m join ec0 n on m.ff = n.ff + join ec1 p on p.f1::int8 = (m.ff + n.ff)::int8alias1; + +reset enable_mergejoin; diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index c247509..c2937ed 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -1570,6 +1570,114 @@ select * from int8_tbl x join (int4_tbl x cross join int4_tbl y(ff)) j on q1 = f1; -- ok -- +-- test that semi- or inner self-joins on a unique column are removed +-- + +-- enable only nestloop to get more predictable plans +set enable_hashjoin to off; +set enable_mergejoin to off; + +create table sj (a int unique, b int); +insert into sj values (1, null), (null, 2), (2, 1); +analyze sj; + +select p.* from sj p, sj q where q.a = p.a and q.b = q.a - 1; + +explain (costs off) +select p.* from sj p, sj q where q.a = p.a and q.b = q.a - 1; + +explain (costs off) +select * from sj p +where exists (select * from sj q + where q.a = p.a and q.b < 10); + +-- subselect that references the removed relation +explain (costs off) +select t1.a, (select a from sj where a = t2.a and a = t1.a) +from sj t1, sj t2 +where t1.a = t2.a; + +-- self-join under outer join +explain (costs off) +select * from sj x join sj y on x.a = y.a +left join int8_tbl z on x.a = z.q1; + +explain (costs off) +select * from sj x join sj y on x.a = y.a +left join int8_tbl z on y.a = z.q1; + +-- Test that placeholders are updated correctly after join removal +explain (costs off) +select * from (values (1)) x +left join (select coalesce(y.q1, 1) from int8_tbl y + right join sj j1 inner join sj j2 on j1.a = j2.a + on true) z +on true; + +-- HAVING clause +explain (costs off) +select p.b from sj p join sj q on p.a = q.a group by p.b having sum(p.a) = 1; + +-- update lateral references +explain (costs off) +select 1 from (select x.* from sj x, sj y where x.a = y.a) q, + lateral generate_series(1, q.a) gs(i); + +explain (costs off) +select 1 from (select y.* from sj x, sj y where x.a = y.a) q, + lateral generate_series(1, q.a) gs(i); + +-- Test that a non-EC-derived join clause is processed correctly. Use an +-- outer join so that we can't form an EC. +explain (costs off) select * from sj p join sj q on p.a = q.a + left join sj r on p.a + q.a = r.a; + +-- FIXME this constant false filter doesn't look good. Should we merge +-- equivalence classes? +explain (costs off) +select * from sj p, sj q where p.a = q.a and p.b = 1 and q.b = 2; + +-- Check that attr_needed is updated correctly after self-join removal. In this +-- test, the join of j1 with j2 is removed. k1.b is required at either j1 or j2. +-- If this info is lost, join targetlist for (k1, k2) will not contain k1.b. +-- Use index scan for k1 so that we don't get 'b' from physical tlist used for +-- seqscan. Also disable reordering of joins because this test depends on a +-- particular join tree. +create table sk (a int, b int); +create index on sk(a); +set join_collapse_limit to 1; +set enable_seqscan to off; +explain (costs off) select 1 from + (sk k1 join sk k2 on k1.a = k2.a) + join (sj j1 join sj j2 on j1.a = j2.a) on j1.b = k1.b; +explain (costs off) select 1 from + (sk k1 join sk k2 on k1.a = k2.a) + join (sj j1 join sj j2 on j1.a = j2.a) on j2.b = k1.b; +reset join_collapse_limit; +reset enable_seqscan; + +-- If index conditions are different for each side, we won't select the same +-- row on both sides, so the join can't be removed. +create table sl(a int, b int); +create unique index sl_ab on sl(a, b); + +explain (costs off) +select * from sl t1, sl t2 +where t1.a = t2.a and t1.b = 1 and t2.b = 2; + +-- Check that we can cope with multiple matching unique indexes on each side +create table sm(a int unique, b int unique, c int unique); +explain (costs off) +select * from sm m, sm n where m.a = n.b and m.c = n.c; +explain (costs off) +select * from sm m, sm n where m.a = n.c and m.b = n.b; +explain (costs off) +select * from sm m, sm n where m.c = n.b and m.a = n.a; + +reset enable_hashjoin; +reset enable_mergejoin; + +-- -- Test hints given on incorrect column references are useful -- -- 2.7.4