diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index 23e5526..4c1bae6 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -2697,26 +2697,63 @@ neqjoinsel(PG_FUNCTION_ARGS) Oid eqop; float8 result; - /* - * We want 1 - eqjoinsel() where the equality operator is the one - * associated with this != operator, that is, its negator. - */ - eqop = get_negator(operator); - if (eqop) + + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) { - result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel, - PointerGetDatum(root), - ObjectIdGetDatum(eqop), - PointerGetDatum(args), - Int16GetDatum(jointype), - PointerGetDatum(sjinfo))); + VariableStatData leftvar; + VariableStatData rightvar; + double nullfrac; + bool reversed; + HeapTuple statsTuple; + + get_join_variables(root, args, sjinfo, &leftvar, &rightvar, &reversed); + statsTuple = reversed ? rightvar.statsTuple : leftvar.statsTuple; + if (HeapTupleIsValid(statsTuple)) + nullfrac = ((Form_pg_statistic) GETSTRUCT(statsTuple))->stanullfrac; + else + nullfrac = 0.0; + ReleaseVariableStats(leftvar); + ReleaseVariableStats(rightvar); + + /* + * For semi-joins, if there is more than one distinct value in the RHS + * relation then every non-null LHS row must find a row to join since + * it can only be equal to one of them. We'll assume that there is + * always more than one distinct RHS value for the sake of stability, + * though in theory we could have special cases for empty RHS + * (selectivity = 0) and single-distinct-value RHS (selectivity = + * fraction of LHS that has the same value as the single RHS value). + * + * For anti-joins, if we use the same assumption that there is more + * than one distinct key in the RHS relation, then every non-null LHS + * row must be suppressed by the anti-join leaving only nullfrac. + */ + result = 1.0 - nullfrac; } else { - /* Use default selectivity (should we raise an error instead?) */ - result = DEFAULT_EQ_SEL; + /* + * We want 1 - eqjoinsel() where the equality operator is the one + * associated with this != operator, that is, its negator. + */ + eqop = get_negator(operator); + if (eqop) + { + result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel, + PointerGetDatum(root), + ObjectIdGetDatum(eqop), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + } + else + { + /* Use default selectivity (should we raise an error instead?) */ + result = DEFAULT_EQ_SEL; + } + result = 1.0 - result; } - result = 1.0 - result; + PG_RETURN_FLOAT8(result); } diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index 9f4c88d..10bfb68 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -1845,6 +1845,89 @@ SELECT '' AS "xxx", * | 1 | 4 | one | -1 (1 row) +-- SEMI and ANTI join neq selectivity +ANALYZE J1_TBL; +ANALYZE J2_TBL; +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT * FROM J1_TBL t1 +WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i); + QUERY PLAN +----------------------------------------------------------- + Nested Loop Semi Join (actual rows=9 loops=1) + Join Filter: (t1.i <> t2.i) + Rows Removed by Join Filter: 19 + -> Seq Scan on j1_tbl t1 (actual rows=11 loops=1) + -> Materialize (actual rows=3 loops=11) + -> Seq Scan on j2_tbl t2 (actual rows=9 loops=1) +(6 rows) + +SELECT count(*) FROM J1_TBL t1 +WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i); + count +------- + 9 +(1 row) + +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT * FROM J1_TBL t1 +WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i); + QUERY PLAN +----------------------------------------------------------- + Nested Loop Semi Join (actual rows=9 loops=1) + Join Filter: (t2.i <> t1.i) + Rows Removed by Join Filter: 19 + -> Seq Scan on j1_tbl t1 (actual rows=11 loops=1) + -> Materialize (actual rows=3 loops=11) + -> Seq Scan on j2_tbl t2 (actual rows=9 loops=1) +(6 rows) + +SELECT count(*) FROM J1_TBL t1 +WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i); + count +------- + 9 +(1 row) + +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT * FROM J1_TBL t1 +WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i); + QUERY PLAN +----------------------------------------------------------- + Nested Loop Anti Join (actual rows=2 loops=1) + Join Filter: (t1.i <> t2.i) + Rows Removed by Join Filter: 19 + -> Seq Scan on j1_tbl t1 (actual rows=11 loops=1) + -> Materialize (actual rows=3 loops=11) + -> Seq Scan on j2_tbl t2 (actual rows=9 loops=1) +(6 rows) + +SELECT count(*) FROM J1_TBL t1 +WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i); + count +------- + 2 +(1 row) + +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT * FROM J1_TBL t1 +WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i); + QUERY PLAN +----------------------------------------------------------- + Nested Loop Anti Join (actual rows=2 loops=1) + Join Filter: (t2.i <> t1.i) + Rows Removed by Join Filter: 19 + -> Seq Scan on j1_tbl t1 (actual rows=11 loops=1) + -> Materialize (actual rows=3 loops=11) + -> Seq Scan on j2_tbl t2 (actual rows=9 loops=1) +(6 rows) + +SELECT count(*) FROM J1_TBL t1 +WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i); + count +------- + 2 +(1 row) + -- -- More complicated constructs -- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index 835d675..e97f57c 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -193,6 +193,32 @@ SELECT '' AS "xxx", * SELECT '' AS "xxx", * FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (i = 1); +-- SEMI and ANTI join neq selectivity +ANALYZE J1_TBL; +ANALYZE J2_TBL; +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT * FROM J1_TBL t1 +WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i); +SELECT count(*) FROM J1_TBL t1 +WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i); + +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT * FROM J1_TBL t1 +WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i); +SELECT count(*) FROM J1_TBL t1 +WHERE EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i); + +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT * FROM J1_TBL t1 +WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i); +SELECT count(*) FROM J1_TBL t1 +WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t1.i <> t2.i); + +EXPLAIN (ANALYZE, COSTS OFF, TIMING OFF, SUMMARY OFF) +SELECT * FROM J1_TBL t1 +WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i); +SELECT count(*) FROM J1_TBL t1 +WHERE NOT EXISTS (SELECT * FROM J2_TBL t2 WHERE t2.i <> t1.i); -- -- More complicated constructs