From c9936b1a2460c3b3cf3a42cf1ef51b4d018c6c07 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@ispras.ru>
Date: Sat, 11 Mar 2017 00:36:31 +0300
Subject: [PATCH 5/8] Reversed HashJoin implementation.

The main point is that tuples are pushed immediately after the match, i.e. we
scan the whole bucket in one loop and pushTuple each match.
---
 src/backend/executor/execProcnode.c |  39 +++
 src/backend/executor/nodeHash.c     | 242 +++++++++++++++++-
 src/backend/executor/nodeHashjoin.c | 479 +++++++++++++-----------------------
 src/include/executor/nodeHash.h     |   9 +-
 src/include/executor/nodeHashjoin.h | 100 +++++++-
 src/include/nodes/execnodes.h       |   2 +
 6 files changed, 547 insertions(+), 324 deletions(-)

diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c
index b0468667bb..88e14d144a 100644
--- a/src/backend/executor/execProcnode.c
+++ b/src/backend/executor/execProcnode.c
@@ -156,6 +156,22 @@ ExecInitNode(Plan *node, EState *estate, int eflags, PlanState *parent)
 			result = (PlanState *) ExecInitSeqScan((SeqScan *) node,
 												   estate, eflags, parent);
 			break;
+
+		/*
+		 * join nodes
+		 */
+		case T_HashJoin:
+			result = (PlanState *) ExecInitHashJoin((HashJoin *) node,
+													estate, eflags, parent);
+			break;
+
+		/*
+		 * materialization nodes
+		 */
+		case T_Hash:
+			result = (PlanState *) ExecInitHash((Hash *) node,
+												estate, eflags, parent);
+			break;
 		default:
 			elog(ERROR, "unrecognized/unsupported node type: %d",
 				 (int) nodeTag(node));
@@ -231,6 +247,15 @@ pushTuple(TupleTableSlot *slot, PlanState *node, PlanState *pusher)
 	/* does push come from the outer side? */
 	push_from_outer = outerPlanState(node) == pusher;

+	if (nodeTag(node) == T_HashState)
+		return pushTupleToHash(slot, (HashState *) node);
+
+	else if (nodeTag(node) == T_HashJoinState && push_from_outer)
+		return pushTupleToHashJoinFromOuter(slot, (HashJoinState *) node);
+
+	else if (nodeTag(node) == T_HashJoinState && !push_from_outer)
+		return pushTupleToHashJoinFromInner(slot, (HashJoinState *) node);
+
 	elog(ERROR, "node type not supported: %d", (int) nodeTag(node));
 }

@@ -280,6 +305,20 @@ ExecEndNode(PlanState *node)
 			ExecEndSeqScan((SeqScanState *) node);
 			break;

+		/*
+		 * join nodes
+		 */
+		case T_HashJoinState:
+			ExecEndHashJoin((HashJoinState *) node);
+			break;
+
+		/*
+		 * materialization nodes
+		 */
+		case T_HashState:
+			ExecEndHash((HashState *) node);
+			break;
+
 		default:
 			elog(ERROR, "unrecognized/unsupported node type: %d",
 				 (int) nodeTag(node));
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 43e65ca04e..06fe45f29b 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -50,17 +50,95 @@ static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);

 static void *dense_alloc(HashJoinTable hashtable, Size size);

-/* ----------------------------------------------------------------
- *		ExecHash
- *
- *		stub for pro forma compliance
- * ----------------------------------------------------------------
+
+/* Put incoming tuples into the hashtable; when NULL is received, finalize
+ * building the hashtable and notify HashJoin about that.
  */
-TupleTableSlot *
-ExecHash(HashState *node)
+bool
+pushTupleToHash(TupleTableSlot *slot, HashState *node)
 {
-	elog(ERROR, "Hash node does not support ExecProcNode call convention");
-	return NULL;
+	List	   *hashkeys;
+	HashJoinTable hashtable;
+	ExprContext *econtext;
+	uint32		hashvalue;
+	HashJoinState *hj_node;
+
+	hj_node = (HashJoinState *) node->ps.parent;
+
+	/* Create the hashtable. In vanilla Postgres this code is in HashJoin */
+	if (node->first_time_through)
+	{
+		Assert(node->hashtable == NULL);
+
+		node->hashtable = ExecHashTableCreate((Hash *) node->ps.plan,
+											  hj_node->hj_HashOperators,
+											  HJ_FILL_INNER(hj_node));
+
+		/* must provide our own instrumentation support */
+		if (node->ps.instrument)
+			InstrStartNode(node->ps.instrument);
+
+		node->first_time_through = false;
+	}
+
+	/*
+	 * get state info from node
+	 */
+	hashtable = node->hashtable;
+
+	/*
+	 * set expression context
+	 */
+	hashkeys = node->hashkeys;
+	econtext = node->ps.ps_ExprContext;
+
+	/* NULL tuple received; let HashJoin know that the hashtable is built
+	   and exit */
+	if (TupIsNull(slot))
+	{
+		/* resize the hash table if needed (NTUP_PER_BUCKET exceeded) */
+		if (hashtable->nbuckets != hashtable->nbuckets_optimal)
+			ExecHashIncreaseNumBuckets(hashtable);
+
+		/* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
+		hashtable->spaceUsed += hashtable->nbuckets * sizeof(HashJoinTuple);
+		if (hashtable->spaceUsed > hashtable->spacePeak)
+			hashtable->spacePeak = hashtable->spaceUsed;
+
+		/* must provide our own instrumentation support */
+		if (node->ps.instrument)
+			InstrStopNode(node->ps.instrument, hashtable->totalTuples);
+
+		pushTuple(NULL, (PlanState *) node->ps.parent, (PlanState *) node);
+		return false;
+	}
+
+	/* We have to compute the hash value */
+	econtext->ecxt_innertuple = slot;
+	if (ExecHashGetHashValue(hashtable, econtext, hashkeys,
+							 false, hashtable->keepNulls,
+							 &hashvalue))
+	{
+		int			bucketNumber;
+
+		bucketNumber = ExecHashGetSkewBucket(hashtable, hashvalue);
+		if (bucketNumber != INVALID_SKEW_BUCKET_NO)
+		{
+			/* It's a skew tuple, so put it into that hash table */
+			ExecHashSkewTableInsert(hashtable, slot, hashvalue,
+									bucketNumber);
+			hashtable->skewTuples += 1;
+		}
+		else
+		{
+			/* Not subject to skew optimization, so insert normally */
+			ExecHashTableInsert(hashtable, slot, hashvalue);
+		}
+		hashtable->totalTuples += 1;
+	}
+
+	/* ready to accept another tuple */
+	return true;
 }

 /* ----------------------------------------------------------------
@@ -159,7 +237,7 @@ MultiExecHash(HashState *node)
  * ----------------------------------------------------------------
  */
 HashState *
-ExecInitHash(Hash *node, EState *estate, int eflags)
+ExecInitHash(Hash *node, EState *estate, int eflags, PlanState *parent)
 {
 	HashState  *hashstate;

@@ -172,8 +250,10 @@ ExecInitHash(Hash *node, EState *estate, int eflags)
 	hashstate = makeNode(HashState);
 	hashstate->ps.plan = (Plan *) node;
 	hashstate->ps.state = estate;
+	hashstate->ps.parent = parent;
 	hashstate->hashtable = NULL;
 	hashstate->hashkeys = NIL;	/* will be set by parent HashJoin */
+	hashstate->first_time_through = true;

 	/*
 	 * Miscellaneous initialization
@@ -201,7 +281,7 @@ ExecInitHash(Hash *node, EState *estate, int eflags)
 	 * initialize child nodes
 	 */
 	outerPlanState(hashstate) = ExecInitNode(outerPlan(node), estate, eflags,
-											 (PlanState*) hashstate);
+											 (PlanState *) hashstate);

 	/*
 	 * initialize tuple type. no need to initialize projection info because
@@ -1115,6 +1195,68 @@ ExecScanHashBucket(HashJoinState *hjstate,
 }

 /*
+ * ExecScanHashBucketAndPush
+ *		scan a hash bucket for matches to the current outer tuple and push
+ *		them
+ *
+ * The current outer tuple must be stored in econtext->ecxt_outertuple.
+ *
+ * Returns true, if parent still accepts tuples, false otherwise.
+ */
+bool
+ExecScanHashBucketAndPush(HashJoinState *hjstate,
+						  ExprContext *econtext)
+{
+	List	   *hjclauses = hjstate->hashclauses;
+	HashJoinTable hashtable = hjstate->hj_HashTable;
+	HashJoinTuple hashTuple;
+	uint32		hashvalue = hjstate->hj_CurHashValue;
+	bool parent_accepts_tuples = true;
+
+	/*
+	 * For now, we don't support pausing execution; we either push all matching
+	 * tuples from the bucket at once or don't touch it at all.
+	 */
+	Assert(hjstate->hj_CurTuple == NULL);
+
+	/*
+	 * If the tuple hashed to a skew bucket then scan the skew bucket
+	 * otherwise scan the standard hashtable bucket.
+	 */
+	if (hjstate->hj_CurSkewBucketNo != INVALID_SKEW_BUCKET_NO)
+		hashTuple = hashtable->skewBucket[hjstate->hj_CurSkewBucketNo]->tuples;
+	else
+		hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
+
+	while (hashTuple != NULL)
+	{
+		if (hashTuple->hashvalue == hashvalue)
+		{
+			TupleTableSlot *inntuple;
+
+			/* insert hashtable's tuple into exec slot so ExecQual sees it */
+			inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
+											 hjstate->hj_HashTupleSlot,
+											 false);	/* do not pfree */
+			econtext->ecxt_innertuple = inntuple;
+
+			/* reset temp memory each time to avoid leaks from qual expr */
+			ResetExprContext(econtext);
+
+			if (ExecQual(hjclauses, econtext, false))
+			{
+				hjstate->hj_CurTuple = hashTuple;
+				parent_accepts_tuples = CheckJoinQualAndPush(hjstate);
+			}
+		}
+
+		hashTuple = hashTuple->next;
+	}
+
+	return parent_accepts_tuples;
+}
+
+/*
  * ExecPrepHashTableForUnmatched
  *		set up for a series of ExecScanHashTableForUnmatched calls
  */
@@ -1206,6 +1348,84 @@ ExecScanHashTableForUnmatched(HashJoinState *hjstate, ExprContext *econtext)
 }

 /*
+ * ExecScanHashTableForUnmatchedAndPush
+ *		scan the hash table for unmatched inner tuples and push them
+ *
+ * Like ExecScanHashTableForUnmatched, but pushes all tuples immediately.
+ * Returns true, if parent still accepts tuples, false otherwise
+ */
+bool
+ExecScanHashTableForUnmatchedAndPush(HashJoinState *hjstate,
+									 ExprContext *econtext)
+{
+	HashJoinTable hashtable = hjstate->hj_HashTable;
+	HashJoinTuple hashTuple = NULL;
+	bool parent_accepts_tuples = true;
+
+	/*
+	 * For now, we don't support pausing execution and don't enter here twice
+	 */
+	Assert(hjstate->hj_CurTuple == NULL);
+
+	for (;;)
+	{
+		/*
+		 * hashTuple is NULL here whenever the previous bucket has been fully
+		 * scanned by the inner loop below (and on the first pass), so it is
+		 * time to start scanning a new bucket.
+		 */
+		if (hashTuple != NULL)
+			hashTuple = hashTuple->next;
+		else if (hjstate->hj_CurBucketNo < hashtable->nbuckets)
+		{
+			hashTuple = hashtable->buckets[hjstate->hj_CurBucketNo];
+			hjstate->hj_CurBucketNo++;
+		}
+		else if (hjstate->hj_CurSkewBucketNo < hashtable->nSkewBuckets)
+		{
+			int			j = hashtable->skewBucketNums[hjstate->hj_CurSkewBucketNo];
+
+			hashTuple = hashtable->skewBucket[j]->tuples;
+			hjstate->hj_CurSkewBucketNo++;
+		}
+		else
+			break;				/* finished all buckets */
+
+		while (hashTuple != NULL)
+		{
+			if (!HeapTupleHeaderHasMatch(HJTUPLE_MINTUPLE(hashTuple)))
+			{
+				TupleTableSlot *inntuple;
+
+				/* insert hashtable's tuple into exec slot */
+				inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
+												 hjstate->hj_HashTupleSlot,
+												 false);		/* do not pfree */
+				econtext->ecxt_innertuple = inntuple;
+
+				/*
+				 * Reset temp memory each time; although this function doesn't
+				 * do any qual eval, the caller will, so let's keep it
+				 * parallel to ExecScanHashBucket.
+				 */
+				ResetExprContext(econtext);
+
+				/*
+				 * Since right now we don't support pausing execution anyway,
+				 * remembering the current tuple here is probably unnecessary.
+				 */
+				hjstate->hj_CurTuple = hashTuple;
+				parent_accepts_tuples = PushUnmatched(hjstate);
+			}
+
+			hashTuple = hashTuple->next;
+		}
+	}
+
+	return parent_accepts_tuples;
+}
+
+/*
  * ExecHashTableReset
  *
  *		reset hash table header for new batch
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index b48863f90b..6c637548e1 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -27,172 +27,149 @@
 /*
  * States of the ExecHashJoin state machine
  */
-#define HJ_BUILD_HASHTABLE		1
-#define HJ_NEED_NEW_OUTER		2
-#define HJ_SCAN_BUCKET			3
-#define HJ_FILL_OUTER_TUPLE		4
-#define HJ_FILL_INNER_TUPLES	5
-#define HJ_NEED_NEW_BATCH		6
-
-/* Returns true if doing null-fill on outer relation */
-#define HJ_FILL_OUTER(hjstate)	((hjstate)->hj_NullInnerTupleSlot != NULL)
-/* Returns true if doing null-fill on inner relation */
-#define HJ_FILL_INNER(hjstate)	((hjstate)->hj_NullOuterTupleSlot != NULL)
-
-static TupleTableSlot *ExecHashJoinOuterGetTuple(PlanState *outerNode,
-						  HashJoinState *hjstate,
-						  uint32 *hashvalue);
-static TupleTableSlot *ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
-						  BufFile *file,
-						  uint32 *hashvalue,
-						  TupleTableSlot *tupleSlot);
+#define HJ_BUILD_HASHTABLE				1
+#define HJ_NEED_NEW_OUTER				2
+#define HJ_SCAN_BUCKET					3
+#define HJ_FILL_OUTER_TUPLE				4
+#define HJ_FILL_INNER_TUPLES			5
+#define HJ_NEED_NEW_BATCH				6
+#define HJ_WAITING_FOR_NEW_OUTER		7
+#define HJ_HANDLE_NEW_OUTER				8
+#define HJ_TAKE_OUTER_FROM_TEMP_FILE	9
+
+static TupleTableSlot *ExecHashJoinGetSavedTuple(BufFile *file,
+												 uint32 *hashvalue,
+												 TupleTableSlot *tupleSlot);
 static bool ExecHashJoinNewBatch(HashJoinState *hjstate);
+static TupleTableSlot *TakeOuterFromTempFile(HashJoinState *hjstate,
+											 uint32 *hashvalue);

-/* ----------------------------------------------------------------
- *		ExecHashJoin
- *
- *		This function implements the Hybrid Hashjoin algorithm.
- *
- *		Note: the relation we build hash table on is the "inner"
- *			  the other one is "outer".
- * ----------------------------------------------------------------
+
+/*
+ * This function will be called from Hash node with NULL slot, signaling
+ * that the hashtable is built.
+ * "Extract-one-outer-tuple-to-check-if-it-is-null-before-building-hashtable"
+ * optimization is not implemented for now, the hashtable will be always built
+ * first.
+ */
+bool
+pushTupleToHashJoinFromInner(TupleTableSlot *slot, HashJoinState *node)
+{
+	HashJoinTable hashtable;
+	HashState *hashNode;
+
+	hashNode = (HashState *) innerPlanState(node);
+
+	/* we should get here only once */
+	Assert(node->hj_JoinState == HJ_BUILD_HASHTABLE);
+	/* we will fish out the tuples from Hash node ourselves */
+	Assert(TupIsNull(slot));
+
+	/* we always build the hashtable first */
+	node->hj_FirstOuterTupleSlot = NULL;
+
+	hashtable = hashNode->hashtable;
+	node->hj_HashTable = hashtable;
+
+	/*
+	 * need to remember whether nbatch has increased since we
+	 * began scanning the outer relation
+	 */
+	hashtable->nbatch_outstart = hashtable->nbatch;
+
+	/*
+	 * Reset OuterNotEmpty for scan.
+	 */
+	node->hj_OuterNotEmpty = false;
+
+	node->hj_JoinState = HJ_WAITING_FOR_NEW_OUTER;
+
+	/* Don't send us anything on the inner side */
+	return false;
+}
+
+/*
+ * Push from the outer side. Find matches and send them upward to HashJoin's
+ * parent. Return true if this node is ready to accept yet another outer
+ * tuple, false otherwise. When this function is called, the hashtable must
+ * already be filled.
  */
-TupleTableSlot *				/* return: a tuple or NULL */
-ExecHashJoin(HashJoinState *node)
+bool
+pushTupleToHashJoinFromOuter(TupleTableSlot *slot, HashJoinState *node)
 {
-	PlanState  *outerNode;
-	HashState  *hashNode;
-	List	   *joinqual;
-	List	   *otherqual;
 	ExprContext *econtext;
 	HashJoinTable hashtable;
-	TupleTableSlot *outerTupleSlot;
 	uint32		hashvalue;
 	int			batchno;

 	/*
 	 * get information from HashJoin node
 	 */
-	joinqual = node->js.joinqual;
-	otherqual = node->js.ps.qual;
-	hashNode = (HashState *) innerPlanState(node);
-	outerNode = outerPlanState(node);
-	hashtable = node->hj_HashTable;
 	econtext = node->js.ps.ps_ExprContext;
+	hashtable = node->hj_HashTable;

-	/*
-	 * Reset per-tuple memory context to free any expression evaluation
-	 * storage allocated in the previous tuple cycle.
-	 */
-	ResetExprContext(econtext);
+	/* We must always be in this state when the tuple is pushed */
+	Assert(node->hj_JoinState == HJ_WAITING_FOR_NEW_OUTER);

-	/*
-	 * run the hash join state machine
-	 */
-	for (;;)
+	if (!TupIsNull(slot))
 	{
-		switch (node->hj_JoinState)
+		/*
+		 * We have to compute the tuple's hash value.
+		 */
+		econtext->ecxt_outertuple = slot;
+		if (!ExecHashGetHashValue(hashtable, econtext,
+								  node->hj_OuterHashKeys,
+								  true,		/* outer tuple */
+								  HJ_FILL_OUTER(node),
+								  &hashvalue))
 		{
-			case HJ_BUILD_HASHTABLE:
+			/*
+			 * That tuple couldn't match because of a NULL, so discard it and
+			 * wait for the next one.
+			 */
+			return true;
+		}
+	}

-				/*
-				 * First time through: build hash table for inner relation.
-				 */
-				Assert(hashtable == NULL);
+	/* ready to handle this slot */
+	node->hj_JoinState = HJ_HANDLE_NEW_OUTER;

-				/*
-				 * If the outer relation is completely empty, and it's not
-				 * right/full join, we can quit without building the hash
-				 * table.  However, for an inner join it is only a win to
-				 * check this when the outer relation's startup cost is less
-				 * than the projected cost of building the hash table.
-				 * Otherwise it's best to build the hash table first and see
-				 * if the inner relation is empty.  (When it's a left join, we
-				 * should always make this check, since we aren't going to be
-				 * able to skip the join on the strength of an empty inner
-				 * relation anyway.)
-				 *
-				 * If we are rescanning the join, we make use of information
-				 * gained on the previous scan: don't bother to try the
-				 * prefetch if the previous scan found the outer relation
-				 * nonempty. This is not 100% reliable since with new
-				 * parameters the outer relation might yield different
-				 * results, but it's a good heuristic.
-				 *
-				 * The only way to make the check is to try to fetch a tuple
-				 * from the outer plan node.  If we succeed, we have to stash
-				 * it away for later consumption by ExecHashJoinOuterGetTuple.
-				 */
-				if (HJ_FILL_INNER(node))
-				{
-					/* no chance to not build the hash table */
-					node->hj_FirstOuterTupleSlot = NULL;
-				}
-				else if (HJ_FILL_OUTER(node) ||
-						 (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
-						  !node->hj_OuterNotEmpty))
+	/* Push tuples matching to the received outer tuple while we can */
+	for (;;)
+	{
+		switch(node->hj_JoinState)
+		{
+			case HJ_NEED_NEW_OUTER:
+				if (hashtable->curbatch == 0)
 				{
-					node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
-					if (TupIsNull(node->hj_FirstOuterTupleSlot))
-					{
-						node->hj_OuterNotEmpty = false;
-						return NULL;
-					}
-					else
-						node->hj_OuterNotEmpty = true;
+					/*
+					 * On the first batch, we always receive tuples from the
+					 * nodes below, not from temp files. So set the state to
+					 * waiting for a new outer tuple and tell the node below
+					 * that we are ready to accept it.
+					 */
+					node->hj_JoinState = HJ_WAITING_FOR_NEW_OUTER;
+					return true;
 				}
-				else
-					node->hj_FirstOuterTupleSlot = NULL;
-
-				/*
-				 * create the hash table
-				 */
-				hashtable = ExecHashTableCreate((Hash *) hashNode->ps.plan,
-												node->hj_HashOperators,
-												HJ_FILL_INNER(node));
-				node->hj_HashTable = hashtable;
-
-				/*
-				 * execute the Hash node, to build the hash table
-				 */
-				hashNode->hashtable = hashtable;
-				(void) MultiExecProcNode((PlanState *) hashNode);
-
-				/*
-				 * If the inner relation is completely empty, and we're not
-				 * doing a left outer join, we can quit without scanning the
-				 * outer relation.
-				 */
-				if (hashtable->totalTuples == 0 && !HJ_FILL_OUTER(node))
-					return NULL;
-
 				/*
-				 * need to remember whether nbatch has increased since we
-				 * began scanning the outer relation
+				 * on subsequent batches, we always take tuples from temp
+				 * files
 				 */
-				hashtable->nbatch_outstart = hashtable->nbatch;
-
-				/*
-				 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a
-				 * tuple above, because ExecHashJoinOuterGetTuple will
-				 * immediately set it again.)
-				 */
-				node->hj_OuterNotEmpty = false;
-
-				node->hj_JoinState = HJ_NEED_NEW_OUTER;
+				slot = TakeOuterFromTempFile(node, &hashvalue);
+				/* ready to handle this slot */
+				node->hj_JoinState = HJ_HANDLE_NEW_OUTER;

 				/* FALL THRU */

-			case HJ_NEED_NEW_OUTER:
-
-				/*
-				 * We don't have an outer tuple, try to get the next one
+			case HJ_HANDLE_NEW_OUTER:
+				/* Handle new outer tuple, either from temp files or nodes
+				 * below. It can be NULL, which means the end of batch.
+				 * hashvalue must be set at this moment, and the tuple must
+				 * be in 'slot' variable.
 				 */
-				outerTupleSlot = ExecHashJoinOuterGetTuple(outerNode,
-														   node,
-														   &hashvalue);
-				if (TupIsNull(outerTupleSlot))
+
+				if (TupIsNull(slot))
 				{
 					/* end of batch, or maybe whole join */
 					if (HJ_FILL_INNER(node))
@@ -206,7 +183,7 @@ ExecHashJoin(HashJoinState *node)
 					continue;
 				}

-				econtext->ecxt_outertuple = outerTupleSlot;
+				econtext->ecxt_outertuple = slot;
 				node->hj_MatchedOuter = false;

 				/*
@@ -232,14 +209,18 @@ ExecHashJoin(HashJoinState *node)
 					 * Save it in the corresponding outer-batch file.
 					 */
 					Assert(batchno > hashtable->curbatch);
-					ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
+					ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(slot),
 										  hashvalue,
-										&hashtable->outerBatchFile[batchno]);
-					/* Loop around, staying in HJ_NEED_NEW_OUTER state */
-					continue;
+										  &hashtable->outerBatchFile[batchno]);
+					/* In fact, this can only happen while we are processing
+					 * the first batch, so we just wait for the new outer
+					 * tuple
+					 */
+					node->hj_JoinState = HJ_WAITING_FOR_NEW_OUTER;
+					return true;
 				}

-				/* OK, let's scan the bucket for matches */
+				/* OK, let's scan this bucket for matches with this tuple */
 				node->hj_JoinState = HJ_SCAN_BUCKET;

 				/* FALL THRU */
@@ -254,55 +235,14 @@ ExecHashJoin(HashJoinState *node)
 				CHECK_FOR_INTERRUPTS();

 				/*
-				 * Scan the selected hash bucket for matches to current outer
+				 * Push all matching tuples from selected hash bucket
 				 */
-				if (!ExecScanHashBucket(node, econtext))
-				{
-					/* out of matches; check for possible outer-join fill */
-					node->hj_JoinState = HJ_FILL_OUTER_TUPLE;
-					continue;
-				}
+				if (!ExecScanHashBucketAndPush(node, econtext))
+					return false;

-				/*
-				 * We've got a match, but still need to test non-hashed quals.
-				 * ExecScanHashBucket already set up all the state needed to
-				 * call ExecQual.
-				 *
-				 * If we pass the qual, then save state for next call and have
-				 * ExecProject form the projection, store it in the tuple
-				 * table, and return the slot.
-				 *
-				 * Only the joinquals determine tuple match status, but all
-				 * quals must pass to actually return the tuple.
-				 */
-				if (joinqual == NIL || ExecQual(joinqual, econtext, false))
-				{
-					node->hj_MatchedOuter = true;
-					HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
+				node->hj_JoinState = HJ_FILL_OUTER_TUPLE;

-					/* In an antijoin, we never return a matched tuple */
-					if (node->js.jointype == JOIN_ANTI)
-					{
-						node->hj_JoinState = HJ_NEED_NEW_OUTER;
-						continue;
-					}
-
-					/*
-					 * In a semijoin, we'll consider returning the first
-					 * match, but after that we're done with this outer tuple.
-					 */
-					if (node->js.jointype == JOIN_SEMI)
-						node->hj_JoinState = HJ_NEED_NEW_OUTER;
-
-					if (otherqual == NIL ||
-						ExecQual(otherqual, econtext, false))
-						return ExecProject(node->js.ps.ps_ProjInfo);
-					else
-						InstrCountFiltered2(node, 1);
-				}
-				else
-					InstrCountFiltered1(node, 1);
-				break;
+				/* FALL THRU */

 			case HJ_FILL_OUTER_TUPLE:

@@ -313,20 +253,16 @@ ExecHashJoin(HashJoinState *node)
 				 */
 				node->hj_JoinState = HJ_NEED_NEW_OUTER;

-				if (!node->hj_MatchedOuter &&
-					HJ_FILL_OUTER(node))
+				if (!node->hj_MatchedOuter && HJ_FILL_OUTER(node))
 				{
 					/*
 					 * Generate a fake join tuple with nulls for the inner
-					 * tuple, and return it if it passes the non-join quals.
+					 * tuple, and push it if it passes the non-join quals.
 					 */
 					econtext->ecxt_innertuple = node->hj_NullInnerTupleSlot;

-					if (otherqual == NIL ||
-						ExecQual(otherqual, econtext, false))
-						return ExecProject(node->js.ps.ps_ProjInfo);
-					else
-						InstrCountFiltered2(node, 1);
+					if (!CheckOtherQualAndPush(node))
+						return false;
 				}
 				break;

@@ -337,24 +273,10 @@ ExecHashJoin(HashJoinState *node)
 				 * so any unmatched inner tuples in the hashtable have to be
 				 * emitted before we continue to the next batch.
 				 */
-				if (!ExecScanHashTableForUnmatched(node, econtext))
-				{
-					/* no more unmatched tuples */
-					node->hj_JoinState = HJ_NEED_NEW_BATCH;
-					continue;
-				}
-
-				/*
-				 * Generate a fake join tuple with nulls for the outer tuple,
-				 * and return it if it passes the non-join quals.
-				 */
-				econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
+				if (!ExecScanHashTableForUnmatchedAndPush(node, econtext))
+					return false;

-				if (otherqual == NIL ||
-					ExecQual(otherqual, econtext, false))
-					return ExecProject(node->js.ps.ps_ProjInfo);
-				else
-					InstrCountFiltered2(node, 1);
+				node->hj_JoinState = HJ_NEED_NEW_BATCH;
 				break;

 			case HJ_NEED_NEW_BATCH:
@@ -363,17 +285,43 @@ ExecHashJoin(HashJoinState *node)
 				 * Try to advance to next batch.  Done if there are no more.
 				 */
 				if (!ExecHashJoinNewBatch(node))
-					return NULL;	/* end of join */
+				{
+					/* let parent know that we are done */
+					pushTuple(NULL, node->js.ps.parent, (PlanState *) node);
+					return false;	/* end of join */
+				}
 				node->hj_JoinState = HJ_NEED_NEW_OUTER;
 				break;

-			default:
-				elog(ERROR, "unrecognized hashjoin state: %d",
-					 (int) node->hj_JoinState);
 		}
 	}
 }

+/*
+ * Get next outer tuple from saved temp files. We are never processing the
+ * first batch if we are here. On success, the tuple's hash value is stored at
+ * *hashvalue, re-read from the temp file.
+ * Returns NULL on the end of batch, a tuple otherwise.
+ */
+static TupleTableSlot *TakeOuterFromTempFile(HashJoinState *hjstate,
+											 uint32 *hashvalue)
+{
+	HashJoinTable hashtable = hjstate->hj_HashTable;
+	int			curbatch = hashtable->curbatch;
+	BufFile    *file = hashtable->outerBatchFile[curbatch];
+
+	/*
+	 * In outer-join cases, we could get here even though the batch file
+	 * is empty.
+	 */
+	if (file == NULL)
+		return NULL;
+
+	return ExecHashJoinGetSavedTuple(file,
+									 hashvalue,
+									 hjstate->hj_OuterTupleSlot);
+}
+
 /* ----------------------------------------------------------------
  *		ExecInitHashJoin
  *
@@ -381,7 +329,7 @@ ExecHashJoin(HashJoinState *node)
  * ----------------------------------------------------------------
  */
 HashJoinState *
-ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
+ExecInitHashJoin(HashJoin *node, EState *estate, int eflags, PlanState *parent)
 {
 	HashJoinState *hjstate;
 	Plan	   *outerNode;
@@ -400,6 +348,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate, int eflags)
 	hjstate = makeNode(HashJoinState);
 	hjstate->js.ps.plan = (Plan *) node;
 	hjstate->js.ps.state = estate;
+	hjstate->js.ps.parent = parent;

 	/*
 	 * Miscellaneous initialization
@@ -579,89 +528,6 @@ ExecEndHashJoin(HashJoinState *node)
 }

 /*
- * ExecHashJoinOuterGetTuple
- *
- *		get the next outer tuple for hashjoin: either by
- *		executing the outer plan node in the first pass, or from
- *		the temp files for the hashjoin batches.
- *
- * Returns a null slot if no more outer tuples (within the current batch).
- *
- * On success, the tuple's hash value is stored at *hashvalue --- this is
- * either originally computed, or re-read from the temp file.
- */
-static TupleTableSlot *
-ExecHashJoinOuterGetTuple(PlanState *outerNode,
-						  HashJoinState *hjstate,
-						  uint32 *hashvalue)
-{
-	HashJoinTable hashtable = hjstate->hj_HashTable;
-	int			curbatch = hashtable->curbatch;
-	TupleTableSlot *slot;
-
-	if (curbatch == 0)			/* if it is the first pass */
-	{
-		/*
-		 * Check to see if first outer tuple was already fetched by
-		 * ExecHashJoin() and not used yet.
-		 */
-		slot = hjstate->hj_FirstOuterTupleSlot;
-		if (!TupIsNull(slot))
-			hjstate->hj_FirstOuterTupleSlot = NULL;
-		else
-			slot = ExecProcNode(outerNode);
-
-		while (!TupIsNull(slot))
-		{
-			/*
-			 * We have to compute the tuple's hash value.
-			 */
-			ExprContext *econtext = hjstate->js.ps.ps_ExprContext;
-
-			econtext->ecxt_outertuple = slot;
-			if (ExecHashGetHashValue(hashtable, econtext,
-									 hjstate->hj_OuterHashKeys,
-									 true,		/* outer tuple */
-									 HJ_FILL_OUTER(hjstate),
-									 hashvalue))
-			{
-				/* remember outer relation is not empty for possible rescan */
-				hjstate->hj_OuterNotEmpty = true;
-
-				return slot;
-			}
-
-			/*
-			 * That tuple couldn't match because of a NULL, so discard it and
-			 * continue with the next one.
-			 */
-			slot = ExecProcNode(outerNode);
-		}
-	}
-	else if (curbatch < hashtable->nbatch)
-	{
-		BufFile    *file = hashtable->outerBatchFile[curbatch];
-
-		/*
-		 * In outer-join cases, we could get here even though the batch file
-		 * is empty.
-		 */
-		if (file == NULL)
-			return NULL;
-
-		slot = ExecHashJoinGetSavedTuple(hjstate,
-										 file,
-										 hashvalue,
-										 hjstate->hj_OuterTupleSlot);
-		if (!TupIsNull(slot))
-			return slot;
-	}
-
-	/* End of this batch */
-	return NULL;
-}
-
-/*
  * ExecHashJoinNewBatch
  *		switch to a new hashjoin batch
  *
@@ -769,8 +635,7 @@ ExecHashJoinNewBatch(HashJoinState *hjstate)
 					(errcode_for_file_access(),
 				   errmsg("could not rewind hash-join temporary file: %m")));

-		while ((slot = ExecHashJoinGetSavedTuple(hjstate,
-												 innerFile,
+		while ((slot = ExecHashJoinGetSavedTuple(innerFile,
 												 &hashvalue,
 												 hjstate->hj_HashTupleSlot)))
 		{
@@ -849,8 +714,7 @@ ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
  * itself is stored in the given slot.
  */
 static TupleTableSlot *
-ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
-						  BufFile *file,
+ExecHashJoinGetSavedTuple(BufFile *file,
 						  uint32 *hashvalue,
 						  TupleTableSlot *tupleSlot)
 {
@@ -893,7 +757,6 @@ ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 	return ExecStoreMinimalTuple(tuple, tupleSlot, true);
 }

-
 void
 ExecReScanHashJoin(HashJoinState *node)
 {
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index fe5c2642d7..1ac95a20fd 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -16,8 +16,9 @@

 #include "nodes/execnodes.h"

-extern HashState *ExecInitHash(Hash *node, EState *estate, int eflags);
-extern TupleTableSlot *ExecHash(HashState *node);
+extern HashState *ExecInitHash(Hash *node, EState *estate, int eflags,
+							   PlanState* parent);
+extern bool pushTupleToHash(TupleTableSlot *slot, HashState *node);
 extern Node *MultiExecHash(HashState *node);
 extern void ExecEndHash(HashState *node);
 extern void ExecReScanHash(HashState *node);
@@ -39,9 +40,13 @@ extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 						  int *bucketno,
 						  int *batchno);
 extern bool ExecScanHashBucket(HashJoinState *hjstate, ExprContext *econtext);
+extern bool ExecScanHashBucketAndPush(HashJoinState *hjstate,
+									  ExprContext *econtext);
 extern void ExecPrepHashTableForUnmatched(HashJoinState *hjstate);
 extern bool ExecScanHashTableForUnmatched(HashJoinState *hjstate,
 							  ExprContext *econtext);
+extern bool ExecScanHashTableForUnmatchedAndPush(HashJoinState *hjstate,
+							  ExprContext *econtext);
 extern void ExecHashTableReset(HashJoinTable hashtable);
 extern void ExecHashTableResetMatchFlags(HashJoinTable hashtable);
 extern void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
diff --git a/src/include/executor/nodeHashjoin.h b/src/include/executor/nodeHashjoin.h
index ddc32b1de3..8b3b88917c 100644
--- a/src/include/executor/nodeHashjoin.h
+++ b/src/include/executor/nodeHashjoin.h
@@ -16,13 +16,107 @@

 #include "nodes/execnodes.h"
 #include "storage/buffile.h"
+#include "executor/executor.h"
+#include "executor/hashjoin.h"
+#include "access/htup_details.h"
+#include "utils/memutils.h"

-extern HashJoinState *ExecInitHashJoin(HashJoin *node, EState *estate, int eflags);
-extern TupleTableSlot *ExecHashJoin(HashJoinState *node);
+/* Returns true if doing null-fill on outer relation */
+#define HJ_FILL_OUTER(hjstate)	((hjstate)->hj_NullInnerTupleSlot != NULL)
+/* Returns true if doing null-fill on inner relation */
+#define HJ_FILL_INNER(hjstate)	((hjstate)->hj_NullOuterTupleSlot != NULL)
+
+extern HashJoinState *ExecInitHashJoin(HashJoin *node, EState *estate,
+									   int eflags, PlanState *parent);
+extern bool pushTupleToHashJoinFromInner(TupleTableSlot *slot,
+								  HashJoinState *node);
+extern bool pushTupleToHashJoinFromOuter(TupleTableSlot *slot,
+										 HashJoinState *node);
 extern void ExecEndHashJoin(HashJoinState *node);
 extern void ExecReScanHashJoin(HashJoinState *node);

 extern void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
 					  BufFile **fileptr);

-#endif   /* NODEHASHJOIN_H */
+/* Inline function declarations and definitions (NOTE(review): the -Winline pragma below is never popped) */
+#pragma GCC diagnostic warning "-Winline"
+static inline bool CheckOtherQualAndPush(HashJoinState *node);
+static inline bool PushUnmatched(HashJoinState *node);
+static inline bool CheckJoinQualAndPush(HashJoinState *node);
+
+/*
+ * The expression context is already set up: check otherqual (js.ps.qual) and,
+ * if it is empty or passes, project the join tuple and push it to the parent.
+ * A tuple rejected by otherqual is only counted via InstrCountFiltered2().
+ * Returns false if the parent accepts no more tuples, true otherwise.
+ */
+static inline bool CheckOtherQualAndPush(HashJoinState *node)
+{
+	ExprContext *econtext = node->js.ps.ps_ExprContext;
+	List *otherqual = node->js.ps.qual;
+	TupleTableSlot *slot;
+
+	if (otherqual == NIL ||
+		ExecQual(otherqual, econtext, false))
+	{
+		slot = ExecProject(node->js.ps.ps_ProjInfo);
+		return pushTuple(slot, node->js.ps.parent, (PlanState *) node);
+	}
+	else
+		InstrCountFiltered2(node, 1);
+	return true;
+}
+
+/*
+ * Push an inner tuple that had no match.  The state needed for ExecQual was
+ * prepared by ExecScanHashTableForUnmatchedAndPush.
+ *
+ * Returns true if parent accepts more tuples, false otherwise.
+ */
+static inline bool PushUnmatched(HashJoinState *node)
+{
+	ExprContext *econtext = node->js.ps.ps_ExprContext;
+	/*
+	 * Reset per-tuple memory context to free any expression evaluation
+	 * storage.
+	 */
+	ResetExprContext(econtext);
+
+	/*
+	 * Generate a fake join tuple with nulls for the outer tuple,
+	 * and push it if it passes the non-join quals.
+	 */
+	econtext->ecxt_outertuple = node->hj_NullOuterTupleSlot;
+	return CheckOtherQualAndPush(node);
+}
+
+/*
+ * We have found an inner tuple whose hashed quals match the current outer
+ * tuple.  Now check the non-hashed join quals and the other quals, then
+ * project and push the result.
+ *
+ * State for ExecQual was already set up by ExecScanHashBucketAndPush or earlier.
+ * Returns true if parent accepts more tuples, false otherwise.
+ */
+static inline bool CheckJoinQualAndPush(HashJoinState *node)
+{
+	List	   *joinqual = node->js.joinqual;
+	ExprContext *econtext = node->js.ps.ps_ExprContext;
+
+	/*
+	 * Only the joinquals determine tuple match status (both sides are marked
+	 * as matched below), but all quals must pass to actually push the tuple.
+	 */
+	if (joinqual == NIL || ExecQual(joinqual, econtext, false))
+	{
+		node->hj_MatchedOuter = true;
+		HeapTupleHeaderSetMatch(HJTUPLE_MINTUPLE(node->hj_CurTuple));
+		return CheckOtherQualAndPush(node);
+	}
+	else
+		InstrCountFiltered1(node, 1);
+
+	return true;
+}
+
+#endif	 /* NODEHASHJOIN_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index da7fd9c7ac..abbe67ba0c 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -2145,6 +2145,8 @@ typedef struct HashState
 	HashJoinTable hashtable;	/* hash table for the hashjoin */
 	List	   *hashkeys;		/* list of ExprState nodes */
 	/* hashkeys is same as parent's hj_InnerHashKeys */
+	/* on the first push we must build the hashtable */
+	bool first_time_through;
 } HashState;

 /* ----------------
-- 
2.11.0