diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index 0d9663c..db3a953 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1900,18 +1900,20 @@ show_hash_info(HashState *hashstate, ExplainState *es) if (es->format != EXPLAIN_FORMAT_TEXT) { ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es); + ExplainPropertyLong("Original Hash Buckets", + hashtable->nbuckets_original, es); ExplainPropertyLong("Hash Batches", hashtable->nbatch, es); ExplainPropertyLong("Original Hash Batches", hashtable->nbatch_original, es); ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es); } - else if (hashtable->nbatch_original != hashtable->nbatch) + else if ((hashtable->nbatch_original != hashtable->nbatch) || (hashtable->nbuckets_original != hashtable->nbuckets)) { appendStringInfoSpaces(es->str, es->indent * 2); appendStringInfo(es->str, - "Buckets: %d Batches: %d (originally %d) Memory Usage: %ldkB\n", - hashtable->nbuckets, hashtable->nbatch, - hashtable->nbatch_original, spacePeakKb); + "Buckets: %d (originally %d) Batches: %d (originally %d) Memory Usage: %ldkB\n", + hashtable->nbuckets, hashtable->nbuckets_original, + hashtable->nbatch, hashtable->nbatch_original, spacePeakKb); } else { diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c index 589b2f1..879b336 100644 --- a/src/backend/executor/nodeHash.c +++ b/src/backend/executor/nodeHash.c @@ -39,6 +39,7 @@ static void ExecHashIncreaseNumBatches(HashJoinTable hashtable); +static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable); static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node, int mcvsToUse); static void ExecHashSkewTableInsert(HashJoinTable hashtable, @@ -271,6 +272,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls) */ hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData)); hashtable->nbuckets = nbuckets; + hashtable->nbuckets_original = nbuckets; hashtable->log2_nbuckets = log2_nbuckets; hashtable->buckets = NULL; hashtable->keepNulls = keepNulls; @@ -386,6 +388,22 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls) /* Target bucket loading (tuples per bucket) */ #define NTUP_PER_BUCKET 10 +/* Multiple of NTUP_PER_BUCKET triggering the increase of nbuckets. + * + * Once we reach the threshold we double the number of buckets, and we + * want to get 1.0 on average (to get NTUP_PER_BUCKET on average). That + * means these two equations should hold + * + * b = 2a (growth) + * (a + b)/2 = 1 (oscillate around NTUP_PER_BUCKET) + * + * which means b=1.3333 (a = b/2). If we wanted higher threshold, we + * could grow the nbuckets to (4*nbuckets), thus using (b=4a) for + * growth, leading to (b=1.6). Or (b=8a) giving 1.7777 etc. + * + * Let's start with doubling the bucket count, i.e. 1.333. */ +#define NTUP_GROW_THRESHOLD 1.333 + void ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew, int *numbuckets, @@ -682,6 +700,92 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable) } /* + * ExecHashIncreaseNumBuckets + * increase the original number of buckets in order to reduce + * number of tuples per bucket + */ +static void +ExecHashIncreaseNumBuckets(HashJoinTable hashtable) +{ + int i; + int ntuples = 0; + int oldnbuckets = hashtable->nbuckets; + HashJoinTuple *oldbuckets = hashtable->buckets; + MemoryContext oldcxt; + + /* XXX Not sure if we should update the info about used space here. + * The code seems to ignore the space used for 'buckets' and we're not + * allocating more space for tuples (just shuffling them to the new + * buckets). And the amount of memory used for buckets is quite small + * (just an array of pointers, thus ~8kB per 1k buckets on 64-bit). */ + + /* XXX Should we disable growth if (nbuckets * NTUP_PER_BUCKET) + * reaches work_mem (or something like that)? We shouldn't really + * get into such position (should be handled by increasing the + * number of batches, which is called right before this). */ + + /* XXX Maybe adding info into hashjoin explain output (e.g. initial + * nbuckets, time spent growing the table) would be appropriate. */ + + /* update the hashtable info, so that we can compute buckets etc. */ + hashtable->log2_nbuckets += 1; + hashtable->nbuckets *= 2; + + Assert(hashtable->nbuckets > 1); + Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets)); + +#ifdef HJDEBUG + printf("Increasing nbuckets to %d\n", hashtable->nbuckets); +#endif + + /* TODO Maybe it'd be better to resize the buckets in place (should be possible, + * but when I tried it I always ended up with a strange infinite loop). */ + + /* allocate a new bucket list (use the batch context as before) */ + oldcxt = MemoryContextSwitchTo(hashtable->batchCxt); + + hashtable->buckets = (HashJoinTuple *) palloc0(hashtable->nbuckets * sizeof(HashJoinTuple)); + + MemoryContextSwitchTo(oldcxt); + + /* walk through the old buckets, move the buckets into the new table */ + for (i = 0; i < oldnbuckets; i++) + { + + HashJoinTuple tuple = oldbuckets[i]; + + while (tuple != NULL) + { + /* save link in case we delete */ + HashJoinTuple nexttuple = tuple->next; + int bucketno; + int batchno; + + ExecHashGetBucketAndBatch(hashtable, tuple->hashvalue, + &bucketno, &batchno); + + /* move it to the correct bucket */ + tuple->next = hashtable->buckets[bucketno]; + hashtable->buckets[bucketno] = tuple; + + /* process the next tuple */ + tuple = nexttuple; + + ntuples++; + } + } + + pfree(oldbuckets); + +#ifdef HJDEBUG + printf("Nbuckets increased to %d, average items per bucket %.1f\n", + hashtable->nbuckets, (float)ntuples / hashtable->nbuckets); +#endif + +} + + +/* * ExecHashTableInsert * insert a tuple into the hash table depending on the hash value * it may just go to a temp file for later batches @@ -740,6 +844,22 @@ ExecHashTableInsert(HashJoinTable hashtable, hashtable->spacePeak = hashtable->spaceUsed; if (hashtable->spaceUsed > hashtable->spaceAllowed) ExecHashIncreaseNumBatches(hashtable); + + /* check average number of tuples per bucket, increase (2*nbuckets) if needed */ + /* FIXME This attempts to compute number of tuples in the hashtable using hashTupleSize, + * i.e. the size of the current tuple, which is not really accurate. Need to track + * the count properly. + */ + if (hashtable->spaceUsed / hashTupleSize / hashtable->nbuckets > NTUP_GROW_THRESHOLD * NTUP_PER_BUCKET) { + +#ifdef HJDEBUG + printf("Increasing nbucket to %d because average per bucket = %d\n", + nbuckets, hashtable->spaceUsed / hashTupleSize / hashtable->nbuckets); +#endif + + ExecHashIncreaseNumBuckets(hashtable); + } + } else { diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h index 3beae40..03e91a6 100644 --- a/src/include/executor/hashjoin.h +++ b/src/include/executor/hashjoin.h @@ -106,6 +106,7 @@ typedef struct HashSkewBucket typedef struct HashJoinTableData { int nbuckets; /* # buckets in the in-memory hash table */ + int nbuckets_original; /* # buckets when starting the first hash */ int log2_nbuckets; /* its log2 (nbuckets must be a power of 2) */ /* buckets[i] is head of list of tuples in i'th in-memory bucket */