diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 781a736..ee3fffb 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -1900,18 +1900,20 @@ show_hash_info(HashState *hashstate, ExplainState *es)
 	if (es->format != EXPLAIN_FORMAT_TEXT)
 	{
 		ExplainPropertyLong("Hash Buckets", hashtable->nbuckets, es);
+		ExplainPropertyLong("Original Hash Buckets",
+							hashtable->nbuckets_original, es);
 		ExplainPropertyLong("Hash Batches", hashtable->nbatch, es);
 		ExplainPropertyLong("Original Hash Batches",
 							hashtable->nbatch_original, es);
 		ExplainPropertyLong("Peak Memory Usage", spacePeakKb, es);
 	}
-	else if (hashtable->nbatch_original != hashtable->nbatch)
+	else if ((hashtable->nbatch_original != hashtable->nbatch) || (hashtable->nbuckets_original != hashtable->nbuckets))
 	{
 		appendStringInfoSpaces(es->str, es->indent * 2);
 		appendStringInfo(es->str,
-						 "Buckets: %d  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
-						 hashtable->nbuckets, hashtable->nbatch,
-						 hashtable->nbatch_original, spacePeakKb);
+						 "Buckets: %d (originally %d)  Batches: %d (originally %d)  Memory Usage: %ldkB\n",
+						 hashtable->nbuckets, hashtable->nbuckets_original,
+						 hashtable->nbatch, hashtable->nbatch_original, spacePeakKb);
 	}
 	else
 	{
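With the text-format change above, a hash join that grew either its bucket count or its batch count at run time now reports both the final and the original values. An illustrative line (numbers invented, not from a real run):

    Buckets: 131072 (originally 65536)  Batches: 8 (originally 4)  Memory Usage: 8214kB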
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 6a9d956..014b6f1 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -37,8 +37,10 @@
 #include "utils/lsyscache.h"
 #include "utils/syscache.h"
 
+bool		enable_hash_resize = true;
 
 static void ExecHashIncreaseNumBatches(HashJoinTable hashtable);
+static void ExecHashIncreaseNumBuckets(HashJoinTable hashtable);
 static void ExecHashBuildSkewHash(HashJoinTable hashtable, Hash *node,
 					  int mcvsToUse);
 static void ExecHashSkewTableInsert(HashJoinTable hashtable,
@@ -49,6 +51,9 @@ static void ExecHashRemoveNextSkewBucket(HashJoinTable hashtable);
 
 static char * chunk_alloc(HashJoinTable hashtable, int tupleSize);
 
+/* Memory needed for optimal number of buckets. */
+#define BUCKETS_SPACE(htab)	((htab)->nbuckets_optimal * sizeof(void *))
+
 /* ----------------------------------------------------------------
  *		ExecHash
  *
@@ -117,6 +122,7 @@ MultiExecHash(HashState *node)
 			/* It's a skew tuple, so put it into that hash table */
 			ExecHashSkewTableInsert(hashtable, slot, hashvalue,
 									bucketNumber);
+			hashtable->skewTuples += 1;
 		}
 		else
 		{
@@ -127,6 +133,30 @@ MultiExecHash(HashState *node)
 		}
 	}
 
+	/*
+	 * If the average number of tuples per bucket is over the defined
+	 * threshold, increase the number of buckets to get below it.
+	 */
+	if (enable_hash_resize)
+	{
+		/*
+		 * Do we actually need to resize? Only scale up, don't scale down
+		 * (for example after adding a batch, it's possible that
+		 * nbuckets > nbuckets_optimal).
+		 */
+		if (hashtable->nbuckets < hashtable->nbuckets_optimal)
+		{
+#ifdef HJDEBUG
+			printf("Increasing nbuckets %d => %d\n",
+				   hashtable->nbuckets, hashtable->nbuckets_optimal);
+#endif
+			ExecHashIncreaseNumBuckets(hashtable);
+		}
+	}
+
+	/* Account for the buckets in spaceUsed (reported in EXPLAIN ANALYZE) */
+	hashtable->spaceUsed += BUCKETS_SPACE(hashtable);
+	if (hashtable->spaceUsed > hashtable->spacePeak)
+		hashtable->spacePeak = hashtable->spaceUsed;
+
 	/* must provide our own instrumentation support */
 	if (node->ps.instrument)
 		InstrStopNode(node->ps.instrument, hashtable->totalTuples);
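The resize rule is easier to see in isolation. Below is a minimal standalone sketch of the doubling logic behind nbuckets_optimal; the helper name choose_nbuckets, the main() driver and its constants are hypothetical, only the NTUP_PER_BUCKET load-factor test mirrors the patch:

    #include <limits.h>
    #include <stdio.h>

    #define NTUP_PER_BUCKET 1	/* target load factor, as in the patch */

    /* hypothetical helper: double the bucket count until the expected
     * number of tuples per bucket drops to NTUP_PER_BUCKET */
    static int
    choose_nbuckets(double ntuples, int nbuckets)
    {
        while ((ntuples / nbuckets > NTUP_PER_BUCKET) &&
               (nbuckets <= INT_MAX / 2))
            nbuckets *= 2;
        return nbuckets;
    }

    int
    main(void)
    {
        /* 1M tuples into 1024 initial buckets => prints 1048576 (2^20) */
        printf("%d\n", choose_nbuckets(1000000.0, 1024));
        return 0;
    }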
@@ -239,6 +269,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	int			nbatch;
 	int			num_skew_mcvs;
 	int			log2_nbuckets;
+	int			log2_nbatch;
 	int			nkeys;
 	int			i;
 	ListCell   *ho;
@@ -263,6 +294,10 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	log2_nbuckets = my_log2(nbuckets);
 	Assert(nbuckets == (1 << log2_nbuckets));
 
+	/* nbatch must be a power of 2 */
+	log2_nbatch = my_log2(nbatch);
+	Assert(nbatch == (1 << log2_nbatch));
+
 	/*
 	 * Initialize the hash table control block.
 	 *
@@ -271,6 +306,8 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	 */
 	hashtable = (HashJoinTable) palloc(sizeof(HashJoinTableData));
 	hashtable->nbuckets = nbuckets;
+	hashtable->nbuckets_original = nbuckets;
+	hashtable->nbuckets_optimal = nbuckets;
 	hashtable->log2_nbuckets = log2_nbuckets;
 	hashtable->buckets = NULL;
 	hashtable->keepNulls = keepNulls;
@@ -281,10 +318,12 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	hashtable->skewBucketNums = NULL;
 	hashtable->nbatch = nbatch;
 	hashtable->curbatch = 0;
+	hashtable->log2_nbatch = log2_nbatch;
 	hashtable->nbatch_original = nbatch;
 	hashtable->nbatch_outstart = nbatch;
 	hashtable->growEnabled = true;
 	hashtable->totalTuples = 0;
+	hashtable->skewTuples = 0;
 	hashtable->innerBatchFile = NULL;
 	hashtable->outerBatchFile = NULL;
 	hashtable->spaceUsed = 0;
@@ -377,7 +416,6 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
 	return hashtable;
 }
 
-
 /*
  * Compute appropriate size for hashtable given the estimated size of the
  * relation to be hashed (number of rows and average row width).
@@ -386,7 +424,7 @@ ExecHashTableCreate(Hash *node, List *hashOperators, bool keepNulls)
  */
 
 /* Target bucket loading (tuples per bucket) */
-#define NTUP_PER_BUCKET			10
+#define NTUP_PER_BUCKET			1
 
 void
 ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
@@ -396,6 +434,7 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 {
 	int			tupsize;
 	double		inner_rel_bytes;
+	double		buckets_bytes;
 	long		hash_table_bytes;
 	long		skew_table_bytes;
 	long		max_pointers;
@@ -417,6 +456,9 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 		MAXALIGN(tupwidth);
 	inner_rel_bytes = ntuples * tupsize;
 
+	/* Estimate memory needed for buckets, with the target NTUP_PER_BUCKET */
+	buckets_bytes = sizeof(void *) * (1L << my_log2(ntuples / NTUP_PER_BUCKET));
+
 	/*
 	 * Target in-memory hashtable size is work_mem kilobytes.
 	 */
@@ -468,23 +510,48 @@ ExecChooseHashTableSize(double ntuples, int tupwidth, bool useskew,
 	/* also ensure we avoid integer overflow in nbatch and nbuckets */
 	max_pointers = Min(max_pointers, INT_MAX / 2);
 
-	if (inner_rel_bytes > hash_table_bytes)
+	if ((inner_rel_bytes + buckets_bytes) > hash_table_bytes)
 	{
 		/* We'll need multiple batches */
 		long		lbuckets;
 		double		dbatch;
 		int			minbatch;
-
-		lbuckets = (hash_table_bytes / tupsize) / NTUP_PER_BUCKET;
-		lbuckets = Min(lbuckets, max_pointers);
-		nbuckets = (int) lbuckets;
+		double		batch_bytes;
 
 		dbatch = ceil(inner_rel_bytes / hash_table_bytes);
 		dbatch = Min(dbatch, max_pointers);
 		minbatch = (int) dbatch;
 		nbatch = 2;
+
+		/* increase nbatch until the hash table fits into work_mem */
 		while (nbatch < minbatch)
 			nbatch <<= 1;
+
+		/* now check that the bucket array fits into work_mem too */
+		lbuckets = 1L << my_log2(ntuples / nbatch / NTUP_PER_BUCKET);
+
+		/* assuming equally-sized batches, compute bytes of hash table and buckets */
+		batch_bytes = inner_rel_bytes / nbatch;
+		buckets_bytes = lbuckets * sizeof(void *);
+
+		/* we need to fit both into work_mem */
+		while (batch_bytes + buckets_bytes > hash_table_bytes)
+		{
+			/* double the number of batches, and re-evaluate the memory needs */
+			nbatch <<= 1;
+
+			lbuckets = 1L << my_log2(ntuples / nbatch / NTUP_PER_BUCKET);
+
+			batch_bytes = inner_rel_bytes / nbatch;
+			buckets_bytes = lbuckets * sizeof(void *);
+		}
+
+		/* protect against nbuckets overflow */
+		lbuckets = Min(lbuckets, max_pointers);
+		nbuckets = (int) lbuckets;
 	}
 	else
 	{
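For a feel of what the extra loop buys, a worked example with made-up numbers: ntuples = 1,000,000 and tupsize = 64 give inner_rel_bytes of about 64MB; with hash_table_bytes = 16MB the first loop settles on nbatch = 4. The bucket check then computes lbuckets = 2^18 = 262,144 (the next power of two above 1,000,000 / 4), so buckets_bytes is about 2MB on a 64-bit platform while batch_bytes is about 16MB. Together they exceed the 16MB budget, so nbatch doubles to 8, leaving batch_bytes of about 8MB plus buckets_bytes of about 1MB, which fits. The old code ignored the bucket array entirely, would have kept nbatch = 4, and would have overrun work_mem at execution time.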
@@ -561,6 +628,8 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	MemoryContext oldcxt;
 	long		ninmemory;
 	long		nfreed;
+	double		batchTuples;
+
 	HashChunk	oldchunks = hashtable->chunks_batch;
 
 	/* do nothing if we've decided to shut off growth */
@@ -607,6 +676,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 	MemoryContextSwitchTo(oldcxt);
 
 	hashtable->nbatch = nbatch;
+	hashtable->log2_nbatch += 1;
 
 	/*
 	 * Scan through the existing hash table entries and dump out any that are
@@ -700,9 +770,123 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 		printf("Disabling further increase of nbatch\n");
 #endif
 	}
+
+	/*
+	 * After increasing the number of batches, the number of tuples per batch
+	 * is lower (~1/2), so the optimal number of buckets might be lower too.
+	 * It's probably (nbuckets_optimal/2), but let's recompute it from
+	 * scratch, starting at nbuckets_original - we already have that memory
+	 * allocated, so let's use it.
+	 */
+	hashtable->nbuckets_optimal = hashtable->nbuckets_original;
+	batchTuples = (hashtable->totalTuples - hashtable->skewTuples) / hashtable->nbatch;
+	while ((batchTuples / hashtable->nbuckets_optimal > NTUP_PER_BUCKET) &&
+		   (hashtable->nbuckets_optimal <= (INT_MAX / 2)))
+		hashtable->nbuckets_optimal *= 2;
 }
 
 /*
+ * ExecHashIncreaseNumBuckets
+ *		increase the number of buckets in order to reduce the number of
+ *		tuples per bucket
+ */
+static void
+ExecHashIncreaseNumBuckets(HashJoinTable hashtable)
+{
+	HashChunk	chunk;
+
+	/* do nothing if not an increase (it's called increase for a reason) */
+	if (hashtable->nbuckets >= hashtable->nbuckets_optimal)
+		return;
+
+	/*
+	 * We already know the optimal number of buckets, so let's just compute
+	 * the log2_nbuckets for it.
+	 */
+	hashtable->nbuckets = hashtable->nbuckets_optimal;
+	hashtable->log2_nbuckets = my_log2(hashtable->nbuckets_optimal);
+
+	/*
+	 * XXX Not sure if we should update the info about used space here. The
+	 * code seems to ignore the space used for 'buckets' and we're not
+	 * allocating more space for tuples (just shuffling them into the new
+	 * buckets). And the amount of memory used for buckets is quite small
+	 * (just an array of pointers, thus ~8kB per 1k buckets on 64-bit).
+	 */
+
+	/*
+	 * XXX Should we disable growth if (nbuckets * NTUP_PER_BUCKET) reaches
+	 * work_mem (or something like that)? We shouldn't really get into such
+	 * a position (it should be handled by increasing the number of batches,
+	 * which happens right before this).
+	 */
+
+	/*
+	 * XXX Maybe adding info to the hash join EXPLAIN output (e.g. initial
+	 * nbuckets, time spent growing the table) would be appropriate.
+	 */
+
+	Assert(hashtable->nbuckets > 1);
+	Assert(hashtable->nbuckets <= (INT_MAX / 2));
+	Assert(hashtable->nbuckets == (1 << hashtable->log2_nbuckets));
+
+#ifdef HJDEBUG
+	printf("Increasing nbuckets to %d\n", hashtable->nbuckets);
+#endif
+
+	/*
+	 * Just reallocate the proper number of buckets - we don't need to walk
+	 * the buckets themselves, because we can walk the dense-allocated
+	 * chunks instead (just like in ExecHashIncreaseNumBatches, but without
+	 * all the copying into new chunks).
+	 */
+	hashtable->buckets = (HashJoinTuple *) repalloc(hashtable->buckets,
+								hashtable->nbuckets * sizeof(HashJoinTuple));
+
+	/*
+	 * Reset all the bucket heads - every tuple gets reinserted below, and
+	 * the memory returned by repalloc is not zeroed.
+	 */
+	memset(hashtable->buckets, 0, sizeof(void *) * hashtable->nbuckets);
+
+	/* so, let's scan through the chunks, and all tuples in each chunk */
+	chunk = hashtable->chunks_batch;
+	while (chunk != NULL)
+	{
+		/* position within the buffer (up to chunk->used) */
+		size_t		idx = 0;
+
+		/* process all tuples stored in this chunk */
+		while (idx < chunk->used)
+		{
+			/* get the hashjoin tuple and the mintuple stored right after it */
+			HashJoinTuple hashTuple = (HashJoinTuple) (chunk->data + idx);
+			MinimalTuple tuple = HJTUPLE_MINTUPLE(hashTuple);
+
+			int			hashTupleSize = (HJTUPLE_OVERHEAD + tuple->t_len);
+			int			bucketno;
+			int			batchno;
+
+			ExecHashGetBucketAndBatch(hashtable, hashTuple->hashvalue,
+									  &bucketno, &batchno);
+
+			/* just add the tuple to the proper bucket */
+			hashTuple->next = hashtable->buckets[bucketno];
+			hashtable->buckets[bucketno] = hashTuple;
+
+			/* bytes occupied in memory: HJ tuple overhead + actual tuple length */
+			idx += hashTupleSize;
+		}
+
+		/* proceed to the next chunk */
+		chunk = chunk->next;
+	}
+
+#ifdef HJDEBUG
+	printf("Nbuckets increased to %d, average items per bucket %.1f\n",
+		   hashtable->nbuckets,
+		   (hashtable->totalTuples - hashtable->skewTuples) /
+		   hashtable->nbatch / hashtable->nbuckets);
+#endif
+}
+
+
+/*
  * ExecHashTableInsert
  *		insert a tuple into the hash table depending on the hash value
  *		it may just go to a temp file for later batches
@@ -755,12 +939,26 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		hashTuple->next = hashtable->buckets[bucketno];
 		hashtable->buckets[bucketno] = hashTuple;
 
+		/* Increase the optimal number of buckets if we exceeded NTUP_PER_BUCKET. */
+		if (((hashtable->totalTuples - hashtable->skewTuples) / hashtable->nbatch)
+			>= (hashtable->nbuckets_optimal * NTUP_PER_BUCKET))
+		{
+			/* overflow protection */
+			if (hashtable->nbuckets_optimal <= (INT_MAX / 2))
+				hashtable->nbuckets_optimal *= 2;
+		}
+
 		/* Account for space used, and back off if we've used too much */
 		hashtable->spaceUsed += hashTupleSize;
 		if (hashtable->spaceUsed > hashtable->spacePeak)
 			hashtable->spacePeak = hashtable->spaceUsed;
-		if (hashtable->spaceUsed > hashtable->spaceAllowed)
+
+		/*
+		 * Do we need to increase the number of batches? Account for the
+		 * space required by buckets (the optimal number of them).
+		 */
+		if (hashtable->spaceUsed + BUCKETS_SPACE(hashtable) > hashtable->spaceAllowed)
 			ExecHashIncreaseNumBatches(hashtable);
 	}
 	else
 	{
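Since every non-skew tuple lives in the dense chunks_batch chunks, the rebuild above is one linear pass and never follows the old bucket chains. A cut-down standalone model of that walk, for illustration only - Chunk and Tuple here are simplified stand-ins for the patch's HashChunk and HashJoinTuple, and bucket selection is reduced to a plain mask:

    #include <stddef.h>

    typedef struct Tuple
    {
        struct Tuple *next;			/* bucket chain link */
        unsigned int  hashvalue;
        size_t        len;			/* total size of this entry in the chunk */
        /* tuple payload follows */
    } Tuple;

    typedef struct Chunk
    {
        struct Chunk *next;
        size_t        used;
        char          data[1];		/* dense tuple storage */
    } Chunk;

    /* rebuild bucket heads by walking the dense chunks; buckets[] is
     * assumed already resized and zeroed, as in the patch */
    static void
    rebuild_buckets(Chunk *chunks, Tuple **buckets, int nbuckets)
    {
        Chunk *chunk;

        for (chunk = chunks; chunk != NULL; chunk = chunk->next)
        {
            size_t idx = 0;

            while (idx < chunk->used)
            {
                Tuple *tup = (Tuple *) (chunk->data + idx);
                int    bucketno = tup->hashvalue & (nbuckets - 1);

                /* push the tuple onto its (new) bucket chain */
                tup->next = buckets[bucketno];
                buckets[bucketno] = tup;

                idx += tup->len;
            }
        }
    }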
@@ -877,14 +1075,12 @@ ExecHashGetHashValue(HashJoinTable hashtable,
  * chains), and must only cause the batch number to remain the same or
  * increase.  Our algorithm is
  *		bucketno = hashvalue MOD nbuckets
- *		batchno = (hashvalue DIV nbuckets) MOD nbatch
+ *		batchno = highest log2(nbatch) bits of hashvalue
  * where nbuckets and nbatch are both expected to be powers of 2, so we can
  * do the computations by shifting and masking.  (This assumes that all hash
  * functions are good about randomizing all their output bits, else we are
  * likely to have very skewed bucket or batch occupancy.)
  *
- * nbuckets doesn't change over the course of the join.
- *
  * nbatch is always a power of 2; we increase it only by doubling it.  This
- * effectively adds one more bit to the top of the batchno.
+ * effectively adds one more bit at the bottom of the batchno.
  */
@@ -901,7 +1097,7 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 	{
 		/* we can do MOD by masking, DIV by shifting */
 		*bucketno = hashvalue & (nbuckets - 1);
-		*batchno = (hashvalue >> hashtable->log2_nbuckets) & (nbatch - 1);
+		*batchno = (hashvalue >> (32 - hashtable->log2_nbatch));
 	}
 	else
 	{
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 6c52db8..9dae612 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -788,6 +788,15 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 	{
+		{"enable_hash_resize", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables dynamic increase of hash buckets."),
+			NULL
+		},
+		&enable_hash_resize,
+		true,
+		NULL, NULL, NULL
+	},
+	{
 		{"geqo", PGC_USERSET, QUERY_TUNING_GEQO,
 			gettext_noop("Enables genetic query optimization."),
 			gettext_noop("This algorithm attempts to do planning without "
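The consequences of the new batchno computation can be checked with a throwaway program (all values arbitrary): because batchno is taken from the top of the hash value, doubling nbatch appends one low-order bit to batchno, so a tuple's batch number can only stay the same or grow - exactly the invariant the comment above requires.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint32_t hashvalue = 0xDEADBEEF;
        int      log2_nbuckets = 10;	/* nbuckets = 1024 */
        int      log2_nbatch = 2;		/* nbatch = 4 */

        int bucketno = hashvalue & ((1 << log2_nbuckets) - 1);
        int batchno = hashvalue >> (32 - log2_nbatch);			/* top bits */
        int batchno2 = hashvalue >> (32 - (log2_nbatch + 1));	/* after doubling */

        /* prints "bucket 751, batch 3 -> 6": the new batchno is
         * 2*batchno or 2*batchno + 1, never less than batchno */
        printf("bucket %d, batch %d -> %d\n", bucketno, batchno, batchno2);
        return 0;
    }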
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index c80a4d1..9d03de2 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -122,8 +122,11 @@ typedef struct HashChunkData* HashChunk;
 typedef struct HashJoinTableData
 {
 	int			nbuckets;		/* # buckets in the in-memory hash table */
+	int			nbuckets_original;	/* # buckets when starting the first hash */
 	int			log2_nbuckets;	/* its log2 (nbuckets must be a power of 2) */
 
+	int			nbuckets_optimal;	/* optimal # of buckets (per batch) */
+
 	/* buckets[i] is head of list of tuples in i'th in-memory bucket */
 	struct HashJoinTupleData **buckets;
 	/* buckets array is per-batch storage, as are all the tuples */
@@ -138,6 +141,7 @@ typedef struct HashJoinTableData
 
 	int			nbatch;			/* number of batches */
 	int			curbatch;		/* current batch #; 0 during 1st pass */
+	int			log2_nbatch;	/* its log2 (nbatch must be a power of 2) */
 
 	int			nbatch_original;	/* nbatch when we started inner scan */
 	int			nbatch_outstart;	/* nbatch when we started outer scan */
@@ -145,6 +149,7 @@ typedef struct HashJoinTableData
 	bool		growEnabled;	/* flag to shut off nbatch increases */
 
 	double		totalTuples;	/* # tuples obtained from inner plan */
+	double		skewTuples;		/* # tuples inserted into skew buckets */
 
 	/*
 	 * These arrays are allocated for the life of the hash join, but only if
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 75e2afb..e7c5f50 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -62,6 +62,7 @@ extern bool enable_material;
 extern bool enable_mergejoin;
 extern bool enable_hashjoin;
 extern int	constraint_exclusion;
+extern bool enable_hash_resize;
 
 extern double clamp_row_est(double nrows);
 extern double index_pages_fetched(double tuples_fetched, BlockNumber pages,
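Since enable_hash_resize is registered as an ordinary PGC_USERSET boolean next to the other enable_* switches, it should be possible to toggle it per session for before/after comparisons, e.g. SET enable_hash_resize = off to keep the planner-chosen bucket count for the whole join.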