diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 78f15f0..91fcc05 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -265,7 +265,7 @@ InitShmemIndex(void) */ HTAB * ShmemInitHash(const char *name, /* table string name for shmem index */ - long init_size, /* initial table size */ + long init_size, /* initial table size */ // AALEKSEEV: is ignored, refactor! long max_size, /* max size of the table */ HASHCTL *infoP, /* info about key and bucket size */ int hash_flags) /* info about infoP */ @@ -299,7 +299,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ /* Pass location of hashtable header to hash_create */ infoP->hctl = (HASHHDR *) location; - return hash_create(name, init_size, infoP, hash_flags); + return hash_create(name, max_size, infoP, hash_flags); } /* diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index eacffc4..07a46db 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -87,6 +87,8 @@ #include "access/xact.h" #include "storage/shmem.h" #include "storage/spin.h" +#include "storage/lock.h" +#include "storage/lwlock.h" #include "utils/dynahash.h" #include "utils/memutils.h" @@ -118,6 +120,13 @@ typedef HASHELEMENT *HASHBUCKET; /* A hash segment is an array of bucket headers */ typedef HASHBUCKET *HASHSEGMENT; +// AALEKSEEV: TODO: comment, should be power of two +#define GLOBAL_FREE_LIST_PARTITIONS_NUM 4 +#define GLOBAL_FREE_LIST_PARTITIONS_MASK (GLOBAL_FREE_LIST_PARTITIONS_NUM-1) + +// AALEKSEEV: TODO: comment +#define GLOBAL_FREE_LIST_ALLOC_NUM 8 + /* * Header structure for a hash table --- contains all changeable info * @@ -129,11 +138,24 @@ typedef HASHBUCKET *HASHSEGMENT; struct HASHHDR { /* In a partitioned table, take this lock to touch nentries or freeList */ - slock_t mutex; /* unused if not partitioned table */ + slock_t mutex[GLOBAL_FREE_LIST_PARTITIONS_NUM]; /* unused if not 
partitioned table */ + HASHELEMENT * globalFreeList[GLOBAL_FREE_LIST_PARTITIONS_NUM]; + + // AALEKSEEV: fix comments /* These fields change during entry addition/deletion */ - long nentries; /* number of entries in hash table */ - HASHELEMENT *freeList; /* linked list of free elements */ + /* number of entries in hash table */ + long nentries[NUM_LOCK_PARTITIONS]; + + // AALEKSEEV: TODO: comment + long totalAllocated[NUM_LOCK_PARTITIONS]; + + /* linked list of free elements */ + HASHELEMENT *freeList[NUM_LOCK_PARTITIONS]; + + // AALEKSEEV: TODO: comment + HASHELEMENT *freeListTail[NUM_LOCK_PARTITIONS]; + HASHELEMENT *freeListWouldBeTail[NUM_LOCK_PARTITIONS]; /* These fields can change, but not in a partitioned table */ /* Also, dsize can't change in a shared table, even if unpartitioned */ @@ -166,6 +188,9 @@ struct HASHHDR #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0) +// AALEKSEEV: add comment +#define PARTITION_IDX(hctl, hashcode) (IS_PARTITIONED(hctl) ? LockHashPartition(hashcode) : 0) + /* * Top control structure for a hashtable --- in a shared table, each backend * has its own copy (OK since no fields change at runtime) @@ -192,6 +217,28 @@ struct HTAB int sshift; /* segment shift = log2(ssize) */ }; +typedef struct +{ + HASHELEMENT * tail; + uint32 nitems; +#ifdef USE_ASSERT_CHECKING + uint32 magic; +#endif +} FREELISTBATCH; + +#ifdef USE_ASSERT_CHECKING + +#define FREE_LIST_BATCH_MAGIC 0x1234ABCD +#define FREE_LIST_BATCH_SET_MAGIC(batchp) ((batchp)->magic = FREE_LIST_BATCH_MAGIC) +#define FREE_LIST_BATCH_CHECK_MAGIC(batchp) (AssertMacro((batchp)->magic == FREE_LIST_BATCH_MAGIC)) + +#else + +#define FREE_LIST_BATCH_SET_MAGIC(batchp) ((void)true) +#define FREE_LIST_BATCH_CHECK_MAGIC(batchp) ((void)true) + +#endif + /* * Key (also entry) part of a HASHELEMENT */ @@ -219,10 +266,10 @@ static long hash_accesses, */ static void *DynaHashAlloc(Size size); static HASHSEGMENT seg_alloc(HTAB *hashp); -static bool element_alloc(HTAB *hashp, int nelem); 
+static bool element_alloc(HTAB *hashp, int nelem, int glob_part_idx); static bool dir_realloc(HTAB *hashp); static bool expand_table(HTAB *hashp); -static HASHBUCKET get_hash_entry(HTAB *hashp); +static HASHBUCKET get_hash_entry(HTAB *hashp, int partition_idx); static void hdefault(HTAB *hashp); static int choose_nelem_alloc(Size entrysize); static bool init_htab(HTAB *hashp, long nelem); @@ -260,6 +307,197 @@ string_compare(const char *key1, const char *key2, Size keysize) return strncmp(key1, key2, keysize - 1); } +// AALEKSEEV TODO comment +static void +global_free_list_init(HTAB* hashp, int glob_partition_idx) +{ + FREELISTBATCH* batchp; + HASHHDR *hctl = hashp->hctl; + HASHELEMENT* batchHead = hctl->globalFreeList[glob_partition_idx]; + HASHELEMENT* currElement = batchHead; + int elementsInBatch = 0; + + Assert(currElement != NULL); + + for(;;) + { + elementsInBatch++; + + if(currElement->link == NULL) + break; + + if(elementsInBatch == GLOBAL_FREE_LIST_ALLOC_NUM) + { + batchp = (FREELISTBATCH*)ELEMENTKEY(batchHead); + batchp->tail = currElement; + batchp->nitems = elementsInBatch; + FREE_LIST_BATCH_SET_MAGIC(batchp); + + batchHead = currElement->link; + elementsInBatch = 0; + } + + currElement = currElement->link; + } + + batchp = (FREELISTBATCH*)ELEMENTKEY(batchHead); + batchp->tail = currElement; + batchp->nitems = elementsInBatch; + FREE_LIST_BATCH_SET_MAGIC(batchp); +} + +// AALEKSEEV: acquire elements - WE KNOW WE NEED THEM. 
+// return true on success, false on failure +static bool +global_free_list_acquire_from_partition(HTAB* hashp, int partition_idx, int glob_partition_idx) +{ + FREELISTBATCH* batchp; + HASHELEMENT *firstElement; + HASHHDR *hctl = hashp->hctl; + + Assert(IS_PARTITIONED(hctl)); + Assert(hctl->freeList[partition_idx] == NULL); + Assert(hctl->nentries[partition_idx] == hctl->totalAllocated[partition_idx]); + + SpinLockAcquire(&hctl->mutex[glob_partition_idx]); + firstElement = hctl->globalFreeList[glob_partition_idx]; + + if(!firstElement) + { + SpinLockRelease(&hctl->mutex[glob_partition_idx]); + return false; + } + + batchp = (FREELISTBATCH*)ELEMENTKEY(firstElement); + FREE_LIST_BATCH_CHECK_MAGIC(batchp); + + hctl->globalFreeList[glob_partition_idx] = batchp->tail->link; + SpinLockRelease(&hctl->mutex[glob_partition_idx]); + + batchp->tail->link = NULL; + hctl->freeList[partition_idx] = firstElement; + hctl->freeListTail[partition_idx] = batchp->tail; + hctl->freeListWouldBeTail[partition_idx] = NULL; + hctl->totalAllocated[partition_idx] += batchp->nitems; + + return true; +} + +static bool +global_free_list_acquire(HTAB* hashp, int partition_idx) +{ + int idx, + start = partition_idx & GLOBAL_FREE_LIST_PARTITIONS_MASK; + + idx = start; + do { + if(global_free_list_acquire_from_partition(hashp, partition_idx, idx)) + return true; + idx = (idx + 1) & GLOBAL_FREE_LIST_PARTITIONS_MASK; + } while(idx != start); + + return false; +} + +// AALEKSEEV: release elements IF POSSIBLE. 
+static void +global_free_list_try_release(HTAB* hashp, int partition_idx) +{ +#ifdef USE_ASSERT_CHECKING + int nreleased; +#endif + FREELISTBATCH* batchp; + HASHELEMENT *releasedFirst, + *releasedLast; + int global_free_list_part = partition_idx & GLOBAL_FREE_LIST_PARTITIONS_MASK; + + HASHHDR *hctl = hashp->hctl; + + if(!IS_PARTITIONED(hctl)) + return; + + /* keep at least GLOBAL_FREE_LIST_ALLOC_NUM items in each freeList */ + if(hctl->totalAllocated[partition_idx] <= GLOBAL_FREE_LIST_ALLOC_NUM) + return; + + if(hctl->totalAllocated[partition_idx] - hctl->nentries[partition_idx] <= GLOBAL_FREE_LIST_ALLOC_NUM) + return; + + Assert(hctl->freeList[partition_idx] != NULL); + Assert(hctl->freeListWouldBeTail[partition_idx] != NULL); + Assert(hctl->freeListWouldBeTail[partition_idx]->link != NULL); + Assert(hctl->freeListTail[partition_idx] != NULL); + + releasedFirst = hctl->freeListWouldBeTail[partition_idx]->link; + releasedLast = hctl->freeListTail[partition_idx]; + +#ifdef USE_ASSERT_CHECKING + releasedLast = releasedFirst; + nreleased = 1; + while(nreleased < GLOBAL_FREE_LIST_ALLOC_NUM) + { + Assert(releasedLast->link != NULL); + + releasedLast = releasedLast->link; + nreleased++; + } + Assert(releasedLast == hctl->freeListTail[partition_idx]); +#endif + + hctl->totalAllocated[partition_idx] -= GLOBAL_FREE_LIST_ALLOC_NUM; + hctl->freeListTail[partition_idx] = hctl->freeListWouldBeTail[partition_idx]; + hctl->freeListTail[partition_idx]->link = NULL; + hctl->freeListWouldBeTail[partition_idx] = NULL; + + batchp = (FREELISTBATCH*)ELEMENTKEY(releasedFirst); + batchp->tail = releasedLast; + batchp->nitems = GLOBAL_FREE_LIST_ALLOC_NUM; + FREE_LIST_BATCH_SET_MAGIC(batchp); + + SpinLockAcquire(&hctl->mutex[global_free_list_part]); + releasedLast->link = hctl->globalFreeList[global_free_list_part]; + hctl->globalFreeList[global_free_list_part] = releasedFirst; + SpinLockRelease(&hctl->mutex[global_free_list_part]); +} + +/* remove entry from freelist, bump nentries */ 
+static inline HASHELEMENT* +free_list_get(HTAB* hashp, int partition_idx) +{ + HASHHDR *hctl = hashp->hctl; + HASHELEMENT* element = hctl->freeList[partition_idx]; + + Assert(element != NULL); + + hctl->freeList[partition_idx] = element->link; + hctl->nentries[partition_idx]++; + +#ifdef USE_ASSERT_CHECKING + if(hctl->freeListWouldBeTail[partition_idx] == element) + hctl->freeListWouldBeTail[partition_idx] = NULL; + if(element->link == NULL) + hctl->freeListTail[partition_idx] = NULL; +#endif + + return element; +} + +/* add the record to the freelist for this table. */ +static inline void +free_list_put(HTAB* hashp, HASHELEMENT* element, int partition_idx) +{ + HASHHDR *hctl = hashp->hctl; + + if(hctl->freeList[partition_idx] == NULL) + hctl->freeListTail[partition_idx] = element; + + element->link = hctl->freeList[partition_idx]; + hctl->freeList[partition_idx] = element; + hctl->nentries[partition_idx]--; + + if(hctl->totalAllocated[partition_idx] - hctl->nentries[partition_idx] == (GLOBAL_FREE_LIST_ALLOC_NUM+1)) + hctl->freeListWouldBeTail[partition_idx] = element; +} /************************** CREATE ROUTINES **********************/ @@ -282,6 +520,7 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) { HTAB *hashp; HASHHDR *hctl; + int i, nelem_alloc; /* * For shared hash tables, we have a local hash header (HTAB struct) that @@ -408,7 +647,7 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) if (!hashp->hctl) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + errmsg("out of memory (3)"))); // AALEKSEEV: fix string } hashp->frozen = false; @@ -422,6 +661,9 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) /* Doesn't make sense to partition a local hash table */ Assert(flags & HASH_SHARED_MEM); + // AALEKSEEV: comment. For storing links in globalFreeList + Assert(info->entrysize >= sizeof(FREELISTBATCH)); + /* * The number of partitions had better be a power of 2. 
Also, it must * be less than INT_MAX (see init_htab()), so call the int version of @@ -482,10 +724,32 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) if ((flags & HASH_SHARED_MEM) || nelem < hctl->nelem_alloc) { - if (!element_alloc(hashp, (int) nelem)) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + if(IS_PARTITIONED(hashp->hctl)) + { + nelem_alloc = nelem / GLOBAL_FREE_LIST_PARTITIONS_NUM; + if(nelem_alloc == 0) + nelem_alloc = 1; + + Assert(nelem_alloc > 0); + for(i = 0; i < GLOBAL_FREE_LIST_PARTITIONS_NUM; i++) + { + if (!element_alloc(hashp, nelem_alloc, i)) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory (1.1)"))); // AALEKSEEV: fix string + + global_free_list_init(hashp, i); + global_free_list_acquire(hashp, i); + } + } + else + { + if (!element_alloc(hashp, nelem, 0)) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory (1.2)"))); // AALEKSEEV: fix string + + } } if (flags & HASH_FIXED_SIZE) @@ -503,8 +767,9 @@ hdefault(HTAB *hashp) MemSet(hctl, 0, sizeof(HASHHDR)); - hctl->nentries = 0; - hctl->freeList = NULL; + // AALEKSEEV: redundant! 
+ // hctl->nentries = 0; + // hctl->freeList = NULL; hctl->dsize = DEF_DIRSIZE; hctl->nsegs = 0; @@ -572,12 +837,14 @@ init_htab(HTAB *hashp, long nelem) HASHSEGMENT *segp; int nbuckets; int nsegs; + int i; /* * initialize mutex if it's a partitioned table */ if (IS_PARTITIONED(hctl)) - SpinLockInit(&hctl->mutex); + for(i = 0; i < GLOBAL_FREE_LIST_PARTITIONS_NUM; i++) + SpinLockInit(&(hctl->mutex[i])); /* * Divide number of elements by the fill factor to determine a desired @@ -648,7 +915,8 @@ init_htab(HTAB *hashp, long nelem) "HIGH MASK ", hctl->high_mask, "LOW MASK ", hctl->low_mask, "NSEGS ", hctl->nsegs, - "NENTRIES ", hctl->nentries); + // AALEKSEEV: fix this + "NENTRIES ", hctl->nentries[0]); #endif return true; } @@ -769,7 +1037,8 @@ hash_stats(const char *where, HTAB *hashp) where, hashp->hctl->accesses, hashp->hctl->collisions); fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n", - hashp->hctl->nentries, (long) hashp->hctl->keysize, + // AALEKSEEV: fix this + hashp->hctl->nentries[0], (long) hashp->hctl->keysize, hashp->hctl->max_bucket, hashp->hctl->nsegs); fprintf(stderr, "%s: total accesses %ld total collisions %ld\n", where, hash_accesses, hash_collisions); @@ -863,6 +1132,7 @@ hash_search_with_hash_value(HTAB *hashp, HASHBUCKET currBucket; HASHBUCKET *prevBucketPtr; HashCompareFunc match; + int partition_idx = PARTITION_IDX(hctl, hashvalue); #if HASH_STATISTICS hash_accesses++; @@ -885,7 +1155,7 @@ hash_search_with_hash_value(HTAB *hashp, * order of these tests is to try to check cheaper conditions first. 
*/ if (!IS_PARTITIONED(hctl) && !hashp->frozen && - hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && + hctl->nentries[0] / (long) (hctl->max_bucket + 1) >= hctl->ffactor && !has_seq_scans(hashp)) (void) expand_table(hashp); } @@ -942,21 +1212,21 @@ hash_search_with_hash_value(HTAB *hashp, if (currBucket != NULL) { /* if partitioned, must lock to touch nentries and freeList */ - if (IS_PARTITIONED(hctl)) - SpinLockAcquire(&hctl->mutex); - - Assert(hctl->nentries > 0); - hctl->nentries--; + // AALEKSEEV: remove this + // if (IS_PARTITIONED(hctl)) + // SpinLockAcquire(&hctl->mutex); + Assert(hctl->nentries[partition_idx] > 0); + /* remove record from hash bucket's chain. */ *prevBucketPtr = currBucket->link; - /* add the record to the freelist for this table. */ - currBucket->link = hctl->freeList; - hctl->freeList = currBucket; + free_list_put(hashp, currBucket, partition_idx); + global_free_list_try_release(hashp, partition_idx); - if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->mutex); + // AALEKSEEV: remove this + // if (IS_PARTITIONED(hctl)) + // SpinLockRelease(&hctl->mutex); /* * better hope the caller is synchronizing access to this @@ -982,7 +1252,7 @@ hash_search_with_hash_value(HTAB *hashp, elog(ERROR, "cannot insert into frozen hashtable \"%s\"", hashp->tabname); - currBucket = get_hash_entry(hashp); + currBucket = get_hash_entry(hashp, partition_idx); if (currBucket == NULL) { /* out of memory */ @@ -996,7 +1266,7 @@ hash_search_with_hash_value(HTAB *hashp, else ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + errmsg("out of memory (2)"))); // AALEKSEEV: fix string } /* link into hashbucket chain */ @@ -1175,41 +1445,34 @@ hash_update_hash_key(HTAB *hashp, * create a new entry if possible */ static HASHBUCKET -get_hash_entry(HTAB *hashp) +get_hash_entry(HTAB *hashp, int partition_idx) { HASHHDR *hctl = hashp->hctl; - HASHBUCKET newElement; + bool alloc_result; for (;;) { /* if partitioned, must lock to 
touch nentries and freeList */ - if (IS_PARTITIONED(hctl)) - SpinLockAcquire(&hctl->mutex); + // if (IS_PARTITIONED(hctl)) + // SpinLockAcquire(&hctl->mutex); /* try to get an entry from the freelist */ - newElement = hctl->freeList; - if (newElement != NULL) + if (hctl->freeList[partition_idx] != NULL) break; /* no free elements. allocate another chunk of buckets */ - if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->mutex); + if(IS_PARTITIONED(hctl)) + alloc_result = global_free_list_acquire(hashp, partition_idx); + else + alloc_result = element_alloc(hashp, hctl->nelem_alloc, 0); - if (!element_alloc(hashp, hctl->nelem_alloc)) - { - /* out of memory */ - return NULL; - } + if(!alloc_result) + return NULL; /* out of memory */ } + // if (IS_PARTITIONED(hctl)) + // SpinLockRelease(&hctl->mutex); - /* remove entry from freelist, bump nentries */ - hctl->freeList = newElement->link; - hctl->nentries++; - - if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->mutex); - - return newElement; + return free_list_get(hashp, partition_idx); } /* @@ -1218,11 +1481,21 @@ get_hash_entry(HTAB *hashp) long hash_get_num_entries(HTAB *hashp) { + int i; + long sum = hashp->hctl->nentries[0]; + /* * We currently don't bother with the mutex; it's only sensible to call * this function if you've got lock on all partitions of the table. 
*/ - return hashp->hctl->nentries; + + if(!IS_PARTITIONED(hashp->hctl)) + return sum; + + for(i = 1; i < NUM_LOCK_PARTITIONS; i++) + sum += hashp->hctl->nentries[i]; + + return sum; } /* @@ -1530,7 +1803,7 @@ seg_alloc(HTAB *hashp) * allocate some new elements and link them into the free list */ static bool -element_alloc(HTAB *hashp, int nelem) +element_alloc(HTAB *hashp, int nelem, int glob_part_idx) { HASHHDR *hctl = hashp->hctl; Size elementSize; @@ -1562,15 +1835,23 @@ element_alloc(HTAB *hashp, int nelem) } /* if partitioned, must lock to touch freeList */ - if (IS_PARTITIONED(hctl)) - SpinLockAcquire(&hctl->mutex); + // if (IS_PARTITIONED(hctl)) + // SpinLockAcquire(&hctl->mutex); /* freelist could be nonempty if two backends did this concurrently */ - firstElement->link = hctl->freeList; - hctl->freeList = prevElement; + if(IS_PARTITIONED(hctl)) + { + firstElement->link = hctl->globalFreeList[glob_part_idx]; + hctl->globalFreeList[glob_part_idx] = prevElement; + } + else + { + firstElement->link = hctl->freeList[0]; + hctl->freeList[0] = prevElement; + } - if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->mutex); + // if (IS_PARTITIONED(hctl)) + // SpinLockRelease(&hctl->mutex); return true; } diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index ff34529..1a5405d 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -128,13 +128,14 @@ extern char *MainLWLockNames[]; * having this file include lock.h or bufmgr.h would be backwards. */ -/* Number of partitions of the shared buffer mapping hashtable */ -#define NUM_BUFFER_PARTITIONS 128 - /* Number of partitions the shared lock tables are divided into */ -#define LOG2_NUM_LOCK_PARTITIONS 4 +#define LOG2_NUM_LOCK_PARTITIONS 7 // 4 // AALEKSEEV: KEEP THIS! 
it's a bottleneck for new dynahash.c #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) + /* Number of partitions of the shared buffer mapping hashtable */ + // AALEKSEEV: refactor +#define NUM_BUFFER_PARTITIONS NUM_LOCK_PARTITIONS + /* Number of partitions the shared predicate lock tables are divided into */ #define LOG2_NUM_PREDICATELOCK_PARTITIONS 4 #define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS)