diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 78f15f0..91fcc05 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -265,7 +265,7 @@ InitShmemIndex(void)
  */
 HTAB *
 ShmemInitHash(const char *name,     /* table string name for shmem index */
-              long init_size,       /* initial table size */
+              long init_size,       /* initial table size */ // AALEKSEEV: is ignored, refactor!
               long max_size,        /* max size of the table */
               HASHCTL *infoP,       /* info about key and bucket size */
               int hash_flags)       /* info about infoP */
@@ -299,7 +299,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
     /* Pass location of hashtable header to hash_create */
     infoP->hctl = (HASHHDR *) location;
 
-    return hash_create(name, init_size, infoP, hash_flags);
+    return hash_create(name, max_size, infoP, hash_flags);
 }
 
 /*
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index d43fb61..9d66359 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -359,6 +359,16 @@ NumLWLocks(void)
     /* slot.c needs one for each slot */
     numLocks += max_replication_slots;
 
+    // AALEKSEEV: refactor!
+    /* buf_table.c needs one for partitioned hash table "Shared Buffer Lookup Table" */
+    // numLocks += 1;
+
+    /* lock.c needs two for partitioned hash tables "LOCK hash" and "PROCLOCK hash" */
+    // numLocks += 2;
+
+    /* predicate.c needs two for partitioned hash tables "PREDICATELOCKTARGET hash" and "PREDICATELOCK hash" */
+    // numLocks += 2;
+
     /*
      * Add any requested by loadable modules; for backwards-compatibility
      * reasons, allocate at least NUM_USER_DEFINED_LWLOCKS of them even if
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index eacffc4..bc4fa4a 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -87,6 +87,8 @@
 #include "access/xact.h"
 #include "storage/shmem.h"
 #include "storage/spin.h"
+#include "storage/lock.h"
+#include "storage/lwlock.h"
 #include "utils/dynahash.h"
 #include "utils/memutils.h"
@@ -129,11 +131,17 @@ typedef HASHBUCKET *HASHSEGMENT;
 struct HASHHDR
 {
     /* In a partitioned table, take this lock to touch nentries or freeList */
-    slock_t     mutex;          /* unused if not partitioned table */
+    //slock_t   mutex;          /* unused if not partitioned table */
+
+    // AALEKSEEV: fix comments
+    //LWLock* lock; // only for partitioned hash tables
+    slock_t     mutex[NUM_LOCK_PARTITIONS];
 
     /* These fields change during entry addition/deletion */
-    long        nentries;       /* number of entries in hash table */
-    HASHELEMENT *freeList;      /* linked list of free elements */
+    /* number of entries in hash table */
+    long        nentries[NUM_LOCK_PARTITIONS];
+    /* linked list of free elements */
+    HASHELEMENT *freeList[NUM_LOCK_PARTITIONS];
 
     /* These fields can change, but not in a partitioned table */
     /* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +174,9 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+// AALEKSEEV: add comment
+#define PARTITION_IDX(hctl, hashcode) (IS_PARTITIONED(hctl) ? LockHashPartition(hashcode) : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
@@ -219,10 +230,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int partition_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int partition_idx);
 static void hdefault(HTAB *hashp);
 static int  choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -282,6 +293,7 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 {
     HTAB       *hashp;
     HASHHDR    *hctl;
+    int i, partitions_number, nelem_alloc;
 
     /*
      * For shared hash tables, we have a local hash header (HTAB struct) that
@@ -408,7 +420,7 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
         if (!hashp->hctl)
             ereport(ERROR,
                     (errcode(ERRCODE_OUT_OF_MEMORY),
-                     errmsg("out of memory")));
+                     errmsg("out of memory (3)"))); // AALEKSEEV: fix string
     }
 
     hashp->frozen = false;
@@ -482,10 +494,20 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
     if ((flags & HASH_SHARED_MEM) ||
         nelem < hctl->nelem_alloc)
     {
-        if (!element_alloc(hashp, (int) nelem))
-            ereport(ERROR,
-                    (errcode(ERRCODE_OUT_OF_MEMORY),
-                     errmsg("out of memory")));
+        if(IS_PARTITIONED(hashp->hctl))
+            partitions_number = NUM_LOCK_PARTITIONS;
+        else
+            partitions_number = 1;
+
+        nelem_alloc = ((int) nelem) / partitions_number;
+        if(nelem_alloc == 0)
+            nelem_alloc = 1;
+
+        for(i = 0; i < partitions_number; i++)
+            if (!element_alloc(hashp, nelem_alloc, i))
+                ereport(ERROR,
+                        (errcode(ERRCODE_OUT_OF_MEMORY),
+                         errmsg("out of memory (1)"))); // AALEKSEEV: fix string
     }
 
     if (flags & HASH_FIXED_SIZE)
@@ -503,8 +525,11 @@ hdefault(HTAB *hashp)
 
     MemSet(hctl, 0, sizeof(HASHHDR));
 
-    hctl->nentries = 0;
-    hctl->freeList = NULL;
+    // AALEKSEEV: redundant!
+    // hctl->nentries = 0;
+    // hctl->freeList = NULL;
+
+    // hctl->lock = NULL;
 
     hctl->dsize = DEF_DIRSIZE;
     hctl->nsegs = 0;
@@ -572,12 +597,19 @@ init_htab(HTAB *hashp, long nelem)
     HASHSEGMENT *segp;
     int         nbuckets;
     int         nsegs;
-
+    int         i;
     /*
      * initialize mutex if it's a partitioned table
      */
-    if (IS_PARTITIONED(hctl))
-        SpinLockInit(&hctl->mutex);
+    if (IS_PARTITIONED(hctl))
+    {
+        for(i = 0; i < NUM_LOCK_PARTITIONS; i++)
+            SpinLockInit(&(hctl->mutex[i]));
+    }
+
+// AALEKSEEV: remove
+    //if(IS_PARTITIONED(hctl))
+    //  hctl->lock = LWLockAssign();
 
     /*
      * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +680,8 @@ init_htab(HTAB *hashp, long nelem)
             "HIGH MASK       ", hctl->high_mask,
             "LOW  MASK       ", hctl->low_mask,
             "NSEGS           ", hctl->nsegs,
-            "NENTRIES        ", hctl->nentries);
+            // AALEKSEEV: fix this
+            "NENTRIES        ", hctl->nentries[0]);
 #endif
     return true;
 }
@@ -769,7 +802,8 @@ hash_stats(const char *where, HTAB *hashp)
             where, hashp->hctl->accesses, hashp->hctl->collisions);
 
     fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-            hashp->hctl->nentries, (long) hashp->hctl->keysize,
+            // AALEKSEEV: fix this
+            hashp->hctl->nentries[0], (long) hashp->hctl->keysize,
             hashp->hctl->max_bucket, hashp->hctl->nsegs);
     fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
             where, hash_accesses, hash_collisions);
@@ -863,6 +897,7 @@ hash_search_with_hash_value(HTAB *hashp,
     HASHBUCKET  currBucket;
     HASHBUCKET *prevBucketPtr;
     HashCompareFunc match;
+    int partition_idx = PARTITION_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
     hash_accesses++;
@@ -885,7 +920,7 @@ hash_search_with_hash_value(HTAB *hashp,
          * order of these tests is to try to check cheaper conditions first.
          */
         if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-            hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+            hctl->nentries[0] / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
             !has_seq_scans(hashp))
             (void) expand_table(hashp);
     }
@@ -942,21 +977,25 @@ hash_search_with_hash_value(HTAB *hashp,
             if (currBucket != NULL)
             {
                 /* if partitioned, must lock to touch nentries and freeList */
+                // AALEKSEEV: remove this
                 if (IS_PARTITIONED(hctl))
-                    SpinLockAcquire(&hctl->mutex);
+                    //LWLockAcquire(hctl->lock, LW_SHARED);
+                    SpinLockAcquire(&(hctl->mutex[partition_idx]));
 
-                Assert(hctl->nentries > 0);
-                hctl->nentries--;
+                Assert(hctl->nentries[partition_idx] > 0);
+                hctl->nentries[partition_idx]--;
 
                 /* remove record from hash bucket's chain. */
                 *prevBucketPtr = currBucket->link;
 
                 /* add the record to the freelist for this table.  */
-                currBucket->link = hctl->freeList;
-                hctl->freeList = currBucket;
+                currBucket->link = hctl->freeList[partition_idx];
+                hctl->freeList[partition_idx] = currBucket;
 
+                // AALEKSEEV: remove this
                 if (IS_PARTITIONED(hctl))
-                    SpinLockRelease(&hctl->mutex);
+                    // LWLockRelease(hctl->lock);
+                    SpinLockRelease(&hctl->mutex[partition_idx]);
 
                 /*
                  * better hope the caller is synchronizing access to this
@@ -982,7 +1021,7 @@ hash_search_with_hash_value(HTAB *hashp,
                 elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
                      hashp->tabname);
 
-            currBucket = get_hash_entry(hashp);
+            currBucket = get_hash_entry(hashp, partition_idx);
             if (currBucket == NULL)
             {
                 /* out of memory */
@@ -996,7 +1035,7 @@ hash_search_with_hash_value(HTAB *hashp,
                 else
                     ereport(ERROR,
                             (errcode(ERRCODE_OUT_OF_MEMORY),
-                             errmsg("out of memory")));
+                             errmsg("out of memory (2)"))); // AALEKSEEV: fix string
             }
 
             /* link into hashbucket chain */
@@ -1175,41 +1214,81 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int partition_idx)
 {
     HASHHDR    *hctl = hashp->hctl;
     HASHBUCKET  newElement;
+    int i, steal_from_idx;
 
     for (;;)
     {
         /* if partitioned, must lock to touch nentries and freeList */
         if (IS_PARTITIONED(hctl))
-            SpinLockAcquire(&hctl->mutex);
+            // LWLockAcquire(hctl->lock, LW_SHARED);
+            SpinLockAcquire(&hctl->mutex[partition_idx]);
 
         /* try to get an entry from the freelist */
-        newElement = hctl->freeList;
+        newElement = hctl->freeList[partition_idx];
+
         if (newElement != NULL)
-            break;
+        {
+            /* remove entry from freelist, bump nentries */
+            hctl->freeList[partition_idx] = newElement->link;
+            hctl->nentries[partition_idx]++;
+            if (IS_PARTITIONED(hctl))
+                // LWLockRelease(hctl->lock);
+                SpinLockRelease(&hctl->mutex[partition_idx]);
+
+            return newElement;
+        }
 
-        /* no free elements.  allocate another chunk of buckets */
         if (IS_PARTITIONED(hctl))
-            SpinLockRelease(&hctl->mutex);
+            SpinLockRelease(&hctl->mutex[partition_idx]);
+            // LWLockRelease(hctl->lock);
 
-        if (!element_alloc(hashp, hctl->nelem_alloc))
+        /* no free elements.  allocate another chunk of buckets */
+        if (!element_alloc(hashp, hctl->nelem_alloc, partition_idx))
         {
-            /* out of memory */
-            return NULL;
-        }
-    }
+            if (!IS_PARTITIONED(hctl))
+                return NULL;    /* out of memory */
 
-    /* remove entry from freelist, bump nentries */
-    hctl->freeList = newElement->link;
-    hctl->nentries++;
+            /* try to "steal" element from another partition */
+            // LWLockAcquire(hctl->lock, LW_EXCLUSIVE);
 
-    if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+            // for(i = 0; i < NUM_LOCK_PARTITIONS; i++)
+            //  SpinLockAcquire(&(hctl->mutex[i]));
+
+            steal_from_idx = partition_idx;
+            for(;;)
+            {
+                steal_from_idx = (steal_from_idx + 1) % NUM_LOCK_PARTITIONS;
+                if(steal_from_idx == partition_idx)
+                    break;
+
+                SpinLockAcquire(&(hctl->mutex[steal_from_idx]));
+                newElement = hctl->freeList[steal_from_idx];
+
+                if(newElement != NULL)
+                {
+                    hctl->freeList[steal_from_idx] = newElement->link;
+                    SpinLockRelease(&(hctl->mutex[steal_from_idx]));
+
+                    SpinLockAcquire(&hctl->mutex[partition_idx]);
+                    hctl->nentries[partition_idx]++;
+                    SpinLockRelease(&hctl->mutex[partition_idx]);
+
+                    break;
+                }
 
-    return newElement;
+                SpinLockRelease(&(hctl->mutex[steal_from_idx]));
+            }
+
+            // LWLockRelease(hctl->lock);
+            // for(i = 0; i < NUM_LOCK_PARTITIONS; i++)
+            //  SpinLockRelease(&(hctl->mutex[i]));
+            return newElement;
+        }
+    }
 }
 
 /*
@@ -1218,11 +1297,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+    int i;
+    long sum = hashp->hctl->nentries[0];
+
     /*
      * We currently don't bother with the mutex; it's only sensible to call
      * this function if you've got lock on all partitions of the table.
     */
-    return hashp->hctl->nentries;
+
+    if(!IS_PARTITIONED(hashp->hctl))
+        return sum;
+
+    for(i = 1; i < NUM_LOCK_PARTITIONS; i++)
+        sum += hashp->hctl->nentries[i];
+
+    return sum;
 }
 
 /*
@@ -1530,7 +1619,7 @@ seg_alloc(HTAB *hashp)
  * allocate some new elements and link them into the free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int partition_idx)
 {
     HASHHDR    *hctl = hashp->hctl;
     Size        elementSize;
@@ -1562,15 +1651,23 @@ element_alloc(HTAB *hashp, int nelem)
     }
 
     /* if partitioned, must lock to touch freeList */
+    // if (IS_PARTITIONED(hctl))
+    //  SpinLockAcquire(&hctl->mutex);
+
     if (IS_PARTITIONED(hctl))
-        SpinLockAcquire(&hctl->mutex);
+        // LWLockAcquire(hctl->lock, LW_SHARED);
+        SpinLockAcquire(&hctl->mutex[partition_idx]);
 
     /* freelist could be nonempty if two backends did this concurrently */
-    firstElement->link = hctl->freeList;
-    hctl->freeList = prevElement;
+    firstElement->link = hctl->freeList[partition_idx];
+    hctl->freeList[partition_idx] = prevElement;
 
     if (IS_PARTITIONED(hctl))
-        SpinLockRelease(&hctl->mutex);
+        // LWLockRelease(hctl->lock);
+        SpinLockRelease(&hctl->mutex[partition_idx]);
+
+    // if (IS_PARTITIONED(hctl))
+    //  SpinLockRelease(&hctl->mutex);
 
     return true;
 }
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index ff34529..767f20b 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -128,13 +128,14 @@ extern char *MainLWLockNames[];
  * having this file include lock.h or bufmgr.h would be backwards.
  */
 
-/* Number of partitions of the shared buffer mapping hashtable */
-#define NUM_BUFFER_PARTITIONS  128
-
 /* Number of partitions the shared lock tables are divided into */
 #define LOG2_NUM_LOCK_PARTITIONS  4
 #define NUM_LOCK_PARTITIONS  (1 << LOG2_NUM_LOCK_PARTITIONS)
 
+/* Number of partitions of the shared buffer mapping hashtable */
+// AALEKSEEV: refactor
+#define NUM_BUFFER_PARTITIONS NUM_LOCK_PARTITIONS
+
 /* Number of partitions the shared predicate lock tables are divided into */
 #define LOG2_NUM_PREDICATELOCK_PARTITIONS  4
 #define NUM_PREDICATELOCK_PARTITIONS  (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS)