diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 9673fe0..0c8e4fb 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -495,7 +495,7 @@ pgss_shmem_startup(void) info.hash = pgss_hash_fn; info.match = pgss_match_fn; pgss_hash = ShmemInitHash("pg_stat_statements hash", - pgss_max, pgss_max, + pgss_max, &info, HASH_ELEM | HASH_FUNCTION | HASH_COMPARE); diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c index 39e8baf..dd5acb7 100644 --- a/src/backend/storage/buffer/buf_table.c +++ b/src/backend/storage/buffer/buf_table.c @@ -62,7 +62,7 @@ InitBufTable(int size) info.num_partitions = NUM_BUFFER_PARTITIONS; SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", - size, size, + size, &info, HASH_ELEM | HASH_BLOBS | HASH_PARTITION); } diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 81506ea..4c18701 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -237,7 +237,7 @@ InitShmemIndex(void) hash_flags = HASH_ELEM; ShmemIndex = ShmemInitHash("ShmemIndex", - SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE, + SHMEM_INDEX_SIZE, &info, hash_flags); } @@ -255,17 +255,12 @@ InitShmemIndex(void) * exceeded substantially (since it's used to compute directory size and * the hash table buckets will get overfull). * - * init_size is the number of hashtable entries to preallocate. For a table - * whose maximum size is certain, this should be equal to max_size; that - * ensures that no run-time out-of-shared-memory failures can occur. - * * Note: before Postgres 9.0, this function returned NULL for some failure * cases. Now, it always throws error instead, so callers need not check * for NULL. 
*/ HTAB * ShmemInitHash(const char *name, /* table string name for shmem index */ - long init_size, /* initial table size */ long max_size, /* max size of the table */ HASHCTL *infoP, /* info about key and bucket size */ int hash_flags) /* info about infoP */ @@ -299,7 +294,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */ /* Pass location of hashtable header to hash_create */ infoP->hctl = (HASHHDR *) location; - return hash_create(name, init_size, infoP, hash_flags); + return hash_create(name, max_size, infoP, hash_flags); } /* diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 9c2e49c..8d9b36a 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -373,8 +373,7 @@ void InitLocks(void) { HASHCTL info; - long init_table_size, - max_table_size; + long max_table_size; bool found; /* @@ -382,7 +381,6 @@ InitLocks(void) * calculations must agree with LockShmemSize! */ max_table_size = NLOCKENTS(); - init_table_size = max_table_size / 2; /* * Allocate hash table for LOCK structs. This stores per-locked-object @@ -394,14 +392,12 @@ InitLocks(void) info.num_partitions = NUM_LOCK_PARTITIONS; LockMethodLockHash = ShmemInitHash("LOCK hash", - init_table_size, max_table_size, &info, HASH_ELEM | HASH_BLOBS | HASH_PARTITION); /* Assume an average of 2 holders per lock */ max_table_size *= 2; - init_table_size *= 2; /* * Allocate hash table for PROCLOCK structs. 
This stores @@ -413,7 +409,6 @@ InitLocks(void) info.num_partitions = NUM_LOCK_PARTITIONS; LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash", - init_table_size, max_table_size, &info, HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index d9d4e22..fc72d2d 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -1116,7 +1116,6 @@ InitPredicateLocks(void) PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash", max_table_size, - max_table_size, &info, HASH_ELEM | HASH_BLOBS | HASH_PARTITION | HASH_FIXED_SIZE); @@ -1144,7 +1143,6 @@ InitPredicateLocks(void) PredicateLockHash = ShmemInitHash("PREDICATELOCK hash", max_table_size, - max_table_size, &info, HASH_ELEM | HASH_FUNCTION | HASH_PARTITION | HASH_FIXED_SIZE); @@ -1225,7 +1223,6 @@ InitPredicateLocks(void) SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash", max_table_size, - max_table_size, &info, HASH_ELEM | HASH_BLOBS | HASH_FIXED_SIZE); diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c index 24a53da..ce9bb9c 100644 --- a/src/backend/utils/hash/dynahash.c +++ b/src/backend/utils/hash/dynahash.c @@ -15,7 +15,7 @@ * to hash_create. This prevents any attempt to split buckets on-the-fly. * Therefore, each hash bucket chain operates independently, and no fields * of the hash header change after init except nentries and freeList. - * A partitioned table uses a spinlock to guard changes of those two fields. + * A partitioned table uses spinlocks to guard changes of those fields. * This lets any subset of the hash buckets be treated as a separately * lockable partition. 
We expect callers to use the low-order bits of a * lookup key's hash value as a partition number --- this will work because @@ -87,6 +87,7 @@ #include "access/xact.h" #include "storage/shmem.h" #include "storage/spin.h" +#include "storage/lock.h" #include "utils/dynahash.h" #include "utils/memutils.h" @@ -128,12 +129,26 @@ typedef HASHBUCKET *HASHSEGMENT; */ struct HASHHDR { - /* In a partitioned table, take this lock to touch nentries or freeList */ - slock_t mutex; /* unused if not partitioned table */ - - /* These fields change during entry addition/deletion */ - long nentries; /* number of entries in hash table */ - HASHELEMENT *freeList; /* linked list of free elements */ + /* + * There are two fields declared below: nentries and freeList. nentries + * stores current number of entries in a hash table. freeList is a linked + * list of free elements. + * + * To keep these fields consistent in a partitioned table we need to + * synchronize access to them using a spinlock. But it turned out that a + * single spinlock can create a bottleneck. To prevent lock contention an + * array of NUM_LOCK_PARTITIONS spinlocks is used. Each spinlock + * corresponds to a single table partition (see PARTITION_IDX definition) + * and protects one element of nentries and freeList arrays. Since + * partitions are locked on a calling side depending on lower bits of a + * hash value this particular number of spinlocks prevents deadlocks. + * + * If hash table is not partitioned only nentries[0] and freeList[0] are + * used and spinlocks are not used at all. 
+ */ + slock_t mutex[NUM_LOCK_PARTITIONS]; /* array of spinlocks */ + long nentries[NUM_LOCK_PARTITIONS]; /* number of entries */ + HASHELEMENT *freeList[NUM_LOCK_PARTITIONS]; /* lists of free elements */ /* These fields can change, but not in a partitioned table */ /* Also, dsize can't change in a shared table, even if unpartitioned */ @@ -166,6 +181,8 @@ struct HASHHDR #define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0) +#define PARTITION_IDX(hctl, hashcode) (IS_PARTITIONED(hctl) ? LockHashPartition(hashcode) : 0) + /* * Top control structure for a hashtable --- in a shared table, each backend * has its own copy (OK since no fields change at runtime) @@ -219,10 +236,10 @@ static long hash_accesses, */ static void *DynaHashAlloc(Size size); static HASHSEGMENT seg_alloc(HTAB *hashp); -static bool element_alloc(HTAB *hashp, int nelem); +static bool element_alloc(HTAB *hashp, int nelem, int partition_idx); static bool dir_realloc(HTAB *hashp); static bool expand_table(HTAB *hashp); -static HASHBUCKET get_hash_entry(HTAB *hashp); +static HASHBUCKET get_hash_entry(HTAB *hashp, int partition_idx); static void hdefault(HTAB *hashp); static int choose_nelem_alloc(Size entrysize); static bool init_htab(HTAB *hashp, long nelem); @@ -282,6 +299,9 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) { HTAB *hashp; HASHHDR *hctl; + int i, + partitions_number, + nelem_alloc; /* * For shared hash tables, we have a local hash header (HTAB struct) that @@ -482,10 +502,24 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags) if ((flags & HASH_SHARED_MEM) || nelem < hctl->nelem_alloc) { - if (!element_alloc(hashp, (int) nelem)) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + /* + * If hash table is partitioned all freeLists have equal number of + * elements. Otherwise only freeList[0] is used. 
+ */ + if (IS_PARTITIONED(hashp->hctl)) + partitions_number = NUM_LOCK_PARTITIONS; + else + partitions_number = 1; + + nelem_alloc = ((int) nelem) / partitions_number; + if (nelem_alloc == 0) + nelem_alloc = 1; + + for (i = 0; i < partitions_number; i++) + if (!element_alloc(hashp, nelem_alloc, i)) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); } if (flags & HASH_FIXED_SIZE) @@ -503,9 +537,6 @@ hdefault(HTAB *hashp) MemSet(hctl, 0, sizeof(HASHHDR)); - hctl->nentries = 0; - hctl->freeList = NULL; - hctl->dsize = DEF_DIRSIZE; hctl->nsegs = 0; @@ -572,12 +603,14 @@ init_htab(HTAB *hashp, long nelem) HASHSEGMENT *segp; int nbuckets; int nsegs; + int i; /* * initialize mutex if it's a partitioned table */ if (IS_PARTITIONED(hctl)) - SpinLockInit(&hctl->mutex); + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + SpinLockInit(&(hctl->mutex[i])); /* * Divide number of elements by the fill factor to determine a desired @@ -648,7 +681,7 @@ init_htab(HTAB *hashp, long nelem) "HIGH MASK ", hctl->high_mask, "LOW MASK ", hctl->low_mask, "NSEGS ", hctl->nsegs, - "NENTRIES ", hctl->nentries); + "NENTRIES ", hash_get_num_entries(hashp)); #endif return true; } @@ -769,7 +802,7 @@ hash_stats(const char *where, HTAB *hashp) where, hashp->hctl->accesses, hashp->hctl->collisions); fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n", - hashp->hctl->nentries, (long) hashp->hctl->keysize, + hash_get_num_entries(hashp), (long) hashp->hctl->keysize, hashp->hctl->max_bucket, hashp->hctl->nsegs); fprintf(stderr, "%s: total accesses %ld total collisions %ld\n", where, hash_accesses, hash_collisions); @@ -863,6 +896,7 @@ hash_search_with_hash_value(HTAB *hashp, HASHBUCKET currBucket; HASHBUCKET *prevBucketPtr; HashCompareFunc match; + int partition_idx = PARTITION_IDX(hctl, hashvalue); #if HASH_STATISTICS hash_accesses++; @@ -885,7 +919,7 @@ hash_search_with_hash_value(HTAB *hashp, * order of these tests is to try to check cheaper 
conditions first. */ if (!IS_PARTITIONED(hctl) && !hashp->frozen && - hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor && + hctl->nentries[0] / (long) (hctl->max_bucket + 1) >= hctl->ffactor && !has_seq_scans(hashp)) (void) expand_table(hashp); } @@ -943,20 +977,20 @@ hash_search_with_hash_value(HTAB *hashp, { /* if partitioned, must lock to touch nentries and freeList */ if (IS_PARTITIONED(hctl)) - SpinLockAcquire(&hctl->mutex); + SpinLockAcquire(&(hctl->mutex[partition_idx])); - Assert(hctl->nentries > 0); - hctl->nentries--; + Assert(hctl->nentries[partition_idx] > 0); + hctl->nentries[partition_idx]--; /* remove record from hash bucket's chain. */ *prevBucketPtr = currBucket->link; /* add the record to the freelist for this table. */ - currBucket->link = hctl->freeList; - hctl->freeList = currBucket; + currBucket->link = hctl->freeList[partition_idx]; + hctl->freeList[partition_idx] = currBucket; if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->mutex); + SpinLockRelease(&hctl->mutex[partition_idx]); /* * better hope the caller is synchronizing access to this @@ -982,7 +1016,7 @@ hash_search_with_hash_value(HTAB *hashp, elog(ERROR, "cannot insert into frozen hashtable \"%s\"", hashp->tabname); - currBucket = get_hash_entry(hashp); + currBucket = get_hash_entry(hashp, partition_idx); if (currBucket == NULL) { /* out of memory */ @@ -1175,41 +1209,71 @@ hash_update_hash_key(HTAB *hashp, * create a new entry if possible */ static HASHBUCKET -get_hash_entry(HTAB *hashp) +get_hash_entry(HTAB *hashp, int partition_idx) { - HASHHDR *hctl = hashp->hctl; + HASHHDR *hctl = hashp->hctl; HASHBUCKET newElement; + int i, + borrow_from_idx; for (;;) { /* if partitioned, must lock to touch nentries and freeList */ if (IS_PARTITIONED(hctl)) - SpinLockAcquire(&hctl->mutex); + SpinLockAcquire(&hctl->mutex[partition_idx]); /* try to get an entry from the freelist */ - newElement = hctl->freeList; + newElement = hctl->freeList[partition_idx]; + if (newElement != 
NULL) - break; + { + /* remove entry from freelist, bump nentries */ + hctl->freeList[partition_idx] = newElement->link; + hctl->nentries[partition_idx]++; + if (IS_PARTITIONED(hctl)) + SpinLockRelease(&hctl->mutex[partition_idx]); + + return newElement; + } - /* no free elements. allocate another chunk of buckets */ if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->mutex); + SpinLockRelease(&hctl->mutex[partition_idx]); - if (!element_alloc(hashp, hctl->nelem_alloc)) + /* no free elements. allocate another chunk of buckets */ + if (!element_alloc(hashp, hctl->nelem_alloc, partition_idx)) { - /* out of memory */ - return NULL; - } - } + if (!IS_PARTITIONED(hctl)) + return NULL; /* out of memory */ - /* remove entry from freelist, bump nentries */ - hctl->freeList = newElement->link; - hctl->nentries++; + /* try to borrow element from another partition */ + borrow_from_idx = partition_idx; + for (;;) + { + borrow_from_idx = (borrow_from_idx + 1) % NUM_LOCK_PARTITIONS; + if (borrow_from_idx == partition_idx) + break; - if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->mutex); + SpinLockAcquire(&(hctl->mutex[borrow_from_idx])); + newElement = hctl->freeList[borrow_from_idx]; + + if (newElement != NULL) + { + hctl->freeList[borrow_from_idx] = newElement->link; + SpinLockRelease(&(hctl->mutex[borrow_from_idx])); + + SpinLockAcquire(&hctl->mutex[partition_idx]); + hctl->nentries[partition_idx]++; + SpinLockRelease(&hctl->mutex[partition_idx]); + + break; + } - return newElement; + SpinLockRelease(&(hctl->mutex[borrow_from_idx])); + } + + return newElement; + } + } } /* @@ -1218,11 +1282,21 @@ get_hash_entry(HTAB *hashp) long hash_get_num_entries(HTAB *hashp) { + int i; + long sum = hashp->hctl->nentries[0]; + /* * We currently don't bother with the mutex; it's only sensible to call * this function if you've got lock on all partitions of the table. 
*/ - return hashp->hctl->nentries; + + if (!IS_PARTITIONED(hashp->hctl)) + return sum; + + for (i = 1; i < NUM_LOCK_PARTITIONS; i++) + sum += hashp->hctl->nentries[i]; + + return sum; } /* @@ -1530,9 +1604,9 @@ seg_alloc(HTAB *hashp) * allocate some new elements and link them into the free list */ static bool -element_alloc(HTAB *hashp, int nelem) +element_alloc(HTAB *hashp, int nelem, int partition_idx) { - HASHHDR *hctl = hashp->hctl; + HASHHDR *hctl = hashp->hctl; Size elementSize; HASHELEMENT *firstElement; HASHELEMENT *tmpElement; @@ -1563,14 +1637,14 @@ element_alloc(HTAB *hashp, int nelem) /* if partitioned, must lock to touch freeList */ if (IS_PARTITIONED(hctl)) - SpinLockAcquire(&hctl->mutex); + SpinLockAcquire(&hctl->mutex[partition_idx]); /* freelist could be nonempty if two backends did this concurrently */ - firstElement->link = hctl->freeList; - hctl->freeList = prevElement; + firstElement->link = hctl->freeList[partition_idx]; + hctl->freeList[partition_idx] = prevElement; if (IS_PARTITIONED(hctl)) - SpinLockRelease(&hctl->mutex); + SpinLockRelease(&hctl->mutex[partition_idx]); return true; } diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 5e8825e..177371b 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -128,13 +128,19 @@ extern char *MainLWLockNames[]; * having this file include lock.h or bufmgr.h would be backwards. */ -/* Number of partitions of the shared buffer mapping hashtable */ -#define NUM_BUFFER_PARTITIONS 128 - -/* Number of partitions the shared lock tables are divided into */ -#define LOG2_NUM_LOCK_PARTITIONS 4 +/* + * Number of partitions the shared lock tables are divided into. + * + * This particular number of partitions significantly reduces lock contention + * in partitioned hash tables, almost as if partitioned tables didn't use any + * locking at all. 
+ */ +#define LOG2_NUM_LOCK_PARTITIONS 7 #define NUM_LOCK_PARTITIONS (1 << LOG2_NUM_LOCK_PARTITIONS) +/* Number of partitions of the shared buffer mapping hashtable */ +#define NUM_BUFFER_PARTITIONS NUM_LOCK_PARTITIONS + /* Number of partitions the shared predicate lock tables are divided into */ #define LOG2_NUM_PREDICATELOCK_PARTITIONS 4 #define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS) diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index 6468e66..50cf928 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -37,7 +37,7 @@ extern void InitShmemAllocation(void); extern void *ShmemAlloc(Size size); extern bool ShmemAddrIsValid(const void *addr); extern void InitShmemIndex(void); -extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size, +extern HTAB *ShmemInitHash(const char *name, long max_size, HASHCTL *infoP, int hash_flags); extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr); extern Size add_size(Size s1, Size s2);