diff --git a/doc/src/sgml/backup.sgml b/doc/src/sgml/backup.sgml index 6eaed1e..dd6e851 100644 --- a/doc/src/sgml/backup.sgml +++ b/doc/src/sgml/backup.sgml @@ -1536,19 +1536,6 @@ archive_command = 'local_backup_script.sh "%p" "%f"' - Operations on hash indexes are not presently WAL-logged, so - replay will not update these indexes. This will mean that any new inserts - will be ignored by the index, updated rows will apparently disappear and - deleted rows will still retain pointers. In other words, if you modify a - table with a hash index on it then you will get incorrect query results - on a standby server. When recovery completes it is recommended that you - manually - each such index after completing a recovery operation. - - - - - If a command is executed while a base backup is being taken, and then the template database that the CREATE DATABASE copied diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 8d7b3bf..10f8074 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -2138,10 +2138,9 @@ include_dir 'conf.d' has materialized a result set, no error will be generated even if the underlying rows in the referenced table have been vacuumed away. Some tables cannot safely be vacuumed early, and so will not be - affected by this setting. Examples include system catalogs and any - table which has a hash index. For such tables this setting will - neither reduce bloat nor create a possibility of a snapshot - too old error on scanning. + affected by this setting. Examples include system catalogs. For + such tables this setting will neither reduce bloat nor create a + possibility of a snapshot too old error on scanning. 
diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml index a1a9532..964af84 100644 --- a/doc/src/sgml/high-availability.sgml +++ b/doc/src/sgml/high-availability.sgml @@ -2353,12 +2353,6 @@ LOG: database system is ready to accept read only connections - Operations on hash indexes are not presently WAL-logged, so - replay will not update these indexes. - - - - Full knowledge of running transactions is required before snapshots can be taken. Transactions that use large numbers of subtransactions (currently greater than 64) will delay the start of read only diff --git a/doc/src/sgml/indices.sgml b/doc/src/sgml/indices.sgml index 271c135..e40750e 100644 --- a/doc/src/sgml/indices.sgml +++ b/doc/src/sgml/indices.sgml @@ -193,18 +193,6 @@ CREATE INDEX name ON table - - - Hash index operations are not presently WAL-logged, - so hash indexes might need to be rebuilt with REINDEX - after a database crash if there were unwritten changes. - Also, changes to hash indexes are not replicated over streaming or - file-based replication after the initial base backup, so they - give wrong answers to queries that subsequently use them. - For these reasons, hash index use is presently discouraged. - - - index diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index fcb7a60..7163b03 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -510,19 +510,6 @@ Indexes: they can be useful. - - - Hash index operations are not presently WAL-logged, - so hash indexes might need to be rebuilt with REINDEX - after a database crash if there were unwritten changes. - Also, changes to hash indexes are not replicated over streaming or - file-based replication after the initial base backup, so they - give wrong answers to queries that subsequently use them. - Hash indexes are also not properly restored during point-in-time - recovery. For these reasons, hash index use is presently discouraged. 
- - - Currently, only the B-tree, GiST, GIN, and BRIN index methods support multicolumn indexes. Up to 32 fields can be specified by default. diff --git a/src/backend/access/hash/Makefile b/src/backend/access/hash/Makefile index e2e7e91..b154569 100644 --- a/src/backend/access/hash/Makefile +++ b/src/backend/access/hash/Makefile @@ -13,6 +13,6 @@ top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global OBJS = hash.o hashfunc.o hashinsert.o hashovfl.o hashpage.o hashsearch.o \ - hashsort.o hashutil.o hashvalidate.o + hashsort.o hashutil.o hashvalidate.o hash_xlog.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README index 01ea115..06ef477 100644 --- a/src/backend/access/hash/README +++ b/src/backend/access/hash/README @@ -248,7 +248,6 @@ The insertion algorithm is rather similar: split happened) take the buffer content lock on bucket page in exclusive mode retake meta page buffer content lock in shared mode - release pin on metapage -- (so far same as reader, except for acquisition of buffer content lock in exclusive mode on primary bucket page) if the bucket-being-split flag is set for a bucket and pin count on it is @@ -263,13 +262,17 @@ The insertion algorithm is rather similar: if current page is full, release lock but not pin, read/exclusive-lock next page; repeat as needed >> see below if no space in any page of bucket + take buffer content lock in exclusive mode on metapage insert tuple at appropriate place in page - mark current page dirty and release buffer content lock and pin - if the current page is not a bucket page, release the pin on bucket page - pin meta page and take buffer content lock in exclusive mode + mark current page dirty increment tuple count, decide if split needed - mark meta page dirty and release buffer content lock and pin - done if no split needed, else enter Split algorithm below + mark meta page dirty + write WAL for insertion of tuple + release the 
buffer content lock on metapage + release buffer content lock on current page + if current page is not a bucket page, release the pin on bucket page + if split is needed, enter Split algorithm below + release the pin on metapage To speed searches, the index entries within any individual index page are kept sorted by hash code; the insertion code must take care to insert new @@ -304,12 +307,17 @@ existing bucket in two, thereby lowering the fill ratio: try to finish the split and the cleanup work if that succeeds, start over; if it fails, give up mark the old and new buckets indicating split is in progress + mark both old and new buckets as dirty + write WAL for allocation of new page for split copy the tuples that belongs to new bucket from old bucket, marking them as moved-by-split + write WAL record for moving tuples to new page once the new page is full + or all the pages of old bucket are finished release lock but not pin for primary bucket page of old bucket, read/shared-lock next page; repeat as needed clear the bucket-being-split and bucket-being-populated flags mark the old bucket indicating split-cleanup + write WAL for changing the flags on both old and new buckets The split operation's attempt to acquire cleanup-lock on the old bucket number could fail if another process holds any lock or pin on it. 
We do not want to @@ -345,6 +353,8 @@ The fourth operation is garbage collection (bulk deletion): acquire cleanup lock on primary bucket page loop: scan and remove tuples + mark the target page dirty + write WAL for deleting tuples from target page if this is the last bucket page, break out of loop pin and x-lock next page release prior lock and pin (except keep pin on primary bucket page) @@ -359,7 +369,8 @@ The fourth operation is garbage collection (bulk deletion): check if number of buckets changed if so, release content lock and pin and return to for-each-bucket loop else update metapage tuple count - mark meta page dirty and release buffer content lock and pin + mark meta page dirty and write WAL for update of metapage + release buffer content lock and pin Note that this is designed to allow concurrent splits and scans. If a split occurs, tuples relocated into the new bucket will be visited twice by the @@ -401,18 +412,16 @@ Obtaining an overflow page: search for a free page (zero bit in bitmap) if found: set bit in bitmap - mark bitmap page dirty and release content lock + mark bitmap page dirty take metapage buffer content lock in exclusive mode if first-free-bit value did not change, update it and mark meta page dirty - release meta page buffer content lock - return page number else (not found): release bitmap page buffer content lock loop back to try next bitmap page, if any -- here when we have checked all bitmap pages; we hold meta excl. 
lock extend index to add another overflow page; update meta information - mark meta page dirty and release buffer content lock + mark meta page dirty return page number It is slightly annoying to release and reacquire the metapage lock @@ -432,12 +441,15 @@ like this: -- having determined that no space is free in the target bucket: remember last page of bucket, drop write lock on it - call free-page-acquire routine re-write-lock last page of bucket if it is not last anymore, step to the last page - update (former) last page to point to new page + execute free-page-acquire (Obtaining an overflow page) mechanism described above + update (former) last page to point to the new page and mark the buffer dirty. write-lock and initialize new page, with back link to former last page - write and release former last page + write WAL for addition of overflow page + release the locks on meta page and bitmap page acquired in free-page-acquire algorithm + release the lock on former last page + release the lock on new overflow page insert tuple into new page -- etc. @@ -464,12 +476,14 @@ accessors of pages in the bucket. 
The algorithm is: determine which bitmap page contains the free space bit for page release meta page buffer content lock pin bitmap page and take buffer content lock in exclusive mode - update bitmap bit - mark bitmap page dirty and release buffer content lock and pin - if page number is less than what we saw as first-free-bit in meta: retake meta page buffer content lock in exclusive mode + move (insert) tuples that belong to the overflow page being freed + update bitmap bit + mark bitmap page dirty if page number is still less than first-free-bit, update first-free-bit field and mark meta page dirty + write WAL for delinking overflow page operation + release buffer content lock and pin release meta page buffer content lock and pin We have to do it this way because we must clear the bitmap bit before @@ -480,8 +494,101 @@ page acquirer will scan more bitmap bits than he needs to. What must be avoided is having first-free-bit greater than the actual first free bit, because then that free page would never be found by searchers. -All the freespace operations should be called while holding no buffer -locks. Since they need no lmgr locks, deadlock is not possible. +The reason for moving tuples from the overflow page while delinking the latter is +to make that an atomic operation. Not doing so could lead to spurious reads +on standby. Basically, the user might see the same tuple twice. + + +WAL Considerations +------------------ + +The hash index operations like create index, insert, delete, bucket split, +allocate overflow page, and squeeze (overflow pages are freed in this operation) +do not by themselves guarantee hash index consistency after a crash. To +provide robustness, we write WAL for each of these operations, which gets +replayed on crash recovery. + +Multiple WAL records are written for the create index operation: first one for +initializing the metapage, followed by one for each new bucket created during +the operation, followed by one for initializing the bitmap page. 
If the system +crashes after any operation, the whole operation is rolled back. We +considered writing a single WAL record for the whole operation, but for that +we would need to limit the number of initial buckets that can be created during the +operation. As we can log only a fixed number of pages, XLR_MAX_BLOCK_ID (32), +with the current XLog machinery, it is better to write multiple WAL records for this +operation. The downside of restricting the number of buckets is that we need +to perform the split operation if the number of tuples is more than what can be +accommodated in the initial set of buckets, and it is not unusual to have a large +number of tuples during a create index operation. + +Ordinary item insertions (that don't force a page split or need a new overflow +page) are single WAL entries. They touch a single bucket page and the meta page; +the metapage is updated during replay as it is updated during the original operation. + +An insertion that causes an addition of an overflow page is logged as a single +WAL entry preceded by a WAL entry for the new overflow page required to insert +a tuple. There is a corner case where, by the time we try to use the newly +allocated overflow page, it has already been used by concurrent insertions; in +such a case, a new overflow page will be allocated and a separate WAL entry +will be made for the same. + +An insertion that causes a bucket split is logged as a single WAL entry, +followed by a WAL entry for allocating a new bucket, followed by a WAL entry +for each overflow bucket page in the new bucket to which the tuples are moved +from the old bucket, followed by a WAL entry to indicate that the split is complete +for both old and new buckets. + +A split operation which requires overflow pages to complete the operation +will need to write a WAL record for each new allocation of an overflow page. 
+ +As splitting involves multiple atomic actions, it's possible that the +system crashes between moving tuples from bucket pages of the old bucket to new +bucket. In such a case, after recovery, both the old and new buckets will be +marked with bucket-being-split and bucket-being-populated flags respectively +which indicates that split is in progress for those buckets. The reader +algorithm works correctly, as it will scan both the old and new buckets when +the split is in progress as explained in the reader algorithm section above. + +We finish the split at next insert or split operation on the old bucket as +explained in insert and split algorithm above. It could be done during +searches, too, but it seems best not to put any extra updates in what would +otherwise be a read-only operation (updating is not possible in hot standby +mode anyway). It would seem natural to complete the split in VACUUM, but since +splitting a bucket might require allocating a new page, it might fail if you +run out of disk space. That would be bad during VACUUM - the reason for +running VACUUM in the first place might be that you run out of disk space, +and now VACUUM won't finish because you're out of disk space. In contrast, +an insertion can require enlarging the physical file anyway. + +Deletion of tuples from a bucket is performed for two reasons, one for +removing the dead tuples and other for removing the tuples that are moved by +split. WAL entry is made for each bucket page from which tuples are removed, +followed by a WAL entry to clear the garbage flag if the tuples moved by split +are removed. Another separate WAL entry is made for updating the metapage if +the deletion is performed for removing the dead tuples by vacuum. + +As deletion involves multiple atomic operations, it is quite possible that +system crashes after (a) removing tuples from some of the bucket pages +(b) before clearing the garbage flag (c) before updating the metapage. 
If the +system crashes before completing (b), it will again try to clean the bucket +during the next vacuum or insert after recovery, which can have some performance +impact, but it will work fine. If the system crashes before completing (c), +after recovery there could be some additional splits until the next vacuum +updates the metapage, but the other operations like insert, delete and scan +will work correctly. We could fix this problem by actually updating the metapage +based on the delete operation during replay, but it is not clear that it is worth the +complication. + +The squeeze operation moves tuples from one of the buckets later in the chain to +one of the buckets earlier in the chain and writes a WAL record when either the +bucket to which it is writing tuples is filled or the bucket from which it +is removing the tuples becomes empty. + +As the squeeze operation involves multiple atomic operations, it is +quite possible that the system crashes before completing the operation on the +entire bucket. After recovery, the operations will work correctly, but +the index will remain bloated, which can impact the performance of read and +insert operations until the next vacuum squeezes the bucket completely. 
Other Notes diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 1fa087a..3f663c9 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -27,6 +27,7 @@ #include "optimizer/plancat.h" #include "utils/index_selfuncs.h" #include "utils/rel.h" +#include "miscadmin.h" /* Working state for hashbuild and its callback */ @@ -115,7 +116,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac); /* Initialize the hash index metadata page and initial buckets */ - num_buckets = _hash_metapinit(index, reltuples, MAIN_FORKNUM); + num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM); /* * If we just insert the tuples into the index in scan order, then @@ -177,7 +178,7 @@ hashbuild(Relation heap, Relation index, IndexInfo *indexInfo) void hashbuildempty(Relation index) { - _hash_metapinit(index, 0, INIT_FORKNUM); + _hash_init(index, 0, INIT_FORKNUM); } /* @@ -297,6 +298,11 @@ hashgettuple(IndexScanDesc scan, ScanDirection dir) buf = so->hashso_curbuf; Assert(BufferIsValid(buf)); page = BufferGetPage(buf); + + /* + * We don't need test for old snapshot here as the current buffer is + * pinned, so vacuum can't clean the page. + */ maxoffnum = PageGetMaxOffsetNumber(page); for (offnum = ItemPointerGetOffsetNumber(current); offnum <= maxoffnum; @@ -610,6 +616,7 @@ loop_top: } /* Okay, we're really done. Update tuple count in metapage. 
*/ + START_CRIT_SECTION(); if (orig_maxbucket == metap->hashm_maxbucket && orig_ntuples == metap->hashm_ntuples) @@ -636,6 +643,26 @@ loop_top: } MarkBufferDirty(metabuf); + + /* XLOG stuff: log the metapage's updated tuple count */ + if (RelationNeedsWAL(rel)) + { + xl_hash_update_meta_page xlrec; + XLogRecPtr recptr; + + xlrec.ntuples = metap->hashm_ntuples; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage); + + XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + _hash_relbuf(rel, metabuf); /* return statistics */ @@ -803,9 +830,40 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, */ if (ndeletable > 0) { + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + PageIndexMultiDelete(page, deletable, ndeletable); bucket_dirty = true; MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_delete xlrec; + XLogRecPtr recptr; + + xlrec.is_primary_bucket_page = (buf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashDelete); + + /* + * bucket buffer needs to be registered to ensure that we can + * acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_primary_bucket_page) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE); + PageSetLSN(BufferGetPage(buf), recptr); + } + + END_CRIT_SECTION(); } /* bail out if there are no more pages to scan. 
*/ @@ -853,8 +911,25 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, page = BufferGetPage(bucket_buf); bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + /* No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; MarkBufferDirty(bucket_buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP); + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); } /* @@ -868,9 +943,3 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, else LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK); } - -void -hash_redo(XLogReaderState *record) -{ - elog(PANIC, "hash_redo: unimplemented"); -} diff --git a/src/backend/access/hash/hash_xlog.c b/src/backend/access/hash/hash_xlog.c new file mode 100644 index 0000000..41429a7 --- /dev/null +++ b/src/backend/access/hash/hash_xlog.c @@ -0,0 +1,970 @@ +/*------------------------------------------------------------------------- + * + * hash_xlog.c + * WAL replay logic for hash index. 
+ * + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/access/hash/hash_xlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/hash_xlog.h" +#include "access/xlogutils.h" + +/* + * replay a hash index meta page + */ +static void +hash_xlog_init_meta_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Page page; + Buffer metabuf; + + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) XLogRecGetData(record); + + /* create the index' metapage */ + metabuf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(metabuf)); + _hash_init_metabuffer(metabuf, xlrec->num_tuples, xlrec->procid, + xlrec->ffactor, true); + page = (Page) BufferGetPage(metabuf); + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + /* all done */ + UnlockReleaseBuffer(metabuf); +} + +/* + * replay a hash index bitmap page + */ +static void +hash_xlog_init_bitmap_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer bitmapbuf; + Buffer metabuf; + Page page; + HashMetaPage metap; + uint32 num_buckets; + + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) XLogRecGetData(record); + + /* + * Initialize bitmap page + */ + bitmapbuf = XLogInitBufferForRedo(record, 0); + _hash_initbitmapbuffer(bitmapbuf, xlrec->bmsize, true); + PageSetLSN(BufferGetPage(bitmapbuf), lsn); + MarkBufferDirty(bitmapbuf); + UnlockReleaseBuffer(bitmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + if (XLogReadBufferForRedo(record, 1, &metabuf) == BLK_NEEDS_REDO) + { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the page we inserted into. But during replay it's + * not necessary to hold that lock, since no other index updates can + * be happening concurrently. 
+ */ + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + num_buckets = metap->hashm_maxbucket + 1; + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + metap->hashm_nmaps++; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay a hash index insert without split: re-add the tuple to the bucket/overflow page (block 0) and bump the metapage tuple count (block 1) + */ +static void +hash_xlog_insert(XLogReaderState *record) +{ + HashMetaPage metap; + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_insert *xlrec = (xl_hash_insert *) XLogRecGetData(record); + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + Size datalen; + char *datapos = XLogRecGetBlockData(record, 0, &datalen); + + page = BufferGetPage(buffer); + + if (PageAddItem(page, (Item) datapos, datalen, xlrec->offnum, + false, false) == InvalidOffsetNumber) + elog(PANIC, "hash_xlog_insert: failed to add item"); + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); + + if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO) + { + /* + * Note: in normal operation, we'd update the metapage while still + * holding lock on the page we inserted into. But during replay it's + * not necessary to hold that lock, since no other index updates can + * be happening concurrently. 
+ */ + page = BufferGetPage(buffer); + metap = HashPageGetMeta(page); + metap->hashm_ntuples += 1; + + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * replay addition of overflow page for hash index + */ +static void +hash_xlog_addovflpage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_addovflpage *xlrec = (xl_hash_addovflpage *) XLogRecGetData(record); + Buffer leftbuf; + Buffer ovflbuf; + Buffer metabuf; + BlockNumber leftblk; + BlockNumber rightblk; + BlockNumber newmapblk = InvalidBlockNumber; + Page ovflpage; + HashPageOpaque ovflopaque; + uint32 *num_bucket; + char *data; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + bool new_bmpage = false; + + XLogRecGetBlockTag(record, 0, NULL, NULL, &rightblk); + XLogRecGetBlockTag(record, 1, NULL, NULL, &leftblk); + + ovflbuf = XLogInitBufferForRedo(record, 0); + Assert(BufferIsValid(ovflbuf)); + + data = XLogRecGetBlockData(record, 0, &datalen); + num_bucket = (uint32 *) data; + Assert(datalen == sizeof(uint32)); + _hash_initbuf(ovflbuf, *num_bucket, LH_OVERFLOW_PAGE, true); + /* update backlink */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = leftblk; + + PageSetLSN(ovflpage, lsn); + MarkBufferDirty(ovflbuf); + + if (XLogReadBufferForRedo(record, 1, &leftbuf) == BLK_NEEDS_REDO) + { + Page leftpage; + HashPageOpaque leftopaque; + + leftpage = BufferGetPage(leftbuf); + leftopaque = (HashPageOpaque) PageGetSpecialPointer(leftpage); + leftopaque->hasho_nextblkno = rightblk; + + PageSetLSN(leftpage, lsn); + MarkBufferDirty(leftbuf); + } + + /* + * We need to release the locks once the prev pointer of overflow bucket + * and next of left bucket are set, otherwise concurrent read might skip + * the bucket. 
+ */ + if (BufferIsValid(leftbuf)) + UnlockReleaseBuffer(leftbuf); + UnlockReleaseBuffer(ovflbuf); + + /* + * Note: in normal operation, we'd update the bitmap and meta page while + * still holding lock on the overflow pages we updated. But during replay + * it's not necessary to hold those locks, since no other index updates + * can be happening concurrently. + */ + if (XLogRecHasBlockRef(record, 2)) + { + Buffer mapbuffer; + + if (XLogReadBufferForRedo(record, 2, &mapbuffer) == BLK_NEEDS_REDO) + { + Page mappage = (Page) BufferGetPage(mapbuffer); + uint32 *freep = NULL; + char *data; + uint32 *bitmap_page_bit; + + freep = HashPageGetBitmap(mappage); + + data = XLogRecGetBlockData(record, 2, &datalen); + bitmap_page_bit = (uint32 *) data; + + SETBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, lsn); + MarkBufferDirty(mapbuffer); + } + if (BufferIsValid(mapbuffer)) + UnlockReleaseBuffer(mapbuffer); + } + + if (XLogRecHasBlockRef(record, 3)) + { + Buffer newmapbuf; + + newmapbuf = XLogInitBufferForRedo(record, 3); + + _hash_initbitmapbuffer(newmapbuf, xlrec->bmsize, true); + + new_bmpage = true; + newmapblk = BufferGetBlockNumber(newmapbuf); + + MarkBufferDirty(newmapbuf); + PageSetLSN(BufferGetPage(newmapbuf), lsn); + + UnlockReleaseBuffer(newmapbuf); + } + + if (XLogReadBufferForRedo(record, 4, &metabuf) == BLK_NEEDS_REDO) + { + HashMetaPage metap; + Page page; + uint32 *firstfree_ovflpage; + + data = XLogRecGetBlockData(record, 4, &datalen); + firstfree_ovflpage = (uint32 *) data; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + if (!xlrec->bmpage_found) + { + metap->hashm_spares[metap->hashm_ovflpoint]++; + + if (new_bmpage) + { + Assert(BlockNumberIsValid(newmapblk)); + + metap->hashm_mapp[metap->hashm_nmaps] = newmapblk; + metap->hashm_nmaps++; + metap->hashm_spares[metap->hashm_ovflpoint]++; + } + } + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if 
(BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay allocation of page for split operation + */ +static void +hash_xlog_split_allocpage(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_split_allocpage *xlrec = (xl_hash_split_allocpage *) XLogRecGetData(record); + Buffer oldbuf; + Buffer newbuf; + Buffer metabuf; + Size datalen PG_USED_FOR_ASSERTS_ONLY; + char *data; + XLogRedoAction action; + + /* replay the record for old bucket */ + action = XLogReadBufferForRedo(record, 0, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page oldpage; + HashPageOpaque oldopaque; + + oldpage = BufferGetPage(oldbuf); + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + + PageSetLSN(oldpage, lsn); + MarkBufferDirty(oldbuf); + } + + /* + * There is no harm in releasing the lock on old bucket before new bucket + * during replay as no other bucket splits can be happening concurrently. + */ + if (BufferIsValid(oldbuf)) + UnlockReleaseBuffer(oldbuf); + + /* replay the record for new bucket */ + newbuf = XLogInitBufferForRedo(record, 1); + _hash_initbuf(newbuf, xlrec->new_bucket, xlrec->new_bucket_flag, true); + MarkBufferDirty(newbuf); + PageSetLSN(BufferGetPage(newbuf), lsn); + + UnlockReleaseBuffer(newbuf); + + /* + * Note: in normal operation, we'd update the meta page while still + * holding lock on the old and new bucket pages. But during replay it's + * not necessary to hold those locks, since no other bucket splits can be + * happening concurrently. 
+ */ + + /* replay the record for metapage changes */ + if (XLogReadBufferForRedo(record, 2, &metabuf) == BLK_NEEDS_REDO) + { + Page page; + HashMetaPage metap; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_maxbucket = xlrec->new_bucket; + + data = XLogRecGetBlockData(record, 2, &datalen); + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) + { + uint32 lowmask; + uint32 *highmask; + + /* extract low and high masks. */ + memcpy(&lowmask, data, sizeof(uint32)); + highmask = (uint32 *) ((char *) data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_lowmask = lowmask; + metap->hashm_highmask = *highmask; + + data += sizeof(uint32) * 2; + } + + if (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) + { + uint32 ovflpoint; + uint32 *ovflpages; + + /* extract information of overflow pages. */ + memcpy(&ovflpoint, data, sizeof(uint32)); + ovflpages = (uint32 *) ((char *) data + sizeof(uint32)); + + /* update metapage */ + metap->hashm_spares[ovflpoint] = *ovflpages; + metap->hashm_ovflpoint = ovflpoint; + } + + MarkBufferDirty(metabuf); + PageSetLSN(BufferGetPage(metabuf), lsn); + } + + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +/* + * replay of split operation + */ +static void +hash_xlog_split_page(XLogReaderState *record) +{ + Buffer buf; + + if (XLogReadBufferForRedo(record, 0, &buf) != BLK_RESTORED) + elog(ERROR, "Hash split record did not contain a full-page image"); + + UnlockReleaseBuffer(buf); +} + +/* + * replay completion of split operation + */ +static void +hash_xlog_split_complete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) XLogRecGetData(record); + Buffer oldbuf; + Buffer newbuf; + XLogRedoAction action; + + /* replay the record for old bucket */ + action = XLogReadBufferForRedo(record, 0, &oldbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the 
bucket flag is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page oldpage; + HashPageOpaque oldopaque; + + oldpage = BufferGetPage(oldbuf); + oldopaque = (HashPageOpaque) PageGetSpecialPointer(oldpage); + + oldopaque->hasho_flag = xlrec->old_bucket_flag; + + PageSetLSN(oldpage, lsn); + MarkBufferDirty(oldbuf); + } + if (BufferIsValid(oldbuf)) + UnlockReleaseBuffer(oldbuf); + + /* replay the record for new bucket */ + action = XLogReadBufferForRedo(record, 1, &newbuf); + + /* + * Note that we still update the page even if it was restored from a full + * page image, because the bucket flag is not included in the image. + */ + if (action == BLK_NEEDS_REDO || action == BLK_RESTORED) + { + Page newpage; + HashPageOpaque nopaque; + + newpage = BufferGetPage(newbuf); + nopaque = (HashPageOpaque) PageGetSpecialPointer(newpage); + + nopaque->hasho_flag = xlrec->new_bucket_flag; + + PageSetLSN(newpage, lsn); + MarkBufferDirty(newbuf); + } + if (BufferIsValid(newbuf)) + UnlockReleaseBuffer(newbuf); +} + +/* + * replay move of page contents for squeeze operation of hash index + */ +static void +hash_xlog_move_page_contents(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_move_page_contents *xldata = (xl_hash_move_page_contents *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer writebuf = InvalidBuffer; + Buffer deletebuf = InvalidBuffer; + XLogRedoAction action; + + /* + * Ensure to have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. 
+ */ + if (xldata->is_prim_bucket_same_wrt) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + else + { + RelFileNode rnode; + BlockNumber blkno; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + + bucketbuf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, + RBM_NORMAL); + if (BufferIsValid(bucketbuf)) + LockBufferForCleanup(bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) + { + Page writepage; + char *begin; + char *data; + Size datalen; + uint16 ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 1, &datalen); + + writepage = (Page) BufferGetPage(writebuf); + + if (xldata->ntups > 0) + { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_move_page_contents: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. 
+ */ + Assert(ninserted == xldata->ntups); + + PageSetLSN(writepage, lsn); + MarkBufferDirty(writebuf); + } + + /* replay the record for deleting entries from overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &deletebuf) == BLK_NEEDS_REDO) + { + Page page; + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 2, &len); + + page = (Page) BufferGetPage(deletebuf); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(deletebuf); + } + + /* + * Replay is complete, now we can release the buffers. We release locks at + * end of replay operation to ensure that we hold lock on primary bucket + * page till end of operation. We can optimize by releasing the lock on + * write buffer as soon as the operation for same is complete, if it is + * not same as primary bucket page, but that doesn't seem to be worth + * complicating the code. + */ + if (BufferIsValid(deletebuf)) + UnlockReleaseBuffer(deletebuf); + + if (BufferIsValid(writebuf)) + UnlockReleaseBuffer(writebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay squeeze page operation of hash index + */ +static void +hash_xlog_squeeze_page(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_squeeze_page *xldata = (xl_hash_squeeze_page *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer writebuf; + Buffer ovflbuf; + Buffer prevbuf = InvalidBuffer; + Buffer mapbuf; + XLogRedoAction action; + + /* + * Ensure to have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. 
If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. + */ + if (xldata->is_prim_bucket_same_wrt) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &writebuf); + else + { + RelFileNode rnode; + BlockNumber blkno; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + + bucketbuf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, + RBM_NORMAL); + if (BufferIsValid(bucketbuf)) + LockBufferForCleanup(bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &writebuf); + } + + /* replay the record for adding entries in overflow buffer */ + if (action == BLK_NEEDS_REDO) + { + Page writepage; + char *begin; + char *data; + Size datalen; + uint16 ninserted = 0; + + data = begin = XLogRecGetBlockData(record, 1, &datalen); + + writepage = (Page) BufferGetPage(writebuf); + + if (xldata->ntups > 0) + { + OffsetNumber *towrite = (OffsetNumber *) data; + + data += sizeof(OffsetNumber) * xldata->ntups; + + while (data - begin < datalen) + { + IndexTuple itup = (IndexTuple) data; + Size itemsz; + OffsetNumber l; + + itemsz = IndexTupleDSize(*itup); + itemsz = MAXALIGN(itemsz); + + data += itemsz; + + l = PageAddItem(writepage, (Item) itup, itemsz, towrite[ninserted], false, false); + if (l == InvalidOffsetNumber) + elog(ERROR, "hash_xlog_squeeze_page: failed to add item to hash index page, size %d bytes", + (int) itemsz); + + ninserted++; + } + } + + /* + * number of tuples inserted must be same as requested in REDO record. + */ + Assert(ninserted == xldata->ntups); + + /* + * if the page on which we are adding tuples is a page previous to freed + * overflow page, then update its nextblkno.
+ */ + if (xldata->is_prev_bucket_same_wrt) + { + HashPageOpaque writeopaque = (HashPageOpaque) PageGetSpecialPointer(writepage); + + writeopaque->hasho_nextblkno = xldata->nextblkno; + } + + PageSetLSN(writepage, lsn); + MarkBufferDirty(writebuf); + } + + /* replay the record for initializing overflow buffer */ + if (XLogReadBufferForRedo(record, 2, &ovflbuf) == BLK_NEEDS_REDO) + { + Page ovflpage; + + ovflpage = BufferGetPage(ovflbuf); + + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + + PageSetLSN(ovflpage, lsn); + MarkBufferDirty(ovflbuf); + } + if (BufferIsValid(ovflbuf)) + UnlockReleaseBuffer(ovflbuf); + + /* replay the record for page previous to the freed overflow page */ + if (!xldata->is_prev_bucket_same_wrt && + XLogReadBufferForRedo(record, 3, &prevbuf) == BLK_NEEDS_REDO) + { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + prevopaque->hasho_nextblkno = xldata->nextblkno; + + PageSetLSN(prevpage, lsn); + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(prevbuf)) + UnlockReleaseBuffer(prevbuf); + + /* replay the record for page next to the freed overflow page */ + if (XLogRecHasBlockRef(record, 4)) + { + Buffer nextbuf; + + if (XLogReadBufferForRedo(record, 4, &nextbuf) == BLK_NEEDS_REDO) + { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + nextopaque->hasho_prevblkno = xldata->prevblkno; + + PageSetLSN(nextpage, lsn); + MarkBufferDirty(nextbuf); + } + if (BufferIsValid(nextbuf)) + UnlockReleaseBuffer(nextbuf); + } + + /* replay the record for bitmap page */ + if (XLogReadBufferForRedo(record, 5, &mapbuf) == BLK_NEEDS_REDO) + { + Page mappage = (Page) BufferGetPage(mapbuf); + uint32 *freep = NULL; + char *data; + uint32 *bitmap_page_bit; + Size datalen; + + freep = HashPageGetBitmap(mappage); + + data = XLogRecGetBlockData(record, 5, &datalen); + bitmap_page_bit = (uint32 *) data; + + 
CLRBIT(freep, *bitmap_page_bit); + + PageSetLSN(mappage, lsn); + MarkBufferDirty(mapbuf); + } + if (BufferIsValid(mapbuf)) + UnlockReleaseBuffer(mapbuf); + + /* replay the record for meta page */ + if (XLogRecHasBlockRef(record, 6)) + { + Buffer metabuf; + + if (XLogReadBufferForRedo(record, 6, &metabuf) == BLK_NEEDS_REDO) + { + HashMetaPage metap; + Page page; + char *data; + uint32 *firstfree_ovflpage; + Size datalen; + + data = XLogRecGetBlockData(record, 6, &datalen); + firstfree_ovflpage = (uint32 *) data; + + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + metap->hashm_firstfree = *firstfree_ovflpage; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); + } + + /* + * We release locks on writebuf and bucketbuf at end of replay operation + * to ensure that we hold lock on primary bucket page till end of + * operation. We can optimize by releasing the lock on write buffer as + * soon as the operation for same is complete, if it is not same as + * primary bucket page, but that doesn't seem to be worth complicating the + * code. + */ + if (BufferIsValid(writebuf)) + UnlockReleaseBuffer(writebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay delete operation of hash index + */ +static void +hash_xlog_delete(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_delete *xldata = (xl_hash_delete *) XLogRecGetData(record); + Buffer bucketbuf = InvalidBuffer; + Buffer deletebuf; + Page page; + XLogRedoAction action; + + /* + * Ensure to have a cleanup lock on primary bucket page before we start + * with the actual replay operation. This is to ensure that neither a + * scan can start nor a scan can be already-in-progress during the replay + * of this operation. If we allow scans during this operation, then they + * can miss some records or show the same record multiple times. 
+ */ + if (xldata->is_primary_bucket_page) + action = XLogReadBufferForRedoExtended(record, 1, RBM_NORMAL, true, &deletebuf); + else + { + RelFileNode rnode; + BlockNumber blkno; + + XLogRecGetBlockTag(record, 0, &rnode, NULL, &blkno); + + bucketbuf = XLogReadBufferExtended(rnode, MAIN_FORKNUM, blkno, + RBM_NORMAL); + if (BufferIsValid(bucketbuf)) + LockBufferForCleanup(bucketbuf); + + action = XLogReadBufferForRedo(record, 1, &deletebuf); + } + + /* replay the record for deleting entries in bucket page */ + if (action == BLK_NEEDS_REDO) + { + char *ptr; + Size len; + + ptr = XLogRecGetBlockData(record, 1, &len); + + page = (Page) BufferGetPage(deletebuf); + + if (len > 0) + { + OffsetNumber *unused; + OffsetNumber *unend; + + unused = (OffsetNumber *) ptr; + unend = (OffsetNumber *) ((char *) ptr + len); + + if ((unend - unused) > 0) + PageIndexMultiDelete(page, unused, unend - unused); + } + + PageSetLSN(page, lsn); + MarkBufferDirty(deletebuf); + } + if (BufferIsValid(deletebuf)) + UnlockReleaseBuffer(deletebuf); + + if (BufferIsValid(bucketbuf)) + UnlockReleaseBuffer(bucketbuf); +} + +/* + * replay split cleanup flag operation for primary bucket page. 
+ */ +static void +hash_xlog_split_cleanup(XLogReaderState *record) +{ + XLogRecPtr lsn = record->EndRecPtr; + Buffer buffer; + Page page; + + if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO) + { + HashPageOpaque bucket_opaque; + + page = (Page) BufferGetPage(buffer); + + bucket_opaque = (HashPageOpaque) PageGetSpecialPointer(page); + bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP; + PageSetLSN(page, lsn); + MarkBufferDirty(buffer); + } + if (BufferIsValid(buffer)) + UnlockReleaseBuffer(buffer); +} + +/* + * replay for update meta page + */ +static void +hash_xlog_update_meta_page(XLogReaderState *record) +{ + HashMetaPage metap; + XLogRecPtr lsn = record->EndRecPtr; + xl_hash_update_meta_page *xldata = (xl_hash_update_meta_page *) XLogRecGetData(record); + Buffer metabuf; + Page page; + + if (XLogReadBufferForRedo(record, 0, &metabuf) == BLK_NEEDS_REDO) + { + page = BufferGetPage(metabuf); + metap = HashPageGetMeta(page); + + metap->hashm_ntuples = xldata->ntuples; + + PageSetLSN(page, lsn); + MarkBufferDirty(metabuf); + } + if (BufferIsValid(metabuf)) + UnlockReleaseBuffer(metabuf); +} + +void +hash_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_HASH_INIT_META_PAGE: + hash_xlog_init_meta_page(record); + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + hash_xlog_init_bitmap_page(record); + break; + case XLOG_HASH_INSERT: + hash_xlog_insert(record); + break; + case XLOG_HASH_ADD_OVFL_PAGE: + hash_xlog_addovflpage(record); + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + hash_xlog_split_allocpage(record); + break; + case XLOG_HASH_SPLIT_PAGE: + hash_xlog_split_page(record); + break; + case XLOG_HASH_SPLIT_COMPLETE: + hash_xlog_split_complete(record); + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + hash_xlog_move_page_contents(record); + break; + case XLOG_HASH_SQUEEZE_PAGE: + hash_xlog_squeeze_page(record); + break; + case XLOG_HASH_DELETE: + hash_xlog_delete(record); 
+ break; + case XLOG_HASH_SPLIT_CLEANUP: + hash_xlog_split_cleanup(record); + break; + case XLOG_HASH_UPDATE_META_PAGE: + hash_xlog_update_meta_page(record); + break; + default: + elog(PANIC, "hash_redo: unknown op code %u", info); + } +} diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c index 46df589..9e78b40 100644 --- a/src/backend/access/hash/hashinsert.c +++ b/src/backend/access/hash/hashinsert.c @@ -16,6 +16,8 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "miscadmin.h" #include "utils/rel.h" @@ -45,6 +47,7 @@ _hash_doinsert(Relation rel, IndexTuple itup) uint32 maxbucket; uint32 highmask; uint32 lowmask; + OffsetNumber itup_off; /* * Get the hash key for the item (it's stored in the index tuple itself). @@ -206,35 +209,63 @@ restart_insert: Assert(pageopaque->hasho_bucket == bucket); } - /* found page with enough space, so add the item here */ - (void) _hash_pgaddtup(rel, buf, itemsz, itup); - - /* - * dirty and release the modified page. if the page we modified was an - * overflow page, we also need to separately drop the pin we retained on - * the primary bucket page. - */ - MarkBufferDirty(buf); - _hash_relbuf(rel, buf); - if (buf != bucket_buf) - _hash_dropbuf(rel, bucket_buf); - /* * Write-lock the metapage so we can increment the tuple count. After * incrementing it, check to see if it's time for a split. */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + /* Do the update. 
No ereport(ERROR) until changes are logged */ + START_CRIT_SECTION(); + + /* found page with enough space, so add the item here */ + itup_off = _hash_pgaddtup(rel, buf, itemsz, itup); + MarkBufferDirty(buf); + + /* metapage operations */ metap->hashm_ntuples += 1; /* Make sure this stays in sync with _hash_expandtable() */ do_expand = metap->hashm_ntuples > (double) metap->hashm_ffactor * (metap->hashm_maxbucket + 1); - /* Write out the metapage and drop lock, but keep pin */ MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_insert xlrec; + XLogRecPtr recptr; + + xlrec.offnum = itup_off; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInsert); + + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + XLogRegisterBuffer(0, buf, REGBUF_STANDARD); + XLogRegisterBufData(0, (char *) itup, IndexTupleDSize(*itup)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INSERT); + + PageSetLSN(BufferGetPage(buf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock on metapage, but keep pin */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + /* + * Release the modified page and ensure to release the pin on primary + * page. + */ + _hash_relbuf(rel, buf); + if (buf != bucket_buf) + _hash_dropbuf(rel, bucket_buf); + /* Attempt to split if a split is needed */ if (do_expand) _hash_expandtable(rel, metabuf); @@ -275,3 +306,44 @@ _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup) return itup_off; } + +/* + * _hash_pgaddmultitup() -- add a tuple vector to a particular page in the + * index. + * + * This routine has same requirements for locking and tuple ordering as + * _hash_pgaddtup(). + * + * Returns the offset number array at which the tuples were inserted. 
+ */ +void +_hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, + OffsetNumber *itup_offsets, uint16 nitups) +{ + OffsetNumber itup_off; + Page page; + uint32 hashkey; + int i; + + _hash_checkpage(rel, buf, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); + page = BufferGetPage(buf); + + for (i = 0; i < nitups; i++) + { + Size itemsize; + + itemsize = IndexTupleDSize(*itups[i]); + itemsize = MAXALIGN(itemsize); + + /* Find where to insert the tuple (preserving page's hashkey ordering) */ + hashkey = _hash_get_indextuple_hashkey(itups[i]); + itup_off = _hash_binsearch(page, hashkey); + + itup_offsets[i] = itup_off; + + if (PageAddItem(page, (Item) itups[i], itemsize, itup_off, false, false) + == InvalidOffsetNumber) + elog(ERROR, "failed to add index item to \"%s\"", + RelationGetRelationName(rel)); + } +} diff --git a/src/backend/access/hash/hashovfl.c b/src/backend/access/hash/hashovfl.c index 6b106f3..c7c4922 100644 --- a/src/backend/access/hash/hashovfl.c +++ b/src/backend/access/hash/hashovfl.c @@ -18,10 +18,11 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" +#include "miscadmin.h" #include "utils/rel.h" -static Buffer _hash_getovflpage(Relation rel, Buffer metabuf); static uint32 _hash_firstfreebit(uint32 map); @@ -84,7 +85,9 @@ blkno_to_bitno(HashMetaPage metap, BlockNumber ovflblkno) * dropped before exiting (we assume the caller is not interested in 'buf' * anymore) if not asked to retain. The pin will be retained only for the * primary bucket. The returned overflow page will be pinned and - * write-locked; it is guaranteed to be empty. + * write-locked; it is not guaranteed to be empty as we release and reacquire + * lock on it, so the caller must ensure that it has required space + * in the page. * * The caller must hold a pin, but no lock, on the metapage buffer. * That buffer is returned in the same state.
@@ -102,13 +105,37 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) Page ovflpage; HashPageOpaque pageopaque; HashPageOpaque ovflopaque; - - /* allocate and lock an empty overflow page */ - ovflbuf = _hash_getovflpage(rel, metabuf); + HashMetaPage metap; + Buffer mapbuf = InvalidBuffer; + Buffer newmapbuf = InvalidBuffer; + BlockNumber blkno; + uint32 orig_firstfree; + uint32 splitnum; + uint32 *freep = NULL; + uint32 max_ovflpg; + uint32 bit; + uint32 bitmap_page_bit; + uint32 first_page; + uint32 last_bit; + uint32 last_page; + uint32 i, + j; + bool page_found = false; /* - * Write-lock the tail page. It is okay to hold two buffer locks here - * since there cannot be anyone else contending for access to ovflbuf. + * Write-lock the tail page. Here, we need to maintain locking order such + * that, first acquire the lock on tail page of bucket, then on meta page + * to find and lock the bitmap page and if it is found, then lock on meta + * page is released, then finally acquire the lock on new overflow buffer. + * We need this locking order to avoid deadlock with backends that are + * doing inserts. + * + * Note: We could have avoided locking many buffers here if we make two + * WAL records for acquiring an overflow page (one to allocate an overflow + * page and another to add it to overflow bucket chain). However, doing + * so can leak an overflow page, if the system crashes after allocation. + * Needless to say, it is better to have a single record from a + * performance point of view as well. 
*/ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); @@ -136,56 +163,6 @@ _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin) buf = _hash_getbuf(rel, nextblkno, HASH_WRITE, LH_OVERFLOW_PAGE); } - /* now that we have correct backlink, initialize new overflow page */ - ovflpage = BufferGetPage(ovflbuf); - ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); - ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); - ovflopaque->hasho_nextblkno = InvalidBlockNumber; - ovflopaque->hasho_bucket = pageopaque->hasho_bucket; - ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; - ovflopaque->hasho_page_id = HASHO_PAGE_ID; - - MarkBufferDirty(ovflbuf); - - /* logically chain overflow page to previous page */ - pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); - MarkBufferDirty(buf); - if ((pageopaque->hasho_flag & LH_BUCKET_PAGE) && retain_pin) - LockBuffer(buf, BUFFER_LOCK_UNLOCK); - else - _hash_relbuf(rel, buf); - - return ovflbuf; -} - -/* - * _hash_getovflpage() - * - * Find an available overflow page and return it. The returned buffer - * is pinned and write-locked, and has had _hash_pageinit() applied, - * but it is caller's responsibility to fill the special space. - * - * The caller must hold a pin, but no lock, on the metapage buffer. - * That buffer is left in the same state at exit. 
- */ -static Buffer -_hash_getovflpage(Relation rel, Buffer metabuf) -{ - HashMetaPage metap; - Buffer mapbuf = 0; - Buffer newbuf; - BlockNumber blkno; - uint32 orig_firstfree; - uint32 splitnum; - uint32 *freep = NULL; - uint32 max_ovflpg; - uint32 bit; - uint32 first_page; - uint32 last_bit; - uint32 last_page; - uint32 i, - j; - /* Get exclusive lock on the meta page */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); @@ -234,11 +211,31 @@ _hash_getovflpage(Relation rel, Buffer metabuf) for (; bit <= last_inpage; j++, bit += BITS_PER_MAP) { if (freep[j] != ALL_SET) + { + page_found = true; + + /* Reacquire exclusive lock on the meta page */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* convert bit to bit number within page */ + bit += _hash_firstfreebit(freep[j]); + bitmap_page_bit = bit; + + /* convert bit to absolute bit number */ + bit += (i << BMPG_SHIFT(metap)); + /* Calculate address of the recycled overflow page */ + blkno = bitno_to_blkno(metap, bit); + + /* Fetch and init the recycled page */ + ovflbuf = _hash_getinitbuf(rel, blkno); + goto found; + } } /* No free space here, try to advance to next map page */ _hash_relbuf(rel, mapbuf); + mapbuf = InvalidBuffer; i++; j = 0; /* scan from start of next map page */ bit = 0; @@ -262,8 +259,15 @@ _hash_getovflpage(Relation rel, Buffer metabuf) * convenient to pre-mark them as "in use" too. 
*/ bit = metap->hashm_spares[splitnum]; - _hash_initbitmap(rel, metap, bitno_to_blkno(metap, bit), MAIN_FORKNUM); - metap->hashm_spares[splitnum]++; + newmapbuf = _hash_getnewbuf(rel, bitno_to_blkno(metap, bit), MAIN_FORKNUM); + + /* add the new bitmap page to the metapage's list of bitmaps */ + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); } else { @@ -274,7 +278,8 @@ _hash_getovflpage(Relation rel, Buffer metabuf) } /* Calculate address of the new overflow page */ - bit = metap->hashm_spares[splitnum]; + bit = BufferIsValid(newmapbuf) ? + metap->hashm_spares[splitnum] + 1 : metap->hashm_spares[splitnum]; blkno = bitno_to_blkno(metap, bit); /* @@ -282,41 +287,51 @@ _hash_getovflpage(Relation rel, Buffer metabuf) * relation length stays in sync with ours. XXX It's annoying to do this * with metapage write lock held; would be better to use a lock that * doesn't block incoming searches. + * + * It is okay to hold two buffer locks here (one on tail page of bucket + * and other on new overflow page) since there cannot be anyone else + * contending for access to ovflbuf. */ - newbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); + ovflbuf = _hash_getnewbuf(rel, blkno, MAIN_FORKNUM); - metap->hashm_spares[splitnum]++; +found: /* - * Adjust hashm_firstfree to avoid redundant searches. But don't risk - * changing it if someone moved it while we were searching bitmap pages. + * Do the update. No ereport(ERROR) until changes are logged. We want to + * log the changes for bitmap page and overflow page together to avoid + * loss of pages in case the new page is added. 
*/ - if (metap->hashm_firstfree == orig_firstfree) - metap->hashm_firstfree = bit + 1; + START_CRIT_SECTION(); - /* Write updated metapage and release lock, but not pin */ - MarkBufferDirty(metabuf); - LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); - - return newbuf; - -found: - /* convert bit to bit number within page */ - bit += _hash_firstfreebit(freep[j]); + if (page_found) + { + Assert(BufferIsValid(mapbuf)); - /* mark page "in use" in the bitmap */ - SETBIT(freep, bit); - MarkBufferDirty(mapbuf); - _hash_relbuf(rel, mapbuf); + /* mark page "in use" in the bitmap */ + SETBIT(freep, bitmap_page_bit); + MarkBufferDirty(mapbuf); + } + else + { + /* update the count to indicate new overflow page is added */ + metap->hashm_spares[splitnum]++; - /* Reacquire exclusive lock on the meta page */ - LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + if (BufferIsValid(newmapbuf)) + { + _hash_initbitmapbuffer(newmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(newmapbuf); - /* convert bit to absolute bit number */ - bit += (i << BMPG_SHIFT(metap)); + metap->hashm_mapp[metap->hashm_nmaps] = BufferGetBlockNumber(newmapbuf); + metap->hashm_nmaps++; + metap->hashm_spares[splitnum]++; + MarkBufferDirty(metabuf); + } - /* Calculate address of the recycled overflow page */ - blkno = bitno_to_blkno(metap, bit); + /* + * for new overflow page, we don't need to explicitly set the bit in + * bitmap page, as by default that will be set to "in use". + */ + } /* * Adjust hashm_firstfree to avoid redundant searches. 
But don't risk @@ -325,19 +340,101 @@ found: if (metap->hashm_firstfree == orig_firstfree) { metap->hashm_firstfree = bit + 1; - - /* Write updated metapage and release lock, but not pin */ MarkBufferDirty(metabuf); - LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); } - else + + /* now that we have correct backlink, initialize new overflow page */ + ovflpage = BufferGetPage(ovflbuf); + ovflopaque = (HashPageOpaque) PageGetSpecialPointer(ovflpage); + ovflopaque->hasho_prevblkno = BufferGetBlockNumber(buf); + ovflopaque->hasho_nextblkno = InvalidBlockNumber; + ovflopaque->hasho_bucket = pageopaque->hasho_bucket; + ovflopaque->hasho_flag = LH_OVERFLOW_PAGE; + ovflopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(ovflbuf); + + /* logically chain overflow page to previous page */ + pageopaque->hasho_nextblkno = BufferGetBlockNumber(ovflbuf); + + MarkBufferDirty(buf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) { - /* We didn't change the metapage, so no need to write */ - LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + XLogRecPtr recptr; + xl_hash_addovflpage xlrec; + + xlrec.bmpage_found = page_found; + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashAddOvflPage); + + XLogRegisterBuffer(0, ovflbuf, REGBUF_WILL_INIT); + XLogRegisterBufData(0, (char *) &pageopaque->hasho_bucket, sizeof(Bucket)); + + XLogRegisterBuffer(1, buf, REGBUF_STANDARD); + + if (BufferIsValid(mapbuf)) + { + /* + * As bitmap page doesn't have standard page layout, so this will + * allow us to log the data. 
+ */ + XLogRegisterBuffer(2, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) &bitmap_page_bit, sizeof(uint32)); + } + + if (BufferIsValid(newmapbuf)) + XLogRegisterBuffer(3, newmapbuf, REGBUF_WILL_INIT); + + /* + * To replay meta page changes, we can log the entire metapage which + * doesn't seem advisable considering size of hash metapage or we can + * log the individual updated values which seems doable, but we prefer + * to perform exact operations on metapage during replay as are done + * during actual operation. That looks straight forward and has an + * advantage of using lesser space in WAL. + */ + XLogRegisterBuffer(4, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(4, (char *) &metap->hashm_firstfree, sizeof(uint32)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_ADD_OVFL_PAGE); + + PageSetLSN(BufferGetPage(ovflbuf), recptr); + PageSetLSN(BufferGetPage(buf), recptr); + + if (BufferIsValid(mapbuf)) + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (BufferIsValid(newmapbuf)) + PageSetLSN(BufferGetPage(newmapbuf), recptr); } - /* Fetch, init, and return the recycled page */ - return _hash_getinitbuf(rel, blkno); + END_CRIT_SECTION(); + + if ((pageopaque->hasho_flag & LH_BUCKET_PAGE) && retain_pin) + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, buf); + + if (BufferIsValid(mapbuf)) + _hash_relbuf(rel, mapbuf); + + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + if (BufferIsValid(newmapbuf)) + _hash_relbuf(rel, newmapbuf); + + /* + * we need to release and reacquire the lock on overflow buffer to ensure + * that standby shouldn't see an intermediate state of it. + */ + LockBuffer(ovflbuf, BUFFER_LOCK_UNLOCK); + LockBuffer(ovflbuf, BUFFER_LOCK_EXCLUSIVE); + + return ovflbuf; } /* @@ -370,6 +467,12 @@ _hash_firstfreebit(uint32 map) * Remove this overflow page from its bucket's chain, and mark the page as * free. On entry, ovflbuf is write-locked; it is released before exiting. 
* + * Add the tuples (itups) to wbuf in this function, we could do that in the + * caller as well. The advantage of doing it here is we can easily write + * the WAL for XLOG_HASH_SQUEEZE_PAGE operation. Addition of tuples and + * removal of overflow page has to be done as an atomic operation, otherwise + * during replay on standby users might find duplicate records. + * * Since this function is invoked in VACUUM, we provide an access strategy * parameter that controls fetches of the bucket pages. * @@ -382,13 +485,14 @@ _hash_firstfreebit(uint32 map) * has a lock on same. */ BlockNumber -_hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, +_hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, + Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, + Size *tups_size, uint16 nitups, BufferAccessStrategy bstrategy) { HashMetaPage metap; Buffer metabuf; Buffer mapbuf; - Buffer prevbuf = InvalidBuffer; BlockNumber ovflblkno; BlockNumber prevblkno; BlockNumber blkno; @@ -402,6 +506,9 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, int32 bitmappage, bitmapbit; Bucket bucket PG_USED_FOR_ASSERTS_ONLY; + Buffer prevbuf = InvalidBuffer; + Buffer nextbuf = InvalidBuffer; + bool update_metap = false; /* Get information from the doomed page */ _hash_checkpage(rel, ovflbuf, LH_OVERFLOW_PAGE); @@ -414,15 +521,6 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, bucket = ovflopaque->hasho_bucket; /* - * Zero the page for debugging's sake; then write and release it. (Note: - * if we failed to zero the page here, we'd have problems with the Assert - * in _hash_pageinit() when the page is reused.) - */ - MemSet(ovflpage, 0, BufferGetPageSize(ovflbuf)); - MarkBufferDirty(ovflbuf); - _hash_relbuf(rel, ovflbuf); - - /* * Fix up the bucket chain. this is a doubly-linked list, so we must fix * up the bucket chain members behind and ahead of the overflow page being * deleted.
Concurrency issues are avoided by using lock chaining as @@ -430,8 +528,6 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, */ if (BlockNumberIsValid(prevblkno)) { - Page prevpage; - HashPageOpaque prevopaque; if (prevblkno == writeblkno) prevbuf = wbuf; @@ -441,32 +537,13 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, HASH_WRITE, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE, bstrategy); - - prevpage = BufferGetPage(prevbuf); - prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); - - Assert(prevopaque->hasho_bucket == bucket); - prevopaque->hasho_nextblkno = nextblkno; - - MarkBufferDirty(prevbuf); - if (prevblkno != writeblkno) - _hash_relbuf(rel, prevbuf); } if (BlockNumberIsValid(nextblkno)) - { - Buffer nextbuf = _hash_getbuf_with_strategy(rel, - nextblkno, - HASH_WRITE, - LH_OVERFLOW_PAGE, - bstrategy); - Page nextpage = BufferGetPage(nextbuf); - HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); - - Assert(nextopaque->hasho_bucket == bucket); - nextopaque->hasho_prevblkno = prevblkno; - MarkBufferDirty(nextbuf); - _hash_relbuf(rel, nextbuf); - } + nextbuf = _hash_getbuf_with_strategy(rel, + nextblkno, + HASH_WRITE, + LH_OVERFLOW_PAGE, + bstrategy); /* Note: bstrategy is intentionally not used for metapage and bitmap */ @@ -487,62 +564,184 @@ _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, /* Release metapage lock while we access the bitmap page */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); - /* Clear the bitmap bit to indicate that this overflow page is free */ + /* read the bitmap page to clear the bitmap bit */ mapbuf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BITMAP_PAGE); mappage = BufferGetPage(mapbuf); freep = HashPageGetBitmap(mappage); Assert(ISSET(freep, bitmapbit)); - CLRBIT(freep, bitmapbit); - MarkBufferDirty(mapbuf); - _hash_relbuf(rel, mapbuf); /* Get write-lock on metapage to update firstfree */ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + /* This operation needs to log 
multiple tuples, prepare WAL for that */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(HASH_XLOG_FREE_OVFL_BUFS, XLR_NORMAL_RDATAS + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being careful to preserve + * hashkey ordering. (If we insert many tuples into the same "write" page + * it would be worth qsort'ing them). + */ + if (nitups > 0) + { + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + } + + /* + * Initialise the freed overflow page, here we can't completely zero the + * page as WAL replay routines expect pages to be initialized. See + * explanation of RBM_NORMAL mode atop XLogReadBufferExtended. + */ + _hash_pageinit(ovflpage, BufferGetPageSize(ovflbuf)); + MarkBufferDirty(ovflbuf); + + if (BufferIsValid(prevbuf)) + { + Page prevpage = BufferGetPage(prevbuf); + HashPageOpaque prevopaque = (HashPageOpaque) PageGetSpecialPointer(prevpage); + + Assert(prevopaque->hasho_bucket == bucket); + prevopaque->hasho_nextblkno = nextblkno; + MarkBufferDirty(prevbuf); + } + if (BufferIsValid(nextbuf)) + { + Page nextpage = BufferGetPage(nextbuf); + HashPageOpaque nextopaque = (HashPageOpaque) PageGetSpecialPointer(nextpage); + + Assert(nextopaque->hasho_bucket == bucket); + nextopaque->hasho_prevblkno = prevblkno; + MarkBufferDirty(nextbuf); + } + + /* Clear the bitmap bit to indicate that this overflow page is free */ + CLRBIT(freep, bitmapbit); + MarkBufferDirty(mapbuf); + + /* if this is now the first free page, update hashm_firstfree */ if (ovflbitno < metap->hashm_firstfree) { metap->hashm_firstfree = ovflbitno; + update_metap = true; MarkBufferDirty(metabuf); } - _hash_relbuf(rel, metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_squeeze_page xlrec; + XLogRecPtr recptr; + int i; + + xlrec.prevblkno = prevblkno; + xlrec.nextblkno = nextblkno; + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucketbuf) ? 
true : false; + xlrec.is_prev_bucket_same_wrt = (wbuf == prevbuf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashSqueezePage); + + /* + * bucket buffer needs to be registered to ensure that we can acquire + * a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucketbuf, REGBUF_STANDARD); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + if (xlrec.ntups > 0) + { + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + } + + XLogRegisterBuffer(2, ovflbuf, REGBUF_STANDARD); + + /* + * If prevpage and the writepage (block in which we are moving tuples + * from overflow) are same, then no need to separately register + * prevpage. During replay, we can directly update the nextblock in + * writepage. + */ + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + XLogRegisterBuffer(3, prevbuf, REGBUF_STANDARD); + + if (BufferIsValid(nextbuf)) + XLogRegisterBuffer(4, nextbuf, REGBUF_STANDARD); + + XLogRegisterBuffer(5, mapbuf, REGBUF_STANDARD); + XLogRegisterBufData(5, (char *) &bitmapbit, sizeof(uint32)); + + if (update_metap) + { + XLogRegisterBuffer(6, metabuf, REGBUF_STANDARD); + XLogRegisterBufData(6, (char *) &metap->hashm_firstfree, sizeof(uint32)); + } + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SQUEEZE_PAGE); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(ovflbuf), recptr); + + if (BufferIsValid(prevbuf) && !xlrec.is_prev_bucket_same_wrt) + PageSetLSN(BufferGetPage(prevbuf), recptr); + if (BufferIsValid(nextbuf)) + PageSetLSN(BufferGetPage(nextbuf), recptr); + + PageSetLSN(BufferGetPage(mapbuf), recptr); + + if (update_metap) + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* release previous bucket if it is not same as write bucket */ + if (BufferIsValid(prevbuf) && prevblkno != writeblkno) + 
_hash_relbuf(rel, prevbuf); + + if (BufferIsValid(ovflbuf)) + _hash_relbuf(rel, ovflbuf); + + if (BufferIsValid(nextbuf)) + _hash_relbuf(rel, nextbuf); + + if (BufferIsValid(mapbuf)) + _hash_relbuf(rel, mapbuf); + + if (BufferIsValid(metabuf)) + _hash_relbuf(rel, metabuf); return nextblkno; } /* - * _hash_initbitmap() + * _hash_initbitmapbuffer() * - * Initialize a new bitmap page. The metapage has a write-lock upon - * entering the function, and must be written by caller after return. - * - * 'blkno' is the block number of the new bitmap page. - * - * All bits in the new bitmap page are set to "1", indicating "in use". + * Initialize a new bitmap page. All bits in the new bitmap page are set to + * "1", indicating "in use". */ void -_hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, - ForkNumber forkNum) +_hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage) { - Buffer buf; Page pg; HashPageOpaque op; uint32 *freep; - /* - * It is okay to write-lock the new bitmap page while holding metapage - * write lock, because no one else could be contending for the new page. - * Also, the metapage lock makes it safe to extend the index using - * _hash_getnewbuf. - * - * There is some loss of concurrency in possibly doing I/O for the new - * page while holding the metapage lock, but this path is taken so seldom - * that it's not worth worrying about. 
- */ - buf = _hash_getnewbuf(rel, blkno, forkNum); pg = BufferGetPage(buf); + /* initialize the page */ + if (initpage) + _hash_pageinit(pg, BufferGetPageSize(buf)); + /* initialize the page's special space */ op = (HashPageOpaque) PageGetSpecialPointer(pg); op->hasho_prevblkno = InvalidBlockNumber; @@ -553,23 +752,14 @@ _hash_initbitmap(Relation rel, HashMetaPage metap, BlockNumber blkno, /* set all of the bits to 1 */ freep = HashPageGetBitmap(pg); - MemSet(freep, 0xFF, BMPGSZ_BYTE(metap)); - - /* dirty the new bitmap page, and release write lock and pin */ - MarkBufferDirty(buf); - _hash_relbuf(rel, buf); + MemSet(freep, 0xFF, bmsize); - /* add the new bitmap page to the metapage's list of bitmaps */ - /* metapage already has a write lock */ - if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) - ereport(ERROR, - (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("out of overflow pages in hash index \"%s\"", - RelationGetRelationName(rel)))); - - metap->hashm_mapp[metap->hashm_nmaps] = blkno; - - metap->hashm_nmaps++; + /* + * Set pd_lower just past the end of the bitmap page data. We could even + * set pd_lower equal to pd_upper, but this is more precise and makes the + * page look compressible to xlog.c. + */ + ((PageHeader) pg)->pd_lower = ((char *) freep + bmsize) - (char *) pg; } @@ -619,7 +809,6 @@ _hash_squeezebucket(Relation rel, Page rpage; HashPageOpaque wopaque; HashPageOpaque ropaque; - bool wbuf_dirty; /* * start squeezing into the primary bucket page. @@ -665,15 +854,23 @@ _hash_squeezebucket(Relation rel, /* * squeeze the tuples. 
*/ - wbuf_dirty = false; for (;;) { OffsetNumber roffnum; OffsetNumber maxroffnum; OffsetNumber deletable[MaxOffsetNumber]; - int ndeletable = 0; + IndexTuple itups[MaxIndexTuplesPerPage]; + Size tups_size[MaxIndexTuplesPerPage]; + OffsetNumber *itup_offsets; + uint16 ndeletable = 0; + uint16 nitups = 0; + Size all_tups_size = 0; + int i; bool retain_pin = false; + itup_offsets = (OffsetNumber *) palloc(MaxIndexTuplesPerPage * sizeof(OffsetNumber)); + +readpage: /* Scan each tuple in "read" page */ maxroffnum = PageGetMaxOffsetNumber(rpage); for (roffnum = FirstOffsetNumber; @@ -694,11 +891,13 @@ _hash_squeezebucket(Relation rel, /* * Walk up the bucket chain, looking for a page big enough for - * this item. Exit if we reach the read page. + * this item and all other accumulated items. Exit if we reach + * the read page. */ - while (PageGetFreeSpace(wpage) < itemsz) + while (PageGetFreeSpaceForMulTups(wpage, nitups + 1) < (all_tups_size + itemsz)) { Buffer next_wbuf = InvalidBuffer; + bool tups_moved = false; Assert(!PageIsEmpty(wpage)); @@ -715,50 +914,130 @@ _hash_squeezebucket(Relation rel, HASH_WRITE, LH_OVERFLOW_PAGE, bstrategy); + if (nitups > 0) + { + Assert(nitups == ndeletable); + + /* + * This operation needs to log multiple tuples, prepare + * WAL for that. + */ + if (RelationNeedsWAL(rel)) + XLogEnsureRecordSpace(0, XLR_NORMAL_RDATAS + nitups); + + START_CRIT_SECTION(); + + /* + * we have to insert tuples on the "write" page, being + * careful to preserve hashkey ordering. (If we insert + * many tuples into the same "write" page it would be + * worth qsort'ing them). 
+ */ + _hash_pgaddmultitup(rel, wbuf, itups, itup_offsets, nitups); + MarkBufferDirty(wbuf); + + /* Delete tuples we already moved off read page */ + PageIndexMultiDelete(rpage, deletable, ndeletable); + MarkBufferDirty(rbuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_move_page_contents xlrec; + + xlrec.ntups = nitups; + xlrec.is_prim_bucket_same_wrt = (wbuf == bucket_buf) ? true : false; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashMovePageContents); + + /* + * bucket buffer needs to be registered to ensure that + * we can acquire a cleanup lock on it during replay. + */ + if (!xlrec.is_prim_bucket_same_wrt) + XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD); + + XLogRegisterBuffer(1, wbuf, REGBUF_STANDARD); + XLogRegisterBufData(1, (char *) itup_offsets, + nitups * sizeof(OffsetNumber)); + for (i = 0; i < nitups; i++) + XLogRegisterBufData(1, (char *) itups[i], tups_size[i]); + + XLogRegisterBuffer(2, rbuf, REGBUF_STANDARD); + XLogRegisterBufData(2, (char *) deletable, + ndeletable * sizeof(OffsetNumber)); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_MOVE_PAGE_CONTENTS); + + PageSetLSN(BufferGetPage(wbuf), recptr); + PageSetLSN(BufferGetPage(rbuf), recptr); + } + + END_CRIT_SECTION(); + + tups_moved = true; + } /* * release the lock on previous page after acquiring the lock * on next page */ - if (wbuf_dirty) - MarkBufferDirty(wbuf); if (retain_pin) LockBuffer(wbuf, BUFFER_LOCK_UNLOCK); else _hash_relbuf(rel, wbuf); + /* + * We need to release and if required reacquire the lock on + * rbuf to ensure that standby shouldn't see an intermediate + * state of it. If we don't release the lock, after replay of + * XLOG_HASH_SQUEEZE_PAGE on standby users will be able to + * view the results of partial deletion on rblkno. 
+ */ + LockBuffer(rbuf, BUFFER_LOCK_UNLOCK); + /* nothing more to do if we reached the read page */ if (rblkno == wblkno) { - if (ndeletable > 0) - { - /* Delete tuples we already moved off read page */ - PageIndexMultiDelete(rpage, deletable, ndeletable); - MarkBufferDirty(rbuf); - } - _hash_relbuf(rel, rbuf); + _hash_dropbuf(rel, rbuf); return; } + LockBuffer(rbuf, BUFFER_LOCK_EXCLUSIVE); + wbuf = next_wbuf; wpage = BufferGetPage(wbuf); wopaque = (HashPageOpaque) PageGetSpecialPointer(wpage); Assert(wopaque->hasho_bucket == bucket); - wbuf_dirty = false; retain_pin = false; - } - /* - * we have found room so insert on the "write" page, being careful - * to preserve hashkey ordering. (If we insert many tuples into - * the same "write" page it would be worth qsort'ing instead of - * doing repeated _hash_pgaddtup.) - */ - (void) _hash_pgaddtup(rel, wbuf, itemsz, itup); - wbuf_dirty = true; + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); + nitups = 0; + all_tups_size = 0; + ndeletable = 0; + /* + * after moving the tuples, rpage would have been compacted, + * so we need to rescan it. + */ + if (tups_moved) + goto readpage; + } /* remember tuple for deletion from "read" page */ deletable[ndeletable++] = roffnum; + + /* + * we need a copy of index tuples as they can be freed as part of + * overflow page, however we need them to write a WAL record in + * _hash_freeovflpage. + */ + itups[nitups] = CopyIndexTuple(itup); + tups_size[nitups++] = itemsz; + all_tups_size += itemsz; } /* @@ -776,10 +1055,14 @@ _hash_squeezebucket(Relation rel, Assert(BlockNumberIsValid(rblkno)); /* free this overflow page (releases rbuf) */ - _hash_freeovflpage(rel, rbuf, wbuf, bstrategy); + _hash_freeovflpage(rel, bucket_buf, rbuf, wbuf, itups, itup_offsets, + tups_size, nitups, bstrategy); + + /* be tidy */ + for (i = 0; i < nitups; i++) + pfree(itups[i]); - if (wbuf_dirty) - MarkBufferDirty(wbuf); + pfree(itup_offsets); /* are we freeing the page adjacent to wbuf? 
*/ if (rblkno == wblkno) diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 45e184c..d7eb8c7 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -29,6 +29,7 @@ #include "postgres.h" #include "access/hash.h" +#include "access/hash_xlog.h" #include "miscadmin.h" #include "storage/lmgr.h" #include "storage/smgr.h" @@ -40,12 +41,11 @@ static void _hash_splitbucket(Relation rel, Buffer metabuf, Bucket obucket, Bucket nbucket, Buffer obuf, Buffer nbuf, + HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask); -static void _hash_splitbucket_guts(Relation rel, Buffer metabuf, - Bucket obucket, Bucket nbucket, Buffer obuf, - Buffer nbuf, HTAB *htab, uint32 maxbucket, - uint32 highmask, uint32 lowmask); +static void + log_split_page(Relation rel, Buffer buf); /* @@ -160,6 +160,29 @@ _hash_getinitbuf(Relation rel, BlockNumber blkno) } /* + * _hash_initbuf() -- Initialize the page of a caller-supplied buffer for the given bucket number. + */ +void +_hash_initbuf(Buffer buf, uint32 num_bucket, uint32 flag, bool initpage) +{ + HashPageOpaque pageopaque; + Page page; + + page = BufferGetPage(buf); + + /* initialize the page */ + if (initpage) + _hash_pageinit(page, BufferGetPageSize(buf)); + + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); + pageopaque->hasho_prevblkno = InvalidBlockNumber; + pageopaque->hasho_nextblkno = InvalidBlockNumber; + pageopaque->hasho_bucket = num_bucket; + pageopaque->hasho_flag = flag; + pageopaque->hasho_page_id = HASHO_PAGE_ID; +} + +/* * _hash_getnewbuf() -- Get a new page at the end of the index. * * This has the same API as _hash_getinitbuf, except that we are adding @@ -291,7 +314,7 @@ _hash_dropscanbuf(Relation rel, HashScanOpaque so) /* - * _hash_metapinit() -- Initialize the metadata page of a hash index, + * _hash_init() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. 
* * The initial number of buckets is dependent on num_tuples, an estimate @@ -303,19 +326,18 @@ _hash_dropscanbuf(Relation rel, HashScanOpaque so) * multiple buffer locks is ignored. */ uint32 -_hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) +_hash_init(Relation rel, double num_tuples, ForkNumber forkNum) { - HashMetaPage metap; - HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; + Buffer bitmapbuf; Page pg; + HashMetaPage metap; + RegProcedure procid; int32 data_width; int32 item_width; int32 ffactor; - double dnumbuckets; uint32 num_buckets; - uint32 log2_num_buckets; uint32 i; /* safety check */ @@ -337,6 +359,154 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) if (ffactor < 10) ffactor = 10; + procid = index_getprocid(rel, 1, HASHPROC); + + /* + * We initialize the metapage, the first N bucket pages, and the first + * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() + * calls to occur. This ensures that the smgr level has the right idea of + * the physical index length. + * + * Critical section not required, because on error the creation of the + * whole relation will be rolled back. + */ + metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); + _hash_init_metabuffer(metabuf, num_tuples, procid, ffactor, false); + MarkBufferDirty(metabuf); + + pg = BufferGetPage(metabuf); + metap = HashPageGetMeta(pg); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_init_meta_page xlrec; + XLogRecPtr recptr; + + /* + * Here, we could have copied the entire metapage as well to WAL + * record and restore it as it is during replay. However the size of + * metapage is not small (more than 600 bytes), so recording just the + * information required to construct metapage. 
+ */ + xlrec.num_tuples = num_tuples; + xlrec.procid = metap->hashm_procid; + xlrec.ffactor = metap->hashm_ffactor; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitMetaPage); + XLogRegisterBuffer(0, metabuf, REGBUF_WILL_INIT); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_META_PAGE); + + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + num_buckets = metap->hashm_maxbucket + 1; + + /* + * Release buffer lock on the metapage while we initialize buckets. + * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS + * won't accomplish anything. It's a bad idea to hold buffer locks for + * long intervals in any case, since that can block the bgwriter. + */ + LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + + /* + * Initialize and WAL Log the first N buckets + */ + for (i = 0; i < num_buckets; i++) + { + BlockNumber blkno; + + /* Allow interrupts, in case N is huge */ + CHECK_FOR_INTERRUPTS(); + + blkno = BUCKET_TO_BLKNO(metap, i); + buf = _hash_getnewbuf(rel, blkno, forkNum); + _hash_initbuf(buf, i, LH_BUCKET_PAGE, false); + MarkBufferDirty(buf); + + log_newpage(&rel->rd_node, + forkNum, + blkno, + BufferGetPage(buf), + true); + _hash_relbuf(rel, buf); + } + + /* Now reacquire buffer lock on metapage */ + LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); + + /* + * Initialize bitmap page + */ + bitmapbuf = _hash_getnewbuf(rel, num_buckets + 1, forkNum); + _hash_initbitmapbuffer(bitmapbuf, metap->hashm_bmsize, false); + MarkBufferDirty(bitmapbuf); + + /* add the new bitmap page to the metapage's list of bitmaps */ + /* metapage already has a write lock */ + if (metap->hashm_nmaps >= HASH_MAX_BITMAPS) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("out of overflow pages in hash index \"%s\"", + RelationGetRelationName(rel)))); + + metap->hashm_mapp[metap->hashm_nmaps] = num_buckets + 1; + + metap->hashm_nmaps++; + MarkBufferDirty(metabuf); + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + 
xl_hash_init_bitmap_page xlrec; + XLogRecPtr recptr; + + xlrec.bmsize = metap->hashm_bmsize; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, SizeOfHashInitBitmapPage); + XLogRegisterBuffer(0, bitmapbuf, REGBUF_WILL_INIT); + + /* + * We can log the exact changes made to meta page, however as no + * concurrent operation could see the index during the replay of this + * record, we can perform the operations during replay as they are + * done here. + */ + XLogRegisterBuffer(1, metabuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_INIT_BITMAP_PAGE); + + PageSetLSN(BufferGetPage(bitmapbuf), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + /* all done */ + _hash_relbuf(rel, bitmapbuf); + _hash_relbuf(rel, metabuf); + + return num_buckets; +} + +/* + * _hash_init_metabuffer() -- Initialize the metadata page of a hash index. + */ +void +_hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, + uint16 ffactor, bool initpage) +{ + HashMetaPage metap; + HashPageOpaque pageopaque; + Page page; + double dnumbuckets; + uint32 num_buckets; + uint32 log2_num_buckets; + uint32 i; + + /* * Choose the number of initial bucket pages to match the fill factor * given the estimated number of tuples. We round up the result to the @@ -356,30 +526,25 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) Assert(num_buckets == (((uint32) 1) << log2_num_buckets)); Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); - /* - * We initialize the metapage, the first N bucket pages, and the first - * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() - * calls to occur. This ensures that the smgr level has the right idea of - * the physical index length. 
- */ - metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); - pg = BufferGetPage(metabuf); + page = BufferGetPage(buf); + if (initpage) + _hash_pageinit(page, BufferGetPageSize(buf)); - pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); + pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; - metap = HashPageGetMeta(pg); + metap = HashPageGetMeta(page); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; - metap->hashm_bsize = HashGetMaxBitmapSize(pg); + metap->hashm_bsize = HashGetMaxBitmapSize(page); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { @@ -396,7 +561,7 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ - metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); + metap->hashm_procid = procid; /* * We initialize the index with N buckets, 0 .. N-1, occupying physical @@ -415,47 +580,11 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) metap->hashm_firstfree = 0; /* - * Release buffer lock on the metapage while we initialize buckets. - * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS - * won't accomplish anything. It's a bad idea to hold buffer locks for - * long intervals in any case, since that can block the bgwriter. 
- */ - MarkBufferDirty(metabuf); - LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); - - /* - * Initialize the first N buckets - */ - for (i = 0; i < num_buckets; i++) - { - /* Allow interrupts, in case N is huge */ - CHECK_FOR_INTERRUPTS(); - - buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); - pg = BufferGetPage(buf); - pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); - pageopaque->hasho_prevblkno = InvalidBlockNumber; - pageopaque->hasho_nextblkno = InvalidBlockNumber; - pageopaque->hasho_bucket = i; - pageopaque->hasho_flag = LH_BUCKET_PAGE; - pageopaque->hasho_page_id = HASHO_PAGE_ID; - MarkBufferDirty(buf); - _hash_relbuf(rel, buf); - } - - /* Now reacquire buffer lock on metapage */ - LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE); - - /* - * Initialize first bitmap page + * Set pd_lower just past the end of the metadata. This is to log + * full_page_image of metapage in xloginsert.c. */ - _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); - - /* all done */ - MarkBufferDirty(metabuf); - _hash_relbuf(rel, metabuf); - - return num_buckets; + ((PageHeader) page)->pd_lower = + ((char *) metap + sizeof(HashMetaPageData)) - (char *) page; } /* @@ -464,7 +593,6 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) void _hash_pageinit(Page page, Size size) { - Assert(PageIsNew(page)); PageInit(page, size, sizeof(HashPageOpaqueData)); } @@ -492,10 +620,14 @@ _hash_expandtable(Relation rel, Buffer metabuf) Buffer buf_nblkno; Buffer buf_oblkno; Page opage; + Page npage; HashPageOpaque oopaque; + HashPageOpaque nopaque; uint32 maxbucket; uint32 highmask; uint32 lowmask; + bool metap_update_masks = false; + bool metap_update_splitpoint = false; restart_expand: @@ -531,7 +663,7 @@ restart_expand: * than a disk block then this would be an independent constraint. * * If you change this, see also the maximum initial number of buckets in - * _hash_metapinit(). + * _hash_init(). 
*/ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; @@ -655,7 +787,11 @@ restart_expand: * The number of buckets in the new splitpoint is equal to the total * number already in existence, i.e. new_bucket. Currently this maps * one-to-one to blocks required, but someday we may need a more - * complicated calculation here. + * complicated calculation here. We treat allocation of buckets as a + * separate WAL action. Even if we fail after this operation, it + * won't be a leak on standby, as next split will consume this space. + * In any case, even without failure we don't use all the space in one + * split operation. */ if (!_hash_alloc_buckets(rel, start_nblkno, new_bucket)) { @@ -680,18 +816,43 @@ restart_expand: goto fail; } - /* - * Okay to proceed with split. Update the metapage bucket mapping info. - * - * Since we are scribbling on the metapage data right in the shared - * buffer, any failure in this next little bit leaves us with a big + * Since we are scribbling on the pages in the shared buffers, establish a + * critical section. Any failure in this next code leaves us with a big * problem: the metapage is effectively corrupt but could get written back - * to disk. We don't really expect any failure, but just to be sure, - * establish a critical section. + * to disk. */ START_CRIT_SECTION(); + opage = BufferGetPage(buf_oblkno); + oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); + + /* + * Mark the old bucket to indicate that split is in progress. At + * operation end, we clear split-in-progress flag. + */ + oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; + + MarkBufferDirty(buf_oblkno); + + npage = BufferGetPage(buf_nblkno); + + /* + * initialize the new bucket's primary page and mark it to indicate that + * split is in progress. 
+ */ + nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + nopaque->hasho_prevblkno = InvalidBlockNumber; + nopaque->hasho_nextblkno = InvalidBlockNumber; + nopaque->hasho_bucket = new_bucket; + nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; + nopaque->hasho_page_id = HASHO_PAGE_ID; + + MarkBufferDirty(buf_nblkno); + + /* + * Okay to proceed with split. Update the metapage bucket mapping info. + */ metap->hashm_maxbucket = new_bucket; if (new_bucket > metap->hashm_highmask) @@ -699,6 +860,7 @@ restart_expand: /* Starting a new doubling */ metap->hashm_lowmask = metap->hashm_highmask; metap->hashm_highmask = new_bucket | metap->hashm_lowmask; + metap_update_masks = true; } /* @@ -711,10 +873,10 @@ restart_expand: { metap->hashm_spares[spare_ndx] = metap->hashm_spares[metap->hashm_ovflpoint]; metap->hashm_ovflpoint = spare_ndx; + metap_update_splitpoint = true; } - /* Done mucking with metapage */ - END_CRIT_SECTION(); + MarkBufferDirty(metabuf); /* * Copy bucket mapping info now; this saves re-accessing the meta page @@ -727,16 +889,71 @@ restart_expand: highmask = metap->hashm_highmask; lowmask = metap->hashm_lowmask; - /* Write out the metapage and drop lock, but keep pin */ - MarkBufferDirty(metabuf); + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_hash_split_allocpage xlrec; + XLogRecPtr recptr; + + xlrec.new_bucket = maxbucket; + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + xlrec.flags = 0; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf_oblkno, REGBUF_STANDARD); + XLogRegisterBuffer(1, buf_nblkno, REGBUF_WILL_INIT); + XLogRegisterBuffer(2, metabuf, REGBUF_STANDARD); + + if (metap_update_masks) + { + xlrec.flags |= XLH_SPLIT_META_UPDATE_MASKS; + XLogRegisterBufData(2, (char *) &metap->hashm_lowmask, sizeof(uint32)); + XLogRegisterBufData(2, (char *) &metap->hashm_highmask, sizeof(uint32)); + } + + if (metap_update_splitpoint) + { + xlrec.flags |= 
XLH_SPLIT_META_UPDATE_SPLITPOINT; + XLogRegisterBufData(2, (char *) &metap->hashm_ovflpoint, + sizeof(uint32)); + XLogRegisterBufData(2, + (char *) &metap->hashm_spares[metap->hashm_ovflpoint], + sizeof(uint32)); + } + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitAllocPage); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_ALLOCATE_PAGE); + + PageSetLSN(BufferGetPage(buf_oblkno), recptr); + PageSetLSN(BufferGetPage(buf_nblkno), recptr); + PageSetLSN(BufferGetPage(metabuf), recptr); + } + + END_CRIT_SECTION(); + + /* drop lock, but keep pin */ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK); + /* + * we need to release and reacquire the lock on new block buffer to ensure + * that standby shouldn't see an intermediate state of it. + */ + LockBuffer(buf_nblkno, BUFFER_LOCK_UNLOCK); + LockBuffer(buf_nblkno, BUFFER_LOCK_EXCLUSIVE); + /* Relocate records to the new bucket */ _hash_splitbucket(rel, metabuf, old_bucket, new_bucket, - buf_oblkno, buf_nblkno, + buf_oblkno, buf_nblkno, NULL, maxbucket, highmask, lowmask); + /* all done, now release the locks and pins on primary buckets. */ + _hash_relbuf(rel, buf_oblkno); + _hash_relbuf(rel, buf_nblkno); + return; /* Here if decide not to split or fail to acquire old bucket lock */ @@ -776,6 +993,7 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) { BlockNumber lastblock; char zerobuf[BLCKSZ]; + Page page; lastblock = firstblock + nblocks - 1; @@ -786,7 +1004,21 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) if (lastblock < firstblock || lastblock == InvalidBlockNumber) return false; - MemSet(zerobuf, 0, sizeof(zerobuf)); + page = (Page) zerobuf; + + /* + * Initialise the new bucket page, here we can't completely zero the page + * as WAL replay routines expect pages to be initialized. See explanation + * of RBM_NORMAL mode atop XLogReadBufferExtended. 
+ */ + _hash_pageinit(page, BLCKSZ); + + if (RelationNeedsWAL(rel)) + log_newpage(&rel->rd_node, + MAIN_FORKNUM, + lastblock, + zerobuf, + true); RelationOpenSmgr(rel); smgrextend(rel->rd_smgr, MAIN_FORKNUM, lastblock, zerobuf, false); @@ -794,14 +1026,19 @@ _hash_alloc_buckets(Relation rel, BlockNumber firstblock, uint32 nblocks) return true; } - /* * _hash_splitbucket -- split 'obucket' into 'obucket' and 'nbucket' * + * This routine is used to partition the tuples between old and new bucket and + * is used to finish the incomplete split operations. To finish the previously + * interrupted split operation, the caller needs to fill htab. If htab is set, + * then we skip the movement of tuples that exists in htab, otherwise NULL + * value of htab indicates movement of all the tuples that belong to the new + * bucket. + * * We are splitting a bucket that consists of a base bucket page and zero * or more overflow (bucket chain) pages. We must relocate tuples that - * belong in the new bucket, and compress out any free space in the old - * bucket. + * belong in the new bucket. * * The caller must hold cleanup locks on both buckets to ensure that * no one else is trying to access them (see README). @@ -827,69 +1064,11 @@ _hash_splitbucket(Relation rel, Bucket nbucket, Buffer obuf, Buffer nbuf, + HTAB *htab, uint32 maxbucket, uint32 highmask, uint32 lowmask) { - Page opage; - Page npage; - HashPageOpaque oopaque; - HashPageOpaque nopaque; - - opage = BufferGetPage(obuf); - oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); - - /* - * Mark the old bucket to indicate that split is in progress. At - * operation end, we clear split-in-progress flag. - */ - oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT; - - npage = BufferGetPage(nbuf); - - /* - * initialize the new bucket's primary page and mark it to indicate that - * split is in progress. 
- */ - nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); - nopaque->hasho_prevblkno = InvalidBlockNumber; - nopaque->hasho_nextblkno = InvalidBlockNumber; - nopaque->hasho_bucket = nbucket; - nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED; - nopaque->hasho_page_id = HASHO_PAGE_ID; - - _hash_splitbucket_guts(rel, metabuf, obucket, - nbucket, obuf, nbuf, NULL, - maxbucket, highmask, lowmask); - - /* all done, now release the locks and pins on primary buckets. */ - _hash_relbuf(rel, obuf); - _hash_relbuf(rel, nbuf); -} - -/* - * _hash_splitbucket_guts -- Helper function to perform the split operation - * - * This routine is used to partition the tuples between old and new bucket and - * to finish incomplete split operations. To finish the previously - * interrupted split operation, caller needs to fill htab. If htab is set, then - * we skip the movement of tuples that exists in htab, otherwise NULL value of - * htab indicates movement of all the tuples that belong to new bucket. - * - * Caller needs to lock and unlock the old and new primary buckets. - */ -static void -_hash_splitbucket_guts(Relation rel, - Buffer metabuf, - Bucket obucket, - Bucket nbucket, - Buffer obuf, - Buffer nbuf, - HTAB *htab, - uint32 maxbucket, - uint32 highmask, - uint32 lowmask) -{ Buffer bucket_obuf; Buffer bucket_nbuf; Page opage; @@ -974,11 +1153,14 @@ _hash_splitbucket_guts(Relation rel, itemsz = IndexTupleDSize(*new_itup); itemsz = MAXALIGN(itemsz); - if (PageGetFreeSpace(npage) < itemsz) + while (PageGetFreeSpace(npage) < itemsz) { - /* write out nbuf and drop lock, but keep pin */ - MarkBufferDirty(nbuf); + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + /* drop lock, but keep pin */ LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + /* chain to a new overflow page */ nbuf = _hash_addovflpage(rel, metabuf, nbuf, (nbuf == bucket_nbuf) ? 
true : false); npage = BufferGetPage(nbuf); @@ -986,6 +1168,13 @@ _hash_splitbucket_guts(Relation rel, } /* + * Change the shared buffer state in critical section, + * otherwise any error could make it unrecoverable after + * recovery. + */ + START_CRIT_SECTION(); + + /* * Insert tuple on new page, using _hash_pgaddtup to ensure * correct ordering by hashkey. This is a tad inefficient * since we may have to shuffle itempointers repeatedly. @@ -994,6 +1183,8 @@ _hash_splitbucket_guts(Relation rel, */ (void) _hash_pgaddtup(rel, nbuf, itemsz, new_itup); + END_CRIT_SECTION(); + /* be tidy */ pfree(new_itup); } @@ -1016,7 +1207,16 @@ _hash_splitbucket_guts(Relation rel, /* Exit loop if no more overflow pages in old bucket */ if (!BlockNumberIsValid(oblkno)) + { + /* log the split operation before releasing the lock */ + log_split_page(rel, nbuf); + + if (nbuf == bucket_nbuf) + LockBuffer(nbuf, BUFFER_LOCK_UNLOCK); + else + _hash_relbuf(rel, nbuf); break; + } /* Else, advance to next old page */ obuf = _hash_getbuf(rel, oblkno, HASH_READ, LH_OVERFLOW_PAGE); @@ -1032,17 +1232,6 @@ _hash_splitbucket_guts(Relation rel, * To avoid deadlocks due to locking order of buckets, first lock the old * bucket and then the new bucket. 
*/ - if (nbuf == bucket_nbuf) - { - MarkBufferDirty(bucket_nbuf); - LockBuffer(bucket_nbuf, BUFFER_LOCK_UNLOCK); - } - else - { - MarkBufferDirty(nbuf); - _hash_relbuf(rel, nbuf); - } - LockBuffer(bucket_obuf, BUFFER_LOCK_EXCLUSIVE); opage = BufferGetPage(bucket_obuf); oopaque = (HashPageOpaque) PageGetSpecialPointer(opage); @@ -1051,6 +1240,8 @@ _hash_splitbucket_guts(Relation rel, npage = BufferGetPage(bucket_nbuf); nopaque = (HashPageOpaque) PageGetSpecialPointer(npage); + START_CRIT_SECTION(); + oopaque->hasho_flag &= ~LH_BUCKET_BEING_SPLIT; nopaque->hasho_flag &= ~LH_BUCKET_BEING_POPULATED; @@ -1067,6 +1258,29 @@ _hash_splitbucket_guts(Relation rel, */ MarkBufferDirty(bucket_obuf); MarkBufferDirty(bucket_nbuf); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + xl_hash_split_complete xlrec; + + xlrec.old_bucket_flag = oopaque->hasho_flag; + xlrec.new_bucket_flag = nopaque->hasho_flag; + + XLogBeginInsert(); + + XLogRegisterData((char *) &xlrec, SizeOfHashSplitComplete); + + XLogRegisterBuffer(0, bucket_obuf, REGBUF_STANDARD); + XLogRegisterBuffer(1, bucket_nbuf, REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_COMPLETE); + + PageSetLSN(BufferGetPage(bucket_obuf), recptr); + PageSetLSN(BufferGetPage(bucket_nbuf), recptr); + } + + END_CRIT_SECTION(); } /* @@ -1183,11 +1397,44 @@ _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, Bucket obucket, npageopaque = (HashPageOpaque) PageGetSpecialPointer(npage); nbucket = npageopaque->hasho_bucket; - _hash_splitbucket_guts(rel, metabuf, obucket, - nbucket, obuf, bucket_nbuf, tidhtab, - maxbucket, highmask, lowmask); + _hash_splitbucket(rel, metabuf, obucket, + nbucket, obuf, bucket_nbuf, tidhtab, + maxbucket, highmask, lowmask); _hash_relbuf(rel, bucket_nbuf); LockBuffer(obuf, BUFFER_LOCK_UNLOCK); hash_destroy(tidhtab); } + +/* + * log_split_page() -- Log the split operation + * + * We log the split operation when the new page in new bucket gets full, + * so we log the entire page. 
+ * + * 'buf' must be locked by the caller which is also responsible for unlocking + * it. + */ +static void +log_split_page(Relation rel, Buffer buf) +{ + START_CRIT_SECTION(); + + MarkBufferDirty(buf); + + if (RelationNeedsWAL(rel)) + { + XLogRecPtr recptr; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buf, REGBUF_FORCE_IMAGE | REGBUF_STANDARD); + + recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_PAGE); + + PageSetLSN(BufferGetPage(buf), recptr); + } + + END_CRIT_SECTION(); +} + diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c index 913b87c..4d741ee 100644 --- a/src/backend/access/hash/hashsearch.c +++ b/src/backend/access/hash/hashsearch.c @@ -123,6 +123,7 @@ _hash_readnext(IndexScanDesc scan, if (block_found) { *pagep = BufferGetPage(*bufp); + TestForOldSnapshot(scan->xs_snapshot, rel, *pagep); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); } } @@ -159,6 +160,7 @@ _hash_readprev(IndexScanDesc scan, *bufp = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE | LH_OVERFLOW_PAGE); *pagep = BufferGetPage(*bufp); + TestForOldSnapshot(scan->xs_snapshot, rel, *pagep); *opaquep = (HashPageOpaque) PageGetSpecialPointer(*pagep); /* @@ -328,6 +330,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) _hash_dropbuf(rel, metabuf); page = BufferGetPage(buf); + TestForOldSnapshot(scan->xs_snapshot, rel, page); opaque = (HashPageOpaque) PageGetSpecialPointer(page); Assert(opaque->hasho_bucket == bucket); @@ -362,6 +365,7 @@ _hash_first(IndexScanDesc scan, ScanDirection dir) LockBuffer(buf, BUFFER_LOCK_UNLOCK); old_buf = _hash_getbuf(rel, old_blkno, HASH_READ, LH_BUCKET_PAGE); + TestForOldSnapshot(scan->xs_snapshot, rel, BufferGetPage(old_buf)); /* * remember the split bucket buffer so as to use it later for @@ -564,6 +568,7 @@ _hash_step(IndexScanDesc scan, Buffer *bufP, ScanDirection dir) _hash_readprev(scan, &buf, &page, &opaque); if (BufferIsValid(buf)) { + TestForOldSnapshot(scan->xs_snapshot, rel, page); maxoff = 
PageGetMaxOffsetNumber(page); offnum = _hash_binsearch_last(page, so->hashso_sk_hash); } diff --git a/src/backend/access/rmgrdesc/hashdesc.c b/src/backend/access/rmgrdesc/hashdesc.c index 12e1818..fc04363 100644 --- a/src/backend/access/rmgrdesc/hashdesc.c +++ b/src/backend/access/rmgrdesc/hashdesc.c @@ -19,10 +19,143 @@ void hash_desc(StringInfo buf, XLogReaderState *record) { + char *rec = XLogRecGetData(record); + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + switch (info) + { + case XLOG_HASH_INIT_META_PAGE: + { + xl_hash_init_meta_page *xlrec = (xl_hash_init_meta_page *) rec; + + appendStringInfo(buf, "num_tuples %g, fillfactor %d", + xlrec->num_tuples, xlrec->ffactor); + break; + } + case XLOG_HASH_INIT_BITMAP_PAGE: + { + xl_hash_init_bitmap_page *xlrec = (xl_hash_init_bitmap_page *) rec; + + appendStringInfo(buf, "bmsize %d", xlrec->bmsize); + break; + } + case XLOG_HASH_INSERT: + { + xl_hash_insert *xlrec = (xl_hash_insert *) rec; + + appendStringInfo(buf, "off %u", xlrec->offnum); + break; + } + case XLOG_HASH_ADD_OVFL_PAGE: + { + xl_hash_addovflpage *xlrec = (xl_hash_addovflpage *) rec; + + appendStringInfo(buf, "bmsize %d, bmpage_found %c", + xlrec->bmsize, (xlrec->bmpage_found) ? 'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + { + xl_hash_split_allocpage *xlrec = (xl_hash_split_allocpage *) rec; + + appendStringInfo(buf, "new_bucket %u, meta_page_masks_updated %c, issplitpoint_changed %c", + xlrec->new_bucket, + (xlrec->flags & XLH_SPLIT_META_UPDATE_MASKS) ? 'T' : 'F', + (xlrec->flags & XLH_SPLIT_META_UPDATE_SPLITPOINT) ? 'T' : 'F'); + break; + } + case XLOG_HASH_SPLIT_COMPLETE: + { + xl_hash_split_complete *xlrec = (xl_hash_split_complete *) rec; + + appendStringInfo(buf, "split_complete_old_bucket %c, split_complete_new_bucket %c", + (xlrec->old_bucket_flag & LH_BUCKET_BEING_SPLIT) ? 'F' : 'T', + (xlrec->new_bucket_flag & LH_BUCKET_BEING_POPULATED) ? 
'F' : 'T'); + break; + } + case XLOG_HASH_MOVE_PAGE_CONTENTS: + { + xl_hash_move_page_contents *xlrec = (xl_hash_move_page_contents *) rec; + + appendStringInfo(buf, "ntups %d, is_primary %c", + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_SQUEEZE_PAGE: + { + xl_hash_squeeze_page *xlrec = (xl_hash_squeeze_page *) rec; + + appendStringInfo(buf, "prevblkno %u, nextblkno %u, ntups %d, is_primary %c", + xlrec->prevblkno, + xlrec->nextblkno, + xlrec->ntups, + xlrec->is_prim_bucket_same_wrt ? 'T' : 'F'); + break; + } + case XLOG_HASH_DELETE: + { + xl_hash_delete *xlrec = (xl_hash_delete *) rec; + + appendStringInfo(buf, "is_primary %c", + xlrec->is_primary_bucket_page ? 'T' : 'F'); + break; + } + case XLOG_HASH_UPDATE_META_PAGE: + { + xl_hash_update_meta_page *xlrec = (xl_hash_update_meta_page *) rec; + + appendStringInfo(buf, "ntuples %g", + xlrec->ntuples); + break; + } + } } const char * hash_identify(uint8 info) { - return NULL; + const char *id = NULL; + + switch (info & ~XLR_INFO_MASK) + { + case XLOG_HASH_INIT_META_PAGE: + id = "INIT_META_PAGE"; + break; + case XLOG_HASH_INIT_BITMAP_PAGE: + id = "INIT_BITMAP_PAGE"; + break; + case XLOG_HASH_INSERT: + id = "INSERT"; + break; + case XLOG_HASH_ADD_OVFL_PAGE: + id = "ADD_OVFL_PAGE"; + break; + case XLOG_HASH_SPLIT_ALLOCATE_PAGE: + id = "SPLIT_ALLOCATE_PAGE"; + break; + case XLOG_HASH_SPLIT_PAGE: + id = "SPLIT_PAGE"; + break; + case XLOG_HASH_SPLIT_COMPLETE: + id = "SPLIT_COMPLETE"; + break; + case XLOG_HASH_MOVE_PAGE_CONTENTS: + id = "MOVE_PAGE_CONTENTS"; + break; + case XLOG_HASH_SQUEEZE_PAGE: + id = "SQUEEZE_PAGE"; + break; + case XLOG_HASH_DELETE: + id = "DELETE"; + break; + case XLOG_HASH_SPLIT_CLEANUP: + id = "SPLIT_CLEANUP"; + break; + case XLOG_HASH_UPDATE_META_PAGE: + id = "UPDATE_META_PAGE"; + break; + } + + return id; } diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index eeb2b1f..82f019d 100644 --- 
a/src/backend/commands/indexcmds.c +++ b/src/backend/commands/indexcmds.c @@ -502,11 +502,6 @@ DefineIndex(Oid relationId, accessMethodForm = (Form_pg_am) GETSTRUCT(tuple); amRoutine = GetIndexAmRoutine(accessMethodForm->amhandler); - if (strcmp(accessMethodName, "hash") == 0 && - RelationNeedsWAL(rel)) - ereport(WARNING, - (errmsg("hash indexes are not WAL-logged and their use is discouraged"))); - if (stmt->unique && !amRoutine->amcanunique) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index 73aa0c0..31b66d2 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -598,6 +598,33 @@ PageGetFreeSpace(Page page) } /* + * PageGetFreeSpaceForMulTups + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for multiple new line pointers. + * + * Note: this should usually only be used on index pages. Use + * PageGetHeapFreeSpace on heap pages. + */ +Size +PageGetFreeSpaceForMulTups(Page page, int ntups) +{ + int space; + + /* + * Use signed arithmetic here so that we behave sensibly if pd_lower > + * pd_upper. + */ + space = (int) ((PageHeader) page)->pd_upper - + (int) ((PageHeader) page)->pd_lower; + + if (space < (int) (ntups * sizeof(ItemIdData))) + return 0; + space -= ntups * sizeof(ItemIdData); + + return (Size) space; +} + +/* * PageGetExactFreeSpace * Returns the size of the free (allocatable) space on a page, * without any consideration for adding/removing line pointers. diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 2a68359..266122a 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -5731,13 +5731,10 @@ RelationIdIsInInitFile(Oid relationId) /* * Tells whether any index for the relation is unlogged. * - * Any index using the hash AM is implicitly unlogged. 
- * * Note: There doesn't seem to be any way to have an unlogged index attached - * to a permanent table except to create a hash index, but it seems best to - * keep this general so that it returns sensible results even when they seem - * obvious (like for an unlogged table) and to handle possible future unlogged - * indexes on permanent tables. + * to a permanent table, but it seems best to keep this general so that it + * returns sensible results even when they seem obvious (like for an unlogged + * table) and to handle possible future unlogged indexes on permanent tables. */ bool RelationHasUnloggedIndex(Relation rel) @@ -5759,8 +5756,7 @@ RelationHasUnloggedIndex(Relation rel) elog(ERROR, "cache lookup failed for relation %u", indexoid); reltup = (Form_pg_class) GETSTRUCT(tp); - if (reltup->relpersistence == RELPERSISTENCE_UNLOGGED - || reltup->relam == HASH_AM_OID) + if (reltup->relpersistence == RELPERSISTENCE_UNLOGGED) result = true; ReleaseSysCache(tp); diff --git a/src/include/access/hash.h b/src/include/access/hash.h index bc08f81..bd56ee2 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -310,13 +310,15 @@ extern Datum hash_uint32(uint32 k); extern void _hash_doinsert(Relation rel, IndexTuple itup); extern OffsetNumber _hash_pgaddtup(Relation rel, Buffer buf, Size itemsize, IndexTuple itup); +extern void _hash_pgaddmultitup(Relation rel, Buffer buf, IndexTuple *itups, + OffsetNumber *itup_offsets, uint16 nitups); /* hashovfl.c */ extern Buffer _hash_addovflpage(Relation rel, Buffer metabuf, Buffer buf, bool retain_pin); -extern BlockNumber _hash_freeovflpage(Relation rel, Buffer ovflbuf, Buffer wbuf, - BufferAccessStrategy bstrategy); -extern void _hash_initbitmap(Relation rel, HashMetaPage metap, - BlockNumber blkno, ForkNumber forkNum); +extern BlockNumber _hash_freeovflpage(Relation rel, Buffer bucketbuf, Buffer ovflbuf, + Buffer wbuf, IndexTuple *itups, OffsetNumber *itup_offsets, + Size *tups_size, uint16 nitups, 
BufferAccessStrategy bstrategy); +extern void _hash_initbitmapbuffer(Buffer buf, uint16 bmsize, bool initpage); extern void _hash_squeezebucket(Relation rel, Bucket bucket, BlockNumber bucket_blkno, Buffer bucket_buf, @@ -328,6 +330,8 @@ extern Buffer _hash_getbuf(Relation rel, BlockNumber blkno, extern Buffer _hash_getbuf_with_condlock_cleanup(Relation rel, BlockNumber blkno, int flags); extern Buffer _hash_getinitbuf(Relation rel, BlockNumber blkno); +extern void _hash_initbuf(Buffer buf, uint32 num_bucket, uint32 flag, + bool initpage); extern Buffer _hash_getnewbuf(Relation rel, BlockNumber blkno, ForkNumber forkNum); extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, @@ -336,8 +340,10 @@ extern Buffer _hash_getbuf_with_strategy(Relation rel, BlockNumber blkno, extern void _hash_relbuf(Relation rel, Buffer buf); extern void _hash_dropbuf(Relation rel, Buffer buf); extern void _hash_dropscanbuf(Relation rel, HashScanOpaque so); -extern uint32 _hash_metapinit(Relation rel, double num_tuples, - ForkNumber forkNum); +extern uint32 _hash_init(Relation rel, double num_tuples, + ForkNumber forkNum); +extern void _hash_init_metabuffer(Buffer buf, double num_tuples, + RegProcedure procid, uint16 ffactor, bool initpage); extern void _hash_pageinit(Page page, Size size); extern void _hash_expandtable(Relation rel, Buffer metabuf); extern void _hash_finish_split(Relation rel, Buffer metabuf, Buffer obuf, diff --git a/src/include/access/hash_xlog.h b/src/include/access/hash_xlog.h index 5f941a9..35c1b8f 100644 --- a/src/include/access/hash_xlog.h +++ b/src/include/access/hash_xlog.h @@ -17,6 +17,245 @@ #include "access/hash.h" #include "access/xlogreader.h" +/* Number of buffers required for XLOG_HASH_SQUEEZE_PAGE operation */ +#define HASH_XLOG_FREE_OVFL_BUFS 6 + +/* + * XLOG records for hash operations + */ +#define XLOG_HASH_INIT_META_PAGE 0x00 /* initialize the meta page */ +#define XLOG_HASH_INIT_BITMAP_PAGE 0x10 /* initialize the bitmap page */ 
+#define XLOG_HASH_INSERT 0x20 /* add index tuple without split */ +#define XLOG_HASH_ADD_OVFL_PAGE 0x30 /* add overflow page */ +#define XLOG_HASH_SPLIT_ALLOCATE_PAGE 0x40 /* allocate new page for split */ +#define XLOG_HASH_SPLIT_PAGE 0x50 /* split page */ +#define XLOG_HASH_SPLIT_COMPLETE 0x60 /* completion of split + * operation */ +#define XLOG_HASH_MOVE_PAGE_CONTENTS 0x70 /* remove tuples from one page + * and add to another page */ +#define XLOG_HASH_SQUEEZE_PAGE 0x80 /* add tuples to one of the previous + * pages in chain and free the ovfl + * page */ +#define XLOG_HASH_DELETE 0x90 /* delete index tuples from a page */ +#define XLOG_HASH_SPLIT_CLEANUP 0xA0 /* clear split-cleanup flag in primary + * bucket page after deleting tuples + * that are moved due to split */ +#define XLOG_HASH_UPDATE_META_PAGE 0xB0 /* update meta page after + * vacuum */ + + +/* + * xl_hash_split_allocpage flag values, 8 bits are available. + */ +#define XLH_SPLIT_META_UPDATE_MASKS (1<<0) +#define XLH_SPLIT_META_UPDATE_SPLITPOINT (1<<1) + +/* + * Data to regenerate the meta-data page + */ +typedef struct xl_hash_metadata +{ + HashMetaPageData metadata; +} xl_hash_metadata; + +/* + * This is what we need to know about a HASH index create. + * + * Backup block 0: metapage + */ +typedef struct xl_hash_createidx +{ + double num_tuples; + RegProcedure procid; + uint16 ffactor; +} xl_hash_createidx; +#define SizeOfHashCreateIdx (offsetof(xl_hash_createidx, ffactor) + sizeof(uint16)) + +/* + * This is what we need to know about simple (without split) insert. + * + * This data record is used for XLOG_HASH_INSERT + * + * Backup Blk 0: original page (data contains the inserted tuple) + * Backup Blk 1: metapage (xl_hash_metadata) + */ +typedef struct xl_hash_insert +{ + OffsetNumber offnum; +} xl_hash_insert; + +#define SizeOfHashInsert (offsetof(xl_hash_insert, offnum) + sizeof(OffsetNumber)) + +/* + * This is what we need to know about addition of overflow page. 
+ * + * This data record is used for XLOG_HASH_ADD_OVFL_PAGE + * + * Backup Blk 0: newly allocated overflow page + * Backup Blk 1: page before new overflow page in the bucket chain + * Backup Blk 2: bitmap page + * Backup Blk 3: new bitmap page + * Backup Blk 4: metapage + */ +typedef struct xl_hash_addovflpage +{ + uint16 bmsize; + bool bmpage_found; +} xl_hash_addovflpage; + +#define SizeOfHashAddOvflPage \ + (offsetof(xl_hash_addovflpage, bmpage_found) + sizeof(bool)) + +/* + * This is what we need to know about allocating a page for split. + * + * This data record is used for XLOG_HASH_SPLIT_ALLOCATE_PAGE + * + * Backup Blk 0: page for old bucket + * Backup Blk 1: page for new bucket + * Backup Blk 2: metapage + */ +typedef struct xl_hash_split_allocpage +{ + uint32 new_bucket; + uint16 old_bucket_flag; + uint16 new_bucket_flag; + uint8 flags; +} xl_hash_split_allocpage; + +#define SizeOfHashSplitAllocPage \ + (offsetof(xl_hash_split_allocpage, flags) + sizeof(uint8)) + +/* + * This is what we need to know about completing the split operation. + * + * This data record is used for XLOG_HASH_SPLIT_COMPLETE + * + * Backup Blk 0: page for old bucket + * Backup Blk 1: page for new bucket + */ +typedef struct xl_hash_split_complete +{ + uint16 old_bucket_flag; + uint16 new_bucket_flag; +} xl_hash_split_complete; + +#define SizeOfHashSplitComplete \ + (offsetof(xl_hash_split_complete, new_bucket_flag) + sizeof(uint16)) + +/* + * This is what we need to know about move page contents required during + * squeeze operation. 
+ * + * This data record is used for XLOG_HASH_MOVE_PAGE_CONTENTS + * + * Backup Blk 0: bucket page + * Backup Blk 1: page containing moved tuples + * Backup Blk 2: page from which tuples will be removed + */ +typedef struct xl_hash_move_page_contents +{ + uint16 ntups; + bool is_prim_bucket_same_wrt; /* TRUE if the page to which + * tuples are moved is same as + * primary bucket page */ +} xl_hash_move_page_contents; + +#define SizeOfHashMovePageContents \ + (offsetof(xl_hash_move_page_contents, is_prim_bucket_same_wrt) + sizeof(bool)) + +/* + * This is what we need to know about the squeeze page operation. + * + * This data record is used for XLOG_HASH_SQUEEZE_PAGE + * + * Backup Blk 0: page containing tuples moved from freed overflow page + * Backup Blk 1: freed overflow page + * Backup Blk 2: page previous to the freed overflow page + * Backup Blk 3: page next to the freed overflow page + * Backup Blk 4: bitmap page containing info of freed overflow page + * Backup Blk 5: meta page + */ +typedef struct xl_hash_squeeze_page +{ + BlockNumber prevblkno; + BlockNumber nextblkno; + uint16 ntups; + bool is_prim_bucket_same_wrt; /* TRUE if the page to which + * tuples are moved is same as + * primary bucket page */ + bool is_prev_bucket_same_wrt; /* TRUE if the page to which + * tuples are moved is the + * page previous to the freed + * overflow page */ +} xl_hash_squeeze_page; + +#define SizeOfHashSqueezePage \ + (offsetof(xl_hash_squeeze_page, is_prev_bucket_same_wrt) + sizeof(bool)) + +/* + * This is what we need to know about the deletion of index tuples from a page. 
+ * + * This data record is used for XLOG_HASH_DELETE + * + * Backup Blk 0: primary bucket page + * Backup Blk 1: page from which tuples are deleted + */ +typedef struct xl_hash_delete +{ + bool is_primary_bucket_page; /* TRUE if the operation is for + * primary bucket page */ +} xl_hash_delete; + +#define SizeOfHashDelete (offsetof(xl_hash_delete, is_primary_bucket_page) + sizeof(bool)) + +/* + * This is what we need for metapage update operation. + * + * This data record is used for XLOG_HASH_UPDATE_META_PAGE + * + * Backup Blk 0: meta page + */ +typedef struct xl_hash_update_meta_page +{ + double ntuples; +} xl_hash_update_meta_page; + +#define SizeOfHashUpdateMetaPage \ + (offsetof(xl_hash_update_meta_page, ntuples) + sizeof(double)) + +/* + * This is what we need to initialize metapage. + * + * This data record is used for XLOG_HASH_INIT_META_PAGE + * + * Backup Blk 0: meta page + */ +typedef struct xl_hash_init_meta_page +{ + double num_tuples; + RegProcedure procid; + uint16 ffactor; +} xl_hash_init_meta_page; + +#define SizeOfHashInitMetaPage \ + (offsetof(xl_hash_init_meta_page, ffactor) + sizeof(uint16)) + +/* + * This is what we need to initialize bitmap page. 
+ * + * This data record is used for XLOG_HASH_INIT_BITMAP_PAGE + * + * Backup Blk 0: bitmap page + * Backup Blk 1: meta page + */ +typedef struct xl_hash_init_bitmap_page +{ + uint16 bmsize; +} xl_hash_init_bitmap_page; + +#define SizeOfHashInitBitmapPage \ + (offsetof(xl_hash_init_bitmap_page, bmsize) + sizeof(uint16)) extern void hash_redo(XLogReaderState *record); extern void hash_desc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h index ad4ab5f..6ea46ef 100644 --- a/src/include/storage/bufpage.h +++ b/src/include/storage/bufpage.h @@ -425,6 +425,7 @@ extern Page PageGetTempPageCopySpecial(Page page); extern void PageRestoreTempPage(Page tempPage, Page oldPage); extern void PageRepairFragmentation(Page page); extern Size PageGetFreeSpace(Page page); +extern Size PageGetFreeSpaceForMulTups(Page page, int ntups); extern Size PageGetExactFreeSpace(Page page); extern Size PageGetHeapFreeSpace(Page page); extern void PageIndexTupleDelete(Page page, OffsetNumber offset); diff --git a/src/test/regress/expected/create_index.out b/src/test/regress/expected/create_index.out index e663f9a..6b2f693 100644 --- a/src/test/regress/expected/create_index.out +++ b/src/test/regress/expected/create_index.out @@ -2335,13 +2335,9 @@ Options: fastupdate=on, gin_pending_list_limit=128 -- HASH -- CREATE INDEX hash_i4_index ON hash_i4_heap USING hash (random int4_ops); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE INDEX hash_name_index ON hash_name_heap USING hash (random name_ops); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE INDEX hash_txt_index ON hash_txt_heap USING hash (random text_ops); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE INDEX hash_f8_index ON hash_f8_heap USING hash (random float8_ops); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE UNLOGGED TABLE unlogged_hash_table (id 
int4); CREATE INDEX unlogged_hash_index ON unlogged_hash_table USING hash (id int4_ops); DROP TABLE unlogged_hash_table; @@ -2350,7 +2346,6 @@ DROP TABLE unlogged_hash_table; -- maintenance_work_mem setting and fillfactor: SET maintenance_work_mem = '1MB'; CREATE INDEX hash_tuplesort_idx ON tenk1 USING hash (stringu1 name_ops) WITH (fillfactor = 10); -WARNING: hash indexes are not WAL-logged and their use is discouraged EXPLAIN (COSTS OFF) SELECT count(*) FROM tenk1 WHERE stringu1 = 'TVAAAA'; QUERY PLAN diff --git a/src/test/regress/expected/enum.out b/src/test/regress/expected/enum.out index 514d1d0..0e60304 100644 --- a/src/test/regress/expected/enum.out +++ b/src/test/regress/expected/enum.out @@ -383,7 +383,6 @@ DROP INDEX enumtest_btree; -- Hash index / opclass with the = operator -- CREATE INDEX enumtest_hash ON enumtest USING hash (col); -WARNING: hash indexes are not WAL-logged and their use is discouraged SELECT * FROM enumtest WHERE col = 'orange'; col -------- diff --git a/src/test/regress/expected/hash_index.out b/src/test/regress/expected/hash_index.out index f8b9f02..0a18efa 100644 --- a/src/test/regress/expected/hash_index.out +++ b/src/test/regress/expected/hash_index.out @@ -201,7 +201,6 @@ SELECT h.seqno AS f20000 -- CREATE TABLE hash_split_heap (keycol INT); CREATE INDEX hash_split_index on hash_split_heap USING HASH (keycol); -WARNING: hash indexes are not WAL-logged and their use is discouraged INSERT INTO hash_split_heap SELECT 1 FROM generate_series(1, 70000) a; VACUUM FULL hash_split_heap; -- Let's do a backward scan. 
@@ -230,5 +229,4 @@ DROP TABLE hash_temp_heap CASCADE; CREATE TABLE hash_heap_float4 (x float4, y int); INSERT INTO hash_heap_float4 VALUES (1.1,1); CREATE INDEX hash_idx ON hash_heap_float4 USING hash (x); -WARNING: hash indexes are not WAL-logged and their use is discouraged DROP TABLE hash_heap_float4 CASCADE; diff --git a/src/test/regress/expected/macaddr.out b/src/test/regress/expected/macaddr.out index e84ff5f..151f9ce 100644 --- a/src/test/regress/expected/macaddr.out +++ b/src/test/regress/expected/macaddr.out @@ -41,7 +41,6 @@ SELECT * FROM macaddr_data; CREATE INDEX macaddr_data_btree ON macaddr_data USING btree (b); CREATE INDEX macaddr_data_hash ON macaddr_data USING hash (b); -WARNING: hash indexes are not WAL-logged and their use is discouraged SELECT a, b, trunc(b) FROM macaddr_data ORDER BY 2, 1; a | b | trunc ----+-------------------+------------------- diff --git a/src/test/regress/expected/replica_identity.out b/src/test/regress/expected/replica_identity.out index 1a04ec5..f0bb253 100644 --- a/src/test/regress/expected/replica_identity.out +++ b/src/test/regress/expected/replica_identity.out @@ -12,7 +12,6 @@ CREATE UNIQUE INDEX test_replica_identity_keyab_key ON test_replica_identity (ke CREATE UNIQUE INDEX test_replica_identity_oid_idx ON test_replica_identity (oid); CREATE UNIQUE INDEX test_replica_identity_nonkey ON test_replica_identity (keya, nonkey); CREATE INDEX test_replica_identity_hash ON test_replica_identity USING hash (nonkey); -WARNING: hash indexes are not WAL-logged and their use is discouraged CREATE UNIQUE INDEX test_replica_identity_expr ON test_replica_identity (keya, keyb, (3)); CREATE UNIQUE INDEX test_replica_identity_partial ON test_replica_identity (keya, keyb) WHERE keyb != '3'; -- default is 'd'/DEFAULT for user created tables diff --git a/src/test/regress/expected/uuid.out b/src/test/regress/expected/uuid.out index 59cb1e0..d907519 100644 --- a/src/test/regress/expected/uuid.out +++ 
b/src/test/regress/expected/uuid.out @@ -114,7 +114,6 @@ SELECT COUNT(*) FROM guid1 WHERE guid_field >= '22222222-2222-2222-2222-22222222 -- btree and hash index creation test CREATE INDEX guid1_btree ON guid1 USING BTREE (guid_field); CREATE INDEX guid1_hash ON guid1 USING HASH (guid_field); -WARNING: hash indexes are not WAL-logged and their use is discouraged -- unique index test CREATE UNIQUE INDEX guid1_unique_BTREE ON guid1 USING BTREE (guid_field); -- should fail