diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml index e34a2f9..6cce1f3 100644 --- a/doc/src/sgml/catalogs.sgml +++ b/doc/src/sgml/catalogs.sgml @@ -551,6 +551,13 @@ + aminsertcleanup + regproc + pg_proc.oid + Post-INSERT cleanup function (optional) + + + amvacuumcleanup regproc pg_proc.oid diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 7493ca9..d7236f8 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3525,6 +3525,11 @@ COPY postgres_log FROM '/full/path/to/logfile.csv' WITH csv; This setting can be overridden for individual tables by entries in pg_autovacuum. + + This parameter also affects vacuuming of a table with a GIN + index: it additionally specifies the minimum number of inserted or updated + tuples needed to trigger a VACUUM on that table. + diff --git a/doc/src/sgml/gin.sgml b/doc/src/sgml/gin.sgml index 1c5841a..adc77c4 100644 --- a/doc/src/sgml/gin.sgml +++ b/doc/src/sgml/gin.sgml @@ -188,9 +188,45 @@ list of heap pointers (PL, posting list) if the list is small enough. + + GIN fast update technique + + + Updating a GIN index tends to be slow because of the + intrinsic nature of inverted indexes: inserting or updating one heap row + can cause many inserts into the index (one for each key extracted + from the indexed value). As of + PostgreSQL 8.4, this problem is alleviated + by postponing most of the work until the next VACUUM. + Newly inserted index entries are temporarily stored in an unsorted list of + pending entries. VACUUM inserts all pending entries into the + main GIN index data structure, + using the same bulk insert techniques used during initial index creation. + This greatly improves GIN index update speed, even + counting the additional vacuum overhead. + + + + The disadvantage of this approach is that searches must scan the list + of pending entries in addition to searching the regular index, and so + a large list of pending entries will slow searches significantly. + It's recommended to use properly-configured autovacuum with tables + having GIN indexes, to keep this overhead to + reasonable levels. + + + + If consistently-fast search speed is more important than update speed, + use of pending entries can be disabled by turning off the + FASTUPDATE storage parameter for a + GIN index. See for details. + + + Partial match algorithm - + GIN can support partial match queries, in which the query does not determine an exact match for one or more keys, but the possible @@ -225,11 +261,18 @@ Create vs insert - In most cases, insertion into a GIN index is slow + Insertion into a GIN index can be slow due to the likelihood of many keys being inserted for each value. So, for bulk insertions into a table it is advisable to drop the GIN index and recreate it after finishing bulk insertion. + + + As of PostgreSQL 8.4, this advice is less + necessary since delayed indexing is used (see for details). But for very large updates + it may still be best to drop and recreate the index. + diff --git a/doc/src/sgml/ref/create_index.sgml b/doc/src/sgml/ref/create_index.sgml index 8b502e6..b75ccc9 100644 --- a/doc/src/sgml/ref/create_index.sgml +++ b/doc/src/sgml/ref/create_index.sgml @@ -265,7 +265,7 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] name The WITH clause can specify storage parameters for indexes. Each index method can have its own set of allowed storage - parameters. The built-in index methods all accept a single parameter:
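(Editorial usage sketch, not part of the patch; the table, column, and index names are hypothetical.) The FASTUPDATE parameter and pending-list behavior described above can be exercised like this, using ALTER INDEX to change the setting and a VACUUM to flush entries that are already pending:

CREATE INDEX docs_fts_idx ON documents USING gin (body_tsv) WITH (fastupdate = off);
ALTER INDEX docs_fts_idx SET (fastupdate = on);    -- start collecting new entries in the pending list
ALTER INDEX docs_fts_idx SET (fastupdate = off);   -- stop adding new entries to the pending list
VACUUM documents;                                  -- move already-pending entries into the main index structure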
All built-in index methods accept this parameter: @@ -292,6 +292,36 @@ CREATE [ UNIQUE ] INDEX [ CONCURRENTLY ] name + + GIN indexes accept an additional parameter: + + + + + + FASTUPDATE + + + This setting controls usage of the fast update technique described in + . It is a Boolean parameter: + ON enables fast update, OFF disables it. + (Alternative spellings of ON and OFF are + allowed as described in .) The + default is ON. + + + + + Turning FASTUPDATE off via ALTER INDEX prevents + future insertions from going into the list of pending index entries, + but does not in itself flush previous entries. You might want to do a + VACUUM afterward to ensure the pending list is emptied. + + + + + + @@ -500,6 +530,13 @@ CREATE UNIQUE INDEX title_idx ON films (title) WITH (fillfactor = 70); + To create a GIN index with fast update turned off: + +CREATE INDEX gin_idx ON documents_table (locations) WITH (fastupdate = off); + + + + To create an index on the column code in the table films and have the index reside in the tablespace indexspace: diff --git a/doc/src/sgml/ref/vacuum.sgml b/doc/src/sgml/ref/vacuum.sgml index bee0667..952481c 100644 --- a/doc/src/sgml/ref/vacuum.sgml +++ b/doc/src/sgml/ref/vacuum.sgml @@ -63,6 +63,13 @@ VACUUM [ FULL ] [ FREEZE ] [ VERBOSE ] ANALYZE [ blocks. This form is much slower and requires an exclusive lock on each table while it is being processed. + + + For tables with GIN indexes, VACUUM (in + any form) also completes any delayed index insertions, by moving pending + index entries to the appropriate places in the main GIN index + structure. (See for more details.) + diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index 1b1310c..8560c07 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -3224,7 +3224,9 @@ SELECT plainto_tsquery('supernovae stars'); - GIN indexes are about ten times slower to update than GiST + GIN indexes are moderately slower to update than GiST indexes, but + about 10 times slower if fast update support was disabled + (see for details) diff --git a/src/backend/access/gin/Makefile b/src/backend/access/gin/Makefile index 93442ae..99ded7a 100644 --- a/src/backend/access/gin/Makefile +++ b/src/backend/access/gin/Makefile @@ -14,6 +14,6 @@ include $(top_builddir)/src/Makefile.global OBJS = ginutil.o gininsert.o ginxlog.o ginentrypage.o gindatapage.o \ ginbtree.o ginscan.o ginget.o ginvacuum.o ginarrayproc.o \ - ginbulk.o + ginbulk.o ginfast.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/gin/ginbulk.c b/src/backend/access/gin/ginbulk.c index 5219e55..63b5be5 100644 --- a/src/backend/access/gin/ginbulk.c +++ b/src/backend/access/gin/ginbulk.c @@ -197,6 +197,8 @@ ginInsertRecordBA(BuildAccumulator *accum, ItemPointer heapptr, OffsetNumber att if (nentry <= 0) return; + Assert(ItemPointerIsValid(heapptr) && attnum >= FirstOffsetNumber); + i = nentry - 1; for (; i > 0; i >>= 1) nbit++; diff --git a/src/backend/access/gin/gindatapage.c b/src/backend/access/gin/gindatapage.c index bf0651d..3c188f3 100644 --- a/src/backend/access/gin/gindatapage.c +++ b/src/backend/access/gin/gindatapage.c @@ -43,8 +43,14 @@ MergeItemPointers(ItemPointerData *dst, ItemPointerData *a, uint32 na, ItemPoint while (aptr - a < na && bptr - b < nb) { - if (compareItemPointers(aptr, bptr) > 0) + int cmp = compareItemPointers(aptr, bptr); + if (cmp > 0) *dptr++ = *bptr++; + else if ( cmp == 0 ) + { + *dptr++ = *bptr++; + aptr++; + } else *dptr++ = *aptr++; } @@ -630,11 +636,16 @@ 
insertItemPointer(GinPostingTreeScan *gdi, ItemPointerData *items, uint32 nitem) gdi->stack = ginFindLeafPage(&gdi->btree, gdi->stack); if (gdi->btree.findItem(&(gdi->btree), gdi->stack)) - elog(ERROR, "item pointer (%u,%d) already exists", - ItemPointerGetBlockNumber(gdi->btree.items + gdi->btree.curitem), - ItemPointerGetOffsetNumber(gdi->btree.items + gdi->btree.curitem)); - - ginInsertValue(&(gdi->btree), gdi->stack); + { + /* + * gdi->btree.items[ gdi->btree.curitem ] already exists in index + */ + gdi->btree.curitem ++; + LockBuffer(gdi->stack->buffer, GIN_UNLOCK); + freeGinBtreeStack(gdi->stack); + } + else + ginInsertValue(&(gdi->btree), gdi->stack); gdi->stack = NULL; } diff --git a/src/backend/access/gin/ginfast.c b/src/backend/access/gin/ginfast.c new file mode 100644 index 0000000..3ca335d --- /dev/null +++ b/src/backend/access/gin/ginfast.c @@ -0,0 +1,761 @@ +/*------------------------------------------------------------------------- + * + * ginfast.c + * Fast insert routines for the Postgres inverted index access method. + * Pending entries are stored in linear list of pages and vacuum + * will transfer them into regular structure. + * + * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * $PostgreSQL$ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/gin.h" +#include "access/tuptoaster.h" +#include "catalog/index.h" +#include "commands/vacuum.h" +#include "miscadmin.h" +#include "storage/bufmgr.h" +#include "utils/memutils.h" + + +static int32 +writeListPage(Relation index, Buffer buffer, IndexTuple *tuples, int32 ntuples, BlockNumber rightlink) +{ + Page page = BufferGetPage(buffer); + int i, freesize, size=0; + OffsetNumber l, off; + + START_CRIT_SECTION(); + + GinInitBuffer(buffer, GIN_LIST); + + off = FirstOffsetNumber; + + for(i=0; irightlink = rightlink; + /* + * tail page may contain only the whole row(s) or final + * part of row placed on previous pages + */ + if ( rightlink == InvalidBlockNumber ) + GinPageSetFullRow(page); + + freesize = PageGetFreeSpace(page); + + MarkBufferDirty(buffer); + + if (!index->rd_istemp) + { + XLogRecData rdata[2]; + ginxlogInsertListPage data; + XLogRecPtr recptr; + char *ptr; + + rdata[0].buffer = buffer; + rdata[0].buffer_std = true; + rdata[0].data = (char*)&data; + rdata[0].len = sizeof(ginxlogInsertListPage); + rdata[0].next = rdata+1; + + rdata[1].buffer = InvalidBuffer; + ptr = rdata[1].data = palloc( size ); + rdata[1].len = size; + rdata[1].next = NULL; + + for(i=0; i 0); + + /* + * Split tuples for pages + */ + for(i=0;ihead = BufferGetBlockNumber(curBuffer); + } + + prevBuffer = curBuffer;; + startTuple = i; + size = 0; + } + + tupsize = IndexTupleSize(tuples[i]) + sizeof(ItemIdData); + + if ( size + tupsize >= GinListPageSize ) + { + i--; + curBuffer = InvalidBuffer; + } + else + { + size += tupsize; + } + } + + /* + * Write last page + */ + res->tail = BufferGetBlockNumber(curBuffer); + res->tailFreeSize = writeListPage(index, curBuffer, tuples+startTuple, ntuples-startTuple, InvalidBlockNumber); +} + +#define GIN_PAGE_FREESIZE \ + ( BLCKSZ - MAXALIGN(SizeOfPageHeaderData) - MAXALIGN(sizeof(GinPageOpaqueData)) ) +/* + * Inserts collected values during normal insertion. 
Function guarantees + * that all values of heap will be stored sequentually with + * preserving order + */ +void +ginHeapTupleFastInsert(Relation index, GinTupleCollector *collector) +{ + Buffer metabuffer; + Page metapage; + GinMetaPageData *metadata = NULL; + XLogRecData rdata[2]; + Buffer buffer = InvalidBuffer; + Page page = NULL; + ginxlogUpdateMeta data; + bool separateList = false; + + if ( collector->ntuples == 0 ) + return; + + data.node = index->rd_node; + data.ntuples = 0; + data.newRightlink = data.prevTail = InvalidBlockNumber; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &data; + rdata[0].len = sizeof(ginxlogUpdateMeta); + rdata[0].next = NULL; + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + metapage = BufferGetPage(metabuffer); + + if ( collector->sumsize + collector->ntuples * sizeof(ItemIdData) > GIN_PAGE_FREESIZE ) + { + /* + * Total size is greater than one page => make sublist + */ + separateList = true; + } + else + { + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metadata = GinPageGetMeta(metapage); + + if ( metadata->head == InvalidBlockNumber || + collector->sumsize + collector->ntuples * sizeof(ItemIdData) > metadata->tailFreeSize ) + { + /* + * Pending list is empty or total size is greater than freespace + * on tail page => make sublist + * We unlock metabuffer to keep high concurrency + */ + separateList = true; + LockBuffer(metabuffer, GIN_UNLOCK); + } + } + + if ( separateList ) + { + GinMetaPageData sublist; + + /* + * We should make sublist separately and append it to the tail + */ + memset( &sublist, 0, sizeof(GinMetaPageData) ); + + makeSublist(index, collector->tuples, collector->ntuples, &sublist); + + /* + * metapage was unlocked, see above + */ + LockBuffer(metabuffer, GIN_EXCLUSIVE); + metadata = GinPageGetMeta(metapage); + + if ( metadata->head == InvalidBlockNumber ) + { + /* + * Sublist becomes main list + */ + START_CRIT_SECTION(); + memcpy(metadata, &sublist, sizeof(GinMetaPageData) ); + memcpy(&data.metadata, &sublist, sizeof(GinMetaPageData) ); + } + else + { + /* + * merge lists + */ + + data.prevTail = metadata->tail; + buffer = ReadBuffer(index, metadata->tail); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + Assert(GinPageGetOpaque(page)->rightlink == InvalidBlockNumber); + + START_CRIT_SECTION(); + + GinPageGetOpaque(page)->rightlink = sublist.head; + metadata->tail = sublist.tail; + metadata->tailFreeSize = sublist.tailFreeSize; + + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData) ); + data.newRightlink = sublist.head; + + MarkBufferDirty(buffer); + } + } + else + { + /* + * Insert into tail page, metapage is already locked + */ + + OffsetNumber l, off; + int i, tupsize; + char *ptr; + + buffer = ReadBuffer(index, metadata->tail); + LockBuffer(buffer, GIN_EXCLUSIVE); + page = BufferGetPage(buffer); + off = (PageIsEmpty(page)) ? 
FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + rdata[0].next = rdata + 1; + + rdata[1].buffer = buffer; + rdata[1].buffer_std = true; + ptr = rdata[1].data = (char *) palloc( collector->sumsize ); + rdata[1].len = collector->sumsize; + rdata[1].next = NULL; + + data.ntuples = collector->ntuples; + + START_CRIT_SECTION(); + + for(i=0; intuples; i++) + { + tupsize = IndexTupleSize(collector->tuples[i]); + l = PageAddItem(page, (Item)collector->tuples[i], tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page in \"%s\"", + RelationGetRelationName(index)); + + memcpy(ptr, collector->tuples[i], tupsize); + ptr+=tupsize; + + off++; + } + + metadata->tailFreeSize -= collector->sumsize + collector->ntuples * sizeof(ItemIdData); + memcpy(&data.metadata, metadata, sizeof(GinMetaPageData) ); + MarkBufferDirty(buffer); + } + + /* + * Make real write + */ + + MarkBufferDirty(metabuffer); + if ( !index->rd_istemp ) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_UPDATE_META_PAGE, rdata); + PageSetLSN(metapage, recptr); + PageSetTLI(metapage, ThisTimeLineID); + + if ( buffer != InvalidBuffer ) + { + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); + } + } + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); + UnlockReleaseBuffer(metabuffer); + + END_CRIT_SECTION(); +} + +/* + * Collect values from one tuples to be indexed. All values for + * one tuples shouold be written at once - to guarantee consistent state + */ +uint32 +ginHeapTupleFastCollect(Relation index, GinState *ginstate, GinTupleCollector *collector, + OffsetNumber attnum, Datum value, ItemPointer item) +{ + Datum *entries; + int32 i, + nentries; + + entries = extractEntriesSU(ginstate, attnum, value, &nentries); + + if (nentries == 0) + /* nothing to insert */ + return 0; + + /* + * Allocate/reallocate memory for storing collected tuples + */ + if ( collector->tuples == NULL ) + { + collector->lentuples = nentries * index->rd_att->natts; + collector->tuples = (IndexTuple*)palloc(sizeof(IndexTuple) * collector->lentuples); + } + + while ( collector->ntuples + nentries > collector->lentuples ) + { + collector->lentuples *= 2; + collector->tuples = (IndexTuple*)repalloc( collector->tuples, + sizeof(IndexTuple) * collector->lentuples); + } + + /* + * Creates tuple's array + */ + for (i = 0; i < nentries; i++) + { + int32 tupsize; + + collector->tuples[collector->ntuples + i] = GinFormTuple(ginstate, attnum, entries[i], NULL, 0); + collector->tuples[collector->ntuples + i]->t_tid = *item; + tupsize = IndexTupleSize(collector->tuples[collector->ntuples + i]); + + if ( tupsize > TOAST_INDEX_TARGET || tupsize >= GinMaxItemSize) + elog(ERROR, "huge tuple"); + + collector->sumsize += tupsize; + } + + collector->ntuples += nentries; + + return nentries; +} + +/* + * Deletes first pages in list before newHead page. + * If newHead == InvalidBlockNumber then function drops the whole list. 
+ * returns true if concurrent completion process is running + */ +static bool +shiftList(Relation index, Buffer metabuffer, BlockNumber newHead, + IndexBulkDeleteResult *stats) +{ +#define NDELETE_AT_ONCE (16) + Buffer buffers[NDELETE_AT_ONCE]; + ginxlogDeleteListPages data; + XLogRecData rdata[1]; + Page metapage; + GinMetaPageData *metadata; + BlockNumber blknoToDelete; + + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + blknoToDelete = metadata->head; + + data.node = index->rd_node; + + rdata[0].buffer = InvalidBuffer; + rdata[0].data = (char *) &data; + rdata[0].len = sizeof(ginxlogDeleteListPages); + rdata[0].next = NULL; + + do + { + Page page; + int i; + + data.ndeleted = 0; + while( data.ndeleted < NDELETE_AT_ONCE && blknoToDelete != newHead ) + { + data.toDelete[ data.ndeleted ] = blknoToDelete; + buffers[ data.ndeleted ] = ReadBuffer(index, blknoToDelete); + LockBufferForCleanup( buffers[ data.ndeleted ] ); + page = BufferGetPage( buffers[ data.ndeleted ] ); + + data.ndeleted++; + stats->pages_deleted++; + + if ( GinPageIsDeleted(page) ) + { + /* concurrent deletion process is detected */ + for(i=0;irightlink; + } + + START_CRIT_SECTION(); + + metadata->head = blknoToDelete; + if ( blknoToDelete == InvalidBlockNumber ) + { + metadata->tail = InvalidBlockNumber; + metadata->tailFreeSize = 0; + } + memcpy( &data.metadata, metadata, sizeof(GinMetaPageData)); + MarkBufferDirty( metabuffer ); + + for(i=0; iflags = GIN_DELETED; + MarkBufferDirty( buffers[ i ] ); + } + + if ( !index->rd_istemp ) + { + XLogRecPtr recptr; + + recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_DELETE_LISTPAGE, rdata); + PageSetLSN(metapage, recptr); + PageSetTLI(metapage, ThisTimeLineID); + + for(i=0; invalues >= datums->maxvalues) + { + datums->maxvalues *= 2; + datums->values = (Datum*)repalloc( datums->values, sizeof(Datum)*datums->maxvalues); + } + + datums->values[ datums->nvalues++ ] = datum; +} + +/* + * Go through all tuples on page and collect values in memory + */ + +static void +processPendingPage(BuildAccumulator *accum, DatumArray *da, Page page, OffsetNumber startoff) +{ + ItemPointerData heapptr; + OffsetNumber i,maxoff; + OffsetNumber attrnum, curattnum; + + maxoff = PageGetMaxOffsetNumber(page); + Assert( maxoff >= FirstOffsetNumber ); + ItemPointerSetInvalid(&heapptr); + attrnum = 0; + + for (i = startoff; i <= maxoff; i = OffsetNumberNext(i)) + { + IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, i)); + + curattnum = gintuple_get_attrnum(accum->ginstate, itup); + + if ( !ItemPointerIsValid(&heapptr) ) + { + heapptr = itup->t_tid; + attrnum = curattnum; + } + else if ( !(ItemPointerEquals(&heapptr, &itup->t_tid) && curattnum == attrnum) ) + { + /* + * We can insert several datums per call, but only for one heap tuple + * and one column. + */ + ginInsertRecordBA(accum, &heapptr, attrnum, da->values, da->nvalues); + da->nvalues = 0; + heapptr = itup->t_tid; + attrnum = curattnum; + } + addDatum(da, gin_index_getattr(accum->ginstate, itup)); + } + + ginInsertRecordBA(accum, &heapptr, attrnum, da->values, da->nvalues); +} + +/* + * Moves tuples from pending pages into regular GIN structure. + * Function doesn't require special locking and could be called + * in any time but only one at the same time. 
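+ * Currently it is invoked during VACUUM, from ginbulkdelete() or ginvacuumcleanup().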
+ */ + +Datum +gininsertcleanup(PG_FUNCTION_ARGS) +{ + IndexVacuumInfo *info = (IndexVacuumInfo *) PG_GETARG_POINTER(0); + IndexBulkDeleteResult *stats = (IndexBulkDeleteResult *) PG_GETARG_POINTER(1); + Relation index = info->index; + GinState ginstate; + Buffer metabuffer, buffer; + Page metapage, page; + GinMetaPageData *metadata; + MemoryContext opCtx, oldCtx; + BuildAccumulator accum; + DatumArray datums; + BlockNumber blkno; + + /* Set up all-zero stats if ginbulkdelete wasn't called */ + if (stats == NULL) + stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + + initGinState(&ginstate, index); + + metabuffer = ReadBuffer(index, GIN_METAPAGE_BLKNO); + LockBuffer(metabuffer, GIN_SHARE); + metapage = BufferGetPage(metabuffer); + metadata = GinPageGetMeta(metapage); + + if ( metadata->head == InvalidBlockNumber ) + { + UnlockReleaseBuffer(metabuffer); + PG_RETURN_POINTER(stats); + } + + /* + * Init + */ + datums.maxvalues=128; + datums.nvalues = 0; + datums.values = (Datum*)palloc(sizeof(Datum)*datums.maxvalues); + + ginInitBA(&accum); + accum.ginstate = &ginstate; + + opCtx = AllocSetContextCreate(CurrentMemoryContext, + "Gin refresh temporary context", + ALLOCSET_DEFAULT_MINSIZE, + ALLOCSET_DEFAULT_INITSIZE, + ALLOCSET_DEFAULT_MAXSIZE); + + oldCtx = MemoryContextSwitchTo(opCtx); + + /* + * Read and lock head + */ + blkno = metadata->head; + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, GIN_SHARE); + page = BufferGetPage(buffer); + + LockBuffer(metabuffer, GIN_UNLOCK); + + for(;;) + { + /* + * reset the datum collector and read the page's datums into memory + */ + datums.nvalues = 0; + + if ( GinPageIsDeleted(page) ) + { + /* concurrent completion is running */ + UnlockReleaseBuffer( buffer ); + break; + } + + processPendingPage(&accum, &datums, page, FirstOffsetNumber); + + vacuum_delay_point(); + + /* + * Is it time to flush memory to disk? + */ + if ( GinPageGetOpaque(page)->rightlink == InvalidBlockNumber || + ( GinPageHasFullRow(page) && accum.allocatedMemory > maintenance_work_mem * 1024L ) ) + { + ItemPointerData *list; + uint32 nlist; + Datum entry; + OffsetNumber maxoff, attnum; + + /* + * Unlock the current page to increase performance. + * Changes to the page will be detected later by comparing + * maxoff after the memory flush completes. + */ + maxoff = PageGetMaxOffsetNumber(page); + LockBuffer(buffer, GIN_UNLOCK); + + /* + * Moving the collected data into the regular structure can take + * a significant amount of time, so run it without locking the + * pending list. + */ + while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL) + { + vacuum_delay_point(); + ginEntryInsert(index, &ginstate, attnum, entry, list, nlist, FALSE); + } + + /* + * Lock the whole list to remove pages + */ + LockBuffer(metabuffer, GIN_EXCLUSIVE); + LockBuffer(buffer, GIN_SHARE); + + if ( GinPageIsDeleted(page) ) + { + /* concurrent completion is running */ + UnlockReleaseBuffer(buffer); + LockBuffer(metabuffer, GIN_UNLOCK); + break; + } + + /* + * While we kept the page unlocked it might have been changed, so + * read the new changes separately. The data on one page is rather + * small, so the extra memory used is not significant, although + * we do have to reinitialize the accumulator. We need to make this + * check only once, because now both the page and the metapage are + * locked. The insertion algorithm guarantees that inserted row(s) + * will not continue on the next page.
+ */ + if ( PageGetMaxOffsetNumber(page) != maxoff ) + { + ginInitBA(&accum); + datums.nvalues = 0; + processPendingPage(&accum, &datums, page, maxoff+1); + + while ((list = ginGetEntry(&accum, &attnum, &entry, &nlist)) != NULL) + ginEntryInsert(index, &ginstate, attnum, entry, list, nlist, FALSE); + } + + /* + * Remember next page - it will become a new head + */ + blkno = GinPageGetOpaque(page)->rightlink; + UnlockReleaseBuffer(buffer); /* shiftList will do exclusive locking */ + + /* + * remove readed pages from pending list, at this point all + * content of readed pages is in regular structure + */ + if ( shiftList(index, metabuffer, blkno, stats) ) + { + /* concurrent completion is running */ + LockBuffer(metabuffer, GIN_UNLOCK); + break; + } + + Assert( blkno == metadata->head ); + LockBuffer(metabuffer, GIN_UNLOCK); + + /* + * if we remove the whole list just exit + */ + if ( blkno == InvalidBlockNumber ) + break; + + /* + * reinit state + */ + MemoryContextReset(opCtx); + ginInitBA(&accum); + } + else + { + blkno = GinPageGetOpaque(page)->rightlink; + UnlockReleaseBuffer(buffer); + } + + + /* + * Read next page in pending list + */ + CHECK_FOR_INTERRUPTS(); + buffer = ReadBuffer(index, blkno); + LockBuffer(buffer, GIN_SHARE); + page = BufferGetPage(buffer); + } + + ReleaseBuffer(metabuffer); + MemoryContextSwitchTo(oldCtx); + MemoryContextDelete(opCtx); + + PG_RETURN_POINTER(stats); +} diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c index 23131e5..69c15fc 100644 --- a/src/backend/access/gin/ginget.c +++ b/src/backend/access/gin/ginget.c @@ -268,6 +268,15 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) Page page; bool needUnlock = TRUE; + entry->buffer = InvalidBuffer; + entry->offset = InvalidOffsetNumber; + entry->list = NULL; + entry->nlist = 0; + entry->partialMatch = NULL; + entry->partialMatchResult = NULL; + entry->reduceResult = FALSE; + entry->predictNumberResult = 0; + if (entry->master != NULL) { entry->isFinished = entry->master->isFinished; @@ -285,14 +294,6 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) page = BufferGetPage(stackEntry->buffer); entry->isFinished = TRUE; - entry->buffer = InvalidBuffer; - entry->offset = InvalidOffsetNumber; - entry->list = NULL; - entry->nlist = 0; - entry->partialMatch = NULL; - entry->partialMatchResult = NULL; - entry->reduceResult = FALSE; - entry->predictNumberResult = 0; if ( entry->isPartialMatch ) { @@ -350,9 +351,10 @@ startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry) entry->buffer = scanBeginPostingTree(gdi); /* - * We keep buffer pinned because we need to prevent deletition + * We keep buffer pinned because we need to prevent deletion of * page during scan. See GIN's vacuum implementation. RefCount - * is increased to keep buffer pinned after freeGinBtreeStack() call. + * is increased to keep buffer pinned after freeGinBtreeStack() + * call. */ IncrBufferRefCount(entry->buffer); @@ -429,6 +431,15 @@ startScan(IndexScanDesc scan) uint32 i; GinScanOpaque so = (GinScanOpaque) scan->opaque; + /* + * If isScanFastInsert is still true, set up to scan the pending-insert + * list rather than the main index. 
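+ * Heap TIDs matched in the pending list are collected into a TIDBitmap by + * scanGetItem(), which then uses it to filter duplicates out of the + * subsequent regular index scan.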
+ */ + if (so->isScanFastInsert) + { + return; + } + for (i = 0; i < so->nkeys; i++) startScanKey(scan->indexRelation, &so->ginstate, so->keys + i); } @@ -671,6 +682,336 @@ keyGetItem(Relation index, GinState *ginstate, MemoryContext tempCtx, return FALSE; } +typedef struct fastPosition { + Buffer fastBuffer; + OffsetNumber firstOffset; + OffsetNumber lastOffset; + ItemPointerData item; +} fastPosition; + + +/* + * Get ItemPointer of next heap row to be checked from fast insert storage. + * Returns false if there are no more. + * + * The fastBuffer is presumed pinned and share-locked on entry, and is + * pinned and share-locked on success exit. On failure exit it's released. + */ +static bool +scanGetCandidate(IndexScanDesc scan, fastPosition *pos) +{ + OffsetNumber maxoff; + Page page; + IndexTuple itup; + + ItemPointerSetInvalid( &pos->item ); + for(;;) + { + page = BufferGetPage(pos->fastBuffer); + + maxoff = PageGetMaxOffsetNumber(page); + if ( pos->firstOffset > maxoff ) + { + BlockNumber blkno = GinPageGetOpaque(page)->rightlink; + if ( blkno == InvalidBlockNumber ) + { + UnlockReleaseBuffer(pos->fastBuffer); + pos->fastBuffer=InvalidBuffer; + + return false; + } + else + { + /* + * Here we should prevent deletion of next page by + * insertcleanup process, which uses LockBufferForCleanup. + * So, we pin next page before unpin current one + */ + Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno); + + UnlockReleaseBuffer( pos->fastBuffer); + pos->fastBuffer=tmpbuf; + LockBuffer( pos->fastBuffer, GIN_SHARE ); + + pos->firstOffset = FirstOffsetNumber; + } + } + else + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset)); + pos->item = itup->t_tid; + if ( GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW ) + { + /* + * find itempointer to the next row + */ + for(pos->lastOffset = pos->firstOffset+1; pos->lastOffset<=maxoff; pos->lastOffset++) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset)); + if (!ItemPointerEquals(&pos->item, &itup->t_tid)) + break; + } + } + else + { + /* + * All itempointers are the same on this page + */ + pos->lastOffset = maxoff + 1; + } + break; + } + } + + return true; +} + +static bool +matchPartialInPendingList(GinState *ginstate, Page page, OffsetNumber off, + OffsetNumber maxoff, Datum value, OffsetNumber attrnum, + Datum *datum, bool *datumExtracted, StrategyNumber strategy) +{ + IndexTuple itup; + int res; + + while( off < maxoff ) + { + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off)); + if ( attrnum != gintuple_get_attrnum(ginstate, itup) ) + return false; + + if (datumExtracted[ off-1 ] == false) + { + datum[ off-1 ] = gin_index_getattr(ginstate, itup); + datumExtracted[ off-1 ] = true; + } + + res = DatumGetInt32(FunctionCall3(&ginstate->comparePartialFn[attrnum], + value, + datum[ off-1 ], + UInt16GetDatum(strategy))); + if ( res == 0 ) + return true; + else if (res>0) + return false; + } + + return false; +} +/* + * Sets entryRes array for each key by looking on + * every entry per indexed value (row) in fast insert storage. + * returns true if at least one of datum was matched by key's entry + * + * The fastBuffer is presumed pinned and share-locked on entry. 
+ */ +static bool +collectDatumForItem(IndexScanDesc scan, fastPosition *pos) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + OffsetNumber attrnum; + Page page; + IndexTuple itup; + int i, j; + bool hasMatch = false; + + /* + * Resets entryRes + */ + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + memset( key->entryRes, FALSE, key->nentries ); + } + + for(;;) + { + Datum datum[ BLCKSZ/sizeof(IndexTupleData) ]; + bool datumExtracted[ BLCKSZ/sizeof(IndexTupleData) ]; + + Assert( pos->lastOffset > pos->firstOffset ); + memset(datumExtracted + pos->firstOffset - 1, 0, sizeof(bool) * (pos->lastOffset - pos->firstOffset )); + + page = BufferGetPage(pos->fastBuffer); + + for(i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + for(j=0; jnentries; j++) + { + OffsetNumber StopLow = pos->firstOffset, + StopHigh = pos->lastOffset, + StopMiddle; + GinScanEntry entry = key->scanEntry + j; + + if ( key->entryRes[j] ) + continue; + + while (StopLow < StopHigh) + { + StopMiddle = StopLow + ((StopHigh - StopLow) >> 1); + + itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle)); + attrnum = gintuple_get_attrnum(&so->ginstate, itup); + + if (key->attnum < attrnum) + StopHigh = StopMiddle; + else if (key->attnum > attrnum) + StopLow = StopMiddle + 1; + else + { + int res; + + if (datumExtracted[ StopMiddle-1 ] == false) + { + datum[ StopMiddle-1 ] = gin_index_getattr(&so->ginstate, itup); + datumExtracted[ StopMiddle-1 ] = true; + } + res = compareEntries(&so->ginstate, + entry->attnum, + entry->entry, + datum[ StopMiddle-1 ]); + + if ( res == 0 ) + { + if ( entry->isPartialMatch ) + key->entryRes[j] = matchPartialInPendingList(&so->ginstate, page, StopMiddle, + pos->lastOffset, entry->entry, entry->attnum, + datum, datumExtracted, entry->strategy); + else + key->entryRes[j] = true; + break; + } + else if ( res < 0 ) + StopHigh = StopMiddle; + else + StopLow = StopMiddle + 1; + } + } + + if ( StopLow>=StopHigh && entry->isPartialMatch ) + key->entryRes[j] = matchPartialInPendingList(&so->ginstate, page, StopHigh, + pos->lastOffset, entry->entry, entry->attnum, + datum, datumExtracted, entry->strategy); + + hasMatch |= key->entryRes[j]; + } + } + + pos->firstOffset = pos->lastOffset; + + if ( GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW ) + { + /* + * We scan all values from one tuple, go to next one + */ + + return hasMatch; + } + else + { + ItemPointerData item = pos->item; + + if ( scanGetCandidate(scan, pos) == false || !ItemPointerEquals(&pos->item, &item) ) + elog(ERROR,"Could not process tuple"); /* XXX should not be here ! */ + } + } + + return hasMatch; +} + +/* + * Collect all matched rows from pending list in bitmap + */ +static TIDBitmap* +scanFastInsert(IndexScanDesc scan) +{ + GinScanOpaque so = (GinScanOpaque) scan->opaque; + MemoryContext oldCtx; + bool recheck, keyrecheck, match; + TIDBitmap *tbm = NULL; + int i; + fastPosition pos; + Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO); + BlockNumber blkno; + + LockBuffer(metabuffer, GIN_SHARE); + blkno = GinPageGetMeta(BufferGetPage(metabuffer))->head; + + /* + * fetch head of list before unlocking metapage. 
+ * head page must be pinned to prevent deletion by vacuum process + */ + if ( blkno == InvalidBlockNumber ) + { + /* No pending list, so proceed with normal scan */ + UnlockReleaseBuffer( metabuffer ); + return NULL; + } + + pos.fastBuffer = ReadBuffer(scan->indexRelation, blkno); + LockBuffer(pos.fastBuffer, GIN_SHARE); + pos.firstOffset = FirstOffsetNumber; + UnlockReleaseBuffer( metabuffer ); + + /* + * loop for each heap row + */ + while( scanGetCandidate(scan, &pos) ) + { + + /* + * Check entries in rows and setup entryRes array + */ + if (!collectDatumForItem(scan, &pos)) + continue; + + /* + * check for consistent + */ + oldCtx = MemoryContextSwitchTo(so->tempCtx); + recheck = false; + match = true; + + for (i = 0; match && i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + keyrecheck = true; + + if ( DatumGetBool(FunctionCall4(&so->ginstate.consistentFn[ key->attnum-1 ], + PointerGetDatum(key->entryRes), + UInt16GetDatum(key->strategy), + key->query, + PointerGetDatum(&keyrecheck))) == false ) + { + match = false; + } + + recheck |= keyrecheck; + } + + MemoryContextSwitchTo(oldCtx); + MemoryContextReset(so->tempCtx); + + if ( match ) + { + if ( tbm == NULL ) + tbm = tbm_create( work_mem * 1024L ); + tbm_add_tuples(tbm, &pos.item, 1, recheck); + } + } + + if ( tbm && tbm_has_lossy(tbm) ) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("not enough memory to store result of pending list or VACUUME table" ), + errhint("Increase the \"work_mem\" parameter."))); + + return tbm; +} + /* * Get heap item pointer from scan * returns true if found @@ -693,44 +1034,112 @@ scanGetItem(IndexScanDesc scan, ItemPointerData *item, bool *recheck) */ *recheck = false; - ItemPointerSetMin(item); - for (i = 0; i < so->nkeys; i++) + /* + * First of all we should check fast insert list of pages + */ + if ( so->isScanFastInsert ) { - GinScanKey key = so->keys + i; + if ( so->scanFastTuples ) + { + /* + * Items from pending list is already collected in memory + */ - if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, - key, &keyrecheck)) - return FALSE; /* finished one of keys */ - if (compareItemPointers(item, &key->curItem) < 0) - *item = key->curItem; - *recheck |= keyrecheck; - } + if ( so->scanFastResult == NULL || so->scanFastOffset >= so->scanFastResult->ntuples ) + { + so->scanFastResult = tbm_iterate( so->scanFastTuples ); - for (i = 1; i <= so->nkeys; i++) - { - GinScanKey key = so->keys + i - 1; + if ( so->scanFastResult == NULL ) + { + /* scan of pending pages is finished */ + so->isScanFastInsert = false; + startScan(scan); + return scanGetItem(scan, item, recheck); + } + Assert( so->scanFastResult->ntuples >= 0 ); + so->scanFastOffset = 0; + } + + ItemPointerSet(item, + so->scanFastResult->blockno, + so->scanFastResult->offsets[ so->scanFastOffset ]); + *recheck = true; /* be conserative due to concurrent + removal from pending list */ - for (;;) + so->scanFastOffset ++; + + return true; + } + else { - int cmp = compareItemPointers(item, &key->curItem); + /* + * Collect ItemPointers in memory + */ + so->scanFastTuples = scanFastInsert(scan); - if (cmp == 0) - break; - else if (cmp > 0) + if ( so->scanFastTuples == NULL ) { - if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, - key, &keyrecheck)) - return FALSE; /* finished one of keys */ - *recheck |= keyrecheck; + /* nothing found */ + so->isScanFastInsert = false; + startScan(scan); } else - { /* returns to begin */ + { + tbm_begin_iterate(so->scanFastTuples); + } + + return scanGetItem(scan, 
item, recheck); + } + } + + /* + * Regular scanning with filtering by already returned + * ItemPointers from pending list + */ + + do + { + ItemPointerSetMin(item); + *recheck = false; + + for (i = 0; i < so->nkeys; i++) + { + GinScanKey key = so->keys + i; + + if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, + key, &keyrecheck)) + return FALSE; /* finished one of keys */ + if (compareItemPointers(item, &key->curItem) < 0) *item = key->curItem; - i = 0; + *recheck |= keyrecheck; + } + + for (i = 1; i <= so->nkeys; i++) + { + GinScanKey key = so->keys + i - 1; + + for (;;) + { + int cmp = compareItemPointers(item, &key->curItem); + + if (cmp == 0) break; + else if (cmp > 0) + { + if (keyGetItem(scan->indexRelation, &so->ginstate, so->tempCtx, + key, &keyrecheck)) + return FALSE; /* finished one of keys */ + *recheck |= keyrecheck; + } + else + { /* returns to begin */ + *item = key->curItem; + i = 0; + break; + } } } - } + } while( so->scanFastTuples && tbm_check_tuple(so->scanFastTuples, item) ); return TRUE; } diff --git a/src/backend/access/gin/gininsert.c b/src/backend/access/gin/gininsert.c index 4be89bc..062ddba 100644 --- a/src/backend/access/gin/gininsert.c +++ b/src/backend/access/gin/gininsert.c @@ -138,7 +138,7 @@ addItemPointersToTuple(Relation index, GinState *ginstate, GinBtreeStack *stack, /* * Inserts only one entry to the index, but it can add more than 1 ItemPointer. */ -static void +void ginEntryInsert(Relation index, GinState *ginstate, OffsetNumber attnum, Datum value, ItemPointerData *items, uint32 nitem, bool isBuild) { @@ -273,7 +273,7 @@ ginbuild(PG_FUNCTION_ARGS) IndexBuildResult *result; double reltuples; GinBuildState buildstate; - Buffer buffer; + Buffer RootBuffer, MetaBuffer; ItemPointerData *list; Datum entry; uint32 nlist; @@ -286,11 +286,17 @@ ginbuild(PG_FUNCTION_ARGS) initGinState(&buildstate.ginstate, index); + /* initialize the meta page */ + MetaBuffer = GinNewBuffer(index); + /* initialize the root page */ - buffer = GinNewBuffer(index); + RootBuffer = GinNewBuffer(index); + START_CRIT_SECTION(); - GinInitBuffer(buffer, GIN_LEAF); - MarkBufferDirty(buffer); + GinInitMetabuffer(MetaBuffer); + MarkBufferDirty(MetaBuffer); + GinInitBuffer(RootBuffer, GIN_LEAF); + MarkBufferDirty(RootBuffer); if (!index->rd_istemp) { @@ -303,16 +309,19 @@ ginbuild(PG_FUNCTION_ARGS) rdata.len = sizeof(RelFileNode); rdata.next = NULL; - page = BufferGetPage(buffer); - - recptr = XLogInsert(RM_GIN_ID, XLOG_GIN_CREATE_INDEX, &rdata); + + page = BufferGetPage(RootBuffer); PageSetLSN(page, recptr); PageSetTLI(page, ThisTimeLineID); + page = BufferGetPage(MetaBuffer); + PageSetLSN(page, recptr); + PageSetTLI(page, ThisTimeLineID); } - UnlockReleaseBuffer(buffer); + UnlockReleaseBuffer(MetaBuffer); + UnlockReleaseBuffer(RootBuffer); END_CRIT_SECTION(); /* build the index */ @@ -417,9 +426,26 @@ gininsert(PG_FUNCTION_ARGS) initGinState(&ginstate, index); - for(i=0; inatts;i++) - if ( !isnull[i] ) - res += ginHeapTupleInsert(index, &ginstate, (OffsetNumber)(i+1), values[i], ht_ctid); + if ( GinGetUseFastUpdate(index) ) + { + GinTupleCollector collector; + + memset(&collector, 0, sizeof(GinTupleCollector)); + for(i=0; inatts;i++) + if ( !isnull[i] ) + res += ginHeapTupleFastCollect(index, &ginstate, &collector, + (OffsetNumber)(i+1), values[i], ht_ctid); + + ginHeapTupleFastInsert(index, &collector); + } + else + { + for(i=0; inatts;i++) + if ( !isnull[i] ) + res += ginHeapTupleInsert(index, &ginstate, + (OffsetNumber)(i+1), values[i], ht_ctid); + + } 
MemoryContextSwitchTo(oldCtx); MemoryContextDelete(insertCtx); diff --git a/src/backend/access/gin/ginscan.c b/src/backend/access/gin/ginscan.c index bc51e94..0c0ce52 100644 --- a/src/backend/access/gin/ginscan.c +++ b/src/backend/access/gin/ginscan.c @@ -159,6 +159,9 @@ newScanKey(IndexScanDesc scan) errmsg("GIN indexes do not support whole-index scans"))); so->isVoidRes = false; + so->isScanFastInsert = true; + so->scanFastTuples = NULL; + so->scanFastResult = NULL; for (i = 0; i < scan->numberOfKeys; i++) { @@ -233,8 +236,11 @@ ginrescan(PG_FUNCTION_ARGS) else { freeScanKeys(so->keys, so->nkeys); + if ( so->scanFastTuples ) + tbm_free( so->scanFastTuples ); } + so->scanFastTuples = NULL; so->keys = NULL; if (scankey && scan->numberOfKeys > 0) @@ -256,6 +262,8 @@ ginendscan(PG_FUNCTION_ARGS) if (so != NULL) { freeScanKeys(so->keys, so->nkeys); + if ( so->scanFastTuples ) + tbm_free( so->scanFastTuples ); MemoryContextDelete(so->tempCtx); diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index 5e71c85..6633dce 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -21,6 +21,7 @@ #include "storage/freespace.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "utils/guc.h" void initGinState(GinState *state, Relation index) @@ -57,7 +58,7 @@ initGinState(GinState *state, Relation index) CurrentMemoryContext); /* - * Check opclass capability to do partial match. + * Check opclass capability to do partial match. */ if ( index_getprocid(index, i+1, GIN_COMPARE_PARTIAL_PROC) != InvalidOid ) { @@ -88,7 +89,7 @@ gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple) bool isnull; /* - * First attribute is always int16, so we can safely use any + * First attribute is always int16, so we can safely use any * tuple descriptor to obtain first attribute of tuple */ res = index_getattr(tuple, FirstOffsetNumber, ginstate->tupdesc[0], @@ -213,6 +214,20 @@ GinInitBuffer(Buffer b, uint32 f) GinInitPage(BufferGetPage(b), f, BufferGetPageSize(b)); } +void +GinInitMetabuffer(Buffer b) +{ + GinMetaPageData *metadata; + Page page = BufferGetPage(b); + + GinInitPage(page, GIN_META, BufferGetPageSize(b)); + + metadata = GinPageGetMeta(page); + + metadata->head = metadata->tail = InvalidBlockNumber; + metadata->tailFreeSize = 0; +} + int compareEntries(GinState *ginstate, OffsetNumber attnum, Datum a, Datum b) { @@ -310,12 +325,10 @@ extractEntriesSU(GinState *ginstate, OffsetNumber attnum, Datum value, int32 *ne return entries; } -Datum -ginoptions(PG_FUNCTION_ARGS) +static int +parseFillfactor(char *value, bool validate) { - Datum reloptions = PG_GETARG_DATUM(0); - bool validate = PG_GETARG_BOOL(1); - bytea *result; + int fillfactor; /* * It's not clear that fillfactor is useful for GIN, but for the moment @@ -324,10 +337,73 @@ ginoptions(PG_FUNCTION_ARGS) #define GIN_MIN_FILLFACTOR 10 #define GIN_DEFAULT_FILLFACTOR 100 - result = default_reloptions(reloptions, validate, - GIN_MIN_FILLFACTOR, - GIN_DEFAULT_FILLFACTOR); - if (result) - PG_RETURN_BYTEA_P(result); - PG_RETURN_NULL(); + if (value == NULL) + return GIN_DEFAULT_FILLFACTOR; + + if (!parse_int(value, &fillfactor, 0, NULL)) + { + if (validate) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("fillfactor must be an integer: \"%s\"", + value))); + return GIN_DEFAULT_FILLFACTOR; + } + + if (fillfactor < GIN_MIN_FILLFACTOR || fillfactor > 100) + { + if (validate) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("fillfactor=%d is out 
of range (should be between %d and 100)", + fillfactor, GIN_MIN_FILLFACTOR))); + return GIN_DEFAULT_FILLFACTOR; + } + + return fillfactor; +} + +static bool +parseFastupdate(char *value, bool validate) +{ + bool result; + + if (value == NULL) + return GIN_DEFAULT_USE_FASTUPDATE; + + if (!parse_bool(value, &result)) + { + if (validate) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("fastupdate=\"%s\" is not recognized", + value))); + return GIN_DEFAULT_USE_FASTUPDATE; + } + + return result; +} + +Datum +ginoptions(PG_FUNCTION_ARGS) +{ + Datum reloptions = PG_GETARG_DATUM(0); + bool validate = PG_GETARG_BOOL(1); + static const char *const gin_keywords[2] = {"fillfactor", "fastupdate"}; + char *values[2]; + GinOptions *options; + + parseRelOptions(reloptions, 2, gin_keywords, values, validate); + + /* If no options, just return NULL */ + if (values[0] == NULL && values[1] == NULL) + PG_RETURN_NULL(); + + options = (GinOptions *) palloc(sizeof(GinOptions)); + SET_VARSIZE(options, sizeof(GinOptions)); + + options->std.fillfactor = parseFillfactor(values[0], validate); + options->useFastUpdate = parseFastupdate(values[1], validate); + + PG_RETURN_BYTEA_P(options); } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index b180cd7..4146995 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -595,7 +595,14 @@ ginbulkdelete(PG_FUNCTION_ARGS) /* first time through? */ if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + { + stats = (IndexBulkDeleteResult *)DatumGetPointer( + DirectFunctionCall2(gininsertcleanup, + PG_GETARG_DATUM(0), + PG_GETARG_DATUM(1) + )); + } + /* we'll re-count the tuples each time */ stats->num_index_tuples = 0; @@ -703,9 +710,18 @@ ginvacuumcleanup(PG_FUNCTION_ARGS) BlockNumber lastBlock = GIN_ROOT_BLKNO, lastFilledBlock = GIN_ROOT_BLKNO; - /* Set up all-zero stats if ginbulkdelete wasn't called */ + /* + * Set up all-zero stats and finalyze fast insertion + * if ginbulkdelete wasn't called + */ if (stats == NULL) - stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); + { + stats = (IndexBulkDeleteResult *)DatumGetPointer( + DirectFunctionCall2(gininsertcleanup, + PG_GETARG_DATUM(0), + PG_GETARG_DATUM(1) + )); + } /* * XXX we always report the heap tuple count as the number of index diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 0d40bfb..76db49c 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -71,20 +71,30 @@ static void ginRedoCreateIndex(XLogRecPtr lsn, XLogRecord *record) { RelFileNode *node = (RelFileNode *) XLogRecGetData(record); - Buffer buffer; + Buffer RootBuffer, MetaBuffer; Page page; - buffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true); - Assert(BufferIsValid(buffer)); - page = (Page) BufferGetPage(buffer); + MetaBuffer = XLogReadBuffer(*node, GIN_METAPAGE_BLKNO, true); + Assert(BufferIsValid(MetaBuffer)); + GinInitMetabuffer(MetaBuffer); + + page = (Page) BufferGetPage(MetaBuffer); + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); - GinInitBuffer(buffer, GIN_LEAF); + RootBuffer = XLogReadBuffer(*node, GIN_ROOT_BLKNO, true); + Assert(BufferIsValid(RootBuffer)); + page = (Page) BufferGetPage(RootBuffer); + + GinInitBuffer(RootBuffer, GIN_LEAF); PageSetLSN(page, lsn); PageSetTLI(page, ThisTimeLineID); - MarkBufferDirty(buffer); - UnlockReleaseBuffer(buffer); + MarkBufferDirty(MetaBuffer); + 
UnlockReleaseBuffer(MetaBuffer); + MarkBufferDirty(RootBuffer); + UnlockReleaseBuffer(RootBuffer); } static void @@ -433,6 +443,161 @@ ginRedoDeletePage(XLogRecPtr lsn, XLogRecord *record) } } +static void +ginRedoUpdateMetapage(XLogRecPtr lsn, XLogRecord *record) +{ + ginxlogUpdateMeta *data = (ginxlogUpdateMeta*) XLogRecGetData(record); + Buffer metabuffer; + Page metapage; + + metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); + metapage = BufferGetPage(metabuffer); + + if (!XLByteLE(lsn, PageGetLSN(metapage))) + { + memcpy( GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); + PageSetLSN(metapage, lsn); + PageSetTLI(metapage, ThisTimeLineID); + MarkBufferDirty(metabuffer); + } + + if ( data->ntuples > 0 ) + { + /* + * insert into tail page + */ + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + Buffer buffer = XLogReadBuffer(data->node, data->metadata.tail, false); + Page page = BufferGetPage(buffer); + + if ( !XLByteLE(lsn, PageGetLSN(page))) + { + OffsetNumber l, off = (PageIsEmpty(page)) ? FirstOffsetNumber : + OffsetNumberNext(PageGetMaxOffsetNumber(page)); + int i, tupsize; + IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogUpdateMeta)); + + for(i=0; intuples; i++) + { + tupsize = IndexTupleSize(tuples); + + l = PageAddItem(page, (Item)tuples, tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); + + tuples = (IndexTuple)( ((char*)tuples) + tupsize ); + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + } + else if ( data->prevTail != InvalidBlockNumber ) + { + /* + * New tail + */ + + Buffer buffer = XLogReadBuffer(data->node, data->prevTail, false); + Page page = BufferGetPage(buffer); + + if ( !XLByteLE(lsn, PageGetLSN(page))) + { + GinPageGetOpaque(page)->rightlink = data->newRightlink; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + UnlockReleaseBuffer(buffer); + } + + UnlockReleaseBuffer(metabuffer); +} + +static void +ginRedoInsertListPage(XLogRecPtr lsn, XLogRecord *record) +{ + ginxlogInsertListPage *data = (ginxlogInsertListPage*) XLogRecGetData(record); + Buffer buffer; + Page page; + OffsetNumber l, off = FirstOffsetNumber; + int i, tupsize; + IndexTuple tuples = (IndexTuple) (XLogRecGetData(record) + sizeof(ginxlogInsertListPage)); + + if (record->xl_info & XLR_BKP_BLOCK_1) + return; + + buffer = XLogReadBuffer(data->node, data->blkno, true); + page = BufferGetPage(buffer); + + GinInitBuffer(buffer, GIN_LIST); + GinPageGetOpaque(page)->rightlink = data->rightlink; + if ( data->rightlink == InvalidBlockNumber ) + GinPageSetFullRow(page); + + for(i=0; intuples; i++) + { + tupsize = IndexTupleSize(tuples); + + l = PageAddItem(page, (Item)tuples, tupsize, off, false, false); + + if (l == InvalidOffsetNumber) + elog(ERROR, "failed to add item to index page"); + + tuples = (IndexTuple)( ((char*)tuples) + tupsize ); + } + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + + UnlockReleaseBuffer(buffer); +} + +static void +ginRedoDeleteListPages(XLogRecPtr lsn, XLogRecord *record) +{ + ginxlogDeleteListPages *data = (ginxlogDeleteListPages*) XLogRecGetData(record); + Buffer metabuffer; + Page metapage; + int i; + + metabuffer = XLogReadBuffer(data->node, GIN_METAPAGE_BLKNO, false); + metapage = BufferGetPage(metabuffer); + + if (!XLByteLE(lsn, PageGetLSN(metapage))) + { + memcpy( 
GinPageGetMeta(metapage), &data->metadata, sizeof(GinMetaPageData)); + PageSetLSN(metapage, lsn); + PageSetTLI(metapage, ThisTimeLineID); + MarkBufferDirty(metabuffer); + } + + for(i=0; indeleted; i++) + { + Buffer buffer = XLogReadBuffer(data->node,data->toDelete[i],false); + Page page = BufferGetPage(buffer); + + if ( !XLByteLE(lsn, PageGetLSN(page))) + { + GinPageGetOpaque(page)->flags = GIN_DELETED; + + PageSetLSN(page, lsn); + PageSetTLI(page, ThisTimeLineID); + MarkBufferDirty(buffer); + } + + UnlockReleaseBuffer(buffer); + } + UnlockReleaseBuffer(metabuffer); +} + void gin_redo(XLogRecPtr lsn, XLogRecord *record) { @@ -459,6 +624,15 @@ gin_redo(XLogRecPtr lsn, XLogRecord *record) case XLOG_GIN_DELETE_PAGE: ginRedoDeletePage(lsn, record); break; + case XLOG_GIN_UPDATE_META_PAGE: + ginRedoUpdateMetapage(lsn, record); + break; + case XLOG_GIN_INSERT_LISTPAGE: + ginRedoInsertListPage(lsn, record); + break; + case XLOG_GIN_DELETE_LISTPAGE: + ginRedoDeleteListPages(lsn, record); + break; default: elog(PANIC, "gin_redo: unknown op code %u", info); } @@ -514,6 +688,18 @@ gin_desc(StringInfo buf, uint8 xl_info, char *rec) appendStringInfo(buf, "Delete page, "); desc_node(buf, ((ginxlogDeletePage *) rec)->node, ((ginxlogDeletePage *) rec)->blkno); break; + case XLOG_GIN_UPDATE_META_PAGE: + appendStringInfo(buf, "Update metapage, "); + desc_node(buf, ((ginxlogUpdateMeta *) rec)->node, ((ginxlogUpdateMeta *) rec)->metadata.tail); + break; + case XLOG_GIN_INSERT_LISTPAGE: + appendStringInfo(buf, "insert new list page, "); + desc_node(buf, ((ginxlogInsertListPage *) rec)->node, ((ginxlogInsertListPage *) rec)->blkno); + break; + case XLOG_GIN_DELETE_LISTPAGE: + appendStringInfo(buf, "Delete list page (%d), ", ((ginxlogDeleteListPages *) rec)->ndeleted); + desc_node(buf, ((ginxlogDeleteListPages *) rec)->node, ((ginxlogDeleteListPages *) rec)->metadata.head); + break; default: elog(PANIC, "gin_desc: unknown op code %u", info); } diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 2fc6f05..4434ab4 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -193,6 +193,7 @@ CREATE VIEW pg_stat_all_tables AS pg_stat_get_tuples_updated(C.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(C.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(C.oid) AS n_tup_hot_upd, + pg_stat_get_fresh_inserted_tuples(C.oid) AS n_fresh_tup, pg_stat_get_live_tuples(C.oid) AS n_live_tup, pg_stat_get_dead_tuples(C.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(C.oid) as last_vacuum, diff --git a/src/backend/nodes/tidbitmap.c b/src/backend/nodes/tidbitmap.c index ffc882f..0864a04 100644 --- a/src/backend/nodes/tidbitmap.c +++ b/src/backend/nodes/tidbitmap.c @@ -306,6 +306,47 @@ tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, } /* + * tbm_check_tuple - Check presence of tuple's ID in a TIDBitmap + */ +bool +tbm_check_tuple(TIDBitmap *tbm, const ItemPointer tid) { + BlockNumber blk = ItemPointerGetBlockNumber(tid); + OffsetNumber off = ItemPointerGetOffsetNumber(tid); + PagetableEntry *page; + int wordnum, + bitnum; + + /* safety check to ensure we don't overrun bit array bounds */ + if (off < 1 || off > MAX_TUPLES_PER_PAGE) + elog(ERROR, "tuple offset out of range: %u", off); + + if (tbm_page_is_lossy(tbm, blk)) + return true; /* whole page is already marked */ + + page = tbm_get_pageentry(tbm, blk); + if (page->ischunk) + { + wordnum = bitnum = 0; + } + else + { + wordnum = WORDNUM(off - 1); + bitnum = BITNUM(off - 1); + } + + 
return ( page->words[wordnum] & ((bitmapword) 1 << bitnum) ) ? true : false; +} + +/* + * tbm_has_lossy - returns true if there is at least one lossy page + */ +bool +tbm_has_lossy(TIDBitmap *tbm) +{ + return (tbm->nchunks>0); +} + +/* * tbm_union - set union * * a is modified in-place, b is not changed diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 2c68779..324ae44 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -2477,6 +2477,58 @@ table_recheck_autovac(Oid relid, HTAB *table_toast_map) } /* + * relation_has_pending_indexes + * + * Returns true if the relation has indexes with delayed insertion. + * Currently, only GIN has that capability. + */ + +static bool +relation_has_pending_indexes(Oid relid, Form_pg_class classForm) +{ + Relation rel; + List *indexoidlist; + ListCell *indexoidscan; + bool has = false; + + /* only an ordinary cataloged heap relation can contain such indexes */ + if ( classForm->relkind != RELKIND_RELATION ) + return false; + + /* no indexes at all */ + if ( classForm->relhasindex == false ) + return false; + + rel = RelationIdGetRelation(relid); + + indexoidlist = RelationGetIndexList(rel); + + foreach(indexoidscan, indexoidlist) + { + Oid indexoid = lfirst_oid(indexoidscan); + Relation irel = RelationIdGetRelation(indexoid); + + /* + * Currently, only GIN + */ + if ( irel->rd_rel->relam == GIN_AM_OID ) + { + RelationClose(irel); + has = true; + break; + } + + RelationClose(irel); + } + + list_free(indexoidlist); + + RelationClose(rel); + + return has; +} + +/* * relation_needs_vacanalyze * * Check whether a relation needs to be vacuumed or analyzed; return each into @@ -2533,7 +2585,8 @@ relation_needs_vacanalyze(Oid relid, /* number of vacuum (resp. analyze) tuples at this time */ float4 vactuples, - anltuples; + anltuples, + instuples; /* freeze parameters */ int freeze_max_age; @@ -2598,6 +2651,7 @@ relation_needs_vacanalyze(Oid relid, vactuples = tabentry->n_dead_tuples; anltuples = tabentry->n_live_tuples + tabentry->n_dead_tuples - tabentry->last_anl_tuples; + instuples = tabentry->n_inserted_tuples; vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples; anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples; @@ -2611,8 +2665,13 @@ relation_needs_vacanalyze(Oid relid, NameStr(classForm->relname), vactuples, vacthresh, anltuples, anlthresh); - /* Determine if this table needs vacuum or analyze. */ - *dovacuum = force_vacuum || (vactuples > vacthresh); + /* + * Determine if this table needs vacuum or analyze. + * Use vac_base_thresh as the threshold for instuples, because + * the time to search GIN's pending pages grows linearly with their number.
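+ * (vac_base_thresh comes from autovacuum_vacuum_threshold, possibly overridden + * for the table by a pg_autovacuum entry.)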
+ */ + *dovacuum = force_vacuum || (vactuples > vacthresh) || + (relation_has_pending_indexes(relid, classForm) && instuples > vac_base_thresh); *doanalyze = (anltuples > anlthresh); } else diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 5ae0ec1..24573ff 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3537,6 +3537,9 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_updated = tabmsg[i].t_counts.t_tuples_updated; tabentry->tuples_deleted = tabmsg[i].t_counts.t_tuples_deleted; tabentry->tuples_hot_updated = tabmsg[i].t_counts.t_tuples_hot_updated; + tabentry->n_inserted_tuples = tabmsg[i].t_counts.t_tuples_inserted + + tabmsg[i].t_counts.t_tuples_updated - + tabmsg[i].t_counts.t_tuples_hot_updated; tabentry->n_live_tuples = tabmsg[i].t_counts.t_new_live_tuples; tabentry->n_dead_tuples = tabmsg[i].t_counts.t_new_dead_tuples; tabentry->blocks_fetched = tabmsg[i].t_counts.t_blocks_fetched; @@ -3560,6 +3563,9 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->tuples_updated += tabmsg[i].t_counts.t_tuples_updated; tabentry->tuples_deleted += tabmsg[i].t_counts.t_tuples_deleted; tabentry->tuples_hot_updated += tabmsg[i].t_counts.t_tuples_hot_updated; + tabentry->n_inserted_tuples += tabmsg[i].t_counts.t_tuples_inserted + + tabmsg[i].t_counts.t_tuples_updated - + tabmsg[i].t_counts.t_tuples_hot_updated; tabentry->n_live_tuples += tabmsg[i].t_counts.t_new_live_tuples; tabentry->n_dead_tuples += tabmsg[i].t_counts.t_new_dead_tuples; tabentry->blocks_fetched += tabmsg[i].t_counts.t_blocks_fetched; @@ -3570,6 +3576,8 @@ pgstat_recv_tabstat(PgStat_MsgTabstat *msg, int len) tabentry->n_live_tuples = Max(tabentry->n_live_tuples, 0); /* Likewise for n_dead_tuples */ tabentry->n_dead_tuples = Max(tabentry->n_dead_tuples, 0); + /* Likewise for n_inserted_tuples */ + tabentry->n_inserted_tuples = Max(tabentry->n_inserted_tuples, 0); /* * Add per-table stats to the per-database entry, too. @@ -3770,6 +3778,7 @@ pgstat_recv_vacuum(PgStat_MsgVacuum *msg, int len) tabentry->n_live_tuples = msg->m_tuples; /* Resetting dead_tuples to 0 is an approximation ... 
*/ tabentry->n_dead_tuples = 0; + tabentry->n_inserted_tuples = 0; if (msg->m_analyze) { if (msg->m_scanned_all) diff --git a/src/backend/utils/adt/pgstatfuncs.c b/src/backend/utils/adt/pgstatfuncs.c index 77c2baa..381de6f 100644 --- a/src/backend/utils/adt/pgstatfuncs.c +++ b/src/backend/utils/adt/pgstatfuncs.c @@ -31,6 +31,7 @@ extern Datum pg_stat_get_tuples_updated(PG_FUNCTION_ARGS); extern Datum pg_stat_get_tuples_deleted(PG_FUNCTION_ARGS); extern Datum pg_stat_get_tuples_hot_updated(PG_FUNCTION_ARGS); extern Datum pg_stat_get_live_tuples(PG_FUNCTION_ARGS); +extern Datum pg_stat_get_fresh_inserted_tuples(PG_FUNCTION_ARGS); extern Datum pg_stat_get_dead_tuples(PG_FUNCTION_ARGS); extern Datum pg_stat_get_blocks_fetched(PG_FUNCTION_ARGS); extern Datum pg_stat_get_blocks_hit(PG_FUNCTION_ARGS); @@ -209,6 +210,20 @@ pg_stat_get_live_tuples(PG_FUNCTION_ARGS) PG_RETURN_INT64(result); } +Datum +pg_stat_get_fresh_inserted_tuples(PG_FUNCTION_ARGS) +{ + Oid relid = PG_GETARG_OID(0); + int64 result; + PgStat_StatTabEntry *tabentry; + + if ((tabentry = pgstat_fetch_stat_tabentry(relid)) == NULL) + result = 0; + else + result = (int64) (tabentry->n_inserted_tuples); + + PG_RETURN_INT64(result); +} Datum pg_stat_get_dead_tuples(PG_FUNCTION_ARGS) diff --git a/src/include/access/gin.h b/src/include/access/gin.h index 0fd2cbd..f514358 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -21,6 +21,7 @@ #include "storage/buf.h" #include "storage/off.h" #include "storage/relfilenode.h" +#include "utils/rel.h" /* @@ -52,11 +53,34 @@ typedef struct GinPageOpaqueData typedef GinPageOpaqueData *GinPageOpaque; -#define GIN_ROOT_BLKNO (0) +#define GIN_METAPAGE_BLKNO (0) +#define GIN_ROOT_BLKNO (1) #define GIN_DATA (1 << 0) #define GIN_LEAF (1 << 1) #define GIN_DELETED (1 << 2) +#define GIN_META (1 << 3) +#define GIN_LIST (1 << 4) +#define GIN_LIST_FULLROW (1 << 5) /* makes sense only on GIN_LIST page */ + +typedef struct GinMetaPageData +{ + /* + * Pointers to head and tail of list of GIN_LIST pages. These store + * fast-inserted entries that haven't yet been moved into the regular + * GIN structure. + */ + BlockNumber head; + BlockNumber tail; + + /* + * Free space in bytes in the list's tail page. + */ + uint32 tailFreeSize; +} GinMetaPageData; + +#define GinPageGetMeta(p) \ + ((GinMetaPageData *) PageGetContents(p)) /* * Works on page @@ -68,6 +92,8 @@ typedef GinPageOpaqueData *GinPageOpaque; #define GinPageSetNonLeaf(page) ( GinPageGetOpaque(page)->flags &= ~GIN_LEAF ) #define GinPageIsData(page) ( GinPageGetOpaque(page)->flags & GIN_DATA ) #define GinPageSetData(page) ( GinPageGetOpaque(page)->flags |= GIN_DATA ) +#define GinPageHasFullRow(page) ( GinPageGetOpaque(page)->flags & GIN_LIST_FULLROW ) +#define GinPageSetFullRow(page) ( GinPageGetOpaque(page)->flags |= GIN_LIST_FULLROW ) #define GinPageIsDeleted(page) ( GinPageGetOpaque(page)->flags & GIN_DELETED) #define GinPageSetDeleted(page) ( GinPageGetOpaque(page)->flags |= GIN_DELETED) @@ -135,6 +161,20 @@ typedef struct - GinPageGetOpaque(page)->maxoff * GinSizeOfItem(page) \ - MAXALIGN(sizeof(GinPageOpaqueData))) +/* + * storage type for GIN's options. Must be upward compatible with + * StdRdOptions, since we might call RelationGetFillFactor(). + */ +typedef struct GinOptions +{ + StdRdOptions std; /* standard options */ + bool useFastUpdate; /* use fast updates? */ +} GinOptions; + +#define GIN_DEFAULT_USE_FASTUPDATE true +#define GinGetUseFastUpdate(relation) \ + ((relation)->rd_options ? 
\ + ((GinOptions *) (relation)->rd_options)->useFastUpdate : GIN_DEFAULT_USE_FASTUPDATE) #define GIN_UNLOCK BUFFER_LOCK_UNLOCK #define GIN_SHARE BUFFER_LOCK_SHARE @@ -234,12 +274,49 @@ typedef struct ginxlogDeletePage BlockNumber rightLink; } ginxlogDeletePage; + +#define XLOG_GIN_UPDATE_META_PAGE 0x60 + +typedef struct ginxlogUpdateMeta +{ + RelFileNode node; + GinMetaPageData metadata; + BlockNumber prevTail; + BlockNumber newRightlink; + int32 ntuples; /* if ntuples > 0 then metadata.tail was updated with + that many tuples; otherwise a new sublist was inserted */ + /* array of inserted tuples follows */ +} ginxlogUpdateMeta; + +#define XLOG_GIN_INSERT_LISTPAGE 0x70 + +typedef struct ginxlogInsertListPage +{ + RelFileNode node; + BlockNumber blkno; + BlockNumber rightlink; + int32 ntuples; + /* array of inserted tuples follows */ +} ginxlogInsertListPage; + +#define XLOG_GIN_DELETE_LISTPAGE 0x80 + +#define NDELETE_AT_ONCE (16) +typedef struct ginxlogDeleteListPages +{ + RelFileNode node; + GinMetaPageData metadata; + int32 ndeleted; + BlockNumber toDelete[ NDELETE_AT_ONCE ]; +} ginxlogDeleteListPages; + /* ginutil.c */ extern Datum ginoptions(PG_FUNCTION_ARGS); extern void initGinState(GinState *state, Relation index); extern Buffer GinNewBuffer(Relation index); extern void GinInitBuffer(Buffer b, uint32 f); extern void GinInitPage(Page page, uint32 f, Size pageSize); +extern void GinInitMetabuffer(Buffer b); extern int compareEntries(GinState *ginstate, OffsetNumber attnum, Datum a, Datum b); extern int compareAttEntries(GinState *ginstate, OffsetNumber attnum_a, Datum a, OffsetNumber attnum_b, Datum b); @@ -252,6 +329,8 @@ extern OffsetNumber gintuple_get_attrnum(GinState *ginstate, IndexTuple tuple); /* gininsert.c */ extern Datum ginbuild(PG_FUNCTION_ARGS); extern Datum gininsert(PG_FUNCTION_ARGS); +extern void ginEntryInsert(Relation index, GinState *ginstate, OffsetNumber attnum, Datum value, + ItemPointerData *items, uint32 nitem, bool isBuild); /* ginxlog.c */ extern void gin_redo(XLogRecPtr lsn, XLogRecord *record); @@ -425,6 +504,10 @@ typedef struct GinScanOpaqueData uint32 nkeys; bool isVoidRes; /* true if ginstate.extractQueryFn guarantees * that nothing will be found */ + bool isScanFastInsert; /* true while scanning the fast-update pending pages */ + TIDBitmap *scanFastTuples; + TBMIterateResult *scanFastResult; + OffsetNumber scanFastOffset; } GinScanOpaqueData; typedef GinScanOpaqueData *GinScanOpaque; @@ -488,4 +571,23 @@ extern void ginInsertRecordBA(BuildAccumulator *accum, OffsetNumber attnum, Datum *entries, int32 nentry); extern ItemPointerData *ginGetEntry(BuildAccumulator *accum, OffsetNumber *attnum, Datum *entry, uint32 *n); +/* ginfast.c */ + +typedef struct GinTupleCollector { + IndexTuple *tuples; + uint32 ntuples; + uint32 lentuples; + uint32 sumsize; +} GinTupleCollector; + +extern void ginHeapTupleFastInsert(Relation index, GinTupleCollector *collector); +extern uint32 ginHeapTupleFastCollect(Relation index, GinState *ginstate, + GinTupleCollector *collector, + OffsetNumber attnum, Datum value, ItemPointer item); + +#define GinListPageSize \ + ( BLCKSZ - SizeOfPageHeaderData - MAXALIGN(sizeof(GinPageOpaqueData)) ) + +extern Datum gininsertcleanup(PG_FUNCTION_ARGS); + #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index d405d82..9165f08 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -2928,6 +2928,8 @@ DATA(insert OID = 1933 ( pg_stat_get_tuples_deleted PGNSP PGUID 12 1 0 0 f f f DESCR("statistics: number
of tuples deleted"); DATA(insert OID = 1972 ( pg_stat_get_tuples_hot_updated PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_tuples_hot_updated _null_ _null_ _null_ )); DESCR("statistics: number of tuples hot updated"); +DATA(insert OID = 2316 ( pg_stat_get_fresh_inserted_tuples PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_fresh_inserted_tuples _null_ _null_ _null_ )); +DESCR("statistics: number of inserted tuples since last vacuum"); DATA(insert OID = 2878 ( pg_stat_get_live_tuples PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_live_tuples _null_ _null_ _null_ )); DESCR("statistics: number of live tuples"); DATA(insert OID = 2879 ( pg_stat_get_dead_tuples PGNSP PGUID 12 1 0 0 f f f t f s 1 0 20 "26" _null_ _null_ _null_ _null_ pg_stat_get_dead_tuples _null_ _null_ _null_ )); diff --git a/src/include/nodes/tidbitmap.h b/src/include/nodes/tidbitmap.h index 56d6a0d..c8dbeea 100644 --- a/src/include/nodes/tidbitmap.h +++ b/src/include/nodes/tidbitmap.h @@ -49,6 +49,8 @@ extern void tbm_free(TIDBitmap *tbm); extern void tbm_add_tuples(TIDBitmap *tbm, const ItemPointer tids, int ntids, bool recheck); +extern bool tbm_check_tuple(TIDBitmap *tbm, const ItemPointer tid); +extern bool tbm_has_lossy(TIDBitmap *tbm); extern void tbm_union(TIDBitmap *a, const TIDBitmap *b); extern void tbm_intersect(TIDBitmap *a, const TIDBitmap *b); diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 4a1e274..79754dc 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -476,6 +476,8 @@ typedef struct PgStat_StatTabEntry PgStat_Counter tuples_deleted; PgStat_Counter tuples_hot_updated; + PgStat_Counter n_inserted_tuples; /* number of non-hot inserted tuples + * since last vacuum */ PgStat_Counter n_live_tuples; PgStat_Counter n_dead_tuples; PgStat_Counter last_anl_tuples; diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out index 977b17c..c037696 100644 --- a/src/test/regress/expected/rules.out +++ b/src/test/regress/expected/rules.out @@ -1291,14 +1291,14 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem pg_shadow | SELECT pg_authid.rolname AS usename, pg_authid.oid AS usesysid, pg_authid.rolcreatedb AS usecreatedb, pg_authid.rolsuper AS usesuper, pg_authid.rolcatupdate AS usecatupd, pg_authid.rolpassword AS passwd, (pg_authid.rolvaliduntil)::abstime AS valuntil, pg_authid.rolconfig AS useconfig FROM pg_authid WHERE pg_authid.rolcanlogin; pg_stat_activity | SELECT s.datid, d.datname, s.procpid, s.usesysid, u.rolname AS usename, s.current_query, s.waiting, s.xact_start, s.query_start, s.backend_start, s.client_addr, s.client_port FROM pg_database d, pg_stat_get_activity(NULL::integer) s(datid, procpid, usesysid, current_query, waiting, xact_start, query_start, backend_start, client_addr, client_port), pg_authid u WHERE ((s.datid = d.oid) AND (s.usesysid = u.oid)); pg_stat_all_indexes | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, pg_stat_get_numscans(i.oid) AS idx_scan, pg_stat_get_tuples_returned(i.oid) AS idx_tup_read, pg_stat_get_tuples_fetched(i.oid) AS idx_tup_fetch FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])); - pg_stat_all_tables | SELECT c.oid AS relid, n.nspname AS 
schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname; + pg_stat_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, pg_stat_get_numscans(c.oid) AS seq_scan, pg_stat_get_tuples_returned(c.oid) AS seq_tup_read, (sum(pg_stat_get_numscans(i.indexrelid)))::bigint AS idx_scan, ((sum(pg_stat_get_tuples_fetched(i.indexrelid)))::bigint + pg_stat_get_tuples_fetched(c.oid)) AS idx_tup_fetch, pg_stat_get_tuples_inserted(c.oid) AS n_tup_ins, pg_stat_get_tuples_updated(c.oid) AS n_tup_upd, pg_stat_get_tuples_deleted(c.oid) AS n_tup_del, pg_stat_get_tuples_hot_updated(c.oid) AS n_tup_hot_upd, pg_stat_get_fresh_inserted_tuples(c.oid) AS n_fresh_tup, pg_stat_get_live_tuples(c.oid) AS n_live_tup, pg_stat_get_dead_tuples(c.oid) AS n_dead_tup, pg_stat_get_last_vacuum_time(c.oid) AS last_vacuum, pg_stat_get_last_autovacuum_time(c.oid) AS last_autovacuum, pg_stat_get_last_analyze_time(c.oid) AS last_analyze, pg_stat_get_last_autoanalyze_time(c.oid) AS last_autoanalyze FROM ((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname; pg_stat_bgwriter | SELECT pg_stat_get_bgwriter_timed_checkpoints() AS checkpoints_timed, pg_stat_get_bgwriter_requested_checkpoints() AS checkpoints_req, pg_stat_get_bgwriter_buf_written_checkpoints() AS buffers_checkpoint, pg_stat_get_bgwriter_buf_written_clean() AS buffers_clean, pg_stat_get_bgwriter_maxwritten_clean() AS maxwritten_clean, pg_stat_get_buf_written_backend() AS buffers_backend, pg_stat_get_buf_alloc() AS buffers_alloc; pg_stat_database | SELECT d.oid AS datid, d.datname, pg_stat_get_db_numbackends(d.oid) AS numbackends, pg_stat_get_db_xact_commit(d.oid) AS xact_commit, pg_stat_get_db_xact_rollback(d.oid) AS xact_rollback, (pg_stat_get_db_blocks_fetched(d.oid) - pg_stat_get_db_blocks_hit(d.oid)) AS blks_read, pg_stat_get_db_blocks_hit(d.oid) AS blks_hit, pg_stat_get_db_tuples_returned(d.oid) AS tup_returned, pg_stat_get_db_tuples_fetched(d.oid) AS tup_fetched, pg_stat_get_db_tuples_inserted(d.oid) AS tup_inserted, pg_stat_get_db_tuples_updated(d.oid) AS tup_updated, pg_stat_get_db_tuples_deleted(d.oid) AS tup_deleted FROM pg_database d; pg_stat_sys_indexes | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE 
((pg_stat_all_indexes.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_indexes.schemaname ~ '^pg_toast'::text)); - pg_stat_sys_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text)); + pg_stat_sys_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_fresh_tup, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname = ANY (ARRAY['pg_catalog'::name, 'information_schema'::name])) OR (pg_stat_all_tables.schemaname ~ '^pg_toast'::text)); pg_stat_user_functions | SELECT p.oid AS funcid, n.nspname AS schemaname, p.proname AS funcname, pg_stat_get_function_calls(p.oid) AS calls, (pg_stat_get_function_time(p.oid) / 1000) AS total_time, (pg_stat_get_function_self_time(p.oid) / 1000) AS self_time FROM (pg_proc p LEFT JOIN pg_namespace n ON ((n.oid = p.pronamespace))) WHERE ((p.prolang <> (12)::oid) AND (pg_stat_get_function_calls(p.oid) IS NOT NULL)); pg_stat_user_indexes | SELECT pg_stat_all_indexes.relid, pg_stat_all_indexes.indexrelid, pg_stat_all_indexes.schemaname, pg_stat_all_indexes.relname, pg_stat_all_indexes.indexrelname, pg_stat_all_indexes.idx_scan, pg_stat_all_indexes.idx_tup_read, pg_stat_all_indexes.idx_tup_fetch FROM pg_stat_all_indexes WHERE ((pg_stat_all_indexes.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_indexes.schemaname !~ '^pg_toast'::text)); - pg_stat_user_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text)); + pg_stat_user_tables | SELECT pg_stat_all_tables.relid, pg_stat_all_tables.schemaname, pg_stat_all_tables.relname, pg_stat_all_tables.seq_scan, pg_stat_all_tables.seq_tup_read, pg_stat_all_tables.idx_scan, pg_stat_all_tables.idx_tup_fetch, 
pg_stat_all_tables.n_tup_ins, pg_stat_all_tables.n_tup_upd, pg_stat_all_tables.n_tup_del, pg_stat_all_tables.n_tup_hot_upd, pg_stat_all_tables.n_fresh_tup, pg_stat_all_tables.n_live_tup, pg_stat_all_tables.n_dead_tup, pg_stat_all_tables.last_vacuum, pg_stat_all_tables.last_autovacuum, pg_stat_all_tables.last_analyze, pg_stat_all_tables.last_autoanalyze FROM pg_stat_all_tables WHERE ((pg_stat_all_tables.schemaname <> ALL (ARRAY['pg_catalog'::name, 'information_schema'::name])) AND (pg_stat_all_tables.schemaname !~ '^pg_toast'::text)); pg_statio_all_indexes | SELECT c.oid AS relid, i.oid AS indexrelid, n.nspname AS schemaname, c.relname, i.relname AS indexrelname, (pg_stat_get_blocks_fetched(i.oid) - pg_stat_get_blocks_hit(i.oid)) AS idx_blks_read, pg_stat_get_blocks_hit(i.oid) AS idx_blks_hit FROM (((pg_class c JOIN pg_index x ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])); pg_statio_all_sequences | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS blks_read, pg_stat_get_blocks_hit(c.oid) AS blks_hit FROM (pg_class c LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = 'S'::"char"); pg_statio_all_tables | SELECT c.oid AS relid, n.nspname AS schemaname, c.relname, (pg_stat_get_blocks_fetched(c.oid) - pg_stat_get_blocks_hit(c.oid)) AS heap_blks_read, pg_stat_get_blocks_hit(c.oid) AS heap_blks_hit, (sum((pg_stat_get_blocks_fetched(i.indexrelid) - pg_stat_get_blocks_hit(i.indexrelid))))::bigint AS idx_blks_read, (sum(pg_stat_get_blocks_hit(i.indexrelid)))::bigint AS idx_blks_hit, (pg_stat_get_blocks_fetched(t.oid) - pg_stat_get_blocks_hit(t.oid)) AS toast_blks_read, pg_stat_get_blocks_hit(t.oid) AS toast_blks_hit, (pg_stat_get_blocks_fetched(x.oid) - pg_stat_get_blocks_hit(x.oid)) AS tidx_blks_read, pg_stat_get_blocks_hit(x.oid) AS tidx_blks_hit FROM ((((pg_class c LEFT JOIN pg_index i ON ((c.oid = i.indrelid))) LEFT JOIN pg_class t ON ((c.reltoastrelid = t.oid))) LEFT JOIN pg_class x ON ((t.reltoastidxid = x.oid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) WHERE (c.relkind = ANY (ARRAY['r'::"char", 't'::"char"])) GROUP BY c.oid, n.nspname, c.relname, t.oid, x.oid;
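
A minimal, illustrative SQL sketch of how the pieces added by this patch fit together (the table "docs" and column "body_tsv" are hypothetical names; n_fresh_tup, pg_stat_get_fresh_inserted_tuples(), and the fastupdate reloption are introduced above, and autovacuum_vacuum_threshold is the GUC that feeds vac_base_thresh, possibly overridden per table in pg_autovacuum):

-- Hypothetical table and GIN index; fastupdate is on by default.
CREATE TABLE docs (id serial PRIMARY KEY, body_tsv tsvector);
CREATE INDEX docs_body_gin ON docs USING gin (body_tsv);

-- n_fresh_tup is the new pg_stat_*_tables column; it counts inserts plus
-- non-HOT updates since the last VACUUM and is what autovacuum compares
-- against vac_base_thresh for tables that have a GIN index.
SELECT relname,
       n_fresh_tup,
       current_setting('autovacuum_vacuum_threshold') AS vac_base_thresh
FROM pg_stat_user_tables
WHERE relname = 'docs';

-- The same counter via the underlying statistics function.
SELECT pg_stat_get_fresh_inserted_tuples('docs'::regclass);

-- VACUUM moves pending entries into the main GIN structure and resets
-- the counter to zero.
VACUUM docs;

-- To bypass the pending list entirely for this index; a follow-up VACUUM
-- flushes anything already collected there.
ALTER INDEX docs_body_gin SET (fastupdate = off);
VACUUM docs;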