diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c index 868c14ec8f..6cc25806e6 100644 --- a/contrib/amcheck/verify_nbtree.c +++ b/contrib/amcheck/verify_nbtree.c @@ -25,6 +25,7 @@ #include "commands/tablecmds.h" #include "miscadmin.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "utils/memutils.h" #include "utils/snapmgr.h" @@ -284,7 +285,7 @@ bt_check_every_level(Relation rel, bool readonly) * RecentGlobalXmin assertion matches index_getnext_tid(). See note on * RecentGlobalXmin/B-Tree page deletion. */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); + Assert(TransactionIdIsValid(GetRecentGlobalXmin())); /* * Initialize state for entire verification operation diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 4dd9d029e6..cbe6bb2ac7 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -17681,10 +17681,6 @@ SELECT collation for ('foo' COLLATE "de_DE"); - txid_snapshot_xip - - - txid_snapshot_xmax @@ -17731,11 +17727,6 @@ SELECT collation for ('foo' COLLATE "de_DE"); get current snapshot - txid_snapshot_xip(txid_snapshot) - setof bigint - get in-progress transaction IDs in snapshot - - txid_snapshot_xmax(txid_snapshot) bigint get xmax of snapshot diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 3acef279f4..9e853ec02b 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -2023,8 +2023,6 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, if (all_dead) *all_dead = first_call; - Assert(TransactionIdIsValid(RecentGlobalXmin)); - Assert(ItemPointerGetBlockNumber(tid) == BufferGetBlockNumber(buffer)); offnum = ItemPointerGetOffsetNumber(tid); at_chain_start = first_call; @@ -2123,7 +2121,7 @@ heap_hot_search_buffer(ItemPointer tid, Relation relation, Buffer buffer, * planner's get_actual_variable_range() function to match. */ if (all_dead && *all_dead && - !HeapTupleIsSurelyDead(heapTuple, RecentGlobalXmin)) + !HeapTupleIsSurelyDead(heapTuple, GetRecentGlobalXmin())) *all_dead = false; /* @@ -3784,9 +3782,8 @@ l2: update_xact = InvalidTransactionId; /* - * There was no UPDATE in the MultiXact; or it aborted. No - * TransactionIdIsInProgress() call needed here, since we called - * MultiXactIdWait() above. + * There was no UPDATE in the MultiXact; or it aborted. It cannot + * be in-progress anymore, since we called MultiXactIdWait() above. */ if (!TransactionIdIsValid(update_xact) || TransactionIdDidAbort(update_xact)) @@ -5267,7 +5264,7 @@ heap_acquire_tuplock(Relation relation, ItemPointer tid, LockTupleMode mode, * either here, or within MultiXactIdExpand. * * There is a similar race condition possible when the old xmax was a regular - * TransactionId. We test TransactionIdIsInProgress again just to narrow the + * TransactionId. We test TransactionIdGetStatus again just to narrow the * window, but it's still possible to end up creating an unnecessary * MultiXactId. Fortunately this is harmless. */ @@ -5278,6 +5275,7 @@ compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask, TransactionId *result_xmax, uint16 *result_infomask, uint16 *result_infomask2) { + TransactionIdStatus xidstatus; TransactionId new_xmax; uint16 new_infomask, new_infomask2; @@ -5413,7 +5411,7 @@ l5: new_xmax = MultiXactIdCreate(xmax, status, add_to_xmax, new_status); GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); } - else if (TransactionIdIsInProgress(xmax)) + else if ((xidstatus = TransactionIdGetStatus(xmax)) == XID_INPROGRESS) { /* * If the XMAX is a valid, in-progress TransactionId, then we need to @@ -5442,8 +5440,9 @@ l5: /* * LOCK_ONLY can be present alone only when a page has been * upgraded by pg_upgrade. But in that case, - * TransactionIdIsInProgress() should have returned false. We - * assume it's no longer locked in this case. + * TransactionIdGetStatus() should not have returned + * XID_INPROGRESS. We assume it's no longer locked in this + * case. */ elog(WARNING, "LOCK_ONLY found for Xid in progress %u", xmax); old_infomask |= HEAP_XMAX_INVALID; @@ -5496,7 +5495,7 @@ l5: GetMultiXactIdHintBits(new_xmax, &new_infomask, &new_infomask2); } else if (!HEAP_XMAX_IS_LOCKED_ONLY(old_infomask) && - TransactionIdDidCommit(xmax)) + xidstatus == XID_COMMITTED) { /* * It's a committed update, so we gotta preserve him as updater of the @@ -5525,7 +5524,7 @@ l5: /* * Can get here iff the locking/updating transaction was running when * the infomask was extracted from the tuple, but finished before - * TransactionIdIsInProgress got to run. Deal with it as if there was + * TransactionIdGetStatus got to run. Deal with it as if there was * no locker at all in the first place. */ old_infomask |= HEAP_XMAX_INVALID; @@ -5558,15 +5557,11 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, LockTupleMode mode, bool *needwait) { MultiXactStatus wantedstatus; + TransactionIdStatus xidstatus; *needwait = false; wantedstatus = get_mxact_status_for_lock(mode, false); - /* - * Note: we *must* check TransactionIdIsInProgress before - * TransactionIdDidAbort/Commit; see comment at top of tqual.c for an - * explanation. - */ if (TransactionIdIsCurrentTransactionId(xid)) { /* @@ -5576,7 +5571,9 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, */ return HeapTupleSelfUpdated; } - else if (TransactionIdIsInProgress(xid)) + xidstatus = TransactionIdGetStatus(xid); + + if (xidstatus == XID_INPROGRESS) { /* * If the locking transaction is running, what we do depends on @@ -5596,37 +5593,34 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid, */ return HeapTupleMayBeUpdated; } - else if (TransactionIdDidAbort(xid)) + else if (xidstatus == XID_ABORTED) return HeapTupleMayBeUpdated; - else if (TransactionIdDidCommit(xid)) - { - /* - * The other transaction committed. If it was only a locker, then the - * lock is completely gone now and we can return success; but if it - * was an update, then what we do depends on whether the two lock - * modes conflict. If they conflict, then we must report error to - * caller. But if they don't, we can fall through to allow the current - * transaction to lock the tuple. - * - * Note: the reason we worry about ISUPDATE here is because as soon as - * a transaction ends, all its locks are gone and meaningless, and - * thus we can ignore them; whereas its updates persist. In the - * TransactionIdIsInProgress case, above, we don't need to check - * because we know the lock is still "alive" and thus a conflict needs - * always be checked. - */ - if (!ISUPDATE_from_mxstatus(status)) - return HeapTupleMayBeUpdated; - if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), - LOCKMODE_from_mxstatus(wantedstatus))) - /* bummer */ - return HeapTupleUpdated; + /* + * The other transaction committed. If it was only a locker, then the + * lock is completely gone now and we can return success; but if it + * was an update, then what we do depends on whether the two lock + * modes conflict. If they conflict, then we must report error to + * caller. But if they don't, we can fall through to allow the current + * transaction to lock the tuple. + * + * Note: the reason we worry about ISUPDATE here is because as soon as + * a transaction ends, all its locks are gone and meaningless, and + * thus we can ignore them; whereas its updates persist. In the + * XID_INPROGRESS case, above, we don't need to check + * because we know the lock is still "alive" and thus a conflict needs + * always be checked. + */ + Assert(xidstatus == XID_COMMITTED); + if (!ISUPDATE_from_mxstatus(status)) return HeapTupleMayBeUpdated; - } - /* Not in progress, not aborted, not committed -- must have crashed */ + if (DoLockModesConflict(LOCKMODE_from_mxstatus(status), + LOCKMODE_from_mxstatus(wantedstatus))) + /* bummer */ + return HeapTupleUpdated; + return HeapTupleMayBeUpdated; } @@ -6160,8 +6154,8 @@ heap_abort_speculative(Relation relation, HeapTuple tuple) * RecentGlobalXmin. That's not pretty, but it doesn't seem worth * inventing a nicer API for this. */ - Assert(TransactionIdIsValid(RecentGlobalXmin)); - PageSetPrunable(page, RecentGlobalXmin); + Assert(TransactionIdIsValid(GetRecentGlobalXmin())); + PageSetPrunable(page, GetRecentGlobalXmin()); /* store transaction information of xact deleting the tuple */ tp.t_data->t_infomask &= ~(HEAP_XMAX_BITS | HEAP_MOVED); @@ -6483,6 +6477,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, if (ISUPDATE_from_mxstatus(members[i].status)) { TransactionId xid = members[i].xid; + TransactionIdStatus xidstatus; /* * It's an update; should we keep it? If the transaction is known @@ -6495,13 +6490,13 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, * TransactionIdIsInProgress before TransactionIdDidCommit, * because of race conditions explained in detail in tqual.c. */ - if (TransactionIdIsCurrentTransactionId(xid) || - TransactionIdIsInProgress(xid)) + xidstatus = TransactionIdGetStatus(xid); + if (xidstatus == XID_INPROGRESS) { Assert(!TransactionIdIsValid(update_xid)); update_xid = xid; } - else if (TransactionIdDidCommit(xid)) + else if (xidstatus == XID_COMMITTED) { /* * The transaction committed, so we can tell caller to set @@ -6539,8 +6534,7 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask, else { /* We only keep lockers if they are still running */ - if (TransactionIdIsCurrentTransactionId(members[i].xid) || - TransactionIdIsInProgress(members[i].xid)) + if (TransactionIdGetStatus(members[i].xid) == XID_INPROGRESS) { /* running locker cannot possibly be older than the cutoff */ Assert(!TransactionIdPrecedes(members[i].xid, cutoff_xid)); @@ -7014,6 +7008,7 @@ DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, { TransactionId memxid; LOCKMODE memlockmode; + TransactionIdStatus xidstatus; memlockmode = LOCKMODE_from_mxstatus(members[i].status); @@ -7026,16 +7021,18 @@ DoesMultiXactIdConflict(MultiXactId multi, uint16 infomask, if (TransactionIdIsCurrentTransactionId(memxid)) continue; + xidstatus = TransactionIdGetStatus(memxid); + if (ISUPDATE_from_mxstatus(members[i].status)) { /* ignore aborted updaters */ - if (TransactionIdDidAbort(memxid)) + if (xidstatus == XID_ABORTED) continue; } else { /* ignore lockers-only that are no longer in progress */ - if (!TransactionIdIsInProgress(memxid)) + if (xidstatus != XID_INPROGRESS) continue; } @@ -7115,7 +7112,7 @@ Do_MultiXactIdWait(MultiXactId multi, MultiXactStatus status, if (!DoLockModesConflict(LOCKMODE_from_mxstatus(memstatus), LOCKMODE_from_mxstatus(status))) { - if (remaining && TransactionIdIsInProgress(memxid)) + if (remaining && TransactionIdGetStatus(memxid) == XID_INPROGRESS) remain++; continue; } diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c index 9f33e0ce07..0a61804483 100644 --- a/src/backend/access/heap/pruneheap.c +++ b/src/backend/access/heap/pruneheap.c @@ -23,6 +23,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "storage/procarray.h" #include "utils/snapmgr.h" #include "utils/rel.h" #include "utils/tqual.h" @@ -101,10 +102,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer) */ if (IsCatalogRelation(relation) || RelationIsAccessibleInLogicalDecoding(relation)) - OldestXmin = RecentGlobalXmin; + OldestXmin = GetRecentGlobalXmin(); else OldestXmin = - TransactionIdLimitedForOldSnapshots(RecentGlobalDataXmin, + TransactionIdLimitedForOldSnapshots(GetRecentGlobalDataXmin(), relation); Assert(TransactionIdIsValid(OldestXmin)); diff --git a/src/backend/access/index/indexam.c b/src/backend/access/index/indexam.c index edf4172eb2..ff3ec0dbeb 100644 --- a/src/backend/access/index/indexam.c +++ b/src/backend/access/index/indexam.c @@ -530,8 +530,6 @@ index_getnext_tid(IndexScanDesc scan, ScanDirection direction) SCAN_CHECKS; CHECK_SCAN_PROCEDURE(amgettuple); - Assert(TransactionIdIsValid(RecentGlobalXmin)); - /* * The AM's amgettuple proc finds the next index entry matching the scan * keys, and puts the TID into scan->xs_ctup.t_self. It should also set diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README index a3f11da8d5..db92670e68 100644 --- a/src/backend/access/nbtree/README +++ b/src/backend/access/nbtree/README @@ -321,6 +321,9 @@ older than RecentGlobalXmin. As collateral damage, this implementation also waits for running XIDs with no snapshots and for snapshots taken until the next transaction to allocate an XID commits. +XXX: now that we use CSNs as snapshots, it would be more +straightforward to use something based on CSNs instead of RecentGlobalXmin. + Reclaiming a page doesn't actually change its state on disk --- we simply record it in the shared-memory free space map, from which it will be handed out the next time a new page is needed for a page split. The diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index c77434904e..eba1cc9ee1 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -31,6 +31,7 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "storage/procarray.h" #include "utils/snapmgr.h" static bool _bt_mark_page_halfdead(Relation rel, Buffer buf, BTStack stack); @@ -761,7 +762,7 @@ _bt_page_recyclable(Page page) */ opaque = (BTPageOpaque) PageGetSpecialPointer(page); if (P_ISDELETED(opaque) && - TransactionIdPrecedes(opaque->btpo.xact, RecentGlobalXmin)) + TransactionIdPrecedes(opaque->btpo.xact, GetRecentGlobalXmin())) return true; return false; } diff --git a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c index 278546a728..39dda72361 100644 --- a/src/backend/access/rmgrdesc/standbydesc.c +++ b/src/backend/access/rmgrdesc/standbydesc.c @@ -19,21 +19,10 @@ static void standby_desc_running_xacts(StringInfo buf, xl_running_xacts *xlrec) { - int i; - appendStringInfo(buf, "nextXid %u latestCompletedXid %u oldestRunningXid %u", xlrec->nextXid, xlrec->latestCompletedXid, xlrec->oldestRunningXid); - if (xlrec->xcnt > 0) - { - appendStringInfo(buf, "; %d xacts:", xlrec->xcnt); - for (i = 0; i < xlrec->xcnt; i++) - appendStringInfo(buf, " %u", xlrec->xids[i]); - } - - if (xlrec->subxid_overflow) - appendStringInfoString(buf, "; subxid ovf"); } void diff --git a/src/backend/access/rmgrdesc/xactdesc.c b/src/backend/access/rmgrdesc/xactdesc.c index 3aafa79e52..ef09f3c86a 100644 --- a/src/backend/access/rmgrdesc/xactdesc.c +++ b/src/backend/access/rmgrdesc/xactdesc.c @@ -255,17 +255,6 @@ xact_desc_abort(StringInfo buf, uint8 info, xl_xact_abort *xlrec) } } -static void -xact_desc_assignment(StringInfo buf, xl_xact_assignment *xlrec) -{ - int i; - - appendStringInfoString(buf, "subxacts:"); - - for (i = 0; i < xlrec->nsubxacts; i++) - appendStringInfo(buf, " %u", xlrec->xsub[i]); -} - void xact_desc(StringInfo buf, XLogReaderState *record) { @@ -285,18 +274,6 @@ xact_desc(StringInfo buf, XLogReaderState *record) xact_desc_abort(buf, XLogRecGetInfo(record), xlrec); } - else if (info == XLOG_XACT_ASSIGNMENT) - { - xl_xact_assignment *xlrec = (xl_xact_assignment *) rec; - - /* - * Note that we ignore the WAL record's xid, since we're more - * interested in the top-level xid that issued the record and which - * xids are being reported here. - */ - appendStringInfo(buf, "xtop %u: ", xlrec->xtop); - xact_desc_assignment(buf, xlrec); - } } const char * @@ -321,9 +298,6 @@ xact_identify(uint8 info) case XLOG_XACT_ABORT_PREPARED: id = "ABORT_PREPARED"; break; - case XLOG_XACT_ASSIGNMENT: - id = "ASSIGNMENT"; - break; } return id; diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index d7d5e90ef3..20aed5755f 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -26,6 +26,7 @@ #include "storage/bufmgr.h" #include "storage/indexfsm.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "utils/snapmgr.h" @@ -521,7 +522,7 @@ vacuumRedirectAndPlaceholder(Relation index, Buffer buffer) dt = (SpGistDeadTuple) PageGetItem(page, PageGetItemId(page, i)); if (dt->tupstate == SPGIST_REDIRECT && - TransactionIdPrecedes(dt->xid, RecentGlobalXmin)) + TransactionIdPrecedes(dt->xid, GetRecentGlobalXmin())) { dt->tupstate = SPGIST_PLACEHOLDER; Assert(opaque->nRedirection > 0); diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 16fbe47269..fea6d28e33 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -12,8 +12,8 @@ subdir = src/backend/access/transam top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = clog.o commit_ts.o generic_xlog.o multixact.o parallel.o rmgr.o slru.o \ - subtrans.o timeline.o transam.o twophase.o twophase_rmgr.o varsup.o \ +OBJS = clog.o commit_ts.o csnlog.o generic_xlog.o multixact.o parallel.o rmgr.o slru.o \ + timeline.o transam.o twophase.o twophase_rmgr.o varsup.o \ xact.o xlog.o xlogarchive.o xlogfuncs.o \ xloginsert.o xlogreader.o xlogutils.o diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index ad4083eb6b..b090722560 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -244,44 +244,24 @@ transaction Y as committed, then snapshot A must consider transaction Y as committed". What we actually enforce is strict serialization of commits and rollbacks -with snapshot-taking: we do not allow any transaction to exit the set of -running transactions while a snapshot is being taken. (This rule is -stronger than necessary for consistency, but is relatively simple to -enforce, and it assists with some other issues as explained below.) The -implementation of this is that GetSnapshotData takes the ProcArrayLock in -shared mode (so that multiple backends can take snapshots in parallel), -but ProcArrayEndTransaction must take the ProcArrayLock in exclusive mode -while clearing MyPgXact->xid at transaction end (either commit or abort). -(To reduce context switching, when multiple transactions commit nearly -simultaneously, we have one backend take ProcArrayLock and clear the XIDs -of multiple processes at once.) - -ProcArrayEndTransaction also holds the lock while advancing the shared -latestCompletedXid variable. This allows GetSnapshotData to use -latestCompletedXid + 1 as xmax for its snapshot: there can be no -transaction >= this xid value that the snapshot needs to consider as -completed. - -In short, then, the rule is that no transaction may exit the set of -currently-running transactions between the time we fetch latestCompletedXid -and the time we finish building our snapshot. However, this restriction -only applies to transactions that have an XID --- read-only transactions -can end without acquiring ProcArrayLock, since they don't affect anyone -else's snapshot nor latestCompletedXid. - -Transaction start, per se, doesn't have any interlocking with these -considerations, since we no longer assign an XID immediately at transaction -start. But when we do decide to allocate an XID, GetNewTransactionId must -store the new XID into the shared ProcArray before releasing XidGenLock. -This ensures that all top-level XIDs <= latestCompletedXid are either -present in the ProcArray, or not running anymore. (This guarantee doesn't -apply to subtransaction XIDs, because of the possibility that there's not -room for them in the subxid array; instead we guarantee that they are -present or the overflow flag is set.) If a backend released XidGenLock -before storing its XID into MyPgXact, then it would be possible for another -backend to allocate and commit a later XID, causing latestCompletedXid to -pass the first backend's XID, before that value became visible in the -ProcArray. That would break GetOldestXmin, as discussed below. +with snapshot-taking. Each commit is assigned a Commit Sequence Number, or +CSN for short, using a monotonically increasing counter. A snapshot is +represented by the value of the CSN counter, at the time the snapshot was +taken. All (committed) transactions with a CSN <= the snapshot's CSN are +considered as visible to the snapshot. + +When checking the visibility of a tuple, we need to look up the CSN +of the xmin/xmax. For that purpose, we store the CSN of each +transaction in the Commit Sequence Number log (csnlog). + +So, a snapshot is simply a CSN, such that all transactions that committed +before that CSN are visible, and everything later is still considered as +in-progress. However, to avoid consulting the csnlog every time the visibilty +of a tuple is checked, we also record a lower and upper bound of the XIDs +considered visible by the snapshot, in SnapshotData. When a snapshot is +taken, xmax is set to the current nextXid value; any transaction that begins +after the snapshot is surely still running. The xmin is tracked lazily in +shared memory, by AdvanceRecentGlobalXmin(). We allow GetNewTransactionId to store the XID into MyPgXact->xid (or the subxid array) without taking ProcArrayLock. This was once necessary to @@ -293,42 +273,29 @@ once, rather than assume they can read it multiple times and get the same answer each time. (Use volatile-qualified pointers when doing this, to ensure that the C compiler does exactly what you tell it to.) -Another important activity that uses the shared ProcArray is GetOldestXmin, -which must determine a lower bound for the oldest xmin of any active MVCC -snapshot, system-wide. Each individual backend advertises the smallest -xmin of its own snapshots in MyPgXact->xmin, or zero if it currently has no +Another important activity that uses the shared ProcArray is GetOldestSnapshot +which must determine a lower bound for the oldest of any active MVCC +snapshots, system-wide. Each individual backend advertises the earliest +of its own snapshots in MyPgXact->snapshotcsn, or zero if it currently has no live snapshots (eg, if it's between transactions or hasn't yet set a -snapshot for a new transaction). GetOldestXmin takes the MIN() of the -valid xmin fields. It does this with only shared lock on ProcArrayLock, -which means there is a potential race condition against other backends -doing GetSnapshotData concurrently: we must be certain that a concurrent -backend that is about to set its xmin does not compute an xmin less than -what GetOldestXmin returns. We ensure that by including all the active -XIDs into the MIN() calculation, along with the valid xmins. The rule that -transactions can't exit without taking exclusive ProcArrayLock ensures that -concurrent holders of shared ProcArrayLock will compute the same minimum of -currently-active XIDs: no xact, in particular not the oldest, can exit -while we hold shared ProcArrayLock. So GetOldestXmin's view of the minimum -active XID will be the same as that of any concurrent GetSnapshotData, and -so it can't produce an overestimate. If there is no active transaction at -all, GetOldestXmin returns latestCompletedXid + 1, which is a lower bound -for the xmin that might be computed by concurrent or later GetSnapshotData -calls. (We know that no XID less than this could be about to appear in -the ProcArray, because of the XidGenLock interlock discussed above.) - -GetSnapshotData also performs an oldest-xmin calculation (which had better -match GetOldestXmin's) and stores that into RecentGlobalXmin, which is used -for some tuple age cutoff checks where a fresh call of GetOldestXmin seems -too expensive. Note that while it is certain that two concurrent -executions of GetSnapshotData will compute the same xmin for their own -snapshots, as argued above, it is not certain that they will arrive at the -same estimate of RecentGlobalXmin. This is because we allow XID-less -transactions to clear their MyPgXact->xmin asynchronously (without taking -ProcArrayLock), so one execution might see what had been the oldest xmin, -and another not. This is OK since RecentGlobalXmin need only be a valid -lower bound. As noted above, we are already assuming that fetch/store -of the xid fields is atomic, so assuming it for xmin as well is no extra -risk. +snapshot for a new transaction). GetOldestSnapshot takes the MIN() of the +snapshots. + +For freezing tuples, vacuum needs to know the oldest XID that is still +considered running by any active transaction. That is, the oldest XID still +considered running by the oldest active snapshot, as returned by +GetOldestSnapshotCSN(). This value is somewhat expensive to calculate, so +the most recently calculated value is kept in shared memory +(SharedVariableCache->recentXmin), and is recalculated lazily by +AdvanceRecentGlobalXmin() function. AdvanceRecentGlobalXmin() first scans +the proc array, and makes note of the oldest active XID. That XID - 1 will +become the new xmin. It then waits until all currently active snapshots have +finished. Any snapshot that begins later will see the xmin as finished, so +after all the active snapshots have finished, xmin will be visible to +everyone. However, AdvanceRecentGlobalXmin() does not actually block waiting +for anything; instead it contains a state machine that advances if possible, +when AdvanceRecentGlobalXmin() is called. AdvanceRecentGlobalXmin() is +called periodically by the WAL writer, so that it doesn't get very stale. pg_xact and pg_subtrans @@ -343,21 +310,10 @@ from disk. They also allow information to be permanent across server restarts. pg_xact records the commit status for each transaction that has been assigned an XID. A transaction can be in progress, committed, aborted, or -"sub-committed". This last state means that it's a subtransaction that's no -longer running, but its parent has not updated its state yet. It is not -necessary to update a subtransaction's transaction status to subcommit, so we -can just defer it until main transaction commit. The main role of marking -transactions as sub-committed is to provide an atomic commit protocol when -transaction status is spread across multiple clog pages. As a result, whenever -transaction status spreads across multiple pages we must use a two-phase commit -protocol: the first phase is to mark the subtransactions as sub-committed, then -we mark the top level transaction and all its subtransactions committed (in -that order). Thus, subtransactions that have not aborted appear as in-progress -even when they have already finished, and the subcommit status appears as a -very short transitory state during main transaction commit. Subtransaction -abort is always marked in clog as soon as it occurs. When the transaction -status all fit in a single CLOG page, we atomically mark them all as committed -without bothering with the intermediate sub-commit state. +"committing". For committed transactions, the clog stores the commit WAL +record's LSN. This last state means that the transaction is just about to +write its commit WAL record, or just did so, but it hasn't yet updated the +clog with the record's LSN. Savepoints are implemented using subtransactions. A subtransaction is a transaction inside a transaction; its commit or abort status is not only @@ -370,7 +326,7 @@ transaction. The "subtransaction parent" (pg_subtrans) mechanism records, for each transaction with an XID, the TransactionId of its parent transaction. This information is stored as soon as the subtransaction is assigned an XID. -Top-level transactions do not have a parent, so they leave their pg_subtrans +Top-level transactions do not have a parent, so they leave their pg_csnlog entries set to the default value of zero (InvalidTransactionId). pg_subtrans is used to check whether the transaction in question is still diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index bbf9ce1a3a..c15c242c26 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -33,6 +33,7 @@ #include "postgres.h" #include "access/clog.h" +#include "access/mvccvars.h" #include "access/slru.h" #include "access/transam.h" #include "access/xlog.h" @@ -74,13 +75,6 @@ ((xid) % (TransactionId) CLOG_XACTS_PER_PAGE) / CLOG_XACTS_PER_LSN_GROUP) /* - * The number of subtransactions below which we consider to apply clog group - * update optimization. Testing reveals that the number higher than this can - * hurt performance. - */ -#define THRESHOLD_SUBTRANS_CLOG_OPT 5 - -/* * Link to shared-memory data structures for CLOG control */ static SlruCtlData ClogCtlData; @@ -93,23 +87,23 @@ static bool CLOGPagePrecedes(int page1, int page2); static void WriteZeroPageXlogRec(int pageno); static void WriteTruncateXlogRec(int pageno, TransactionId oldestXact, Oid oldestXidDb); -static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids, - TransactionId *subxids, XidStatus status, - XLogRecPtr lsn, int pageno, - bool all_xact_same_page); -static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status, - XLogRecPtr lsn, int slotno); -static void set_status_by_pages(int nsubxids, TransactionId *subxids, - XidStatus status, XLogRecPtr lsn); -static bool TransactionGroupUpdateXidStatus(TransactionId xid, - XidStatus status, XLogRecPtr lsn, int pageno); -static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, - TransactionId *subxids, XidStatus status, - XLogRecPtr lsn, int pageno); +static void CLogSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, CLogXidStatus status, + XLogRecPtr lsn, int pageno, + bool all_xacts_same_page); +static void CLogSetStatusBit(TransactionId xid, CLogXidStatus status, + XLogRecPtr lsn, int slotno); +static bool CLogGroupUpdateXidStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, CLogXidStatus status, + XLogRecPtr lsn, int pageno); +static void CLogSetPageStatusInternal(TransactionId xid, int nsubxids, + TransactionId *subxids, CLogXidStatus status, + XLogRecPtr lsn, int pageno); + /* - * TransactionIdSetTreeStatus + * CLogSetTreeStatus * * Record the final state of transaction entries in the commit log for * a transaction and its subtransaction tree. Take care to ensure this is @@ -127,30 +121,13 @@ static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, * caller guarantees the commit record is already flushed in that case. It * should be InvalidXLogRecPtr for abort cases, too. * - * In the commit case, atomicity is limited by whether all the subxids are in - * the same CLOG page as xid. If they all are, then the lock will be grabbed - * only once, and the status will be set to committed directly. Otherwise - * we must - * 1. set sub-committed all subxids that are not on the same page as the - * main xid - * 2. atomically set committed the main xid and the subxids on the same page - * 3. go over the first bunch again and set them committed - * Note that as far as concurrent checkers are concerned, main transaction - * commit as a whole is still atomic. - * - * Example: - * TransactionId t commits and has subxids t1, t2, t3, t4 - * t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3 - * 1. update pages2-3: - * page2: set t2,t3 as sub-committed - * page3: set t4 as sub-committed - * 2. update page1: - * set t1 as sub-committed, - * then set t as committed, - then set t1 as committed - * 3. update pages2-3: - * page2: set t2,t3 as committed - * page3: set t4 as committed + * The atomicity is limited by whether all the subxids are in the same CLOG + * page as xid. If they all are, then the lock will be grabbed only once, + * and the status will be set to committed directly. Otherwise there is + * a window that the parent will be seen as committed, while (some of) the + * children are still seen as in-progress. That's OK with the current use, + * as visibility checking code will not rely on the CLOG for recent + * transactions (CSNLOG will be used instead). * * NB: this is a low-level routine and is NOT the preferred entry point * for most uses; functions in transam.c are the intended callers. @@ -160,153 +137,75 @@ static void TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, * cache yet. */ void -TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, - TransactionId *subxids, XidStatus status, XLogRecPtr lsn) +CLogSetTreeStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, CLogXidStatus status, XLogRecPtr lsn) { - int pageno = TransactionIdToPage(xid); /* get page of parent */ + TransactionId topXid; + int pageno; int i; + int offset; - Assert(status == TRANSACTION_STATUS_COMMITTED || - status == TRANSACTION_STATUS_ABORTED); + Assert(status == CLOG_XID_STATUS_COMMITTED || + status == CLOG_XID_STATUS_ABORTED); /* - * See how many subxids, if any, are on the same page as the parent, if - * any. + * Update the clog page-by-page. On first iteration, we will set the + * status of the top-XID, and any subtransactions on the same page. */ - for (i = 0; i < nsubxids; i++) - { - if (TransactionIdToPage(subxids[i]) != pageno) - break; - } - - /* - * Do all items fit on a single page? - */ - if (i == nsubxids) - { - /* - * Set the parent and all subtransactions in a single call - */ - TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn, - pageno, true); - } - else - { - int nsubxids_on_first_page = i; - - /* - * If this is a commit then we care about doing this correctly (i.e. - * using the subcommitted intermediate status). By here, we know - * we're updating more than one page of clog, so we must mark entries - * that are *not* on the first page so that they show as subcommitted - * before we then return to update the status to fully committed. - * - * To avoid touching the first page twice, skip marking subcommitted - * for the subxids on that first page. - */ - if (status == TRANSACTION_STATUS_COMMITTED) - set_status_by_pages(nsubxids - nsubxids_on_first_page, - subxids + nsubxids_on_first_page, - TRANSACTION_STATUS_SUB_COMMITTED, lsn); - - /* - * Now set the parent and subtransactions on same page as the parent, - * if any - */ - pageno = TransactionIdToPage(xid); - TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status, - lsn, pageno, false); - - /* - * Now work through the rest of the subxids one clog page at a time, - * starting from the second page onwards, like we did above. - */ - set_status_by_pages(nsubxids - nsubxids_on_first_page, - subxids + nsubxids_on_first_page, - status, lsn); - } -} - -/* - * Helper for TransactionIdSetTreeStatus: set the status for a bunch of - * transactions, chunking in the separate CLOG pages involved. We never - * pass the whole transaction tree to this function, only subtransactions - * that are on different pages to the top level transaction id. - */ -static void -set_status_by_pages(int nsubxids, TransactionId *subxids, - XidStatus status, XLogRecPtr lsn) -{ - int pageno = TransactionIdToPage(subxids[0]); - int offset = 0; - int i = 0; - - Assert(nsubxids > 0); /* else the pageno fetch above is unsafe */ - - while (i < nsubxids) + pageno = TransactionIdToPage(xid); /* get page of parent */ + topXid = xid; + offset = 0; + i = 0; + for (;;) { int num_on_page = 0; - int nextpageno; - do + while (i < nsubxids && TransactionIdToPage(subxids[i]) == pageno) { - nextpageno = TransactionIdToPage(subxids[i]); - if (nextpageno != pageno) - break; num_on_page++; i++; - } while (i < nsubxids); + } + + CLogSetPageStatus(topXid, + num_on_page, subxids + offset, + status, lsn, pageno, + nsubxids == num_on_page); + + if (i == nsubxids) + break; - TransactionIdSetPageStatus(InvalidTransactionId, - num_on_page, subxids + offset, - status, lsn, pageno, false); offset = i; - pageno = nextpageno; + pageno = TransactionIdToPage(subxids[offset]); + topXid = InvalidTransactionId; } } /* - * Record the final state of transaction entries in the commit log for all - * entries on a single page. Atomic only on this page. + * Record the final state of transaction entries in the commit log for + * all entries on a single page. Atomic only on this page. + * + * Otherwise API is same as CLogSetTreeStatus() */ static void -TransactionIdSetPageStatus(TransactionId xid, int nsubxids, - TransactionId *subxids, XidStatus status, - XLogRecPtr lsn, int pageno, - bool all_xact_same_page) +CLogSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, CLogXidStatus status, + XLogRecPtr lsn, int pageno, + bool all_xact_same_page) { - /* Can't use group update when PGPROC overflows. */ - StaticAssertStmt(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS, - "group clog threshold less than PGPROC cached subxids"); - /* * When there is contention on CLogControlLock, we try to group multiple * updates; a single leader process will perform transaction status * updates for multiple backends so that the number of times * CLogControlLock needs to be acquired is reduced. * - * For this optimization to be safe, the XID in MyPgXact and the subxids - * in MyProc must be the same as the ones for which we're setting the - * status. Check that this is the case. - * * For this optimization to be efficient, we shouldn't have too many * sub-XIDs and all of the XIDs for which we're adjusting clog should be * on the same page. Check those conditions, too. */ if (all_xact_same_page && xid == MyPgXact->xid && - nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT && - nsubxids == MyPgXact->nxids && - memcmp(subxids, MyProc->subxids.xids, - nsubxids * sizeof(TransactionId)) == 0) + nsubxids <= THRESHOLD_SUBTRANS_CLOG_OPT) { /* - * We don't try to do group update optimization if a process has - * overflowed the subxids array in its PGPROC, since in that case we - * don't have a complete list of XIDs for it. - */ - Assert(THRESHOLD_SUBTRANS_CLOG_OPT <= PGPROC_MAX_CACHED_SUBXIDS); - - /* * If we can immediately acquire CLogControlLock, we update the status * of our own XID and release the lock. If not, try use group XID * update. If that doesn't work out, fall back to waiting for the @@ -315,12 +214,13 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, if (LWLockConditionalAcquire(CLogControlLock, LW_EXCLUSIVE)) { /* Got the lock without waiting! Do the update. */ - TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, - lsn, pageno); + CLogSetPageStatusInternal(xid, nsubxids, subxids, status, + lsn, pageno); LWLockRelease(CLogControlLock); return; } - else if (TransactionGroupUpdateXidStatus(xid, status, lsn, pageno)) + else if (CLogGroupUpdateXidStatus(xid, nsubxids, subxids, status, + lsn, pageno)) { /* Group update mechanism has done the work. */ return; @@ -331,8 +231,8 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, /* Group update not applicable, or couldn't accept this page number. */ LWLockAcquire(CLogControlLock, LW_EXCLUSIVE); - TransactionIdSetPageStatusInternal(xid, nsubxids, subxids, status, - lsn, pageno); + CLogSetPageStatusInternal(xid, nsubxids, subxids, status, + lsn, pageno); LWLockRelease(CLogControlLock); } @@ -342,17 +242,15 @@ TransactionIdSetPageStatus(TransactionId xid, int nsubxids, * We don't do any locking here; caller must handle that. */ static void -TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, - TransactionId *subxids, XidStatus status, - XLogRecPtr lsn, int pageno) +CLogSetPageStatusInternal(TransactionId xid, int nsubxids, + TransactionId *subxids, CLogXidStatus status, + XLogRecPtr lsn, int pageno) { int slotno; int i; - Assert(status == TRANSACTION_STATUS_COMMITTED || - status == TRANSACTION_STATUS_ABORTED || - (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid))); - Assert(LWLockHeldByMeInMode(CLogControlLock, LW_EXCLUSIVE)); + Assert(status == CLOG_XID_STATUS_COMMITTED || + status == CLOG_XID_STATUS_ABORTED); /* * If we're doing an async commit (ie, lsn is valid), then we must wait @@ -365,38 +263,15 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, */ slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid); - /* - * Set the main transaction id, if any. - * - * If we update more than one xid on this page while it is being written - * out, we might find that some of the bits go to disk and others don't. - * If we are updating commits on the page with the top-level xid that - * could break atomicity, so we subcommit the subxids first before we mark - * the top-level commit. - */ + /* Set the main transaction id, if any. */ if (TransactionIdIsValid(xid)) - { - /* Subtransactions first, if needed ... */ - if (status == TRANSACTION_STATUS_COMMITTED) - { - for (i = 0; i < nsubxids; i++) - { - Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); - TransactionIdSetStatusBit(subxids[i], - TRANSACTION_STATUS_SUB_COMMITTED, - lsn, slotno); - } - } - - /* ... then the main transaction */ - TransactionIdSetStatusBit(xid, status, lsn, slotno); - } + CLogSetStatusBit(xid, status, lsn, slotno); /* Set the subtransactions */ for (i = 0; i < nsubxids; i++) { Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); - TransactionIdSetStatusBit(subxids[i], status, lsn, slotno); + CLogSetStatusBit(subxids[i], status, lsn, slotno); } ClogCtl->shared->page_dirty[slotno] = true; @@ -417,8 +292,9 @@ TransactionIdSetPageStatusInternal(TransactionId xid, int nsubxids, * number we need to update differs from those processes already waiting. */ static bool -TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, - XLogRecPtr lsn, int pageno) +CLogGroupUpdateXidStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, CLogXidStatus status, + XLogRecPtr lsn, int pageno) { volatile PROC_HDR *procglobal = ProcGlobal; PGPROC *proc = MyProc; @@ -437,6 +313,8 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, proc->clogGroupMemberXidStatus = status; proc->clogGroupMemberPage = pageno; proc->clogGroupMemberLsn = lsn; + proc->clogGroupNSubxids = nsubxids; + memcpy(&proc->clogGroupSubxids[0], subxids, nsubxids * sizeof(TransactionId)); nextidx = pg_atomic_read_u32(&procglobal->clogGroupFirst); @@ -517,20 +395,13 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, while (nextidx != INVALID_PGPROCNO) { PGPROC *proc = &ProcGlobal->allProcs[nextidx]; - PGXACT *pgxact = &ProcGlobal->allPgXact[nextidx]; - - /* - * Overflowed transactions should not use group XID status update - * mechanism. - */ - Assert(!pgxact->overflowed); - TransactionIdSetPageStatusInternal(proc->clogGroupMemberXid, - pgxact->nxids, - proc->subxids.xids, - proc->clogGroupMemberXidStatus, - proc->clogGroupMemberLsn, - proc->clogGroupMemberPage); + CLogSetPageStatusInternal(proc->clogGroupMemberXid, + proc->clogGroupNSubxids, + proc->clogGroupSubxids, + proc->clogGroupMemberXidStatus, + proc->clogGroupMemberLsn, + proc->clogGroupMemberPage); /* Move to next proc in list. */ nextidx = pg_atomic_read_u32(&proc->clogGroupNext); @@ -569,7 +440,7 @@ TransactionGroupUpdateXidStatus(TransactionId xid, XidStatus status, * Must be called with CLogControlLock held */ static void -TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno) +CLogSetStatusBit(TransactionId xid, CLogXidStatus status, XLogRecPtr lsn, int slotno) { int byteno = TransactionIdToByte(xid); int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; @@ -581,22 +452,12 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i curval = (*byteptr >> bshift) & CLOG_XACT_BITMASK; /* - * When replaying transactions during recovery we still need to perform - * the two phases of subcommit and then commit. However, some transactions - * are already correctly marked, so we just treat those as a no-op which - * allows us to keep the following Assert as restrictive as possible. - */ - if (InRecovery && status == TRANSACTION_STATUS_SUB_COMMITTED && - curval == TRANSACTION_STATUS_COMMITTED) - return; - - /* * Current state change should be from 0 or subcommitted to target state * or we should already be there when replaying changes during recovery. */ Assert(curval == 0 || - (curval == TRANSACTION_STATUS_SUB_COMMITTED && - status != TRANSACTION_STATUS_IN_PROGRESS) || + (curval == CLOG_XID_STATUS_SUB_COMMITTED && + status != CLOG_XID_STATUS_IN_PROGRESS) || curval == status); /* note this assumes exclusive access to the clog page */ @@ -637,8 +498,8 @@ TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, i * NB: this is a low-level routine and is NOT the preferred entry point * for most uses; TransactionLogFetch() in transam.c is the intended caller. */ -XidStatus -TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) +CLogXidStatus +CLogGetStatus(TransactionId xid, XLogRecPtr *lsn) { int pageno = TransactionIdToPage(xid); int byteno = TransactionIdToByte(xid); @@ -646,7 +507,7 @@ TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn) int slotno; int lsnindex; char *byteptr; - XidStatus status; + CLogXidStatus status; /* lock is acquired by SimpleLruReadPage_ReadOnly */ diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c index 7b7bf2b2bf..1668b00507 100644 --- a/src/backend/access/transam/commit_ts.c +++ b/src/backend/access/transam/commit_ts.c @@ -26,6 +26,7 @@ #include "access/commit_ts.h" #include "access/htup_details.h" +#include "access/mvccvars.h" #include "access/slru.h" #include "access/transam.h" #include "catalog/pg_type.h" diff --git a/src/backend/access/transam/csnlog.c b/src/backend/access/transam/csnlog.c new file mode 100644 index 0000000000..4d3139593a --- /dev/null +++ b/src/backend/access/transam/csnlog.c @@ -0,0 +1,766 @@ +/*------------------------------------------------------------------------- + * + * csnlog.c + * Tracking Commit-Sequence-Numbers and in-progress subtransactions + * + * The pg_csnlog manager is a pg_clog-like manager that stores the commit + * sequence number, or parent transaction Id, for each transaction. It is + * a fundamental part of MVCC. + * + * The csnlog serves two purposes: + * + * 1. While a transaction is in progress, it stores the parent transaction + * Id for each in-progress subtransaction. A main transaction has a parent + * of InvalidTransactionId, and each subtransaction has its immediate + * parent. The tree can easily be walked from child to parent, but not in + * the opposite direction. + * + * 2. After a transaction has committed, it stores the Commit Sequence + * Number of the commit. + * + * We can use the same structure for both, because we don't care about the + * parent-child relationships subtransaction after commit. + * + * This code is based on clog.c, but the robustness requirements + * are completely different from pg_clog, because we only need to remember + * pg_csnlog information for currently-open and recently committed + * transactions. Thus, there is no need to preserve data over a crash and + * restart. + * + * There are no XLOG interactions since we do not care about preserving + * data across crashes. During database startup, we simply force the + * currently-active page of CSNLOG to zeroes. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/access/transam/csnlog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/csnlog.h" +#include "access/mvccvars.h" +#include "access/slru.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "utils/snapmgr.h" + +/* + * Defines for CSNLOG page sizes. A page is the same BLCKSZ as is used + * everywhere else in Postgres. + * + * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, + * CSNLOG page numbering also wraps around at 0xFFFFFFFF/CSNLOG_XACTS_PER_PAGE, + * and CSNLOG segment numbering at + * 0xFFFFFFFF/CLOG_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no + * explicit notice of that fact in this module, except when comparing segment + * and page numbers in TruncateCSNLOG (see CSNLOGPagePrecedes). + */ + +/* We store the commit LSN for each xid */ +#define CSNLOG_XACTS_PER_PAGE (BLCKSZ / sizeof(CommitSeqNo)) + +#define TransactionIdToPage(xid) ((xid) / (TransactionId) CSNLOG_XACTS_PER_PAGE) +#define TransactionIdToPgIndex(xid) ((xid) % (TransactionId) CSNLOG_XACTS_PER_PAGE) + +/* We allocate new log pages in batches */ +#define BATCH_SIZE 128 + +/* + * Link to shared-memory data structures for CLOG control + */ +static SlruCtlData CsnlogCtlData; + +#define CsnlogCtl (&CsnlogCtlData) + + +static int ZeroCSNLOGPage(int pageno); +static bool CSNLOGPagePrecedes(int page1, int page2); +static void CSNLogSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, + CommitSeqNo csn, int pageno); +static void CSNLogSetCSN(TransactionId xid, CommitSeqNo csn, int slotno); +static CommitSeqNo InternalGetCommitSeqNo(TransactionId xid); +static CommitSeqNo RecursiveGetCommitSeqNo(TransactionId xid); + +/* + * CSNLogSetCommitSeqNo + * + * Record the status and CSN of transaction entries in the commit log for a + * transaction and its subtransaction tree. Take care to ensure this is + * efficient, and as atomic as possible. + * + * xid is a single xid to set status for. This will typically be the + * top level transactionid for a top level commit or abort. It can + * also be a subtransaction when we record transaction aborts. + * + * subxids is an array of xids of length nsubxids, representing subtransactions + * in the tree of xid. In various cases nsubxids may be zero. + * + * csn is the commit sequence number of the transaction. It should be + * InvalidCommitSeqNo for abort cases. + * + * Note: This doesn't guarantee atomicity. The caller can use the + * COMMITSEQNO_COMMITTING special value for that. + */ +void +CSNLogSetCommitSeqNo(TransactionId xid, int nsubxids, + TransactionId *subxids, CommitSeqNo csn) +{ + int nextSubxid; + int topPage; + TransactionId topXid; + TransactionId oldestActiveXid = pg_atomic_read_u32( + &ShmemVariableCache->oldestActiveXid); + + Assert(!TransactionIdIsNormal(xid) + || TransactionIdPrecedesOrEquals(oldestActiveXid, xid)); + + if (csn == InvalidCommitSeqNo || xid == BootstrapTransactionId) + { + if (IsBootstrapProcessingMode()) + csn = COMMITSEQNO_FROZEN; + else + elog(ERROR, "cannot mark transaction committed without CSN"); + } + + /* + * We set the status of child transaction before the status of parent + * transactions, so that another process can correctly determine the + * resulting status of a child transaction. See RecursiveGetCommitSeqNo(). + */ + topXid = InvalidTransactionId; + topPage = TransactionIdToPage(xid); + nextSubxid = nsubxids - 1; + do + { + int currentPage = topPage; + int subxidsOnPage = 0; + for (; nextSubxid >= 0; nextSubxid--) + { + int subxidPage = TransactionIdToPage(subxids[nextSubxid]); + + if (subxidsOnPage == 0) + currentPage = subxidPage; + + if (currentPage != subxidPage) + break; + + subxidsOnPage++; + } + + if (currentPage == topPage) + { + Assert(topXid == InvalidTransactionId); + topXid = xid; + } + + CSNLogSetPageStatus(topXid, subxidsOnPage, subxids + nextSubxid + 1, + csn, currentPage); + } + while (nextSubxid >= 0); + + if (topXid == InvalidTransactionId) + { + /* + * No subxids were on the same page as the main xid; we have to update + * it separately + */ + CSNLogSetPageStatus(xid, 0, NULL, csn, topPage); + } +} + +/* + * Record the final state of transaction entries in the csn log for + * all entries on a single page. Atomic only on this page. + * + * Otherwise API is same as TransactionIdSetTreeStatus() + */ +static void +CSNLogSetPageStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, + CommitSeqNo csn, int pageno) +{ + int slotno; + int i; + + LWLockAcquire(CSNLogControlLock, LW_SHARED); + + slotno = SimpleLruReadPage_ReadOnly_Locked(CsnlogCtl, pageno, xid); + + /* + * We set the status of child transaction before the status of parent + * transactions, so that another process can correctly determine the + * resulting status of a child transaction. See RecursiveGetCommitSeqNo(). + */ + for (i = nsubxids - 1; i >= 0; i--) + { + Assert(CsnlogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i])); + CSNLogSetCSN(subxids[i], csn, slotno); + pg_write_barrier(); + } + + if (TransactionIdIsValid(xid)) + CSNLogSetCSN(xid, csn, slotno); + + CsnlogCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(CSNLogControlLock); +} + + + +/* + * Record the parent of a subtransaction in the subtrans log. + * + * In some cases we may need to overwrite an existing value. + */ +void +SubTransSetParent(TransactionId xid, TransactionId parent) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToPgIndex(xid); + int slotno; + CommitSeqNo *ptr; + CommitSeqNo newcsn; + + Assert(TransactionIdIsValid(parent)); + Assert(TransactionIdFollows(xid, parent)); + + newcsn = CSN_SUBTRANS_BIT | (uint64) parent; + + /* + * Shared page access is enough to set the subtransaction parent. + * It is set when the subtransaction is assigned an xid, + * and can be read only later, after the subtransaction have modified + * some tuples. + */ + slotno = SimpleLruReadPage_ReadOnly(CsnlogCtl, pageno, xid); + ptr = (CommitSeqNo *) CsnlogCtl->shared->page_buffer[slotno]; + ptr += entryno; + + /* + * It's possible we'll try to set the parent xid multiple times but we + * shouldn't ever be changing the xid from one valid xid to another valid + * xid, which would corrupt the data structure. + */ + if (*ptr != newcsn) + { + Assert(*ptr == COMMITSEQNO_INPROGRESS); + *ptr = newcsn; + CsnlogCtl->shared->page_dirty[slotno] = true; + } + + + LWLockRelease(CSNLogControlLock); +} + +/* + * Interrogate the parent of a transaction in the csnlog. + */ +TransactionId +SubTransGetParent(TransactionId xid) +{ + CommitSeqNo csn; + + LWLockAcquire(CSNLogControlLock, LW_SHARED); + + csn = InternalGetCommitSeqNo(xid); + + LWLockRelease(CSNLogControlLock); + + if (COMMITSEQNO_IS_SUBTRANS(csn)) + return (TransactionId) (csn & 0xFFFFFFFF); + else + return InvalidTransactionId; +} + +/* + * SubTransGetTopmostTransaction + * + * Returns the topmost transaction of the given transaction id. + * + * Because we cannot look back further than TransactionXmin, it is possible + * that this function will lie and return an intermediate subtransaction ID + * instead of the true topmost parent ID. This is OK, because in practice + * we only care about detecting whether the topmost parent is still running + * or is part of a current snapshot's list of still-running transactions. + * Therefore, any XID before TransactionXmin is as good as any other. + */ +TransactionId +SubTransGetTopmostTransaction(TransactionId xid) +{ + TransactionId parentXid = xid, + previousXid = xid; + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + while (TransactionIdIsValid(parentXid)) + { + previousXid = parentXid; + if (TransactionIdPrecedes(parentXid, TransactionXmin)) + break; + parentXid = SubTransGetParent(parentXid); + + /* + * By convention the parent xid gets allocated first, so should always + * precede the child xid. Anything else points to a corrupted data + * structure that could lead to an infinite loop, so exit. + */ + if (!TransactionIdPrecedes(parentXid, previousXid)) + elog(ERROR, "pg_csnlog contains invalid entry: xid %u points to parent xid %u", + previousXid, parentXid); + } + + Assert(TransactionIdIsValid(previousXid)); + + return previousXid; +} + +/* + * Sets the commit status of a single transaction. + * + * Must be called with CSNLogControlLock held + */ +static void +CSNLogSetCSN(TransactionId xid, CommitSeqNo csn, int slotno) +{ + int entryno = TransactionIdToPgIndex(xid); + CommitSeqNo *ptr; + + ptr = (CommitSeqNo *) (CsnlogCtl->shared->page_buffer[slotno] + entryno * sizeof(XLogRecPtr)); + + /* + * Current state change should be from 0 to target state. (Allow setting + * it again to same value.) + */ + Assert(COMMITSEQNO_IS_INPROGRESS(*ptr) || + COMMITSEQNO_IS_COMMITTING(*ptr) || + COMMITSEQNO_IS_SUBTRANS(*ptr) || + *ptr == csn); + + *ptr = csn; +} + +/* + * Interrogate the state of a transaction in the commit log. + * + * Aside from the actual commit status, this function returns (into *lsn) + * an LSN that is late enough to be able to guarantee that if we flush up to + * that LSN then we will have flushed the transaction's commit record to disk. + * The result is not necessarily the exact LSN of the transaction's commit + * record! For example, for long-past transactions (those whose clog pages + * already migrated to disk), we'll return InvalidXLogRecPtr. Also, because + * we group transactions on the same clog page to conserve storage, we might + * return the LSN of a later transaction that falls into the same group. + * + * NB: this is a low-level routine and is NOT the preferred entry point + * for most uses; TransactionIdGetCommitSeqNo() in transam.c is the intended caller. + */ +CommitSeqNo +CSNLogGetCommitSeqNo(TransactionId xid) +{ + CommitSeqNo csn; + + LWLockAcquire(CSNLogControlLock, LW_SHARED); + + csn = RecursiveGetCommitSeqNo(xid); + + LWLockRelease(CSNLogControlLock); + + return csn; +} + +/* Determine the CSN of a transaction, walking the subtransaction tree if needed */ +static CommitSeqNo +RecursiveGetCommitSeqNo(TransactionId xid) +{ + CommitSeqNo csn; + + csn = InternalGetCommitSeqNo(xid); + + if (COMMITSEQNO_IS_SUBTRANS(csn)) + { + TransactionId parentXid = csn & ~CSN_SUBTRANS_BIT; + CommitSeqNo parentCsn = RecursiveGetCommitSeqNo(parentXid); + + Assert(!COMMITSEQNO_IS_SUBTRANS(parentCsn)); + + /* + * The parent and child transaction status update is not atomic. We + * must take care not to use the updated parent status with the old + * child status, or else we can wrongly see a committed subtransaction + * as aborted. This happens when the parent is already marked as + * committed and the child is not yet marked. + */ + pg_read_barrier(); + + csn = InternalGetCommitSeqNo(xid); + + if (COMMITSEQNO_IS_SUBTRANS(csn)) + { + if (COMMITSEQNO_IS_ABORTED(parentCsn) + || COMMITSEQNO_IS_COMMITTED(parentCsn)) + { + csn = COMMITSEQNO_ABORTED; + } + else if (COMMITSEQNO_IS_INPROGRESS(parentCsn)) + csn = COMMITSEQNO_INPROGRESS; + else if (COMMITSEQNO_IS_COMMITTING(parentCsn)) + csn = COMMITSEQNO_COMMITTING; + else + Assert(false); + } + } + + return csn; +} + +/* + * Get the raw CSN value. + */ +static CommitSeqNo +InternalGetCommitSeqNo(TransactionId xid) +{ + int pageno = TransactionIdToPage(xid); + int entryno = TransactionIdToPgIndex(xid); + int slotno; + + /* Can't ask about stuff that might not be around anymore */ + Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); + + if (!TransactionIdIsNormal(xid)) + { + if (xid == InvalidTransactionId) + return COMMITSEQNO_ABORTED; + if (xid == FrozenTransactionId || xid == BootstrapTransactionId) + return COMMITSEQNO_FROZEN; + } + + slotno = SimpleLruReadPage_ReadOnly_Locked(CsnlogCtl, pageno, xid); + return *(CommitSeqNo *) (CsnlogCtl->shared->page_buffer[slotno] + + entryno * sizeof(XLogRecPtr)); +} + +/* + * Find the next xid that is in progress. + * We do not care about the subtransactions, they are accounted for + * by their respective top-level transactions. + */ +TransactionId +CSNLogGetNextActiveXid(TransactionId xid, + TransactionId end) +{ + Assert(TransactionIdIsValid(TransactionXmin)); + + LWLockAcquire(CSNLogControlLock, LW_SHARED); + + for (;;) + { + int pageno; + int slotno; + int entryno; + + if (!TransactionIdPrecedes(xid, end)) + goto end; + + pageno = TransactionIdToPage(xid); + slotno = SimpleLruReadPage_ReadOnly_Locked(CsnlogCtl, pageno, xid); + + for (entryno = TransactionIdToPgIndex(xid); entryno < CSNLOG_XACTS_PER_PAGE; + entryno++) + { + CommitSeqNo csn; + + if (!TransactionIdPrecedes(xid, end)) + goto end; + + csn = *(XLogRecPtr *) (CsnlogCtl->shared->page_buffer[slotno] + entryno * sizeof(XLogRecPtr)); + + if (COMMITSEQNO_IS_INPROGRESS(csn) + || COMMITSEQNO_IS_COMMITTING(csn)) + { + goto end; + } + + TransactionIdAdvance(xid); + } + } + +end: + LWLockRelease(CSNLogControlLock); + + return xid; +} + +/* + * Number of shared CSNLOG buffers. + */ +Size +CSNLOGShmemBuffers(void) +{ + return Min(128, Max(BATCH_SIZE, NBuffers / 512)); +} + +/* + * Initialization of shared memory for CSNLOG + */ +Size +CSNLOGShmemSize(void) +{ + return SimpleLruShmemSize(CSNLOGShmemBuffers(), 0); +} + +void +CSNLOGShmemInit(void) +{ + CsnlogCtl->PagePrecedes = CSNLOGPagePrecedes; + SimpleLruInit(CsnlogCtl, "CSNLOG Ctl", CSNLOGShmemBuffers(), 0, + CSNLogControlLock, "pg_csnlog", LWTRANCHE_CSNLOG_BUFFERS); +} + +/* + * This func must be called ONCE on system install. It creates + * the initial CSNLOG segment. (The pg_csnlog directory is assumed to + * have been created by initdb, and CSNLOGShmemInit must have been + * called already.) + */ +void +BootStrapCSNLOG(void) +{ + int slotno; + + LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE); + + /* Create and zero the first page of the commit log */ + slotno = ZeroCSNLOGPage(0); + + /* Make sure it's written out */ + SimpleLruWritePage(CsnlogCtl, slotno); + Assert(!CsnlogCtl->shared->page_dirty[slotno]); + + LWLockRelease(CSNLogControlLock); +} + + +/* + * Initialize (or reinitialize) a page of CLOG to zeroes. + * If writeXlog is TRUE, also emit an XLOG record saying we did this. + * + * The page is not actually written, just set up in shared memory. + * The slot number of the new page is returned. + * + * Control lock must be held at entry, and will be held at exit. + */ +static int +ZeroCSNLOGPage(int pageno) +{ + return SimpleLruZeroPage(CsnlogCtl, pageno); +} + +/* + * This must be called ONCE during postmaster or standalone-backend startup, + * after StartupXLOG has initialized ShmemVariableCache->nextXid. + * + * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid + * if there are none. + */ +void +StartupCSNLOG(TransactionId oldestActiveXID) +{ + int startPage; + int endPage; + + /* + * Since we don't expect pg_csnlog to be valid across crashes, we + * initialize the currently-active page(s) to zeroes during startup. + * Whenever we advance into a new page, ExtendCSNLOG will likewise zero + * the new page without regard to whatever was previously on disk. + */ + LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE); + + startPage = TransactionIdToPage(oldestActiveXID); + endPage = TransactionIdToPage(ShmemVariableCache->nextXid); + endPage = ((endPage + BATCH_SIZE - 1) / BATCH_SIZE) * BATCH_SIZE; + + while (startPage != endPage) + { + (void) ZeroCSNLOGPage(startPage); + startPage++; + /* must account for wraparound */ + if (startPage > TransactionIdToPage(MaxTransactionId)) + startPage = 0; + } + (void) ZeroCSNLOGPage(startPage); + + LWLockRelease(CSNLogControlLock); +} + +/* + * This must be called ONCE during postmaster or standalone-backend shutdown + */ +void +ShutdownCSNLOG(void) +{ + /* + * Flush dirty CLOG pages to disk + * + * This is not actually necessary from a correctness point of view. We do + * it merely as a debugging aid. + */ + TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(false); + SimpleLruFlush(CsnlogCtl, false); + TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(false); +} + +/* + * This must be called ONCE at the end of startup/recovery. + */ +void +TrimCSNLOG(void) +{ + TransactionId xid = ShmemVariableCache->nextXid; + int pageno = TransactionIdToPage(xid); + + LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE); + + /* + * Re-Initialize our idea of the latest page number. + */ + CsnlogCtl->shared->latest_page_number = pageno; + + /* + * Zero out the remainder of the current clog page. Under normal + * circumstances it should be zeroes already, but it seems at least + * theoretically possible that XLOG replay will have settled on a nextXID + * value that is less than the last XID actually used and marked by the + * previous database lifecycle (since subtransaction commit writes clog + * but makes no WAL entry). Let's just be safe. (We need not worry about + * pages beyond the current one, since those will be zeroed when first + * used. For the same reason, there is no need to do anything when + * nextXid is exactly at a page boundary; and it's likely that the + * "current" page doesn't exist yet in that case.) + */ + if (TransactionIdToPgIndex(xid) != 0) + { + int entryno = TransactionIdToPgIndex(xid); + int byteno = entryno * sizeof(XLogRecPtr); + int slotno; + char *byteptr; + + slotno = SimpleLruReadPage(CsnlogCtl, pageno, false, xid); + + byteptr = CsnlogCtl->shared->page_buffer[slotno] + byteno; + + /* Zero the rest of the page */ + MemSet(byteptr, 0, BLCKSZ - byteno); + + CsnlogCtl->shared->page_dirty[slotno] = true; + } + + LWLockRelease(CSNLogControlLock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + */ +void +CheckPointCSNLOG(void) +{ + /* + * Flush dirty CLOG pages to disk + * + * This is not actually necessary from a correctness point of view. We do + * it merely to improve the odds that writing of dirty pages is done by + * the checkpoint process and not by backends. + */ + TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_START(true); + SimpleLruFlush(CsnlogCtl, true); + TRACE_POSTGRESQL_CSNLOG_CHECKPOINT_DONE(true); +} + + +/* + * Make sure that CSNLOG has room for a newly-allocated XID. + * + * NB: this is called while holding XidGenLock. We want it to be very fast + * most of the time; even when it's not so fast, no actual I/O need happen + * unless we're forced to write out a dirty clog or xlog page to make room + * in shared memory. + */ +void +ExtendCSNLOG(TransactionId newestXact) +{ + int i; + int pageno; + + /* + * No work except at first XID of a page. But beware: just after + * wraparound, the first XID of page zero is FirstNormalTransactionId. + */ + if (TransactionIdToPgIndex(newestXact) != 0 && + !TransactionIdEquals(newestXact, FirstNormalTransactionId)) + return; + + pageno = TransactionIdToPage(newestXact); + + if (pageno % BATCH_SIZE) + return; + LWLockAcquire(CSNLogControlLock, LW_EXCLUSIVE); + + /* Zero the page and make an XLOG entry about it */ + for (i = pageno; i < pageno + BATCH_SIZE; i++) + ZeroCSNLOGPage(i); + + LWLockRelease(CSNLogControlLock); +} + + +/* + * Remove all CSNLOG segments before the one holding the passed transaction ID + * + * This is normally called during checkpoint, with oldestXact being the + * oldest TransactionXmin of any running transaction. + */ +void +TruncateCSNLOG(TransactionId oldestXact) +{ + int cutoffPage; + + /* + * The cutoff point is the start of the segment containing oldestXact. We + * pass the *page* containing oldestXact to SimpleLruTruncate. + */ + cutoffPage = TransactionIdToPage(oldestXact); + + SimpleLruTruncate(CsnlogCtl, cutoffPage); +} + + +/* + * Decide which of two CLOG page numbers is "older" for truncation purposes. + * + * We need to use comparison of TransactionIds here in order to do the right + * thing with wraparound XID arithmetic. However, if we are asked about + * page number zero, we don't want to hand InvalidTransactionId to + * TransactionIdPrecedes: it'll get weird about permanent xact IDs. So, + * offset both xids by FirstNormalTransactionId to avoid that. + */ +static bool +CSNLOGPagePrecedes(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * CSNLOG_XACTS_PER_PAGE; + xid1 += FirstNormalTransactionId; + xid2 = ((TransactionId) page2) * CSNLOG_XACTS_PER_PAGE; + xid2 += FirstNormalTransactionId; + + return TransactionIdPrecedes(xid1, xid2); +} diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 0fb6bf2f02..5c38da7eda 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -69,6 +69,7 @@ #include "postgres.h" #include "access/multixact.h" +#include "access/mvccvars.h" #include "access/slru.h" #include "access/transam.h" #include "access/twophase.h" @@ -513,9 +514,11 @@ MultiXactIdExpand(MultiXactId multi, TransactionId xid, MultiXactStatus status) for (i = 0, j = 0; i < nmembers; i++) { - if (TransactionIdIsInProgress(members[i].xid) || + TransactionIdStatus xidstatus = TransactionIdGetStatus(members[i].xid); + + if (xidstatus == XID_INPROGRESS || (ISUPDATE_from_mxstatus(members[i].status) && - TransactionIdDidCommit(members[i].xid))) + xidstatus == XID_COMMITTED)) { newMembers[j].xid = members[i].xid; newMembers[j++].status = members[i].status; @@ -590,7 +593,7 @@ MultiXactIdIsRunning(MultiXactId multi, bool isLockOnly) */ for (i = 0; i < nmembers; i++) { - if (TransactionIdIsInProgress(members[i].xid)) + if (TransactionIdGetStatus(members[i].xid) == XID_INPROGRESS) { debug_elog4(DEBUG2, "IsRunning: member %d (%u) is running", i, members[i].xid); diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 94b6e6612a..960944dc0f 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -57,6 +57,7 @@ #include "pgstat.h" #include "storage/fd.h" #include "storage/shmem.h" +#include "utils/hsearch.h" #include "miscadmin.h" @@ -81,6 +82,13 @@ typedef struct SlruFlushData typedef struct SlruFlushData *SlruFlush; +/* An entry of page-to-slot hash map */ +typedef struct PageSlotEntry +{ + int page; + int slot; +} PageSlotEntry; + /* * Macro to mark a buffer slot "most recently used". Note multiple evaluation * of arguments! @@ -166,11 +174,24 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, LWLock *ctllock, const char *subdir, int tranche_id) { SlruShared shared; + char *hashName; + HTAB *htab; bool found; + HASHCTL info; shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(nslots, nlsns), &found); + hashName = psprintf("%s_hash", name); + + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(((PageSlotEntry*)0)->page); + info.entrysize = sizeof(PageSlotEntry); + + htab = ShmemInitHash(hashName, nslots, nslots, &info, + HASH_ELEM | HASH_BLOBS | HASH_FIXED_SIZE); + + pfree(hashName); if (!IsUnderPostmaster) { @@ -247,6 +268,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns, * assume caller set PagePrecedes. */ ctl->shared = shared; + ctl->pageToSlot = htab; ctl->do_fsync = true; /* default behavior */ StrNCpy(ctl->Dir, subdir, sizeof(ctl->Dir)); } @@ -264,6 +286,7 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno) { SlruShared shared = ctl->shared; int slotno; + PageSlotEntry *entry = NULL; /* Find a suitable buffer slot for the page */ slotno = SlruSelectLRUPage(ctl, pageno); @@ -273,7 +296,16 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno) shared->page_number[slotno] == pageno); /* Mark the slot as containing this page */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + { + int oldpageno = shared->page_number[slotno]; + entry = hash_search(ctl->pageToSlot, &oldpageno, HASH_REMOVE, NULL); + Assert(entry != NULL); + } + shared->page_number[slotno] = pageno; + entry = hash_search(ctl->pageToSlot, &pageno, HASH_ENTER, NULL); + entry->slot = slotno; shared->page_status[slotno] = SLRU_PAGE_VALID; shared->page_dirty[slotno] = true; SlruRecentlyUsed(shared, slotno); @@ -343,8 +375,14 @@ SimpleLruWaitIO(SlruCtl ctl, int slotno) { /* indeed, the I/O must have failed */ if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS) + { + int oldpageno = shared->page_number[slotno]; + PageSlotEntry *entry = hash_search(ctl->pageToSlot, &oldpageno, HASH_REMOVE, NULL); + + Assert(entry != NULL); shared->page_status[slotno] = SLRU_PAGE_EMPTY; - else /* write_in_progress */ + } + else /* write_in_progress */ { shared->page_status[slotno] = SLRU_PAGE_VALID; shared->page_dirty[slotno] = true; @@ -382,6 +420,7 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, { int slotno; bool ok; + PageSlotEntry *entry; /* See if page already is in memory; if not, pick victim slot */ slotno = SlruSelectLRUPage(ctl, pageno); @@ -413,7 +452,16 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, !shared->page_dirty[slotno])); /* Mark the slot read-busy */ + if (shared->page_status[slotno] == SLRU_PAGE_VALID) + { + int oldpageno = shared->page_number[slotno]; + PageSlotEntry *entry = hash_search(ctl->pageToSlot, &oldpageno, HASH_REMOVE, NULL); + Assert(entry != NULL); + } + shared->page_number[slotno] = pageno; + entry = hash_search(ctl->pageToSlot, &pageno, HASH_ENTER, NULL); + entry->slot = slotno; shared->page_status[slotno] = SLRU_PAGE_READ_IN_PROGRESS; shared->page_dirty[slotno] = false; @@ -436,7 +484,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS && !shared->page_dirty[slotno]); - shared->page_status[slotno] = ok ? SLRU_PAGE_VALID : SLRU_PAGE_EMPTY; + if (ok) + shared->page_status[slotno] = SLRU_PAGE_VALID; + else + { + PageSlotEntry *entry = hash_search(ctl->pageToSlot, &pageno, HASH_REMOVE, NULL); + Assert(entry != NULL); + shared->page_status[slotno] = SLRU_PAGE_EMPTY; + } LWLockRelease(&shared->buffer_locks[slotno].lock); @@ -450,9 +505,13 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, } /* + * !!! FIXME: rename to SimpleLruReadPage_Shared + * * Find a page in a shared buffer, reading it in if necessary. * The page number must correspond to an already-initialized page. - * The caller must intend only read-only access to the page. + * The caller can dirty the page holding the shared lock, but it + * becomes their responsibility to synchronize the access to the + * page data. * * The passed-in xid is used only for error reporting, and may be * InvalidTransactionId if no specific xid is associated with the action. @@ -467,19 +526,22 @@ int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) { SlruShared shared = ctl->shared; - int slotno; + PageSlotEntry *entry = NULL; + int slotno; /* Try to find the page while holding only shared lock */ LWLockAcquire(shared->ControlLock, LW_SHARED); /* See if page is already in a buffer */ - for (slotno = 0; slotno < shared->num_slots; slotno++) + entry = hash_search(ctl->pageToSlot, &pageno, HASH_FIND, NULL); + if (entry != NULL) { - if (shared->page_number[slotno] == pageno && - shared->page_status[slotno] != SLRU_PAGE_EMPTY && - shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) + slotno = entry->slot; + Assert(shared->page_status[slotno] != SLRU_PAGE_EMPTY); + if (shared->page_status[slotno] != SLRU_PAGE_EMPTY + && shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) { - /* See comments for SlruRecentlyUsed macro */ + Assert(shared->page_number[slotno] == pageno); SlruRecentlyUsed(shared, slotno); return slotno; } @@ -493,6 +555,44 @@ SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid) } /* + * Same as SimpleLruReadPage_ReadOnly, but the shared lock must be held by the caller + * and will be held at exit. + */ +int +SimpleLruReadPage_ReadOnly_Locked(SlruCtl ctl, int pageno, TransactionId xid) +{ + SlruShared shared = ctl->shared; + int slotno; + PageSlotEntry *entry; + + Assert(LWLockHeldByMe(shared->ControlLock)); + + for (;;) + { + /* See if page is already in a buffer */ + entry = hash_search(ctl->pageToSlot, &pageno, HASH_FIND, NULL); + if (entry != NULL) + { + slotno = entry->slot; + if (shared->page_status[slotno] != SLRU_PAGE_EMPTY + && shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) + { + Assert(shared->page_number[slotno] == pageno); + SlruRecentlyUsed(shared, slotno); + return slotno; + } + } + + /* No luck, so switch to normal exclusive lock and do regular read */ + LWLockRelease(shared->ControlLock); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); + SimpleLruReadPage(ctl, pageno, true, xid); + LWLockRelease(shared->ControlLock); + LWLockAcquire(shared->ControlLock, LW_SHARED); + } +} + +/* * Write a page from a shared buffer, if necessary. * Does nothing if the specified slot is not dirty. * @@ -975,9 +1075,9 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) int bestvalidslot = 0; /* keep compiler quiet */ int best_valid_delta = -1; int best_valid_page_number = 0; /* keep compiler quiet */ - int bestinvalidslot = 0; /* keep compiler quiet */ + int bestinvalidslot = 0; /* keep compiler quiet */ int best_invalid_delta = -1; - int best_invalid_page_number = 0; /* keep compiler quiet */ + int best_invalid_page_number = 0; /* keep compiler quiet */ /* See if page already has a buffer assigned */ for (slotno = 0; slotno < shared->num_slots; slotno++) @@ -1213,6 +1313,9 @@ restart:; if (shared->page_status[slotno] == SLRU_PAGE_VALID && !shared->page_dirty[slotno]) { + int oldpageno = shared->page_number[slotno]; + PageSlotEntry *entry = hash_search(ctl->pageToSlot, &oldpageno, HASH_REMOVE, NULL); + Assert(entry != NULL); shared->page_status[slotno] = SLRU_PAGE_EMPTY; continue; } @@ -1284,6 +1387,9 @@ restart: if (shared->page_status[slotno] == SLRU_PAGE_VALID && !shared->page_dirty[slotno]) { + int oldpageno = shared->page_number[slotno]; + PageSlotEntry *entry = hash_search(ctl->pageToSlot, &oldpageno, HASH_REMOVE, NULL); + Assert(entry != NULL); shared->page_status[slotno] = SLRU_PAGE_EMPTY; continue; } diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c deleted file mode 100644 index f640661130..0000000000 --- a/src/backend/access/transam/subtrans.c +++ /dev/null @@ -1,394 +0,0 @@ -/*------------------------------------------------------------------------- - * - * subtrans.c - * PostgreSQL subtransaction-log manager - * - * The pg_subtrans manager is a pg_xact-like manager that stores the parent - * transaction Id for each transaction. It is a fundamental part of the - * nested transactions implementation. A main transaction has a parent - * of InvalidTransactionId, and each subtransaction has its immediate parent. - * The tree can easily be walked from child to parent, but not in the - * opposite direction. - * - * This code is based on xact.c, but the robustness requirements - * are completely different from pg_xact, because we only need to remember - * pg_subtrans information for currently-open transactions. Thus, there is - * no need to preserve data over a crash and restart. - * - * There are no XLOG interactions since we do not care about preserving - * data across crashes. During database startup, we simply force the - * currently-active page of SUBTRANS to zeroes. - * - * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * src/backend/access/transam/subtrans.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/slru.h" -#include "access/subtrans.h" -#include "access/transam.h" -#include "pg_trace.h" -#include "utils/snapmgr.h" - - -/* - * Defines for SubTrans page sizes. A page is the same BLCKSZ as is used - * everywhere else in Postgres. - * - * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF, - * SubTrans page numbering also wraps around at - * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need take no - * explicit notice of that fact in this module, except when comparing segment - * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes) and zeroing - * them in StartupSUBTRANS. - */ - -/* We need four bytes per xact */ -#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId)) - -#define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE) -#define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE) - - -/* - * Link to shared-memory data structures for SUBTRANS control - */ -static SlruCtlData SubTransCtlData; - -#define SubTransCtl (&SubTransCtlData) - - -static int ZeroSUBTRANSPage(int pageno); -static bool SubTransPagePrecedes(int page1, int page2); - - -/* - * Record the parent of a subtransaction in the subtrans log. - */ -void -SubTransSetParent(TransactionId xid, TransactionId parent) -{ - int pageno = TransactionIdToPage(xid); - int entryno = TransactionIdToEntry(xid); - int slotno; - TransactionId *ptr; - - Assert(TransactionIdIsValid(parent)); - Assert(TransactionIdFollows(xid, parent)); - - LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); - - slotno = SimpleLruReadPage(SubTransCtl, pageno, true, xid); - ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; - ptr += entryno; - - /* - * It's possible we'll try to set the parent xid multiple times but we - * shouldn't ever be changing the xid from one valid xid to another valid - * xid, which would corrupt the data structure. - */ - if (*ptr != parent) - { - Assert(*ptr == InvalidTransactionId); - *ptr = parent; - SubTransCtl->shared->page_dirty[slotno] = true; - } - - LWLockRelease(SubtransControlLock); -} - -/* - * Interrogate the parent of a transaction in the subtrans log. - */ -TransactionId -SubTransGetParent(TransactionId xid) -{ - int pageno = TransactionIdToPage(xid); - int entryno = TransactionIdToEntry(xid); - int slotno; - TransactionId *ptr; - TransactionId parent; - - /* Can't ask about stuff that might not be around anymore */ - Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); - - /* Bootstrap and frozen XIDs have no parent */ - if (!TransactionIdIsNormal(xid)) - return InvalidTransactionId; - - /* lock is acquired by SimpleLruReadPage_ReadOnly */ - - slotno = SimpleLruReadPage_ReadOnly(SubTransCtl, pageno, xid); - ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno]; - ptr += entryno; - - parent = *ptr; - - LWLockRelease(SubtransControlLock); - - return parent; -} - -/* - * SubTransGetTopmostTransaction - * - * Returns the topmost transaction of the given transaction id. - * - * Because we cannot look back further than TransactionXmin, it is possible - * that this function will lie and return an intermediate subtransaction ID - * instead of the true topmost parent ID. This is OK, because in practice - * we only care about detecting whether the topmost parent is still running - * or is part of a current snapshot's list of still-running transactions. - * Therefore, any XID before TransactionXmin is as good as any other. - */ -TransactionId -SubTransGetTopmostTransaction(TransactionId xid) -{ - TransactionId parentXid = xid, - previousXid = xid; - - /* Can't ask about stuff that might not be around anymore */ - Assert(TransactionIdFollowsOrEquals(xid, TransactionXmin)); - - while (TransactionIdIsValid(parentXid)) - { - previousXid = parentXid; - if (TransactionIdPrecedes(parentXid, TransactionXmin)) - break; - parentXid = SubTransGetParent(parentXid); - - /* - * By convention the parent xid gets allocated first, so should always - * precede the child xid. Anything else points to a corrupted data - * structure that could lead to an infinite loop, so exit. - */ - if (!TransactionIdPrecedes(parentXid, previousXid)) - elog(ERROR, "pg_subtrans contains invalid entry: xid %u points to parent xid %u", - previousXid, parentXid); - } - - Assert(TransactionIdIsValid(previousXid)); - - return previousXid; -} - - -/* - * Initialization of shared memory for SUBTRANS - */ -Size -SUBTRANSShmemSize(void) -{ - return SimpleLruShmemSize(NUM_SUBTRANS_BUFFERS, 0); -} - -void -SUBTRANSShmemInit(void) -{ - SubTransCtl->PagePrecedes = SubTransPagePrecedes; - SimpleLruInit(SubTransCtl, "subtrans", NUM_SUBTRANS_BUFFERS, 0, - SubtransControlLock, "pg_subtrans", - LWTRANCHE_SUBTRANS_BUFFERS); - /* Override default assumption that writes should be fsync'd */ - SubTransCtl->do_fsync = false; -} - -/* - * This func must be called ONCE on system install. It creates - * the initial SUBTRANS segment. (The SUBTRANS directory is assumed to - * have been created by the initdb shell script, and SUBTRANSShmemInit - * must have been called already.) - * - * Note: it's not really necessary to create the initial segment now, - * since slru.c would create it on first write anyway. But we may as well - * do it to be sure the directory is set up correctly. - */ -void -BootStrapSUBTRANS(void) -{ - int slotno; - - LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); - - /* Create and zero the first page of the subtrans log */ - slotno = ZeroSUBTRANSPage(0); - - /* Make sure it's written out */ - SimpleLruWritePage(SubTransCtl, slotno); - Assert(!SubTransCtl->shared->page_dirty[slotno]); - - LWLockRelease(SubtransControlLock); -} - -/* - * Initialize (or reinitialize) a page of SUBTRANS to zeroes. - * - * The page is not actually written, just set up in shared memory. - * The slot number of the new page is returned. - * - * Control lock must be held at entry, and will be held at exit. - */ -static int -ZeroSUBTRANSPage(int pageno) -{ - return SimpleLruZeroPage(SubTransCtl, pageno); -} - -/* - * This must be called ONCE during postmaster or standalone-backend startup, - * after StartupXLOG has initialized ShmemVariableCache->nextXid. - * - * oldestActiveXID is the oldest XID of any prepared transaction, or nextXid - * if there are none. - */ -void -StartupSUBTRANS(TransactionId oldestActiveXID) -{ - int startPage; - int endPage; - - /* - * Since we don't expect pg_subtrans to be valid across crashes, we - * initialize the currently-active page(s) to zeroes during startup. - * Whenever we advance into a new page, ExtendSUBTRANS will likewise zero - * the new page without regard to whatever was previously on disk. - */ - LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); - - startPage = TransactionIdToPage(oldestActiveXID); - endPage = TransactionIdToPage(ShmemVariableCache->nextXid); - - while (startPage != endPage) - { - (void) ZeroSUBTRANSPage(startPage); - startPage++; - /* must account for wraparound */ - if (startPage > TransactionIdToPage(MaxTransactionId)) - startPage = 0; - } - (void) ZeroSUBTRANSPage(startPage); - - LWLockRelease(SubtransControlLock); -} - -/* - * This must be called ONCE during postmaster or standalone-backend shutdown - */ -void -ShutdownSUBTRANS(void) -{ - /* - * Flush dirty SUBTRANS pages to disk - * - * This is not actually necessary from a correctness point of view. We do - * it merely as a debugging aid. - */ - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(false); - SimpleLruFlush(SubTransCtl, false); - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(false); -} - -/* - * Perform a checkpoint --- either during shutdown, or on-the-fly - */ -void -CheckPointSUBTRANS(void) -{ - /* - * Flush dirty SUBTRANS pages to disk - * - * This is not actually necessary from a correctness point of view. We do - * it merely to improve the odds that writing of dirty pages is done by - * the checkpoint process and not by backends. - */ - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_START(true); - SimpleLruFlush(SubTransCtl, true); - TRACE_POSTGRESQL_SUBTRANS_CHECKPOINT_DONE(true); -} - - -/* - * Make sure that SUBTRANS has room for a newly-allocated XID. - * - * NB: this is called while holding XidGenLock. We want it to be very fast - * most of the time; even when it's not so fast, no actual I/O need happen - * unless we're forced to write out a dirty subtrans page to make room - * in shared memory. - */ -void -ExtendSUBTRANS(TransactionId newestXact) -{ - int pageno; - - /* - * No work except at first XID of a page. But beware: just after - * wraparound, the first XID of page zero is FirstNormalTransactionId. - */ - if (TransactionIdToEntry(newestXact) != 0 && - !TransactionIdEquals(newestXact, FirstNormalTransactionId)) - return; - - pageno = TransactionIdToPage(newestXact); - - LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE); - - /* Zero the page */ - ZeroSUBTRANSPage(pageno); - - LWLockRelease(SubtransControlLock); -} - - -/* - * Remove all SUBTRANS segments before the one holding the passed transaction ID - * - * This is normally called during checkpoint, with oldestXact being the - * oldest TransactionXmin of any running transaction. - */ -void -TruncateSUBTRANS(TransactionId oldestXact) -{ - int cutoffPage; - - /* - * The cutoff point is the start of the segment containing oldestXact. We - * pass the *page* containing oldestXact to SimpleLruTruncate. We step - * back one transaction to avoid passing a cutoff page that hasn't been - * created yet in the rare case that oldestXact would be the first item on - * a page and oldestXact == next XID. In that case, if we didn't subtract - * one, we'd trigger SimpleLruTruncate's wraparound detection. - */ - TransactionIdRetreat(oldestXact); - cutoffPage = TransactionIdToPage(oldestXact); - - SimpleLruTruncate(SubTransCtl, cutoffPage); -} - - -/* - * Decide which of two SUBTRANS page numbers is "older" for truncation purposes. - * - * We need to use comparison of TransactionIds here in order to do the right - * thing with wraparound XID arithmetic. However, if we are asked about - * page number zero, we don't want to hand InvalidTransactionId to - * TransactionIdPrecedes: it'll get weird about permanent xact IDs. So, - * offset both xids by FirstNormalTransactionId to avoid that. - */ -static bool -SubTransPagePrecedes(int page1, int page2) -{ - TransactionId xid1; - TransactionId xid2; - - xid1 = ((TransactionId) page1) * SUBTRANS_XACTS_PER_PAGE; - xid1 += FirstNormalTransactionId; - xid2 = ((TransactionId) page2) * SUBTRANS_XACTS_PER_PAGE; - xid2 += FirstNormalTransactionId; - - return TransactionIdPrecedes(xid1, xid2); -} diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c index 968b232364..e2dd957693 100644 --- a/src/backend/access/transam/transam.c +++ b/src/backend/access/transam/transam.c @@ -3,6 +3,15 @@ * transam.c * postgres transaction (commit) log interface routines * + * This module contains high level functions for managing the status + * of transactions. It sits on top of two lower level structures: the + * CLOG, and the CSNLOG. The CLOG is a permanent on-disk structure that + * tracks the committed/aborted status for each transaction ID. The CSNLOG + * tracks *when* each transaction ID committed (or aborted). The CSNLOG + * is used when checking the status of recent transactions that might still + * be in-progress, and it is reset at server startup. The CLOG is used for + * older transactions that are known to have completed (or crashed). + * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * @@ -10,56 +19,49 @@ * IDENTIFICATION * src/backend/access/transam/transam.c * - * NOTES - * This file contains the high level access-method interface to the - * transaction system. - * *------------------------------------------------------------------------- */ #include "postgres.h" #include "access/clog.h" +#include "access/csnlog.h" +#include "access/mvccvars.h" #include "access/subtrans.h" #include "access/transam.h" +#include "storage/lmgr.h" #include "utils/snapmgr.h" /* - * Single-item cache for results of TransactionLogFetch. It's worth having + * Single-item cache for results of TransactionIdGetCommitSeqNo. It's worth + * having * such a cache because we frequently find ourselves repeatedly checking the * same XID, for example when scanning a table just after a bulk insert, * update, or delete. */ static TransactionId cachedFetchXid = InvalidTransactionId; -static XidStatus cachedFetchXidStatus; -static XLogRecPtr cachedCommitLSN; +static CommitSeqNo cachedCSN; -/* Local functions */ -static XidStatus TransactionLogFetch(TransactionId transactionId); - - -/* ---------------------------------------------------------------- - * Postgres log access method interface - * - * TransactionLogFetch - * ---------------------------------------------------------------- +/* + * Also have a (separate) cache for CLogGetCommitLSN() */ +static TransactionId cachedLSNFetchXid = InvalidTransactionId; +static XLogRecPtr cachedCommitLSN; /* - * TransactionLogFetch --- fetch commit status of specified transaction id + * TransactionIdGetCommitSeqNo --- fetch CSN of specified transaction id */ -static XidStatus -TransactionLogFetch(TransactionId transactionId) +CommitSeqNo +TransactionIdGetCommitSeqNo(TransactionId transactionId) { - XidStatus xidstatus; - XLogRecPtr xidlsn; + CommitSeqNo csn; /* * Before going to the commit log manager, check our single item cache to * see if we didn't just check the transaction status a moment ago. */ if (TransactionIdEquals(transactionId, cachedFetchXid)) - return cachedFetchXidStatus; + return cachedCSN; /* * Also, check to see if the transaction ID is a permanent one. @@ -67,53 +69,63 @@ TransactionLogFetch(TransactionId transactionId) if (!TransactionIdIsNormal(transactionId)) { if (TransactionIdEquals(transactionId, BootstrapTransactionId)) - return TRANSACTION_STATUS_COMMITTED; + return COMMITSEQNO_FROZEN; if (TransactionIdEquals(transactionId, FrozenTransactionId)) - return TRANSACTION_STATUS_COMMITTED; - return TRANSACTION_STATUS_ABORTED; + return COMMITSEQNO_FROZEN; + return COMMITSEQNO_ABORTED; } /* - * Get the transaction status. + * If the XID is older than TransactionXmin, check the clog. Otherwise + * check the csnlog. */ - xidstatus = TransactionIdGetStatus(transactionId, &xidlsn); + Assert(TransactionIdIsValid(TransactionXmin)); + if (TransactionIdPrecedes(transactionId, TransactionXmin)) + { + XLogRecPtr lsn; + + if (CLogGetStatus(transactionId, &lsn) == CLOG_XID_STATUS_COMMITTED) + csn = COMMITSEQNO_FROZEN; + else + csn = COMMITSEQNO_ABORTED; + } + else + { + csn = CSNLogGetCommitSeqNo(transactionId); + + if (csn == COMMITSEQNO_COMMITTING) + { + /* + * If the transaction is committing at this very instant, and + * hasn't set its CSN yet, wait for it to finish doing so. + * + * XXX: Alternatively, we could wait on the heavy-weight lock on + * the XID. that'd make TransactionIdCommitTree() slightly + * cheaper, as it wouldn't need to acquire CommitSeqNoLock (even + * in shared mode). + */ + LWLockAcquire(CommitSeqNoLock, LW_EXCLUSIVE); + LWLockRelease(CommitSeqNoLock); + + csn = CSNLogGetCommitSeqNo(transactionId); + Assert(csn != COMMITSEQNO_COMMITTING); + } + } /* - * Cache it, but DO NOT cache status for unfinished or sub-committed - * transactions! We only cache status that is guaranteed not to change. + * Cache it, but DO NOT cache status for unfinished transactions! + * We only cache status that is guaranteed not to change. */ - if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS && - xidstatus != TRANSACTION_STATUS_SUB_COMMITTED) + if (COMMITSEQNO_IS_COMMITTED(csn) || + COMMITSEQNO_IS_ABORTED(csn)) { cachedFetchXid = transactionId; - cachedFetchXidStatus = xidstatus; - cachedCommitLSN = xidlsn; + cachedCSN = csn; } - return xidstatus; + return csn; } -/* ---------------------------------------------------------------- - * Interface functions - * - * TransactionIdDidCommit - * TransactionIdDidAbort - * ======== - * these functions test the transaction status of - * a specified transaction id. - * - * TransactionIdCommitTree - * TransactionIdAsyncCommitTree - * TransactionIdAbortTree - * ======== - * these functions set the transaction status of the specified - * transaction tree. - * - * See also TransactionIdIsInProgress, which once was in this module - * but now lives in procarray.c. - * ---------------------------------------------------------------- - */ - /* * TransactionIdDidCommit * True iff transaction associated with the identifier did commit. @@ -124,50 +136,14 @@ TransactionLogFetch(TransactionId transactionId) bool /* true if given transaction committed */ TransactionIdDidCommit(TransactionId transactionId) { - XidStatus xidstatus; + CommitSeqNo csn; - xidstatus = TransactionLogFetch(transactionId); + csn = TransactionIdGetCommitSeqNo(transactionId); - /* - * If it's marked committed, it's committed. - */ - if (xidstatus == TRANSACTION_STATUS_COMMITTED) + if (COMMITSEQNO_IS_COMMITTED(csn)) return true; - - /* - * If it's marked subcommitted, we have to check the parent recursively. - * However, if it's older than TransactionXmin, we can't look at - * pg_subtrans; instead assume that the parent crashed without cleaning up - * its children. - * - * Originally we Assert'ed that the result of SubTransGetParent was not - * zero. However with the introduction of prepared transactions, there can - * be a window just after database startup where we do not have complete - * knowledge in pg_subtrans of the transactions after TransactionXmin. - * StartupSUBTRANS() has ensured that any missing information will be - * zeroed. Since this case should not happen under normal conditions, it - * seems reasonable to emit a WARNING for it. - */ - if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) - { - TransactionId parentXid; - - if (TransactionIdPrecedes(transactionId, TransactionXmin)) - return false; - parentXid = SubTransGetParent(transactionId); - if (!TransactionIdIsValid(parentXid)) - { - elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", - transactionId); - return false; - } - return TransactionIdDidCommit(parentXid); - } - - /* - * It's not committed. - */ - return false; + else + return false; } /* @@ -180,70 +156,35 @@ TransactionIdDidCommit(TransactionId transactionId) bool /* true if given transaction aborted */ TransactionIdDidAbort(TransactionId transactionId) { - XidStatus xidstatus; + CommitSeqNo csn; - xidstatus = TransactionLogFetch(transactionId); + csn = TransactionIdGetCommitSeqNo(transactionId); - /* - * If it's marked aborted, it's aborted. - */ - if (xidstatus == TRANSACTION_STATUS_ABORTED) + if (COMMITSEQNO_IS_ABORTED(csn)) return true; - - /* - * If it's marked subcommitted, we have to check the parent recursively. - * However, if it's older than TransactionXmin, we can't look at - * pg_subtrans; instead assume that the parent crashed without cleaning up - * its children. - */ - if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED) - { - TransactionId parentXid; - - if (TransactionIdPrecedes(transactionId, TransactionXmin)) - return true; - parentXid = SubTransGetParent(transactionId); - if (!TransactionIdIsValid(parentXid)) - { - /* see notes in TransactionIdDidCommit */ - elog(WARNING, "no pg_subtrans entry for subcommitted XID %u", - transactionId); - return true; - } - return TransactionIdDidAbort(parentXid); - } - - /* - * It's not aborted. - */ - return false; + else + return false; } /* - * TransactionIdIsKnownCompleted - * True iff transaction associated with the identifier is currently - * known to have either committed or aborted. + * Returns the status of the tranaction. * - * This does NOT look into pg_xact but merely probes our local cache - * (and so it's not named TransactionIdDidComplete, which would be the - * appropriate name for a function that worked that way). The intended - * use is just to short-circuit TransactionIdIsInProgress calls when doing - * repeated tqual.c checks for the same XID. If this isn't extremely fast - * then it will be counterproductive. - * - * Note: - * Assumes transaction identifier is valid. + * Note that this treats a a crashed transaction as still in-progress, + * until it falls off the xmin horizon. */ -bool -TransactionIdIsKnownCompleted(TransactionId transactionId) +TransactionIdStatus +TransactionIdGetStatus(TransactionId xid) { - if (TransactionIdEquals(transactionId, cachedFetchXid)) - { - /* If it's in the cache at all, it must be completed. */ - return true; - } + CommitSeqNo csn; + + csn = TransactionIdGetCommitSeqNo(xid); - return false; + if (COMMITSEQNO_IS_COMMITTED(csn)) + return XID_COMMITTED; + else if (COMMITSEQNO_IS_ABORTED(csn)) + return XID_ABORTED; + else + return XID_INPROGRESS; } /* @@ -252,28 +193,82 @@ TransactionIdIsKnownCompleted(TransactionId transactionId) * * "xid" is a toplevel transaction commit, and the xids array contains its * committed subtransactions. - * - * This commit operation is not guaranteed to be atomic, but if not, subxids - * are correctly marked subcommit first. */ void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids) { - TransactionIdSetTreeStatus(xid, nxids, xids, - TRANSACTION_STATUS_COMMITTED, - InvalidXLogRecPtr); + TransactionIdAsyncCommitTree(xid, nxids, xids, InvalidXLogRecPtr); } /* * TransactionIdAsyncCommitTree - * Same as above, but for async commits. The commit record LSN is needed. + * Same as above, but for async commits. + * + * "xid" is a toplevel transaction commit, and the xids array contains its + * committed subtransactions. */ void TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, XLogRecPtr lsn) { - TransactionIdSetTreeStatus(xid, nxids, xids, - TRANSACTION_STATUS_COMMITTED, lsn); + CommitSeqNo csn; + TransactionId latestXid; + TransactionId currentLatestCompletedXid; + + latestXid = TransactionIdLatest(xid, nxids, xids); + /* + * First update the clog, then CSN log. + * oldestActiveXid advances based on CSN log content (see + * AdvanceOldestActiveXid), and it should not become greater than + * our xid before we set the clog status. + * Otherwise other transactions could see us as aborted for some time + * after we have written to CSN log, and somebody advanced the oldest + * active xid past our xid, but before we write to clog. + */ + CLogSetTreeStatus(xid, nxids, xids, + CLOG_XID_STATUS_COMMITTED, + lsn); + + /* + * Grab the CommitSeqNoLock, in shared mode. This is only used to + * provide a way for a concurrent transaction to wait for us to + * complete (see TransactionIdGetCommitSeqNo()). + * + * XXX: We could reduce the time the lock is held, by only setting + * the CSN on the top-XID while holding the lock, and updating the + * sub-XIDs later. But it doesn't matter much, because we're only + * holding it in shared mode, and it's rare for it to be acquired + * in exclusive mode. + */ + LWLockAcquire(CommitSeqNoLock, LW_SHARED); + + /* + * First update latestCompletedXid to cover this xid. We do this before + * assigning a CSN, so that if someone acquires a new snapshot at the same + * time, the xmax it computes is sure to cover our XID. + */ + currentLatestCompletedXid = pg_atomic_read_u32(&ShmemVariableCache->latestCompletedXid); + while (TransactionIdFollows(latestXid, currentLatestCompletedXid)) + { + if (pg_atomic_compare_exchange_u32(&ShmemVariableCache->latestCompletedXid, + ¤tLatestCompletedXid, + latestXid)) + break; + } + + /* + * Mark our top transaction id as commit-in-progress. + */ + CSNLogSetCommitSeqNo(xid, 0, NULL, COMMITSEQNO_COMMITTING); + + /* Get our CSN and increment */ + csn = pg_atomic_fetch_add_u64(&ShmemVariableCache->nextCommitSeqNo, 1); + Assert(csn >= COMMITSEQNO_FIRST_NORMAL); + + /* Stamp this XID (and sub-XIDs) with the CSN */ + CSNLogSetCommitSeqNo(xid, nxids, xids, csn); + + LWLockRelease(CommitSeqNoLock); } /* @@ -289,8 +284,23 @@ TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, void TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids) { - TransactionIdSetTreeStatus(xid, nxids, xids, - TRANSACTION_STATUS_ABORTED, InvalidXLogRecPtr); + TransactionId latestXid; + TransactionId currentLatestCompletedXid; + + latestXid = TransactionIdLatest(xid, nxids, xids); + + currentLatestCompletedXid = pg_atomic_read_u32(&ShmemVariableCache->latestCompletedXid); + while (TransactionIdFollows(latestXid, currentLatestCompletedXid)) + { + if (pg_atomic_compare_exchange_u32(&ShmemVariableCache->latestCompletedXid, + ¤tLatestCompletedXid, + latestXid)) + break; + } + + CSNLogSetCommitSeqNo(xid, nxids, xids, COMMITSEQNO_ABORTED); + CLogSetTreeStatus(xid, nxids, xids, + CLOG_XID_STATUS_ABORTED, InvalidXLogRecPtr); } /* @@ -409,7 +419,7 @@ TransactionIdGetCommitLSN(TransactionId xid) * checking TransactionLogFetch's cache will usually succeed and avoid an * extra trip to shared memory. */ - if (TransactionIdEquals(xid, cachedFetchXid)) + if (TransactionIdEquals(xid, cachedLSNFetchXid)) return cachedCommitLSN; /* Special XIDs are always known committed */ @@ -419,7 +429,10 @@ TransactionIdGetCommitLSN(TransactionId xid) /* * Get the transaction status. */ - (void) TransactionIdGetStatus(xid, &result); + (void) CLogGetStatus(xid, &result); + + cachedLSNFetchXid = xid; + cachedCommitLSN = result; return result; } diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c index b715152e8d..2de3a943ec 100644 --- a/src/backend/access/transam/twophase.c +++ b/src/backend/access/transam/twophase.c @@ -22,7 +22,7 @@ * transaction in prepared state with the same GID. * * A global transaction (gxact) also has dummy PGXACT and PGPROC; this is - * what keeps the XID considered running by TransactionIdIsInProgress. + * what keeps the XID considered running by the functions in procarray.c. * It is also convenient as a PGPROC to hook the gxact's locks to. * * Information to recover prepared transactions in case of crash is @@ -78,6 +78,7 @@ #include "access/commit_ts.h" #include "access/htup_details.h" +#include "access/mvccvars.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/twophase.h" @@ -467,6 +468,7 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->lxid = (LocalTransactionId) xid; pgxact->xid = xid; pgxact->xmin = InvalidTransactionId; + pgxact->snapshotcsn = InvalidCommitSeqNo; pgxact->delayChkpt = false; pgxact->vacuumFlags = 0; proc->pid = 0; @@ -480,9 +482,6 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, proc->waitProcLock = NULL; for (i = 0; i < NUM_LOCK_PARTITIONS; i++) SHMQueueInit(&(proc->myProcLocks[i])); - /* subxid data must be filled later by GXactLoadSubxactData */ - pgxact->overflowed = false; - pgxact->nxids = 0; gxact->prepared_at = prepared_at; gxact->xid = xid; @@ -500,34 +499,6 @@ MarkAsPreparingGuts(GlobalTransaction gxact, TransactionId xid, const char *gid, } /* - * GXactLoadSubxactData - * - * If the transaction being persisted had any subtransactions, this must - * be called before MarkAsPrepared() to load information into the dummy - * PGPROC. - */ -static void -GXactLoadSubxactData(GlobalTransaction gxact, int nsubxacts, - TransactionId *children) -{ - PGPROC *proc = &ProcGlobal->allProcs[gxact->pgprocno]; - PGXACT *pgxact = &ProcGlobal->allPgXact[gxact->pgprocno]; - - /* We need no extra lock since the GXACT isn't valid yet */ - if (nsubxacts > PGPROC_MAX_CACHED_SUBXIDS) - { - pgxact->overflowed = true; - nsubxacts = PGPROC_MAX_CACHED_SUBXIDS; - } - if (nsubxacts > 0) - { - memcpy(proc->subxids.xids, children, - nsubxacts * sizeof(TransactionId)); - pgxact->nxids = nsubxacts; - } -} - -/* * MarkAsPrepared * Mark the GXACT as fully valid, and enter it into the global ProcArray. * @@ -545,7 +516,7 @@ MarkAsPrepared(GlobalTransaction gxact, bool lock_held) LWLockRelease(TwoPhaseStateLock); /* - * Put it into the global ProcArray so TransactionIdIsInProgress considers + * Put it into the global ProcArray so GetOldestActiveTransactionId() considers * the XID as still running. */ ProcArrayAdd(&ProcGlobal->allProcs[gxact->pgprocno]); @@ -1036,8 +1007,6 @@ StartPrepare(GlobalTransaction gxact) if (hdr.nsubxacts > 0) { save_state_data(children, hdr.nsubxacts * sizeof(TransactionId)); - /* While we have the child-xact data, stuff it in the gxact too */ - GXactLoadSubxactData(gxact, hdr.nsubxacts, children); } if (hdr.ncommitrels > 0) { @@ -1123,7 +1092,7 @@ EndPrepare(GlobalTransaction gxact) * NB: a side effect of this is to make a dummy ProcArray entry for the * prepared XID. This must happen before we clear the XID from MyPgXact, * else there is a window where the XID is not running according to - * TransactionIdIsInProgress, and onlookers would be entitled to assume + * GetOldestActiveTransactionId, and onlookers would be entitled to assume * the xact crashed. Instead we have a window where the same XID appears * twice in ProcArray, which is OK. */ @@ -1374,7 +1343,6 @@ FinishPreparedTransaction(const char *gid, bool isCommit) char *buf; char *bufptr; TwoPhaseFileHeader *hdr; - TransactionId latestXid; TransactionId *children; RelFileNode *commitrels; RelFileNode *abortrels; @@ -1419,14 +1387,11 @@ FinishPreparedTransaction(const char *gid, bool isCommit) invalmsgs = (SharedInvalidationMessage *) bufptr; bufptr += MAXALIGN(hdr->ninvalmsgs * sizeof(SharedInvalidationMessage)); - /* compute latestXid among all children */ - latestXid = TransactionIdLatest(xid, hdr->nsubxacts, children); - /* * The order of operations here is critical: make the XLOG entry for * commit or abort, then mark the transaction committed or aborted in * pg_xact, then remove its PGPROC from the global ProcArray (which means - * TransactionIdIsInProgress will stop saying the prepared xact is in + * GetOldestActiveTransactionId() will stop saying the prepared xact is in * progress), then run the post-commit or post-abort callbacks. The * callbacks will release the locks the transaction held. */ @@ -1441,7 +1406,7 @@ FinishPreparedTransaction(const char *gid, bool isCommit) hdr->nsubxacts, children, hdr->nabortrels, abortrels); - ProcArrayRemove(proc, latestXid); + ProcArrayRemove(proc); /* * In case we fail while running the callbacks, mark the gxact invalid so @@ -1926,17 +1891,17 @@ RecoverPreparedTransactions(void) xid = gxact->xid; /* - * Reconstruct subtrans state for the transaction --- needed because - * pg_subtrans is not preserved over a restart. Note that we are - * linking all the subtransactions directly to the top-level XID; - * there may originally have been a more complex hierarchy, but - * there's no need to restore that exactly. It's possible that - * SubTransSetParent has been set before, if the prepared transaction - * generated xid assignment records. + * Reconstruct subtrans state for the transaction --- needed + * because pg_csnlog is not preserved over a restart. Note that + * we are linking all the subtransactions directly to the + * top-level XID; there may originally have been a more complex + * hierarchy, but there's no need to restore that exactly. + * It's possible that SubTransSetParent has been set before, if + * the prepared transaction generated xid assignment records. */ buf = ProcessTwoPhaseBuffer(xid, - gxact->prepare_start_lsn, - gxact->ondisk, true, false); + gxact->prepare_start_lsn, + gxact->ondisk, true, false); if (buf == NULL) continue; @@ -1965,7 +1930,6 @@ RecoverPreparedTransactions(void) /* recovered, so reset the flag for entries generated by redo */ gxact->inredo = false; - GXactLoadSubxactData(gxact, hdr->nsubxacts, subxids); MarkAsPrepared(gxact, true); LWLockRelease(TwoPhaseStateLock); @@ -2026,7 +1990,7 @@ ProcessTwoPhaseBuffer(TransactionId xid, Assert(prepare_start_lsn != InvalidXLogRecPtr); /* Already processed? */ - if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + if (TransactionIdGetStatus(xid) != XID_INPROGRESS) { if (fromdisk) { @@ -2225,7 +2189,7 @@ RecordTransactionCommitPrepared(TransactionId xid, /* Flush XLOG to disk */ XLogFlush(recptr); - /* Mark the transaction committed in pg_xact */ + /* Mark the transaction committed in pg_xact and pg_csnlog */ TransactionIdCommitTree(xid, nchildren, children); /* Checkpoint can proceed now */ @@ -2263,7 +2227,7 @@ RecordTransactionAbortPrepared(TransactionId xid, * Catch the scenario where we aborted partway through * RecordTransactionCommitPrepared ... */ - if (TransactionIdDidCommit(xid)) + if (TransactionIdGetStatus(xid) == XID_COMMITTED) elog(PANIC, "cannot abort transaction %u, it was already committed", xid); diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c index 702c8c957f..f7ce30273c 100644 --- a/src/backend/access/transam/varsup.c +++ b/src/backend/access/transam/varsup.c @@ -15,6 +15,8 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/csnlog.h" +#include "access/mvccvars.h" #include "access/subtrans.h" #include "access/transam.h" #include "access/xact.h" @@ -169,8 +171,8 @@ GetNewTransactionId(bool isSubXact) * Extend pg_subtrans and pg_commit_ts too. */ ExtendCLOG(xid); + ExtendCSNLOG(xid); ExtendCommitTs(xid); - ExtendSUBTRANS(xid); /* * Now advance the nextXid counter. This must not happen until after we @@ -200,17 +202,8 @@ GetNewTransactionId(bool isSubXact) * A solution to the atomic-store problem would be to give each PGXACT its * own spinlock used only for fetching/storing that PGXACT's xid and * related fields. - * - * If there's no room to fit a subtransaction XID into PGPROC, set the - * cache-overflowed flag instead. This forces readers to look in - * pg_subtrans to map subtransaction XIDs up to top-level XIDs. There is a - * race-condition window, in that the new XID will not appear as running - * until its parent link has been placed into pg_subtrans. However, that - * will happen before anyone could possibly have a reason to inquire about - * the status of the XID, so it seems OK. (Snapshots taken during this - * window *will* include the parent XID, so they will deliver the correct - * answer later on when someone does have a reason to inquire.) */ + if (!isSubXact) { /* * Use volatile pointer to prevent code rearrangement; other backends @@ -219,23 +212,9 @@ GetNewTransactionId(bool isSubXact) * nxids before filling the array entry. Note we are assuming that * TransactionId and int fetch/store are atomic. */ - volatile PGPROC *myproc = MyProc; volatile PGXACT *mypgxact = MyPgXact; - if (!isSubXact) - mypgxact->xid = xid; - else - { - int nxids = mypgxact->nxids; - - if (nxids < PGPROC_MAX_CACHED_SUBXIDS) - { - myproc->subxids.xids[nxids] = xid; - mypgxact->nxids = nxids + 1; - } - else - mypgxact->overflowed = true; - } + mypgxact->xid = xid; } LWLockRelease(XidGenLock); diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index c06fabca10..efb8e5fefe 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -20,8 +20,10 @@ #include #include +#include "access/clog.h" #include "access/commit_ts.h" #include "access/multixact.h" +#include "access/mvccvars.h" #include "access/parallel.h" #include "access/subtrans.h" #include "access/transam.h" @@ -185,11 +187,10 @@ typedef struct TransactionStateData int maxChildXids; /* allocated size of childXids[] */ Oid prevUser; /* previous CurrentUserId setting */ int prevSecContext; /* previous SecurityRestrictionContext */ - bool prevXactReadOnly; /* entry-time xact r/o state */ - bool startedInRecovery; /* did we start in recovery? */ - bool didLogXid; /* has xid been included in WAL record? */ - int parallelModeLevel; /* Enter/ExitParallelMode counter */ - struct TransactionStateData *parent; /* back link to parent */ + bool prevXactReadOnly; /* entry-time xact r/o state */ + bool startedInRecovery; /* did we start in recovery? */ + int parallelModeLevel; /* Enter/ExitParallelMode counter */ + struct TransactionStateData *parent; /* back link to parent */ } TransactionStateData; typedef TransactionStateData *TransactionState; @@ -218,18 +219,10 @@ static TransactionStateData TopTransactionStateData = { 0, /* previous SecurityRestrictionContext */ false, /* entry-time xact r/o state */ false, /* startedInRecovery */ - false, /* didLogXid */ 0, /* parallelMode */ NULL /* link to parent state block */ }; -/* - * unreportedXids holds XIDs of all subtransactions that have not yet been - * reported in an XLOG_XACT_ASSIGNMENT record. - */ -static int nUnreportedXids; -static TransactionId unreportedXids[PGPROC_MAX_CACHED_SUBXIDS]; - static TransactionState CurrentTransactionState = &TopTransactionStateData; /* @@ -313,7 +306,7 @@ static void CleanupTransaction(void); static void CheckTransactionChain(bool isTopLevel, bool throwError, const char *stmtType); static void CommitTransaction(void); -static TransactionId RecordTransactionAbort(bool isSubXact); +static void RecordTransactionAbort(bool isSubXact); static void StartTransaction(void); static void StartSubTransaction(void); @@ -438,19 +431,6 @@ GetCurrentTransactionIdIfAny(void) } /* - * MarkCurrentTransactionIdLoggedIfAny - * - * Remember that the current xid - if it is assigned - now has been wal logged. - */ -void -MarkCurrentTransactionIdLoggedIfAny(void) -{ - if (TransactionIdIsValid(CurrentTransactionState->transactionId)) - CurrentTransactionState->didLogXid = true; -} - - -/* * GetStableLatestTransactionId * * Get the transaction's XID if it has one, else read the next-to-be-assigned @@ -491,7 +471,6 @@ AssignTransactionId(TransactionState s) { bool isSubXact = (s->parent != NULL); ResourceOwner currentOwner; - bool log_unknown_top = false; /* Assert that caller didn't screw up */ Assert(!TransactionIdIsValid(s->transactionId)); @@ -542,18 +521,14 @@ AssignTransactionId(TransactionState s) * superfluously log something. That can happen when an xid is included * somewhere inside a wal record, but not in XLogRecord->xl_xid, like in * xl_standby_locks. + * + * FIXME: didLogXid and the whole xact_assignment stuff is no more. We + * no longer need it for subtransactions. Do we still need it for this + * logical stuff? */ - if (isSubXact && XLogLogicalInfoActive() && - !TopTransactionStateData.didLogXid) - log_unknown_top = true; /* * Generate a new Xid and record it in PG_PROC and pg_subtrans. - * - * NB: we must make the subtrans entry BEFORE the Xid appears anywhere in - * shared storage other than PG_PROC; because if there's no room for it in - * PG_PROC, the subtrans entry is needed to ensure that other backends see - * the Xid as "running". See GetNewTransactionId. */ s->transactionId = GetNewTransactionId(isSubXact); if (!isSubXact) @@ -580,59 +555,6 @@ AssignTransactionId(TransactionState s) XactLockTableInsert(s->transactionId); CurrentResourceOwner = currentOwner; - - /* - * Every PGPROC_MAX_CACHED_SUBXIDS assigned transaction ids within each - * top-level transaction we issue a WAL record for the assignment. We - * include the top-level xid and all the subxids that have not yet been - * reported using XLOG_XACT_ASSIGNMENT records. - * - * This is required to limit the amount of shared memory required in a hot - * standby server to keep track of in-progress XIDs. See notes for - * RecordKnownAssignedTransactionIds(). - * - * We don't keep track of the immediate parent of each subxid, only the - * top-level transaction that each subxact belongs to. This is correct in - * recovery only because aborted subtransactions are separately WAL - * logged. - * - * This is correct even for the case where several levels above us didn't - * have an xid assigned as we recursed up to them beforehand. - */ - if (isSubXact && XLogStandbyInfoActive()) - { - unreportedXids[nUnreportedXids] = s->transactionId; - nUnreportedXids++; - - /* - * ensure this test matches similar one in - * RecoverPreparedTransactions() - */ - if (nUnreportedXids >= PGPROC_MAX_CACHED_SUBXIDS || - log_unknown_top) - { - xl_xact_assignment xlrec; - - /* - * xtop is always set by now because we recurse up transaction - * stack to the highest unassigned xid and then come back down - */ - xlrec.xtop = GetTopTransactionId(); - Assert(TransactionIdIsValid(xlrec.xtop)); - xlrec.nsubxacts = nUnreportedXids; - - XLogBeginInsert(); - XLogRegisterData((char *) &xlrec, MinSizeOfXactAssignment); - XLogRegisterData((char *) unreportedXids, - nUnreportedXids * sizeof(TransactionId)); - - (void) XLogInsert(RM_XACT_ID, XLOG_XACT_ASSIGNMENT); - - nUnreportedXids = 0; - /* mark top, not current xact as having been logged */ - TopTransactionStateData.didLogXid = true; - } - } } /* @@ -1109,17 +1031,13 @@ AtSubStart_ResourceOwner(void) /* * RecordTransactionCommit * - * Returns latest XID among xact and its children, or InvalidTransactionId - * if the xact has no XID. (We compute that here just because it's easier.) - * * If you change this function, see RecordTransactionCommitPrepared also. */ -static TransactionId +static void RecordTransactionCommit(void) { TransactionId xid = GetTopTransactionIdIfAny(); bool markXidCommitted = TransactionIdIsValid(xid); - TransactionId latestXid = InvalidTransactionId; int nrels; RelFileNode *rels; int nchildren; @@ -1283,7 +1201,7 @@ RecordTransactionCommit(void) XLogFlush(XactLastRecEnd); /* - * Now we may update the CLOG, if we wrote a COMMIT record above + * Now we may update the CLOG and CSNLOG, if we wrote a COMMIT record above */ if (markXidCommitted) TransactionIdCommitTree(xid, nchildren, children); @@ -1309,7 +1227,8 @@ RecordTransactionCommit(void) * flushed before the CLOG may be updated. */ if (markXidCommitted) - TransactionIdAsyncCommitTree(xid, nchildren, children, XactLastRecEnd); + TransactionIdAsyncCommitTree(xid, nchildren, children, + XactLastRecEnd); } /* @@ -1322,9 +1241,6 @@ RecordTransactionCommit(void) END_CRIT_SECTION(); } - /* Compute latestXid while we have the child XIDs handy */ - latestXid = TransactionIdLatest(xid, nchildren, children); - /* * Wait for synchronous replication, if required. Similar to the decision * above about using committing asynchronously we only want to wait if @@ -1346,8 +1262,6 @@ cleanup: /* Clean up local data */ if (rels) pfree(rels); - - return latestXid; } @@ -1515,15 +1429,11 @@ AtSubCommit_childXids(void) /* * RecordTransactionAbort - * - * Returns latest XID among xact and its children, or InvalidTransactionId - * if the xact has no XID. (We compute that here just because it's easier.) */ -static TransactionId +static void RecordTransactionAbort(bool isSubXact) { TransactionId xid = GetCurrentTransactionIdIfAny(); - TransactionId latestXid; int nrels; RelFileNode *rels; int nchildren; @@ -1541,7 +1451,7 @@ RecordTransactionAbort(bool isSubXact) /* Reset XactLastRecEnd until the next transaction writes something */ if (!isSubXact) XactLastRecEnd = 0; - return InvalidTransactionId; + return; } /* @@ -1604,18 +1514,6 @@ RecordTransactionAbort(bool isSubXact) END_CRIT_SECTION(); - /* Compute latestXid while we have the child XIDs handy */ - latestXid = TransactionIdLatest(xid, nchildren, children); - - /* - * If we're aborting a subtransaction, we can immediately remove failed - * XIDs from PGPROC's cache of running child XIDs. We do that here for - * subxacts, because we already have the child XID array at hand. For - * main xacts, the equivalent happens just after this function returns. - */ - if (isSubXact) - XidCacheRemoveRunningXids(xid, nchildren, children, latestXid); - /* Reset XactLastRecEnd until the next transaction writes something */ if (!isSubXact) XactLastRecEnd = 0; @@ -1623,8 +1521,6 @@ RecordTransactionAbort(bool isSubXact) /* And clean up local data */ if (rels) pfree(rels); - - return latestXid; } /* @@ -1851,12 +1747,6 @@ StartTransaction(void) currentCommandIdUsed = false; /* - * initialize reported xid accounting - */ - nUnreportedXids = 0; - s->didLogXid = false; - - /* * must initialize resource-management stuff first */ AtStart_Memory(); @@ -1933,7 +1823,6 @@ static void CommitTransaction(void) { TransactionState s = CurrentTransactionState; - TransactionId latestXid; bool is_parallel_worker; is_parallel_worker = (s->blockState == TBLOCK_PARALLEL_INPROGRESS); @@ -2033,17 +1922,11 @@ CommitTransaction(void) * We need to mark our XIDs as committed in pg_xact. This is where we * durably commit. */ - latestXid = RecordTransactionCommit(); + RecordTransactionCommit(); } else { /* - * We must not mark our XID committed; the parallel master is - * responsible for that. - */ - latestXid = InvalidTransactionId; - - /* * Make sure the master will know about any WAL we wrote before it * commits. */ @@ -2057,7 +1940,7 @@ CommitTransaction(void) * must be done _before_ releasing locks we hold and _after_ * RecordTransactionCommit. */ - ProcArrayEndTransaction(MyProc, latestXid); + ProcArrayEndTransaction(MyProc); /* * This is all post-commit cleanup. Note that if an error is raised here, @@ -2444,7 +2327,6 @@ static void AbortTransaction(void) { TransactionState s = CurrentTransactionState; - TransactionId latestXid; bool is_parallel_worker; /* Prevent cancel/die interrupt while cleaning up */ @@ -2549,11 +2431,9 @@ AbortTransaction(void) * record. */ if (!is_parallel_worker) - latestXid = RecordTransactionAbort(false); + RecordTransactionAbort(false); else { - latestXid = InvalidTransactionId; - /* * Since the parallel master won't get our value of XactLastRecEnd in * this case, we nudge WAL-writer ourselves in this case. See related @@ -2569,7 +2449,7 @@ AbortTransaction(void) * must be done _before_ releasing locks we hold and _after_ * RecordTransactionAbort. */ - ProcArrayEndTransaction(MyProc, latestXid); + ProcArrayEndTransaction(MyProc); /* * Post-abort cleanup. See notes in CommitTransaction() concerning @@ -5530,9 +5410,12 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, if (standbyState == STANDBY_DISABLED) { /* - * Mark the transaction committed in pg_xact. + * Mark the transaction committed in pg_xact. We don't bother updating + * pg_csnlog during replay. */ - TransactionIdCommitTree(xid, parsed->nsubxacts, parsed->subxacts); + CLogSetTreeStatus(xid, parsed->nsubxacts, parsed->subxacts, + CLOG_XID_STATUS_COMMITTED, + InvalidXLogRecPtr); } else { @@ -5556,14 +5439,7 @@ xact_redo_commit(xl_xact_parsed_commit *parsed, * bits set on changes made by transactions that haven't yet * recovered. It's unlikely but it's good to be safe. */ - TransactionIdAsyncCommitTree( - xid, parsed->nsubxacts, parsed->subxacts, lsn); - - /* - * We must mark clog before we update the ProcArray. - */ - ExpireTreeKnownAssignedTransactionIds( - xid, parsed->nsubxacts, parsed->subxacts, max_xid); + TransactionIdAsyncCommitTree(xid, parsed->nsubxacts, parsed->subxacts, lsn); /* * Send any cache invalidations attached to the commit. We must @@ -5688,8 +5564,13 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) if (standbyState == STANDBY_DISABLED) { - /* Mark the transaction aborted in pg_xact, no need for async stuff */ - TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); + /* + * Mark the transaction aborted in pg_xact, no need for async stuff or + * to update pg_csnlog. + */ + CLogSetTreeStatus(xid, parsed->nsubxacts, parsed->subxacts, + CLOG_XID_STATUS_ABORTED, + InvalidXLogRecPtr); } else { @@ -5708,12 +5589,6 @@ xact_redo_abort(xl_xact_parsed_abort *parsed, TransactionId xid) TransactionIdAbortTree(xid, parsed->nsubxacts, parsed->subxacts); /* - * We must update the ProcArray after we have marked clog. - */ - ExpireTreeKnownAssignedTransactionIds( - xid, parsed->nsubxacts, parsed->subxacts, max_xid); - - /* * There are no flat files that need updating, nor invalidation * messages to send or undo. */ @@ -5802,14 +5677,6 @@ xact_redo(XLogReaderState *record) record->EndRecPtr); LWLockRelease(TwoPhaseStateLock); } - else if (info == XLOG_XACT_ASSIGNMENT) - { - xl_xact_assignment *xlrec = (xl_xact_assignment *) XLogRecGetData(record); - - if (standbyState >= STANDBY_INITIALIZED) - ProcArrayApplyXidAssignment(xlrec->xtop, - xlrec->nsubxacts, xlrec->xsub); - } else elog(PANIC, "xact_redo: unknown op code %u", info); } diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index e729180f82..f7781cbabc 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -24,7 +24,9 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/csnlog.h" #include "access/multixact.h" +#include "access/mvccvars.h" #include "access/rewriteheap.h" #include "access/subtrans.h" #include "access/timeline.h" @@ -1103,8 +1105,6 @@ XLogInsertRecord(XLogRecData *rdata, */ WALInsertLockRelease(); - MarkCurrentTransactionIdLoggedIfAny(); - END_CRIT_SECTION(); /* @@ -5013,6 +5013,7 @@ BootStrapXLOG(void) char mock_auth_nonce[MOCK_AUTH_NONCE_LEN]; struct timeval tv; pg_crc32c crc; + TransactionId latestCompletedXid; /* * Select a hopefully-unique system identifier code for this installation. @@ -5078,6 +5079,13 @@ BootStrapXLOG(void) ShmemVariableCache->nextXid = checkPoint.nextXid; ShmemVariableCache->nextOid = checkPoint.nextOid; ShmemVariableCache->oidCount = 0; + + pg_atomic_write_u64(&ShmemVariableCache->nextCommitSeqNo, COMMITSEQNO_FIRST_NORMAL); + latestCompletedXid = checkPoint.nextXid; + TransactionIdRetreat(latestCompletedXid); + pg_atomic_write_u32(&ShmemVariableCache->latestCompletedXid, latestCompletedXid); + pg_atomic_write_u32(&ShmemVariableCache->oldestActiveXid, checkPoint.nextXid); + MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); AdvanceOldestClogXid(checkPoint.oldestXid); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); @@ -5176,8 +5184,8 @@ BootStrapXLOG(void) /* Bootstrap the commit log, too */ BootStrapCLOG(); + BootStrapCSNLOG(); BootStrapCommitTs(); - BootStrapSUBTRANS(); BootStrapMultiXact(); pfree(buffer); @@ -6283,6 +6291,7 @@ StartupXLOG(void) XLogPageReadPrivate private; bool fast_promoted = false; struct stat st; + TransactionId latestCompletedXid; /* * Verify XLOG status looks valid. @@ -6694,6 +6703,12 @@ StartupXLOG(void) XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch; XLogCtl->ckptXid = checkPoint.nextXid; + pg_atomic_write_u64(&ShmemVariableCache->nextCommitSeqNo, COMMITSEQNO_FIRST_NORMAL); + latestCompletedXid = checkPoint.nextXid; + TransactionIdRetreat(latestCompletedXid); + pg_atomic_write_u32(&ShmemVariableCache->latestCompletedXid, latestCompletedXid); + pg_atomic_write_u32(&ShmemVariableCache->oldestActiveXid, checkPoint.nextXid); + /* * Initialize replication slots, before there's a chance to remove * required resources. @@ -6945,15 +6960,15 @@ StartupXLOG(void) Assert(TransactionIdIsValid(oldestActiveXID)); /* Tell procarray about the range of xids it has to deal with */ - ProcArrayInitRecovery(ShmemVariableCache->nextXid); + ProcArrayInitRecovery(oldestActiveXID, ShmemVariableCache->nextXid); /* - * Startup commit log and subtrans only. MultiXact and commit + * Startup commit log and csnlog only. MultiXact and commit * timestamp have already been started up and other SLRUs are not * maintained during recovery and need not be started yet. */ StartupCLOG(); - StartupSUBTRANS(oldestActiveXID); + StartupCSNLOG(oldestActiveXID); /* * If we're beginning at a shutdown checkpoint, we know that @@ -6964,7 +6979,6 @@ StartupXLOG(void) if (wasShutdown) { RunningTransactionsData running; - TransactionId latestCompletedXid; /* * Construct a RunningTransactions snapshot representing a @@ -6972,16 +6986,8 @@ StartupXLOG(void) * alive. We're never overflowed at this point because all * subxids are listed with their parent prepared transactions. */ - running.xcnt = nxids; - running.subxcnt = 0; - running.subxid_overflow = false; running.nextXid = checkPoint.nextXid; running.oldestRunningXid = oldestActiveXID; - latestCompletedXid = checkPoint.nextXid; - TransactionIdRetreat(latestCompletedXid); - Assert(TransactionIdIsNormal(latestCompletedXid)); - running.latestCompletedXid = latestCompletedXid; - running.xids = xids; ProcArrayApplyRecoveryInfo(&running); @@ -7725,20 +7731,22 @@ StartupXLOG(void) XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL); XLogCtl->lastSegSwitchLSN = EndOfLog; - /* also initialize latestCompletedXid, to nextXid - 1 */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid; - TransactionIdRetreat(ShmemVariableCache->latestCompletedXid); - LWLockRelease(ProcArrayLock); + /* also initialize latestCompletedXid, to nextXid - 1, and oldestActiveXid */ + latestCompletedXid = ShmemVariableCache->nextXid; + TransactionIdRetreat(latestCompletedXid); + pg_atomic_write_u32(&ShmemVariableCache->latestCompletedXid, + latestCompletedXid); + pg_atomic_write_u32(&ShmemVariableCache->oldestActiveXid, + oldestActiveXID); /* - * Start up the commit log and subtrans, if not already done for hot + * Start up the commit log and csnlog, if not already done for hot * standby. (commit timestamps are started below, if necessary.) */ if (standbyState == STANDBY_DISABLED) { StartupCLOG(); - StartupSUBTRANS(oldestActiveXID); + StartupCSNLOG(oldestActiveXID); } /* @@ -8390,8 +8398,8 @@ ShutdownXLOG(int code, Datum arg) CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE); } ShutdownCLOG(); + ShutdownCSNLOG(); ShutdownCommitTs(); - ShutdownSUBTRANS(); ShutdownMultiXact(); } @@ -8959,14 +8967,14 @@ CreateCheckPoint(int flags) PreallocXlogFiles(recptr); /* - * Truncate pg_subtrans if possible. We can throw away all data before + * Truncate pg_csnlog if possible. We can throw away all data before * the oldest XMIN of any running transaction. No future transaction will - * attempt to reference any pg_subtrans entry older than that (see Asserts - * in subtrans.c). During recovery, though, we mustn't do this because - * StartupSUBTRANS hasn't been called yet. + * attempt to reference any pg_csnlog entry older than that (see Asserts + * in csnlog.c). During recovery, though, we mustn't do this because + * StartupCSNLOG hasn't been called yet. */ if (!RecoveryInProgress()) - TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); + TruncateCSNLOG(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); /* Real work is done, but log and update stats before releasing lock. */ LogCheckpointEnd(false); @@ -9042,13 +9050,12 @@ static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags) { CheckPointCLOG(); + CheckPointCSNLOG(); CheckPointCommitTs(); - CheckPointSUBTRANS(); CheckPointMultiXact(); CheckPointPredicate(); CheckPointRelationMap(); CheckPointReplicationSlots(); - CheckPointSnapBuild(); CheckPointLogicalRewriteHeap(); CheckPointBuffers(flags); /* performs all required fsyncs */ CheckPointReplicationOrigin(); @@ -9320,14 +9327,14 @@ CreateRestartPoint(int flags) } /* - * Truncate pg_subtrans if possible. We can throw away all data before + * Truncate pg_csnlog if possible. We can throw away all data before * the oldest XMIN of any running transaction. No future transaction will - * attempt to reference any pg_subtrans entry older than that (see Asserts - * in subtrans.c). When hot standby is disabled, though, we mustn't do - * this because StartupSUBTRANS hasn't been called yet. + * attempt to reference any pg_csnlog entry older than that (see Asserts + * in csnlog.c). When hot standby is disabled, though, we mustn't do + * this because StartupCSNLOG hasn't been called yet. */ if (EnableHotStandby) - TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); + TruncateCSNLOG(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT)); /* Real work is done, but log and update before releasing lock. */ LogCheckpointEnd(true); @@ -9714,7 +9721,6 @@ xlog_redo(XLogReaderState *record) TransactionId *xids; int nxids; TransactionId oldestActiveXID; - TransactionId latestCompletedXid; RunningTransactionsData running; oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids); @@ -9725,16 +9731,8 @@ xlog_redo(XLogReaderState *record) * never overflowed at this point because all subxids are listed * with their parent prepared transactions. */ - running.xcnt = nxids; - running.subxcnt = 0; - running.subxid_overflow = false; running.nextXid = checkPoint.nextXid; running.oldestRunningXid = oldestActiveXID; - latestCompletedXid = checkPoint.nextXid; - TransactionIdRetreat(latestCompletedXid); - Assert(TransactionIdIsNormal(latestCompletedXid)); - running.latestCompletedXid = latestCompletedXid; - running.xids = xids; ProcArrayApplyRecoveryInfo(&running); diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c index 9e14880b99..4be3a23900 100644 --- a/src/backend/catalog/heap.c +++ b/src/backend/catalog/heap.c @@ -69,6 +69,7 @@ #include "parser/parse_relation.h" #include "storage/lmgr.h" #include "storage/predicate.h" +#include "storage/procarray.h" #include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -895,7 +896,7 @@ AddNewRelationTuple(Relation pg_class_desc, * We know that no xacts older than RecentXmin are still running, so * that will do. */ - new_rel_reltup->relfrozenxid = RecentXmin; + new_rel_reltup->relfrozenxid = GetOldestActiveTransactionId(); /* * Similarly, initialize the minimum Multixact to the first value that diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index f7de742a56..9af6fe6a3d 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -1942,23 +1942,34 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current, /* Ignore messages destined for other databases */ if (qe->dboid == MyDatabaseId) { - if (XidInMVCCSnapshot(qe->xid, snapshot)) + TransactionIdStatus status; + if (XidVisibleInSnapshot(qe->xid, snapshot, &status)) + { + /* qe->data is the null-terminated channel name */ + char *channel = qe->data; + + Assert(status == XID_COMMITTED); + + if (IsListeningOn(channel)) + { + /* payload follows channel name */ + char *payload = qe->data + strlen(channel) + 1; + + NotifyMyFrontEnd(channel, payload, qe->srcPid); + } + } + else if (status == XID_INPROGRESS || status == XID_COMMITTED) { /* - * The source transaction is still in progress, so we can't - * process this message yet. Break out of the loop, but first - * back up *current so we will reprocess the message next - * time. (Note: it is unlikely but not impossible for - * TransactionIdDidCommit to fail, so we can't really avoid + * The source transaction is still in progress accroding to our + * snapshot, so we can't process this message yet. Break out + * of the loop, but first back up *current so we will reprocess + * the message next time. (Note: it is unlikely but not impossible + * for TransactionIdDidCommit to fail, so we can't really avoid * this advance-then-back-up behavior when dealing with an * uncommitted message.) * - * Note that we must test XidInMVCCSnapshot before we test - * TransactionIdDidCommit, else we might return a message from - * a transaction that is not yet visible to snapshots; compare - * the comments at the head of tqual.c. - * - * Also, while our own xact won't be listed in the snapshot, + * Note that while our own xact won't be listed in the snapshot, * we need not check for TransactionIdIsCurrentTransactionId * because our transaction cannot (yet) have queued any * messages. @@ -1967,21 +1978,9 @@ asyncQueueProcessPageEntries(volatile QueuePosition *current, reachedStop = true; break; } - else if (TransactionIdDidCommit(qe->xid)) - { - /* qe->data is the null-terminated channel name */ - char *channel = qe->data; - - if (IsListeningOn(channel)) - { - /* payload follows channel name */ - char *payload = qe->data + strlen(channel) + 1; - - NotifyMyFrontEnd(channel, payload, qe->srcPid); - } - } else { + Assert(status == XID_ABORTED); /* * The source transaction aborted or crashed, so we just * ignore its notifications. diff --git a/src/backend/commands/matview.c b/src/backend/commands/matview.c index d2e0376511..26575706a8 100644 --- a/src/backend/commands/matview.c +++ b/src/backend/commands/matview.c @@ -33,6 +33,7 @@ #include "pgstat.h" #include "rewrite/rewriteHandler.h" #include "storage/lmgr.h" +#include "storage/procarray.h" #include "storage/smgr.h" #include "tcop/tcopprot.h" #include "utils/builtins.h" @@ -842,7 +843,8 @@ static void refresh_by_heap_swap(Oid matviewOid, Oid OIDNewHeap, char relpersistence) { finish_heap_swap(matviewOid, OIDNewHeap, false, false, true, true, - RecentXmin, ReadNextMultiXactId(), relpersistence); + GetOldestActiveTransactionId(), ReadNextMultiXactId(), + relpersistence); } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index d19846d005..ea4234864d 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -87,6 +87,7 @@ #include "storage/lmgr.h" #include "storage/lock.h" #include "storage/predicate.h" +#include "storage/procarray.h" #include "storage/smgr.h" #include "utils/acl.h" #include "utils/builtins.h" @@ -1474,7 +1475,7 @@ ExecuteTruncate(TruncateStmt *stmt) * deletion at commit. */ RelationSetNewRelfilenode(rel, rel->rd_rel->relpersistence, - RecentXmin, minmulti); + GetOldestActiveTransactionId(), minmulti); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) heap_create_init_fork(rel); @@ -1488,7 +1489,7 @@ ExecuteTruncate(TruncateStmt *stmt) { rel = relation_open(toast_relid, AccessExclusiveLock); RelationSetNewRelfilenode(rel, rel->rd_rel->relpersistence, - RecentXmin, minmulti); + GetOldestActiveTransactionId(), minmulti); if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) heap_create_init_fork(rel); heap_close(rel, NoLock); @@ -4294,7 +4295,7 @@ ATRewriteTables(AlterTableStmt *parsetree, List **wqueue, LOCKMODE lockmode) finish_heap_swap(tab->relid, OIDNewHeap, false, false, true, !OidIsValid(tab->newTableSpace), - RecentXmin, + GetOldestActiveTransactionId(), ReadNextMultiXactId(), persistence); } diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 486fd0c988..23d36401a6 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -165,7 +165,6 @@ LogicalDecodingProcessRecord(LogicalDecodingContext *ctx, XLogReaderState *recor static void DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) { - SnapBuild *builder = ctx->snapshot_builder; uint8 info = XLogRecGetInfo(buf->record) & ~XLR_INFO_MASK; ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(buf->record), @@ -176,8 +175,6 @@ DecodeXLogOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) /* this is also used in END_OF_RECOVERY checkpoints */ case XLOG_CHECKPOINT_SHUTDOWN: case XLOG_END_OF_RECOVERY: - SnapBuildSerializationPoint(builder, buf->origptr); - break; case XLOG_CHECKPOINT_ONLINE: @@ -217,8 +214,11 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) * ok not to call ReorderBufferProcessXid() in that case, except in the * assignment case there'll not be any later records with the same xid; * and in the assignment case we'll not decode those xacts. + * + * FIXME: the assignment record is no more. I don't understand the above + * comment. Can it be just removed? */ - if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) return; switch (info) @@ -259,23 +259,6 @@ DecodeXactOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) DecodeAbort(ctx, buf, &parsed, xid); break; } - case XLOG_XACT_ASSIGNMENT: - { - xl_xact_assignment *xlrec; - int i; - TransactionId *sub_xid; - - xlrec = (xl_xact_assignment *) XLogRecGetData(r); - - sub_xid = &xlrec->xsub[0]; - - for (i = 0; i < xlrec->nsubxacts; i++) - { - ReorderBufferAssignChild(reorder, xlrec->xtop, - *(sub_xid++), buf->origptr); - } - break; - } case XLOG_XACT_PREPARE: /* @@ -354,7 +337,7 @@ DecodeHeap2Op(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); /* no point in doing anything yet */ - if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) return; switch (info) @@ -409,7 +392,7 @@ DecodeHeapOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) ReorderBufferProcessXid(ctx->reorder, xid, buf->origptr); /* no point in doing anything yet */ - if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) return; switch (info) @@ -502,7 +485,7 @@ DecodeLogicalMsgOp(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) ReorderBufferProcessXid(ctx->reorder, XLogRecGetXid(r), buf->origptr); /* No point in doing anything yet. */ - if (SnapBuildCurrentState(builder) < SNAPBUILD_FULL_SNAPSHOT) + if (SnapBuildCurrentState(builder) < SNAPBUILD_CONSISTENT) return; message = (xl_logical_message *) XLogRecGetData(r); diff --git a/src/backend/replication/logical/logical.c b/src/backend/replication/logical/logical.c index bca585fc27..1f212cc04e 100644 --- a/src/backend/replication/logical/logical.c +++ b/src/backend/replication/logical/logical.c @@ -113,7 +113,7 @@ CheckLogicalDecodingRequirements(void) static LogicalDecodingContext * StartupDecodingContext(List *output_plugin_options, XLogRecPtr start_lsn, - TransactionId xmin_horizon, + bool need_full_snapshot, XLogPageReadCB read_page, LogicalOutputPluginWriterPrepareWrite prepare_write, @@ -173,7 +173,7 @@ StartupDecodingContext(List *output_plugin_options, ctx->reorder = ReorderBufferAllocate(); ctx->snapshot_builder = - AllocateSnapshotBuilder(ctx->reorder, xmin_horizon, start_lsn, + AllocateSnapshotBuilder(ctx->reorder, start_lsn, need_full_snapshot); ctx->reorder->private_data = ctx; @@ -302,7 +302,7 @@ CreateInitDecodingContext(char *plugin, ReplicationSlotMarkDirty(); ReplicationSlotSave(); - ctx = StartupDecodingContext(NIL, InvalidXLogRecPtr, xmin_horizon, + ctx = StartupDecodingContext(NIL, InvalidXLogRecPtr, need_full_snapshot, read_page, prepare_write, do_write, update_progress); @@ -394,10 +394,9 @@ CreateDecodingContext(XLogRecPtr start_lsn, } ctx = StartupDecodingContext(output_plugin_options, - start_lsn, InvalidTransactionId, false, + start_lsn, false, read_page, prepare_write, do_write, update_progress); - /* call output plugin initialization callback */ old_context = MemoryContextSwitchTo(ctx->context); if (ctx->callbacks.startup_cb != NULL) @@ -777,12 +776,12 @@ message_cb_wrapper(ReorderBuffer *cache, ReorderBufferTXN *txn, } /* - * Set the required catalog xmin horizon for historic snapshots in the current - * replication slot. + * Set the oldest snapshot required for historic catalog lookups in the + * current replication slot. * - * Note that in the most cases, we won't be able to immediately use the xmin - * to increase the xmin horizon: we need to wait till the client has confirmed - * receiving current_lsn with LogicalConfirmReceivedLocation(). + * Note that in the most cases, we won't be able to immediately use the + * snapshot to increase the oldest snapshot, we need to wait till the client + * has confirmed receiving current_lsn with LogicalConfirmReceivedLocation(). */ void LogicalIncreaseXminForSlot(XLogRecPtr current_lsn, TransactionId xmin) diff --git a/src/backend/replication/logical/reorderbuffer.c b/src/backend/replication/logical/reorderbuffer.c index dc0ad5b0e7..d43401287e 100644 --- a/src/backend/replication/logical/reorderbuffer.c +++ b/src/backend/replication/logical/reorderbuffer.c @@ -1190,7 +1190,6 @@ ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, Size size; size = sizeof(SnapshotData) + - sizeof(TransactionId) * orig_snap->xcnt + sizeof(TransactionId) * (txn->nsubtxns + 1); snap = MemoryContextAllocZero(rb->context, size); @@ -1199,36 +1198,33 @@ ReorderBufferCopySnap(ReorderBuffer *rb, Snapshot orig_snap, snap->copied = true; snap->active_count = 1; /* mark as active so nobody frees it */ snap->regd_count = 0; - snap->xip = (TransactionId *) (snap + 1); - - memcpy(snap->xip, orig_snap->xip, sizeof(TransactionId) * snap->xcnt); /* * snap->subxip contains all txids that belong to our transaction which we * need to check via cmin/cmax. That's why we store the toplevel * transaction in there as well. */ - snap->subxip = snap->xip + snap->xcnt; - snap->subxip[i++] = txn->xid; + snap->this_xip = (TransactionId *) (snap + 1); + snap->this_xip[i++] = txn->xid; /* * nsubxcnt isn't decreased when subtransactions abort, so count manually. * Since it's an upper boundary it is safe to use it for the allocation * above. */ - snap->subxcnt = 1; + snap->this_xcnt = 1; dlist_foreach(iter, &txn->subtxns) { ReorderBufferTXN *sub_txn; sub_txn = dlist_container(ReorderBufferTXN, node, iter.cur); - snap->subxip[i++] = sub_txn->xid; - snap->subxcnt++; + snap->this_xip[i++] = sub_txn->xid; + snap->this_xcnt++; } /* sort so we can bsearch() later */ - qsort(snap->subxip, snap->subxcnt, sizeof(TransactionId), xidComparator); + qsort(snap->this_xip, snap->this_xcnt, sizeof(TransactionId), xidComparator); /* store the specified current CommandId */ snap->curcid = cid; @@ -1300,6 +1296,7 @@ ReorderBufferCommit(ReorderBuffer *rb, TransactionId xid, } snapshot_now = txn->base_snapshot; + Assert(snapshot_now->snapshotcsn != InvalidCommitSeqNo); /* build data to be able to lookup the CommandIds of catalog tuples */ ReorderBufferBuildTupleCidHash(rb, txn); @@ -2192,10 +2189,7 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, snap = change->data.snapshot; - sz += sizeof(SnapshotData) + - sizeof(TransactionId) * snap->xcnt + - sizeof(TransactionId) * snap->subxcnt - ; + sz += sizeof(SnapshotData); /* make sure we have enough space */ ReorderBufferSerializeReserve(rb, sz); @@ -2205,20 +2199,6 @@ ReorderBufferSerializeChange(ReorderBuffer *rb, ReorderBufferTXN *txn, memcpy(data, snap, sizeof(SnapshotData)); data += sizeof(SnapshotData); - - if (snap->xcnt) - { - memcpy(data, snap->xip, - sizeof(TransactionId) * snap->xcnt); - data += sizeof(TransactionId) * snap->xcnt; - } - - if (snap->subxcnt) - { - memcpy(data, snap->subxip, - sizeof(TransactionId) * snap->subxcnt); - data += sizeof(TransactionId) * snap->subxcnt; - } break; } case REORDER_BUFFER_CHANGE_INTERNAL_SPEC_CONFIRM: @@ -2484,24 +2464,16 @@ ReorderBufferRestoreChange(ReorderBuffer *rb, ReorderBufferTXN *txn, } case REORDER_BUFFER_CHANGE_INTERNAL_SNAPSHOT: { - Snapshot oldsnap; Snapshot newsnap; Size size; - oldsnap = (Snapshot) data; - - size = sizeof(SnapshotData) + - sizeof(TransactionId) * oldsnap->xcnt + - sizeof(TransactionId) * (oldsnap->subxcnt + 0); + size = sizeof(SnapshotData); change->data.snapshot = MemoryContextAllocZero(rb->context, size); newsnap = change->data.snapshot; memcpy(newsnap, data, size); - newsnap->xip = (TransactionId *) - (((char *) newsnap) + sizeof(SnapshotData)); - newsnap->subxip = newsnap->xip + newsnap->xcnt; newsnap->copied = true; break; } @@ -3153,7 +3125,7 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) continue; /* not for our transaction */ - if (!TransactionIdInArray(f_mapped_xid, snapshot->subxip, snapshot->subxcnt)) + if (!TransactionIdInArray(f_mapped_xid, snapshot->this_xip, snapshot->this_xcnt)) continue; /* ok, relevant, queue for apply */ @@ -3181,7 +3153,7 @@ UpdateLogicalMappings(HTAB *tuplecid_data, Oid relid, Snapshot snapshot) RewriteMappingFile *f = files_a[off]; elog(DEBUG1, "applying mapping: \"%s\" in %u", f->fname, - snapshot->subxip[0]); + snapshot->this_xip[0]); ApplyLogicalMappingFile(tuplecid_data, relid, f->fname); pfree(f); } diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c index ad65b9831d..580d45b252 100644 --- a/src/backend/replication/logical/snapbuild.c +++ b/src/backend/replication/logical/snapbuild.c @@ -164,17 +164,15 @@ struct SnapBuild /* all transactions >= than this are uncommitted */ TransactionId xmax; + /* this determines the state of transactions between xmin and xmax */ + CommitSeqNo snapshotcsn; + /* * Don't replay commits from an LSN < this LSN. This can be set externally * but it will also be advanced (never retreat) from within snapbuild.c. */ XLogRecPtr start_decoding_at; - /* - * Don't start decoding WAL until the "xl_running_xacts" information - * indicates there are no running xids with an xid smaller than this. - */ - TransactionId initial_xmin_horizon; /* Indicates if we are building full snapshot or just catalog one. */ bool building_full_snapshot; @@ -185,70 +183,9 @@ struct SnapBuild Snapshot snapshot; /* - * LSN of the last location we are sure a snapshot has been serialized to. - */ - XLogRecPtr last_serialized_snapshot; - - /* * The reorderbuffer we need to update with usable snapshots et al. */ ReorderBuffer *reorder; - - /* - * Outdated: This struct isn't used for its original purpose anymore, but - * can't be removed / changed in a minor version, because it's stored - * on-disk. - */ - struct - { - /* - * NB: This field is misused, until a major version can break on-disk - * compatibility. See SnapBuildNextPhaseAt() / - * SnapBuildStartNextPhaseAt(). - */ - TransactionId was_xmin; - TransactionId was_xmax; - - size_t was_xcnt; /* number of used xip entries */ - size_t was_xcnt_space; /* allocated size of xip */ - TransactionId *was_xip; /* running xacts array, xidComparator-sorted */ - } was_running; - - /* - * Array of transactions which could have catalog changes that committed - * between xmin and xmax. - */ - struct - { - /* number of committed transactions */ - size_t xcnt; - - /* available space for committed transactions */ - size_t xcnt_space; - - /* - * Until we reach a CONSISTENT state, we record commits of all - * transactions, not just the catalog changing ones. Record when that - * changes so we know we cannot export a snapshot safely anymore. - */ - bool includes_all_transactions; - - /* - * Array of committed transactions that have modified the catalog. - * - * As this array is frequently modified we do *not* keep it in - * xidComparator order. Instead we sort the array when building & - * distributing a snapshot. - * - * TODO: It's unclear whether that reasoning has much merit. Every - * time we add something here after becoming consistent will also - * require distributing a snapshot. Storing them sorted would - * potentially also make it easier to purge (but more complicated wrt - * wraparound?). Should be improved if sorting while building the - * snapshot shows up in profiles. - */ - TransactionId *xip; - } committed; }; /* @@ -258,9 +195,6 @@ struct SnapBuild static ResourceOwner SavedResourceOwnerDuringExport = NULL; static bool ExportInProgress = false; -/* ->committed manipulation */ -static void SnapBuildPurgeCommittedTxn(SnapBuild *builder); - /* snapshot building/manipulation/distribution functions */ static Snapshot SnapBuildBuildSnapshot(SnapBuild *builder); @@ -270,41 +204,6 @@ static void SnapBuildSnapIncRefcount(Snapshot snap); static void SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn); -/* xlog reading helper functions for SnapBuildProcessRecord */ -static bool SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running); -static void SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff); - -/* serialization functions */ -static void SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn); -static bool SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn); - -/* - * Return TransactionId after which the next phase of initial snapshot - * building will happen. - */ -static inline TransactionId -SnapBuildNextPhaseAt(SnapBuild *builder) -{ - /* - * For backward compatibility reasons this has to be stored in the wrongly - * named field. Will be fixed in next major version. - */ - return builder->was_running.was_xmax; -} - -/* - * Set TransactionId after which the next phase of initial snapshot building - * will happen. - */ -static inline void -SnapBuildStartNextPhaseAt(SnapBuild *builder, TransactionId at) -{ - /* - * For backward compatibility reasons this has to be stored in the wrongly - * named field. Will be fixed in next major version. - */ - builder->was_running.was_xmax = at; -} /* * Allocate a new snapshot builder. @@ -314,7 +213,6 @@ SnapBuildStartNextPhaseAt(SnapBuild *builder, TransactionId at) */ SnapBuild * AllocateSnapshotBuilder(ReorderBuffer *reorder, - TransactionId xmin_horizon, XLogRecPtr start_lsn, bool need_full_snapshot) { @@ -335,13 +233,6 @@ AllocateSnapshotBuilder(ReorderBuffer *reorder, builder->reorder = reorder; /* Other struct members initialized by zeroing via palloc0 above */ - builder->committed.xcnt = 0; - builder->committed.xcnt_space = 128; /* arbitrary number */ - builder->committed.xip = - palloc0(builder->committed.xcnt_space * sizeof(TransactionId)); - builder->committed.includes_all_transactions = true; - - builder->initial_xmin_horizon = xmin_horizon; builder->start_decoding_at = start_lsn; builder->building_full_snapshot = need_full_snapshot; @@ -380,7 +271,6 @@ SnapBuildFreeSnapshot(Snapshot snap) /* make sure nobody modified our snapshot */ Assert(snap->curcid == FirstCommandId); - Assert(!snap->suboverflowed); Assert(!snap->takenDuringRecovery); Assert(snap->regd_count == 0); @@ -438,7 +328,6 @@ SnapBuildSnapDecRefcount(Snapshot snap) /* make sure nobody modified our snapshot */ Assert(snap->curcid == FirstCommandId); - Assert(!snap->suboverflowed); Assert(!snap->takenDuringRecovery); Assert(snap->regd_count == 0); @@ -468,10 +357,9 @@ SnapBuildBuildSnapshot(SnapBuild *builder) Snapshot snapshot; Size ssize; - Assert(builder->state >= SNAPBUILD_FULL_SNAPSHOT); + Assert(builder->state >= SNAPBUILD_CONSISTENT); ssize = sizeof(SnapshotData) - + sizeof(TransactionId) * builder->committed.xcnt + sizeof(TransactionId) * 1 /* toplevel xid */ ; snapshot = MemoryContextAllocZero(builder->context, ssize); @@ -479,52 +367,34 @@ SnapBuildBuildSnapshot(SnapBuild *builder) snapshot->satisfies = HeapTupleSatisfiesHistoricMVCC; /* - * We misuse the original meaning of SnapshotData's xip and subxip fields - * to make the more fitting for our needs. - * - * In the 'xip' array we store transactions that have to be treated as - * committed. Since we will only ever look at tuples from transactions - * that have modified the catalog it's more efficient to store those few - * that exist between xmin and xmax (frequently there are none). - * * Snapshots that are used in transactions that have modified the catalog - * also use the 'subxip' array to store their toplevel xid and all the + * use the 'this_xip' array to store their toplevel xid and all the * subtransaction xids so we can recognize when we need to treat rows as - * visible that are not in xip but still need to be visible. Subxip only + * visible that would not normally be visible by the CSN test. this_xip only * gets filled when the transaction is copied into the context of a * catalog modifying transaction since we otherwise share a snapshot * between transactions. As long as a txn hasn't modified the catalog it * doesn't need to treat any uncommitted rows as visible, so there is no * need for those xids. * - * Both arrays are qsort'ed so that we can use bsearch() on them. + * this_xip array is qsort'ed so that we can use bsearch() on them. */ Assert(TransactionIdIsNormal(builder->xmin)); Assert(TransactionIdIsNormal(builder->xmax)); + Assert(builder->snapshotcsn != InvalidCommitSeqNo); snapshot->xmin = builder->xmin; snapshot->xmax = builder->xmax; - - /* store all transactions to be treated as committed by this snapshot */ - snapshot->xip = - (TransactionId *) ((char *) snapshot + sizeof(SnapshotData)); - snapshot->xcnt = builder->committed.xcnt; - memcpy(snapshot->xip, - builder->committed.xip, - builder->committed.xcnt * sizeof(TransactionId)); - - /* sort so we can bsearch() */ - qsort(snapshot->xip, snapshot->xcnt, sizeof(TransactionId), xidComparator); + snapshot->snapshotcsn = builder->snapshotcsn; /* - * Initially, subxip is empty, i.e. it's a snapshot to be used by + * Initially, this_xip is empty, i.e. it's a snapshot to be used by * transactions that don't modify the catalog. Will be filled by * ReorderBufferCopySnap() if necessary. */ - snapshot->subxcnt = 0; - snapshot->subxip = NULL; + snapshot->this_xcnt = 0; + snapshot->this_xip = NULL; - snapshot->suboverflowed = false; snapshot->takenDuringRecovery = false; snapshot->copied = false; snapshot->curcid = FirstCommandId; @@ -545,9 +415,6 @@ Snapshot SnapBuildInitialSnapshot(SnapBuild *builder) { Snapshot snap; - TransactionId xid; - TransactionId *newxip; - int newxcnt = 0; Assert(!FirstSnapshotSet); Assert(XactIsoLevel == XACT_REPEATABLE_READ); @@ -555,9 +422,6 @@ SnapBuildInitialSnapshot(SnapBuild *builder) if (builder->state != SNAPBUILD_CONSISTENT) elog(ERROR, "cannot build an initial slot snapshot before reaching a consistent state"); - if (!builder->committed.includes_all_transactions) - elog(ERROR, "cannot build an initial slot snapshot, not all transactions are monitored anymore"); - /* so we don't overwrite the existing value */ if (TransactionIdIsValid(MyPgXact->xmin)) elog(ERROR, "cannot build an initial slot snapshot when MyPgXact->xmin already is valid"); @@ -569,56 +433,7 @@ SnapBuildInitialSnapshot(SnapBuild *builder) * mechanism. Due to that we can do this without locks, we're only * changing our own value. */ -#ifdef USE_ASSERT_CHECKING - { - TransactionId safeXid; - - LWLockAcquire(ProcArrayLock, LW_SHARED); - safeXid = GetOldestSafeDecodingTransactionId(false); - LWLockRelease(ProcArrayLock); - - Assert(TransactionIdPrecedesOrEquals(safeXid, snap->xmin)); - } -#endif - - MyPgXact->xmin = snap->xmin; - - /* allocate in transaction context */ - newxip = (TransactionId *) - palloc(sizeof(TransactionId) * GetMaxSnapshotXidCount()); - - /* - * snapbuild.c builds transactions in an "inverted" manner, which means it - * stores committed transactions in ->xip, not ones in progress. Build a - * classical snapshot by marking all non-committed transactions as - * in-progress. This can be expensive. - */ - for (xid = snap->xmin; NormalTransactionIdPrecedes(xid, snap->xmax);) - { - void *test; - - /* - * Check whether transaction committed using the decoding snapshot - * meaning of ->xip. - */ - test = bsearch(&xid, snap->xip, snap->xcnt, - sizeof(TransactionId), xidComparator); - - if (test == NULL) - { - if (newxcnt >= GetMaxSnapshotXidCount()) - ereport(ERROR, - (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), - errmsg("initial slot snapshot too large"))); - - newxip[newxcnt++] = xid; - } - - TransactionIdAdvance(xid); - } - - snap->xcnt = newxcnt; - snap->xip = newxip; + MyPgXact->snapshotcsn = snap->snapshotcsn; return snap; } @@ -661,10 +476,10 @@ SnapBuildExportSnapshot(SnapBuild *builder) snapname = ExportSnapshot(snap); ereport(LOG, - (errmsg_plural("exported logical decoding snapshot: \"%s\" with %u transaction ID", - "exported logical decoding snapshot: \"%s\" with %u transaction IDs", - snap->xcnt, - snapname, snap->xcnt))); + (errmsg("exported logical decoding snapshot: \"%s\" at %X/%X", + snapname, + (uint32) (snap->snapshotcsn >> 32), + (uint32) snap->snapshotcsn))); return snapname; } @@ -722,16 +537,7 @@ SnapBuildProcessChange(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn) * We can't handle data in transactions if we haven't built a snapshot * yet, so don't store them. */ - if (builder->state < SNAPBUILD_FULL_SNAPSHOT) - return false; - - /* - * No point in keeping track of changes in transactions that we don't have - * enough information about to decode. This means that they started before - * we got into the SNAPBUILD_FULL_SNAPSHOT state. - */ - if (builder->state < SNAPBUILD_CONSISTENT && - TransactionIdPrecedes(xid, SnapBuildNextPhaseAt(builder))) + if (builder->state < SNAPBUILD_CONSISTENT) return false; /* @@ -851,76 +657,6 @@ SnapBuildDistributeNewCatalogSnapshot(SnapBuild *builder, XLogRecPtr lsn) } /* - * Keep track of a new catalog changing transaction that has committed. - */ -static void -SnapBuildAddCommittedTxn(SnapBuild *builder, TransactionId xid) -{ - Assert(TransactionIdIsValid(xid)); - - if (builder->committed.xcnt == builder->committed.xcnt_space) - { - builder->committed.xcnt_space = builder->committed.xcnt_space * 2 + 1; - - elog(DEBUG1, "increasing space for committed transactions to %u", - (uint32) builder->committed.xcnt_space); - - builder->committed.xip = repalloc(builder->committed.xip, - builder->committed.xcnt_space * sizeof(TransactionId)); - } - - /* - * TODO: It might make sense to keep the array sorted here instead of - * doing it every time we build a new snapshot. On the other hand this - * gets called repeatedly when a transaction with subtransactions commits. - */ - builder->committed.xip[builder->committed.xcnt++] = xid; -} - -/* - * Remove knowledge about transactions we treat as committed that are smaller - * than ->xmin. Those won't ever get checked via the ->committed array but via - * the clog machinery, so we don't need to waste memory on them. - */ -static void -SnapBuildPurgeCommittedTxn(SnapBuild *builder) -{ - int off; - TransactionId *workspace; - int surviving_xids = 0; - - /* not ready yet */ - if (!TransactionIdIsNormal(builder->xmin)) - return; - - /* TODO: Neater algorithm than just copying and iterating? */ - workspace = - MemoryContextAlloc(builder->context, - builder->committed.xcnt * sizeof(TransactionId)); - - /* copy xids that still are interesting to workspace */ - for (off = 0; off < builder->committed.xcnt; off++) - { - if (NormalTransactionIdPrecedes(builder->committed.xip[off], - builder->xmin)) - ; /* remove */ - else - workspace[surviving_xids++] = builder->committed.xip[off]; - } - - /* copy workspace back to persistent state */ - memcpy(builder->committed.xip, workspace, - surviving_xids * sizeof(TransactionId)); - - elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u", - (uint32) builder->committed.xcnt, (uint32) surviving_xids, - builder->xmin, builder->xmax); - builder->committed.xcnt = surviving_xids; - - pfree(workspace); -} - -/* * Handle everything that needs to be done when a transaction commits */ void @@ -929,26 +665,19 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, { int nxact; - bool needs_snapshot = false; - bool needs_timetravel = false; - bool sub_needs_timetravel = false; + bool forced_timetravel = false; - TransactionId xmax = xid; + TransactionId xmax; /* - * Transactions preceding BUILDING_SNAPSHOT will neither be decoded, nor - * will they be part of a snapshot. So we don't need to record anything. + * If we couldn't observe every change of a transaction because it was + * already running at the point we started to observe we have to assume it + * made catalog changes. + * + * This has the positive benefit that we afterwards have enough + * information to build an exportable snapshot that's usable by pg_dump et + * al. */ - if (builder->state == SNAPBUILD_START || - (builder->state == SNAPBUILD_BUILDING_SNAPSHOT && - TransactionIdPrecedes(xid, SnapBuildNextPhaseAt(builder)))) - { - /* ensure that only commits after this are getting replayed */ - if (builder->start_decoding_at <= lsn) - builder->start_decoding_at = lsn + 1; - return; - } - if (builder->state < SNAPBUILD_CONSISTENT) { /* ensure that only commits after this are getting replayed */ @@ -956,104 +685,45 @@ SnapBuildCommitTxn(SnapBuild *builder, XLogRecPtr lsn, TransactionId xid, builder->start_decoding_at = lsn + 1; /* - * If building an exportable snapshot, force xid to be tracked, even - * if the transaction didn't modify the catalog. + * We could avoid treating !SnapBuildTxnIsRunning transactions as + * timetravel ones, but we want to be able to export a snapshot when + * we reached consistency. */ - if (builder->building_full_snapshot) - { - needs_timetravel = true; - } + forced_timetravel = true; + elog(DEBUG1, "forced to assume catalog changes for xid %u because it was running too early", xid); } + xmax = builder->xmax; + + if (NormalTransactionIdFollows(xid, xmax)) + xmax = xid; + if (!forced_timetravel) + { + if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid)) + forced_timetravel = true; + } for (nxact = 0; nxact < nsubxacts; nxact++) { TransactionId subxid = subxacts[nxact]; - /* - * Add subtransaction to base snapshot if catalog modifying, we don't - * distinguish to toplevel transactions there. - */ - if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid)) - { - sub_needs_timetravel = true; - needs_snapshot = true; - - elog(DEBUG1, "found subtransaction %u:%u with catalog changes", - xid, subxid); - - SnapBuildAddCommittedTxn(builder, subxid); + if (NormalTransactionIdFollows(subxid, xmax)) + xmax = subxid; - if (NormalTransactionIdFollows(subxid, xmax)) - xmax = subxid; - } - - /* - * If we're forcing timetravel we also need visibility information - * about subtransaction, so keep track of subtransaction's state, even - * if not catalog modifying. Don't need to distribute a snapshot in - * that case. - */ - else if (needs_timetravel) + if (!forced_timetravel) { - SnapBuildAddCommittedTxn(builder, subxid); - if (NormalTransactionIdFollows(subxid, xmax)) - xmax = subxid; + if (ReorderBufferXidHasCatalogChanges(builder->reorder, subxid)) + forced_timetravel = true; } } - /* if top-level modified catalog, it'll need a snapshot */ - if (ReorderBufferXidHasCatalogChanges(builder->reorder, xid)) - { - elog(DEBUG2, "found top level transaction %u, with catalog changes", - xid); - needs_snapshot = true; - needs_timetravel = true; - SnapBuildAddCommittedTxn(builder, xid); - } - else if (sub_needs_timetravel) - { - /* track toplevel txn as well, subxact alone isn't meaningful */ - SnapBuildAddCommittedTxn(builder, xid); - } - else if (needs_timetravel) - { - elog(DEBUG2, "forced transaction %u to do timetravel", xid); - - SnapBuildAddCommittedTxn(builder, xid); - } - - if (!needs_timetravel) - { - /* record that we cannot export a general snapshot anymore */ - builder->committed.includes_all_transactions = false; - } - - Assert(!needs_snapshot || needs_timetravel); - - /* - * Adjust xmax of the snapshot builder, we only do that for committed, - * catalog modifying, transactions, everything else isn't interesting for - * us since we'll never look at the respective rows. - */ - if (needs_timetravel && - (!TransactionIdIsValid(builder->xmax) || - TransactionIdFollowsOrEquals(xmax, builder->xmax))) - { - builder->xmax = xmax; - TransactionIdAdvance(builder->xmax); - } + builder->xmax = xmax; + /* We use the commit record's LSN as the snapshot */ + builder->snapshotcsn = (CommitSeqNo) lsn; /* if there's any reason to build a historic snapshot, do so now */ - if (needs_snapshot) + if (forced_timetravel) { /* - * If we haven't built a complete snapshot yet there's no need to hand - * it out, it wouldn't (and couldn't) be used anyway. - */ - if (builder->state < SNAPBUILD_FULL_SNAPSHOT) - return; - - /* * Decrease the snapshot builder's refcount of the old snapshot, note * that it still will be used if it has been handed out to the * reorderbuffer earlier. @@ -1096,43 +766,20 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact ReorderBufferTXN *txn; /* - * If we're not consistent yet, inspect the record to see whether it - * allows to get closer to being consistent. If we are consistent, dump - * our snapshot so others or we, after a restart, can use it. - */ - if (builder->state < SNAPBUILD_CONSISTENT) - { - /* returns false if there's no point in performing cleanup just yet */ - if (!SnapBuildFindSnapshot(builder, lsn, running)) - return; - } - else - SnapBuildSerialize(builder, lsn); - - /* * Update range of interesting xids based on the running xacts - * information. We don't increase ->xmax using it, because once we are in - * a consistent state we can do that ourselves and much more efficiently - * so, because we only need to do it for catalog transactions since we - * only ever look at those. - * - * NB: We only increase xmax when a catalog modifying transaction commits - * (see SnapBuildCommitTxn). Because of this, xmax can be lower than - * xmin, which looks odd but is correct and actually more efficient, since - * we hit fast paths in tqual.c. + * information. */ builder->xmin = running->oldestRunningXid; + builder->xmax = running->nextXid; + builder->snapshotcsn = (CommitSeqNo) lsn; - /* Remove transactions we don't need to keep track off anymore */ - SnapBuildPurgeCommittedTxn(builder); - - elog(DEBUG3, "xmin: %u, xmax: %u, oldestrunning: %u", - builder->xmin, builder->xmax, - running->oldestRunningXid); + elog(DEBUG3, "xmin: %u, xmax: %u", + builder->xmin, builder->xmax); + Assert(lsn != InvalidXLogRecPtr); /* - * Increase shared memory limits, so vacuum can work on tuples we - * prevented from being pruned till now. + * Increase shared memory limits, so vacuum can work on tuples we prevented + * from being pruned till now. */ LogicalIncreaseXminForSlot(lsn, running->oldestRunningXid); @@ -1148,12 +795,8 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact * beginning. That point is where we can restart from. */ - /* - * Can't know about a serialized snapshot's location if we're not - * consistent. - */ if (builder->state < SNAPBUILD_CONSISTENT) - return; + builder->state = SNAPBUILD_CONSISTENT; txn = ReorderBufferGetOldestTXN(builder->reorder); @@ -1163,780 +806,4 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact */ if (txn != NULL && txn->restart_decoding_lsn != InvalidXLogRecPtr) LogicalIncreaseRestartDecodingForSlot(lsn, txn->restart_decoding_lsn); - - /* - * No in-progress transaction, can reuse the last serialized snapshot if - * we have one. - */ - else if (txn == NULL && - builder->reorder->current_restart_decoding_lsn != InvalidXLogRecPtr && - builder->last_serialized_snapshot != InvalidXLogRecPtr) - LogicalIncreaseRestartDecodingForSlot(lsn, - builder->last_serialized_snapshot); -} - - -/* - * Build the start of a snapshot that's capable of decoding the catalog. - * - * Helper function for SnapBuildProcessRunningXacts() while we're not yet - * consistent. - * - * Returns true if there is a point in performing internal maintenance/cleanup - * using the xl_running_xacts record. - */ -static bool -SnapBuildFindSnapshot(SnapBuild *builder, XLogRecPtr lsn, xl_running_xacts *running) -{ - /* --- - * Build catalog decoding snapshot incrementally using information about - * the currently running transactions. There are several ways to do that: - * - * a) There were no running transactions when the xl_running_xacts record - * was inserted, jump to CONSISTENT immediately. We might find such a - * state while waiting on c)'s sub-states. - * - * b) This (in a previous run) or another decoding slot serialized a - * snapshot to disk that we can use. Can't use this method for the - * initial snapshot when slot is being created and needs full snapshot - * for export or direct use, as that snapshot will only contain catalog - * modifying transactions. - * - * c) First incrementally build a snapshot for catalog tuples - * (BUILDING_SNAPSHOT), that requires all, already in-progress, - * transactions to finish. Every transaction starting after that - * (FULL_SNAPSHOT state), has enough information to be decoded. But - * for older running transactions no viable snapshot exists yet, so - * CONSISTENT will only be reached once all of those have finished. - * --- - */ - - /* - * xl_running_xact record is older than what we can use, we might not have - * all necessary catalog rows anymore. - */ - if (TransactionIdIsNormal(builder->initial_xmin_horizon) && - NormalTransactionIdPrecedes(running->oldestRunningXid, - builder->initial_xmin_horizon)) - { - ereport(DEBUG1, - (errmsg_internal("skipping snapshot at %X/%X while building logical decoding snapshot, xmin horizon too low", - (uint32) (lsn >> 32), (uint32) lsn), - errdetail_internal("initial xmin horizon of %u vs the snapshot's %u", - builder->initial_xmin_horizon, running->oldestRunningXid))); - - - SnapBuildWaitSnapshot(running, builder->initial_xmin_horizon); - - return true; - } - - /* - * a) No transaction were running, we can jump to consistent. - * - * This is not affected by races around xl_running_xacts, because we can - * miss transaction commits, but currently not transactions starting. - * - * NB: We might have already started to incrementally assemble a snapshot, - * so we need to be careful to deal with that. - */ - if (running->oldestRunningXid == running->nextXid) - { - if (builder->start_decoding_at == InvalidXLogRecPtr || - builder->start_decoding_at <= lsn) - /* can decode everything after this */ - builder->start_decoding_at = lsn + 1; - - /* As no transactions were running xmin/xmax can be trivially set. */ - builder->xmin = running->nextXid; /* < are finished */ - builder->xmax = running->nextXid; /* >= are running */ - - /* so we can safely use the faster comparisons */ - Assert(TransactionIdIsNormal(builder->xmin)); - Assert(TransactionIdIsNormal(builder->xmax)); - - builder->state = SNAPBUILD_CONSISTENT; - SnapBuildStartNextPhaseAt(builder, InvalidTransactionId); - - ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - (uint32) (lsn >> 32), (uint32) lsn), - errdetail("There are no running transactions."))); - - return false; - } - /* b) valid on disk state and not building full snapshot */ - else if (!builder->building_full_snapshot && - SnapBuildRestore(builder, lsn)) - { - /* there won't be any state to cleanup */ - return false; - } - - /* - * c) transition from START to BUILDING_SNAPSHOT. - * - * In START state, and a xl_running_xacts record with running xacts is - * encountered. In that case, switch to BUILDING_SNAPSHOT state, and - * record xl_running_xacts->nextXid. Once all running xacts have finished - * (i.e. they're all >= nextXid), we have a complete catalog snapshot. It - * might look that we could use xl_running_xact's ->xids information to - * get there quicker, but that is problematic because transactions marked - * as running, might already have inserted their commit record - it's - * infeasible to change that with locking. - */ - else if (builder->state == SNAPBUILD_START) - { - builder->state = SNAPBUILD_BUILDING_SNAPSHOT; - SnapBuildStartNextPhaseAt(builder, running->nextXid); - - /* - * Start with an xmin/xmax that's correct for future, when all the - * currently running transactions have finished. We'll update both - * while waiting for the pending transactions to finish. - */ - builder->xmin = running->nextXid; /* < are finished */ - builder->xmax = running->nextXid; /* >= are running */ - - /* so we can safely use the faster comparisons */ - Assert(TransactionIdIsNormal(builder->xmin)); - Assert(TransactionIdIsNormal(builder->xmax)); - - ereport(LOG, - (errmsg("logical decoding found initial starting point at %X/%X", - (uint32) (lsn >> 32), (uint32) lsn), - errdetail("Waiting for transactions (approximately %d) older than %u to end.", - running->xcnt, running->nextXid))); - - SnapBuildWaitSnapshot(running, running->nextXid); - } - - /* - * c) transition from BUILDING_SNAPSHOT to FULL_SNAPSHOT. - * - * In BUILDING_SNAPSHOT state, and this xl_running_xacts' oldestRunningXid - * is >= than nextXid from when we switched to BUILDING_SNAPSHOT. This - * means all transactions starting afterwards have enough information to - * be decoded. Switch to FULL_SNAPSHOT. - */ - else if (builder->state == SNAPBUILD_BUILDING_SNAPSHOT && - TransactionIdPrecedesOrEquals(SnapBuildNextPhaseAt(builder), - running->oldestRunningXid)) - { - builder->state = SNAPBUILD_FULL_SNAPSHOT; - SnapBuildStartNextPhaseAt(builder, running->nextXid); - - ereport(LOG, - (errmsg("logical decoding found initial consistent point at %X/%X", - (uint32) (lsn >> 32), (uint32) lsn), - errdetail("Waiting for transactions (approximately %d) older than %u to end.", - running->xcnt, running->nextXid))); - - SnapBuildWaitSnapshot(running, running->nextXid); - } - - /* - * c) transition from FULL_SNAPSHOT to CONSISTENT. - * - * In FULL_SNAPSHOT state (see d) ), and this xl_running_xacts' - * oldestRunningXid is >= than nextXid from when we switched to - * FULL_SNAPSHOT. This means all transactions that are currently in - * progress have a catalog snapshot, and all their changes have been - * collected. Switch to CONSISTENT. - */ - else if (builder->state == SNAPBUILD_FULL_SNAPSHOT && - TransactionIdPrecedesOrEquals(SnapBuildNextPhaseAt(builder), - running->oldestRunningXid)) - { - builder->state = SNAPBUILD_CONSISTENT; - SnapBuildStartNextPhaseAt(builder, InvalidTransactionId); - - ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - (uint32) (lsn >> 32), (uint32) lsn), - errdetail("There are no old transactions anymore."))); - } - - /* - * We already started to track running xacts and need to wait for all - * in-progress ones to finish. We fall through to the normal processing of - * records so incremental cleanup can be performed. - */ - return true; - -} - -/* --- - * Iterate through xids in record, wait for all older than the cutoff to - * finish. Then, if possible, log a new xl_running_xacts record. - * - * This isn't required for the correctness of decoding, but to: - * a) allow isolationtester to notice that we're currently waiting for - * something. - * b) log a new xl_running_xacts record where it'd be helpful, without having - * to write for bgwriter or checkpointer. - * --- - */ -static void -SnapBuildWaitSnapshot(xl_running_xacts *running, TransactionId cutoff) -{ - int off; - - for (off = 0; off < running->xcnt; off++) - { - TransactionId xid = running->xids[off]; - - /* - * Upper layers should prevent that we ever need to wait on ourselves. - * Check anyway, since failing to do so would either result in an - * endless wait or an Assert() failure. - */ - if (TransactionIdIsCurrentTransactionId(xid)) - elog(ERROR, "waiting for ourselves"); - - if (TransactionIdFollows(xid, cutoff)) - continue; - - XactLockTableWait(xid, NULL, NULL, XLTW_None); - } - - /* - * All transactions we needed to finish finished - try to ensure there is - * another xl_running_xacts record in a timely manner, without having to - * write for bgwriter or checkpointer to log one. During recovery we - * can't enforce that, so we'll have to wait. - */ - if (!RecoveryInProgress()) - { - LogStandbySnapshot(); - } -} - -/* ----------------------------------- - * Snapshot serialization support - * ----------------------------------- - */ - -/* - * We store current state of struct SnapBuild on disk in the following manner: - * - * struct SnapBuildOnDisk; - * TransactionId * running.xcnt_space; - * TransactionId * committed.xcnt; (*not xcnt_space*) - * - */ -typedef struct SnapBuildOnDisk -{ - /* first part of this struct needs to be version independent */ - - /* data not covered by checksum */ - uint32 magic; - pg_crc32c checksum; - - /* data covered by checksum */ - - /* version, in case we want to support pg_upgrade */ - uint32 version; - /* how large is the on disk data, excluding the constant sized part */ - uint32 length; - - /* version dependent part */ - SnapBuild builder; - - /* variable amount of TransactionIds follows */ -} SnapBuildOnDisk; - -#define SnapBuildOnDiskConstantSize \ - offsetof(SnapBuildOnDisk, builder) -#define SnapBuildOnDiskNotChecksummedSize \ - offsetof(SnapBuildOnDisk, version) - -#define SNAPBUILD_MAGIC 0x51A1E001 -#define SNAPBUILD_VERSION 2 - -/* - * Store/Load a snapshot from disk, depending on the snapshot builder's state. - * - * Supposed to be used by external (i.e. not snapbuild.c) code that just read - * a record that's a potential location for a serialized snapshot. - */ -void -SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn) -{ - if (builder->state < SNAPBUILD_CONSISTENT) - SnapBuildRestore(builder, lsn); - else - SnapBuildSerialize(builder, lsn); -} - -/* - * Serialize the snapshot 'builder' at the location 'lsn' if it hasn't already - * been done by another decoding process. - */ -static void -SnapBuildSerialize(SnapBuild *builder, XLogRecPtr lsn) -{ - Size needed_length; - SnapBuildOnDisk *ondisk; - char *ondisk_c; - int fd; - char tmppath[MAXPGPATH]; - char path[MAXPGPATH]; - int ret; - struct stat stat_buf; - Size sz; - - Assert(lsn != InvalidXLogRecPtr); - Assert(builder->last_serialized_snapshot == InvalidXLogRecPtr || - builder->last_serialized_snapshot <= lsn); - - /* - * no point in serializing if we cannot continue to work immediately after - * restoring the snapshot - */ - if (builder->state < SNAPBUILD_CONSISTENT) - return; - - /* - * We identify snapshots by the LSN they are valid for. We don't need to - * include timelines in the name as each LSN maps to exactly one timeline - * unless the user used pg_resetwal or similar. If a user did so, there's - * no hope continuing to decode anyway. - */ - sprintf(path, "pg_logical/snapshots/%X-%X.snap", - (uint32) (lsn >> 32), (uint32) lsn); - - /* - * first check whether some other backend already has written the snapshot - * for this LSN. It's perfectly fine if there's none, so we accept ENOENT - * as a valid state. Everything else is an unexpected error. - */ - ret = stat(path, &stat_buf); - - if (ret != 0 && errno != ENOENT) - ereport(ERROR, - (errmsg("could not stat file \"%s\": %m", path))); - - else if (ret == 0) - { - /* - * somebody else has already serialized to this point, don't overwrite - * but remember location, so we don't need to read old data again. - * - * To be sure it has been synced to disk after the rename() from the - * tempfile filename to the real filename, we just repeat the fsync. - * That ought to be cheap because in most scenarios it should already - * be safely on disk. - */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); - - builder->last_serialized_snapshot = lsn; - goto out; - } - - /* - * there is an obvious race condition here between the time we stat(2) the - * file and us writing the file. But we rename the file into place - * atomically and all files created need to contain the same data anyway, - * so this is perfectly fine, although a bit of a resource waste. Locking - * seems like pointless complication. - */ - elog(DEBUG1, "serializing snapshot to %s", path); - - /* to make sure only we will write to this tempfile, include pid */ - sprintf(tmppath, "pg_logical/snapshots/%X-%X.snap.%u.tmp", - (uint32) (lsn >> 32), (uint32) lsn, MyProcPid); - - /* - * Unlink temporary file if it already exists, needs to have been before a - * crash/error since we won't enter this function twice from within a - * single decoding slot/backend and the temporary file contains the pid of - * the current process. - */ - if (unlink(tmppath) != 0 && errno != ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", path))); - - needed_length = sizeof(SnapBuildOnDisk) + - sizeof(TransactionId) * builder->committed.xcnt; - - ondisk_c = MemoryContextAllocZero(builder->context, needed_length); - ondisk = (SnapBuildOnDisk *) ondisk_c; - ondisk->magic = SNAPBUILD_MAGIC; - ondisk->version = SNAPBUILD_VERSION; - ondisk->length = needed_length; - INIT_CRC32C(ondisk->checksum); - COMP_CRC32C(ondisk->checksum, - ((char *) ondisk) + SnapBuildOnDiskNotChecksummedSize, - SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); - ondisk_c += sizeof(SnapBuildOnDisk); - - memcpy(&ondisk->builder, builder, sizeof(SnapBuild)); - /* NULL-ify memory-only data */ - ondisk->builder.context = NULL; - ondisk->builder.snapshot = NULL; - ondisk->builder.reorder = NULL; - ondisk->builder.committed.xip = NULL; - - COMP_CRC32C(ondisk->checksum, - &ondisk->builder, - sizeof(SnapBuild)); - - /* there shouldn't be any running xacts */ - Assert(builder->was_running.was_xcnt == 0); - - /* copy committed xacts */ - sz = sizeof(TransactionId) * builder->committed.xcnt; - memcpy(ondisk_c, builder->committed.xip, sz); - COMP_CRC32C(ondisk->checksum, ondisk_c, sz); - ondisk_c += sz; - - FIN_CRC32C(ondisk->checksum); - - /* we have valid data now, open tempfile and write it there */ - fd = OpenTransientFile(tmppath, - O_CREAT | O_EXCL | O_WRONLY | PG_BINARY); - if (fd < 0) - ereport(ERROR, - (errmsg("could not open file \"%s\": %m", path))); - - pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_WRITE); - if ((write(fd, ondisk, needed_length)) != needed_length) - { - CloseTransientFile(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not write to file \"%s\": %m", tmppath))); - } - pgstat_report_wait_end(); - - /* - * fsync the file before renaming so that even if we crash after this we - * have either a fully valid file or nothing. - * - * TODO: Do the fsync() via checkpoints/restartpoints, doing it here has - * some noticeable overhead since it's performed synchronously during - * decoding? - */ - pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_SYNC); - if (pg_fsync(fd) != 0) - { - CloseTransientFile(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not fsync file \"%s\": %m", tmppath))); - } - pgstat_report_wait_end(); - CloseTransientFile(fd); - - fsync_fname("pg_logical/snapshots", true); - - /* - * We may overwrite the work from some other backend, but that's ok, our - * snapshot is valid as well, we'll just have done some superfluous work. - */ - if (rename(tmppath, path) != 0) - { - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not rename file \"%s\" to \"%s\": %m", - tmppath, path))); - } - - /* make sure we persist */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); - - /* - * Now there's no way we can loose the dumped state anymore, remember this - * as a serialization point. - */ - builder->last_serialized_snapshot = lsn; - -out: - ReorderBufferSetRestartPoint(builder->reorder, - builder->last_serialized_snapshot); -} - -/* - * Restore a snapshot into 'builder' if previously one has been stored at the - * location indicated by 'lsn'. Returns true if successful, false otherwise. - */ -static bool -SnapBuildRestore(SnapBuild *builder, XLogRecPtr lsn) -{ - SnapBuildOnDisk ondisk; - int fd; - char path[MAXPGPATH]; - Size sz; - int readBytes; - pg_crc32c checksum; - - /* no point in loading a snapshot if we're already there */ - if (builder->state == SNAPBUILD_CONSISTENT) - return false; - - sprintf(path, "pg_logical/snapshots/%X-%X.snap", - (uint32) (lsn >> 32), (uint32) lsn); - - fd = OpenTransientFile(path, O_RDONLY | PG_BINARY); - - if (fd < 0 && errno == ENOENT) - return false; - else if (fd < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", path))); - - /* ---- - * Make sure the snapshot had been stored safely to disk, that's normally - * cheap. - * Note that we do not need PANIC here, nobody will be able to use the - * slot without fsyncing, and saving it won't succeed without an fsync() - * either... - * ---- - */ - fsync_fname(path, false); - fsync_fname("pg_logical/snapshots", true); - - - /* read statically sized portion of snapshot */ - pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); - readBytes = read(fd, &ondisk, SnapBuildOnDiskConstantSize); - pgstat_report_wait_end(); - if (readBytes != SnapBuildOnDiskConstantSize) - { - CloseTransientFile(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\", read %d of %d: %m", - path, readBytes, (int) SnapBuildOnDiskConstantSize))); - } - - if (ondisk.magic != SNAPBUILD_MAGIC) - ereport(ERROR, - (errmsg("snapbuild state file \"%s\" has wrong magic number: %u instead of %u", - path, ondisk.magic, SNAPBUILD_MAGIC))); - - if (ondisk.version != SNAPBUILD_VERSION) - ereport(ERROR, - (errmsg("snapbuild state file \"%s\" has unsupported version: %u instead of %u", - path, ondisk.version, SNAPBUILD_VERSION))); - - INIT_CRC32C(checksum); - COMP_CRC32C(checksum, - ((char *) &ondisk) + SnapBuildOnDiskNotChecksummedSize, - SnapBuildOnDiskConstantSize - SnapBuildOnDiskNotChecksummedSize); - - /* read SnapBuild */ - pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); - readBytes = read(fd, &ondisk.builder, sizeof(SnapBuild)); - pgstat_report_wait_end(); - if (readBytes != sizeof(SnapBuild)) - { - CloseTransientFile(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\", read %d of %d: %m", - path, readBytes, (int) sizeof(SnapBuild)))); - } - COMP_CRC32C(checksum, &ondisk.builder, sizeof(SnapBuild)); - - /* restore running xacts (dead, but kept for backward compat) */ - sz = sizeof(TransactionId) * ondisk.builder.was_running.was_xcnt_space; - ondisk.builder.was_running.was_xip = - MemoryContextAllocZero(builder->context, sz); - pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); - readBytes = read(fd, ondisk.builder.was_running.was_xip, sz); - pgstat_report_wait_end(); - if (readBytes != sz) - { - CloseTransientFile(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\", read %d of %d: %m", - path, readBytes, (int) sz))); - } - COMP_CRC32C(checksum, ondisk.builder.was_running.was_xip, sz); - - /* restore committed xacts information */ - sz = sizeof(TransactionId) * ondisk.builder.committed.xcnt; - ondisk.builder.committed.xip = MemoryContextAllocZero(builder->context, sz); - pgstat_report_wait_start(WAIT_EVENT_SNAPBUILD_READ); - readBytes = read(fd, ondisk.builder.committed.xip, sz); - pgstat_report_wait_end(); - if (readBytes != sz) - { - CloseTransientFile(fd); - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read file \"%s\", read %d of %d: %m", - path, readBytes, (int) sz))); - } - COMP_CRC32C(checksum, ondisk.builder.committed.xip, sz); - - CloseTransientFile(fd); - - FIN_CRC32C(checksum); - - /* verify checksum of what we've read */ - if (!EQ_CRC32C(checksum, ondisk.checksum)) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("checksum mismatch for snapbuild state file \"%s\": is %u, should be %u", - path, checksum, ondisk.checksum))); - - /* - * ok, we now have a sensible snapshot here, figure out if it has more - * information than we have. - */ - - /* - * We are only interested in consistent snapshots for now, comparing - * whether one incomplete snapshot is more "advanced" seems to be - * unnecessarily complex. - */ - if (ondisk.builder.state < SNAPBUILD_CONSISTENT) - goto snapshot_not_interesting; - - /* - * Don't use a snapshot that requires an xmin that we cannot guarantee to - * be available. - */ - if (TransactionIdPrecedes(ondisk.builder.xmin, builder->initial_xmin_horizon)) - goto snapshot_not_interesting; - - - /* ok, we think the snapshot is sensible, copy over everything important */ - builder->xmin = ondisk.builder.xmin; - builder->xmax = ondisk.builder.xmax; - builder->state = ondisk.builder.state; - - builder->committed.xcnt = ondisk.builder.committed.xcnt; - /* We only allocated/stored xcnt, not xcnt_space xids ! */ - /* don't overwrite preallocated xip, if we don't have anything here */ - if (builder->committed.xcnt > 0) - { - pfree(builder->committed.xip); - builder->committed.xcnt_space = ondisk.builder.committed.xcnt; - builder->committed.xip = ondisk.builder.committed.xip; - } - ondisk.builder.committed.xip = NULL; - - /* our snapshot is not interesting anymore, build a new one */ - if (builder->snapshot != NULL) - { - SnapBuildSnapDecRefcount(builder->snapshot); - } - builder->snapshot = SnapBuildBuildSnapshot(builder); - SnapBuildSnapIncRefcount(builder->snapshot); - - ReorderBufferSetRestartPoint(builder->reorder, lsn); - - Assert(builder->state == SNAPBUILD_CONSISTENT); - - ereport(LOG, - (errmsg("logical decoding found consistent point at %X/%X", - (uint32) (lsn >> 32), (uint32) lsn), - errdetail("Logical decoding will begin using saved snapshot."))); - return true; - -snapshot_not_interesting: - if (ondisk.builder.committed.xip != NULL) - pfree(ondisk.builder.committed.xip); - return false; -} - -/* - * Remove all serialized snapshots that are not required anymore because no - * slot can need them. This doesn't actually have to run during a checkpoint, - * but it's a convenient point to schedule this. - * - * NB: We run this during checkpoints even if logical decoding is disabled so - * we cleanup old slots at some point after it got disabled. - */ -void -CheckPointSnapBuild(void) -{ - XLogRecPtr cutoff; - XLogRecPtr redo; - DIR *snap_dir; - struct dirent *snap_de; - char path[MAXPGPATH + 21]; - - /* - * We start off with a minimum of the last redo pointer. No new - * replication slot will start before that, so that's a safe upper bound - * for removal. - */ - redo = GetRedoRecPtr(); - - /* now check for the restart ptrs from existing slots */ - cutoff = ReplicationSlotsComputeLogicalRestartLSN(); - - /* don't start earlier than the restart lsn */ - if (redo < cutoff) - cutoff = redo; - - snap_dir = AllocateDir("pg_logical/snapshots"); - while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL) - { - uint32 hi; - uint32 lo; - XLogRecPtr lsn; - struct stat statbuf; - - if (strcmp(snap_de->d_name, ".") == 0 || - strcmp(snap_de->d_name, "..") == 0) - continue; - - snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name); - - if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode)) - { - elog(DEBUG1, "only regular files expected: %s", path); - continue; - } - - /* - * temporary filenames from SnapBuildSerialize() include the LSN and - * everything but are postfixed by .$pid.tmp. We can just remove them - * the same as other files because there can be none that are - * currently being written that are older than cutoff. - * - * We just log a message if a file doesn't fit the pattern, it's - * probably some editors lock/state file or similar... - */ - if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2) - { - ereport(LOG, - (errmsg("could not parse file name \"%s\"", path))); - continue; - } - - lsn = ((uint64) hi) << 32 | lo; - - /* check whether we still need it */ - if (lsn < cutoff || cutoff == InvalidXLogRecPtr) - { - elog(DEBUG1, "removing snapbuild snapshot %s", path); - - /* - * It's not particularly harmful, though strange, if we can't - * remove the file here. Don't prevent the checkpoint from - * completing, that'd be a cure worse than the disease. - */ - if (unlink(path) < 0) - { - ereport(LOG, - (errcode_for_file_access(), - errmsg("could not remove file \"%s\": %m", - path))); - continue; - } - } - } - FreeDir(snap_dir); } diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 2d1ed143e0..4e9f14090f 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -16,10 +16,10 @@ #include "access/clog.h" #include "access/commit_ts.h" +#include "access/csnlog.h" #include "access/heapam.h" #include "access/multixact.h" #include "access/nbtree.h" -#include "access/subtrans.h" #include "access/twophase.h" #include "commands/async.h" #include "miscadmin.h" @@ -127,8 +127,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) size = add_size(size, ProcGlobalShmemSize()); size = add_size(size, XLOGShmemSize()); size = add_size(size, CLOGShmemSize()); + size = add_size(size, CSNLOGShmemSize()); size = add_size(size, CommitTsShmemSize()); - size = add_size(size, SUBTRANSShmemSize()); size = add_size(size, TwoPhaseShmemSize()); size = add_size(size, BackgroundWorkerShmemSize()); size = add_size(size, MultiXactShmemSize()); @@ -219,8 +219,8 @@ CreateSharedMemoryAndSemaphores(bool makePrivate, int port) */ XLOGShmemInit(); CLOGShmemInit(); + CSNLOGShmemInit(); CommitTsShmemInit(); - SUBTRANSShmemInit(); MultiXactShmemInit(); InitBufferPool(); diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 37e12bd829..71a3997e21 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -13,24 +13,14 @@ * See notes in src/backend/access/transam/README. * * The process arrays now also include structures representing prepared - * transactions. The xid and subxids fields of these are valid, as are the + * transactions. The xid fields of these are valid, as are the * myProcLocks lists. They can be distinguished from regular backend PGPROCs * at need by checking for pid == 0. * - * During hot standby, we also keep a list of XIDs representing transactions - * that are known to be running in the master (or more precisely, were running - * as of the current point in the WAL stream). This list is kept in the - * KnownAssignedXids array, and is updated by watching the sequence of - * arriving XIDs. This is necessary because if we leave those XIDs out of - * snapshots taken for standby queries, then they will appear to be already - * complete, leading to MVCC failures. Note that in hot standby, the PGPROC - * array represents standby processes, which by definition are not running - * transactions that have XIDs. - * - * It is perhaps possible for a backend on the master to terminate without - * writing an abort record for its transaction. While that shouldn't really - * happen, it would tie up KnownAssignedXids indefinitely, so we protect - * ourselves by pruning the array when a valid list of running XIDs arrives. + * During hot standby, we update latestCompletedXid, oldestActiveXid, and + * latestObservedXid, as we replay transaction commit/abort and standby WAL + * records. Note that in hot standby, the PGPROC array represents standby + * processes, which by definition are not running transactions that have XIDs. * * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -46,7 +36,8 @@ #include #include "access/clog.h" -#include "access/subtrans.h" +#include "access/csnlog.h" +#include "access/mvccvars.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" @@ -68,24 +59,6 @@ typedef struct ProcArrayStruct int numProcs; /* number of valid procs entries */ int maxProcs; /* allocated size of procs array */ - /* - * Known assigned XIDs handling - */ - int maxKnownAssignedXids; /* allocated size of array */ - int numKnownAssignedXids; /* current # of valid entries */ - int tailKnownAssignedXids; /* index of oldest valid element */ - int headKnownAssignedXids; /* index of newest element, + 1 */ - slock_t known_assigned_xids_lck; /* protects head/tail pointers */ - - /* - * Highest subxid that has been removed from KnownAssignedXids array to - * prevent overflow; or InvalidTransactionId if none. We track this for - * similar reasons to tracking overflowing cached subxids in PGXACT - * entries. Must hold exclusive ProcArrayLock to change this, and shared - * lock to read it. - */ - TransactionId lastOverflowedXid; - /* oldest xmin of any replication slot */ TransactionId replication_slot_xmin; /* oldest catalog xmin of any replication slot */ @@ -101,76 +74,23 @@ static PGPROC *allProcs; static PGXACT *allPgXact; /* - * Bookkeeping for tracking emulated transactions in recovery + * Cached values for GetRecentGlobalXmin(). + * + * RecentGlobalXmin and RecentGlobalDataXmin are initialized to + * InvalidTransactionId, to ensure that no one tries to use a stale + * value. Readers should ensure that it has been set to something else + * before using it. */ -static TransactionId *KnownAssignedXids; -static bool *KnownAssignedXidsValid; -static TransactionId latestObservedXid = InvalidTransactionId; +static int XminCacheResetCounter = 0; +static TransactionId RecentGlobalXmin = InvalidTransactionId; +static TransactionId RecentGlobalDataXmin = InvalidTransactionId; /* - * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is - * the highest xid that might still be running that we don't have in - * KnownAssignedXids. + * Bookkeeping for tracking transactions in recovery */ -static TransactionId standbySnapshotPendingXmin; - -#ifdef XIDCACHE_DEBUG - -/* counters for XidCache measurement */ -static long xc_by_recent_xmin = 0; -static long xc_by_known_xact = 0; -static long xc_by_my_xact = 0; -static long xc_by_latest_xid = 0; -static long xc_by_main_xid = 0; -static long xc_by_child_xid = 0; -static long xc_by_known_assigned = 0; -static long xc_no_overflow = 0; -static long xc_slow_answer = 0; - -#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++) -#define xc_by_known_xact_inc() (xc_by_known_xact++) -#define xc_by_my_xact_inc() (xc_by_my_xact++) -#define xc_by_latest_xid_inc() (xc_by_latest_xid++) -#define xc_by_main_xid_inc() (xc_by_main_xid++) -#define xc_by_child_xid_inc() (xc_by_child_xid++) -#define xc_by_known_assigned_inc() (xc_by_known_assigned++) -#define xc_no_overflow_inc() (xc_no_overflow++) -#define xc_slow_answer_inc() (xc_slow_answer++) - -static void DisplayXidCache(void); -#else /* !XIDCACHE_DEBUG */ - -#define xc_by_recent_xmin_inc() ((void) 0) -#define xc_by_known_xact_inc() ((void) 0) -#define xc_by_my_xact_inc() ((void) 0) -#define xc_by_latest_xid_inc() ((void) 0) -#define xc_by_main_xid_inc() ((void) 0) -#define xc_by_child_xid_inc() ((void) 0) -#define xc_by_known_assigned_inc() ((void) 0) -#define xc_no_overflow_inc() ((void) 0) -#define xc_slow_answer_inc() ((void) 0) -#endif /* XIDCACHE_DEBUG */ - -/* Primitives for KnownAssignedXids array handling for standby */ -static void KnownAssignedXidsCompress(bool force); -static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, - bool exclusive_lock); -static bool KnownAssignedXidsSearch(TransactionId xid, bool remove); -static bool KnownAssignedXidExists(TransactionId xid); -static void KnownAssignedXidsRemove(TransactionId xid); -static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, - TransactionId *subxids); -static void KnownAssignedXidsRemovePreceding(TransactionId xid); -static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax); -static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, - TransactionId *xmin, - TransactionId xmax); -static TransactionId KnownAssignedXidsGetOldestXmin(void); -static void KnownAssignedXidsDisplay(int trace_level); -static void KnownAssignedXidsReset(void); -static inline void ProcArrayEndTransactionInternal(PGPROC *proc, - PGXACT *pgxact, TransactionId latestXid); -static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); +static TransactionId latestObservedXid = InvalidTransactionId; + +static void AdvanceOldestActiveXid(TransactionId myXid); /* * Report shared-memory space needed by CreateSharedProcArray. @@ -186,31 +106,6 @@ ProcArrayShmemSize(void) size = offsetof(ProcArrayStruct, pgprocnos); size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS)); - /* - * During Hot Standby processing we have a data structure called - * KnownAssignedXids, created in shared memory. Local data structures are - * also created in various backends during GetSnapshotData(), - * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the - * main structures created in those functions must be identically sized, - * since we may at times copy the whole of the data structures around. We - * refer to this size as TOTAL_MAX_CACHED_SUBXIDS. - * - * Ideally we'd only create this structure if we were actually doing hot - * standby in the current run, but we don't know that yet at the time - * shared memory is being set up. - */ -#define TOTAL_MAX_CACHED_SUBXIDS \ - ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) - - if (EnableHotStandby) - { - size = add_size(size, - mul_size(sizeof(TransactionId), - TOTAL_MAX_CACHED_SUBXIDS)); - size = add_size(size, - mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS)); - } - return size; } @@ -237,12 +132,6 @@ CreateSharedProcArray(void) */ procArray->numProcs = 0; procArray->maxProcs = PROCARRAY_MAXPROCS; - procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS; - procArray->numKnownAssignedXids = 0; - procArray->tailKnownAssignedXids = 0; - procArray->headKnownAssignedXids = 0; - SpinLockInit(&procArray->known_assigned_xids_lck); - procArray->lastOverflowedXid = InvalidTransactionId; procArray->replication_slot_xmin = InvalidTransactionId; procArray->replication_slot_catalog_xmin = InvalidTransactionId; } @@ -250,20 +139,6 @@ CreateSharedProcArray(void) allProcs = ProcGlobal->allProcs; allPgXact = ProcGlobal->allPgXact; - /* Create or attach to the KnownAssignedXids arrays too, if needed */ - if (EnableHotStandby) - { - KnownAssignedXids = (TransactionId *) - ShmemInitStruct("KnownAssignedXids", - mul_size(sizeof(TransactionId), - TOTAL_MAX_CACHED_SUBXIDS), - &found); - KnownAssignedXidsValid = (bool *) - ShmemInitStruct("KnownAssignedXidsValid", - mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS), - &found); - } - /* Register and initialize fields of ProcLWLockTranche */ LWLockRegisterTranche(LWTRANCHE_PROC, "proc"); } @@ -321,43 +196,15 @@ ProcArrayAdd(PGPROC *proc) /* * Remove the specified PGPROC from the shared array. - * - * When latestXid is a valid XID, we are removing a live 2PC gxact from the - * array, and thus causing it to appear as "not running" anymore. In this - * case we must advance latestCompletedXid. (This is essentially the same - * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take - * the ProcArrayLock only once, and don't damage the content of the PGPROC; - * twophase.c depends on the latter.) */ void -ProcArrayRemove(PGPROC *proc, TransactionId latestXid) +ProcArrayRemove(PGPROC *proc) { ProcArrayStruct *arrayP = procArray; int index; -#ifdef XIDCACHE_DEBUG - /* dump stats at backend shutdown, but not prepared-xact end */ - if (proc->pid != 0) - DisplayXidCache(); -#endif - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - if (TransactionIdIsValid(latestXid)) - { - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); - - /* Advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; - } - else - { - /* Shouldn't be trying to remove a live transaction here */ - Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); - } - for (index = 0; index < arrayP->numProcs; index++) { if (arrayP->pgprocnos[index] == proc->pgprocno) @@ -378,6 +225,15 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) elog(LOG, "failed to find proc %p in ProcArray", proc); } +static void resetGlobalXminCache(void) +{ + if (++XminCacheResetCounter == 13) + { + XminCacheResetCounter = 0; + RecentGlobalXmin = InvalidTransactionId; + RecentGlobalDataXmin = InvalidTransactionId; + } +} /* * ProcArrayEndTransaction -- mark a transaction as no longer running @@ -386,211 +242,49 @@ ProcArrayRemove(PGPROC *proc, TransactionId latestXid) * commit/abort must already be reported to WAL and pg_xact. * * proc is currently always MyProc, but we pass it explicitly for flexibility. - * latestXid is the latest Xid among the transaction's main XID and - * subtransactions, or InvalidTransactionId if it has no XID. (We must ask - * the caller to pass latestXid, instead of computing it from the PGPROC's - * contents, because the subxid information in the PGPROC might be - * incomplete.) */ void -ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) +ProcArrayEndTransaction(PGPROC *proc) { PGXACT *pgxact = &allPgXact[proc->pgprocno]; + TransactionId myXid; - if (TransactionIdIsValid(latestXid)) - { - /* - * We must lock ProcArrayLock while clearing our advertised XID, so - * that we do not exit the set of "running" transactions while someone - * else is taking a snapshot. See discussion in - * src/backend/access/transam/README. - */ - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); - - /* - * If we can immediately acquire ProcArrayLock, we clear our own XID - * and release the lock. If not, use group XID clearing to improve - * efficiency. - */ - if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE)) - { - ProcArrayEndTransactionInternal(proc, pgxact, latestXid); - LWLockRelease(ProcArrayLock); - } - else - ProcArrayGroupClearXid(proc, latestXid); - } - else - { - /* - * If we have no XID, we don't need to lock, since we won't affect - * anyone else's calculation of a snapshot. We might change their - * estimate of global xmin, but that's OK. - */ - Assert(!TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); - - proc->lxid = InvalidLocalTransactionId; - pgxact->xmin = InvalidTransactionId; - /* must be cleared with xid/xmin: */ - pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; - pgxact->delayChkpt = false; /* be sure this is cleared in abort */ - proc->recoveryConflictPending = false; + myXid = pgxact->xid; - Assert(pgxact->nxids == 0); - Assert(pgxact->overflowed == false); - } -} - -/* - * Mark a write transaction as no longer running. - * - * We don't do any locking here; caller must handle that. - */ -static inline void -ProcArrayEndTransactionInternal(PGPROC *proc, PGXACT *pgxact, - TransactionId latestXid) -{ + /* A shared lock is enough to modify our own fields */ + LWLockAcquire(ProcArrayLock, LW_SHARED); pgxact->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; pgxact->xmin = InvalidTransactionId; - /* must be cleared with xid/xmin: */ + pgxact->snapshotcsn = InvalidCommitSeqNo; + /* must be cleared with xid/xmin/snapshotcsn: */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; pgxact->delayChkpt = false; /* be sure this is cleared in abort */ proc->recoveryConflictPending = false; - /* Clear the subtransaction-XID cache too while holding the lock */ - pgxact->nxids = 0; - pgxact->overflowed = false; + LWLockRelease(ProcArrayLock); + + /* If we were the oldest active XID, advance oldestXid */ + if (TransactionIdIsValid(myXid)) + AdvanceOldestActiveXid(myXid); - /* Also advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; + /* Reset cached variables */ + resetGlobalXminCache(); } -/* - * ProcArrayGroupClearXid -- group XID clearing - * - * When we cannot immediately acquire ProcArrayLock in exclusive mode at - * commit time, add ourselves to a list of processes that need their XIDs - * cleared. The first process to add itself to the list will acquire - * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal - * on behalf of all group members. This avoids a great deal of contention - * around ProcArrayLock when many processes are trying to commit at once, - * since the lock need not be repeatedly handed off from one committing - * process to the next. - */ -static void -ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) +void +ProcArrayResetXmin(PGPROC *proc) { - volatile PROC_HDR *procglobal = ProcGlobal; - uint32 nextidx; - uint32 wakeidx; - - /* We should definitely have an XID to clear. */ - Assert(TransactionIdIsValid(allPgXact[proc->pgprocno].xid)); - - /* Add ourselves to the list of processes needing a group XID clear. */ - proc->procArrayGroupMember = true; - proc->procArrayGroupMemberXid = latestXid; - while (true) - { - nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst); - pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx); - - if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst, - &nextidx, - (uint32) proc->pgprocno)) - break; - } - - /* - * If the list was not empty, the leader will clear our XID. It is - * impossible to have followers without a leader because the first process - * that has added itself to the list will always have nextidx as - * INVALID_PGPROCNO. - */ - if (nextidx != INVALID_PGPROCNO) - { - int extraWaits = 0; - - /* Sleep until the leader clears our XID. */ - pgstat_report_wait_start(WAIT_EVENT_PROCARRAY_GROUP_UPDATE); - for (;;) - { - /* acts as a read barrier */ - PGSemaphoreLock(proc->sem); - if (!proc->procArrayGroupMember) - break; - extraWaits++; - } - pgstat_report_wait_end(); - - Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO); - - /* Fix semaphore count for any absorbed wakeups */ - while (extraWaits-- > 0) - PGSemaphoreUnlock(proc->sem); - return; - } - - /* We are the leader. Acquire the lock on behalf of everyone. */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - /* - * Now that we've got the lock, clear the list of processes waiting for - * group XID clearing, saving a pointer to the head of the list. Trying - * to pop elements one at a time could lead to an ABA problem. - */ - while (true) - { - nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst); - if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst, - &nextidx, - INVALID_PGPROCNO)) - break; - } - - /* Remember head of list so we can perform wakeups after dropping lock. */ - wakeidx = nextidx; - - /* Walk the list and clear all XIDs. */ - while (nextidx != INVALID_PGPROCNO) - { - PGPROC *proc = &allProcs[nextidx]; - PGXACT *pgxact = &allPgXact[nextidx]; - - ProcArrayEndTransactionInternal(proc, pgxact, proc->procArrayGroupMemberXid); - - /* Move to next proc in list. */ - nextidx = pg_atomic_read_u32(&proc->procArrayGroupNext); - } - - /* We're done with the lock now. */ - LWLockRelease(ProcArrayLock); + PGXACT *pgxact = &allPgXact[proc->pgprocno]; /* - * Now that we've released the lock, go back and wake everybody up. We - * don't do this under the lock so as to keep lock hold times to a - * minimum. The system calls we need to perform to wake other processes - * up are probably much slower than the simple memory writes we did while - * holding the lock. + * Note we can do this without locking because we assume that storing an Xid + * is atomic. */ - while (wakeidx != INVALID_PGPROCNO) - { - PGPROC *proc = &allProcs[wakeidx]; - - wakeidx = pg_atomic_read_u32(&proc->procArrayGroupNext); - pg_atomic_write_u32(&proc->procArrayGroupNext, INVALID_PGPROCNO); - - /* ensure all previous writes are visible before follower continues. */ - pg_write_barrier(); - - proc->procArrayGroupMember = false; + pgxact->xmin = InvalidTransactionId; - if (proc != MyProc) - PGSemaphoreUnlock(proc->sem); - } + /* Reset cached variables */ + resetGlobalXminCache(); } /* @@ -615,38 +309,47 @@ ProcArrayClearTransaction(PGPROC *proc) pgxact->xid = InvalidTransactionId; proc->lxid = InvalidLocalTransactionId; pgxact->xmin = InvalidTransactionId; + pgxact->snapshotcsn = InvalidCommitSeqNo; proc->recoveryConflictPending = false; /* redundant, but just in case */ pgxact->vacuumFlags &= ~PROC_VACUUM_STATE_MASK; pgxact->delayChkpt = false; - /* Clear the subtransaction-XID cache too */ - pgxact->nxids = 0; - pgxact->overflowed = false; + /* + * We don't need to update oldestActiveXid, because the gxact entry in + * the procarray is still running with the same XID. + */ + + /* Reset cached variables */ + RecentGlobalXmin = InvalidTransactionId; + RecentGlobalDataXmin = InvalidTransactionId; } /* * ProcArrayInitRecovery -- initialize recovery xid mgmt environment * - * Remember up to where the startup process initialized the CLOG and subtrans + * Remember up to where the startup process initialized the CLOG and CSNLOG * so we can ensure it's initialized gaplessly up to the point where necessary * while in recovery. */ void -ProcArrayInitRecovery(TransactionId initializedUptoXID) +ProcArrayInitRecovery(TransactionId oldestActiveXID, TransactionId initializedUptoXID) { Assert(standbyState == STANDBY_INITIALIZED); Assert(TransactionIdIsNormal(initializedUptoXID)); /* - * we set latestObservedXid to the xid SUBTRANS has been initialized up + * we set latestObservedXid to the xid SUBTRANS (XXX csnlog?) has been initialized up * to, so we can extend it from that point onwards in * RecordKnownAssignedTransactionIds, and when we get consistent in * ProcArrayApplyRecoveryInfo(). */ latestObservedXid = initializedUptoXID; TransactionIdRetreat(latestObservedXid); + + /* also initialize oldestActiveXid */ + pg_atomic_write_u32(&ShmemVariableCache->oldestActiveXid, oldestActiveXID); } /* @@ -667,20 +370,11 @@ ProcArrayInitRecovery(TransactionId initializedUptoXID) void ProcArrayApplyRecoveryInfo(RunningTransactions running) { - TransactionId *xids; - int nxids; TransactionId nextXid; - int i; Assert(standbyState >= STANDBY_INITIALIZED); Assert(TransactionIdIsValid(running->nextXid)); Assert(TransactionIdIsValid(running->oldestRunningXid)); - Assert(TransactionIdIsNormal(running->latestCompletedXid)); - - /* - * Remove stale transactions, if any. - */ - ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid); /* * Remove stale locks, if any. @@ -688,7 +382,7 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) * Locks are always assigned to the toplevel xid so we don't need to care * about subxcnt/subxids (and by extension not about ->suboverflowed). */ - StandbyReleaseOldLocks(running->xcnt, running->xids); + StandbyReleaseOldLocks(running->oldestRunningXid); /* * If our snapshot is already valid, nothing else to do... @@ -696,51 +390,6 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) if (standbyState == STANDBY_SNAPSHOT_READY) return; - /* - * If our initial RunningTransactionsData had an overflowed snapshot then - * we knew we were missing some subxids from our snapshot. If we continue - * to see overflowed snapshots then we might never be able to start up, so - * we make another test to see if our snapshot is now valid. We know that - * the missing subxids are equal to or earlier than nextXid. After we - * initialise we continue to apply changes during recovery, so once the - * oldestRunningXid is later than the nextXid from the initial snapshot we - * know that we no longer have missing information and can mark the - * snapshot as valid. - */ - if (standbyState == STANDBY_SNAPSHOT_PENDING) - { - /* - * If the snapshot isn't overflowed or if its empty we can reset our - * pending state and use this snapshot instead. - */ - if (!running->subxid_overflow || running->xcnt == 0) - { - /* - * If we have already collected known assigned xids, we need to - * throw them away before we apply the recovery snapshot. - */ - KnownAssignedXidsReset(); - standbyState = STANDBY_INITIALIZED; - } - else - { - if (TransactionIdPrecedes(standbySnapshotPendingXmin, - running->oldestRunningXid)) - { - standbyState = STANDBY_SNAPSHOT_READY; - elog(trace_recovery(DEBUG1), - "recovery snapshots are now enabled"); - } - else - elog(trace_recovery(DEBUG1), - "recovery snapshot waiting for non-overflowed snapshot or " - "until oldest active xid on standby is at least %u (now %u)", - standbySnapshotPendingXmin, - running->oldestRunningXid); - return; - } - } - Assert(standbyState == STANDBY_INITIALIZED); /* @@ -751,78 +400,10 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) */ /* - * Nobody else is running yet, but take locks anyhow - */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - /* - * KnownAssignedXids is sorted so we cannot just add the xids, we have to - * sort them first. - * - * Some of the new xids are top-level xids and some are subtransactions. - * We don't call SubtransSetParent because it doesn't matter yet. If we - * aren't overflowed then all xids will fit in snapshot and so we don't - * need subtrans. If we later overflow, an xid assignment record will add - * xids to subtrans. If RunningXacts is overflowed then we don't have - * enough information to correctly update subtrans anyway. - */ - - /* - * Allocate a temporary array to avoid modifying the array passed as - * argument. - */ - xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt)); - - /* - * Add to the temp array any xids which have not already completed. - */ - nxids = 0; - for (i = 0; i < running->xcnt + running->subxcnt; i++) - { - TransactionId xid = running->xids[i]; - - /* - * The running-xacts snapshot can contain xids that were still visible - * in the procarray when the snapshot was taken, but were already - * WAL-logged as completed. They're not running anymore, so ignore - * them. - */ - if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) - continue; - - xids[nxids++] = xid; - } - - if (nxids > 0) - { - if (procArray->numKnownAssignedXids != 0) - { - LWLockRelease(ProcArrayLock); - elog(ERROR, "KnownAssignedXids is not empty"); - } - - /* - * Sort the array so that we can add them safely into - * KnownAssignedXids. - */ - qsort(xids, nxids, sizeof(TransactionId), xidComparator); - - /* - * Add the sorted snapshot into KnownAssignedXids - */ - for (i = 0; i < nxids; i++) - KnownAssignedXidsAdd(xids[i], xids[i], true); - - KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); - } - - pfree(xids); - - /* - * latestObservedXid is at least set to the point where SUBTRANS was + * latestObservedXid is at least set to the point where CSNLOG was * started up to (c.f. ProcArrayInitRecovery()) or to the biggest xid - * RecordKnownAssignedTransactionIds() was called for. Initialize - * subtrans from thereon, up to nextXid - 1. + * RecordKnownAssignedTransactionIds() (FIXME: gone!) was called for. Initialize + * csnlog from thereon, up to nextXid - 1. * * We need to duplicate parts of RecordKnownAssignedTransactionId() here, * because we've just added xids to the known assigned xids machinery that @@ -832,52 +413,11 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) TransactionIdAdvance(latestObservedXid); while (TransactionIdPrecedes(latestObservedXid, running->nextXid)) { - ExtendSUBTRANS(latestObservedXid); + ExtendCSNLOG(latestObservedXid); TransactionIdAdvance(latestObservedXid); } TransactionIdRetreat(latestObservedXid); /* = running->nextXid - 1 */ - /* ---------- - * Now we've got the running xids we need to set the global values that - * are used to track snapshots as they evolve further. - * - * - latestCompletedXid which will be the xmax for snapshots - * - lastOverflowedXid which shows whether snapshots overflow - * - nextXid - * - * If the snapshot overflowed, then we still initialise with what we know, - * but the recovery snapshot isn't fully valid yet because we know there - * are some subxids missing. We don't know the specific subxids that are - * missing, so conservatively assume the last one is latestObservedXid. - * ---------- - */ - if (running->subxid_overflow) - { - standbyState = STANDBY_SNAPSHOT_PENDING; - - standbySnapshotPendingXmin = latestObservedXid; - procArray->lastOverflowedXid = latestObservedXid; - } - else - { - standbyState = STANDBY_SNAPSHOT_READY; - - standbySnapshotPendingXmin = InvalidTransactionId; - } - - /* - * If a transaction wrote a commit record in the gap between taking and - * logging the snapshot then latestCompletedXid may already be higher than - * the value from the snapshot, so check before we use the incoming value. - */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - running->latestCompletedXid)) - ShmemVariableCache->latestCompletedXid = running->latestCompletedXid; - - Assert(TransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); - - LWLockRelease(ProcArrayLock); - /* * ShmemVariableCache->nextXid must be beyond any observed xid. * @@ -896,367 +436,213 @@ ProcArrayApplyRecoveryInfo(RunningTransactions running) Assert(TransactionIdIsValid(ShmemVariableCache->nextXid)); - KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); - if (standbyState == STANDBY_SNAPSHOT_READY) - elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled"); - else - elog(trace_recovery(DEBUG1), - "recovery snapshot waiting for non-overflowed snapshot or " - "until oldest active xid on standby is at least %u (now %u)", - standbySnapshotPendingXmin, - running->oldestRunningXid); + standbyState = STANDBY_SNAPSHOT_READY; + elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled"); } /* - * ProcArrayApplyXidAssignment - * Process an XLOG_XACT_ASSIGNMENT WAL record + * TransactionIdIsActive -- is xid the top-level XID of an active backend? + * + * This ignores prepared transactions and subtransactions, since that's not + * needed for current uses. */ -void -ProcArrayApplyXidAssignment(TransactionId topxid, - int nsubxids, TransactionId *subxids) +bool +TransactionIdIsActive(TransactionId xid) { - TransactionId max_xid; + bool result = false; + ProcArrayStruct *arrayP = procArray; int i; - Assert(standbyState >= STANDBY_INITIALIZED); - - max_xid = TransactionIdLatest(topxid, nsubxids, subxids); - - /* - * Mark all the subtransactions as observed. - * - * NOTE: This will fail if the subxid contains too many previously - * unobserved xids to fit into known-assigned-xids. That shouldn't happen - * as the code stands, because xid-assignment records should never contain - * more than PGPROC_MAX_CACHED_SUBXIDS entries. - */ - RecordKnownAssignedTransactionIds(max_xid); + LWLockAcquire(ProcArrayLock, LW_SHARED); - /* - * Notice that we update pg_subtrans with the top-level xid, rather than - * the parent xid. This is a difference between normal processing and - * recovery, yet is still correct in all cases. The reason is that - * subtransaction commit is not marked in clog until commit processing, so - * all aborted subtransactions have already been clearly marked in clog. - * As a result we are able to refer directly to the top-level - * transaction's state rather than skipping through all the intermediate - * states in the subtransaction tree. This should be the first time we - * have attempted to SubTransSetParent(). - */ - for (i = 0; i < nsubxids; i++) - SubTransSetParent(subxids[i], topxid); + for (i = 0; i < arrayP->numProcs; i++) + { + int pgprocno = arrayP->pgprocnos[i]; + volatile PGPROC *proc = &allProcs[pgprocno]; + volatile PGXACT *pgxact = &allPgXact[pgprocno]; + TransactionId pxid; - /* KnownAssignedXids isn't maintained yet, so we're done for now */ - if (standbyState == STANDBY_INITIALIZED) - return; + /* Fetch xid just once - see GetNewTransactionId */ + pxid = pgxact->xid; - /* - * Uses same locking as transaction commit - */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + if (!TransactionIdIsValid(pxid)) + continue; - /* - * Remove subxids from known-assigned-xacts. - */ - KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids); + if (proc->pid == 0) + continue; /* ignore prepared transactions */ - /* - * Advance lastOverflowedXid to be at least the last of these subxids. - */ - if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid)) - procArray->lastOverflowedXid = max_xid; + if (TransactionIdEquals(pxid, xid)) + { + result = true; + break; + } + } LWLockRelease(ProcArrayLock); + + return result; } /* - * TransactionIdIsInProgress -- is given transaction running in some backend - * - * Aside from some shortcuts such as checking RecentXmin and our own Xid, - * there are four possibilities for finding a running transaction: - * - * 1. The given Xid is a main transaction Id. We will find this out cheaply - * by looking at the PGXACT struct for each backend. + * AdvanceOldestActiveXid -- * - * 2. The given Xid is one of the cached subxact Xids in the PGPROC array. - * We can find this out cheaply too. - * - * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see - * if the Xid is running on the master. - * - * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see - * if that is running according to PGXACT or KnownAssignedXids. This is the - * slowest way, but sadly it has to be done always if the others failed, - * unless we see that the cached subxact sets are complete (none have - * overflowed). - * - * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids - * while doing 1 and 3, we can release the ProcArrayLock while we do 4. - * This buys back some concurrency (and we can't retrieve the main Xids from - * PGXACT again anyway; see GetNewTransactionId). + * Advance oldestActiveXid. 'oldXid' is the current value, and it's known to be + * finished now. */ -bool -TransactionIdIsInProgress(TransactionId xid) +static void +AdvanceOldestActiveXid(TransactionId myXid) { - static TransactionId *xids = NULL; - int nxids = 0; - ProcArrayStruct *arrayP = procArray; - TransactionId topxid; - int i, - j; + TransactionId nextXid; + TransactionId xid; + TransactionId oldValue; - /* - * Don't bother checking a transaction older than RecentXmin; it could not - * possibly still be running. (Note: in particular, this guarantees that - * we reject InvalidTransactionId, FrozenTransactionId, etc as not - * running.) - */ - if (TransactionIdPrecedes(xid, RecentXmin)) - { - xc_by_recent_xmin_inc(); - return false; - } + oldValue = pg_atomic_read_u32(&ShmemVariableCache->oldestActiveXid); - /* - * We may have just checked the status of this transaction, so if it is - * already known to be completed, we can fall out without any access to - * shared memory. - */ - if (TransactionIdIsKnownCompleted(xid)) - { - xc_by_known_xact_inc(); - return false; - } + /* Quick exit if we were not the oldest active XID. */ + if (myXid != oldValue) + return; - /* - * Also, we can handle our own transaction (and subtransactions) without - * any access to shared memory. - */ - if (TransactionIdIsCurrentTransactionId(xid)) - { - xc_by_my_xact_inc(); - return true; - } + xid = myXid; + TransactionIdAdvance(xid); - /* - * If first time through, get workspace to remember main XIDs in. We - * malloc it permanently to avoid repeated palloc/pfree overhead. - */ - if (xids == NULL) + for (;;) { /* - * In hot standby mode, reserve enough space to hold all xids in the - * known-assigned list. If we later finish recovery, we no longer need - * the bigger array, but we don't bother to shrink it. + * Current nextXid is the upper bound, if there are no transactions + * active at all. */ - int maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs; + /* assume we can read nextXid atomically without holding XidGenlock. */ + nextXid = ShmemVariableCache->nextXid; + /* Scan the CSN Log for the next active xid */ + xid = CSNLogGetNextActiveXid(xid, nextXid); - xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId)); - if (xids == NULL) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - LWLockAcquire(ProcArrayLock, LW_SHARED); - - /* - * Now that we have the lock, we can check latestCompletedXid; if the - * target Xid is after that, it's surely still running. - */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, xid)) - { - LWLockRelease(ProcArrayLock); - xc_by_latest_xid_inc(); - return true; - } - - /* No shortcuts, gotta grovel through the array */ - for (i = 0; i < arrayP->numProcs; i++) - { - int pgprocno = arrayP->pgprocnos[i]; - volatile PGPROC *proc = &allProcs[pgprocno]; - volatile PGXACT *pgxact = &allPgXact[pgprocno]; - TransactionId pxid; - - /* Ignore my own proc --- dealt with it above */ - if (proc == MyProc) - continue; - - /* Fetch xid just once - see GetNewTransactionId */ - pxid = pgxact->xid; - - if (!TransactionIdIsValid(pxid)) - continue; - - /* - * Step 1: check the main Xid - */ - if (TransactionIdEquals(pxid, xid)) + if (xid == oldValue) { - LWLockRelease(ProcArrayLock); - xc_by_main_xid_inc(); - return true; + /* nothing more to do */ + break; } /* - * We can ignore main Xids that are younger than the target Xid, since - * the target could not possibly be their child. - */ - if (TransactionIdPrecedes(xid, pxid)) - continue; - - /* - * Step 2: check the cached child-Xids arrays + * Update oldestActiveXid with that value. */ - for (j = pgxact->nxids - 1; j >= 0; j--) + if (!pg_atomic_compare_exchange_u32(&ShmemVariableCache->oldestActiveXid, + &oldValue, + xid)) { - /* Fetch xid just once - see GetNewTransactionId */ - TransactionId cxid = proc->subxids.xids[j]; - - if (TransactionIdEquals(cxid, xid)) - { - LWLockRelease(ProcArrayLock); - xc_by_child_xid_inc(); - return true; - } + /* + * Someone beat us to it. This can happen if we hit the race + * condition described below. That's OK. We're no longer the oldest active + * XID in that case, so we're done. + */ + Assert(TransactionIdFollows(oldValue, myXid)); + break; } /* - * Save the main Xid for step 4. We only need to remember main Xids - * that have uncached children. (Note: there is no race condition - * here because the overflowed flag cannot be cleared, only set, while - * we hold ProcArrayLock. So we can't miss an Xid that we need to - * worry about.) + * We're not necessarily done yet. It's possible that the XID that we saw + * as still running committed just before we updated oldestActiveXid. + * She didn't see herself as the oldest transaction, so she wouldn't + * update oldestActiveXid. Loop back to check the XID that we saw as + * the oldest in-progress one is still in-progress, and if not, update + * oldestActiveXid again, on behalf of that transaction. */ - if (pgxact->overflowed) - xids[nxids++] = pxid; + oldValue = xid; } +} + + +/* + * This is like GetOldestXmin(NULL, true), but can return slightly stale, cached value. + */ +TransactionId +GetRecentGlobalXmin(void) +{ + TransactionId globalXmin; + ProcArrayStruct *arrayP = procArray; + int index; + volatile TransactionId replication_slot_xmin = InvalidTransactionId; + volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + + if (TransactionIdIsValid(RecentGlobalXmin)) + return RecentGlobalXmin; + + LWLockAcquire(ProcArrayLock, LW_SHARED); /* - * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs - * in the list must be treated as running. + * We initialize the MIN() calculation with oldestActiveXid. This + * is a lower bound for the XIDs that might appear in the ProcArray later, + * and so protects us against overestimating the result due to future + * additions. */ - if (RecoveryInProgress()) + globalXmin = pg_atomic_read_u32(&ShmemVariableCache->oldestActiveXid); + Assert(TransactionIdIsNormal(globalXmin)); + + for (index = 0; index < arrayP->numProcs; index++) { - /* none of the PGXACT entries should have XIDs in hot standby mode */ - Assert(nxids == 0); + int pgprocno = arrayP->pgprocnos[index]; + volatile PGXACT *pgxact = &allPgXact[pgprocno]; + TransactionId xmin = pgxact->xmin; - if (KnownAssignedXidExists(xid)) - { - LWLockRelease(ProcArrayLock); - xc_by_known_assigned_inc(); - return true; - } + /* + * Backend is doing logical decoding which manages xmin separately, + * check below. + */ + if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING) + continue; + + if (pgxact->vacuumFlags & PROC_IN_VACUUM) + continue; /* - * If the KnownAssignedXids overflowed, we have to check pg_subtrans - * too. Fetch all xids from KnownAssignedXids that are lower than - * xid, since if xid is a subtransaction its parent will always have a - * lower value. Note we will collect both main and subXIDs here, but - * there's no help for it. + * Consider the transaction's Xmin, if set. */ - if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid)) - nxids = KnownAssignedXidsGet(xids, xid); + if (TransactionIdIsNormal(xmin) && + NormalTransactionIdPrecedes(xmin, globalXmin)) + globalXmin = xmin; } + /* fetch into volatile var while ProcArrayLock is held */ + replication_slot_xmin = procArray->replication_slot_xmin; + replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + LWLockRelease(ProcArrayLock); - /* - * If none of the relevant caches overflowed, we know the Xid is not - * running without even looking at pg_subtrans. - */ - if (nxids == 0) - { - xc_no_overflow_inc(); - return false; - } + /* Update cached variables */ + RecentGlobalXmin = globalXmin - vacuum_defer_cleanup_age; + if (!TransactionIdIsNormal(RecentGlobalXmin)) + RecentGlobalXmin = FirstNormalTransactionId; - /* - * Step 4: have to check pg_subtrans. - * - * At this point, we know it's either a subtransaction of one of the Xids - * in xids[], or it's not running. If it's an already-failed - * subtransaction, we want to say "not running" even though its parent may - * still be running. So first, check pg_xact to see if it's been aborted. - */ - xc_slow_answer_inc(); + /* Check whether there's a replication slot requiring an older xmin. */ + if (TransactionIdIsValid(replication_slot_xmin) && + NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin)) + RecentGlobalXmin = replication_slot_xmin; - if (TransactionIdDidAbort(xid)) - return false; + /* Non-catalog tables can be vacuumed if older than this xid */ + RecentGlobalDataXmin = RecentGlobalXmin; /* - * It isn't aborted, so check whether the transaction tree it belongs to - * is still running (or, more precisely, whether it was running when we - * held ProcArrayLock). + * Check whether there's a replication slot requiring an older catalog + * xmin. */ - topxid = SubTransGetTopmostTransaction(xid); - Assert(TransactionIdIsValid(topxid)); - if (!TransactionIdEquals(topxid, xid)) - { - for (i = 0; i < nxids; i++) - { - if (TransactionIdEquals(xids[i], topxid)) - return true; - } - } + if (TransactionIdIsNormal(replication_slot_catalog_xmin) && + NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin)) + RecentGlobalXmin = replication_slot_catalog_xmin; - return false; + return RecentGlobalXmin; } -/* - * TransactionIdIsActive -- is xid the top-level XID of an active backend? - * - * This differs from TransactionIdIsInProgress in that it ignores prepared - * transactions, as well as transactions running on the master if we're in - * hot standby. Also, we ignore subtransactions since that's not needed - * for current uses. - */ -bool -TransactionIdIsActive(TransactionId xid) +TransactionId +GetRecentGlobalDataXmin(void) { - bool result = false; - ProcArrayStruct *arrayP = procArray; - int i; - - /* - * Don't bother checking a transaction older than RecentXmin; it could not - * possibly still be running. - */ - if (TransactionIdPrecedes(xid, RecentXmin)) - return false; - - LWLockAcquire(ProcArrayLock, LW_SHARED); - - for (i = 0; i < arrayP->numProcs; i++) - { - int pgprocno = arrayP->pgprocnos[i]; - volatile PGPROC *proc = &allProcs[pgprocno]; - volatile PGXACT *pgxact = &allPgXact[pgprocno]; - TransactionId pxid; + if (TransactionIdIsValid(RecentGlobalDataXmin)) + return RecentGlobalDataXmin; - /* Fetch xid just once - see GetNewTransactionId */ - pxid = pgxact->xid; - - if (!TransactionIdIsValid(pxid)) - continue; - - if (proc->pid == 0) - continue; /* ignore prepared transactions */ - - if (TransactionIdEquals(pxid, xid)) - { - result = true; - break; - } - } - - LWLockRelease(ProcArrayLock); + (void) GetRecentGlobalXmin(); + Assert(TransactionIdIsValid(RecentGlobalDataXmin)); - return result; + return RecentGlobalDataXmin; } - /* * GetOldestXmin -- returns oldest transaction that was running * when any current transaction was started. @@ -1279,7 +665,7 @@ TransactionIdIsActive(TransactionId xid) * ignore concurrently running lazy VACUUMs because (a) they must be working * on other tables, and (b) they don't need to do snapshot-based lookups. * - * This is also used to determine where to truncate pg_subtrans. For that + * This is also used to determine where to truncate pg_csnlog. For that * backends in all databases have to be considered, so rel = NULL has to be * passed in. * @@ -1310,6 +696,10 @@ TransactionIdIsActive(TransactionId xid) * The return value is also adjusted with vacuum_defer_cleanup_age, so * increasing that setting on the fly is another easy way to make * GetOldestXmin() move backwards, with no consequences for data integrity. + * + * + * XXX: We track GlobalXmin in shared memory now. Would it makes sense to + * have GetOldestXmin() just return that? At least for the rel == NULL case. */ TransactionId GetOldestXmin(Relation rel, int flags) @@ -1340,7 +730,7 @@ GetOldestXmin(Relation rel, int flags) * and so protects us against overestimating the result due to future * additions. */ - result = ShmemVariableCache->latestCompletedXid; + result = pg_atomic_read_u32(&ShmemVariableCache->latestCompletedXid); Assert(TransactionIdIsNormal(result)); TransactionIdAdvance(result); @@ -1383,28 +773,11 @@ GetOldestXmin(Relation rel, int flags) replication_slot_xmin = procArray->replication_slot_xmin; replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; - if (RecoveryInProgress()) - { - /* - * Check to see whether KnownAssignedXids contains an xid value older - * than the main procarray. - */ - TransactionId kaxmin = KnownAssignedXidsGetOldestXmin(); - - LWLockRelease(ProcArrayLock); + LWLockRelease(ProcArrayLock); - if (TransactionIdIsNormal(kaxmin) && - TransactionIdPrecedes(kaxmin, result)) - result = kaxmin; - } - else + if (!RecoveryInProgress()) { /* - * No other information needed, so release the lock immediately. - */ - LWLockRelease(ProcArrayLock); - - /* * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age, * being careful not to generate a "permanent" XID. * @@ -1448,337 +821,199 @@ GetOldestXmin(Relation rel, int flags) } /* - * GetMaxSnapshotXidCount -- get max size for snapshot XID array - * - * We have to export this for use by snapmgr.c. - */ -int -GetMaxSnapshotXidCount(void) -{ - return procArray->maxProcs; -} -/* - * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array - * - * We have to export this for use by snapmgr.c. - */ -int -GetMaxSnapshotSubxidCount(void) -{ - return TOTAL_MAX_CACHED_SUBXIDS; -} +oldestActiveXid + oldest XID that's currently in-progress + +GlobalXmin + oldest XID that's *seen* by any active snapshot as still in-progress + +latestCompletedXid + latest XID that has committed. + +CSN + current CSN + + + +Get snapshot: + +1. LWLockAcquire(ProcArrayLock, LW_SHARED) +2. Read oldestActiveXid. Store it in MyProc->xmin +3. Read CSN +4. LWLockRelease(ProcArrayLock) + +End-of-xact: + +1. LWLockAcquire(ProcArrayLock, LW_SHARED) +2. Reset MyProc->xmin, xid and CSN +3. Was my XID == oldestActiveXid? If so, advance oldestActiveXid. +4. Was my xmin == oldestXmin? If so, advance oldestXmin. +5. LWLockRelease(ProcArrayLock) + +AdvanceGlobalXmin: + +1. LWLockAcquire(ProcArrayLock, LW_SHARED) +2. Read current oldestActiveXid. That's the upper bound. If a transaction + begins now, that's the xmin it would get. +3. Scan ProcArray, for the smallest xmin. +4. Set that as the new GlobalXmin. +5. LWLockRelease(ProcArrayLock) + +AdvanceOldestActiveXid: + +Two alternatives: scan the csnlog or scan the procarray. Scanning the +procarray is tricky: it's possible that a backend has just read nextXid, +but not set it in MyProc->xid yet. + + +*/ + + /* - * GetSnapshotData -- returns information about running transactions. - * - * The returned snapshot includes xmin (lowest still-running xact ID), - * xmax (highest completed xact ID + 1), and a list of running xact IDs - * in the range xmin <= xid < xmax. It is used as follows: - * All xact IDs < xmin are considered finished. - * All xact IDs >= xmax are considered still running. - * For an xact ID xmin <= xid < xmax, consult list to see whether - * it is considered running or not. + * GetSnapshotData -- returns an MVCC snapshot. + * + * The crux of the returned snapshot is the current Commit-Sequence-Number. + * All transactions that committed before the CSN is considered + * as visible to the snapshot, and all transactions that committed at or + * later are considered as still-in-progress. + * + * The returned snapshot also includes xmin (lowest still-running xact ID), + * and xmax (highest completed xact ID + 1). They can be used to avoid + * the more expensive check against the CSN: + * All xact IDs < xmin are known to be finished. + * All xact IDs >= xmax are known to be still running. + * For an xact ID xmin <= xid < xmax, consult the CSNLOG to see + * whether its CSN is before or after the snapshot's CSN. + * * This ensures that the set of transactions seen as "running" by the * current xact will not change after it takes the snapshot. * - * All running top-level XIDs are included in the snapshot, except for lazy - * VACUUM processes. We also try to include running subtransaction XIDs, - * but since PGPROC has only a limited cache area for subxact XIDs, full - * information may not be available. If we find any overflowed subxid arrays, - * we have to mark the snapshot's subxid data as overflowed, and extra work - * *may* need to be done to determine what's running (see XidInMVCCSnapshot() - * in tqual.c). - * * We also update the following backend-global variables: * TransactionXmin: the oldest xmin of any snapshot in use in the - * current transaction (this is the same as MyPgXact->xmin). - * RecentXmin: the xmin computed for the most recent snapshot. XIDs - * older than this are known not running any more. + * current transaction. * RecentGlobalXmin: the global xmin (oldest TransactionXmin across all - * running transactions, except those running LAZY VACUUM). This is - * the same computation done by - * GetOldestXmin(NULL, PROCARRAY_FLAGS_VACUUM). + * running transactions, except those running LAZY VACUUM). This + * can be used to opportunistically remove old dead tuples. * RecentGlobalDataXmin: the global xmin for non-catalog tables * >= RecentGlobalXmin - * - * Note: this function should probably not be called with an argument that's - * not statically allocated (see xip allocation below). */ Snapshot GetSnapshotData(Snapshot snapshot) { - ProcArrayStruct *arrayP = procArray; TransactionId xmin; TransactionId xmax; - TransactionId globalxmin; - int index; - int count = 0; - int subcount = 0; - bool suboverflowed = false; - volatile TransactionId replication_slot_xmin = InvalidTransactionId; - volatile TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + CommitSeqNo snapshotcsn; + bool takenDuringRecovery; Assert(snapshot != NULL); /* - * Allocating space for maxProcs xids is usually overkill; numProcs would - * be sufficient. But it seems better to do the malloc while not holding - * the lock, so we can't look at numProcs. Likewise, we allocate much - * more subxip storage than is probably needed. - * - * This does open a possibility for avoiding repeated malloc/free: since - * maxProcs does not change at runtime, we can simply reuse the previous - * xip arrays if any. (This relies on the fact that all callers pass - * static SnapshotData structs.) + * The ProcArrayLock is not needed here. We only set our xmin if + * it's not already set. There are only a few functions that check + * the xmin under exclusive ProcArrayLock: + * 1) ProcArrayInstallRestored/ImportedXmin -- can only care about + * our xmin long after it has been first set. + * 2) ProcArrayEndTransaction is not called concurrently with + * GetSnapshotData. */ - if (snapshot->xip == NULL) + + takenDuringRecovery = RecoveryInProgress(); + + /* Anything older than oldestActiveXid is surely finished by now. */ + xmin = pg_atomic_read_u32(&ShmemVariableCache->oldestActiveXid); + + /* Announce my xmin, to hold back GlobalXmin. */ + if (!TransactionIdIsValid(MyPgXact->xmin)) { + TransactionId oldestActiveXid; + + MyPgXact->xmin = xmin; + + /* + * Recheck, if oldestActiveXid advanced after we read it. + * + * This protects against a race condition with AdvanceGlobalXmin(). + * If a transaction ends runs AdvanceGlobalXmin(), just after we fetch + * oldestActiveXid, but before we set MyPgXact->xmin, it's possible + * that AdvanceGlobalXmin() computed a new GlobalXmin that doesn't + * cover the xmin that we got. To fix that, check oldestActiveXid + * again, after setting xmin. Redoing it once is enough, we don't need + * to loop, because the (stale) xmin that we set prevents the same + * race condition from advancing oldestXid again. + * + * For a brief moment, we can have the situation that our xmin is + * lower than GlobalXmin, but it's OK because we don't use that xmin + * until we've re-checked and corrected it if necessary. + */ /* - * First call for this snapshot. Snapshot is same size whether or not - * we are in recovery, see later comments. + * memory barrier to make sure that setting the xmin in our PGPROC entry + * is made visible to others, before the read below. */ - snapshot->xip = (TransactionId *) - malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId)); - if (snapshot->xip == NULL) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - Assert(snapshot->subxip == NULL); - snapshot->subxip = (TransactionId *) - malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId)); - if (snapshot->subxip == NULL) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); + pg_memory_barrier(); + + oldestActiveXid = pg_atomic_read_u32(&ShmemVariableCache->oldestActiveXid); + if (oldestActiveXid != xmin) + { + xmin = oldestActiveXid; + + MyPgXact->xmin = xmin; + } + + TransactionXmin = xmin; } /* - * It is sufficient to get shared lock on ProcArrayLock, even if we are - * going to set MyPgXact->xmin. + * Get the current snapshot CSN, and copy that to my PGPROC entry. This + * serializes us with any concurrent commits. */ - LWLockAcquire(ProcArrayLock, LW_SHARED); - - /* xmax is always latestCompletedXid + 1 */ - xmax = ShmemVariableCache->latestCompletedXid; + snapshotcsn = pg_atomic_read_u64(&ShmemVariableCache->nextCommitSeqNo); + if (MyPgXact->snapshotcsn == InvalidCommitSeqNo) + MyPgXact->snapshotcsn = snapshotcsn; + /* + * Also get xmax. It is always latestCompletedXid + 1. + * Make sure to read it after CSN (see TransactionIdAsyncCommitTree()) + */ + pg_read_barrier(); + xmax = pg_atomic_read_u32(&ShmemVariableCache->latestCompletedXid); Assert(TransactionIdIsNormal(xmax)); TransactionIdAdvance(xmax); - /* initialize xmin calculation with xmax */ - globalxmin = xmin = xmax; + snapshot->xmin = xmin; + snapshot->xmax = xmax; + snapshot->snapshotcsn = snapshotcsn; + snapshot->curcid = GetCurrentCommandId(false); + snapshot->takenDuringRecovery = takenDuringRecovery; - snapshot->takenDuringRecovery = RecoveryInProgress(); + /* + * This is a new snapshot, so set both refcounts are zero, and mark it as + * not copied in persistent memory. + */ + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; - if (!snapshot->takenDuringRecovery) + if (old_snapshot_threshold < 0) { - int *pgprocnos = arrayP->pgprocnos; - int numProcs; - /* - * Spin over procArray checking xid, xmin, and subxids. The goal is - * to gather all active xids, find the lowest xmin, and try to record - * subxids. + * If not using "snapshot too old" feature, fill related fields with + * dummy values that don't require any locking. */ - numProcs = arrayP->numProcs; - for (index = 0; index < numProcs; index++) - { - int pgprocno = pgprocnos[index]; - volatile PGXACT *pgxact = &allPgXact[pgprocno]; - TransactionId xid; - - /* - * Backend is doing logical decoding which manages xmin - * separately, check below. - */ - if (pgxact->vacuumFlags & PROC_IN_LOGICAL_DECODING) - continue; - - /* Ignore procs running LAZY VACUUM */ - if (pgxact->vacuumFlags & PROC_IN_VACUUM) - continue; - - /* Update globalxmin to be the smallest valid xmin */ - xid = pgxact->xmin; /* fetch just once */ - if (TransactionIdIsNormal(xid) && - NormalTransactionIdPrecedes(xid, globalxmin)) - globalxmin = xid; - - /* Fetch xid just once - see GetNewTransactionId */ - xid = pgxact->xid; - - /* - * If the transaction has no XID assigned, we can skip it; it - * won't have sub-XIDs either. If the XID is >= xmax, we can also - * skip it; such transactions will be treated as running anyway - * (and any sub-XIDs will also be >= xmax). - */ - if (!TransactionIdIsNormal(xid) - || !NormalTransactionIdPrecedes(xid, xmax)) - continue; - - /* - * We don't include our own XIDs (if any) in the snapshot, but we - * must include them in xmin. - */ - if (NormalTransactionIdPrecedes(xid, xmin)) - xmin = xid; - if (pgxact == MyPgXact) - continue; - - /* Add XID to snapshot. */ - snapshot->xip[count++] = xid; - - /* - * Save subtransaction XIDs if possible (if we've already - * overflowed, there's no point). Note that the subxact XIDs must - * be later than their parent, so no need to check them against - * xmin. We could filter against xmax, but it seems better not to - * do that much work while holding the ProcArrayLock. - * - * The other backend can add more subxids concurrently, but cannot - * remove any. Hence it's important to fetch nxids just once. - * Should be safe to use memcpy, though. (We needn't worry about - * missing any xids added concurrently, because they must postdate - * xmax.) - * - * Again, our own XIDs are not included in the snapshot. - */ - if (!suboverflowed) - { - if (pgxact->overflowed) - suboverflowed = true; - else - { - int nxids = pgxact->nxids; - - if (nxids > 0) - { - volatile PGPROC *proc = &allProcs[pgprocno]; - - memcpy(snapshot->subxip + subcount, - (void *) proc->subxids.xids, - nxids * sizeof(TransactionId)); - subcount += nxids; - } - } - } - } - } - else - { - /* - * We're in hot standby, so get XIDs from KnownAssignedXids. - * - * We store all xids directly into subxip[]. Here's why: - * - * In recovery we don't know which xids are top-level and which are - * subxacts, a design choice that greatly simplifies xid processing. - * - * It seems like we would want to try to put xids into xip[] only, but - * that is fairly small. We would either need to make that bigger or - * to increase the rate at which we WAL-log xid assignment; neither is - * an appealing choice. - * - * We could try to store xids into xip[] first and then into subxip[] - * if there are too many xids. That only works if the snapshot doesn't - * overflow because we do not search subxip[] in that case. A simpler - * way is to just store all xids in the subxact array because this is - * by far the bigger array. We just leave the xip array empty. - * - * Either way we need to change the way XidInMVCCSnapshot() works - * depending upon when the snapshot was taken, or change normal - * snapshot processing so it matches. - * - * Note: It is possible for recovery to end before we finish taking - * the snapshot, and for newly assigned transaction ids to be added to - * the ProcArray. xmax cannot change while we hold ProcArrayLock, so - * those newly added transaction ids would be filtered away, so we - * need not be concerned about them. - */ - subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin, - xmax); - - if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid)) - suboverflowed = true; - } - - - /* fetch into volatile var while ProcArrayLock is held */ - replication_slot_xmin = procArray->replication_slot_xmin; - replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; - - if (!TransactionIdIsValid(MyPgXact->xmin)) - MyPgXact->xmin = TransactionXmin = xmin; - - LWLockRelease(ProcArrayLock); - - /* - * Update globalxmin to include actual process xids. This is a slightly - * different way of computing it than GetOldestXmin uses, but should give - * the same result. - */ - if (TransactionIdPrecedes(xmin, globalxmin)) - globalxmin = xmin; - - /* Update global variables too */ - RecentGlobalXmin = globalxmin - vacuum_defer_cleanup_age; - if (!TransactionIdIsNormal(RecentGlobalXmin)) - RecentGlobalXmin = FirstNormalTransactionId; - - /* Check whether there's a replication slot requiring an older xmin. */ - if (TransactionIdIsValid(replication_slot_xmin) && - NormalTransactionIdPrecedes(replication_slot_xmin, RecentGlobalXmin)) - RecentGlobalXmin = replication_slot_xmin; - - /* Non-catalog tables can be vacuumed if older than this xid */ - RecentGlobalDataXmin = RecentGlobalXmin; - - /* - * Check whether there's a replication slot requiring an older catalog - * xmin. - */ - if (TransactionIdIsNormal(replication_slot_catalog_xmin) && - NormalTransactionIdPrecedes(replication_slot_catalog_xmin, RecentGlobalXmin)) - RecentGlobalXmin = replication_slot_catalog_xmin; - - RecentXmin = xmin; - - snapshot->xmin = xmin; - snapshot->xmax = xmax; - snapshot->xcnt = count; - snapshot->subxcnt = subcount; - snapshot->suboverflowed = suboverflowed; - - snapshot->curcid = GetCurrentCommandId(false); - - /* - * This is a new snapshot, so set both refcounts are zero, and mark it as - * not copied in persistent memory. - */ - snapshot->active_count = 0; - snapshot->regd_count = 0; - snapshot->copied = false; - - if (old_snapshot_threshold < 0) - { - /* - * If not using "snapshot too old" feature, fill related fields with - * dummy values that don't require any locking. - */ - snapshot->lsn = InvalidXLogRecPtr; - snapshot->whenTaken = 0; - } - else - { - /* - * Capture the current time and WAL stream location in case this - * snapshot becomes old enough to need to fall back on the special - * "old snapshot" logic. - */ - snapshot->lsn = GetXLogInsertRecPtr(); - snapshot->whenTaken = GetSnapshotCurrentTimestamp(); - MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin); - } + snapshot->lsn = InvalidXLogRecPtr; + snapshot->whenTaken = 0; + } + else + { + /* + * Capture the current time and WAL stream location in case this + * snapshot becomes old enough to need to fall back on the special + * "old snapshot" logic. + */ + snapshot->lsn = GetXLogInsertRecPtr(); + snapshot->whenTaken = GetSnapshotCurrentTimestamp(); + MaintainOldSnapshotTimeMapping(snapshot->whenTaken, xmin); + } return snapshot; } @@ -1805,8 +1040,10 @@ ProcArrayInstallImportedXmin(TransactionId xmin, if (!sourcevxid) return false; - /* Get lock so source xact can't end while we're doing this */ - LWLockAcquire(ProcArrayLock, LW_SHARED); + /* + * Get exclusive lock so source xact can't end while we're doing this. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); for (index = 0; index < arrayP->numProcs; index++) { @@ -1878,8 +1115,10 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) Assert(TransactionIdIsNormal(xmin)); Assert(proc != NULL); - /* Get lock so source xact can't end while we're doing this */ - LWLockAcquire(ProcArrayLock, LW_SHARED); + /* + * Get exclusive lock so source xact can't end while we're doing this. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); pgxact = &allPgXact[proc->pgprocno]; @@ -1906,29 +1145,24 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) /* * GetRunningTransactionData -- returns information about running transactions. * - * Similar to GetSnapshotData but returns more information. We include - * all PGXACTs with an assigned TransactionId, even VACUUM processes. + * Returns the oldest running TransactionId among all backends, even VACUUM + * processes. + * + * We acquire XidGenlock, but the caller is responsible for releasing it. + * Acquiring XidGenLock ensures that no new XID can be assigned until + * the caller has WAL-logged this snapshot, and releases the lock. + * FIXME: this also used to hold ProcArrayLock, to prevent any transactions + * from committing until the caller has WAL-logged. I don't think we need + * that anymore, but verify. * - * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for - * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc - * array until the caller has WAL-logged this snapshot, and releases the - * lock. Acquiring ProcArrayLock ensures that no transactions commit until the - * lock is released. + * Returns the current xmin and xmax, like GetSnapshotData does. * * The returned data structure is statically allocated; caller should not * modify it, and must not assume it is valid past the next call. * - * This is never executed during recovery so there is no need to look at - * KnownAssignedXids. - * * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetSnapshotData() as the primary code for * that bookkeeping. - * - * Note that if any transaction has overflowed its cached subtransactions - * then there is no real need include any subtransactions. That isn't a - * common enough case to worry about optimising the size of the WAL record, - * and we may wish to see that data for diagnostic purposes anyway. */ RunningTransactions GetRunningTransactionData(void) @@ -1938,52 +1172,18 @@ GetRunningTransactionData(void) ProcArrayStruct *arrayP = procArray; RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData; - TransactionId latestCompletedXid; TransactionId oldestRunningXid; - TransactionId *xids; int index; - int count; - int subcount; - bool suboverflowed; Assert(!RecoveryInProgress()); /* - * Allocating space for maxProcs xids is usually overkill; numProcs would - * be sufficient. But it seems better to do the malloc while not holding - * the lock, so we can't look at numProcs. Likewise, we allocate much - * more subxip storage than is probably needed. - * - * Should only be allocated in bgwriter, since only ever executed during - * checkpoints. - */ - if (CurrentRunningXacts->xids == NULL) - { - /* - * First call - */ - CurrentRunningXacts->xids = (TransactionId *) - malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); - if (CurrentRunningXacts->xids == NULL) - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("out of memory"))); - } - - xids = CurrentRunningXacts->xids; - - count = subcount = 0; - suboverflowed = false; - - /* * Ensure that no xids enter or leave the procarray while we obtain * snapshot. */ LWLockAcquire(ProcArrayLock, LW_SHARED); LWLockAcquire(XidGenLock, LW_SHARED); - latestCompletedXid = ShmemVariableCache->latestCompletedXid; - oldestRunningXid = ShmemVariableCache->nextXid; /* @@ -2005,47 +1205,8 @@ GetRunningTransactionData(void) if (!TransactionIdIsValid(xid)) continue; - xids[count++] = xid; - if (TransactionIdPrecedes(xid, oldestRunningXid)) oldestRunningXid = xid; - - if (pgxact->overflowed) - suboverflowed = true; - } - - /* - * Spin over procArray collecting all subxids, but only if there hasn't - * been a suboverflow. - */ - if (!suboverflowed) - { - for (index = 0; index < arrayP->numProcs; index++) - { - int pgprocno = arrayP->pgprocnos[index]; - volatile PGPROC *proc = &allProcs[pgprocno]; - volatile PGXACT *pgxact = &allPgXact[pgprocno]; - int nxids; - - /* - * Save subtransaction XIDs. Other backends can't add or remove - * entries while we're holding XidGenLock. - */ - nxids = pgxact->nxids; - if (nxids > 0) - { - memcpy(&xids[count], (void *) proc->subxids.xids, - nxids * sizeof(TransactionId)); - count += nxids; - subcount += nxids; - - /* - * Top-level XID of a transaction is always less than any of - * its subxids, so we don't need to check if any of the - * subxids are smaller than oldestRunningXid - */ - } - } } /* @@ -2057,18 +1218,14 @@ GetRunningTransactionData(void) * increases if slots do. */ - CurrentRunningXacts->xcnt = count - subcount; - CurrentRunningXacts->subxcnt = subcount; - CurrentRunningXacts->subxid_overflow = suboverflowed; CurrentRunningXacts->nextXid = ShmemVariableCache->nextXid; CurrentRunningXacts->oldestRunningXid = oldestRunningXid; - CurrentRunningXacts->latestCompletedXid = latestCompletedXid; Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); - Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid)); - /* We don't release the locks here, the caller is responsible for that */ + LWLockRelease(ProcArrayLock); + /* We don't release XidGenLock here, the caller is responsible for that */ return CurrentRunningXacts; } @@ -2076,17 +1233,18 @@ GetRunningTransactionData(void) /* * GetOldestActiveTransactionId() * - * Similar to GetSnapshotData but returns just oldestActiveXid. We include + * Returns the oldest XID that's still running. We include * all PGXACTs with an assigned TransactionId, even VACUUM processes. * We look at all databases, though there is no need to include WALSender * since this has no effect on hot standby conflicts. * - * This is never executed during recovery so there is no need to look at - * KnownAssignedXids. - * * We don't worry about updating other counters, we want to keep this as * simple as possible and leave GetSnapshotData() as the primary code for * that bookkeeping. + * + * XXX: We could just use return ShmemVariableCache->oldestActiveXid. this + * uses a different method of computing the value though, so maybe this is + * useful as a cross-check? */ TransactionId GetOldestActiveTransactionId(void) @@ -2541,7 +1699,7 @@ GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, * * All callers that are checking xmins always now supply a valid and useful * value for limitXmin. The limitXmin is always lower than the lowest - * numbered KnownAssignedXid that is not already a FATAL error. This is + * numbered KnownAssignedXid (XXX) that is not already a FATAL error. This is * because we only care about cleanup records that are cleaning up tuple * versions from committed transactions. In that case they will only occur * at the point where the record is less than the lowest running xid. That @@ -2997,170 +2155,9 @@ ProcArrayGetReplicationSlotXmin(TransactionId *xmin, LWLockRelease(ProcArrayLock); } - -#define XidCacheRemove(i) \ - do { \ - MyProc->subxids.xids[i] = MyProc->subxids.xids[MyPgXact->nxids - 1]; \ - MyPgXact->nxids--; \ - } while (0) - -/* - * XidCacheRemoveRunningXids - * - * Remove a bunch of TransactionIds from the list of known-running - * subtransactions for my backend. Both the specified xid and those in - * the xids[] array (of length nxids) are removed from the subxids cache. - * latestXid must be the latest XID among the group. - */ -void -XidCacheRemoveRunningXids(TransactionId xid, - int nxids, const TransactionId *xids, - TransactionId latestXid) -{ - int i, - j; - - Assert(TransactionIdIsValid(xid)); - - /* - * We must hold ProcArrayLock exclusively in order to remove transactions - * from the PGPROC array. (See src/backend/access/transam/README.) It's - * possible this could be relaxed since we know this routine is only used - * to abort subtransactions, but pending closer analysis we'd best be - * conservative. - */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - /* - * Under normal circumstances xid and xids[] will be in increasing order, - * as will be the entries in subxids. Scan backwards to avoid O(N^2) - * behavior when removing a lot of xids. - */ - for (i = nxids - 1; i >= 0; i--) - { - TransactionId anxid = xids[i]; - - for (j = MyPgXact->nxids - 1; j >= 0; j--) - { - if (TransactionIdEquals(MyProc->subxids.xids[j], anxid)) - { - XidCacheRemove(j); - break; - } - } - - /* - * Ordinarily we should have found it, unless the cache has - * overflowed. However it's also possible for this routine to be - * invoked multiple times for the same subtransaction, in case of an - * error during AbortSubTransaction. So instead of Assert, emit a - * debug warning. - */ - if (j < 0 && !MyPgXact->overflowed) - elog(WARNING, "did not find subXID %u in MyProc", anxid); - } - - for (j = MyPgXact->nxids - 1; j >= 0; j--) - { - if (TransactionIdEquals(MyProc->subxids.xids[j], xid)) - { - XidCacheRemove(j); - break; - } - } - /* Ordinarily we should have found it, unless the cache has overflowed */ - if (j < 0 && !MyPgXact->overflowed) - elog(WARNING, "did not find subXID %u in MyProc", xid); - - /* Also advance global latestCompletedXid while holding the lock */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - latestXid)) - ShmemVariableCache->latestCompletedXid = latestXid; - - LWLockRelease(ProcArrayLock); -} - -#ifdef XIDCACHE_DEBUG - -/* - * Print stats about effectiveness of XID cache - */ -static void -DisplayXidCache(void) -{ - fprintf(stderr, - "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n", - xc_by_recent_xmin, - xc_by_known_xact, - xc_by_my_xact, - xc_by_latest_xid, - xc_by_main_xid, - xc_by_child_xid, - xc_by_known_assigned, - xc_no_overflow, - xc_slow_answer); -} -#endif /* XIDCACHE_DEBUG */ - - -/* ---------------------------------------------- - * KnownAssignedTransactions sub-module - * ---------------------------------------------- - */ - -/* - * In Hot Standby mode, we maintain a list of transactions that are (or were) - * running in the master at the current point in WAL. These XIDs must be - * treated as running by standby transactions, even though they are not in - * the standby server's PGXACT array. - * - * We record all XIDs that we know have been assigned. That includes all the - * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have - * been assigned. We can deduce the existence of unobserved XIDs because we - * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids - * list expands as new XIDs are observed or inferred, and contracts when - * transaction completion records arrive. - * - * During hot standby we do not fret too much about the distinction between - * top-level XIDs and subtransaction XIDs. We store both together in the - * KnownAssignedXids list. In backends, this is copied into snapshots in - * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot() - * doesn't care about the distinction either. Subtransaction XIDs are - * effectively treated as top-level XIDs and in the typical case pg_subtrans - * links are *not* maintained (which does not affect visibility). - * - * We have room in KnownAssignedXids and in snapshots to hold maxProcs * - * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every master transaction must - * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at - * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these - * records, we mark the subXIDs as children of the top XID in pg_subtrans, - * and then remove them from KnownAssignedXids. This prevents overflow of - * KnownAssignedXids and snapshots, at the cost that status checks for these - * subXIDs will take a slower path through TransactionIdIsInProgress(). - * This means that KnownAssignedXids is not necessarily complete for subXIDs, - * though it should be complete for top-level XIDs; this is the same situation - * that holds with respect to the PGPROC entries in normal running. - * - * When we throw away subXIDs from KnownAssignedXids, we need to keep track of - * that, similarly to tracking overflow of a PGPROC's subxids array. We do - * that by remembering the lastOverflowedXID, ie the last thrown-away subXID. - * As long as that is within the range of interesting XIDs, we have to assume - * that subXIDs are missing from snapshots. (Note that subXID overflow occurs - * on primary when 65th subXID arrives, whereas on standby it occurs when 64th - * subXID arrives - that is not an error.) - * - * Should a backend on primary somehow disappear before it can write an abort - * record, then we just leave those XIDs in KnownAssignedXids. They actually - * aborted but we think they were running; the distinction is irrelevant - * because either way any changes done by the transaction are not visible to - * backends in the standby. We prune KnownAssignedXids when - * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the - * array due to such dead XIDs. - */ - /* * RecordKnownAssignedTransactionIds - * Record the given XID in KnownAssignedXids, as well as any preceding + * Record the given XID in KnownAssignedXids (FIXME: update comment, KnownAssignedXid is no more), as well as any preceding * unobserved XIDs. * * RecordKnownAssignedTransactionIds() should be run for *every* WAL record @@ -3189,7 +2186,7 @@ RecordKnownAssignedTransactionIds(TransactionId xid) TransactionId next_expected_xid; /* - * Extend subtrans like we do in GetNewTransactionId() during normal + * Extend csnlog like we do in GetNewTransactionId() during normal * operation using individual extend steps. Note that we do not need * to extend clog since its extensions are WAL logged. * @@ -3201,28 +2198,11 @@ RecordKnownAssignedTransactionIds(TransactionId xid) while (TransactionIdPrecedes(next_expected_xid, xid)) { TransactionIdAdvance(next_expected_xid); - ExtendSUBTRANS(next_expected_xid); + ExtendCSNLOG(next_expected_xid); } Assert(next_expected_xid == xid); /* - * If the KnownAssignedXids machinery isn't up yet, there's nothing - * more to do since we don't track assigned xids yet. - */ - if (standbyState <= STANDBY_INITIALIZED) - { - latestObservedXid = xid; - return; - } - - /* - * Add (latestObservedXid, xid] onto the KnownAssignedXids array. - */ - next_expected_xid = latestObservedXid; - TransactionIdAdvance(next_expected_xid); - KnownAssignedXidsAdd(next_expected_xid, xid, false); - - /* * Now we can advance latestObservedXid */ latestObservedXid = xid; @@ -3235,726 +2215,3 @@ RecordKnownAssignedTransactionIds(TransactionId xid) LWLockRelease(XidGenLock); } } - -/* - * ExpireTreeKnownAssignedTransactionIds - * Remove the given XIDs from KnownAssignedXids. - * - * Called during recovery in analogy with and in place of ProcArrayEndTransaction() - */ -void -ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids, - TransactionId *subxids, TransactionId max_xid) -{ - Assert(standbyState >= STANDBY_INITIALIZED); - - /* - * Uses same locking as transaction commit - */ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - KnownAssignedXidsRemoveTree(xid, nsubxids, subxids); - - /* As in ProcArrayEndTransaction, advance latestCompletedXid */ - if (TransactionIdPrecedes(ShmemVariableCache->latestCompletedXid, - max_xid)) - ShmemVariableCache->latestCompletedXid = max_xid; - - LWLockRelease(ProcArrayLock); -} - -/* - * ExpireAllKnownAssignedTransactionIds - * Remove all entries in KnownAssignedXids - */ -void -ExpireAllKnownAssignedTransactionIds(void) -{ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - KnownAssignedXidsRemovePreceding(InvalidTransactionId); - LWLockRelease(ProcArrayLock); -} - -/* - * ExpireOldKnownAssignedTransactionIds - * Remove KnownAssignedXids entries preceding the given XID - */ -void -ExpireOldKnownAssignedTransactionIds(TransactionId xid) -{ - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - KnownAssignedXidsRemovePreceding(xid); - LWLockRelease(ProcArrayLock); -} - - -/* - * Private module functions to manipulate KnownAssignedXids - * - * There are 5 main uses of the KnownAssignedXids data structure: - * - * * backends taking snapshots - all valid XIDs need to be copied out - * * backends seeking to determine presence of a specific XID - * * startup process adding new known-assigned XIDs - * * startup process removing specific XIDs as transactions end - * * startup process pruning array when special WAL records arrive - * - * This data structure is known to be a hot spot during Hot Standby, so we - * go to some lengths to make these operations as efficient and as concurrent - * as possible. - * - * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes - * order, to be exact --- to allow binary search for specific XIDs. Note: - * in general TransactionIdPrecedes would not provide a total order, but - * we know that the entries present at any instant should not extend across - * a large enough fraction of XID space to wrap around (the master would - * shut down for fear of XID wrap long before that happens). So it's OK to - * use TransactionIdPrecedes as a binary-search comparator. - * - * It's cheap to maintain the sortedness during insertions, since new known - * XIDs are always reported in XID order; we just append them at the right. - * - * To keep individual deletions cheap, we need to allow gaps in the array. - * This is implemented by marking array elements as valid or invalid using - * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done - * by setting KnownAssignedXidsValid[i] to false, *without* clearing the - * XID entry itself. This preserves the property that the XID entries are - * sorted, so we can do binary searches easily. Periodically we compress - * out the unused entries; that's much cheaper than having to compress the - * array immediately on every deletion. - * - * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[] - * are those with indexes tail <= i < head; items outside this subscript range - * have unspecified contents. When head reaches the end of the array, we - * force compression of unused entries rather than wrapping around, since - * allowing wraparound would greatly complicate the search logic. We maintain - * an explicit tail pointer so that pruning of old XIDs can be done without - * immediately moving the array contents. In most cases only a small fraction - * of the array contains valid entries at any instant. - * - * Although only the startup process can ever change the KnownAssignedXids - * data structure, we still need interlocking so that standby backends will - * not observe invalid intermediate states. The convention is that backends - * must hold shared ProcArrayLock to examine the array. To remove XIDs from - * the array, the startup process must hold ProcArrayLock exclusively, for - * the usual transactional reasons (compare commit/abort of a transaction - * during normal running). Compressing unused entries out of the array - * likewise requires exclusive lock. To add XIDs to the array, we just insert - * them into slots to the right of the head pointer and then advance the head - * pointer. This wouldn't require any lock at all, except that on machines - * with weak memory ordering we need to be careful that other processors - * see the array element changes before they see the head pointer change. - * We handle this by using a spinlock to protect reads and writes of the - * head/tail pointers. (We could dispense with the spinlock if we were to - * create suitable memory access barrier primitives and use those instead.) - * The spinlock must be taken to read or write the head/tail pointers unless - * the caller holds ProcArrayLock exclusively. - * - * Algorithmic analysis: - * - * If we have a maximum of M slots, with N XIDs currently spread across - * S elements then we have N <= S <= M always. - * - * * Adding a new XID is O(1) and needs little locking (unless compression - * must happen) - * * Compressing the array is O(S) and requires exclusive lock - * * Removing an XID is O(logS) and requires exclusive lock - * * Taking a snapshot is O(S) and requires shared lock - * * Checking for an XID is O(logS) and requires shared lock - * - * In comparison, using a hash table for KnownAssignedXids would mean that - * taking snapshots would be O(M). If we can maintain S << M then the - * sorted array technique will deliver significantly faster snapshots. - * If we try to keep S too small then we will spend too much time compressing, - * so there is an optimal point for any workload mix. We use a heuristic to - * decide when to compress the array, though trimming also helps reduce - * frequency of compressing. The heuristic requires us to track the number of - * currently valid XIDs in the array. - */ - - -/* - * Compress KnownAssignedXids by shifting valid data down to the start of the - * array, removing any gaps. - * - * A compression step is forced if "force" is true, otherwise we do it - * only if a heuristic indicates it's a good time to do it. - * - * Caller must hold ProcArrayLock in exclusive mode. - */ -static void -KnownAssignedXidsCompress(bool force) -{ - /* use volatile pointer to prevent code rearrangement */ - volatile ProcArrayStruct *pArray = procArray; - int head, - tail; - int compress_index; - int i; - - /* no spinlock required since we hold ProcArrayLock exclusively */ - head = pArray->headKnownAssignedXids; - tail = pArray->tailKnownAssignedXids; - - if (!force) - { - /* - * If we can choose how much to compress, use a heuristic to avoid - * compressing too often or not often enough. - * - * Heuristic is if we have a large enough current spread and less than - * 50% of the elements are currently in use, then compress. This - * should ensure we compress fairly infrequently. We could compress - * less often though the virtual array would spread out more and - * snapshots would become more expensive. - */ - int nelements = head - tail; - - if (nelements < 4 * PROCARRAY_MAXPROCS || - nelements < 2 * pArray->numKnownAssignedXids) - return; - } - - /* - * We compress the array by reading the valid values from tail to head, - * re-aligning data to 0th element. - */ - compress_index = 0; - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - { - KnownAssignedXids[compress_index] = KnownAssignedXids[i]; - KnownAssignedXidsValid[compress_index] = true; - compress_index++; - } - } - - pArray->tailKnownAssignedXids = 0; - pArray->headKnownAssignedXids = compress_index; -} - -/* - * Add xids into KnownAssignedXids at the head of the array. - * - * xids from from_xid to to_xid, inclusive, are added to the array. - * - * If exclusive_lock is true then caller already holds ProcArrayLock in - * exclusive mode, so we need no extra locking here. Else caller holds no - * lock, so we need to be sure we maintain sufficient interlocks against - * concurrent readers. (Only the startup process ever calls this, so no need - * to worry about concurrent writers.) - */ -static void -KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, - bool exclusive_lock) -{ - /* use volatile pointer to prevent code rearrangement */ - volatile ProcArrayStruct *pArray = procArray; - TransactionId next_xid; - int head, - tail; - int nxids; - int i; - - Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid)); - - /* - * Calculate how many array slots we'll need. Normally this is cheap; in - * the unusual case where the XIDs cross the wrap point, we do it the hard - * way. - */ - if (to_xid >= from_xid) - nxids = to_xid - from_xid + 1; - else - { - nxids = 1; - next_xid = from_xid; - while (TransactionIdPrecedes(next_xid, to_xid)) - { - nxids++; - TransactionIdAdvance(next_xid); - } - } - - /* - * Since only the startup process modifies the head/tail pointers, we - * don't need a lock to read them here. - */ - head = pArray->headKnownAssignedXids; - tail = pArray->tailKnownAssignedXids; - - Assert(head >= 0 && head <= pArray->maxKnownAssignedXids); - Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids); - - /* - * Verify that insertions occur in TransactionId sequence. Note that even - * if the last existing element is marked invalid, it must still have a - * correctly sequenced XID value. - */ - if (head > tail && - TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid)) - { - KnownAssignedXidsDisplay(LOG); - elog(ERROR, "out-of-order XID insertion in KnownAssignedXids"); - } - - /* - * If our xids won't fit in the remaining space, compress out free space - */ - if (head + nxids > pArray->maxKnownAssignedXids) - { - /* must hold lock to compress */ - if (!exclusive_lock) - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - KnownAssignedXidsCompress(true); - - head = pArray->headKnownAssignedXids; - /* note: we no longer care about the tail pointer */ - - if (!exclusive_lock) - LWLockRelease(ProcArrayLock); - - /* - * If it still won't fit then we're out of memory - */ - if (head + nxids > pArray->maxKnownAssignedXids) - elog(ERROR, "too many KnownAssignedXids"); - } - - /* Now we can insert the xids into the space starting at head */ - next_xid = from_xid; - for (i = 0; i < nxids; i++) - { - KnownAssignedXids[head] = next_xid; - KnownAssignedXidsValid[head] = true; - TransactionIdAdvance(next_xid); - head++; - } - - /* Adjust count of number of valid entries */ - pArray->numKnownAssignedXids += nxids; - - /* - * Now update the head pointer. We use a spinlock to protect this - * pointer, not because the update is likely to be non-atomic, but to - * ensure that other processors see the above array updates before they - * see the head pointer change. - * - * If we're holding ProcArrayLock exclusively, there's no need to take the - * spinlock. - */ - if (exclusive_lock) - pArray->headKnownAssignedXids = head; - else - { - SpinLockAcquire(&pArray->known_assigned_xids_lck); - pArray->headKnownAssignedXids = head; - SpinLockRelease(&pArray->known_assigned_xids_lck); - } -} - -/* - * KnownAssignedXidsSearch - * - * Searches KnownAssignedXids for a specific xid and optionally removes it. - * Returns true if it was found, false if not. - * - * Caller must hold ProcArrayLock in shared or exclusive mode. - * Exclusive lock must be held for remove = true. - */ -static bool -KnownAssignedXidsSearch(TransactionId xid, bool remove) -{ - /* use volatile pointer to prevent code rearrangement */ - volatile ProcArrayStruct *pArray = procArray; - int first, - last; - int head; - int tail; - int result_index = -1; - - if (remove) - { - /* we hold ProcArrayLock exclusively, so no need for spinlock */ - tail = pArray->tailKnownAssignedXids; - head = pArray->headKnownAssignedXids; - } - else - { - /* take spinlock to ensure we see up-to-date array contents */ - SpinLockAcquire(&pArray->known_assigned_xids_lck); - tail = pArray->tailKnownAssignedXids; - head = pArray->headKnownAssignedXids; - SpinLockRelease(&pArray->known_assigned_xids_lck); - } - - /* - * Standard binary search. Note we can ignore the KnownAssignedXidsValid - * array here, since even invalid entries will contain sorted XIDs. - */ - first = tail; - last = head - 1; - while (first <= last) - { - int mid_index; - TransactionId mid_xid; - - mid_index = (first + last) / 2; - mid_xid = KnownAssignedXids[mid_index]; - - if (xid == mid_xid) - { - result_index = mid_index; - break; - } - else if (TransactionIdPrecedes(xid, mid_xid)) - last = mid_index - 1; - else - first = mid_index + 1; - } - - if (result_index < 0) - return false; /* not in array */ - - if (!KnownAssignedXidsValid[result_index]) - return false; /* in array, but invalid */ - - if (remove) - { - KnownAssignedXidsValid[result_index] = false; - - pArray->numKnownAssignedXids--; - Assert(pArray->numKnownAssignedXids >= 0); - - /* - * If we're removing the tail element then advance tail pointer over - * any invalid elements. This will speed future searches. - */ - if (result_index == tail) - { - tail++; - while (tail < head && !KnownAssignedXidsValid[tail]) - tail++; - if (tail >= head) - { - /* Array is empty, so we can reset both pointers */ - pArray->headKnownAssignedXids = 0; - pArray->tailKnownAssignedXids = 0; - } - else - { - pArray->tailKnownAssignedXids = tail; - } - } - } - - return true; -} - -/* - * Is the specified XID present in KnownAssignedXids[]? - * - * Caller must hold ProcArrayLock in shared or exclusive mode. - */ -static bool -KnownAssignedXidExists(TransactionId xid) -{ - Assert(TransactionIdIsValid(xid)); - - return KnownAssignedXidsSearch(xid, false); -} - -/* - * Remove the specified XID from KnownAssignedXids[]. - * - * Caller must hold ProcArrayLock in exclusive mode. - */ -static void -KnownAssignedXidsRemove(TransactionId xid) -{ - Assert(TransactionIdIsValid(xid)); - - elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid); - - /* - * Note: we cannot consider it an error to remove an XID that's not - * present. We intentionally remove subxact IDs while processing - * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be - * removed again when the top-level xact commits or aborts. - * - * It might be possible to track such XIDs to distinguish this case from - * actual errors, but it would be complicated and probably not worth it. - * So, just ignore the search result. - */ - (void) KnownAssignedXidsSearch(xid, true); -} - -/* - * KnownAssignedXidsRemoveTree - * Remove xid (if it's not InvalidTransactionId) and all the subxids. - * - * Caller must hold ProcArrayLock in exclusive mode. - */ -static void -KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, - TransactionId *subxids) -{ - int i; - - if (TransactionIdIsValid(xid)) - KnownAssignedXidsRemove(xid); - - for (i = 0; i < nsubxids; i++) - KnownAssignedXidsRemove(subxids[i]); - - /* Opportunistically compress the array */ - KnownAssignedXidsCompress(false); -} - -/* - * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid - * then clear the whole table. - * - * Caller must hold ProcArrayLock in exclusive mode. - */ -static void -KnownAssignedXidsRemovePreceding(TransactionId removeXid) -{ - /* use volatile pointer to prevent code rearrangement */ - volatile ProcArrayStruct *pArray = procArray; - int count = 0; - int head, - tail, - i; - - if (!TransactionIdIsValid(removeXid)) - { - elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids"); - pArray->numKnownAssignedXids = 0; - pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0; - return; - } - - elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid); - - /* - * Mark entries invalid starting at the tail. Since array is sorted, we - * can stop as soon as we reach an entry >= removeXid. - */ - tail = pArray->tailKnownAssignedXids; - head = pArray->headKnownAssignedXids; - - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - { - TransactionId knownXid = KnownAssignedXids[i]; - - if (TransactionIdFollowsOrEquals(knownXid, removeXid)) - break; - - if (!StandbyTransactionIdIsPrepared(knownXid)) - { - KnownAssignedXidsValid[i] = false; - count++; - } - } - } - - pArray->numKnownAssignedXids -= count; - Assert(pArray->numKnownAssignedXids >= 0); - - /* - * Advance the tail pointer if we've marked the tail item invalid. - */ - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - break; - } - if (i >= head) - { - /* Array is empty, so we can reset both pointers */ - pArray->headKnownAssignedXids = 0; - pArray->tailKnownAssignedXids = 0; - } - else - { - pArray->tailKnownAssignedXids = i; - } - - /* Opportunistically compress the array */ - KnownAssignedXidsCompress(false); -} - -/* - * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids. - * We filter out anything >= xmax. - * - * Returns the number of XIDs stored into xarray[]. Caller is responsible - * that array is large enough. - * - * Caller must hold ProcArrayLock in (at least) shared mode. - */ -static int -KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax) -{ - TransactionId xtmp = InvalidTransactionId; - - return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax); -} - -/* - * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus - * we reduce *xmin to the lowest xid value seen if not already lower. - * - * Caller must hold ProcArrayLock in (at least) shared mode. - */ -static int -KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, - TransactionId xmax) -{ - int count = 0; - int head, - tail; - int i; - - /* - * Fetch head just once, since it may change while we loop. We can stop - * once we reach the initially seen head, since we are certain that an xid - * cannot enter and then leave the array while we hold ProcArrayLock. We - * might miss newly-added xids, but they should be >= xmax so irrelevant - * anyway. - * - * Must take spinlock to ensure we see up-to-date array contents. - */ - SpinLockAcquire(&procArray->known_assigned_xids_lck); - tail = procArray->tailKnownAssignedXids; - head = procArray->headKnownAssignedXids; - SpinLockRelease(&procArray->known_assigned_xids_lck); - - for (i = tail; i < head; i++) - { - /* Skip any gaps in the array */ - if (KnownAssignedXidsValid[i]) - { - TransactionId knownXid = KnownAssignedXids[i]; - - /* - * Update xmin if required. Only the first XID need be checked, - * since the array is sorted. - */ - if (count == 0 && - TransactionIdPrecedes(knownXid, *xmin)) - *xmin = knownXid; - - /* - * Filter out anything >= xmax, again relying on sorted property - * of array. - */ - if (TransactionIdIsValid(xmax) && - TransactionIdFollowsOrEquals(knownXid, xmax)) - break; - - /* Add knownXid into output array */ - xarray[count++] = knownXid; - } - } - - return count; -} - -/* - * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId - * if nothing there. - */ -static TransactionId -KnownAssignedXidsGetOldestXmin(void) -{ - int head, - tail; - int i; - - /* - * Fetch head just once, since it may change while we loop. - */ - SpinLockAcquire(&procArray->known_assigned_xids_lck); - tail = procArray->tailKnownAssignedXids; - head = procArray->headKnownAssignedXids; - SpinLockRelease(&procArray->known_assigned_xids_lck); - - for (i = tail; i < head; i++) - { - /* Skip any gaps in the array */ - if (KnownAssignedXidsValid[i]) - return KnownAssignedXids[i]; - } - - return InvalidTransactionId; -} - -/* - * Display KnownAssignedXids to provide debug trail - * - * Currently this is only called within startup process, so we need no - * special locking. - * - * Note this is pretty expensive, and much of the expense will be incurred - * even if the elog message will get discarded. It's not currently called - * in any performance-critical places, however, so no need to be tenser. - */ -static void -KnownAssignedXidsDisplay(int trace_level) -{ - /* use volatile pointer to prevent code rearrangement */ - volatile ProcArrayStruct *pArray = procArray; - StringInfoData buf; - int head, - tail, - i; - int nxids = 0; - - tail = pArray->tailKnownAssignedXids; - head = pArray->headKnownAssignedXids; - - initStringInfo(&buf); - - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - { - nxids++; - appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]); - } - } - - elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s", - nxids, - pArray->numKnownAssignedXids, - pArray->tailKnownAssignedXids, - pArray->headKnownAssignedXids, - buf.data); - - pfree(buf.data); -} - -/* - * KnownAssignedXidsReset - * Resets KnownAssignedXids to be empty - */ -static void -KnownAssignedXidsReset(void) -{ - /* use volatile pointer to prevent code rearrangement */ - volatile ProcArrayStruct *pArray = procArray; - - LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); - - pArray->numKnownAssignedXids = 0; - pArray->tailKnownAssignedXids = 0; - pArray->headKnownAssignedXids = 0; - - LWLockRelease(ProcArrayLock); -} diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 22522676f3..476ec5b9c5 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -65,7 +65,7 @@ #include "postgres.h" -#include "access/transam.h" +#include "access/mvccvars.h" #include "miscadmin.h" #include "storage/lwlock.h" #include "storage/pg_shmem.h" diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index d491ece60a..0ee15efaff 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -101,9 +101,6 @@ InitRecoveryTransactionEnvironment(void) void ShutdownRecoveryTransactionEnvironment(void) { - /* Mark all tracked in-progress transactions as finished. */ - ExpireAllKnownAssignedTransactionIds(); - /* Release all locks the tracked transactions were holding */ StandbyReleaseAllLocks(); @@ -309,7 +306,7 @@ ResolveRecoveryConflictWithTablespace(Oid tsid) * * We don't wait for commit because drop tablespace is non-transactional. */ - temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId, + temp_file_users = GetConflictingVirtualXIDs(InvalidCommitSeqNo, InvalidOid); ResolveRecoveryConflictWithVirtualXIDs(temp_file_users, PROCSIG_RECOVERY_CONFLICT_TABLESPACE); @@ -606,8 +603,7 @@ StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid) /* Already processed? */ if (!TransactionIdIsValid(xid) || - TransactionIdDidCommit(xid) || - TransactionIdDidAbort(xid)) + TransactionIdGetStatus(xid) != XID_INPROGRESS) return; elog(trace_recovery(DEBUG4), @@ -722,7 +718,7 @@ StandbyReleaseAllLocks(void) * as long as they're not prepared transactions. */ void -StandbyReleaseOldLocks(int nxids, TransactionId *xids) +StandbyReleaseOldLocks(TransactionId oldestRunningXid) { ListCell *cell, *prev, @@ -741,26 +737,8 @@ StandbyReleaseOldLocks(int nxids, TransactionId *xids) if (StandbyTransactionIdIsPrepared(lock->xid)) remove = false; - else - { - int i; - bool found = false; - - for (i = 0; i < nxids; i++) - { - if (lock->xid == xids[i]) - { - found = true; - break; - } - } - - /* - * If its not a running transaction, remove it. - */ - if (!found) - remove = true; - } + else if (TransactionIdPrecedes(lock->xid, oldestRunningXid)) + remove = true; if (remove) { @@ -815,13 +793,8 @@ standby_redo(XLogReaderState *record) xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record); RunningTransactionsData running; - running.xcnt = xlrec->xcnt; - running.subxcnt = xlrec->subxcnt; - running.subxid_overflow = xlrec->subxid_overflow; running.nextXid = xlrec->nextXid; - running.latestCompletedXid = xlrec->latestCompletedXid; running.oldestRunningXid = xlrec->oldestRunningXid; - running.xids = xlrec->xids; ProcArrayApplyRecoveryInfo(&running); } @@ -929,27 +902,8 @@ LogStandbySnapshot(void) */ running = GetRunningTransactionData(); - /* - * GetRunningTransactionData() acquired ProcArrayLock, we must release it. - * For Hot Standby this can be done before inserting the WAL record - * because ProcArrayApplyRecoveryInfo() rechecks the commit status using - * the clog. For logical decoding, though, the lock can't be released - * early because the clog might be "in the future" from the POV of the - * historic snapshot. This would allow for situations where we're waiting - * for the end of a transaction listed in the xl_running_xacts record - * which, according to the WAL, has committed before the xl_running_xacts - * record. Fortunately this routine isn't executed frequently, and it's - * only a shared lock. - */ - if (wal_level < WAL_LEVEL_LOGICAL) - LWLockRelease(ProcArrayLock); - recptr = LogCurrentRunningXacts(running); - /* Release lock if we kept it longer ... */ - if (wal_level >= WAL_LEVEL_LOGICAL) - LWLockRelease(ProcArrayLock); - /* GetRunningTransactionData() acquired XidGenLock, we must release it */ LWLockRelease(XidGenLock); @@ -971,41 +925,21 @@ LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) xl_running_xacts xlrec; XLogRecPtr recptr; - xlrec.xcnt = CurrRunningXacts->xcnt; - xlrec.subxcnt = CurrRunningXacts->subxcnt; - xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow; xlrec.nextXid = CurrRunningXacts->nextXid; xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; - xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; /* Header */ XLogBeginInsert(); XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); - XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts); - - /* array of TransactionIds */ - if (xlrec.xcnt > 0) - XLogRegisterData((char *) CurrRunningXacts->xids, - (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); + XLogRegisterData((char *) (&xlrec), SizeOfXactRunningXacts); recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); - if (CurrRunningXacts->subxid_overflow) - elog(trace_recovery(DEBUG2), - "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)", - CurrRunningXacts->xcnt, - (uint32) (recptr >> 32), (uint32) recptr, - CurrRunningXacts->oldestRunningXid, - CurrRunningXacts->latestCompletedXid, - CurrRunningXacts->nextXid); - else - elog(trace_recovery(DEBUG2), - "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", - CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, - (uint32) (recptr >> 32), (uint32) recptr, - CurrRunningXacts->oldestRunningXid, - CurrRunningXacts->latestCompletedXid, - CurrRunningXacts->nextXid); + elog(trace_recovery(DEBUG2), + "snapshot of running transaction ids (lsn %X/%X oldest xid %u next xid %u)", + (uint32) (recptr >> 32), (uint32) recptr, + CurrRunningXacts->oldestRunningXid, + CurrRunningXacts->nextXid); /* * Ensure running_xacts information is synced to disk not too far in the diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index da5679b7a3..3ebb58649f 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -579,6 +579,8 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, for (;;) { + TransactionId parentXid; + Assert(TransactionIdIsValid(xid)); Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny())); @@ -588,9 +590,23 @@ XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, LockRelease(&tag, ShareLock, false); - if (!TransactionIdIsInProgress(xid)) + /* + * Ok, this xid is not running anymore. But it might be a + * subtransaction whose parent is still running. + */ + CommitSeqNo csn = TransactionIdGetCommitSeqNo(xid); + if (COMMITSEQNO_IS_COMMITTED(csn) || COMMITSEQNO_IS_ABORTED(csn)) + break; + + parentXid = SubTransGetParent(xid); + if (parentXid == InvalidTransactionId) + { + csn = TransactionIdGetCommitSeqNo(xid); + Assert(COMMITSEQNO_IS_COMMITTED(csn) || COMMITSEQNO_IS_ABORTED(csn)); break; - xid = SubTransGetParent(xid); + } + + xid = parentXid; } if (oper != XLTW_None) @@ -607,6 +623,7 @@ bool ConditionalXactLockTableWait(TransactionId xid) { LOCKTAG tag; + TransactionId parentXid; for (;;) { @@ -620,9 +637,23 @@ ConditionalXactLockTableWait(TransactionId xid) LockRelease(&tag, ShareLock, false); - if (!TransactionIdIsInProgress(xid)) + /* + * Ok, this xid is not running anymore. But it might be a + * subtransaction whose parent is still running. + */ + CommitSeqNo csn = TransactionIdGetCommitSeqNo(xid); + if (COMMITSEQNO_IS_COMMITTED(csn) || COMMITSEQNO_IS_ABORTED(csn)) break; - xid = SubTransGetParent(xid); + + parentXid = SubTransGetParent(xid); + if (parentXid == InvalidTransactionId) + { + csn = TransactionIdGetCommitSeqNo(xid); + Assert(COMMITSEQNO_IS_COMMITTED(csn) || COMMITSEQNO_IS_ABORTED(csn)); + break; + } + + xid = parentXid; } return true; diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index e6025ecedb..75af22ec8a 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -16,7 +16,7 @@ WALWriteLock 8 ControlFileLock 9 CheckpointLock 10 CLogControlLock 11 -SubtransControlLock 12 +CSNLogControlLock 12 MultiXactGenLock 13 MultiXactOffsetControlLock 14 MultiXactMemberControlLock 15 @@ -47,6 +47,8 @@ CommitTsLock 39 ReplicationOriginLock 40 MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 -BackendRandomLock 43 -LogicalRepWorkerLock 44 -CLogTruncationLock 45 +CommitSeqNoLock 43 +BackendRandomLock 44 + +LogicalRepWorkerLock 45 +CLogTruncationLock 46 diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c index 251a359bff..966fd36156 100644 --- a/src/backend/storage/lmgr/predicate.c +++ b/src/backend/storage/lmgr/predicate.c @@ -185,7 +185,9 @@ #include "postgres.h" +#include "access/clog.h" #include "access/htup_details.h" +#include "access/mvccvars.h" #include "access/slru.h" #include "access/subtrans.h" #include "access/transam.h" @@ -3902,7 +3904,7 @@ static bool XidIsConcurrent(TransactionId xid) { Snapshot snap; - uint32 i; + XLogRecPtr csn; Assert(TransactionIdIsValid(xid)); Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny())); @@ -3915,11 +3917,11 @@ XidIsConcurrent(TransactionId xid) if (TransactionIdFollowsOrEquals(xid, snap->xmax)) return true; - for (i = 0; i < snap->xcnt; i++) - { - if (xid == snap->xip[i]) - return true; - } + csn = TransactionIdGetCommitSeqNo(xid); + if (COMMITSEQNO_IS_INPROGRESS(csn)) + return true; + if (COMMITSEQNO_IS_COMMITTED(csn)) + return csn >= snap->snapshotcsn; return false; } diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 5f6727d501..121cd93013 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -365,7 +365,7 @@ InitProcess(void) MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyPgXact->xid = InvalidTransactionId; - MyPgXact->xmin = InvalidTransactionId; + MyPgXact->snapshotcsn = InvalidCommitSeqNo; MyProc->pid = MyProcPid; /* backendId, databaseId and roleId will be filled in later */ MyProc->backendId = InvalidBackendId; @@ -412,9 +412,10 @@ InitProcess(void) /* Initialize fields for group transaction status update. */ MyProc->clogGroupMember = false; MyProc->clogGroupMemberXid = InvalidTransactionId; - MyProc->clogGroupMemberXidStatus = TRANSACTION_STATUS_IN_PROGRESS; + MyProc->clogGroupMemberXidStatus = CLOG_XID_STATUS_IN_PROGRESS; MyProc->clogGroupMemberPage = -1; MyProc->clogGroupMemberLsn = InvalidXLogRecPtr; + MyProc->clogGroupNSubxids = 0; pg_atomic_init_u32(&MyProc->clogGroupNext, INVALID_PGPROCNO); /* @@ -548,7 +549,7 @@ InitAuxiliaryProcess(void) MyProc->fpVXIDLock = false; MyProc->fpLocalTransactionId = InvalidLocalTransactionId; MyPgXact->xid = InvalidTransactionId; - MyPgXact->xmin = InvalidTransactionId; + MyPgXact->snapshotcsn = InvalidCommitSeqNo; MyProc->backendId = InvalidBackendId; MyProc->databaseId = InvalidOid; MyProc->roleId = InvalidOid; @@ -779,7 +780,7 @@ static void RemoveProcFromArray(int code, Datum arg) { Assert(MyProc != NULL); - ProcArrayRemove(MyProc, InvalidTransactionId); + ProcArrayRemove(MyProc); } /* diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index edff6da410..3780f951b3 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -130,6 +130,7 @@ #include "parser/parse_coerce.h" #include "parser/parsetree.h" #include "statistics/statistics.h" +#include "storage/procarray.h" #include "utils/acl.h" #include "utils/builtins.h" #include "utils/bytea.h" @@ -5469,7 +5470,7 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata, slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRel)); econtext->ecxt_scantuple = slot; get_typlenbyval(vardata->atttype, &typLen, &typByVal); - InitNonVacuumableSnapshot(SnapshotNonVacuumable, RecentGlobalXmin); + InitNonVacuumableSnapshot(SnapshotNonVacuumable, GetRecentGlobalXmin()); /* set up an IS NOT NULL scan key so that we ignore nulls */ ScanKeyEntryInitialize(&scankeys[0], diff --git a/src/backend/utils/adt/txid.c b/src/backend/utils/adt/txid.c index 9d312edf04..16a3663f1e 100644 --- a/src/backend/utils/adt/txid.c +++ b/src/backend/utils/adt/txid.c @@ -22,6 +22,7 @@ #include "postgres.h" #include "access/clog.h" +#include "access/mvccvars.h" #include "access/transam.h" #include "access/xact.h" #include "access/xlog.h" @@ -53,6 +54,8 @@ typedef uint64 txid; /* * Snapshot containing 8byte txids. + * + * FIXME: this could be a fixed-length datatype now. */ typedef struct { @@ -63,17 +66,16 @@ typedef struct */ int32 __varsz; - uint32 nxip; /* number of txids in xip array */ - txid xmin; txid xmax; - /* in-progress txids, xmin <= xip[i] < xmax: */ - txid xip[FLEXIBLE_ARRAY_MEMBER]; + /* + * FIXME: this is change in on-disk format if someone created a column + * with txid datatype. Dump+reload won't load either. + */ + CommitSeqNo snapshotcsn; } TxidSnapshot; -#define TXID_SNAPSHOT_SIZE(nxip) \ - (offsetof(TxidSnapshot, xip) + sizeof(txid) * (nxip)) -#define TXID_SNAPSHOT_MAX_NXIP \ - ((MaxAllocSize - offsetof(TxidSnapshot, xip)) / sizeof(txid)) +#define TXID_SNAPSHOT_SIZE \ + (offsetof(TxidSnapshot, snapshotcsn) + sizeof(CommitSeqNo)) /* * Epoch values from xact.c @@ -183,60 +185,12 @@ convert_xid(TransactionId xid, const TxidEpoch *state) } /* - * txid comparator for qsort/bsearch - */ -static int -cmp_txid(const void *aa, const void *bb) -{ - txid a = *(const txid *) aa; - txid b = *(const txid *) bb; - - if (a < b) - return -1; - if (a > b) - return 1; - return 0; -} - -/* - * Sort a snapshot's txids, so we can use bsearch() later. Also remove - * any duplicates. - * - * For consistency of on-disk representation, we always sort even if bsearch - * will not be used. - */ -static void -sort_snapshot(TxidSnapshot *snap) -{ - txid last = 0; - int nxip, - idx1, - idx2; - - if (snap->nxip > 1) - { - qsort(snap->xip, snap->nxip, sizeof(txid), cmp_txid); - - /* remove duplicates */ - nxip = snap->nxip; - idx1 = idx2 = 0; - while (idx1 < nxip) - { - if (snap->xip[idx1] != last) - last = snap->xip[idx2++] = snap->xip[idx1]; - else - snap->nxip--; - idx1++; - } - } -} - -/* * check txid visibility. */ static bool is_visible_txid(txid value, const TxidSnapshot *snap) { +#ifdef BROKEN if (value < snap->xmin) return true; else if (value >= snap->xmax) @@ -262,50 +216,8 @@ is_visible_txid(txid value, const TxidSnapshot *snap) } return true; } -} - -/* - * helper functions to use StringInfo for TxidSnapshot creation. - */ - -static StringInfo -buf_init(txid xmin, txid xmax) -{ - TxidSnapshot snap; - StringInfo buf; - - snap.xmin = xmin; - snap.xmax = xmax; - snap.nxip = 0; - - buf = makeStringInfo(); - appendBinaryStringInfo(buf, (char *) &snap, TXID_SNAPSHOT_SIZE(0)); - return buf; -} - -static void -buf_add_txid(StringInfo buf, txid xid) -{ - TxidSnapshot *snap = (TxidSnapshot *) buf->data; - - /* do this before possible realloc */ - snap->nxip++; - - appendBinaryStringInfo(buf, (char *) &xid, sizeof(xid)); -} - -static TxidSnapshot * -buf_finalize(StringInfo buf) -{ - TxidSnapshot *snap = (TxidSnapshot *) buf->data; - - SET_VARSIZE(snap, buf->len); - - /* buf is not needed anymore */ - buf->data = NULL; - pfree(buf); - - return snap; +#endif + return false; } /* @@ -350,54 +262,29 @@ str2txid(const char *s, const char **endp) static TxidSnapshot * parse_snapshot(const char *str) { - txid xmin; - txid xmax; - txid last_val = 0, - val; const char *str_start = str; const char *endp; - StringInfo buf; + TxidSnapshot *snap; + uint32 csn_hi, + csn_lo; - xmin = str2txid(str, &endp); - if (*endp != ':') - goto bad_format; - str = endp + 1; + snap = palloc0(TXID_SNAPSHOT_SIZE); + SET_VARSIZE(snap, TXID_SNAPSHOT_SIZE); - xmax = str2txid(str, &endp); + snap->xmax = str2txid(str, &endp); if (*endp != ':') goto bad_format; str = endp + 1; /* it should look sane */ - if (xmin == 0 || xmax == 0 || xmin > xmax) + if (snap->xmax == 0) goto bad_format; - /* allocate buffer */ - buf = buf_init(xmin, xmax); - - /* loop over values */ - while (*str != '\0') - { - /* read next value */ - val = str2txid(str, &endp); - str = endp; - - /* require the input to be in order */ - if (val < xmin || val >= xmax || val < last_val) - goto bad_format; - - /* skip duplicates */ - if (val != last_val) - buf_add_txid(buf, val); - last_val = val; - - if (*str == ',') - str++; - else if (*str != '\0') - goto bad_format; - } + if (sscanf(str, "%X/%X", &csn_hi, &csn_lo) != 2) + goto bad_format; + snap->snapshotcsn = ((uint64) csn_hi) << 32 | csn_lo; - return buf_finalize(buf); + return snap; bad_format: ereport(ERROR, @@ -477,8 +364,6 @@ Datum txid_current_snapshot(PG_FUNCTION_ARGS) { TxidSnapshot *snap; - uint32 nxip, - i; TxidEpoch state; Snapshot cur; @@ -488,35 +373,13 @@ txid_current_snapshot(PG_FUNCTION_ARGS) load_xid_epoch(&state); - /* - * Compile-time limits on the procarray (MAX_BACKENDS processes plus - * MAX_BACKENDS prepared transactions) guarantee nxip won't be too large. - */ - StaticAssertStmt(MAX_BACKENDS * 2 <= TXID_SNAPSHOT_MAX_NXIP, - "possible overflow in txid_current_snapshot()"); - /* allocate */ - nxip = cur->xcnt; - snap = palloc(TXID_SNAPSHOT_SIZE(nxip)); + snap = palloc(TXID_SNAPSHOT_SIZE); + SET_VARSIZE(snap, TXID_SNAPSHOT_SIZE); /* fill */ - snap->xmin = convert_xid(cur->xmin, &state); snap->xmax = convert_xid(cur->xmax, &state); - snap->nxip = nxip; - for (i = 0; i < nxip; i++) - snap->xip[i] = convert_xid(cur->xip[i], &state); - - /* - * We want them guaranteed to be in ascending order. This also removes - * any duplicate xids. Normally, an XID can only be assigned to one - * backend, but when preparing a transaction for two-phase commit, there - * is a transient state when both the original backend and the dummy - * PGPROC entry reserved for the prepared transaction hold the same XID. - */ - sort_snapshot(snap); - - /* set size after sorting, because it may have removed duplicate xips */ - SET_VARSIZE(snap, TXID_SNAPSHOT_SIZE(snap->nxip)); + snap->snapshotcsn = cur->snapshotcsn; PG_RETURN_POINTER(snap); } @@ -547,19 +410,12 @@ txid_snapshot_out(PG_FUNCTION_ARGS) { TxidSnapshot *snap = (TxidSnapshot *) PG_GETARG_VARLENA_P(0); StringInfoData str; - uint32 i; initStringInfo(&str); - appendStringInfo(&str, TXID_FMT ":", snap->xmin); appendStringInfo(&str, TXID_FMT ":", snap->xmax); - - for (i = 0; i < snap->nxip; i++) - { - if (i > 0) - appendStringInfoChar(&str, ','); - appendStringInfo(&str, TXID_FMT, snap->xip[i]); - } + appendStringInfo(&str, "%X/%X", (uint32) (snap->snapshotcsn >> 32), + (uint32) snap->snapshotcsn); PG_RETURN_CSTRING(str.data); } @@ -574,6 +430,7 @@ txid_snapshot_out(PG_FUNCTION_ARGS) Datum txid_snapshot_recv(PG_FUNCTION_ARGS) { +#ifdef BROKEN StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); TxidSnapshot *snap; txid last = 0; @@ -582,11 +439,6 @@ txid_snapshot_recv(PG_FUNCTION_ARGS) txid xmin, xmax; - /* load and validate nxip */ - nxip = pq_getmsgint(buf, 4); - if (nxip < 0 || nxip > TXID_SNAPSHOT_MAX_NXIP) - goto bad_format; - xmin = pq_getmsgint64(buf); xmax = pq_getmsgint64(buf); if (xmin == 0 || xmax == 0 || xmin > xmax || xmax > MAX_TXID) @@ -619,6 +471,7 @@ txid_snapshot_recv(PG_FUNCTION_ARGS) PG_RETURN_POINTER(snap); bad_format: +#endif ereport(ERROR, (errcode(ERRCODE_INVALID_BINARY_REPRESENTATION), errmsg("invalid external txid_snapshot data"))); @@ -637,14 +490,13 @@ txid_snapshot_send(PG_FUNCTION_ARGS) { TxidSnapshot *snap = (TxidSnapshot *) PG_GETARG_VARLENA_P(0); StringInfoData buf; - uint32 i; pq_begintypsend(&buf); - pq_sendint32(&buf, snap->nxip); +#ifdef BROKEN pq_sendint64(&buf, snap->xmin); pq_sendint64(&buf, snap->xmax); - for (i = 0; i < snap->nxip; i++) - pq_sendint64(&buf, snap->xip[i]); +#endif + pq_sendint64(&buf, snap->snapshotcsn); PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); } @@ -665,14 +517,18 @@ txid_visible_in_snapshot(PG_FUNCTION_ARGS) /* * txid_snapshot_xmin(txid_snapshot) returns int8 * - * return snapshot's xmin + * return snapshot's xmin */ Datum txid_snapshot_xmin(PG_FUNCTION_ARGS) { + /* FIXME: we don't store xmin in the TxidSnapshot anymore. Maybe we still should? */ +#ifdef BROKEN TxidSnapshot *snap = (TxidSnapshot *) PG_GETARG_VARLENA_P(0); PG_RETURN_INT64(snap->xmin); +#endif + PG_RETURN_INT64(0); } /* @@ -687,47 +543,6 @@ txid_snapshot_xmax(PG_FUNCTION_ARGS) PG_RETURN_INT64(snap->xmax); } - -/* - * txid_snapshot_xip(txid_snapshot) returns setof int8 - * - * return in-progress TXIDs in snapshot. - */ -Datum -txid_snapshot_xip(PG_FUNCTION_ARGS) -{ - FuncCallContext *fctx; - TxidSnapshot *snap; - txid value; - - /* on first call initialize snap_state and get copy of snapshot */ - if (SRF_IS_FIRSTCALL()) - { - TxidSnapshot *arg = (TxidSnapshot *) PG_GETARG_VARLENA_P(0); - - fctx = SRF_FIRSTCALL_INIT(); - - /* make a copy of user snapshot */ - snap = MemoryContextAlloc(fctx->multi_call_memory_ctx, VARSIZE(arg)); - memcpy(snap, arg, VARSIZE(arg)); - - fctx->user_fctx = snap; - } - - /* return values one-by-one */ - fctx = SRF_PERCALL_SETUP(); - snap = fctx->user_fctx; - if (fctx->call_cntr < snap->nxip) - { - value = snap->xip[fctx->call_cntr]; - SRF_RETURN_NEXT(fctx, Int64GetDatum(value)); - } - else - { - SRF_RETURN_DONE(fctx); - } -} - /* * Report the status of a recent transaction ID, or null for wrapped, * truncated away or otherwise too old XIDs. diff --git a/src/backend/utils/probes.d b/src/backend/utils/probes.d index 214dc712ca..c58d6adb6f 100644 --- a/src/backend/utils/probes.d +++ b/src/backend/utils/probes.d @@ -75,6 +75,8 @@ provider postgresql { probe checkpoint__done(int, int, int, int, int); probe clog__checkpoint__start(bool); probe clog__checkpoint__done(bool); + probe csnlog__checkpoint__start(bool); + probe csnlog__checkpoint__done(bool); probe subtrans__checkpoint__start(bool); probe subtrans__checkpoint__done(bool); probe multixact__checkpoint__start(bool); diff --git a/src/backend/utils/time/snapmgr.c b/src/backend/utils/time/snapmgr.c index addf87dc3b..c137325db1 100644 --- a/src/backend/utils/time/snapmgr.c +++ b/src/backend/utils/time/snapmgr.c @@ -152,19 +152,11 @@ static Snapshot CatalogSnapshot = NULL; static Snapshot HistoricSnapshot = NULL; /* - * These are updated by GetSnapshotData. We initialize them this way - * for the convenience of TransactionIdIsInProgress: even in bootstrap - * mode, we don't want it to say that BootstrapTransactionId is in progress. - * - * RecentGlobalXmin and RecentGlobalDataXmin are initialized to - * InvalidTransactionId, to ensure that no one tries to use a stale - * value. Readers should ensure that it has been set to something else - * before using it. + * These are updated by GetSnapshotData. We initialize them this way, + * because even in bootstrap mode, we don't want it to say that + * BootstrapTransactionId is in progress. */ TransactionId TransactionXmin = FirstNormalTransactionId; -TransactionId RecentXmin = FirstNormalTransactionId; -TransactionId RecentGlobalXmin = InvalidTransactionId; -TransactionId RecentGlobalDataXmin = InvalidTransactionId; /* (table, ctid) => (cmin, cmax) mapping during timetravel */ static HTAB *tuplecid_data = NULL; @@ -238,9 +230,7 @@ typedef struct SerializedSnapshotData { TransactionId xmin; TransactionId xmax; - uint32 xcnt; - int32 subxcnt; - bool suboverflowed; + CommitSeqNo snapshotcsn; bool takenDuringRecovery; CommandId curcid; TimestampTz whenTaken; @@ -579,26 +569,18 @@ SetTransactionSnapshot(Snapshot sourcesnap, VirtualTransactionId *sourcevxid, * Even though we are not going to use the snapshot it computes, we must * call GetSnapshotData, for two reasons: (1) to be sure that * CurrentSnapshotData's XID arrays have been allocated, and (2) to update - * RecentXmin and RecentGlobalXmin. (We could alternatively include those + * RecentGlobalXmin. (We could alternatively include those * two variables in exported snapshot files, but it seems better to have * snapshot importers compute reasonably up-to-date values for them.) + * + * FIXME: neither of those reasons hold anymore. Can we drop this? */ CurrentSnapshot = GetSnapshotData(&CurrentSnapshotData); /* * Now copy appropriate fields from the source snapshot. */ - CurrentSnapshot->xmin = sourcesnap->xmin; CurrentSnapshot->xmax = sourcesnap->xmax; - CurrentSnapshot->xcnt = sourcesnap->xcnt; - Assert(sourcesnap->xcnt <= GetMaxSnapshotXidCount()); - memcpy(CurrentSnapshot->xip, sourcesnap->xip, - sourcesnap->xcnt * sizeof(TransactionId)); - CurrentSnapshot->subxcnt = sourcesnap->subxcnt; - Assert(sourcesnap->subxcnt <= GetMaxSnapshotSubxidCount()); - memcpy(CurrentSnapshot->subxip, sourcesnap->subxip, - sourcesnap->subxcnt * sizeof(TransactionId)); - CurrentSnapshot->suboverflowed = sourcesnap->suboverflowed; CurrentSnapshot->takenDuringRecovery = sourcesnap->takenDuringRecovery; /* NB: curcid should NOT be copied, it's a local matter */ @@ -660,50 +642,17 @@ static Snapshot CopySnapshot(Snapshot snapshot) { Snapshot newsnap; - Size subxipoff; - Size size; Assert(snapshot != InvalidSnapshot); /* We allocate any XID arrays needed in the same palloc block. */ - size = subxipoff = sizeof(SnapshotData) + - snapshot->xcnt * sizeof(TransactionId); - if (snapshot->subxcnt > 0) - size += snapshot->subxcnt * sizeof(TransactionId); - - newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, size); + newsnap = (Snapshot) MemoryContextAlloc(TopTransactionContext, sizeof(SnapshotData)); memcpy(newsnap, snapshot, sizeof(SnapshotData)); newsnap->regd_count = 0; newsnap->active_count = 0; newsnap->copied = true; - /* setup XID array */ - if (snapshot->xcnt > 0) - { - newsnap->xip = (TransactionId *) (newsnap + 1); - memcpy(newsnap->xip, snapshot->xip, - snapshot->xcnt * sizeof(TransactionId)); - } - else - newsnap->xip = NULL; - - /* - * Setup subXID array. Don't bother to copy it if it had overflowed, - * though, because it's not used anywhere in that case. Except if it's a - * snapshot taken during recovery; all the top-level XIDs are in subxip as - * well in that case, so we mustn't lose them. - */ - if (snapshot->subxcnt > 0 && - (!snapshot->suboverflowed || snapshot->takenDuringRecovery)) - { - newsnap->subxip = (TransactionId *) ((char *) newsnap + subxipoff); - memcpy(newsnap->subxip, snapshot->subxip, - snapshot->subxcnt * sizeof(TransactionId)); - } - else - newsnap->subxip = NULL; - return newsnap; } @@ -984,7 +933,7 @@ SnapshotResetXmin(void) if (pairingheap_is_empty(&RegisteredSnapshots)) { - MyPgXact->xmin = InvalidTransactionId; + ProcArrayResetXmin(MyProc); return; } @@ -992,7 +941,7 @@ SnapshotResetXmin(void) pairingheap_first(&RegisteredSnapshots)); if (TransactionIdPrecedes(MyPgXact->xmin, minSnapshot->xmin)) - MyPgXact->xmin = minSnapshot->xmin; + ProcArrayResetXmin(MyProc); } /* @@ -1159,13 +1108,8 @@ char * ExportSnapshot(Snapshot snapshot) { TransactionId topXid; - TransactionId *children; - ExportedSnapshot *esnap; - int nchildren; - int addTopXid; StringInfoData buf; FILE *f; - int i; MemoryContext oldcxt; char path[MAXPGPATH]; char pathtmp[MAXPGPATH]; @@ -1185,9 +1129,9 @@ ExportSnapshot(Snapshot snapshot) */ /* - * Get our transaction ID if there is one, to include in the snapshot. + * This will assign a transaction ID if we do not yet have one. */ - topXid = GetTopTransactionIdIfAny(); + topXid = GetTopTransactionId(); /* * We cannot export a snapshot from a subtransaction because there's no @@ -1200,20 +1144,6 @@ ExportSnapshot(Snapshot snapshot) errmsg("cannot export a snapshot from a subtransaction"))); /* - * We do however allow previous committed subtransactions to exist. - * Importers of the snapshot must see them as still running, so get their - * XIDs to add them to the snapshot. - */ - nchildren = xactGetCommittedChildren(&children); - - /* - * Generate file path for the snapshot. We start numbering of snapshots - * inside the transaction from 1. - */ - snprintf(path, sizeof(path), SNAPSHOT_EXPORT_DIR "/%08X-%08X-%d", - MyProc->backendId, MyProc->lxid, list_length(exportedSnapshots) + 1); - - /* * Copy the snapshot into TopTransactionContext, add it to the * exportedSnapshots list, and mark it pseudo-registered. We do this to * ensure that the snapshot's xmin is honored for the rest of the @@ -1222,10 +1152,7 @@ ExportSnapshot(Snapshot snapshot) snapshot = CopySnapshot(snapshot); oldcxt = MemoryContextSwitchTo(TopTransactionContext); - esnap = (ExportedSnapshot *) palloc(sizeof(ExportedSnapshot)); - esnap->snapfile = pstrdup(path); - esnap->snapshot = snapshot; - exportedSnapshots = lappend(exportedSnapshots, esnap); + exportedSnapshots = lappend(exportedSnapshots, snapshot); MemoryContextSwitchTo(oldcxt); snapshot->regd_count++; @@ -1238,7 +1165,7 @@ ExportSnapshot(Snapshot snapshot) */ initStringInfo(&buf); - appendStringInfo(&buf, "vxid:%d/%u\n", MyProc->backendId, MyProc->lxid); + appendStringInfo(&buf, "xid:%u\n", topXid); appendStringInfo(&buf, "pid:%d\n", MyProcPid); appendStringInfo(&buf, "dbid:%u\n", MyDatabaseId); appendStringInfo(&buf, "iso:%d\n", XactIsoLevel); @@ -1247,42 +1174,10 @@ ExportSnapshot(Snapshot snapshot) appendStringInfo(&buf, "xmin:%u\n", snapshot->xmin); appendStringInfo(&buf, "xmax:%u\n", snapshot->xmax); - /* - * We must include our own top transaction ID in the top-xid data, since - * by definition we will still be running when the importing transaction - * adopts the snapshot, but GetSnapshotData never includes our own XID in - * the snapshot. (There must, therefore, be enough room to add it.) - * - * However, it could be that our topXid is after the xmax, in which case - * we shouldn't include it because xip[] members are expected to be before - * xmax. (We need not make the same check for subxip[] members, see - * snapshot.h.) - */ - addTopXid = (TransactionIdIsValid(topXid) && - TransactionIdPrecedes(topXid, snapshot->xmax)) ? 1 : 0; - appendStringInfo(&buf, "xcnt:%d\n", snapshot->xcnt + addTopXid); - for (i = 0; i < snapshot->xcnt; i++) - appendStringInfo(&buf, "xip:%u\n", snapshot->xip[i]); - if (addTopXid) - appendStringInfo(&buf, "xip:%u\n", topXid); - - /* - * Similarly, we add our subcommitted child XIDs to the subxid data. Here, - * we have to cope with possible overflow. - */ - if (snapshot->suboverflowed || - snapshot->subxcnt + nchildren > GetMaxSnapshotSubxidCount()) - appendStringInfoString(&buf, "sof:1\n"); - else - { - appendStringInfoString(&buf, "sof:0\n"); - appendStringInfo(&buf, "sxcnt:%d\n", snapshot->subxcnt + nchildren); - for (i = 0; i < snapshot->subxcnt; i++) - appendStringInfo(&buf, "sxp:%u\n", snapshot->subxip[i]); - for (i = 0; i < nchildren; i++) - appendStringInfo(&buf, "sxp:%u\n", children[i]); - } appendStringInfo(&buf, "rec:%u\n", snapshot->takenDuringRecovery); + appendStringInfo(&buf, "snapshotcsn:%X/%X\n", + (uint32) (snapshot->snapshotcsn >> 32), + (uint32) snapshot->snapshotcsn); /* * Now write the text representation into a file. We first write to a @@ -1342,85 +1237,6 @@ pg_export_snapshot(PG_FUNCTION_ARGS) /* - * Parsing subroutines for ImportSnapshot: parse a line with the given - * prefix followed by a value, and advance *s to the next line. The - * filename is provided for use in error messages. - */ -static int -parseIntFromText(const char *prefix, char **s, const char *filename) -{ - char *ptr = *s; - int prefixlen = strlen(prefix); - int val; - - if (strncmp(ptr, prefix, prefixlen) != 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - ptr += prefixlen; - if (sscanf(ptr, "%d", &val) != 1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - ptr = strchr(ptr, '\n'); - if (!ptr) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - *s = ptr + 1; - return val; -} - -static TransactionId -parseXidFromText(const char *prefix, char **s, const char *filename) -{ - char *ptr = *s; - int prefixlen = strlen(prefix); - TransactionId val; - - if (strncmp(ptr, prefix, prefixlen) != 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - ptr += prefixlen; - if (sscanf(ptr, "%u", &val) != 1) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - ptr = strchr(ptr, '\n'); - if (!ptr) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - *s = ptr + 1; - return val; -} - -static void -parseVxidFromText(const char *prefix, char **s, const char *filename, - VirtualTransactionId *vxid) -{ - char *ptr = *s; - int prefixlen = strlen(prefix); - - if (strncmp(ptr, prefix, prefixlen) != 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - ptr += prefixlen; - if (sscanf(ptr, "%d/%u", &vxid->backendId, &vxid->localTransactionId) != 2) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - ptr = strchr(ptr, '\n'); - if (!ptr) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", filename))); - *s = ptr + 1; -} - -/* * ImportSnapshot * Import a previously exported snapshot. The argument should be a * filename in SNAPSHOT_EXPORT_DIR. Load the snapshot from that file. @@ -1429,170 +1245,7 @@ parseVxidFromText(const char *prefix, char **s, const char *filename, void ImportSnapshot(const char *idstr) { - char path[MAXPGPATH]; - FILE *f; - struct stat stat_buf; - char *filebuf; - int xcnt; - int i; - VirtualTransactionId src_vxid; - int src_pid; - Oid src_dbid; - int src_isolevel; - bool src_readonly; - SnapshotData snapshot; - - /* - * Must be at top level of a fresh transaction. Note in particular that - * we check we haven't acquired an XID --- if we have, it's conceivable - * that the snapshot would show it as not running, making for very screwy - * behavior. - */ - if (FirstSnapshotSet || - GetTopTransactionIdIfAny() != InvalidTransactionId || - IsSubTransaction()) - ereport(ERROR, - (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION), - errmsg("SET TRANSACTION SNAPSHOT must be called before any query"))); - - /* - * If we are in read committed mode then the next query would execute with - * a new snapshot thus making this function call quite useless. - */ - if (!IsolationUsesXactSnapshot()) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("a snapshot-importing transaction must have isolation level SERIALIZABLE or REPEATABLE READ"))); - - /* - * Verify the identifier: only 0-9, A-F and hyphens are allowed. We do - * this mainly to prevent reading arbitrary files. - */ - if (strspn(idstr, "0123456789ABCDEF-") != strlen(idstr)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid snapshot identifier: \"%s\"", idstr))); - - /* OK, read the file */ - snprintf(path, MAXPGPATH, SNAPSHOT_EXPORT_DIR "/%s", idstr); - - f = AllocateFile(path, PG_BINARY_R); - if (!f) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid snapshot identifier: \"%s\"", idstr))); - - /* get the size of the file so that we know how much memory we need */ - if (fstat(fileno(f), &stat_buf)) - elog(ERROR, "could not stat file \"%s\": %m", path); - - /* and read the file into a palloc'd string */ - filebuf = (char *) palloc(stat_buf.st_size + 1); - if (fread(filebuf, stat_buf.st_size, 1, f) != 1) - elog(ERROR, "could not read file \"%s\": %m", path); - - filebuf[stat_buf.st_size] = '\0'; - - FreeFile(f); - - /* - * Construct a snapshot struct by parsing the file content. - */ - memset(&snapshot, 0, sizeof(snapshot)); - - parseVxidFromText("vxid:", &filebuf, path, &src_vxid); - src_pid = parseIntFromText("pid:", &filebuf, path); - /* we abuse parseXidFromText a bit here ... */ - src_dbid = parseXidFromText("dbid:", &filebuf, path); - src_isolevel = parseIntFromText("iso:", &filebuf, path); - src_readonly = parseIntFromText("ro:", &filebuf, path); - - snapshot.xmin = parseXidFromText("xmin:", &filebuf, path); - snapshot.xmax = parseXidFromText("xmax:", &filebuf, path); - - snapshot.xcnt = xcnt = parseIntFromText("xcnt:", &filebuf, path); - - /* sanity-check the xid count before palloc */ - if (xcnt < 0 || xcnt > GetMaxSnapshotXidCount()) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", path))); - - snapshot.xip = (TransactionId *) palloc(xcnt * sizeof(TransactionId)); - for (i = 0; i < xcnt; i++) - snapshot.xip[i] = parseXidFromText("xip:", &filebuf, path); - - snapshot.suboverflowed = parseIntFromText("sof:", &filebuf, path); - - if (!snapshot.suboverflowed) - { - snapshot.subxcnt = xcnt = parseIntFromText("sxcnt:", &filebuf, path); - - /* sanity-check the xid count before palloc */ - if (xcnt < 0 || xcnt > GetMaxSnapshotSubxidCount()) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", path))); - - snapshot.subxip = (TransactionId *) palloc(xcnt * sizeof(TransactionId)); - for (i = 0; i < xcnt; i++) - snapshot.subxip[i] = parseXidFromText("sxp:", &filebuf, path); - } - else - { - snapshot.subxcnt = 0; - snapshot.subxip = NULL; - } - - snapshot.takenDuringRecovery = parseIntFromText("rec:", &filebuf, path); - - /* - * Do some additional sanity checking, just to protect ourselves. We - * don't trouble to check the array elements, just the most critical - * fields. - */ - if (!VirtualTransactionIdIsValid(src_vxid) || - !OidIsValid(src_dbid) || - !TransactionIdIsNormal(snapshot.xmin) || - !TransactionIdIsNormal(snapshot.xmax)) - ereport(ERROR, - (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), - errmsg("invalid snapshot data in file \"%s\"", path))); - - /* - * If we're serializable, the source transaction must be too, otherwise - * predicate.c has problems (SxactGlobalXmin could go backwards). Also, a - * non-read-only transaction can't adopt a snapshot from a read-only - * transaction, as predicate.c handles the cases very differently. - */ - if (IsolationIsSerializable()) - { - if (src_isolevel != XACT_SERIALIZABLE) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("a serializable transaction cannot import a snapshot from a non-serializable transaction"))); - if (src_readonly && !XactReadOnly) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("a non-read-only serializable transaction cannot import a snapshot from a read-only transaction"))); - } - - /* - * We cannot import a snapshot that was taken in a different database, - * because vacuum calculates OldestXmin on a per-database basis; so the - * source transaction's xmin doesn't protect us from data loss. This - * restriction could be removed if the source transaction were to mark its - * xmin as being globally applicable. But that would require some - * additional syntax, since that has to be known when the snapshot is - * initially taken. (See pgsql-hackers discussion of 2011-10-21.) - */ - if (src_dbid != MyDatabaseId) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("cannot import a snapshot from a different database"))); - - /* OK, install the snapshot */ - SetTransactionSnapshot(&snapshot, &src_vxid, src_pid, NULL); + Assert(false); } /* @@ -1839,7 +1492,6 @@ TransactionIdLimitedForOldSnapshots(TransactionId recentXmin, if (NormalTransactionIdFollows(xlimit, recentXmin)) return xlimit; } - return recentXmin; } @@ -2050,13 +1702,7 @@ EstimateSnapshotSpace(Snapshot snap) Assert(snap != InvalidSnapshot); Assert(snap->satisfies == HeapTupleSatisfiesMVCC); - /* We allocate any XID arrays needed in the same palloc block. */ - size = add_size(sizeof(SerializedSnapshotData), - mul_size(snap->xcnt, sizeof(TransactionId))); - if (snap->subxcnt > 0 && - (!snap->suboverflowed || snap->takenDuringRecovery)) - size = add_size(size, - mul_size(snap->subxcnt, sizeof(TransactionId))); + size = sizeof(SerializedSnapshotData); return size; } @@ -2071,51 +1717,20 @@ SerializeSnapshot(Snapshot snapshot, char *start_address) { SerializedSnapshotData serialized_snapshot; - Assert(snapshot->subxcnt >= 0); - /* Copy all required fields */ serialized_snapshot.xmin = snapshot->xmin; serialized_snapshot.xmax = snapshot->xmax; - serialized_snapshot.xcnt = snapshot->xcnt; - serialized_snapshot.subxcnt = snapshot->subxcnt; - serialized_snapshot.suboverflowed = snapshot->suboverflowed; serialized_snapshot.takenDuringRecovery = snapshot->takenDuringRecovery; serialized_snapshot.curcid = snapshot->curcid; serialized_snapshot.whenTaken = snapshot->whenTaken; serialized_snapshot.lsn = snapshot->lsn; - /* - * Ignore the SubXID array if it has overflowed, unless the snapshot was - * taken during recovery - in that case, top-level XIDs are in subxip as - * well, and we mustn't lose them. - */ - if (serialized_snapshot.suboverflowed && !snapshot->takenDuringRecovery) - serialized_snapshot.subxcnt = 0; + serialized_snapshot.snapshotcsn = snapshot->snapshotcsn; /* Copy struct to possibly-unaligned buffer */ memcpy(start_address, &serialized_snapshot, sizeof(SerializedSnapshotData)); - /* Copy XID array */ - if (snapshot->xcnt > 0) - memcpy((TransactionId *) (start_address + - sizeof(SerializedSnapshotData)), - snapshot->xip, snapshot->xcnt * sizeof(TransactionId)); - - /* - * Copy SubXID array. Don't bother to copy it if it had overflowed, - * though, because it's not used anywhere in that case. Except if it's a - * snapshot taken during recovery; all the top-level XIDs are in subxip as - * well in that case, so we mustn't lose them. - */ - if (serialized_snapshot.subxcnt > 0) - { - Size subxipoff = sizeof(SerializedSnapshotData) + - snapshot->xcnt * sizeof(TransactionId); - - memcpy((TransactionId *) (start_address + subxipoff), - snapshot->subxip, snapshot->subxcnt * sizeof(TransactionId)); - } } /* @@ -2129,52 +1744,21 @@ Snapshot RestoreSnapshot(char *start_address) { SerializedSnapshotData serialized_snapshot; - Size size; Snapshot snapshot; - TransactionId *serialized_xids; memcpy(&serialized_snapshot, start_address, sizeof(SerializedSnapshotData)); - serialized_xids = (TransactionId *) - (start_address + sizeof(SerializedSnapshotData)); - - /* We allocate any XID arrays needed in the same palloc block. */ - size = sizeof(SnapshotData) - + serialized_snapshot.xcnt * sizeof(TransactionId) - + serialized_snapshot.subxcnt * sizeof(TransactionId); /* Copy all required fields */ - snapshot = (Snapshot) MemoryContextAlloc(TopTransactionContext, size); + snapshot = (Snapshot) MemoryContextAlloc(TopTransactionContext, sizeof(SnapshotData)); snapshot->satisfies = HeapTupleSatisfiesMVCC; snapshot->xmin = serialized_snapshot.xmin; snapshot->xmax = serialized_snapshot.xmax; - snapshot->xip = NULL; - snapshot->xcnt = serialized_snapshot.xcnt; - snapshot->subxip = NULL; - snapshot->subxcnt = serialized_snapshot.subxcnt; - snapshot->suboverflowed = serialized_snapshot.suboverflowed; + snapshot->snapshotcsn = serialized_snapshot.snapshotcsn; snapshot->takenDuringRecovery = serialized_snapshot.takenDuringRecovery; snapshot->curcid = serialized_snapshot.curcid; snapshot->whenTaken = serialized_snapshot.whenTaken; snapshot->lsn = serialized_snapshot.lsn; - - /* Copy XIDs, if present. */ - if (serialized_snapshot.xcnt > 0) - { - snapshot->xip = (TransactionId *) (snapshot + 1); - memcpy(snapshot->xip, serialized_xids, - serialized_snapshot.xcnt * sizeof(TransactionId)); - } - - /* Copy SubXIDs, if present. */ - if (serialized_snapshot.subxcnt > 0) - { - snapshot->subxip = ((TransactionId *) (snapshot + 1)) + - serialized_snapshot.xcnt; - memcpy(snapshot->subxip, serialized_xids + serialized_snapshot.xcnt, - serialized_snapshot.subxcnt * sizeof(TransactionId)); - } - /* Set the copied flag so that the caller will set refcounts correctly. */ snapshot->regd_count = 0; snapshot->active_count = 0; diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index a821e2eed1..3c3a8cc6ad 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -10,28 +10,6 @@ * the passed-in buffer. The caller must hold not only a pin, but at least * shared buffer content lock on the buffer containing the tuple. * - * NOTE: When using a non-MVCC snapshot, we must check - * TransactionIdIsInProgress (which looks in the PGXACT array) - * before TransactionIdDidCommit/TransactionIdDidAbort (which look in - * pg_xact). Otherwise we have a race condition: we might decide that a - * just-committed transaction crashed, because none of the tests succeed. - * xact.c is careful to record commit/abort in pg_xact before it unsets - * MyPgXact->xid in the PGXACT array. That fixes that problem, but it - * also means there is a window where TransactionIdIsInProgress and - * TransactionIdDidCommit will both return true. If we check only - * TransactionIdDidCommit, we could consider a tuple committed when a - * later GetSnapshotData call will still think the originating transaction - * is in progress, which leads to application-level inconsistency. The - * upshot is that we gotta check TransactionIdIsInProgress first in all - * code paths, except for a few cases where we are looking at - * subtransactions of our own main transaction and so there can't be any - * race condition. - * - * When using an MVCC snapshot, we rely on XidInMVCCSnapshot rather than - * TransactionIdIsInProgress, but the logic is otherwise the same: do not - * check pg_xact until after deciding that the xact is no longer in progress. - * - * * Summary of visibility functions: * * HeapTupleSatisfiesMVCC() @@ -66,7 +44,6 @@ #include "access/htup_details.h" #include "access/multixact.h" #include "access/subtrans.h" -#include "access/transam.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/bufmgr.h" @@ -81,6 +58,9 @@ SnapshotData SnapshotSelfData = {HeapTupleSatisfiesSelf}; SnapshotData SnapshotAnyData = {HeapTupleSatisfiesAny}; +/* local functions */ +static bool CommittedXidVisibleInSnapshot(TransactionId xid, Snapshot snapshot); +static bool IsMovedTupleVisible(HeapTuple htup, Buffer buffer); /* * SetHintBits() @@ -120,7 +100,7 @@ SetHintBits(HeapTupleHeader tuple, Buffer buffer, if (TransactionIdIsValid(xid)) { /* NB: xid must be known committed here! */ - XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); + XLogRecPtr commitLSN = TransactionIdGetCommitLSN(xid); if (BufferIsPermanent(buffer) && XLogNeedsFlush(commitLSN) && BufferGetLSNAtomic(buffer) < commitLSN) @@ -176,6 +156,8 @@ bool HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) { HeapTupleHeader tuple = htup->t_data; + bool visible; + TransactionIdStatus hintstatus; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); @@ -186,45 +168,10 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + if (tuple->t_infomask & HEAP_MOVED) + return IsMovedTupleVisible(htup, buffer); - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; @@ -258,17 +205,18 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) - return false; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); else { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; + visible = XidVisibleInSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot, &hintstatus); + + if (hintstatus == XID_COMMITTED) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + if (hintstatus == XID_ABORTED) + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + if (!visible) + return false; } } @@ -298,12 +246,13 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) if (TransactionIdIsCurrentTransactionId(xmax)) return false; - if (TransactionIdIsInProgress(xmax)) + + visible = XidVisibleInSnapshot(xmax, snapshot, &hintstatus); + if (!visible) + { + /* it must have aborted or crashed */ return true; - if (TransactionIdDidCommit(xmax)) - return false; - /* it must have aborted or crashed */ - return true; + } } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) @@ -313,16 +262,15 @@ HeapTupleSatisfiesSelf(HeapTuple htup, Snapshot snapshot, Buffer buffer) return false; } - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) - return true; - - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + visible = XidVisibleInSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot, &hintstatus); + if (hintstatus == XID_ABORTED) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); - return true; } + if (!visible) + return true; /* xmax transaction committed */ @@ -377,51 +325,15 @@ HeapTupleSatisfiesToast(HeapTuple htup, Snapshot snapshot, return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } + if (tuple->t_infomask & HEAP_MOVED) + return IsMovedTupleVisible(htup, buffer); /* * An invalid Xmin can be left behind by a speculative insertion that * is canceled by super-deleting the tuple. This also applies to * TOAST tuples created during speculative insertion. */ - else if (!TransactionIdIsValid(HeapTupleHeaderGetXmin(tuple))) + if (!TransactionIdIsValid(HeapTupleHeaderGetXmin(tuple))) return false; } @@ -461,6 +373,7 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, Buffer buffer) { HeapTupleHeader tuple = htup->t_data; + TransactionIdStatus xidstatus; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); @@ -471,45 +384,15 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, return HeapTupleInvisible; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) + if (tuple->t_infomask & HEAP_MOVED) { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) + if (IsMovedTupleVisible(htup, buffer)) + return HeapTupleMayBeUpdated; + else return HeapTupleInvisible; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HeapTupleInvisible; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return HeapTupleInvisible; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HeapTupleInvisible; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (HeapTupleHeaderGetCmin(tuple) >= curcid) return HeapTupleInvisible; /* inserted after scan started */ @@ -543,9 +426,11 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, * left in this Xmax; otherwise, report the tuple as * locked/updated. */ - if (!TransactionIdIsInProgress(xmax)) + xidstatus = TransactionIdGetStatus(xmax); + if (xidstatus != XID_INPROGRESS) return HeapTupleMayBeUpdated; - return HeapTupleBeingUpdated; + else + return HeapTupleBeingUpdated; } if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) @@ -589,17 +474,21 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, else return HeapTupleInvisible; /* updated before scan started */ } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) - return HeapTupleInvisible; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); else { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HeapTupleInvisible; + xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmin(tuple)); + if (xidstatus == XID_COMMITTED) + { + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetXmin(tuple)); + } + else + { + if (xidstatus == XID_ABORTED) + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return HeapTupleInvisible; + } } } @@ -649,17 +538,21 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, return HeapTupleInvisible; /* updated before scan started */ } - if (MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) - return HeapTupleBeingUpdated; - - if (TransactionIdDidCommit(xmax)) - return HeapTupleUpdated; + xidstatus = TransactionIdGetStatus(xmax); + switch (xidstatus) + { + case XID_INPROGRESS: + return HeapTupleBeingUpdated; + case XID_COMMITTED: + return HeapTupleUpdated; + case XID_ABORTED: + break; + } /* * By here, the update in the Xmax is either aborted or crashed, but * what about the other members? */ - if (!MultiXactIdIsRunning(HeapTupleHeaderGetRawXmax(tuple), false)) { /* @@ -687,15 +580,18 @@ HeapTupleSatisfiesUpdate(HeapTuple htup, CommandId curcid, return HeapTupleInvisible; /* updated before scan started */ } - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) - return HeapTupleBeingUpdated; - - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmax(tuple)); + switch (xidstatus) { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return HeapTupleMayBeUpdated; + case XID_INPROGRESS: + return HeapTupleBeingUpdated; + case XID_ABORTED: + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return HeapTupleMayBeUpdated; + case XID_COMMITTED: + break; } /* xmax transaction committed */ @@ -740,6 +636,7 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, Buffer buffer) { HeapTupleHeader tuple = htup->t_data; + TransactionIdStatus xidstatus; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); @@ -753,45 +650,10 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + if (tuple->t_infomask & HEAP_MOVED) + return IsMovedTupleVisible(htup, buffer); - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!TransactionIdIsInProgress(xvac)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (TransactionIdIsInProgress(xvac)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return true; @@ -825,35 +687,39 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, return false; } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + else { - /* - * Return the speculative token to caller. Caller can worry about - * xmax, since it requires a conclusively locked row version, and - * a concurrent update to this tuple is a conflict of its - * purposes. - */ - if (HeapTupleHeaderIsSpeculative(tuple)) + xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmin(tuple)); + switch (xidstatus) { - snapshot->speculativeToken = - HeapTupleHeaderGetSpeculativeToken(tuple); - - Assert(snapshot->speculativeToken != 0); + case XID_INPROGRESS: + /* + * Return the speculative token to caller. Caller can worry about + * xmax, since it requires a conclusively locked row version, and + * a concurrent update to this tuple is a conflict of its + * purposes. + */ + if (HeapTupleHeaderIsSpeculative(tuple)) + { + snapshot->speculativeToken = + HeapTupleHeaderGetSpeculativeToken(tuple); + + Assert(snapshot->speculativeToken != 0); + } + + snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple); + /* XXX shouldn't we fall through to look at xmax? */ + return true; /* in insertion by other */ + case XID_COMMITTED: + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + break; + case XID_ABORTED: + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; } - - snapshot->xmin = HeapTupleHeaderGetRawXmin(tuple); - /* XXX shouldn't we fall through to look at xmax? */ - return true; /* in insertion by other */ - } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); - else - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; } } @@ -883,15 +749,19 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, if (TransactionIdIsCurrentTransactionId(xmax)) return false; - if (TransactionIdIsInProgress(xmax)) + + xidstatus = TransactionIdGetStatus(xmax); + switch (xidstatus) { - snapshot->xmax = xmax; - return true; + case XID_INPROGRESS: + snapshot->xmax = xmax; + return true; + case XID_COMMITTED: + return false; + case XID_ABORTED: + /* it must have aborted or crashed */ + return true; } - if (TransactionIdDidCommit(xmax)) - return false; - /* it must have aborted or crashed */ - return true; } if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmax(tuple))) @@ -901,19 +771,20 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, return false; } - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmax(tuple)); + switch (xidstatus) { - if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) - snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple); - return true; - } - - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) - { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, - InvalidTransactionId); - return true; + case XID_INPROGRESS: + if (!HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + snapshot->xmax = HeapTupleHeaderGetRawXmax(tuple); + return true; + case XID_ABORTED: + /* it must have aborted or crashed */ + SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return true; + case XID_COMMITTED: + break; } /* xmax transaction committed */ @@ -942,28 +813,14 @@ HeapTupleSatisfiesDirty(HeapTuple htup, Snapshot snapshot, * transactions shown as in-progress by the snapshot * transactions started after the snapshot was taken * changes made by the current command - * - * Notice that here, we will not update the tuple status hint bits if the - * inserting/deleting transaction is still running according to our snapshot, - * even if in reality it's committed or aborted by now. This is intentional. - * Checking the true transaction state would require access to high-traffic - * shared data structures, creating contention we'd rather do without, and it - * would not change the result of our visibility check anyway. The hint bits - * will be updated by the first visitor that has a snapshot new enough to see - * the inserting/deleting transaction as done. In the meantime, the cost of - * leaving the hint bits unset is basically that each HeapTupleSatisfiesMVCC - * call will need to run TransactionIdIsCurrentTransactionId in addition to - * XidInMVCCSnapshot (but it would have to do the latter anyway). In the old - * coding where we tried to set the hint bits as soon as possible, we instead - * did TransactionIdIsInProgress in each call --- to no avail, as long as the - * inserting/deleting transaction was still running --- which was more cycles - * and more contention on the PGXACT array. */ bool HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, Buffer buffer) { HeapTupleHeader tuple = htup->t_data; + bool visible; + TransactionIdStatus hintstatus; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); @@ -974,45 +831,10 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, return false; /* Used by pre-9.0 binary upgrades */ - if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + if (tuple->t_infomask & HEAP_MOVED) + return IsMovedTupleVisible(htup, buffer); - if (TransactionIdIsCurrentTransactionId(xvac)) - return false; - if (!XidInMVCCSnapshot(xvac, snapshot)) - { - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } - } - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (!TransactionIdIsCurrentTransactionId(xvac)) - { - if (XidInMVCCSnapshot(xvac, snapshot)) - return false; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; - } - } - } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) return false; /* inserted after scan started */ @@ -1054,25 +876,29 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, else return false; /* deleted before scan started */ } - else if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) - return false; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - HeapTupleHeaderGetRawXmin(tuple)); else { - /* it must have aborted or crashed */ - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return false; + visible = XidVisibleInSnapshot(HeapTupleHeaderGetXmin(tuple), + snapshot, &hintstatus); + if (hintstatus == XID_COMMITTED) + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + if (hintstatus == XID_ABORTED) + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + if (!visible) + return false; } } else { /* xmin is committed, but maybe not according to our snapshot */ - if (!HeapTupleHeaderXminFrozen(tuple) && - XidInMVCCSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot)) - return false; /* treat as still in progress */ + if (!HeapTupleHeaderXminFrozen(tuple)) + { + visible = CommittedXidVisibleInSnapshot(HeapTupleHeaderGetRawXmin(tuple), snapshot); + if (!visible) + return false; /* treat as still in progress */ + } } /* by here, the inserting transaction has committed */ @@ -1102,12 +928,15 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, else return false; /* deleted before scan started */ } - if (XidInMVCCSnapshot(xmax, snapshot)) - return true; - if (TransactionIdDidCommit(xmax)) + + visible = XidVisibleInSnapshot(xmax, snapshot, &hintstatus); + if (visible) return false; /* updating transaction committed */ - /* it must have aborted or crashed */ - return true; + else + { + /* it must have aborted or crashed */ + return true; + } } if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) @@ -1120,25 +949,28 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, return false; /* deleted before scan started */ } - if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) - return true; - - if (!TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + visible = XidVisibleInSnapshot(HeapTupleHeaderGetRawXmax(tuple), + snapshot, &hintstatus); + if (hintstatus == XID_COMMITTED) + { + /* xmax transaction committed */ + SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + } + if (hintstatus == XID_ABORTED) { /* it must have aborted or crashed */ SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); - return true; } - - /* xmax transaction committed */ - SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, - HeapTupleHeaderGetRawXmax(tuple)); + if (!visible) + return true; /* treat as still in progress */ } else { /* xmax is committed, but maybe not according to our snapshot */ - if (XidInMVCCSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot)) + visible = CommittedXidVisibleInSnapshot(HeapTupleHeaderGetRawXmax(tuple), snapshot); + if (!visible) return true; /* treat as still in progress */ } @@ -1147,7 +979,6 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, return false; } - /* * HeapTupleSatisfiesVacuum * @@ -1155,16 +986,22 @@ HeapTupleSatisfiesMVCC(HeapTuple htup, Snapshot snapshot, * we mainly want to know is if a tuple is potentially visible to *any* * running transaction. If so, it can't be removed yet by VACUUM. * - * OldestXmin is a cutoff XID (obtained from GetOldestXmin()). Tuples - * deleted by XIDs >= OldestXmin are deemed "recently dead"; they might - * still be visible to some open transaction, so we can't remove them, - * even if we see that the deleting transaction has committed. + * OldestSnapshot is a cutoff snapshot (obtained from GetOldestSnapshot()). + * Tuples deleted by XIDs that are still visible to OldestSnapshot are deemed + * "recently dead"; they might still be visible to some open transaction, + * so we can't remove them, even if we see that the deleting transaction + * has committed. + * + * Note: predicate.c calls this with a current snapshot, rather than one obtained + * from GetOldestSnapshot(). So even if this function determines that a tuple + * is not visible to anyone anymore, we can't "kill" the tuple right here. */ HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer) { HeapTupleHeader tuple = htup->t_data; + TransactionIdStatus xidstatus; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); @@ -1179,44 +1016,17 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, { if (HeapTupleHeaderXminInvalid(tuple)) return HEAPTUPLE_DEAD; - /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_OFF) - { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - if (TransactionIdIsCurrentTransactionId(xvac)) - return HEAPTUPLE_DELETE_IN_PROGRESS; - if (TransactionIdIsInProgress(xvac)) - return HEAPTUPLE_DELETE_IN_PROGRESS; - if (TransactionIdDidCommit(xvac)) - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); - return HEAPTUPLE_DEAD; - } - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); - } /* Used by pre-9.0 binary upgrades */ - else if (tuple->t_infomask & HEAP_MOVED_IN) + if (tuple->t_infomask & HEAP_MOVED) { - TransactionId xvac = HeapTupleHeaderGetXvac(tuple); - - if (TransactionIdIsCurrentTransactionId(xvac)) - return HEAPTUPLE_INSERT_IN_PROGRESS; - if (TransactionIdIsInProgress(xvac)) - return HEAPTUPLE_INSERT_IN_PROGRESS; - if (TransactionIdDidCommit(xvac)) - SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, - InvalidTransactionId); + if (IsMovedTupleVisible(htup, buffer)) + return HEAPTUPLE_LIVE; else - { - SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, - InvalidTransactionId); return HEAPTUPLE_DEAD; - } } - else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) + + if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) { if (tuple->t_infomask & HEAP_XMAX_INVALID) /* xid invalid */ return HEAPTUPLE_INSERT_IN_PROGRESS; @@ -1230,7 +1040,10 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, /* deleting subtransaction must have aborted */ return HEAPTUPLE_INSERT_IN_PROGRESS; } - else if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmin(tuple))) + + xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmin(tuple)); + + if (xidstatus == XID_INPROGRESS) { /* * It'd be possible to discern between INSERT/DELETE in progress @@ -1242,7 +1055,7 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, */ return HEAPTUPLE_INSERT_IN_PROGRESS; } - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) + else if (xidstatus == XID_COMMITTED) SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, HeapTupleHeaderGetRawXmin(tuple)); else @@ -1293,7 +1106,8 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, } else { - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmax(tuple)); + if (xidstatus == XID_INPROGRESS) return HEAPTUPLE_LIVE; SetHintBits(tuple, buffer, HEAP_XMAX_INVALID, InvalidTransactionId); @@ -1323,13 +1137,17 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); - if (TransactionIdIsInProgress(xmax)) - return HEAPTUPLE_DELETE_IN_PROGRESS; - else if (TransactionIdDidCommit(xmax)) - /* there are still lockers around -- can't return DEAD here */ - return HEAPTUPLE_RECENTLY_DEAD; - /* updating transaction aborted */ - return HEAPTUPLE_LIVE; + switch(TransactionIdGetStatus(xmax)) + { + case XID_INPROGRESS: + return HEAPTUPLE_DELETE_IN_PROGRESS; + case XID_COMMITTED: + /* there are still lockers around -- can't return DEAD here */ + return HEAPTUPLE_RECENTLY_DEAD; + case XID_ABORTED: + /* updating transaction aborted */ + return HEAPTUPLE_LIVE; + } } Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED)); @@ -1339,8 +1157,12 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, /* not LOCKED_ONLY, so it has to have an xmax */ Assert(TransactionIdIsValid(xmax)); - /* multi is not running -- updating xact cannot be */ - Assert(!TransactionIdIsInProgress(xmax)); + /* + * multi is not running -- updating xact cannot be (this assertion + * won't catch a running subtransaction) + */ + Assert(!TransactionIdIsActive(xmax)); + if (TransactionIdDidCommit(xmax)) { if (!TransactionIdPrecedes(xmax, OldestXmin)) @@ -1359,9 +1181,11 @@ HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) { - if (TransactionIdIsInProgress(HeapTupleHeaderGetRawXmax(tuple))) + xidstatus = TransactionIdGetStatus(HeapTupleHeaderGetRawXmax(tuple)); + + if (xidstatus == XID_INPROGRESS) return HEAPTUPLE_DELETE_IN_PROGRESS; - else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmax(tuple))) + else if (xidstatus == XID_COMMITTED) SetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, HeapTupleHeaderGetRawXmax(tuple)); else @@ -1471,127 +1295,95 @@ HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin) } /* - * XidInMVCCSnapshot - * Is the given XID still-in-progress according to the snapshot? + * XidVisibleInSnapshot + * Is the given XID visible according to the snapshot? * - * Note: GetSnapshotData never stores either top xid or subxids of our own - * backend into a snapshot, so these xids will not be reported as "running" - * by this function. This is OK for current uses, because we always check - * TransactionIdIsCurrentTransactionId first, except when it's known the - * XID could not be ours anyway. + * On return, *hintstatus is set to indicate if the transaction had committed, + * or aborted, whether or not it's not visible to us. */ bool -XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot) +XidVisibleInSnapshot(TransactionId xid, Snapshot snapshot, + TransactionIdStatus *hintstatus) { - uint32 i; + CommitSeqNo csn; - /* - * Make a quick range check to eliminate most XIDs without looking at the - * xip arrays. Note that this is OK even if we convert a subxact XID to - * its parent below, because a subxact with XID < xmin has surely also got - * a parent with XID < xmin, while one with XID >= xmax must belong to a - * parent that was not yet committed at the time of this snapshot. - */ - - /* Any xid < xmin is not in-progress */ - if (TransactionIdPrecedes(xid, snapshot->xmin)) - return false; - /* Any xid >= xmax is in-progress */ - if (TransactionIdFollowsOrEquals(xid, snapshot->xmax)) - return true; + *hintstatus = XID_INPROGRESS; /* - * Snapshot information is stored slightly differently in snapshots taken - * during recovery. + * Any xid >= xmax is in-progress (or aborted, but we don't distinguish + * that here). + * + * We can't do anything useful with xmin, because the xmin only tells us + * whether we see it as completed. We have to check the transaction log to + * see if the transaction committed or aborted, in any case. */ - if (!snapshot->takenDuringRecovery) - { - /* - * If the snapshot contains full subxact data, the fastest way to - * check things is just to compare the given XID against both subxact - * XIDs and top-level XIDs. If the snapshot overflowed, we have to - * use pg_subtrans to convert a subxact XID to its parent XID, but - * then we need only look at top-level XIDs not subxacts. - */ - if (!snapshot->suboverflowed) - { - /* we have full data, so search subxip */ - int32 j; + if (TransactionIdFollowsOrEquals(xid, snapshot->xmax)) + return false; - for (j = 0; j < snapshot->subxcnt; j++) - { - if (TransactionIdEquals(xid, snapshot->subxip[j])) - return true; - } + csn = TransactionIdGetCommitSeqNo(xid); - /* not there, fall through to search xip[] */ - } + if (COMMITSEQNO_IS_COMMITTED(csn)) + { + *hintstatus = XID_COMMITTED; + if (csn < snapshot->snapshotcsn) + return true; else - { - /* - * Snapshot overflowed, so convert xid to top-level. This is safe - * because we eliminated too-old XIDs above. - */ - xid = SubTransGetTopmostTransaction(xid); - - /* - * If xid was indeed a subxact, we might now have an xid < xmin, - * so recheck to avoid an array scan. No point in rechecking - * xmax. - */ - if (TransactionIdPrecedes(xid, snapshot->xmin)) - return false; - } - - for (i = 0; i < snapshot->xcnt; i++) - { - if (TransactionIdEquals(xid, snapshot->xip[i])) - return true; - } + return false; } else { - int32 j; + if (csn == COMMITSEQNO_ABORTED) + *hintstatus = XID_ABORTED; + return false; + } +} - /* - * In recovery we store all xids in the subxact array because it is by - * far the bigger array, and we mostly don't know which xids are - * top-level and which are subxacts. The xip array is empty. - * - * We start by searching subtrans, if we overflowed. - */ - if (snapshot->suboverflowed) - { - /* - * Snapshot overflowed, so convert xid to top-level. This is safe - * because we eliminated too-old XIDs above. - */ - xid = SubTransGetTopmostTransaction(xid); +/* + * CommittedXidVisibleInSnapshot + * Is the given XID visible according to the snapshot? + * + * This is the same as XidVisibleInSnapshot, but the caller knows that the + * given XID committed. The only question is whether it's visible to our + * snapshot or not. + */ +static bool +CommittedXidVisibleInSnapshot(TransactionId xid, Snapshot snapshot) +{ + CommitSeqNo csn; - /* - * If xid was indeed a subxact, we might now have an xid < xmin, - * so recheck to avoid an array scan. No point in rechecking - * xmax. - */ - if (TransactionIdPrecedes(xid, snapshot->xmin)) - return false; - } + /* + * Make a quick range check to eliminate most XIDs without looking at the + * CSN log. + */ + if (TransactionIdPrecedes(xid, snapshot->xmin)) + return true; + + /* + * Any xid >= xmax is in-progress (or aborted, but we don't distinguish + * that here. + */ + if (TransactionIdFollowsOrEquals(xid, snapshot->xmax)) + return false; + csn = TransactionIdGetCommitSeqNo(xid); + + if (!COMMITSEQNO_IS_COMMITTED(csn)) + { + elog(WARNING, "transaction %u was hinted as committed, but was not marked as committed in the transaction log", xid); /* - * We now have either a top-level xid higher than xmin or an - * indeterminate xid. We don't know whether it's top level or subxact - * but it doesn't matter. If it's present, the xid is visible. + * We have contradicting evidence on whether the transaction committed or + * not. Let's assume that it did. That seems better than erroring out. */ - for (j = 0; j < snapshot->subxcnt; j++) - { - if (TransactionIdEquals(xid, snapshot->subxip[j])) - return true; - } + return true; } - return false; + if (csn < snapshot->snapshotcsn) + return true; + else + return false; } + /* * Is the tuple really only locked? That is, is it not updated? * @@ -1605,6 +1397,7 @@ bool HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) { TransactionId xmax; + TransactionIdStatus xidstatus; /* if there's no valid Xmax, then there's obviously no update either */ if (tuple->t_infomask & HEAP_XMAX_INVALID) @@ -1632,9 +1425,11 @@ HeapTupleHeaderIsOnlyLocked(HeapTupleHeader tuple) if (TransactionIdIsCurrentTransactionId(xmax)) return false; - if (TransactionIdIsInProgress(xmax)) + + xidstatus = TransactionIdGetStatus(xmax); + if (xidstatus == XID_INPROGRESS) return false; - if (TransactionIdDidCommit(xmax)) + if (xidstatus == XID_COMMITTED) return false; /* @@ -1675,6 +1470,7 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, HeapTupleHeader tuple = htup->t_data; TransactionId xmin = HeapTupleHeaderGetXmin(tuple); TransactionId xmax = HeapTupleHeaderGetRawXmax(tuple); + TransactionIdStatus hintstatus; Assert(ItemPointerIsValid(&htup->t_self)); Assert(htup->t_tableOid != InvalidOid); @@ -1686,7 +1482,7 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, return false; } /* check if it's one of our txids, toplevel is also in there */ - else if (TransactionIdInArray(xmin, snapshot->subxip, snapshot->subxcnt)) + else if (TransactionIdInArray(xmin, snapshot->this_xip, snapshot->this_xcnt)) { bool resolved; CommandId cmin = HeapTupleHeaderGetRawCommandId(tuple); @@ -1697,7 +1493,8 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, * cmin/cmax was stored in a combocid. So we need to lookup the actual * values externally. */ - resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), + snapshot, htup, buffer, &cmin, &cmax); @@ -1710,34 +1507,11 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, return false; /* inserted after scan started */ /* fall through */ } - /* committed before our xmin horizon. Do a normal visibility check. */ - else if (TransactionIdPrecedes(xmin, snapshot->xmin)) - { - Assert(!(HeapTupleHeaderXminCommitted(tuple) && - !TransactionIdDidCommit(xmin))); - - /* check for hint bit first, consult clog afterwards */ - if (!HeapTupleHeaderXminCommitted(tuple) && - !TransactionIdDidCommit(xmin)) - return false; - /* fall through */ - } - /* beyond our xmax horizon, i.e. invisible */ - else if (TransactionIdFollowsOrEquals(xmin, snapshot->xmax)) - { - return false; - } - /* check if it's a committed transaction in [xmin, xmax) */ - else if (TransactionIdInArray(xmin, snapshot->xip, snapshot->xcnt)) - { - /* fall through */ - } - /* - * none of the above, i.e. between [xmin, xmax) but hasn't committed. I.e. - * invisible. + * it's not "this" transaction. Do a normal visibility check using the + * snapshot. */ - else + else if (!XidVisibleInSnapshot(xmin, snapshot, &hintstatus)) { return false; } @@ -1761,14 +1535,15 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, } /* check if it's one of our txids, toplevel is also in there */ - if (TransactionIdInArray(xmax, snapshot->subxip, snapshot->subxcnt)) + if (TransactionIdInArray(xmax, snapshot->this_xip, snapshot->this_xcnt)) { bool resolved; CommandId cmin; CommandId cmax = HeapTupleHeaderGetRawCommandId(tuple); /* Lookup actual cmin/cmax values */ - resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), snapshot, + resolved = ResolveCminCmaxDuringDecoding(HistoricSnapshotGetTupleCids(), + snapshot, htup, buffer, &cmin, &cmax); @@ -1782,26 +1557,74 @@ HeapTupleSatisfiesHistoricMVCC(HeapTuple htup, Snapshot snapshot, else return false; /* deleted before scan started */ } - /* below xmin horizon, normal transaction state is valid */ - else if (TransactionIdPrecedes(xmax, snapshot->xmin)) - { - Assert(!(tuple->t_infomask & HEAP_XMAX_COMMITTED && - !TransactionIdDidCommit(xmax))); + /* + * it's not "this" transaction. Do a normal visibility check using the + * snapshot. + */ + if (XidVisibleInSnapshot(xmax, snapshot, &hintstatus)) + return false; + else + return true; +} - /* check hint bit first */ - if (tuple->t_infomask & HEAP_XMAX_COMMITTED) - return false; - /* check clog */ - return !TransactionIdDidCommit(xmax); +/* + * Check the visibility on a tuple with HEAP_MOVED flags set. + * + * Returns true if the tuple is visible, false otherwise. These flags are + * no longer used, any such tuples must've come from binary upgrade of a + * pre-9.0 system, so we can assume that the xid is long finished by now. + */ +static bool +IsMovedTupleVisible(HeapTuple htup, Buffer buffer) +{ + HeapTupleHeader tuple = htup->t_data; + TransactionId xvac = HeapTupleHeaderGetXvac(tuple); + TransactionIdStatus xidstatus; + + /* + * Check that the xvac is not a live transaction. This should never + * happen, because HEAP_MOVED flags are not set by current code. + */ + if (TransactionIdIsCurrentTransactionId(xvac)) + elog(ERROR, "HEAP_MOVED tuple with in-progress xvac: %u", xvac); + + xidstatus = TransactionIdGetStatus(xvac); + + if (tuple->t_infomask & HEAP_MOVED_OFF) + { + if (xidstatus == XID_COMMITTED) + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + return true; + } + } + /* Used by pre-9.0 binary upgrades */ + else if (tuple->t_infomask & HEAP_MOVED_IN) + { + if (xidstatus == XID_COMMITTED) + { + SetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + InvalidTransactionId); + return true; + } + else + { + SetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return false; + } } - /* above xmax horizon, we cannot possibly see the deleting transaction */ - else if (TransactionIdFollowsOrEquals(xmax, snapshot->xmax)) - return true; - /* xmax is between [xmin, xmax), check known committed array */ - else if (TransactionIdInArray(xmax, snapshot->xip, snapshot->xcnt)) - return false; - /* xmax is between [xmin, xmax), but known not to have committed yet */ else - return true; + { + elog(ERROR, "IsMovedTupleVisible() called on a non-moved tuple"); + return true; /* keep compiler quiet */ + } } diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index bb2bc065ef..f93fdc472d 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -201,12 +201,12 @@ static const char *backend_options = "--single -F -O -j -c search_path=pg_catalo static const char *const subdirs[] = { "global", "pg_wal/archive_status", + "pg_csnlog", "pg_commit_ts", "pg_dynshmem", "pg_notify", "pg_serial", "pg_snapshots", - "pg_subtrans", "pg_twophase", "pg_multixact", "pg_multixact/members", diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 7bae0902b5..0755ffd864 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -17,16 +17,19 @@ /* * Possible transaction statuses --- note that all-zeroes is the initial * state. - * - * A "subcommitted" transaction is a committed subtransaction whose parent - * hasn't committed or aborted yet. */ -typedef int XidStatus; +typedef int CLogXidStatus; + +#define CLOG_XID_STATUS_IN_PROGRESS 0x00 +#define CLOG_XID_STATUS_COMMITTED 0x01 +#define CLOG_XID_STATUS_ABORTED 0x02 -#define TRANSACTION_STATUS_IN_PROGRESS 0x00 -#define TRANSACTION_STATUS_COMMITTED 0x01 -#define TRANSACTION_STATUS_ABORTED 0x02 -#define TRANSACTION_STATUS_SUB_COMMITTED 0x03 +/* + * A "subcommitted" transaction is a committed subtransaction whose parent + * hasn't committed or aborted yet. We don't create these anymore, but accept + * them in existing clog, if we've been pg_upgraded from an older version. + */ +#define CLOG_XID_STATUS_SUB_COMMITTED 0x03 typedef struct xl_clog_truncate { @@ -35,9 +38,9 @@ typedef struct xl_clog_truncate Oid oldestXactDb; } xl_clog_truncate; -extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, - TransactionId *subxids, XidStatus status, XLogRecPtr lsn); -extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn); +extern void CLogSetTreeStatus(TransactionId xid, int nsubxids, + TransactionId *subxids, CLogXidStatus status, XLogRecPtr lsn); +extern CLogXidStatus CLogGetStatus(TransactionId xid, XLogRecPtr *lsn); extern Size CLOGShmemBuffers(void); extern Size CLOGShmemSize(void); diff --git a/src/include/access/csnlog.h b/src/include/access/csnlog.h new file mode 100644 index 0000000000..165effbee6 --- /dev/null +++ b/src/include/access/csnlog.h @@ -0,0 +1,33 @@ +/* + * csnlog.h + * + * Commit-Sequence-Number log. + * + * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/clog.h + */ +#ifndef CSNLOG_H +#define CSNLOG_H + +#include "access/xlog.h" + +extern void CSNLogSetCommitSeqNo(TransactionId xid, int nsubxids, + TransactionId *subxids, CommitSeqNo csn); +extern CommitSeqNo CSNLogGetCommitSeqNo(TransactionId xid); +extern TransactionId CSNLogGetNextActiveXid(TransactionId start, + TransactionId end); + +extern Size CSNLOGShmemBuffers(void); +extern Size CSNLOGShmemSize(void); +extern void CSNLOGShmemInit(void); +extern void BootStrapCSNLOG(void); +extern void StartupCSNLOG(TransactionId oldestActiveXID); +extern void TrimCSNLOG(void); +extern void ShutdownCSNLOG(void); +extern void CheckPointCSNLOG(void); +extern void ExtendCSNLOG(TransactionId newestXact); +extern void TruncateCSNLOG(TransactionId oldestXact); + +#endif /* CSNLOG_H */ diff --git a/src/include/access/mvccvars.h b/src/include/access/mvccvars.h new file mode 100644 index 0000000000..66de5a8ea6 --- /dev/null +++ b/src/include/access/mvccvars.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * mvccvars.h + * Shared memory variables for XID assignment and snapshots + * + * + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/mvccvars.h + * + *------------------------------------------------------------------------- + */ +#ifndef MVCCVARS_H +#define MVCCVARS_H + +#include "port/atomics.h" + +/* + * VariableCache is a data structure in shared memory that is used to track + * OID and XID assignment state. For largely historical reasons, there is + * just one struct with different fields that are protected by different + * LWLocks. + * + * Note: xidWrapLimit and oldestXidDB are not "active" values, but are + * used just to generate useful messages when xidWarnLimit or xidStopLimit + * are exceeded. + */ +typedef struct VariableCacheData +{ + /* + * These fields are protected by OidGenLock. + */ + Oid nextOid; /* next OID to assign */ + uint32 oidCount; /* OIDs available before must do XLOG work */ + + /* + * These fields are protected by XidGenLock. + */ + TransactionId nextXid; /* next XID to assign */ + + TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ + TransactionId xidVacLimit; /* start forcing autovacuums here */ + TransactionId xidWarnLimit; /* start complaining here */ + TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */ + TransactionId xidWrapLimit; /* where the world ends */ + Oid oldestXidDB; /* database with minimum datfrozenxid */ + + + /* + * Fields related to MVCC snapshots. + * + * lastCommitSeqNo is the CSN assigned to last committed transaction. + * It is protected by CommitSeqNoLock. + * + * latestCompletedXid is the highest XID that has committed. Anything + * > this is seen by still in-progress by everyone. Use atomic ops to + * update. + * + * oldestActiveXid is the XID of the oldest transaction that's still + * in-progress. (Or rather, the oldest XID among all still in-progress + * transactions; it's not necessarily the one that started first). + * Must hold ProcArrayLock in shared mode, and use atomic ops, to update. + */ + pg_atomic_uint64 nextCommitSeqNo; + pg_atomic_uint32 latestCompletedXid; + pg_atomic_uint32 oldestActiveXid; + + /* + * These fields are protected by CommitTsLock + */ + TransactionId oldestCommitTsXid; + TransactionId newestCommitTsXid; + + /* + * These fields are protected by CLogTruncationLock + */ + TransactionId oldestClogXid; /* oldest it's safe to look up in clog */ +} VariableCacheData; + +typedef VariableCacheData *VariableCache; + +/* in transam/varsup.c */ +extern PGDLLIMPORT VariableCache ShmemVariableCache; + +#endif /* MVCCVARS_H */ diff --git a/src/include/access/slru.h b/src/include/access/slru.h index 20114c4d44..1ae022771a 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -105,6 +105,8 @@ typedef struct SlruSharedData } SlruSharedData; typedef SlruSharedData *SlruShared; +typedef struct HTAB HTAB; +typedef struct PageSlotEntry PageSlotEntry; /* * SlruCtlData is an unshared structure that points to the active information @@ -113,6 +115,7 @@ typedef SlruSharedData *SlruShared; typedef struct SlruCtlData { SlruShared shared; + HTAB *pageToSlot; /* * This flag tells whether to fsync writes (true for pg_xact and multixact @@ -145,6 +148,8 @@ extern int SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok, TransactionId xid); extern int SimpleLruReadPage_ReadOnly(SlruCtl ctl, int pageno, TransactionId xid); +extern int SimpleLruReadPage_ReadOnly_Locked(SlruCtl ctl, int pageno, + TransactionId xid); extern void SimpleLruWritePage(SlruCtl ctl, int slotno); extern void SimpleLruFlush(SlruCtl ctl, bool allow_redirtied); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h index 41716d7b71..92267be465 100644 --- a/src/include/access/subtrans.h +++ b/src/include/access/subtrans.h @@ -11,20 +11,9 @@ #ifndef SUBTRANS_H #define SUBTRANS_H -/* Number of SLRU buffers to use for subtrans */ -#define NUM_SUBTRANS_BUFFERS 32 - +/* these are in csnlog.c now */ extern void SubTransSetParent(TransactionId xid, TransactionId parent); extern TransactionId SubTransGetParent(TransactionId xid); extern TransactionId SubTransGetTopmostTransaction(TransactionId xid); -extern Size SUBTRANSShmemSize(void); -extern void SUBTRANSShmemInit(void); -extern void BootStrapSUBTRANS(void); -extern void StartupSUBTRANS(TransactionId oldestActiveXID); -extern void ShutdownSUBTRANS(void); -extern void CheckPointSUBTRANS(void); -extern void ExtendSUBTRANS(TransactionId newestXact); -extern void TruncateSUBTRANS(TransactionId oldestXact); - -#endif /* SUBTRANS_H */ +#endif /* SUBTRANS_H */ diff --git a/src/include/access/transam.h b/src/include/access/transam.h index 86076dede1..7a3839ce19 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -93,57 +93,6 @@ #define FirstBootstrapObjectId 10000 #define FirstNormalObjectId 16384 -/* - * VariableCache is a data structure in shared memory that is used to track - * OID and XID assignment state. For largely historical reasons, there is - * just one struct with different fields that are protected by different - * LWLocks. - * - * Note: xidWrapLimit and oldestXidDB are not "active" values, but are - * used just to generate useful messages when xidWarnLimit or xidStopLimit - * are exceeded. - */ -typedef struct VariableCacheData -{ - /* - * These fields are protected by OidGenLock. - */ - Oid nextOid; /* next OID to assign */ - uint32 oidCount; /* OIDs available before must do XLOG work */ - - /* - * These fields are protected by XidGenLock. - */ - TransactionId nextXid; /* next XID to assign */ - - TransactionId oldestXid; /* cluster-wide minimum datfrozenxid */ - TransactionId xidVacLimit; /* start forcing autovacuums here */ - TransactionId xidWarnLimit; /* start complaining here */ - TransactionId xidStopLimit; /* refuse to advance nextXid beyond here */ - TransactionId xidWrapLimit; /* where the world ends */ - Oid oldestXidDB; /* database with minimum datfrozenxid */ - - /* - * These fields are protected by CommitTsLock - */ - TransactionId oldestCommitTsXid; - TransactionId newestCommitTsXid; - - /* - * These fields are protected by ProcArrayLock. - */ - TransactionId latestCompletedXid; /* newest XID that has committed or - * aborted */ - - /* - * These fields are protected by CLogTruncationLock - */ - TransactionId oldestClogXid; /* oldest it's safe to look up in clog */ - -} VariableCacheData; - -typedef VariableCacheData *VariableCache; - /* ---------------- * extern declarations @@ -153,15 +102,44 @@ typedef VariableCacheData *VariableCache; /* in transam/xact.c */ extern bool TransactionStartedDuringRecovery(void); -/* in transam/varsup.c */ -extern PGDLLIMPORT VariableCache ShmemVariableCache; - /* * prototypes for functions in transam/transam.c */ extern bool TransactionIdDidCommit(TransactionId transactionId); extern bool TransactionIdDidAbort(TransactionId transactionId); -extern bool TransactionIdIsKnownCompleted(TransactionId transactionId); + + +#define COMMITSEQNO_INPROGRESS UINT64CONST(0x0) +#define COMMITSEQNO_ABORTED UINT64CONST(0x1) +/* + * COMMITSEQNO_COMMITING is an intermediate state that is used to set CSN + * atomically for a top level transaction and its subtransactions. + * High-level users should not see this value, see TransactionIdGetCommitSeqNo(). + */ +#define COMMITSEQNO_COMMITTING UINT64CONST(0x2) +#define COMMITSEQNO_FROZEN UINT64CONST(0x3) +#define COMMITSEQNO_FIRST_NORMAL UINT64CONST(0x4) + +#define COMMITSEQNO_IS_INPROGRESS(csn) ((csn) == COMMITSEQNO_INPROGRESS) +#define COMMITSEQNO_IS_ABORTED(csn) ((csn) == COMMITSEQNO_ABORTED) +#define COMMITSEQNO_IS_FROZEN(csn) ((csn) == COMMITSEQNO_FROZEN) +#define COMMITSEQNO_IS_NORMAL(csn) ((csn) >= COMMITSEQNO_FIRST_NORMAL) +#define COMMITSEQNO_IS_COMMITTING(csn) ((csn) == COMMITSEQNO_COMMITTING) +#define COMMITSEQNO_IS_COMMITTED(csn) ((csn) >= COMMITSEQNO_FROZEN && !COMMITSEQNO_IS_SUBTRANS(csn)) + +#define CSN_SUBTRANS_BIT (UINT64CONST(1)<<63) + +#define COMMITSEQNO_IS_SUBTRANS(csn) ((csn) & CSN_SUBTRANS_BIT) + +typedef enum +{ + XID_COMMITTED, + XID_ABORTED, + XID_INPROGRESS +} TransactionIdStatus; + +extern CommitSeqNo TransactionIdGetCommitSeqNo(TransactionId xid); +extern TransactionIdStatus TransactionIdGetStatus(TransactionId transactionId); extern void TransactionIdAbort(TransactionId transactionId); extern void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids); extern void TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, XLogRecPtr lsn); diff --git a/src/include/access/xact.h b/src/include/access/xact.h index 118b0a8432..015cbe58b2 100644 --- a/src/include/access/xact.h +++ b/src/include/access/xact.h @@ -135,7 +135,7 @@ typedef void (*SubXactCallback) (SubXactEvent event, SubTransactionId mySubid, #define XLOG_XACT_ABORT 0x20 #define XLOG_XACT_COMMIT_PREPARED 0x30 #define XLOG_XACT_ABORT_PREPARED 0x40 -#define XLOG_XACT_ASSIGNMENT 0x50 +/* free opcode 0x50 */ /* free opcode 0x60 */ /* free opcode 0x70 */ @@ -334,7 +334,6 @@ extern TransactionId GetCurrentTransactionId(void); extern TransactionId GetCurrentTransactionIdIfAny(void); extern TransactionId GetStableLatestTransactionId(void); extern SubTransactionId GetCurrentSubTransactionId(void); -extern void MarkCurrentTransactionIdLoggedIfAny(void); extern bool SubTransactionIsActive(SubTransactionId subxid); extern CommandId GetCurrentCommandId(bool used); extern TimestampTz GetCurrentTransactionStartTimestamp(void); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 8fd6010ba0..676c12df36 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -52,11 +52,6 @@ extern bool InRecovery; * we haven't yet processed a RUNNING_XACTS or shutdown-checkpoint WAL record * to initialize our master-transaction tracking system. * - * When the transaction tracking is initialized, we enter the SNAPSHOT_PENDING - * state. The tracked information might still be incomplete, so we can't allow - * connections yet, but redo functions must update the in-memory state when - * appropriate. - * * In SNAPSHOT_READY mode, we have full knowledge of transactions that are * (or were) running in the master at the current WAL location. Snapshots * can be taken, and read-only queries can be run. @@ -65,13 +60,12 @@ typedef enum { STANDBY_DISABLED, STANDBY_INITIALIZED, - STANDBY_SNAPSHOT_PENDING, STANDBY_SNAPSHOT_READY } HotStandbyState; extern HotStandbyState standbyState; -#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_PENDING) +#define InHotStandby (standbyState >= STANDBY_SNAPSHOT_READY) /* * Recovery target type. diff --git a/src/include/c.h b/src/include/c.h index a61428843a..702658b089 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -462,6 +462,13 @@ typedef uint32 CommandId; #define InvalidCommandId (~(CommandId)0) /* + * CommitSeqNo is currently an LSN, but keep use a separate datatype for clarity. + */ +typedef uint64 CommitSeqNo; + +#define InvalidCommitSeqNo ((CommitSeqNo) 0) + +/* * Array indexing support */ #define MAXDIM 6 diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index c969375981..090d94b4b1 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -5085,8 +5085,6 @@ DATA(insert OID = 2945 ( txid_snapshot_xmin PGNSP PGUID 12 1 0 0 0 f f f f t DESCR("get xmin of snapshot"); DATA(insert OID = 2946 ( txid_snapshot_xmax PGNSP PGUID 12 1 0 0 0 f f f f t f i s 1 0 20 "2970" _null_ _null_ _null_ _null_ _null_ txid_snapshot_xmax _null_ _null_ _null_ )); DESCR("get xmax of snapshot"); -DATA(insert OID = 2947 ( txid_snapshot_xip PGNSP PGUID 12 1 50 0 0 f f f f t t i s 1 0 20 "2970" _null_ _null_ _null_ _null_ _null_ txid_snapshot_xip _null_ _null_ _null_ )); -DESCR("get set of in-progress txids in snapshot"); DATA(insert OID = 2948 ( txid_visible_in_snapshot PGNSP PGUID 12 1 0 0 0 f f f f t f i s 2 0 16 "20 2970" _null_ _null_ _null_ _null_ _null_ txid_visible_in_snapshot _null_ _null_ _null_ )); DESCR("is txid visible in snapshot?"); DATA(insert OID = 3360 ( txid_status PGNSP PGUID 12 1 0 0 0 f f f f t f v s 1 0 25 "20" _null_ _null_ _null_ _null_ _null_ txid_status _null_ _null_ _null_ )); diff --git a/src/include/replication/snapbuild.h b/src/include/replication/snapbuild.h index 7653717f83..6e93a9033f 100644 --- a/src/include/replication/snapbuild.h +++ b/src/include/replication/snapbuild.h @@ -20,30 +20,14 @@ typedef enum /* * Initial state, we can't do much yet. */ - SNAPBUILD_START = -1, + SNAPBUILD_START, /* - * Collecting committed transactions, to build the initial catalog - * snapshot. + * Found a point after hitting built_full_snapshot where all transactions + * that were running at that point finished. Till we reach that we hold + * off calling any commit callbacks. */ - SNAPBUILD_BUILDING_SNAPSHOT = 0, - - /* - * We have collected enough information to decode tuples in transactions - * that started after this. - * - * Once we reached this we start to collect changes. We cannot apply them - * yet, because they might be based on transactions that were still - * running when FULL_SNAPSHOT was reached. - */ - SNAPBUILD_FULL_SNAPSHOT = 1, - - /* - * Found a point after SNAPBUILD_FULL_SNAPSHOT where all transactions that - * were running at that point finished. Till we reach that we hold off - * calling any commit callbacks. - */ - SNAPBUILD_CONSISTENT = 2 + SNAPBUILD_CONSISTENT } SnapBuildState; /* forward declare so we don't have to expose the struct to the public */ @@ -57,10 +41,8 @@ struct ReorderBuffer; struct xl_heap_new_cid; struct xl_running_xacts; -extern void CheckPointSnapBuild(void); - extern SnapBuild *AllocateSnapshotBuilder(struct ReorderBuffer *cache, - TransactionId xmin_horizon, XLogRecPtr start_lsn, + XLogRecPtr start_lsn, bool need_full_snapshot); extern void FreeSnapshotBuilder(SnapBuild *cache); @@ -85,6 +67,7 @@ extern void SnapBuildProcessNewCid(SnapBuild *builder, TransactionId xid, XLogRecPtr lsn, struct xl_heap_new_cid *cid); extern void SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, struct xl_running_xacts *running); -extern void SnapBuildSerializationPoint(SnapBuild *builder, XLogRecPtr lsn); +extern void SnapBuildProcessInitialSnapshot(SnapBuild *builder, XLogRecPtr lsn, + TransactionId xmin, TransactionId xmax); #endif /* SNAPBUILD_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 596fdadc63..f54a6c6d70 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -197,7 +197,7 @@ typedef enum BuiltinTrancheIds { LWTRANCHE_CLOG_BUFFERS = NUM_INDIVIDUAL_LWLOCKS, LWTRANCHE_COMMITTS_BUFFERS, - LWTRANCHE_SUBTRANS_BUFFERS, + LWTRANCHE_CSNLOG_BUFFERS, LWTRANCHE_MXACTOFFSET_BUFFERS, LWTRANCHE_MXACTMEMBER_BUFFERS, LWTRANCHE_ASYNC_BUFFERS, diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index 205f484510..bc611fd8cc 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -23,24 +23,6 @@ #include "storage/proclist_types.h" /* - * Each backend advertises up to PGPROC_MAX_CACHED_SUBXIDS TransactionIds - * for non-aborted subtransactions of its current top transaction. These - * have to be treated as running XIDs by other backends. - * - * We also keep track of whether the cache overflowed (ie, the transaction has - * generated at least one subtransaction that didn't fit in the cache). - * If none of the caches have overflowed, we can assume that an XID that's not - * listed anywhere in the PGPROC array is not a running transaction. Else we - * have to look at pg_subtrans. - */ -#define PGPROC_MAX_CACHED_SUBXIDS 64 /* XXX guessed-at value */ - -struct XidCache -{ - TransactionId xids[PGPROC_MAX_CACHED_SUBXIDS]; -}; - -/* * Flags for PGXACT->vacuumFlags * * Note: If you modify these flags, you need to modify PROCARRAY_XXX flags @@ -77,6 +59,14 @@ struct XidCache #define INVALID_PGPROCNO PG_INT32_MAX /* + * The number of subtransactions below which we consider to apply clog group + * update optimization. Testing reveals that the number higher than this can + * hurt performance. + */ +#define THRESHOLD_SUBTRANS_CLOG_OPT 5 + + +/* * Each backend has a PGPROC struct in shared memory. There is also a list of * currently-unused PGPROC structs that will be reallocated to new backends. * @@ -156,8 +146,6 @@ struct PGPROC */ SHM_QUEUE myProcLocks[NUM_LOCK_PARTITIONS]; - struct XidCache subxids; /* cache for subtransaction XIDs */ - /* Support for group XID clearing. */ /* true, if member of ProcArray group waiting for XID clear */ bool procArrayGroupMember; @@ -176,12 +164,14 @@ struct PGPROC bool clogGroupMember; /* true, if member of clog group */ pg_atomic_uint32 clogGroupNext; /* next clog group member */ TransactionId clogGroupMemberXid; /* transaction id of clog group member */ - XidStatus clogGroupMemberXidStatus; /* transaction status of clog + CLogXidStatus clogGroupMemberXidStatus; /* transaction status of clog * group member */ int clogGroupMemberPage; /* clog page corresponding to * transaction id of clog group member */ XLogRecPtr clogGroupMemberLsn; /* WAL location of commit record for clog * group member */ + TransactionId clogGroupSubxids[THRESHOLD_SUBTRANS_CLOG_OPT]; + int clogGroupNSubxids; /* Per-backend LWLock. Protects fields below (but not group fields). */ LWLock backendLock; @@ -215,6 +205,9 @@ extern PGDLLIMPORT struct PGXACT *MyPgXact; * considerably on systems with many CPU cores, by reducing the number of * cache lines needing to be fetched. Thus, think very carefully before adding * anything else here. + * + * XXX: GetSnapshotData no longer does that, so perhaps we should put these + * back to PGPROC for simplicity's sake. */ typedef struct PGXACT { @@ -224,15 +217,17 @@ typedef struct PGXACT TransactionId xmin; /* minimal running XID as it was when we were * starting our xact, excluding LAZY VACUUM: - * vacuum must not remove tuples deleted by * xid >= xmin ! */ + CommitSeqNo snapshotcsn; /* oldest snapshot in use in this backend: + * vacuum must not remove tuples deleted by + * xacts with commit seqno > snapshotcsn ! + * XXX: currently unused, vacuum uses just xmin, still. + */ + uint8 vacuumFlags; /* vacuum-related flags, see above */ - bool overflowed; bool delayChkpt; /* true if this proc delays checkpoint start; * previously called InCommit */ - - uint8 nxids; } PGXACT; /* diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index 174c537be4..1e54b5d92c 100644 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -58,25 +58,18 @@ extern Size ProcArrayShmemSize(void); extern void CreateSharedProcArray(void); extern void ProcArrayAdd(PGPROC *proc); -extern void ProcArrayRemove(PGPROC *proc, TransactionId latestXid); +extern void ProcArrayRemove(PGPROC *proc); -extern void ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid); +extern void ProcArrayEndTransaction(PGPROC *proc); extern void ProcArrayClearTransaction(PGPROC *proc); +extern void ProcArrayResetXmin(PGPROC *proc); -extern void ProcArrayInitRecovery(TransactionId initializedUptoXID); +extern void ProcArrayInitRecovery(TransactionId oldestActiveXID, TransactionId initializedUptoXID); extern void ProcArrayApplyRecoveryInfo(RunningTransactions running); extern void ProcArrayApplyXidAssignment(TransactionId topxid, int nsubxids, TransactionId *subxids); extern void RecordKnownAssignedTransactionIds(TransactionId xid); -extern void ExpireTreeKnownAssignedTransactionIds(TransactionId xid, - int nsubxids, TransactionId *subxids, - TransactionId max_xid); -extern void ExpireAllKnownAssignedTransactionIds(void); -extern void ExpireOldKnownAssignedTransactionIds(TransactionId xid); - -extern int GetMaxSnapshotXidCount(void); -extern int GetMaxSnapshotSubxidCount(void); extern Snapshot GetSnapshotData(Snapshot snapshot); @@ -86,8 +79,9 @@ extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc); extern RunningTransactions GetRunningTransactionData(void); -extern bool TransactionIdIsInProgress(TransactionId xid); extern bool TransactionIdIsActive(TransactionId xid); +extern TransactionId GetRecentGlobalXmin(void); +extern TransactionId GetRecentGlobalDataXmin(void); extern TransactionId GetOldestXmin(Relation rel, int flags); extern TransactionId GetOldestActiveTransactionId(void); extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly); @@ -100,9 +94,8 @@ extern PGPROC *BackendPidGetProcWithLock(int pid); extern int BackendXidGetPid(TransactionId xid); extern bool IsBackendPid(int pid); -extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin, - bool excludeXmin0, bool allDbs, int excludeVacuum, - int *nvxids); +extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, + bool allDbs, int excludeVacuum, int *nvxids); extern VirtualTransactionId *GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid); extern pid_t CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode); @@ -114,10 +107,6 @@ extern int CountUserBackends(Oid roleid); extern bool CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared); -extern void XidCacheRemoveRunningXids(TransactionId xid, - int nxids, const TransactionId *xids, - TransactionId latestXid); - extern void ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, bool already_locked); diff --git a/src/include/storage/standby.h b/src/include/storage/standby.h index f5404b4c1f..80d0917615 100644 --- a/src/include/storage/standby.h +++ b/src/include/storage/standby.h @@ -50,10 +50,7 @@ extern void StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid extern void StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids); extern void StandbyReleaseAllLocks(void); -extern void StandbyReleaseOldLocks(int nxids, TransactionId *xids); - -#define MinSizeOfXactRunningXacts offsetof(xl_running_xacts, xids) - +extern void StandbyReleaseOldLocks(TransactionId oldestRunningXid); /* * Declarations for GetRunningTransactionData(). Similar to Snapshots, but @@ -69,14 +66,8 @@ extern void StandbyReleaseOldLocks(int nxids, TransactionId *xids); typedef struct RunningTransactionsData { - int xcnt; /* # of xact ids in xids[] */ - int subxcnt; /* # of subxact ids in xids[] */ - bool subxid_overflow; /* snapshot overflowed, subxids missing */ TransactionId nextXid; /* copy of ShmemVariableCache->nextXid */ - TransactionId oldestRunningXid; /* *not* oldestXmin */ - TransactionId latestCompletedXid; /* so we can set xmax */ - - TransactionId *xids; /* array of (sub)xids still running */ + TransactionId oldestRunningXid; /* *not* oldestXmin */ } RunningTransactionsData; typedef RunningTransactionsData *RunningTransactions; diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h index a0af6788e9..2bc167e5cc 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -46,16 +46,13 @@ typedef struct xl_standby_locks */ typedef struct xl_running_xacts { - int xcnt; /* # of xact ids in xids[] */ - int subxcnt; /* # of subxact ids in xids[] */ - bool subxid_overflow; /* snapshot overflowed, subxids missing */ TransactionId nextXid; /* copy of ShmemVariableCache->nextXid */ TransactionId oldestRunningXid; /* *not* oldestXmin */ TransactionId latestCompletedXid; /* so we can set xmax */ - - TransactionId xids[FLEXIBLE_ARRAY_MEMBER]; } xl_running_xacts; +#define SizeOfXactRunningXacts (offsetof(xl_running_xacts, latestCompletedXid) + sizeof(TransactionId)) + /* * Invalidations for standby, currently only when transactions without an * assigned xid commit. diff --git a/src/include/utils/snapmgr.h b/src/include/utils/snapmgr.h index fc64153780..bbef99b875 100644 --- a/src/include/utils/snapmgr.h +++ b/src/include/utils/snapmgr.h @@ -57,9 +57,6 @@ extern TimestampTz GetOldSnapshotThresholdTimestamp(void); extern bool FirstSnapshotSet; extern TransactionId TransactionXmin; -extern TransactionId RecentXmin; -extern PGDLLIMPORT TransactionId RecentGlobalXmin; -extern TransactionId RecentGlobalDataXmin; extern Snapshot GetTransactionSnapshot(void); extern Snapshot GetLatestSnapshot(void); diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index bf519778df..759cbd4fc8 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -60,37 +60,18 @@ typedef struct SnapshotData * specially by HeapTupleSatisfiesDirty, and xmin is used specially by * HeapTupleSatisfiesNonVacuumable.) * - * An MVCC snapshot can never see the effects of XIDs >= xmax. It can see - * the effects of all older XIDs except those listed in the snapshot. xmin - * is stored as an optimization to avoid needing to search the XID arrays - * for most tuples. + * An MVCC snapshot can see the effects of those XIDs that committed + * after snapshotlsn. xmin and xmax are stored as an optimization, to + * avoid checking the commit LSN for most tuples. */ TransactionId xmin; /* all XID < xmin are visible to me */ TransactionId xmax; /* all XID >= xmax are invisible to me */ /* - * For normal MVCC snapshot this contains the all xact IDs that are in - * progress, unless the snapshot was taken during recovery in which case - * it's empty. For historic MVCC snapshots, the meaning is inverted, i.e. - * it contains *committed* transactions between xmin and xmax. - * - * note: all ids in xip[] satisfy xmin <= xip[i] < xmax - */ - TransactionId *xip; - uint32 xcnt; /* # of xact ids in xip[] */ - - /* - * For non-historic MVCC snapshots, this contains subxact IDs that are in - * progress (and other transactions that are in progress if taken during - * recovery). For historic snapshot it contains *all* xids assigned to the - * replayed transaction, including the toplevel xid. - * - * note: all ids in subxip[] are >= xmin, but we don't bother filtering - * out any that are >= xmax + * This snapshot can see the effects of all transactions with CSN <= + * snapshotcsn. */ - TransactionId *subxip; - int32 subxcnt; /* # of xact ids in subxip[] */ - bool suboverflowed; /* has the subxip array overflowed? */ + CommitSeqNo snapshotcsn; bool takenDuringRecovery; /* recovery-shaped snapshot? */ bool copied; /* false if it's a static snapshot */ @@ -104,6 +85,14 @@ typedef struct SnapshotData uint32 speculativeToken; /* + * this_xip contains *all* xids assigned to the replayed transaction, + * including the toplevel xid. Used only in a historic MVCC snapshot, + * used in logical decoding. + */ + TransactionId *this_xip; + uint32 this_xcnt; /* # of xact ids in this_xip[] */ + + /* * Book-keeping information, used by the snapshot manager */ uint32 active_count; /* refcount on ActiveSnapshot stack */ diff --git a/src/include/utils/tqual.h b/src/include/utils/tqual.h index 96eaf01ca0..4666b35385 100644 --- a/src/include/utils/tqual.h +++ b/src/include/utils/tqual.h @@ -17,6 +17,7 @@ #include "utils/snapshot.h" #include "access/xlogdefs.h" +#include "access/transam.h" /* Static variables representing various special snapshot semantics */ @@ -78,7 +79,8 @@ extern HTSV_Result HeapTupleSatisfiesVacuum(HeapTuple htup, TransactionId OldestXmin, Buffer buffer); extern bool HeapTupleIsSurelyDead(HeapTuple htup, TransactionId OldestXmin); -extern bool XidInMVCCSnapshot(TransactionId xid, Snapshot snapshot); +extern bool XidVisibleInSnapshot(TransactionId xid, Snapshot snapshot, + TransactionIdStatus *hintstatus); extern void HeapTupleSetHintBits(HeapTupleHeader tuple, Buffer buffer, uint16 infomask, TransactionId xid); diff --git a/src/test/modules/mvcctorture/Makefile b/src/test/modules/mvcctorture/Makefile new file mode 100644 index 0000000000..cc4ebc838a --- /dev/null +++ b/src/test/modules/mvcctorture/Makefile @@ -0,0 +1,18 @@ +# src/test/modules/mvcctorture/Makefile + +MODULE_big = mvcctorture +OBJS = mvcctorture.o + +EXTENSION = mvcctorture +DATA = mvcctorture--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/mvcctorture +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/src/test/modules/mvcctorture/README b/src/test/modules/mvcctorture/README new file mode 100644 index 0000000000..915b00129a --- /dev/null +++ b/src/test/modules/mvcctorture/README @@ -0,0 +1,25 @@ +A litte helper module for testing MVCC performance. + +The populate_mvcc_test_table function can be used to create a test table, +with given number of rows. Each row in the table is stamped with a different +xmin, and XMIN_COMMITTED hint bit can be set or not. Furthermore, the +xmins values are shuffled, to defeat caching in transam.c and clog.c as badly +as possible. + +The test table is always called "mvcc_test_table". You'll have to drop it +yourself between tests. + +For example: + +-- Create a test table with 10 million rows, without setting hint bits +select populate_mvcc_test_table(10000000, false); + +-- See how long it takes to scan it +\timing +select count(*) from mvcc_test_table; + + + +If you do the above, but have another psql session open, in a transaction +that's done some updates, i.e. is holding backthe xmin horizon, you will +see the worst-case performance of the CSN patch. diff --git a/src/test/modules/mvcctorture/mvcctorture--1.0.sql b/src/test/modules/mvcctorture/mvcctorture--1.0.sql new file mode 100644 index 0000000000..652a6a3f39 --- /dev/null +++ b/src/test/modules/mvcctorture/mvcctorture--1.0.sql @@ -0,0 +1,9 @@ +/* src/test/modules/mvcctorture/mvcctorture--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION mvcctorture" to load this file. \quit + +CREATE FUNCTION populate_mvcc_test_table(int4, bool) +RETURNS void +AS 'MODULE_PATHNAME', 'populate_mvcc_test_table' +LANGUAGE C STRICT; diff --git a/src/test/modules/mvcctorture/mvcctorture.c b/src/test/modules/mvcctorture/mvcctorture.c new file mode 100644 index 0000000000..a89a2e6e96 --- /dev/null +++ b/src/test/modules/mvcctorture/mvcctorture.c @@ -0,0 +1,129 @@ +/*------------------------------------------------------------------------- + * + * mvctorture.c + * + * Copyright (c) 2012, PostgreSQL Global Development Group + * + * IDENTIFICATION + * src/test/modules/mvcctorture.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/hio.h" +#include "access/htup_details.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/visibilitymap.h" +#include "catalog/pg_am.h" +#include "executor/spi.h" +#include "funcapi.h" +#include "nodes/makefuncs.h" +#include "storage/bufmgr.h" +#include "utils/rel.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(populate_mvcc_test_table); + +Datum +populate_mvcc_test_table(PG_FUNCTION_ARGS) +{ + uint32 nrows = PG_GETARG_UINT32(0); + bool set_xmin_committed = PG_GETARG_BOOL(1); + RangeVar *rv; + Relation rel; + Datum values[1]; + bool isnull[1]; + HeapTuple tup; + TransactionId *xids; + int ret; + int i; + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + + /* Connect to SPI manager */ + if ((ret = SPI_connect()) < 0) + /* internal error */ + elog(ERROR, "populate_mvcc_test_table: SPI_connect returned %d", ret); + + SPI_execute("CREATE TABLE mvcc_test_table(i int4)", false, 0); + + SPI_finish(); + + /* Generate a different XID for each tuple */ + xids = (TransactionId *) palloc0(nrows * sizeof(TransactionId)); + for (i = 0; i < nrows; i++) + { + BeginInternalSubTransaction(NULL); + xids[i] = GetCurrentTransactionId(); + ReleaseCurrentSubTransaction(); + } + + rv = makeRangeVar(NULL, "mvcc_test_table", -1); + + rel = heap_openrv(rv, RowExclusiveLock); + + /* shuffle */ + for (i = 0; i < nrows - 1; i++) + { + int x = i + (random() % (nrows - i)); + TransactionId tmp; + + tmp = xids[i]; + xids[i] = xids[x]; + xids[x] = tmp; + } + + for (i = 0; i < nrows; i++) + { + values[0] = Int32GetDatum(i); + isnull[0] = false; + + tup = heap_form_tuple(RelationGetDescr(rel), values, isnull); + + /* Fill the header fields, like heap_prepare_insert does */ + tup->t_data->t_infomask &= ~(HEAP_XACT_MASK); + tup->t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); + tup->t_data->t_infomask |= HEAP_XMAX_INVALID; + if (set_xmin_committed) + tup->t_data->t_infomask |= HEAP_XMIN_COMMITTED; + HeapTupleHeaderSetXmin(tup->t_data, xids[i]); + HeapTupleHeaderSetCmin(tup->t_data, 1); + HeapTupleHeaderSetXmax(tup->t_data, 0); /* for cleanliness */ + tup->t_tableOid = RelationGetRelid(rel); + + heap_freetuple(tup); + + /* + * Find buffer to insert this tuple into. If the page is all visible, + * this will also pin the requisite visibility map page. + */ + buffer = RelationGetBufferForTuple(rel, tup->t_len, + InvalidBuffer, + 0, NULL, + &vmbuffer, NULL); + RelationPutHeapTuple(rel, buffer, tup, false); + + if (PageIsAllVisible(BufferGetPage(buffer))) + { + PageClearAllVisible(BufferGetPage(buffer)); + visibilitymap_clear(rel, + ItemPointerGetBlockNumber(&(tup->t_self)), + vmbuffer, VISIBILITYMAP_VALID_BITS); + } + + MarkBufferDirty(buffer); + UnlockReleaseBuffer(buffer); + } + + if (vmbuffer != InvalidBuffer) + ReleaseBuffer(vmbuffer); + + heap_close(rel, NoLock); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/mvcctorture/mvcctorture.control b/src/test/modules/mvcctorture/mvcctorture.control new file mode 100644 index 0000000000..1b5feb95a7 --- /dev/null +++ b/src/test/modules/mvcctorture/mvcctorture.control @@ -0,0 +1,5 @@ +# mvcctorture extension +comment = 'populate a table with a mix of different XIDs' +default_version = '1.0' +module_pathname = '$libdir/mvcctorture' +relocatable = true diff --git a/src/test/regress/expected/txid.out b/src/test/regress/expected/txid.out index 015dae3051..a53ada26ac 100644 --- a/src/test/regress/expected/txid.out +++ b/src/test/regress/expected/txid.out @@ -1,199 +1,44 @@ -- txid_snapshot data type and related functions -- i/o -select '12:13:'::txid_snapshot; +select '12:0/ABCDABCD'::txid_snapshot; txid_snapshot --------------- - 12:13: -(1 row) - -select '12:18:14,16'::txid_snapshot; - txid_snapshot ---------------- - 12:18:14,16 -(1 row) - -select '12:16:14,14'::txid_snapshot; - txid_snapshot ---------------- - 12:16:14 + 12:0/ABCDABCD (1 row) -- errors -select '31:12:'::txid_snapshot; -ERROR: invalid input syntax for type txid_snapshot: "31:12:" -LINE 1: select '31:12:'::txid_snapshot; - ^ -select '0:1:'::txid_snapshot; -ERROR: invalid input syntax for type txid_snapshot: "0:1:" -LINE 1: select '0:1:'::txid_snapshot; - ^ -select '12:13:0'::txid_snapshot; -ERROR: invalid input syntax for type txid_snapshot: "12:13:0" -LINE 1: select '12:13:0'::txid_snapshot; - ^ -select '12:16:14,13'::txid_snapshot; -ERROR: invalid input syntax for type txid_snapshot: "12:16:14,13" -LINE 1: select '12:16:14,13'::txid_snapshot; +select '0:0/ABCDABCD'::txid_snapshot; +ERROR: invalid input syntax for type txid_snapshot: "0:0/ABCDABCD" +LINE 1: select '0:0/ABCDABCD'::txid_snapshot; ^ create temp table snapshot_test ( nr integer, snap txid_snapshot ); -insert into snapshot_test values (1, '12:13:'); -insert into snapshot_test values (2, '12:20:13,15,18'); -insert into snapshot_test values (3, '100001:100009:100005,100007,100008'); -insert into snapshot_test values (4, '100:150:101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131'); +insert into snapshot_test values (1, '12:0/ABCDABCD'); select snap from snapshot_test order by nr; - snap -------------------------------------------------------------------------------------------------------------------------------------- - 12:13: - 12:20:13,15,18 - 100001:100009:100005,100007,100008 - 100:150:101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131 -(4 rows) + snap +--------------- + 12:0/ABCDABCD +(1 row) -select txid_snapshot_xmin(snap), - txid_snapshot_xmax(snap), - txid_snapshot_xip(snap) +select txid_snapshot_xmax(snap) from snapshot_test order by nr; - txid_snapshot_xmin | txid_snapshot_xmax | txid_snapshot_xip ---------------------+--------------------+------------------- - 12 | 20 | 13 - 12 | 20 | 15 - 12 | 20 | 18 - 100001 | 100009 | 100005 - 100001 | 100009 | 100007 - 100001 | 100009 | 100008 - 100 | 150 | 101 - 100 | 150 | 102 - 100 | 150 | 103 - 100 | 150 | 104 - 100 | 150 | 105 - 100 | 150 | 106 - 100 | 150 | 107 - 100 | 150 | 108 - 100 | 150 | 109 - 100 | 150 | 110 - 100 | 150 | 111 - 100 | 150 | 112 - 100 | 150 | 113 - 100 | 150 | 114 - 100 | 150 | 115 - 100 | 150 | 116 - 100 | 150 | 117 - 100 | 150 | 118 - 100 | 150 | 119 - 100 | 150 | 120 - 100 | 150 | 121 - 100 | 150 | 122 - 100 | 150 | 123 - 100 | 150 | 124 - 100 | 150 | 125 - 100 | 150 | 126 - 100 | 150 | 127 - 100 | 150 | 128 - 100 | 150 | 129 - 100 | 150 | 130 - 100 | 150 | 131 -(37 rows) + txid_snapshot_xmax +-------------------- + 12 +(1 row) +/* select id, txid_visible_in_snapshot(id, snap) from snapshot_test, generate_series(11, 21) id where nr = 2; - id | txid_visible_in_snapshot -----+-------------------------- - 11 | t - 12 | t - 13 | f - 14 | t - 15 | f - 16 | t - 17 | t - 18 | f - 19 | t - 20 | f - 21 | f -(11 rows) -- test bsearch select id, txid_visible_in_snapshot(id, snap) from snapshot_test, generate_series(90, 160) id where nr = 4; - id | txid_visible_in_snapshot ------+-------------------------- - 90 | t - 91 | t - 92 | t - 93 | t - 94 | t - 95 | t - 96 | t - 97 | t - 98 | t - 99 | t - 100 | t - 101 | f - 102 | f - 103 | f - 104 | f - 105 | f - 106 | f - 107 | f - 108 | f - 109 | f - 110 | f - 111 | f - 112 | f - 113 | f - 114 | f - 115 | f - 116 | f - 117 | f - 118 | f - 119 | f - 120 | f - 121 | f - 122 | f - 123 | f - 124 | f - 125 | f - 126 | f - 127 | f - 128 | f - 129 | f - 130 | f - 131 | f - 132 | t - 133 | t - 134 | t - 135 | t - 136 | t - 137 | t - 138 | t - 139 | t - 140 | t - 141 | t - 142 | t - 143 | t - 144 | t - 145 | t - 146 | t - 147 | t - 148 | t - 149 | t - 150 | f - 151 | f - 152 | f - 153 | f - 154 | f - 155 | f - 156 | f - 157 | f - 158 | f - 159 | f - 160 | f -(71 rows) - +*/ -- test current values also select txid_current() >= txid_snapshot_xmin(txid_current_snapshot()); ?column? @@ -208,98 +53,45 @@ select txid_visible_in_snapshot(txid_current(), txid_current_snapshot()); f (1 row) +/* -- test 64bitness -select txid_snapshot '1000100010001000:1000100010001100:1000100010001012,1000100010001013'; - txid_snapshot ---------------------------------------------------------------------- - 1000100010001000:1000100010001100:1000100010001012,1000100010001013 -(1 row) +select txid_snapshot '1000100010001000:1000100010001100:1000100010001012,1000100010001013'; select txid_visible_in_snapshot('1000100010001012', '1000100010001000:1000100010001100:1000100010001012,1000100010001013'); - txid_visible_in_snapshot --------------------------- - f -(1 row) - select txid_visible_in_snapshot('1000100010001015', '1000100010001000:1000100010001100:1000100010001012,1000100010001013'); - txid_visible_in_snapshot --------------------------- - t -(1 row) -- test 64bit overflow SELECT txid_snapshot '1:9223372036854775807:3'; - txid_snapshot -------------------------- - 1:9223372036854775807:3 -(1 row) - SELECT txid_snapshot '1:9223372036854775808:3'; -ERROR: invalid input syntax for type txid_snapshot: "1:9223372036854775808:3" -LINE 1: SELECT txid_snapshot '1:9223372036854775808:3'; - ^ + -- test txid_current_if_assigned BEGIN; SELECT txid_current_if_assigned() IS NULL; - ?column? ----------- - t -(1 row) - SELECT txid_current() \gset SELECT txid_current_if_assigned() IS NOT DISTINCT FROM BIGINT :'txid_current'; - ?column? ----------- - t -(1 row) - COMMIT; + -- test xid status functions BEGIN; SELECT txid_current() AS committed \gset COMMIT; + BEGIN; SELECT txid_current() AS rolledback \gset ROLLBACK; + BEGIN; SELECT txid_current() AS inprogress \gset -SELECT txid_status(:committed) AS committed; - committed ------------ - committed -(1 row) +SELECT txid_status(:committed) AS committed; SELECT txid_status(:rolledback) AS rolledback; - rolledback ------------- - aborted -(1 row) - SELECT txid_status(:inprogress) AS inprogress; - inprogress -------------- - in progress -(1 row) - SELECT txid_status(1); -- BootstrapTransactionId is always committed - txid_status -------------- - committed -(1 row) - SELECT txid_status(2); -- FrozenTransactionId is always committed - txid_status -------------- - committed -(1 row) - SELECT txid_status(3); -- in regress testing FirstNormalTransactionId will always be behind oldestXmin - txid_status -------------- - -(1 row) COMMIT; + BEGIN; CREATE FUNCTION test_future_xid_status(bigint) RETURNS void @@ -311,14 +103,9 @@ BEGIN RAISE EXCEPTION 'didn''t ERROR at xid in the future as expected'; EXCEPTION WHEN invalid_parameter_value THEN - RAISE NOTICE 'Got expected error for xid in the future'; + RAISE NOTICE 'Got expected error for xid in the future'; END; $$; SELECT test_future_xid_status(:inprogress + 10000); -NOTICE: Got expected error for xid in the future - test_future_xid_status ------------------------- - -(1 row) - ROLLBACK; +*/ diff --git a/src/test/regress/sql/txid.sql b/src/test/regress/sql/txid.sql index bd6decf0ef..6775e04e33 100644 --- a/src/test/regress/sql/txid.sql +++ b/src/test/regress/sql/txid.sql @@ -1,32 +1,22 @@ -- txid_snapshot data type and related functions -- i/o -select '12:13:'::txid_snapshot; -select '12:18:14,16'::txid_snapshot; -select '12:16:14,14'::txid_snapshot; +select '12:0/ABCDABCD'::txid_snapshot; -- errors -select '31:12:'::txid_snapshot; -select '0:1:'::txid_snapshot; -select '12:13:0'::txid_snapshot; -select '12:16:14,13'::txid_snapshot; +select '0:0/ABCDABCD'::txid_snapshot; create temp table snapshot_test ( nr integer, snap txid_snapshot ); -insert into snapshot_test values (1, '12:13:'); -insert into snapshot_test values (2, '12:20:13,15,18'); -insert into snapshot_test values (3, '100001:100009:100005,100007,100008'); -insert into snapshot_test values (4, '100:150:101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,127,128,129,130,131'); +insert into snapshot_test values (1, '12:0/ABCDABCD'); select snap from snapshot_test order by nr; -select txid_snapshot_xmin(snap), - txid_snapshot_xmax(snap), - txid_snapshot_xip(snap) +select txid_snapshot_xmax(snap) from snapshot_test order by nr; - +/* select id, txid_visible_in_snapshot(id, snap) from snapshot_test, generate_series(11, 21) id where nr = 2; @@ -35,7 +25,7 @@ where nr = 2; select id, txid_visible_in_snapshot(id, snap) from snapshot_test, generate_series(90, 160) id where nr = 4; - +*/ -- test current values also select txid_current() >= txid_snapshot_xmin(txid_current_snapshot()); @@ -43,6 +33,7 @@ select txid_current() >= txid_snapshot_xmin(txid_current_snapshot()); select txid_visible_in_snapshot(txid_current(), txid_current_snapshot()); +/* -- test 64bitness select txid_snapshot '1000100010001000:1000100010001100:1000100010001012,1000100010001013'; @@ -92,8 +83,9 @@ BEGIN RAISE EXCEPTION 'didn''t ERROR at xid in the future as expected'; EXCEPTION WHEN invalid_parameter_value THEN - RAISE NOTICE 'Got expected error for xid in the future'; + RAISE NOTICE 'Got expected error for xid in the future'; END; $$; SELECT test_future_xid_status(:inprogress + 10000); ROLLBACK; +*/ diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index b422050a92..ca7343f636 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -34,6 +34,9 @@ AfterTriggerEventList AfterTriggerShared AfterTriggerSharedData AfterTriggersData +AfterTriggersQueryData +AfterTriggersTableData +AfterTriggersTransData Agg AggClauseCosts AggInfo @@ -125,7 +128,6 @@ ArrayMetaState ArrayParseState ArrayRef ArrayRefState -ArrayRemapInfo ArrayType AsyncQueueControl AsyncQueueEntry @@ -143,7 +145,6 @@ AutoVacOpts AutoVacuumShmemStruct AutoVacuumWorkItem AutoVacuumWorkItemType -AutovacWorkItems AuxProcType BF_ctx BF_key @@ -635,6 +636,7 @@ FileFdwPlanState FileName FileNameMap FindSplitData +FixedParallelExecutorState FixedParallelState FixedParamState FlagMode @@ -1021,13 +1023,13 @@ InsertStmt Instrumentation Int128AggState Int8TransTypeData +IntRBTreeNode InternalDefaultACL InternalGrant Interval IntoClause InvalidationChunk InvalidationListHeader -InvertedWalkNextStep IpcMemoryId IpcMemoryKey IpcSemaphoreId @@ -1571,6 +1573,7 @@ PartitionListValue PartitionRangeBound PartitionRangeDatum PartitionRangeDatumKind +PartitionScheme PartitionSpec PartitionedChildRelInfo PasswordType @@ -1781,7 +1784,6 @@ RangeBox RangeFunction RangeIOData RangeQueryClause -RangeRemapInfo RangeSubselect RangeTableFunc RangeTableFuncCol @@ -1794,6 +1796,7 @@ RangeVar RangeVarGetRelidCallback RawColumnDefault RawStmt +ReInitializeDSMForeignScan_function ReScanForeignScan_function ReadBufPtrType ReadBufferMode @@ -1805,8 +1808,6 @@ RecheckForeignScan_function RecordCacheEntry RecordCompareData RecordIOData -RecordRemapInfo -RecordTypmodMap RecoveryTargetAction RecoveryTargetType RectBox @@ -2297,9 +2298,10 @@ TupleHashEntryData TupleHashIterator TupleHashTable TupleQueueReader -TupleRemapClass -TupleRemapInfo TupleTableSlot +TuplesortInstrumentation +TuplesortMethod +TuplesortSpaceType Tuplesortstate Tuplestorestate TwoPhaseCallback @@ -2329,7 +2331,6 @@ UChar UCharIterator UCollator UConverter -UEnumeration UErrorCode UINT ULARGE_INTEGER @@ -2612,7 +2613,9 @@ dsa_pointer dsa_segment_header dsa_segment_index dsa_segment_map +dshash_compare_function dshash_hash +dshash_hash_function dshash_parameters dshash_partition dshash_table