Author: Noah Misch Commit: Noah Misch Prevent concurrent SimpleLruTruncate() for any given SLRU. The SimpleLruTruncate() header comment states the new coding rule. This closes race conditions that presented rare opportunities for data loss. Symptoms and recommendations to users mirror the previous commit[1]. Back-patch to 9.5 (all supported versions). Reviewed by FIXME. Discussion: https://postgr.es/m/20190218073103.GA1434723@rfd.leadboat.com [1] (footnote to be removed before commit) refers to a patch of thread https://postgr.es/m/flat/20190217040913.GA1305059@rfd.leadboat.com diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml index 49d4bb1..1125c3c 100644 --- a/doc/src/sgml/monitoring.sgml +++ b/doc/src/sgml/monitoring.sgml @@ -1717,6 +1717,12 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting to extend a relation. + frozenid + Waiting to + update pg_database.datfrozenxid + and pg_database.datminmxid. + + object Waiting to acquire a lock on a non-relation database object. @@ -1885,6 +1891,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting to read or update NOTIFY messages. + NotifyQueueTail + Waiting to update limit on NOTIFY message + storage. + + NotifySLRU Waiting to access the NOTIFY message SLRU cache. @@ -2061,6 +2072,11 @@ postgres 27093 0.0 0.0 30096 2752 ? Ss 11:34 0:00 postgres: ser Waiting for WAL buffers to be written to disk. + WrapLimitsVacuum + Waiting to update limits on transaction id and multixact + consumption. + + XactBuffer Waiting for I/O on a transaction status SLRU buffer. diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 61249f4..2750580 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -1208,6 +1208,14 @@ SimpleLruFlush(SlruCtl ctl, bool allow_redirtied) /* * Remove all segments before the one holding the passed page number + * + * All SLRUs prevent concurrent calls to this function, either with an LWLock + * or by calling it only as part of a checkpoint. Mutual exclusion must begin + * before computing cutoffPage. Mutual exclusion must end after any limit + * update that would permit other backends to write fresh data into the + * segment immediately preceding the one containing cutoffPage. Otherwise, + * when the SLRU is quite full, SimpleLruTruncate() might delete that segment + * after it has accrued freshly-written data. */ void SimpleLruTruncate(SlruCtl ctl, int cutoffPage) diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c index f33ae40..d9a31d3 100644 --- a/src/backend/access/transam/subtrans.c +++ b/src/backend/access/transam/subtrans.c @@ -349,8 +349,8 @@ ExtendSUBTRANS(TransactionId newestXact) /* * Remove all SUBTRANS segments before the one holding the passed transaction ID * - * This is normally called during checkpoint, with oldestXact being the - * oldest TransactionXmin of any running transaction. + * oldestXact is the oldest TransactionXmin of any running transaction. This + * is called only during checkpoint. */ void TruncateSUBTRANS(TransactionId oldestXact) diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index a3ba88d..4e0db8e 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -244,19 +244,22 @@ typedef struct QueueBackendStatus /* * Shared memory state for LISTEN/NOTIFY (excluding its SLRU stuff) * - * The AsyncQueueControl structure is protected by the NotifyQueueLock. + * The AsyncQueueControl structure is protected by the NotifyQueueLock and + * NotifyQueueTailLock. * - * When holding the lock in SHARED mode, backends may only inspect their own - * entries as well as the head and tail pointers. Consequently we can allow a - * backend to update its own record while holding only SHARED lock (since no - * other backend will inspect it). + * When holding NotifyQueueLock in SHARED mode, backends may only inspect + * their own entries as well as the head and tail pointers. Consequently we + * can allow a backend to update its own record while holding only SHARED lock + * (since no other backend will inspect it). * - * When holding the lock in EXCLUSIVE mode, backends can inspect the entries - * of other backends and also change the head and tail pointers. + * When holding NotifyQueueLock in EXCLUSIVE mode, backends can inspect the + * entries of other backends and also change the head pointer. When holding + * both NotifyQueueLock and NotifyQueueTailLock in EXCLUSIVE mode, backends + * can change the tail pointer. * * NotifySLRULock is used as the control lock for the pg_notify SLRU buffers. - * In order to avoid deadlocks, whenever we need both locks, we always first - * get NotifyQueueLock and then NotifySLRULock. + * In order to avoid deadlocks, whenever we need multiple locks, we first get + * NotifyQueueTailLock, then NotifyQueueLock, and lastly NotifySLRULock. * * Each backend uses the backend[] array entry with index equal to its * BackendId (which can range from 1 to MaxBackends). We rely on this to make @@ -2177,6 +2180,10 @@ asyncQueueAdvanceTail(void) int newtailpage; int boundary; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(NotifyQueueTailLock, LW_EXCLUSIVE); + + /* Compute the new tail. */ LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE); min = QUEUE_HEAD; for (BackendId i = QUEUE_FIRST_LISTENER; i > 0; i = QUEUE_NEXT_LISTENER(i)) @@ -2185,7 +2192,6 @@ asyncQueueAdvanceTail(void) min = QUEUE_POS_MIN(min, QUEUE_BACKEND_POS(i)); } oldtailpage = QUEUE_POS_PAGE(QUEUE_TAIL); - QUEUE_TAIL = min; LWLockRelease(NotifyQueueLock); /* @@ -2205,6 +2211,17 @@ asyncQueueAdvanceTail(void) */ SimpleLruTruncate(NotifyCtl, newtailpage); } + + /* + * Advertise the new tail. This changes asyncQueueIsFull()'s verdict for + * the segment immediately prior to the new tail, allowing fresh data into + * that segment. + */ + LWLockAcquire(NotifyQueueLock, LW_EXCLUSIVE); + QUEUE_TAIL = min; + LWLockRelease(NotifyQueueLock); + + LWLockRelease(NotifyQueueTailLock); } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 5a110ed..2911761 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -1345,6 +1345,14 @@ vac_update_datfrozenxid(void) bool dirty = false; /* + * Restrict this task to one backend per database. This avoids race + * conditions that would move datfrozenxid or datminmxid backward or call + * vac_truncate_clog() with a datfrozenxid preceding a datfrozenxid passed + * to an earlier vac_truncate_clog() call. + */ + LockDatabaseFrozenIds(ExclusiveLock); + + /* * Initialize the "min" calculation with GetOldestXmin, which is a * reasonable approximation to the minimum relfrozenxid for not-yet- * committed pg_class entries for new tables; see AddNewRelationTuple(). @@ -1533,6 +1541,9 @@ vac_truncate_clog(TransactionId frozenXID, bool bogus = false; bool frozenAlreadyWrapped = false; + /* Restrict task to one backend per cluster; see SimpleLruTruncate(). */ + LWLockAcquire(WrapLimitsVacuumLock, LW_EXCLUSIVE); + /* init oldest datoids to sync with my frozenXID/minMulti values */ oldestxid_datoid = MyDatabaseId; minmulti_datoid = MyDatabaseId; @@ -1642,6 +1653,8 @@ vac_truncate_clog(TransactionId frozenXID, */ SetTransactionIdLimit(frozenXID, oldestxid_datoid); SetMultiXactIdLimit(minMulti, minmulti_datoid, false); + + LWLockRelease(WrapLimitsVacuumLock); } diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index 2010320..7409de9 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -461,6 +461,21 @@ UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) } /* + * LockDatabaseFrozenIds + * + * This allows one backend per database to execute vac_update_datfrozenxid(). + */ +void +LockDatabaseFrozenIds(LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* * LockPage * * Obtain a page-level lock. This is currently used by some index access @@ -1098,6 +1113,11 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field2, tag->locktag_field1); break; + case LOCKTAG_DATABASE_FROZEN_IDS: + appendStringInfo(buf, + _("pg_database.datfrozenxid of database %u"), + tag->locktag_field1); + break; case LOCKTAG_PAGE: appendStringInfo(buf, _("page %u of relation %u of database %u"), diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index e6985e8..a1e62de 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -50,3 +50,5 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 XactTruncationLock 44 +WrapLimitsVacuumLock 45 +NotifyQueueTailLock 46 diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index e992d1b..f592292 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -29,6 +29,7 @@ const char *const LockTagTypeNames[] = { "relation", "extend", + "frozenid", "page", "tuple", "transactionid", @@ -254,6 +255,17 @@ pg_lock_status(PG_FUNCTION_ARGS) nulls[8] = true; nulls[9] = true; break; + case LOCKTAG_DATABASE_FROZEN_IDS: + values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); + nulls[2] = true; + nulls[3] = true; + nulls[4] = true; + nulls[5] = true; + nulls[6] = true; + nulls[7] = true; + nulls[8] = true; + nulls[9] = true; + break; case LOCKTAG_PAGE: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 3acc11a..f7cabcb 100644 --- a/src/include/storage/lmgr.h +++ b/src/include/storage/lmgr.h @@ -59,6 +59,9 @@ extern bool ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode); extern int RelationExtensionLockWaiterCount(Relation relation); +/* Lock to recompute pg_database.datfrozenxid in the current database */ +extern void LockDatabaseFrozenIds(LOCKMODE lockmode); + /* Lock a page (currently only used within indexes) */ extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index fdabf42..1c3e9c1 100644 --- a/src/include/storage/lock.h +++ b/src/include/storage/lock.h @@ -138,6 +138,7 @@ typedef enum LockTagType { LOCKTAG_RELATION, /* whole relation */ LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ + LOCKTAG_DATABASE_FROZEN_IDS, /* pg_database.datfrozenxid */ LOCKTAG_PAGE, /* one page of a relation */ LOCKTAG_TUPLE, /* one physical tuple */ LOCKTAG_TRANSACTION, /* transaction (for waiting for xact done) */ @@ -194,6 +195,15 @@ typedef struct LOCKTAG (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) +/* ID info for frozen IDs is DB OID */ +#define SET_LOCKTAG_DATABASE_FROZEN_IDS(locktag,dboid) \ + ((locktag).locktag_field1 = (dboid), \ + (locktag).locktag_field2 = 0, \ + (locktag).locktag_field3 = 0, \ + (locktag).locktag_field4 = 0, \ + (locktag).locktag_type = LOCKTAG_DATABASE_FROZEN_IDS, \ + (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) + /* ID info for a page is RELATION info + BlockNumber */ #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ ((locktag).locktag_field1 = (dboid), \