Index: src/backend/access/transam/twophase.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/twophase.c,v
retrieving revision 1.28
diff -c -r1.28 twophase.c
*** src/backend/access/transam/twophase.c	13 Feb 2007 19:39:42 -0000	1.28
--- src/backend/access/transam/twophase.c	3 Apr 2007 09:20:04 -0000
***************
*** 279,284 ****
--- 279,285 ----
  	gxact->proc.pid = 0;
  	gxact->proc.databaseId = databaseid;
  	gxact->proc.roleId = owner;
+ 	gxact->proc.inCommit = false;
  	gxact->proc.inVacuum = false;
  	gxact->proc.isAutovacuum = false;
  	gxact->proc.lwWaiting = false;
***************
*** 853,858 ****
--- 854,860 ----
  	pg_crc32	statefile_crc;
  	pg_crc32	bogus_crc;
  	int			fd;
+ 	volatile PGPROC *myproc = MyProc;
  
  	/* Add the end sentinel to the list of 2PC records */
  	RegisterTwoPhaseRecord(TWOPHASE_RM_END_ID, 0,
***************
*** 938,951 ****
  	 * starting immediately after the WAL record is inserted could complete
  	 * without fsync'ing our state file.  (This is essentially the same kind
  	 * of race condition as the COMMIT-to-clog-write case that
! 	 * RecordTransactionCommit uses CheckpointStartLock for; see notes there.)
  	 *
  	 * We save the PREPARE record's location in the gxact for later use by
  	 * CheckPointTwoPhase.
  	 */
  	START_CRIT_SECTION();
  
! 	LWLockAcquire(CheckpointStartLock, LW_SHARED);
  
  	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
  									records.head);
--- 940,953 ----
  	 * starting immediately after the WAL record is inserted could complete
  	 * without fsync'ing our state file.  (This is essentially the same kind
  	 * of race condition as the COMMIT-to-clog-write case that
! 	 * RecordTransactionCommit uses inCommit flag for; see notes there.)
  	 *
  	 * We save the PREPARE record's location in the gxact for later use by
  	 * CheckPointTwoPhase.
  	 */
  	START_CRIT_SECTION();
  
! 	myproc->inCommit = true;
  
  	gxact->prepare_lsn = XLogInsert(RM_XACT_ID, XLOG_XACT_PREPARE,
  									records.head);
***************
*** 981,992 ****
  	 */
  	MarkAsPrepared(gxact);
  
- 	/*
- 	 * Now we can release the checkpoint start lock: a checkpoint starting
- 	 * after this will certainly see the gxact as a candidate for fsyncing.
- 	 */
- 	LWLockRelease(CheckpointStartLock);
- 
  	END_CRIT_SECTION();
  
  	records.tail = records.head = NULL;
--- 983,988 ----
***************
*** 1649,1655 ****
   *	RecordTransactionCommitPrepared
   *
   * This is basically the same as RecordTransactionCommit: in particular,
!  * we must take the CheckpointStartLock to avoid a race condition.
   *
   * We know the transaction made at least one XLOG entry (its PREPARE),
   * so it is never possible to optimize out the commit record.
--- 1645,1651 ----
   *	RecordTransactionCommitPrepared
   *
   * This is basically the same as RecordTransactionCommit: in particular,
!  * we must set the inCommit flag to avoid a race condition.
   *
   * We know the transaction made at least one XLOG entry (its PREPARE),
   * so it is never possible to optimize out the commit record.
***************
*** 1665,1675 ****
  	int			lastrdata = 0;
  	xl_xact_commit_prepared xlrec;
  	XLogRecPtr	recptr;
  
  	START_CRIT_SECTION();
  
  	/* See notes in RecordTransactionCommit */
! 	LWLockAcquire(CheckpointStartLock, LW_SHARED);
  
  	/* Emit the XLOG commit record */
  	xlrec.xid = xid;
--- 1661,1672 ----
  	int			lastrdata = 0;
  	xl_xact_commit_prepared xlrec;
  	XLogRecPtr	recptr;
+ 	volatile PGPROC *myproc = MyProc;
  
  	START_CRIT_SECTION();
  
  	/* See notes in RecordTransactionCommit */
! 	myproc->inCommit = true;
  
  	/* Emit the XLOG commit record */
  	xlrec.xid = xid;
***************
*** 1713,1721 ****
  	/* to avoid race conditions, the parent must commit first */
  	TransactionIdCommitTree(nchildren, children);
  
- 	/* Checkpoint is allowed again */
- 	LWLockRelease(CheckpointStartLock);
- 
  	END_CRIT_SECTION();
  }
  
--- 1710,1715 ----
Index: src/backend/access/transam/xact.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xact.c,v
retrieving revision 1.238
diff -c -r1.238 xact.c
*** src/backend/access/transam/xact.c	22 Mar 2007 19:55:04 -0000	1.238
--- src/backend/access/transam/xact.c	3 Apr 2007 12:30:32 -0000
***************
*** 718,739 ****
  
  		/*
  		 * If our transaction made any transaction-controlled XLOG entries, we
! 		 * need to lock out checkpoint start between writing our XLOG record
! 		 * and updating pg_clog.  Otherwise it is possible for the checkpoint
! 		 * to set REDO after the XLOG record but fail to flush the pg_clog
! 		 * update to disk, leading to loss of the transaction commit if we
! 		 * crash a little later.  Slightly klugy fix for problem discovered
! 		 * 2004-08-10.
  		 *
  		 * (If it made no transaction-controlled XLOG entries, its XID appears
  		 * nowhere in permanent storage, so no one else will ever care if it
  		 * committed; so it doesn't matter if we lose the commit flag.)
- 		 *
- 		 * Note we only need a shared lock.
  		 */
  		madeTCentries = (MyLastRecPtr.xrecoff != 0);
  		if (madeTCentries)
! 			LWLockAcquire(CheckpointStartLock, LW_SHARED);
  
  		/*
  		 * We only need to log the commit in XLOG if the transaction made any
--- 718,750 ----
  
  		/*
  		 * If our transaction made any transaction-controlled XLOG entries, we
! 		 * need to tell checkpoint to wait until we've updated pg_clog.
! 		 * Otherwise it is possible for the checkpoint to set REDO after the
! 		 * XLOG record but fail to flush the pg_clog update to disk, leading
! 		 * to loss of the transaction commit if we crash a little later.
  		 *
  		 * (If it made no transaction-controlled XLOG entries, its XID appears
  		 * nowhere in permanent storage, so no one else will ever care if it
  		 * committed; so it doesn't matter if we lose the commit flag.)
  		 */
  		madeTCentries = (MyLastRecPtr.xrecoff != 0);
  		if (madeTCentries)
! 		{
! 			/* Use volatile pointer to prevent code rearrangement.  We want
! 			 * to make sure that other backends (actually bgwriter) to see
! 			 * that we're committing before we insert a commit record.
! 			 *
! 			 * It's safe to set the inCommit flag of my own backend without
! 			 * holding the ProcArrayLock, we're the only one modifying it.
! 			 * Checkpoint will hold ProcArrayLock when reading the value,
! 			 * because it needs to atomically make note of the xid 
! 			 * associated with the flag.
! 			 *
! 			 * The flag is cleared later together with MyProc->xid.
! 			 */
! 			volatile PGPROC *myproc = MyProc;
! 			myproc->inCommit = true;
! 		}
  
  		/*
  		 * We only need to log the commit in XLOG if the transaction made any
***************
*** 825,834 ****
  			TransactionIdCommitTree(nchildren, children);
  		}
  
- 		/* Unlock checkpoint lock if we acquired it */
- 		if (madeTCentries)
- 			LWLockRelease(CheckpointStartLock);
- 
  		END_CRIT_SECTION();
  	}
  
--- 836,841 ----
***************
*** 1556,1561 ****
--- 1563,1569 ----
  		MyProc->xid = InvalidTransactionId;
  		MyProc->xmin = InvalidTransactionId;
  		MyProc->inVacuum = false;		/* must be cleared with xid/xmin */
+ 		MyProc->inCommit = false;
  
  		/* Clear the subtransaction-XID cache too while holding the lock */
  		MyProc->subxids.nxids = 0;
***************
*** 1795,1800 ****
--- 1803,1809 ----
  	MyProc->xid = InvalidTransactionId;
  	MyProc->xmin = InvalidTransactionId;
  	MyProc->inVacuum = false;	/* must be cleared with xid/xmin */
+ 	MyProc->inCommit = false;
  
  	/* Clear the subtransaction-XID cache too while holding the lock */
  	MyProc->subxids.nxids = 0;
***************
*** 1961,1966 ****
--- 1970,1976 ----
  		MyProc->xid = InvalidTransactionId;
  		MyProc->xmin = InvalidTransactionId;
  		MyProc->inVacuum = false;		/* must be cleared with xid/xmin */
+ 		MyProc->inCommit = false;
  
  		/* Clear the subtransaction-XID cache too while holding the lock */
  		MyProc->subxids.nxids = 0;
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.265
diff -c -r1.265 xlog.c
*** src/backend/access/transam/xlog.c	3 Mar 2007 20:02:26 -0000	1.265
--- src/backend/access/transam/xlog.c	3 Apr 2007 12:34:43 -0000
***************
*** 5356,5361 ****
--- 5356,5363 ----
  	int			nsegsadded = 0;
  	int			nsegsremoved = 0;
  	int			nsegsrecycled = 0;
+ 	TransactionId *inCommitXids;
+ 	int			nInCommit;
  
  	/*
  	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
***************
*** 5381,5394 ****
  	checkPoint.ThisTimeLineID = ThisTimeLineID;
  	checkPoint.time = time(NULL);
  
- 	/*
- 	 * We must hold CheckpointStartLock while determining the checkpoint REDO
- 	 * pointer.  This ensures that any concurrent transaction commits will be
- 	 * either not yet logged, or logged and recorded in pg_clog. See notes in
- 	 * RecordTransactionCommit().
- 	 */
- 	LWLockAcquire(CheckpointStartLock, LW_EXCLUSIVE);
- 
  	/* And we need WALInsertLock too */
  	LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
  
--- 5383,5388 ----
***************
*** 5421,5433 ****
  			ControlFile->checkPointCopy.redo.xrecoff)
  		{
  			LWLockRelease(WALInsertLock);
- 			LWLockRelease(CheckpointStartLock);
  			LWLockRelease(CheckpointLock);
  			END_CRIT_SECTION();
  			return;
  		}
  	}
  
  	/*
  	 * Compute new REDO record ptr = location of next XLOG record.
  	 *
--- 5415,5441 ----
  			ControlFile->checkPointCopy.redo.xrecoff)
  		{
  			LWLockRelease(WALInsertLock);
  			LWLockRelease(CheckpointLock);
  			END_CRIT_SECTION();
  			return;
  		}
  	}
  
+ 	/* Make note of all transactions that have started but not yet finished
+ 	 * committing while we compute the new REDO ptr.
+ 	 *
+ 	 * NB: GetInCommitTransactions acquires ProcArrayLock lock while we're 
+ 	 * holding WALInsertLock.  That's not currently prone to deadlocks because
+ 	 * we always acquire them in the same order, but keep that in mind if you
+ 	 * fiddle with the locking.  It's important that we hold WALInsertLock,
+ 	 * otherwise a transaction might start committing and insert a commit 
+ 	 * record right after we call GetInCommitTransactions.  Holding the lock
+ 	 * ensures that even if someone starts committing after our 
+ 	 * GetInCommitTransactions call, he can't insert his commit record earlier
+ 	 * than our REDO pointer.
+ 	 */
+ 	inCommitXids = GetInCommitTransactions(&nInCommit);
+ 
  	/*
  	 * Compute new REDO record ptr = location of next XLOG record.
  	 *
***************
*** 5471,5478 ****
  	 */
  	LWLockRelease(WALInsertLock);
  
- 	LWLockRelease(CheckpointStartLock);
- 
  	if (!shutdown)
  		ereport(DEBUG2,
  				(errmsg("checkpoint starting")));
--- 5479,5484 ----
***************
*** 5509,5514 ****
--- 5515,5530 ----
  	 */
  	END_CRIT_SECTION();
  
+ 	/*
+ 	 * We must wait until all transactions that were committing when we 
+ 	 * determined the checkpoint REDO pointer are finished.  This ensures 
+ 	 * that any concurrent transaction commits will be either not yet logged,
+ 	 * or logged and recorded in pg_clog.  See notes in
+ 	 * RecordTransactionCommit().
+ 	 */
+ 	WaitForCommitsToFinish(inCommitXids, nInCommit);
+ 	pfree(inCommitXids);
+ 
  	CheckPointGuts(checkPoint.redo);
  
  	START_CRIT_SECTION();
Index: src/backend/storage/ipc/procarray.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/ipc/procarray.c,v
retrieving revision 1.23
diff -c -r1.23 procarray.c
*** src/backend/storage/ipc/procarray.c	25 Mar 2007 19:45:14 -0000	1.23
--- src/backend/storage/ipc/procarray.c	3 Apr 2007 12:31:28 -0000
***************
*** 740,745 ****
--- 740,841 ----
  }
  
  /*
+  * GetInCommitTransactions -- Returns an array of xids that are committing
+  *
+  * Gets a list of transactions that have started but not yet finished 
+  * committing.  Transactions in the middle of a PREPARE TRANSACTION are
+  * also returned.
+  *
+  * The meaning of commit-in-progress is that the transaction has inserted
+  * or is about to insert the commit record, but all on-disk changes have
+  * not been written yet.
+  *
+  * The returned array is palloc'd and the number of xids in it is stored
+  * in *nxids_ptr.
+  */
+ TransactionId *
+ GetInCommitTransactions(int *nxids_ptr)
+ {
+ 	ProcArrayStruct *arrayP = procArray;
+ 	TransactionId *result;
+ 	int	nxids;
+ 	int	index;
+ 
+ 	result = palloc(sizeof(TransactionId) * MaxBackends);
+ 	nxids = 0;
+ 
+ 	LWLockAcquire(ProcArrayLock, LW_SHARED);
+ 
+ 	for (index = 0; index < arrayP->numProcs; index++)
+ 	{
+ 		PGPROC	   *proc = arrayP->procs[index];
+ 
+ 		if (proc->inCommit)
+ 			result[nxids++] = proc->xid;
+ 	}
+ 
+ 	LWLockRelease(ProcArrayLock);
+ 
+ 	*nxids_ptr = nxids;
+ 	return result;
+ }
+ 
+ /*
+  * WaitForCommitsToFinish -- wait for given xids to finish committing
+  *
+  * NOTE: The contents of the input array are destroyed.
+  *
+  * NOTE: The current implementation is very inefficient.  Waiting is 
+  * implemented by sleeping and polling the proc array, and matching the
+  * in-progress commits with the ones to wait for has O(n^2) behavior.
+  * This is currently only used in checkpoints, so we can live with it.
+  */
+ void
+ WaitForCommitsToFinish(TransactionId *xids, int nxids)
+ {
+ 	TransactionId *currentlyInCommit;
+ 	int nCurrentlyInCommit;
+ 	int i, j;
+ 
+ 	for(;;)
+ 	{
+ 		currentlyInCommit = GetInCommitTransactions(&nCurrentlyInCommit);
+ 
+ 		/* Remove all entries from xids-array that are no longer 
+ 		 * commit-in-progress */
+ 		for(i = 0; i < nxids; i++)
+ 		{
+ 			bool found = false;
+ 			for(j = 0; j < nCurrentlyInCommit; j++)
+ 			{
+ 				if(TransactionIdEquals(currentlyInCommit[j], xids[i]))
+ 				{
+ 					found = true;
+ 					break;
+ 				}
+ 			}
+ 			if (!found)
+ 			{
+ 				/* This commit is no longer in progress.  Remove from array */
+ 				xids[i] = xids[nxids - 1];
+ 				nxids--;
+ 				i--;
+ 			}
+ 		}
+ 
+ 		pfree(currentlyInCommit);
+ 		
+ 		if(nxids == 0)
+ 		{
+ 			/* We're done, none of the original xids are inCommit anymore. */
+ 			break;
+ 		}
+ 
+ 		pg_usleep(100000L); /* sleep for 100 ms and try again */
+ 	}
+ }
+ 
+ /*
   * BackendPidGetProc -- get a backend's PGPROC given its PID
   *
   * Returns NULL if not found.  Note that it is up to the caller to be
Index: src/backend/storage/lmgr/proc.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/lmgr/proc.c,v
retrieving revision 1.186
diff -c -r1.186 proc.c
*** src/backend/storage/lmgr/proc.c	7 Mar 2007 13:35:03 -0000	1.186
--- src/backend/storage/lmgr/proc.c	3 Apr 2007 09:08:05 -0000
***************
*** 259,264 ****
--- 259,265 ----
  	/* databaseId and roleId will be filled in later */
  	MyProc->databaseId = InvalidOid;
  	MyProc->roleId = InvalidOid;
+ 	MyProc->inCommit = false;
  	MyProc->inVacuum = false;
  	MyProc->isAutovacuum = IsAutoVacuumWorkerProcess();
  	MyProc->lwWaiting = false;
***************
*** 392,397 ****
--- 393,399 ----
  	MyProc->xmin = InvalidTransactionId;
  	MyProc->databaseId = InvalidOid;
  	MyProc->roleId = InvalidOid;
+ 	MyProc->inCommit = false;
  	MyProc->inVacuum = false;
  	MyProc->isAutovacuum = IsAutoVacuumLauncherProcess(); /* is this needed? */
  	MyProc->lwWaiting = false;
Index: src/include/storage/lwlock.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/lwlock.h,v
retrieving revision 1.34
diff -c -r1.34 lwlock.h
*** src/include/storage/lwlock.h	15 Feb 2007 23:23:23 -0000	1.34
--- src/include/storage/lwlock.h	3 Apr 2007 12:34:15 -0000
***************
*** 49,55 ****
  	WALWriteLock,
  	ControlFileLock,
  	CheckpointLock,
- 	CheckpointStartLock,
  	CLogControlLock,
  	SubtransControlLock,
  	MultiXactGenLock,
--- 49,54 ----
Index: src/include/storage/proc.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/proc.h,v
retrieving revision 1.96
diff -c -r1.96 proc.h
*** src/include/storage/proc.h	7 Mar 2007 13:35:03 -0000	1.96
--- src/include/storage/proc.h	3 Apr 2007 08:56:57 -0000
***************
*** 74,79 ****
--- 74,80 ----
  	Oid			databaseId;		/* OID of database this backend is using */
  	Oid			roleId;			/* OID of role using this backend */
  
+ 	bool		inCommit;		/* true if commit is in progress */
  	bool		inVacuum;		/* true if current xact is a LAZY VACUUM */
  	bool		isAutovacuum;	/* true if it's autovacuum */
  
Index: src/include/storage/procarray.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/procarray.h,v
retrieving revision 1.12
diff -c -r1.12 procarray.h
*** src/include/storage/procarray.h	16 Jan 2007 13:28:57 -0000	1.12
--- src/include/storage/procarray.h	3 Apr 2007 11:06:11 -0000
***************
*** 25,30 ****
--- 25,32 ----
  extern bool TransactionIdIsInProgress(TransactionId xid);
  extern bool TransactionIdIsActive(TransactionId xid);
  extern TransactionId GetOldestXmin(bool allDbs, bool ignoreVacuum);
+ extern TransactionId *GetInCommitTransactions(int *nxids_ptr);
+ extern void WaitForCommitsToFinish(TransactionId *xids, int nxids);
  
  extern PGPROC *BackendPidGetProc(int pid);
  extern int	BackendXidGetPid(TransactionId xid);
