*** src/backend/storage/smgr/md.c.orig	Wed Jan 17 11:25:01 2007
--- src/backend/storage/smgr/md.c	Tue Apr 10 16:38:27 2007
***************
*** 122,135 ****
  	BlockNumber segno;			/* which segment */
  } PendingOperationTag;
  
  typedef struct
  {
  	PendingOperationTag tag;	/* hash table key (must be first!) */
! 	int			failures;		/* number of failed attempts to fsync */
  } PendingOperationEntry;
  
  static HTAB *pendingOpsTable = NULL;
  
  
  typedef enum					/* behavior for mdopen & _mdfd_getseg */
  {
--- 122,140 ----
  	BlockNumber segno;			/* which segment */
  } PendingOperationTag;
  
+ typedef uint16 CycleCtr;		/* can be any convenient integer size */
+ 
  typedef struct
  {
  	PendingOperationTag tag;	/* hash table key (must be first!) */
! 	bool		canceled;		/* T => request canceled, not yet removed */
! 	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */
  } PendingOperationEntry;
  
  static HTAB *pendingOpsTable = NULL;
  
+ static CycleCtr mdsync_cycle_ctr = 0;
+ 
  
  typedef enum					/* behavior for mdopen & _mdfd_getseg */
  {
***************
*** 856,926 ****
  
  /*
   *	mdsync() -- Sync previous writes to stable storage.
-  *
-  * This is only called during checkpoints, and checkpoints should only
-  * occur in processes that have created a pendingOpsTable.
   */
  void
  mdsync(void)
  {
! 	bool		need_retry;
  
  	if (!pendingOpsTable)
  		elog(ERROR, "cannot sync without a pendingOpsTable");
  
  	/*
! 	 * The fsync table could contain requests to fsync relations that have
! 	 * been deleted (unlinked) by the time we get to them.  Rather than
! 	 * just hoping an ENOENT (or EACCES on Windows) error can be ignored,
! 	 * what we will do is retry the whole process after absorbing fsync
! 	 * request messages again.  Since mdunlink() queues a "revoke" message
! 	 * before actually unlinking, the fsync request is guaranteed to be gone
! 	 * the second time if it really was this case.  DROP DATABASE likewise
! 	 * has to tell us to forget fsync requests before it starts deletions.
  	 */
! 	do {
! 		HASH_SEQ_STATUS hstat;
! 		PendingOperationEntry *entry;
! 		int			absorb_counter;
  
! 		need_retry = false;
  
  		/*
! 		 * If we are in the bgwriter, the sync had better include all fsync
! 		 * requests that were queued by backends before the checkpoint REDO
! 		 * point was determined. We go that a little better by accepting all
! 		 * requests queued up to the point where we start fsync'ing.
  		 */
! 		AbsorbFsyncRequests();
  
! 		absorb_counter = FSYNCS_PER_ABSORB;
! 		hash_seq_init(&hstat, pendingOpsTable);
! 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
  		{
  			/*
! 			 * If fsync is off then we don't have to bother opening the file
! 			 * at all.  (We delay checking until this point so that changing
! 			 * fsync on the fly behaves sensibly.)
  			 */
! 			if (enableFsync)
  			{
  				SMgrRelation reln;
  				MdfdVec    *seg;
  
  				/*
- 				 * If in bgwriter, we want to absorb pending requests every so
- 				 * often to prevent overflow of the fsync request queue.  This
- 				 * could result in deleting the current entry out from under
- 				 * our hashtable scan, so the procedure is to fall out of the
- 				 * scan and start over from the top of the function.
- 				 */
- 				if (--absorb_counter <= 0)
- 				{
- 					need_retry = true;
- 					break;
- 				}
- 
- 				/*
  				 * Find or create an smgr hash entry for this relation. This
  				 * may seem a bit unclean -- md calling smgr?  But it's really
  				 * the best solution.  It ensures that the open file reference
--- 861,985 ----
  
  /*
   *	mdsync() -- Sync previous writes to stable storage.
   */
  void
  mdsync(void)
  {
! 	static bool mdsync_in_progress = false;
! 
! 	HASH_SEQ_STATUS hstat;
! 	PendingOperationEntry *entry;
! 	int			absorb_counter;
  
+ 	/*
+ 	 * This is only called during checkpoints, and checkpoints should only
+ 	 * occur in processes that have created a pendingOpsTable.
+ 	 */
  	if (!pendingOpsTable)
  		elog(ERROR, "cannot sync without a pendingOpsTable");
  
  	/*
! 	 * If we are in the bgwriter, the sync had better include all fsync
! 	 * requests that were queued by backends before the checkpoint REDO
! 	 * point was determined.  We go that a little better by accepting all
! 	 * requests queued up to the point where we start fsync'ing.
  	 */
! 	AbsorbFsyncRequests();
  
! 	/*
! 	 * To avoid excess fsync'ing, we want to ignore fsync requests that are
! 	 * entered into the hashtable during the main mdsync loop (they should
! 	 * be processed next time, instead).  We use mdsync_cycle_ctr to tell
! 	 * old entries apart from new ones: new ones will have cycle_ctr equal
! 	 * to the incremented value of mdsync_cycle_ctr.
! 	 *
! 	 * In normal circumstances, all entries present in the table at this
! 	 * point will have cycle_ctr exactly equal to the current (about to be old)
! 	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
! 	 * fsync'ing loop, then older values of cycle_ctr might remain when we
! 	 * come back here to try again.  Repeated checkpoint failures would
! 	 * eventually wrap the counter around to the point where an old entry
! 	 * might appear new, causing us to skip it, possibly allowing a checkpoint
! 	 * to succeed that should not have.  To forestall wraparound, any time
! 	 * the previous mdsync() failed to complete, run through the table and
! 	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
! 	 *
! 	 * Think not to merge this loop with the main loop, as the problem is
! 	 * exactly that that loop may fail before having visited all the entries.
! 	 * From a performance point of view it doesn't matter anyway, as this
! 	 * path will never be taken in a system that's functioning normally.
! 	 */
! 	if (mdsync_in_progress)
! 	{
! 		/* prior try failed, so update any stale cycle_ctr values */
! 		hash_seq_init(&hstat, pendingOpsTable);
! 		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
! 		{
! 			entry->cycle_ctr = mdsync_cycle_ctr;
! 		}
! 	}
  
+ 	/* Advance counter so that new hashtable entries are distinguishable */
+ 	mdsync_cycle_ctr++;
+ 
+ 	/* Set flag to detect failure if we don't reach the end of the loop */
+ 	mdsync_in_progress = true;
+ 
+ 	/* Now scan the hashtable for fsync requests to process */
+ 	absorb_counter = FSYNCS_PER_ABSORB;
+ 	hash_seq_init(&hstat, pendingOpsTable);
+ 	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+ 	{
  		/*
! 		 * If the entry is new then don't process it this time.  Note that
! 		 * "continue" bypasses the hash-remove call at the bottom of the loop.
  		 */
! 		if (entry->cycle_ctr == mdsync_cycle_ctr)
! 			continue;
  
! 		/* Else assert we haven't missed it */
! 		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
! 
! 		/*
! 		 * If fsync is off then we don't have to bother opening the file
! 		 * at all.  (We delay checking until this point so that changing
! 		 * fsync on the fly behaves sensibly.)  Also, if the entry is
! 		 * marked canceled, fall through to delete it.
! 		 */
! 		if (enableFsync && !entry->canceled)
  		{
+ 			int			failures;
+ 
  			/*
! 			 * If in bgwriter, we want to absorb pending requests every so
! 			 * often to prevent overflow of the fsync request queue.  It is
! 			 * unspecified whether newly-added entries will be visited by
! 			 * hash_seq_search, but we don't care since we don't need to
! 			 * process them anyway.
  			 */
! 			if (--absorb_counter <= 0)
! 			{
! 				AbsorbFsyncRequests();
! 				absorb_counter = FSYNCS_PER_ABSORB;
! 			}
! 
! 			/*
! 			 * The fsync table could contain requests to fsync segments that
! 			 * have been deleted (unlinked) by the time we get to them.
! 			 * Rather than just hoping an ENOENT (or EACCES on Windows) error
! 			 * can be ignored, what we do on error is absorb pending requests
! 			 * and then retry.  Since mdunlink() queues a "revoke" message
! 			 * before actually unlinking, the fsync request is guaranteed to
! 			 * be marked canceled after the absorb if it really was this case.
! 			 * DROP DATABASE likewise has to tell us to forget fsync requests
! 			 * before it starts deletions.
! 			 */
! 			for (failures = 0; ; failures++)	/* loop exits at "break" */
  			{
  				SMgrRelation reln;
  				MdfdVec    *seg;
  
  				/*
  				 * Find or create an smgr hash entry for this relation. This
  				 * may seem a bit unclean -- md calling smgr?  But it's really
  				 * the best solution.  It ensures that the open file reference
***************
*** 940,946 ****
  				/*
  				 * It is possible that the relation has been dropped or
  				 * truncated since the fsync request was entered.  Therefore,
! 				 * allow ENOENT, but only if we didn't fail once already on
  				 * this file.  This applies both during _mdfd_getseg() and
  				 * during FileSync, since fd.c might have closed the file
  				 * behind our back.
--- 999,1005 ----
  				/*
  				 * It is possible that the relation has been dropped or
  				 * truncated since the fsync request was entered.  Therefore,
! 				 * allow ENOENT, but only if we didn't fail already on
  				 * this file.  This applies both during _mdfd_getseg() and
  				 * during FileSync, since fd.c might have closed the file
  				 * behind our back.
***************
*** 948,989 ****
  				seg = _mdfd_getseg(reln,
  								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
  								   false, EXTENSION_RETURN_NULL);
! 				if (seg == NULL ||
! 					FileSync(seg->mdfd_vfd) < 0)
! 				{
! 					/*
! 					 * XXX is there any point in allowing more than one try?
! 					 * Don't see one at the moment, but easy to change the
! 					 * test here if so.
! 					 */
! 					if (!FILE_POSSIBLY_DELETED(errno) ||
! 						++(entry->failures) > 1)
! 						ereport(ERROR,
! 								(errcode_for_file_access(),
! 								 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
! 										entry->tag.segno,
! 										entry->tag.rnode.spcNode,
! 										entry->tag.rnode.dbNode,
! 										entry->tag.rnode.relNode)));
! 					else
! 						ereport(DEBUG1,
! 								(errcode_for_file_access(),
! 								 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
! 										entry->tag.segno,
! 										entry->tag.rnode.spcNode,
! 										entry->tag.rnode.dbNode,
! 										entry->tag.rnode.relNode)));
! 					need_retry = true;
! 					continue;	/* don't delete the hashtable entry */
! 				}
! 			}
  
! 			/* Okay, delete this entry */
! 			if (hash_search(pendingOpsTable, &entry->tag,
! 							HASH_REMOVE, NULL) == NULL)
! 				elog(ERROR, "pendingOpsTable corrupted");
  		}
! 	} while (need_retry);
  }
  
  /*
--- 1007,1062 ----
  				seg = _mdfd_getseg(reln,
  								   entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
  								   false, EXTENSION_RETURN_NULL);
! 				if (seg != NULL &&
! 					FileSync(seg->mdfd_vfd) >= 0)
! 					break;		/* success; break out of retry loop */
  
! 				/*
! 				 * XXX is there any point in allowing more than one retry?
! 				 * Don't see one at the moment, but easy to change the
! 				 * test here if so.
! 				 */
! 				if (!FILE_POSSIBLY_DELETED(errno) ||
! 					failures > 0)
! 					ereport(ERROR,
! 							(errcode_for_file_access(),
! 							 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
! 									entry->tag.segno,
! 									entry->tag.rnode.spcNode,
! 									entry->tag.rnode.dbNode,
! 									entry->tag.rnode.relNode)));
! 				else
! 					ereport(DEBUG1,
! 							(errcode_for_file_access(),
! 							 errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
! 									entry->tag.segno,
! 									entry->tag.rnode.spcNode,
! 									entry->tag.rnode.dbNode,
! 									entry->tag.rnode.relNode)));
! 
! 				/*
! 				 * Absorb incoming requests and check to see if canceled.
! 				 */
! 				AbsorbFsyncRequests();
! 				absorb_counter = FSYNCS_PER_ABSORB;	/* might as well... */
! 
! 				if (entry->canceled)
! 					break;
! 			}	/* end retry loop */
  		}
! 
! 		/*
! 		 * If we get here, either we fsync'd successfully, we don't have
! 		 * to because enableFsync is off, or the entry is (now) marked
! 		 * canceled.  Okay to delete it.
! 		 */
! 		if (hash_search(pendingOpsTable, &entry->tag,
! 						HASH_REMOVE, NULL) == NULL)
! 			elog(ERROR, "pendingOpsTable corrupted");
! 	}	/* end loop over hashtable entries */
! 
! 	/* Flag successful completion of mdsync */
! 	mdsync_in_progress = false;
  }
  
  /*
***************
*** 1027,1034 ****
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
!  * We define two: FORGET_RELATION_FSYNC means to drop pending fsyncs for
!  * a relation, and FORGET_DATABASE_FSYNC means to drop pending fsyncs for
   * a whole database.  (These are a tad slow because the hash table has to be
   * searched linearly, but it doesn't seem worth rethinking the table structure
   * for them.)
--- 1100,1107 ----
   *
   * The range of possible segment numbers is way less than the range of
   * BlockNumber, so we can reserve high values of segno for special purposes.
!  * We define two: FORGET_RELATION_FSYNC means to cancel pending fsyncs for
!  * a relation, and FORGET_DATABASE_FSYNC means to cancel pending fsyncs for
   * a whole database.  (These are a tad slow because the hash table has to be
   * searched linearly, but it doesn't seem worth rethinking the table structure
   * for them.)
***************
*** 1049,1058 ****
  		{
  			if (RelFileNodeEquals(entry->tag.rnode, rnode))
  			{
! 				/* Okay, delete this entry */
! 				if (hash_search(pendingOpsTable, &entry->tag,
! 								HASH_REMOVE, NULL) == NULL)
! 					elog(ERROR, "pendingOpsTable corrupted");
  			}
  		}
  	}
--- 1122,1129 ----
  		{
  			if (RelFileNodeEquals(entry->tag.rnode, rnode))
  			{
! 				/* Okay, cancel this entry */
! 				entry->canceled = true;
  			}
  		}
  	}
***************
*** 1067,1076 ****
  		{
  			if (entry->tag.rnode.dbNode == rnode.dbNode)
  			{
! 				/* Okay, delete this entry */
! 				if (hash_search(pendingOpsTable, &entry->tag,
! 								HASH_REMOVE, NULL) == NULL)
! 					elog(ERROR, "pendingOpsTable corrupted");
  			}
  		}
  	}
--- 1138,1145 ----
  		{
  			if (entry->tag.rnode.dbNode == rnode.dbNode)
  			{
! 				/* Okay, cancel this entry */
! 				entry->canceled = true;
  			}
  		}
  	}
***************
*** 1091,1097 ****
  													  HASH_ENTER,
  													  &found);
  		if (!found)				/* new entry, so initialize it */
! 			entry->failures = 0;
  	}
  }
  
--- 1160,1175 ----
  													  HASH_ENTER,
  													  &found);
  		if (!found)				/* new entry, so initialize it */
! 		{
! 			entry->canceled = false;
! 			entry->cycle_ctr = mdsync_cycle_ctr;
! 		}
! 		/*
! 		 * NB: it's intentional that we don't change cycle_ctr if the entry
! 		 * already exists.  The fsync request must be treated as old, even
! 		 * though the new request will be satisfied too by our subsequent
! 		 * fsync.
! 		 */
  	}
  }