Index: src/backend/storage/buffer/bufmgr.c =================================================================== RCS file: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v retrieving revision 1.228 diff -c -r1.228 bufmgr.c *** src/backend/storage/buffer/bufmgr.c 1 Jan 2008 19:45:51 -0000 1.228 --- src/backend/storage/buffer/bufmgr.c 4 May 2008 01:11:08 -0000 *************** *** 56,61 **** --- 56,68 ---- #define BUF_WRITTEN 0x01 #define BUF_REUSABLE 0x02 + /* Struct for BufferSync's internal to-do list */ + typedef struct BufAndTag + { + int buf_id; + BufferTag tag; + } BufAndTag; + /* GUC variables */ bool zero_damaged_pages = false; *************** *** 986,991 **** --- 993,1025 ---- } /* + * qsort comparator for BufferSync + */ + static int + bufandtagcmp(const void *a, const void *b) + { + const BufAndTag *lhs = (const BufAndTag *) a; + const BufAndTag *rhs = (const BufAndTag *) b; + int r; + + /* + * We don't much care about the order in which different relations get + * written, so memcmp is enough for comparing the relfilenodes, + * even though its behavior will be platform-dependent. + */ + r = memcmp(&lhs->tag.rnode, &rhs->tag.rnode, sizeof(lhs->tag.rnode)); + if (r != 0) + return r; + + /* We do want blocks within a relation to be ordered accurately */ + if (lhs->tag.blockNum < rhs->tag.blockNum) + return -1; + if (lhs->tag.blockNum > rhs->tag.blockNum) + return 1; + return 0; + } + + /* * BufferSync -- Write out all dirty buffers in the pool. * * This is called at checkpoint time to write out all dirty shared buffers. 
*************** *** 995,1004 **** static void BufferSync(int flags) { int buf_id; - int num_to_scan; int num_to_write; int num_written; /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); --- 1029,1056 ---- static void BufferSync(int flags) { + static BufAndTag *bufs_to_write = NULL; int buf_id; int num_to_write; int num_written; + int i; + + /* + * We allocate the bufs_to_write[] array on first call and keep it + * around for the life of the process. This is okay because in normal + * operation this function is only called within the bgwriter, so + * we won't have lots of large arrays floating around. We prefer this + * way because we don't want checkpoints to suddenly start failing + * when the system gets under memory pressure. + */ + if (bufs_to_write == NULL) + { + bufs_to_write = (BufAndTag *) malloc(NBuffers * sizeof(BufAndTag)); + if (bufs_to_write == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); *************** *** 1033,1038 **** --- 1085,1092 ---- if (bufHdr->flags & BM_DIRTY) { bufHdr->flags |= BM_CHECKPOINT_NEEDED; + bufs_to_write[num_to_write].buf_id = buf_id; + bufs_to_write[num_to_write].tag = bufHdr->tag; num_to_write++; } *************** *** 1043,1061 **** return; /* nothing to do */ /* ! * Loop over all buffers again, and write the ones (still) marked with ! * BM_CHECKPOINT_NEEDED. In this loop, we start at the clock sweep point ! * since we might as well dump soon-to-be-recycled buffers first. ! * ! * Note that we don't read the buffer alloc count here --- that should be ! * left untouched till the next BgBufferSync() call. */ - buf_id = StrategySyncStart(NULL, NULL); - num_to_scan = NBuffers; num_written = 0; ! while (num_to_scan-- > 0) { ! 
volatile BufferDesc *bufHdr = &BufferDescriptors[buf_id]; /* * We don't need to acquire the lock here, because we're only looking --- 1097,1120 ---- return; /* nothing to do */ /* ! * Sort the buffers-to-be-written into order by file and block number. ! * This improves sequentiality of access for the upcoming I/O. ! */ ! qsort(bufs_to_write, num_to_write, sizeof(BufAndTag), bufandtagcmp); ! ! /* ! * Loop over all buffers to be written, and write the ones (still) marked ! * with BM_CHECKPOINT_NEEDED. Note that we don't need to recheck the ! * buffer tag, because if the buffer has been reassigned it cannot have ! * BM_CHECKPOINT_NEEDED still set. */ num_written = 0; ! for (i = 0; i < num_to_write; i++) { ! volatile BufferDesc *bufHdr; ! ! buf_id = bufs_to_write[i].buf_id; ! bufHdr = &BufferDescriptors[buf_id]; /* * We don't need to acquire the lock here, because we're only looking *************** *** 1077,1096 **** num_written++; /* - * We know there are at most num_to_write buffers with - * BM_CHECKPOINT_NEEDED set; so we can stop scanning if - * num_written reaches num_to_write. - * - * Note that num_written doesn't include buffers written by - * other backends, or by the bgwriter cleaning scan. That - * means that the estimate of how much progress we've made is - * conservative, and also that this test will often fail to - * trigger. But it seems worth making anyway. - */ - if (num_written >= num_to_write) - break; - - /* * Perform normal bgwriter duties and sleep to throttle our * I/O rate. */ --- 1136,1141 ---- *************** *** 1098,1110 **** (double) num_written / num_to_write); } } - - if (++buf_id >= NBuffers) - buf_id = 0; } /* ! * Update checkpoint statistics. As noted above, this doesn't include * buffers written by other backends or bgwriter scan. */ CheckpointStats.ckpt_bufs_written += num_written; --- 1143,1152 ---- (double) num_written / num_to_write); } } } /* ! * Update checkpoint statistics. 
The num_written count doesn't include * buffers written by other backends or by the bgwriter cleaning scan.