Index: src/backend/access/heap/heapam.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/heap/heapam.c,v
retrieving revision 1.232
diff -c -r1.232 heapam.c
*** src/backend/access/heap/heapam.c	8 Apr 2007 01:26:27 -0000	1.232
--- src/backend/access/heap/heapam.c	16 May 2007 11:35:14 -0000
***************
*** 83,88 ****
--- 83,96 ----
  	 */
  	scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_rd);
  
+ 	/*
+ 	 * A scan on a table smaller than shared_buffers is treated like random
+ 	 * access, but bigger scans should use the bulk read replacement policy.
+ 	 */
+ 	if (scan->rs_nblocks > NBuffers)
+ 		scan->rs_accesspattern = AP_BULKREAD;
+ 	else
+ 		scan->rs_accesspattern = AP_NORMAL;
+ 
  	scan->rs_inited = false;
  	scan->rs_ctup.t_data = NULL;
  	ItemPointerSetInvalid(&scan->rs_ctup.t_self);
***************
*** 123,133 ****
--- 131,146 ----
  
  	Assert(page < scan->rs_nblocks);
  
+ 	/* Read the page with the right strategy */
+ 	SetAccessPattern(scan->rs_accesspattern);
+ 
  	scan->rs_cbuf = ReleaseAndReadBuffer(scan->rs_cbuf,
  										 scan->rs_rd,
  										 page);
  	scan->rs_cblock = page;
  
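+ 	/*
+ 	 * Switch back to the normal strategy right away, so that any buffer
+ 	 * accesses made between pages (e.g. index fetches in a nested loop)
+ 	 * use the normal strategy. Flipping back and forth is cheap and does
+ 	 * not invalidate the ring; see SetAccessPattern in freelist.c.
+ 	 */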
+ 	SetAccessPattern(AP_NORMAL);
+ 
  	if (!scan->rs_pageatatime)
  		return;
  
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.268
diff -c -r1.268 xlog.c
*** src/backend/access/transam/xlog.c	30 Apr 2007 21:01:52 -0000	1.268
--- src/backend/access/transam/xlog.c	15 May 2007 16:23:30 -0000
***************
*** 1668,1673 ****
--- 1668,1700 ----
  }
  
  /*
+  * Returns true if 'record' hasn't been flushed to disk yet.
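+  *
+  * We first check the backend-local copy of LogwrtResult; since the flush
+  * pointer only ever advances, a stale copy can give a false "needs flush"
+  * but never a false "already flushed". Hence we refresh from shared memory
+  * and check a second time before answering true.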
+  */
+ bool
+ XLogNeedsFlush(XLogRecPtr record)
+ {
+ 	/* Quick exit if already known flushed */
+ 	if (XLByteLE(record, LogwrtResult.Flush))
+ 		return false;
+ 
+ 	/* read LogwrtResult and update local state */
+ 	{
+ 		/* use volatile pointer to prevent code rearrangement */
+ 		volatile XLogCtlData *xlogctl = XLogCtl;
+ 
+ 		SpinLockAcquire(&xlogctl->info_lck);
+ 		LogwrtResult = xlogctl->LogwrtResult;
+ 		SpinLockRelease(&xlogctl->info_lck);
+ 	}
+ 
+ 	/* check again */
+ 	if (XLByteLE(record, LogwrtResult.Flush))
+ 		return false;
+ 
+ 	return true;
+ }
+ 
+ /*
   * Ensure that all XLOG data through the given position is flushed to disk.
   *
   * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
Index: src/backend/commands/copy.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/copy.c,v
retrieving revision 1.283
diff -c -r1.283 copy.c
*** src/backend/commands/copy.c	27 Apr 2007 22:05:46 -0000	1.283
--- src/backend/commands/copy.c	15 May 2007 17:05:29 -0000
***************
*** 1876,1881 ****
--- 1876,1888 ----
  	nfields = file_has_oids ? (attr_count + 1) : attr_count;
  	field_strings = (char **) palloc(nfields * sizeof(char *));
  
+ 	/*
+ 	 * Use the special COPY buffer replacement strategy if WAL-logging is
+ 	 * enabled. If it's not, the pages we're writing are dirty but don't
+ 	 * need a WAL flush to be written out, so the BULKREAD strategy is
+ 	 * more suitable.
+ 	 */
+ 	SetAccessPattern(use_wal ? AP_COPY : AP_BULKREAD);
+ 
  	/* Initialize state variables */
  	cstate->fe_eof = false;
  	cstate->eol_type = EOL_UNKNOWN;
***************
*** 2161,2166 ****
--- 2168,2176 ----
  							cstate->filename)));
  	}
  
+ 	/* Reset buffer replacement strategy */
+ 	SetAccessPattern(AP_NORMAL);
+ 
  	/* 
  	 * If we skipped writing WAL, then we need to sync the heap (but not
  	 * indexes since those use WAL anyway)
Index: src/backend/commands/vacuum.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/commands/vacuum.c,v
retrieving revision 1.350
diff -c -r1.350 vacuum.c
*** src/backend/commands/vacuum.c	16 Apr 2007 18:29:50 -0000	1.350
--- src/backend/commands/vacuum.c	15 May 2007 17:06:18 -0000
***************
*** 421,431 ****
  				 * Tell the buffer replacement strategy that vacuum is causing
  				 * the IO
  				 */
! 				StrategyHintVacuum(true);
  
  				analyze_rel(relid, vacstmt);
  
! 				StrategyHintVacuum(false);
  
  				if (use_own_xacts)
  					CommitTransactionCommand();
--- 421,431 ----
  				 * Tell the buffer replacement strategy that vacuum is causing
  				 * the IO
  				 */
! 				SetAccessPattern(AP_VACUUM);
  
  				analyze_rel(relid, vacstmt);
  
! 				SetAccessPattern(AP_NORMAL);
  
  				if (use_own_xacts)
  					CommitTransactionCommand();
***************
*** 442,448 ****
  		/* Make sure cost accounting is turned off after error */
  		VacuumCostActive = false;
  		/* And reset buffer replacement strategy, too */
! 		StrategyHintVacuum(false);
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
--- 442,448 ----
  		/* Make sure cost accounting is turned off after error */
  		VacuumCostActive = false;
  		/* And reset buffer replacement strategy, too */
! 		SetAccessPattern(AP_NORMAL);
  		PG_RE_THROW();
  	}
  	PG_END_TRY();
***************
*** 1088,1094 ****
  	 * Tell the cache replacement strategy that vacuum is causing all
  	 * following IO
  	 */
! 	StrategyHintVacuum(true);
  
  	/*
  	 * Do the actual work --- either FULL or "lazy" vacuum
--- 1088,1094 ----
  	 * Tell the cache replacement strategy that vacuum is causing all
  	 * following IO
  	 */
! 	SetAccessPattern(AP_VACUUM);
  
  	/*
  	 * Do the actual work --- either FULL or "lazy" vacuum
***************
*** 1098,1104 ****
  	else
  		lazy_vacuum_rel(onerel, vacstmt);
  
! 	StrategyHintVacuum(false);
  
  	/* all done with this class, but hold lock until commit */
  	relation_close(onerel, NoLock);
--- 1098,1104 ----
  	else
  		lazy_vacuum_rel(onerel, vacstmt);
  
! 	SetAccessPattern(AP_NORMAL);
  
  	/* all done with this class, but hold lock until commit */
  	relation_close(onerel, NoLock);
Index: src/backend/storage/buffer/README
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/README,v
retrieving revision 1.11
diff -c -r1.11 README
*** src/backend/storage/buffer/README	23 Jul 2006 03:07:58 -0000	1.11
--- src/backend/storage/buffer/README	16 May 2007 11:43:11 -0000
***************
*** 152,159 ****
  a field to show which backend is doing its I/O).
  
  
! Buffer replacement strategy
! ---------------------------
  
  There is a "free list" of buffers that are prime candidates for replacement.
  In particular, buffers that are completely free (contain no valid page) are
--- 152,159 ----
  a field to show which backend is doing its I/O).
  
  
! Normal buffer replacement strategy
! ----------------------------------
  
  There is a "free list" of buffers that are prime candidates for replacement.
  In particular, buffers that are completely free (contain no valid page) are
***************
*** 199,221 ****
  have to give up and try another buffer.  This however is not a concern
  of the basic select-a-victim-buffer algorithm.)
  
- A special provision is that while running VACUUM, a backend does not
- increment the usage count on buffers it accesses.  In fact, if ReleaseBuffer
- sees that it is dropping the pin count to zero and the usage count is zero,
- then it appends the buffer to the tail of the free list.  (This implies that
- VACUUM, but only VACUUM, must take the BufFreelistLock during ReleaseBuffer;
- this shouldn't create much of a contention problem.)  This provision
- encourages VACUUM to work in a relatively small number of buffers rather
- than blowing out the entire buffer cache.  It is reasonable since a page
- that has been touched only by VACUUM is unlikely to be needed again soon.
- 
- Since VACUUM usually requests many pages very fast, the effect of this is that
- it will get back the very buffers it filled and possibly modified on the next
- call and will therefore do its work in a few shared memory buffers, while
- being able to use whatever it finds in the cache already.  This also implies
- that most of the write traffic caused by a VACUUM will be done by the VACUUM
- itself and not pushed off onto other processes.
  
  
  Background writer's processing
  ------------------------------
--- 199,243 ----
  have to give up and try another buffer.  This however is not a concern
  of the basic select-a-victim-buffer algorithm.)
  
  
+ Buffer ring replacement strategy
+ --------------------------------
+ 
+ When running a statement that needs to access a large number of pages, like
+ VACUUM, COPY, or a large sequential scan, a different strategy is used.  A
+ page that has been touched only by such a scan is unlikely to be needed again
+ soon, so instead of running the normal clock sweep algorithm and blowing out
+ the entire buffer cache, a small ring of buffers is allocated using the normal
+ clock sweep algorithm and those buffers are reused for the whole scan.  This
+ also implies that most of the write traffic caused by such a statement will be
+ done by the backend itself and not pushed off onto other processes.
+ 
+ The size of the ring used depends on the kind of scan:
+ 
+ For sequential scans, a small 256 KB ring is used. That's small enough to fit
+ in the L2 cache, which makes transferring pages from OS cache to shared buffer
+ cache efficient. Even less would often be enough, but the ring must be big
+ enough to accommodate all pages in the scan that are pinned concurrently.
+ 256 KB should also be enough to leave a small cache trail for other backends
+ to join a synchronized seq scan. If a buffer is dirtied and its LSN set, the
+ buffer is removed from the ring and a replacement buffer is chosen using the
+ normal replacement strategy. In a scan that modifies every page it visits,
+ like a bulk UPDATE or DELETE, the buffers in the ring will always be dirtied
+ and the ring strategy effectively degrades to the normal strategy.
+ 
+ VACUUM uses a 256 KB ring like sequential scans, but dirty pages are not
+ removed from the ring. Instead, the WAL is flushed as needed to allow the
+ buffers to be reused. Before the buffer ring strategy was introduced in 8.3,
+ VACUUM put the buffers back on the freelist, which was effectively a buffer
+ ring of one buffer.
+ 
+ COPY behaves like VACUUM, but a much larger ring is used, sized at twice the
+ WAL segment size. A ring avoids polluting the buffer cache the way the plain
+ clock sweep would, and making the ring larger than a WAL segment avoids any
+ extra WAL flushes: a WAL segment is always filled, forcing a WAL flush anyway,
+ before the scan loops back around the ring and bumps into a buffer whose
+ eviction would force a WAL flush. However, for non-WAL-logged COPY operations
+ the smaller 256 KB ring is used, because no WAL flushes are needed to write
+ out the buffers.
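+ 
+ As a worked example, assuming the default 8 KB BLCKSZ and 16 MB WAL segments:
+ the 256 KB ring holds 256 / 8 = 32 buffers, while the COPY ring holds
+ 2 * 16 MB / 8 KB = 4096 buffers, or 32 MB (the code additionally caps the
+ COPY ring at NBuffers / 8).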
  
  Background writer's processing
  ------------------------------
Index: src/backend/storage/buffer/bufmgr.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.218
diff -c -r1.218 bufmgr.c
*** src/backend/storage/buffer/bufmgr.c	2 May 2007 23:34:48 -0000	1.218
--- src/backend/storage/buffer/bufmgr.c	16 May 2007 12:34:10 -0000
***************
*** 419,431 ****
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
  		/*
  		 * Select a victim buffer.	The buffer is returned with its header
! 		 * spinlock still held!  Also the BufFreelistLock is still held, since
! 		 * it would be bad to hold the spinlock while possibly waking up other
! 		 * processes.
  		 */
! 		buf = StrategyGetBuffer();
  
  		Assert(buf->refcount == 0);
  
--- 419,433 ----
  	/* Loop here in case we have to try another victim buffer */
  	for (;;)
  	{
+ 		bool lock_held;
+ 
  		/*
  		 * Select a victim buffer.	The buffer is returned with its header
! 		 * spinlock still held!  If *lock_held is set on return, the
! 		 * BufFreelistLock is also still held; it would be bad to hold the
! 		 * spinlock while possibly waking up other processes.
  		 */
! 		buf = StrategyGetBuffer(&lock_held);
  
  		Assert(buf->refcount == 0);
  
***************
*** 436,442 ****
  		PinBuffer_Locked(buf);
  
  		/* Now it's safe to release the freelist lock */
! 		LWLockRelease(BufFreelistLock);
  
  		/*
  		 * If the buffer was dirty, try to write it out.  There is a race
--- 438,445 ----
  		PinBuffer_Locked(buf);
  
  		/* Now it's safe to release the freelist lock */
! 		if (lock_held)
! 			LWLockRelease(BufFreelistLock);
  
  		/*
  		 * If the buffer was dirty, try to write it out.  There is a race
***************
*** 464,469 ****
--- 467,489 ----
  			 */
  			if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
  			{
+ 				/*
+ 				 * In BULKREAD mode, check if a WAL flush would be needed to
+ 				 * evict this buffer. If so, ask the replacement strategy if
+ 				 * we should go ahead and do it or choose another victim.
+ 				 */
+ 				if (active_access_pattern == AP_BULKREAD)
+ 				{
+ 					if (XLogNeedsFlush(BufferGetLSN(buf)))
+ 					{
+ 						if (StrategyRejectBuffer(buf))
+ 						{
+ 							LWLockRelease(buf->content_lock);
+ 							UnpinBuffer(buf, true, false);
+ 							continue;
+ 						}
+ 					}
+ 				}
+ 
  				FlushBuffer(buf, NULL);
  				LWLockRelease(buf->content_lock);
  			}
***************
*** 925,932 ****
  	PrivateRefCount[b]--;
  	if (PrivateRefCount[b] == 0)
  	{
- 		bool		immed_free_buffer = false;
- 
  		/* I'd better not still hold any locks on the buffer */
  		Assert(!LWLockHeldByMe(buf->content_lock));
  		Assert(!LWLockHeldByMe(buf->io_in_progress_lock));
--- 945,950 ----
***************
*** 940,956 ****
  		/* Update buffer usage info, unless this is an internal access */
  		if (normalAccess)
  		{
! 			if (!strategy_hint_vacuum)
  			{
! 				if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 					buf->usage_count++;
  			}
  			else
! 			{
! 				/* VACUUM accesses don't bump usage count, instead... */
! 				if (buf->refcount == 0 && buf->usage_count == 0)
! 					immed_free_buffer = true;
! 			}
  		}
  
  		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
--- 958,975 ----
  		/* Update buffer usage info, unless this is an internal access */
  		if (normalAccess)
  		{
! 			if (active_access_pattern != AP_NORMAL)
  			{
! 				/*
! 				 * We don't want large one-off scans like vacuum to inflate
! 				 * the usage_count. We do want to set it to 1, though, to keep
! 				 * other backends from hijacking it from the buffer ring.
! 				 */
! 				if (buf->usage_count == 0)
! 					buf->usage_count = 1;
  			}
  			else
! 			{
! 				if (buf->usage_count < BM_MAX_USAGE_COUNT)
! 					buf->usage_count++;
! 			}
  		}
  
  		if ((buf->flags & BM_PIN_COUNT_WAITER) &&
***************
*** 965,978 ****
  		}
  		else
  			UnlockBufHdr(buf);
- 
- 		/*
- 		 * If VACUUM is releasing an otherwise-unused buffer, send it to the
- 		 * freelist for near-term reuse.  We put it at the tail so that it
- 		 * won't be used before any invalid buffers that may exist.
- 		 */
- 		if (immed_free_buffer)
- 			StrategyFreeBuffer(buf, false);
  	}
  }
  
--- 984,989 ----
Index: src/backend/storage/buffer/freelist.c
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/backend/storage/buffer/freelist.c,v
retrieving revision 1.58
diff -c -r1.58 freelist.c
*** src/backend/storage/buffer/freelist.c	5 Jan 2007 22:19:37 -0000	1.58
--- src/backend/storage/buffer/freelist.c	17 May 2007 16:12:56 -0000
***************
*** 18,23 ****
--- 18,25 ----
  #include "storage/buf_internals.h"
  #include "storage/bufmgr.h"
  
+ #include "utils/memutils.h"
+ 
  
  /*
   * The shared freelist control information.
***************
*** 39,47 ****
  /* Pointers to shared state */
  static BufferStrategyControl *StrategyControl = NULL;
  
! /* Backend-local state about whether currently vacuuming */
! bool		strategy_hint_vacuum = false;
  
  
  /*
   * StrategyGetBuffer
--- 41,53 ----
  /* Pointers to shared state */
  static BufferStrategyControl *StrategyControl = NULL;
  
! /* Currently active access pattern hint. */
! AccessPattern active_access_pattern = AP_NORMAL;
  
+ /* prototypes for internal functions */
+ static volatile BufferDesc *GetBufferFromRing(void);
+ static void PutBufferToRing(volatile BufferDesc *buf);
+ static void InitRing(void);
  
  /*
   * StrategyGetBuffer
***************
*** 51,67 ****
   *	the selected buffer must not currently be pinned by anyone.
   *
   *	To ensure that no one else can pin the buffer before we do, we must
!  *	return the buffer with the buffer header spinlock still held.  That
!  *	means that we return with the BufFreelistLock still held, as well;
!  *	the caller must release that lock once the spinlock is dropped.
   */
  volatile BufferDesc *
! StrategyGetBuffer(void)
  {
  	volatile BufferDesc *buf;
  	int			trycounter;
  
  	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
  
  	/*
  	 * Try to get a buffer from the freelist.  Note that the freeNext fields
--- 57,89 ----
   *	the selected buffer must not currently be pinned by anyone.
   *
   *	To ensure that no one else can pin the buffer before we do, we must
!  *	return the buffer with the buffer header spinlock still held.  If
!  *	*lock_held is set on return, we return with the BufFreelistLock still
!  *	held, as well; the caller must release that lock once the spinlock is
!  *	dropped.
   */
  volatile BufferDesc *
! StrategyGetBuffer(bool *lock_held)
  {
  	volatile BufferDesc *buf;
  	int			trycounter;
  
+ 	/* Get a buffer from the ring if we're doing a bulk scan */
+ 	if (active_access_pattern != AP_NORMAL)
+ 	{
+ 		buf = GetBufferFromRing();
+ 		if (buf != NULL)
+ 		{
+ 			*lock_held = false;
+ 			return buf;
+ 		}
+ 	}
+ 
+ 	/*
+ 	 * The ring was empty or its buffer was unusable, or we're not using a
+ 	 * ring at all; fall back to the shared freelist and clock sweep.
+ 	 */
  	LWLockAcquire(BufFreelistLock, LW_EXCLUSIVE);
+ 	*lock_held = true;
  
  	/*
  	 * Try to get a buffer from the freelist.  Note that the freeNext fields
***************
*** 86,96 ****
  		 */
  		LockBufHdr(buf);
  		if (buf->refcount == 0 && buf->usage_count == 0)
  			return buf;
  		UnlockBufHdr(buf);
  	}
  
! 	/* Nothing on the freelist, so run the "clock sweep" algorithm */
  	trycounter = NBuffers;
  	for (;;)
  	{
--- 108,122 ----
  		 */
  		LockBufHdr(buf);
  		if (buf->refcount == 0 && buf->usage_count == 0)
+ 		{
+ 			if (active_access_pattern != AP_NORMAL)
+ 				PutBufferToRing(buf);
  			return buf;
+ 		}
  		UnlockBufHdr(buf);
  	}
  
! 	/* Nothing on the freelist, so run the shared "clock sweep" algorithm */
  	trycounter = NBuffers;
  	for (;;)
  	{
***************
*** 105,111 ****
--- 131,141 ----
  		 */
  		LockBufHdr(buf);
  		if (buf->refcount == 0 && buf->usage_count == 0)
+ 		{
+ 			if (active_access_pattern != AP_NORMAL)
+ 				PutBufferToRing(buf);
  			return buf;
+ 		}
  		if (buf->usage_count > 0)
  		{
  			buf->usage_count--;
***************
*** 191,204 ****
  }
  
  /*
!  * StrategyHintVacuum -- tell us whether VACUUM is active
   */
  void
! StrategyHintVacuum(bool vacuum_active)
  {
! 	strategy_hint_vacuum = vacuum_active;
! }
  
  
  /*
   * StrategyShmemSize
--- 221,245 ----
  }
  
  /*
!  * SetAccessPattern -- Sets the active access pattern hint
!  *
!  * The caller is responsible for resetting the hint to AP_NORMAL after the
!  * bulk operation is done. It's OK to switch repeatedly between AP_NORMAL and
!  * one of the other strategies; for example, in a query with one large
!  * sequential scan nested-loop joined to an index scan, the index tuples
!  * should be fetched with the normal strategy and the pages from the seq scan
!  * read in with the AP_BULKREAD strategy. The ring isn't affected by such
!  * switching, but switching to an access pattern with a different ring size
!  * invalidates the old ring.
   */
  void
! SetAccessPattern(AccessPattern new_pattern)
  {
! 	active_access_pattern = new_pattern;
  
+ 	if (active_access_pattern != AP_NORMAL)
+ 		InitRing();
+ }
  
  /*
   * StrategyShmemSize
***************
*** 274,276 ****
--- 315,498 ----
  	else
  		Assert(!init);
  }
+ 
+ /* ----------------------------------------------------------------
+  *				Backend-private buffer ring management
+  * ----------------------------------------------------------------
+  */
+ 
+ /*
+  * Ring sizes for the different access patterns. See the README for the
+  * rationale for these sizes.
+  */
+ #define BULKREAD_RING_SIZE	(256 * 1024 / BLCKSZ)
+ #define VACUUM_RING_SIZE	(256 * 1024 / BLCKSZ)
+ #define COPY_RING_SIZE		Min(NBuffers / 8, (XLOG_SEG_SIZE / BLCKSZ) * 2)
+ 
+ /*
+  * BufferRing is an array of buffer ids, and RingSize is its size in number
+  * of elements. It's allocated in TopMemoryContext the first time it's
+  * needed.
+  */
+ static int *BufferRing = NULL;
+ static int RingSize = 0;
+ 
+ /*
+  * Index of the "current" slot in the ring. It's advanced every time a buffer
+  * is handed out from the ring with GetBufferFromRing, and it points to the
+  * last buffer returned from the ring. RingCurSlot + 1 is the next victim
+  * GetBufferFromRing will hand out.
+  */
+ static int RingCurSlot = 0;
+ 
+ /* magic value to mark empty slots in the ring */
+ #define BUF_ID_NOT_SET -1
+ 
+ 
+ /*
+  * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
+  *		current ring slot is empty or its buffer can't be reused.
+  *
+  * The buffer header spinlock is held on the returned buffer.
+  */
+ static volatile BufferDesc *
+ GetBufferFromRing(void)
+ {
+ 	volatile BufferDesc *buf;
+ 
+ 	/* ring should be initialized by now */
+ 	Assert(RingSize > 0 && BufferRing != NULL);
+ 
+ 	/* Run private "clock cycle" */
+ 	if (++RingCurSlot >= RingSize)
+ 		RingCurSlot = 0;
+ 
+ 	/*
+ 	 * If that slot hasn't been filled yet, tell the caller to allocate
+ 	 * a new buffer with the normal allocation strategy. The caller will
+ 	 * then fill this slot by calling PutBufferToRing with the new buffer.
+ 	 */
+ 	if (BufferRing[RingCurSlot] == BUF_ID_NOT_SET)
+ 		return NULL;
+ 
+ 	buf = &BufferDescriptors[BufferRing[RingCurSlot]];
+ 
+ 	/*
+ 	 * If the buffer is pinned we cannot use it under any circumstances.
+ 	 * If usage_count == 0 then the buffer is fair game.
+ 	 *
+ 	 * We also choose this buffer if usage_count == 1. Strictly, this
+ 	 * might sometimes be the wrong thing to do, but we rely on the high
+ 	 * probability that it was this process that last touched the buffer.
+ 	 * If it wasn't, we'll choose a suboptimal victim, but it shouldn't
+ 	 * make any difference in the grand scheme of things.
+ 	 */
+ 	LockBufHdr(buf);
+ 	if (buf->refcount == 0 && buf->usage_count <= 1)
+ 		return buf;
+ 	UnlockBufHdr(buf);
+ 
+ 	return NULL;
+ }
+ 
+ /*
+  * PutBufferToRing -- adds a buffer to the buffer ring
+  *
+  * Caller must hold the buffer header spinlock on the buffer.
+  */
+ static void
+ PutBufferToRing(volatile BufferDesc *buf)
+ {
+ 	/* ring should be initialized by now */
+ 	Assert(RingSize > 0 && BufferRing != NULL);
+ 
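+ 	/*
+ 	 * Only fill the slot if it's empty. If it's still occupied, the buffer
+ 	 * in it was unusable (pinned, or recently touched) when GetBufferFromRing
+ 	 * looked at it; we leave it in place, since it may well be reusable
+ 	 * again by the time the clock comes back around to this slot.
+ 	 */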
+ 	if (BufferRing[RingCurSlot] == BUF_ID_NOT_SET)
+ 		BufferRing[RingCurSlot] = buf->buf_id;
+ }
+ 
+ /*
+  * InitRing -- initializes the buffer ring with the correct size for the
+  * currently active strategy. Does nothing if the ring already has the
+  * right size.
+  */
+ static void
+ InitRing(void)
+ {
+ 	int new_size;
+ 	int old_size = RingSize;
+ 	int i;
+ 	MemoryContext oldcxt;
+ 
+ 	/* Determine new size */
+ 
+ 	switch (active_access_pattern)
+ 	{
+ 		case AP_BULKREAD:
+ 			new_size = BULKREAD_RING_SIZE;
+ 			break;
+ 		case AP_COPY:
+ 			new_size = COPY_RING_SIZE;
+ 			break;
+ 		case AP_VACUUM:
+ 			new_size = VACUUM_RING_SIZE;
+ 			break;
+ 		default:
+ 			elog(ERROR, "unexpected buffer cache strategy: %d",
+ 				 (int) active_access_pattern);
+ 			return;				/* keep compiler quiet */
+ 	}
+ 
+ 	/*
+ 	 * Seq scans set and reset the strategy on every page, so we'd better exit
+ 	 * quickly if no change in size is needed.
+ 	 */
+ 	if (new_size == old_size)
+ 		return;
+ 
+ 	/* Allocate array */
+ 
+ 	oldcxt = MemoryContextSwitchTo(TopMemoryContext);
+ 
+ 	if (old_size == 0)
+ 	{
+ 		Assert(BufferRing == NULL);
+ 		BufferRing = palloc(new_size * sizeof(int));
+ 	}
+ 	else
+ 		BufferRing = repalloc(BufferRing, new_size * sizeof(int));
+ 
+ 	MemoryContextSwitchTo(oldcxt);
+ 
+ 	for (i = 0; i < new_size; i++)
+ 		BufferRing[i] = BUF_ID_NOT_SET;
+ 
+ 	RingCurSlot = 0;
+ 	RingSize = new_size;
+ }
+ 
+ /*
+  * StrategyRejectBuffer -- consider rejecting the given victim buffer
+  *
+  * The buffer manager calls this function in AP_BULKREAD mode when the buffer
+  * handed to it turns out to need a WAL flush to write out. This gives the
+  * strategy a second chance to choose another victim.
+  *
+  * Returns true if the buffer manager should ask for a new victim, and false
+  * if the WAL should be flushed and this buffer used.
+  */
+ bool
+ StrategyRejectBuffer(volatile BufferDesc *buf)
+ {
+ 	Assert(RingSize > 0);
+ 
+ 	if (BufferRing[RingCurSlot] == buf->buf_id)
+ 	{
+ 		BufferRing[RingCurSlot] = BUF_ID_NOT_SET;
+ 		return true;
+ 	}
+ 	else
+ 	{
+ 		/*
+ 		 * Apparently the buffer didn't come from the ring. We don't want to
+ 		 * mess with how the clock sweep works; in the worst case there are
+ 		 * no buffers in the buffer cache that can be reused without a WAL
+ 		 * flush, and we'd get into an endless loop trying.
+ 		 */
+ 		return false;
+ 	}
+ }
Index: src/include/access/relscan.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/relscan.h,v
retrieving revision 1.52
diff -c -r1.52 relscan.h
*** src/include/access/relscan.h	20 Jan 2007 18:43:35 -0000	1.52
--- src/include/access/relscan.h	15 May 2007 17:01:31 -0000
***************
*** 28,33 ****
--- 28,34 ----
  	ScanKey		rs_key;			/* array of scan key descriptors */
  	BlockNumber rs_nblocks;		/* number of blocks to scan */
  	bool		rs_pageatatime; /* verify visibility page-at-a-time? */
+ 	AccessPattern rs_accesspattern; /* access pattern to use for reads */
  
  	/* scan current state */
  	bool		rs_inited;		/* false = scan not init'd yet */
Index: src/include/access/xlog.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/access/xlog.h,v
retrieving revision 1.76
diff -c -r1.76 xlog.h
*** src/include/access/xlog.h	5 Jan 2007 22:19:51 -0000	1.76
--- src/include/access/xlog.h	14 May 2007 21:22:40 -0000
***************
*** 151,156 ****
--- 151,157 ----
  
  extern XLogRecPtr XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata);
  extern void XLogFlush(XLogRecPtr RecPtr);
+ extern bool XLogNeedsFlush(XLogRecPtr RecPtr);
  
  extern void xlog_redo(XLogRecPtr lsn, XLogRecord *record);
  extern void xlog_desc(StringInfo buf, uint8 xl_info, char *rec);
Index: src/include/storage/buf_internals.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.89
diff -c -r1.89 buf_internals.h
*** src/include/storage/buf_internals.h	5 Jan 2007 22:19:57 -0000	1.89
--- src/include/storage/buf_internals.h	15 May 2007 17:07:59 -0000
***************
*** 16,21 ****
--- 16,22 ----
  #define BUFMGR_INTERNALS_H
  
  #include "storage/buf.h"
+ #include "storage/bufmgr.h"
  #include "storage/lwlock.h"
  #include "storage/shmem.h"
  #include "storage/spin.h"
***************
*** 168,174 ****
  extern BufferDesc *LocalBufferDescriptors;
  
  /* in freelist.c */
! extern bool strategy_hint_vacuum;
  
  /* event counters in buf_init.c */
  extern long int ReadBufferCount;
--- 169,175 ----
  extern BufferDesc *LocalBufferDescriptors;
  
  /* in freelist.c */
! extern AccessPattern active_access_pattern;
  
  /* event counters in buf_init.c */
  extern long int ReadBufferCount;
***************
*** 184,195 ****
   */
  
  /* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(void);
  extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
  extern int	StrategySyncStart(void);
  extern Size StrategyShmemSize(void);
  extern void StrategyInitialize(bool init);
  
  /* buf_table.c */
  extern Size BufTableShmemSize(int size);
  extern void InitBufTable(int size);
--- 185,198 ----
   */
  
  /* freelist.c */
! extern volatile BufferDesc *StrategyGetBuffer(bool *lock_held);
  extern void StrategyFreeBuffer(volatile BufferDesc *buf, bool at_head);
  extern int	StrategySyncStart(void);
  extern Size StrategyShmemSize(void);
  extern void StrategyInitialize(bool init);
  
+ extern bool StrategyRejectBuffer(volatile BufferDesc *buf);
+ 
  /* buf_table.c */
  extern Size BufTableShmemSize(int size);
  extern void InitBufTable(int size);
Index: src/include/storage/bufmgr.h
===================================================================
RCS file: /home/hlinnaka/pgcvsrepository/pgsql/src/include/storage/bufmgr.h,v
retrieving revision 1.103
diff -c -r1.103 bufmgr.h
*** src/include/storage/bufmgr.h	2 May 2007 23:18:03 -0000	1.103
--- src/include/storage/bufmgr.h	15 May 2007 17:07:02 -0000
***************
*** 48,53 ****
--- 48,61 ----
  #define BUFFER_LOCK_SHARE		1
  #define BUFFER_LOCK_EXCLUSIVE	2
  
+ typedef enum AccessPattern
+ {
+ 	AP_NORMAL,		/* Normal random access */
+ 	AP_BULKREAD,	/* Large read-only scan (hint bit updates are OK) */
+ 	AP_COPY,		/* Large updating scan, like COPY with WAL enabled */
+ 	AP_VACUUM		/* VACUUM */
+ } AccessPattern;
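+ 
+ /*
+  * Typical usage of the access pattern hint (a minimal sketch; see copy.c
+  * and vacuum.c for real callers):
+  *
+  *		SetAccessPattern(AP_VACUUM);
+  *		... perform the bulk operation ...
+  *		SetAccessPattern(AP_NORMAL);
+  *
+  * The hint must be reset to AP_NORMAL on error as well; vacuum.c does so
+  * in its PG_CATCH block before re-throwing.
+  */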
+ 
  /*
   * These routines are beaten on quite heavily, hence the macroization.
   */
***************
*** 157,162 ****
  extern void AtProcExit_LocalBuffers(void);
  
  /* in freelist.c */
! extern void StrategyHintVacuum(bool vacuum_active);
  
  #endif
--- 165,170 ----
  extern void AtProcExit_LocalBuffers(void);
  
  /* in freelist.c */
! extern void SetAccessPattern(AccessPattern new_pattern);
  
  #endif
