Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.257
diff -c -c -r1.257 xlog.c
*** src/backend/access/transam/xlog.c	21 Nov 2006 20:59:52 -0000	1.257
--- src/backend/access/transam/xlog.c	24 Nov 2006 18:57:39 -0000
***************
*** 49,127 ****
  #include "utils/pg_locale.h"
  
  
- /*
-  *	Because O_DIRECT bypasses the kernel buffers, and because we never
-  *	read those buffers except during crash recovery, it is a win to use
-  *	it in all cases where we sync on each write().	We could allow O_DIRECT
-  *	with fsync(), but because skipping the kernel buffer forces writes out
-  *	quickly, it seems best just to use it for O_SYNC.  It is hard to imagine
-  *	how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
-  *	Also, O_DIRECT is never enough to force data to the drives, it merely
-  *	tries to bypass the kernel cache, so we still need O_SYNC or fsync().
-  */
- #ifdef O_DIRECT
- #define PG_O_DIRECT				O_DIRECT
- #else
- #define PG_O_DIRECT				0
- #endif
- 
- /*
-  * This chunk of hackery attempts to determine which file sync methods
-  * are available on the current platform, and to choose an appropriate
-  * default method.	We assume that fsync() is always available, and that
-  * configure determined whether fdatasync() is.
-  */
- #if defined(O_SYNC)
- #define BARE_OPEN_SYNC_FLAG		O_SYNC
- #elif defined(O_FSYNC)
- #define BARE_OPEN_SYNC_FLAG		O_FSYNC
- #endif
- #ifdef BARE_OPEN_SYNC_FLAG
- #define OPEN_SYNC_FLAG			(BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
- #endif
- 
- #if defined(O_DSYNC)
- #if defined(OPEN_SYNC_FLAG)
- /* O_DSYNC is distinct? */
- #if O_DSYNC != BARE_OPEN_SYNC_FLAG
- #define OPEN_DATASYNC_FLAG		(O_DSYNC | PG_O_DIRECT)
- #endif
- #else							/* !defined(OPEN_SYNC_FLAG) */
- /* Win32 only has O_DSYNC */
- #define OPEN_DATASYNC_FLAG		(O_DSYNC | PG_O_DIRECT)
- #endif
- #endif
- 
- #if defined(OPEN_DATASYNC_FLAG)
- #define DEFAULT_SYNC_METHOD_STR "open_datasync"
- #define DEFAULT_SYNC_METHOD		SYNC_METHOD_OPEN
- #define DEFAULT_SYNC_FLAGBIT	OPEN_DATASYNC_FLAG
- #elif defined(HAVE_FDATASYNC)
- #define DEFAULT_SYNC_METHOD_STR "fdatasync"
- #define DEFAULT_SYNC_METHOD		SYNC_METHOD_FDATASYNC
- #define DEFAULT_SYNC_FLAGBIT	0
- #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)
- #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough"
- #define DEFAULT_SYNC_METHOD		SYNC_METHOD_FSYNC_WRITETHROUGH
- #define DEFAULT_SYNC_FLAGBIT	0
- #else
- #define DEFAULT_SYNC_METHOD_STR "fsync"
- #define DEFAULT_SYNC_METHOD		SYNC_METHOD_FSYNC
- #define DEFAULT_SYNC_FLAGBIT	0
- #endif
- 
- 
- /*
-  * Limitation of buffer-alignment for direct IO depends on OS and filesystem,
-  * but XLOG_BLCKSZ is assumed to be enough for it.
-  */
- #ifdef O_DIRECT
- #define ALIGNOF_XLOG_BUFFER		XLOG_BLCKSZ
- #else
- #define ALIGNOF_XLOG_BUFFER		ALIGNOF_BUFFER
- #endif
- 
- 
  /* File path names (all relative to $PGDATA) */
  #define BACKUP_LABEL_FILE		"backup_label"
  #define BACKUP_LABEL_OLD		"backup_label.old"
--- 49,54 ----
Index: src/include/access/xlog.h
===================================================================
RCS file: /cvsroot/pgsql/src/include/access/xlog.h,v
retrieving revision 1.75
diff -c -c -r1.75 xlog.h
*** src/include/access/xlog.h	5 Nov 2006 22:42:10 -0000	1.75
--- src/include/access/xlog.h	24 Nov 2006 18:57:42 -0000
***************
*** 90,95 ****
--- 90,167 ----
  extern int	sync_method;
  
  /*
+  *	Because O_DIRECT bypasses the kernel buffers, and because we never
+  *	read those buffers except during crash recovery, it is a win to use
+  *	it in all cases where we sync on each write().	We could allow O_DIRECT
+  *	with fsync(), but because skipping the kernel buffer forces writes out
+  *	quickly, it seems best just to use it for O_SYNC.  It is hard to imagine
+  *	how fsync() could be a win for O_DIRECT compared to O_SYNC and O_DIRECT.
+  *	Also, O_DIRECT is never enough to force data to the drives, it merely
+  *	tries to bypass the kernel cache, so we still need O_SYNC or fsync().
+  */
+ #ifdef O_DIRECT
+ #define PG_O_DIRECT				O_DIRECT
+ #else
+ #define PG_O_DIRECT				0
+ #endif
+ 
+ /*
+  * This chunk of hackery attempts to determine which file sync methods
+  * are available on the current platform, and to choose an appropriate
+  * default method.	We assume that fsync() is always available, and that
+  * configure determined whether fdatasync() is.
+  */
+ #if defined(O_SYNC)
+ #define BARE_OPEN_SYNC_FLAG		O_SYNC
+ #elif defined(O_FSYNC)
+ #define BARE_OPEN_SYNC_FLAG		O_FSYNC
+ #endif
+ #ifdef BARE_OPEN_SYNC_FLAG
+ #define OPEN_SYNC_FLAG			(BARE_OPEN_SYNC_FLAG | PG_O_DIRECT)
+ #endif
+ 
+ #if defined(O_DSYNC)
+ #if defined(OPEN_SYNC_FLAG)
+ /* O_DSYNC is distinct? */
+ #if O_DSYNC != BARE_OPEN_SYNC_FLAG
+ #define OPEN_DATASYNC_FLAG		(O_DSYNC | PG_O_DIRECT)
+ #endif
+ #else							/* !defined(OPEN_SYNC_FLAG) */
+ /* Win32 only has O_DSYNC */
+ #define OPEN_DATASYNC_FLAG		(O_DSYNC | PG_O_DIRECT)
+ #endif
+ #endif
+ 
+ #if defined(OPEN_DATASYNC_FLAG)
+ #define DEFAULT_SYNC_METHOD_STR "open_datasync"
+ #define DEFAULT_SYNC_METHOD		SYNC_METHOD_OPEN
+ #define DEFAULT_SYNC_FLAGBIT	OPEN_DATASYNC_FLAG
+ #elif defined(HAVE_FDATASYNC)
+ #define DEFAULT_SYNC_METHOD_STR "fdatasync"
+ #define DEFAULT_SYNC_METHOD		SYNC_METHOD_FDATASYNC
+ #define DEFAULT_SYNC_FLAGBIT	0
+ #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)
+ #define DEFAULT_SYNC_METHOD_STR "fsync_writethrough"
+ #define DEFAULT_SYNC_METHOD		SYNC_METHOD_FSYNC_WRITETHROUGH
+ #define DEFAULT_SYNC_FLAGBIT	0
+ #else
+ #define DEFAULT_SYNC_METHOD_STR "fsync"
+ #define DEFAULT_SYNC_METHOD		SYNC_METHOD_FSYNC
+ #define DEFAULT_SYNC_FLAGBIT	0
+ #endif
+ 
+ 
+ /*
+  * Limitation of buffer-alignment for direct IO depends on OS and filesystem,
+  * but XLOG_BLCKSZ is assumed to be enough for it.
+  */
+ #ifdef O_DIRECT
+ #define ALIGNOF_XLOG_BUFFER		XLOG_BLCKSZ
+ #else
+ #define ALIGNOF_XLOG_BUFFER		ALIGNOF_BUFFER
+ #endif
+ 
+ /*
   * The rmgr data to be written by XLogInsert() is defined by a chain of
   * one or more XLogRecData structs.  (Multiple structs would be used when
   * parts of the source data aren't physically adjacent in memory, or when
Index: src/tools/fsync/test_fsync.c
===================================================================
RCS file: /cvsroot/pgsql/src/tools/fsync/test_fsync.c,v
retrieving revision 1.16
diff -c -c -r1.16 test_fsync.c
*** src/tools/fsync/test_fsync.c	23 Nov 2006 17:20:47 -0000	1.16
--- src/tools/fsync/test_fsync.c	24 Nov 2006 18:57:43 -0000
***************
*** 1,10 ****
  /*
   *	test_fsync.c
!  *		tests if fsync can be done from another process than the original write
   */
  
! #include "../../include/pg_config.h"
! #include "../../include/pg_config_os.h"
  
  #include <sys/types.h>
  #include <sys/stat.h>
--- 1,12 ----
  /*
   *	test_fsync.c
!  *		test various fsync() methods
   */
  
! #include "postgres.h"
! 
! #include "access/xlog_internal.h"
! #include "access/xlog.h"
  
  #include <sys/types.h>
  #include <sys/stat.h>
***************
*** 14,42 ****
  #include <time.h>
  #include <sys/time.h>
  #include <unistd.h>
  
  #ifdef WIN32
  #define FSYNC_FILENAME	"./test_fsync.out"
  #else
  #define FSYNC_FILENAME	"/var/tmp/test_fsync.out"
  #endif
  
! /* O_SYNC and O_FSYNC are the same */
! #if defined(O_SYNC)
! #define OPEN_SYNC_FLAG		O_SYNC
! #elif defined(O_FSYNC)
! #define OPEN_SYNC_FLAG		O_FSYNC
! #elif defined(O_DSYNC)
! #define OPEN_DATASYNC_FLAG	O_DSYNC
! #endif
! 
! #if defined(OPEN_SYNC_FLAG)
! #if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG)
! #define OPEN_DATASYNC_FLAG	O_DSYNC
! #endif
! #endif
  
! #define WAL_FILE_SIZE	(16 * 1024 * 1024)
  
  void		die(char *str);
  void		print_elapse(struct timeval start_t, struct timeval elapse_t);
--- 16,34 ----
  #include <time.h>
  #include <sys/time.h>
  #include <unistd.h>
+ #include <string.h>
  
  #ifdef WIN32
  #define FSYNC_FILENAME	"./test_fsync.out"
  #else
+ /* /tmp might be a memory file system */
  #define FSYNC_FILENAME	"/var/tmp/test_fsync.out"
  #endif
  
! #define WRITE_SIZE	(16 * 1024)
  
! /* We allocate extra to  guarantee ALIGNOF_XLOG_BUFFER alignment */
! #define ALLOCATE_WAL_FILE_SIZE	(WRITE_SIZE + ALIGNOF_XLOG_BUFFER)
  
  void		die(char *str);
  void		print_elapse(struct timeval start_t, struct timeval elapse_t);
***************
*** 49,55 ****
  	int			tmpfile,
  				i,
  				loops = 1000;
! 	char	   *strout = (char *) malloc(WAL_FILE_SIZE);
  	char	   *filename = FSYNC_FILENAME;
  
  	if (argc > 2 && strcmp(argv[1], "-f") == 0)
--- 41,47 ----
  	int			tmpfile,
  				i,
  				loops = 1000;
! 	char	   *full_buf = (char *) malloc(ALLOCATE_WAL_FILE_SIZE), *buf;
  	char	   *filename = FSYNC_FILENAME;
  
  	if (argc > 2 && strcmp(argv[1], "-f") == 0)
***************
*** 62,74 ****
  	if (argc > 1)
  		loops = atoi(argv[1]);
  
! 	for (i = 0; i < WAL_FILE_SIZE; i++)
! 		strout[i] = 'a';
  
  	if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
  		die("Cannot open output file.");
! 	write(tmpfile, strout, WAL_FILE_SIZE);
! 	fsync(tmpfile);				/* fsync so later fsync's don't have to do it */
  	close(tmpfile);
  
  	printf("Simple write timing:\n");
--- 54,70 ----
  	if (argc > 1)
  		loops = atoi(argv[1]);
  
! 	buf = (char *)TYPEALIGN(ALIGNOF_XLOG_BUFFER, full_buf);
! 	for (i = 0; i < WRITE_SIZE; i++)
! 		buf[i] = 'a';
  
  	if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1)
  		die("Cannot open output file.");
! 	if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
! 		die("write failed");
! 	/* fsync so later fsync's don't have to do it */
! 	if (fsync(tmpfile) != 0)
! 		die("fsync failed");
  	close(tmpfile);
  
  	printf("Simple write timing:\n");
***************
*** 78,84 ****
  	{
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
! 		write(tmpfile, strout, 8192);
  		close(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
--- 74,81 ----
  	{
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  		close(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
***************
*** 95,102 ****
  	{
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
! 		write(tmpfile, strout, 8192);
! 		fsync(tmpfile);
  		close(tmpfile);
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
--- 92,101 ----
  	{
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
! 		if (fsync(tmpfile) != 0)
! 			die("fsync failed");
  		close(tmpfile);
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
***************
*** 114,125 ****
  	{
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
! 		write(tmpfile, strout, 8192);
  		close(tmpfile);
  		/* reopen file */
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
! 		fsync(tmpfile);
  		close(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
--- 113,126 ----
  	{
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  		close(tmpfile);
  		/* reopen file */
  		if ((tmpfile = open(filename, O_RDWR)) == -1)
  			die("Cannot open output file.");
! 		if (fsync(tmpfile) != 0)
! 			die("fsync failed");
  		close(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
***************
*** 135,141 ****
  		die("Cannot open output file.");
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
! 		write(tmpfile, strout, 16384);
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
  	printf("\tone 16k o_sync write   ");
--- 136,143 ----
  		die("Cannot open output file.");
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
! 		if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE)
! 			die("write failed");
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
  	printf("\tone 16k o_sync write   ");
***************
*** 148,155 ****
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		write(tmpfile, strout, 8192);
! 		write(tmpfile, strout, 8192);
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
--- 150,159 ----
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
***************
*** 169,175 ****
  		die("Cannot open output file.");
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
! 		write(tmpfile, strout, 8192);
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
  	printf("\topen o_dsync, write    ");
--- 173,180 ----
  		die("Cannot open output file.");
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
  	printf("\topen o_dsync, write    ");
***************
*** 181,187 ****
  		die("Cannot open output file.");
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
! 		write(tmpfile, strout, 8192);
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
  	printf("\topen o_sync, write     ");
--- 186,193 ----
  		die("Cannot open output file.");
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
  	printf("\topen o_sync, write     ");
***************
*** 199,205 ****
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		write(tmpfile, strout, 8192);
  		fdatasync(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
--- 205,212 ----
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  		fdatasync(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
***************
*** 217,224 ****
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		write(tmpfile, strout, 8192);
! 		fsync(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
--- 224,233 ----
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
! 		if (fsync(tmpfile) != 0)
! 			die("fsync failed");
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
***************
*** 235,242 ****
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		write(tmpfile, strout, 8192);
! 		write(tmpfile, strout, 8192);
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
--- 244,253 ----
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
***************
*** 254,261 ****
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		write(tmpfile, strout, 8192);
! 		write(tmpfile, strout, 8192);
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
--- 265,274 ----
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
***************
*** 271,278 ****
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		write(tmpfile, strout, 8192);
! 		write(tmpfile, strout, 8192);
  		fdatasync(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
--- 284,293 ----
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
  		fdatasync(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
***************
*** 290,298 ****
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		write(tmpfile, strout, 8192);
! 		write(tmpfile, strout, 8192);
! 		fsync(tmpfile);
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
--- 305,316 ----
  	gettimeofday(&start_t, NULL);
  	for (i = 0; i < loops; i++)
  	{
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
! 		if (write(tmpfile, buf, WRITE_SIZE/2) != WRITE_SIZE/2)
! 			die("write failed");
! 		if (fsync(tmpfile) != 0)
! 			die("fsync failed");
  	}
  	gettimeofday(&elapse_t, NULL);
  	close(tmpfile);
***************
*** 300,305 ****
--- 318,324 ----
  	print_elapse(start_t, elapse_t);
  	printf("\n");
  
+ 	free(full_buf);
  	unlink(filename);
  
  	return 0;
