Skip site navigation (1) Skip section navigation (2)

Re: A patch for xlog.c

From: Bruce Momjian <pgman(at)candle(dot)pha(dot)pa(dot)us>
To: Matthew Kirkwood <matthew(at)hairy(dot)beasts(dot)org>
Cc: pgsql-patches(at)postgresql(dot)org
Subject: Re: A patch for xlog.c
Date: 2001-02-24 21:01:15
Message-ID: 200102242101.QAA08185@candle.pha.pa.us (view raw or flat)
Thread:
Lists: pgsql-hackers, pgsql-patches
I am confused why mmap() is better than writing to a real file.  Don't
we need to write to a real file so it is available for database
recovery?


> Hi,
> 
> Here is a patch against 7.1beta5 to use mmap(), and thus a
> single write, to initialise xlogs.  It may well improve
> performance of this on platforms/filesystems which write
> metadata synchronously.
> 
> It needs a configure test, but certainly builds and runs
> OK.
> 
> It also wraps the file reopening in an "ifdef WIN32", since
> it certainly isn't needed for UNIX-like platforms (which I
> assume includes BeOS).
> 
> Matthew.
> 
> 
> diff -ruN postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c postgresql-7.1beta5/src/backend/access/transam/xlog.c
> --- postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c	Fri Feb 23 18:12:00 2001
> +++ postgresql-7.1beta5/src/backend/access/transam/xlog.c	Sat Feb 24 15:23:41 2001
> @@ -24,6 +24,10 @@
>  #include <locale.h>
>  #endif
>  
> +#ifdef	_HAVE_MMAP
> +#include <sys/mman.h>
> +#endif
> +
>  #include "access/transam.h"
>  #include "access/xact.h"
>  #include "catalog/catversion.h"
> @@ -36,6 +40,7 @@
>  #include "access/xlogutils.h"
>  #include "utils/builtins.h"
>  #include "utils/relcache.h"
> +#include "utils/pfile.h"
>  
>  #include "miscadmin.h"
>  
> @@ -53,6 +58,10 @@
>  StartUpID	ThisStartUpID = 0;
>  XLogRecPtr	RedoRecPtr;
>  
> +#ifdef	_HAVE_MMAP
> +void		*zmmap = NULL;
> +#endif
> +
>  int			XLOG_DEBUG = 0;
>  
>  /* To read/update control file and create new log file */
> @@ -955,7 +964,6 @@
>  {
>  	char		path[MAXPGPATH];
>  	char		tpath[MAXPGPATH];
> -	char		zbuffer[BLCKSZ];
>  	int			fd;
>  	int			nbytes;
>  
> @@ -987,28 +995,36 @@
>  		elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
>  			 logId, logSeg);
>  
> -	/*
> -	 * Zero-fill the file.  We have to do this the hard way to ensure that
> -	 * all the file space has really been allocated --- on platforms that
> -	 * allow "holes" in files, just seeking to the end doesn't allocate
> -	 * intermediate space.  This way, we know that we have all the space
> -	 * and (after the fsync below) that all the indirect blocks are down
> -	 * on disk.  Therefore, fdatasync(2) will be sufficient to sync future
> -	 * writes to the log file.
> -	 */
> -	MemSet(zbuffer, 0, sizeof(zbuffer));
> -	for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
> +#ifdef	_HAVE_MMAP
> +	if (!zmmap || (write(fd, zmmap, XLogSegSize) != XLogSegSize))
> +#endif
>  	{
> -		if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
> -			elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
> -				 logId, logSeg);
> +		/*
> +	 	* Zero-fill the file.  We have to do this the hard way to ensure that
> +	 	* all the file space has really been allocated --- on platforms that
> +	 	* allow "holes" in files, just seeking to the end doesn't allocate
> +	 	* intermediate space.  This way, we know that we have all the space
> +	 	* and (after the fsync below) that all the indirect blocks are down
> +	 	* on disk.  Therefore, fdatasync(2) will be sufficient to sync future
> +	 	* writes to the log file.
> +	 	*/
> +		char		zbuffer[BLCKSZ];
> +		MemSet(zbuffer, 0, sizeof(zbuffer));
> +		for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
> +		{
> +			if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
> +				elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
> +				 	logId, logSeg);
> +		}
>  	}
>  
>  	if (pg_fsync(fd) != 0)
>  		elog(STOP, "fsync(logfile %u seg %u) failed: %m",
>  			 logId, logSeg);
>  
> +#ifdef	WIN32
>  	close(fd);
> +#endif
>  
>  	/*
>  	 * Prefer link() to rename() here just to be sure that we don't overwrite
> @@ -1026,10 +1042,12 @@
>  			 logId, logSeg);
>  #endif
>  
> +#ifdef	WIN32
>  	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
>  	if (fd < 0)
>  		elog(STOP, "InitReopen(logfile %u seg %u) failed: %m",
>  			 logId, logSeg);
> +#endif
>  
>  	return (fd);
>  }
> @@ -1255,11 +1273,8 @@
>  	if (noBlck || readOff != (RecPtr->xrecoff % XLogSegSize) / BLCKSZ)
>  	{
>  		readOff = (RecPtr->xrecoff % XLogSegSize) / BLCKSZ;
> -		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> -			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> -				 readId, readSeg, readOff);
> -		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
> -			elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
> +		if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> +			elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
>  				 readId, readSeg, readOff);
>  		if (((XLogPageHeader) readBuf)->xlp_magic != XLOG_PAGE_MAGIC)
>  		{
> @@ -1415,19 +1430,13 @@
>  		elog(LOG, "Formatting logfile %u seg %u block %u at offset %u",
>  			 readId, readSeg, readOff, EndRecPtr.xrecoff % BLCKSZ);
>  		readFile = XLogFileOpen(readId, readSeg, false);
> -		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> -			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> -				 readId, readSeg, readOff);
> -		if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
> -			elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
> +		if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> +			elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
>  				 readId, readSeg, readOff);
>  		memset(readBuf + EndRecPtr.xrecoff % BLCKSZ, 0,
>  			   BLCKSZ - EndRecPtr.xrecoff % BLCKSZ);
> -		if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> -			elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> -				 readId, readSeg, readOff);
> -		if (write(readFile, readBuf, BLCKSZ) != BLCKSZ)
> -			elog(STOP, "ReadRecord: write(logfile %u seg %u off %u) failed: %m",
> +		if (pg_pwrite(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> +			elog(STOP, "ReadRecord: pg_pwrite(logfile %u seg %u off %u) failed: %m",
>  				 readId, readSeg, readOff);
>  		readOff++;
>  	}
> @@ -1797,6 +1806,28 @@
>  	return buf;
>  }
>  
> +
> +#ifdef	_HAVE_MMAP
> +static void
> +ZeroMapInit(void)
> +{
> +	int zfd;
> +
> +	zfd = BasicOpenFile("/dev/zero", O_RDONLY, 0);
> +	if (zfd < 0) {
> +		elog(LOG, "Can't open /dev/zero: %m");
> +		return;
> +	}
> +	zmmap = mmap(NULL, XLogSegSize, PROT_READ, MAP_SHARED, zfd, 0);
> +	if (!zmmap)
> +		elog(LOG, "Can't mmap /dev/zero: %m");
> +	close(zfd);
> +}
> +#else
> +#define	ZeroMapInit()
> +#endif
> +
> +
>  /*
>   * This func must be called ONCE on system startup
>   */
> @@ -1811,6 +1842,9 @@
>  	char		buffer[_INTL_MAXLOGRECSZ + SizeOfXLogRecord];
>  
>  	elog(LOG, "starting up");
> +
> +	ZeroMapInit();
> +
>  	CritSectionCount++;
>  
>  	XLogCtl->xlblocks = (XLogRecPtr *) (((char *) XLogCtl) + sizeof(XLogCtlData));
> 
> 


-- 
  Bruce Momjian                        |  http://candle.pha.pa.us
  pgman(at)candle(dot)pha(dot)pa(dot)us               |  (610) 853-3000
  +  If your life is a hard drive,     |  830 Blythe Avenue
  +  Christ can be your backup.        |  Drexel Hill, Pennsylvania 19026

In response to

Responses

pgsql-hackers by date

Next: From: Tom Lane, Date: 2001-02-24 21:58:16
Subject: Re: pg_atoi() behavior change? Intentional?
Previous: From: Matthew Kirkwood, Date: 2001-02-24 20:37:20
Subject: Re: A patch for xlog.c

pgsql-patches by date

Next: From: Tom Lane, Date: 2001-02-24 22:20:06
Subject: Re: A patch for xlog.c
Previous: From: Matthew Kirkwood, Date: 2001-02-24 20:37:20
Subject: Re: A patch for xlog.c

Privacy Policy | About PostgreSQL
Copyright © 1996-2014 The PostgreSQL Global Development Group