Re: A patch for xlog.c

From: Bruce Momjian <pgman(at)candle(dot)pha(dot)pa(dot)us>
To: Matthew Kirkwood <matthew(at)hairy(dot)beasts(dot)org>
Cc: pgsql-patches(at)postgresql(dot)org
Subject: Re: A patch for xlog.c
Date: 2001-02-24 21:01:15
Message-ID: 200102242101.QAA08185@candle.pha.pa.us
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers pgsql-patches

I am confused about why mmap() is better than writing to a real file.
Don't we need to write to a real file so that it is available for
database recovery?

> Hi,
>
> Here is a patch against 7.1beta5 to use mmap(), and thus a
> single write, to initialise xlogs. It may well improve the
> performance of xlog initialisation on platforms/filesystems
> which write metadata synchronously.
>
> It needs a configure test, but certainly builds and runs
> OK.
>
> It also wraps the file reopening in an "ifdef WIN32", since
> it certainly isn't needed for UNIX-like platforms (which I
> assume includes BeOS).
>
> Matthew.
>
>
> diff -ruN postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c postgresql-7.1beta5/src/backend/access/transam/xlog.c
> --- postgresql-7.1beta5-clean/src/backend/access/transam/xlog.c Fri Feb 23 18:12:00 2001
> +++ postgresql-7.1beta5/src/backend/access/transam/xlog.c Sat Feb 24 15:23:41 2001
> @@ -24,6 +24,10 @@
> #include <locale.h>
> #endif
>
> +#ifdef _HAVE_MMAP
> +#include <sys/mman.h>
> +#endif
> +
> #include "access/transam.h"
> #include "access/xact.h"
> #include "catalog/catversion.h"
> @@ -36,6 +40,7 @@
> #include "access/xlogutils.h"
> #include "utils/builtins.h"
> #include "utils/relcache.h"
> +#include "utils/pfile.h"
>
> #include "miscadmin.h"
>
> @@ -53,6 +58,10 @@
> StartUpID ThisStartUpID = 0;
> XLogRecPtr RedoRecPtr;
>
> +#ifdef _HAVE_MMAP
> +void *zmmap = NULL;
> +#endif
> +
> int XLOG_DEBUG = 0;
>
> /* To read/update control file and create new log file */
> @@ -955,7 +964,6 @@
> {
> char path[MAXPGPATH];
> char tpath[MAXPGPATH];
> - char zbuffer[BLCKSZ];
> int fd;
> int nbytes;
>
> @@ -987,28 +995,36 @@
> elog(STOP, "InitCreate(logfile %u seg %u) failed: %m",
> logId, logSeg);
>
> - /*
> - * Zero-fill the file. We have to do this the hard way to ensure that
> - * all the file space has really been allocated --- on platforms that
> - * allow "holes" in files, just seeking to the end doesn't allocate
> - * intermediate space. This way, we know that we have all the space
> - * and (after the fsync below) that all the indirect blocks are down
> - * on disk. Therefore, fdatasync(2) will be sufficient to sync future
> - * writes to the log file.
> - */
> - MemSet(zbuffer, 0, sizeof(zbuffer));
> - for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
> +#ifdef _HAVE_MMAP
> + if (!zmmap || (write(fd, zmmap, XLogSegSize) != XLogSegSize))
> +#endif
> {
> - if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
> - elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
> - logId, logSeg);
> + /*
> + * Zero-fill the file. We have to do this the hard way to ensure that
> + * all the file space has really been allocated --- on platforms that
> + * allow "holes" in files, just seeking to the end doesn't allocate
> + * intermediate space. This way, we know that we have all the space
> + * and (after the fsync below) that all the indirect blocks are down
> + * on disk. Therefore, fdatasync(2) will be sufficient to sync future
> + * writes to the log file.
> + */
> + char zbuffer[BLCKSZ];
> + MemSet(zbuffer, 0, sizeof(zbuffer));
> + for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(zbuffer))
> + {
> + if ((int) write(fd, zbuffer, sizeof(zbuffer)) != (int) sizeof(zbuffer))
> + elog(STOP, "ZeroFill(logfile %u seg %u) failed: %m",
> + logId, logSeg);
> + }
> }
>
> if (pg_fsync(fd) != 0)
> elog(STOP, "fsync(logfile %u seg %u) failed: %m",
> logId, logSeg);
>
> +#ifdef WIN32
> close(fd);
> +#endif
>
> /*
> * Prefer link() to rename() here just to be sure that we don't overwrite
> @@ -1026,10 +1042,12 @@
> logId, logSeg);
> #endif
>
> +#ifdef WIN32
> fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
> if (fd < 0)
> elog(STOP, "InitReopen(logfile %u seg %u) failed: %m",
> logId, logSeg);
> +#endif
>
> return (fd);
> }
> @@ -1255,11 +1273,8 @@
> if (noBlck || readOff != (RecPtr->xrecoff % XLogSegSize) / BLCKSZ)
> {
> readOff = (RecPtr->xrecoff % XLogSegSize) / BLCKSZ;
> - if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> - elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> - readId, readSeg, readOff);
> - if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
> - elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
> + if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> + elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
> readId, readSeg, readOff);
> if (((XLogPageHeader) readBuf)->xlp_magic != XLOG_PAGE_MAGIC)
> {
> @@ -1415,19 +1430,13 @@
> elog(LOG, "Formatting logfile %u seg %u block %u at offset %u",
> readId, readSeg, readOff, EndRecPtr.xrecoff % BLCKSZ);
> readFile = XLogFileOpen(readId, readSeg, false);
> - if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> - elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> - readId, readSeg, readOff);
> - if (read(readFile, readBuf, BLCKSZ) != BLCKSZ)
> - elog(STOP, "ReadRecord: read(logfile %u seg %u off %u) failed: %m",
> + if (pg_pread(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> + elog(STOP, "ReadRecord: pg_pread(logfile %u seg %u off %u) failed: %m",
> readId, readSeg, readOff);
> memset(readBuf + EndRecPtr.xrecoff % BLCKSZ, 0,
> BLCKSZ - EndRecPtr.xrecoff % BLCKSZ);
> - if (lseek(readFile, (off_t) (readOff * BLCKSZ), SEEK_SET) < 0)
> - elog(STOP, "ReadRecord: lseek(logfile %u seg %u off %u) failed: %m",
> - readId, readSeg, readOff);
> - if (write(readFile, readBuf, BLCKSZ) != BLCKSZ)
> - elog(STOP, "ReadRecord: write(logfile %u seg %u off %u) failed: %m",
> + if (pg_pwrite(readFile, readBuf, BLCKSZ, (readOff * BLCKSZ)) != BLCKSZ)
> + elog(STOP, "ReadRecord: pg_pwrite(logfile %u seg %u off %u) failed: %m",
> readId, readSeg, readOff);
> readOff++;
> }
> @@ -1797,6 +1806,28 @@
> return buf;
> }
>
> +
> +#ifdef _HAVE_MMAP
> +static void
> +ZeroMapInit(void)
> +{
> + int zfd;
> +
> + zfd = BasicOpenFile("/dev/zero", O_RDONLY, 0);
> + if (zfd < 0) {
> + elog(LOG, "Can't open /dev/zero: %m");
> + return;
> + }
> + zmmap = mmap(NULL, XLogSegSize, PROT_READ, MAP_SHARED, zfd, 0);
> + if (!zmmap)
> + elog(LOG, "Can't mmap /dev/zero: %m");
> + close(zfd);
> +}
> +#else
> +#define ZeroMapInit()
> +#endif
> +
> +
> /*
> * This func must be called ONCE on system startup
> */
> @@ -1811,6 +1842,9 @@
> char buffer[_INTL_MAXLOGRECSZ + SizeOfXLogRecord];
>
> elog(LOG, "starting up");
> +
> + ZeroMapInit();
> +
> CritSectionCount++;
>
> XLogCtl->xlblocks = (XLogRecPtr *) (((char *) XLogCtl) + sizeof(XLogCtlData));
>
>

--
Bruce Momjian | http://candle.pha.pa.us
pgman(at)candle(dot)pha(dot)pa(dot)us | (610) 853-3000
+ If your life is a hard drive, | 830 Blythe Avenue
+ Christ can be your backup. | Drexel Hill, Pennsylvania 19026

In response to

Responses

Browse pgsql-hackers by date

  From Date Subject
Next Message Tom Lane 2001-02-24 21:58:16 Re: pg_atoi() behavior change? Intentional?
Previous Message Matthew Kirkwood 2001-02-24 20:37:20 Re: A patch for xlog.c

Browse pgsql-patches by date

  From Date Subject
Next Message Tom Lane 2001-02-24 22:20:06 Re: A patch for xlog.c
Previous Message Matthew Kirkwood 2001-02-24 20:37:20 Re: A patch for xlog.c