/*-------------------------------------------------------------------------
 *
 * pageinfo.c
 *		Store information of data pages which should be read ahead.
 *
 * Portions Copyright (c) 2008, Nippon Telegraph and Telephone Corporation
 * Portions Copyright (c) 1996-2004, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *-------------------------------------------------------------------------
 */

#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>

#include "postgres.h"
#include "catalog/catalog.h"
#include "storage/relfilenode.h"
#include "storage/block.h"

#include "pageinfo.h"

/*
 * Description of one data page that is a candidate for read-ahead.
 */
typedef struct pageinfo
{
	/* Physical location of the data page. */
	RelFileNode node;
	BlockNumber blkno;

	/*
	 * Byte offset of the location in the WAL segment file (xrecoff as
	 * defined in xlogdefs.h). The read-ahead command never deals with more
	 * than one WAL segment file at once and xlogid does not change during
	 * read-ahead, so xrecoff alone is sufficient.
	 */
	uint32 xrecoff;

	/*
	 * True when the WAL record contains a full page write. Used to skip
	 * unnecessary read-aheads.
	 */
	bool has_fpw;
} pageinfo;

/*
 * MAX_PAGEINFO_NUM is the maximum number of pageinfo entries collected
 * during one read-ahead round. When the number of pageinfos reaches this
 * amount, we start read-ahead.
 *
 * We limit the total size of read-ahead to 1GB as a default setting, which
 * means 131,072 pages when the block size is 8KB. MAX_PAGEINFO_NUM should be
 * chosen according to the amount of RAM available for disk cache.
 */
#define MAX_PAGEINFO_NUM	((1024 * 1024 * 1024) / BLCKSZ)

/* The table of pageinfo entries; only the first entry_used slots are in use. */
static pageinfo entry[MAX_PAGEINFO_NUM];

/* The number of pageinfo entries currently used. */
static uint32 entry_used;

/*
 * Prototypes of local functions.
 *
 * The dump helpers are real functions only when DEBUG is defined. Otherwise
 * they are replaced by macros that merely cast their arguments to void, so
 * call sites do not need an #ifdef of their own.
 */
static int pageinfo_compare(const void *l, const void *r);
#ifdef DEBUG
static void pageinfo_dump_all_entry(const char *);
static void pageinfo_dump_pageinfo(const char *str, pageinfo *info);
#else
#define pageinfo_dump_all_entry(str)			((void)(str))
#define pageinfo_dump_pageinfo(str, info)		((void)(str), (void)(info))
#endif

/*
 * Initialize the pageinfo module.
 *
 * Marks the entry table as empty by resetting the used-entry counter.
 */
void
pageinfo_init(void)
{
	entry_used = 0;
}

/*
 * Record one more data page in the table.
 *
 * The page is stored in the next free slot. If the table is already full,
 * the request is dropped without any notification to the caller.
 */
void
pageinfo_add(RelFileNode node, BlockNumber blkno, uint32 xrecoff,
	bool has_fpw)
{
	pageinfo *slot;

	/* No free slot left: ignore the new entry. */
	if (entry_used >= MAX_PAGEINFO_NUM)
		return;

	/* Fill in the next free slot, then account for it. */
	slot = &entry[entry_used];
	slot->node = node;
	slot->blkno = blkno;
	slot->xrecoff = xrecoff;
	slot->has_fpw = has_fpw;

	entry_used++;
}

/*
 * Pageinfo table availability check
 *
 * Returns true when num more pageinfos can still be appended to the table
 * without exceeding MAX_PAGEINFO_NUM, and false otherwise.
 */
bool
pageinfo_has_room(int num)
{
	return entry_used + num <= MAX_PAGEINFO_NUM;
}

/*
 * Check whether info1 and info2 point to the same data page, i.e. whether
 * both their RelFileNode and their block number are equal.
 *
 * Both arguments are pageinfo objects (not pointers). Each argument appears
 * more than once in the expansion, so do not pass expressions that have side
 * effects.
 */
#define IS_SAME_PAGE(info1, info2) \
	(RelFileNodeEquals((info1).node, (info2).node) && \
		(info1).blkno == (info2).blkno)
 
/*
 * Execute read ahead data pages
 * 
 * Before we actually read ahead data pages, sort the pageinfos in the table
 * for avoiding duplicated disk access and hopefully, reducing seek time.
 * We also skip read ahead data pages which has full page write.
 *
 * For performance, we keep file opened until reading another file.
 */
void pageinfo_read_ahead(void)
{
	int fd = -1;
	int i;
	BlockNumber last_segno = InvalidBlockNumber;
	BlockNumber segno;
	pageinfo last_entry = { { 0, 0, 0, }, 0, 0, false };

	/* Sort the pageinfo table for effective disk access. */
	pageinfo_dump_all_entry("before sort");
	qsort(entry, entry_used, sizeof(pageinfo), pageinfo_compare);
	pageinfo_dump_all_entry("after sort");

	for (i = 0; i < entry_used; i++)
	{
		/* Do read ahead once per a page if it doesn't have full page write. */
		if (IS_SAME_PAGE(last_entry, entry[i]) || entry[i].has_fpw)
		{
			last_entry = entry[i];
			continue;
		}

		/* Open data file if not opened yet. */
		last_segno = last_entry.blkno / RELSEG_SIZE;
		segno = entry[i].blkno / RELSEG_SIZE;

		if (last_segno != segno ||
				!RelFileNodeEquals(last_entry.node, entry[i].node))
		{
			char *path;
			char *fullpath;
#ifdef DEBUG
			int open_errno;
#endif

#ifdef DEBUG
			printf("%s(): OPEN\n", __func__);
#endif

			if (fd != -1)
			{
#ifdef DEBUG
				printf("%s(): closing file.\n", __func__);
#endif
				close(fd);
			}

			/*
			 * Generate file path and add segment number if necessary.
			 * Based on _mdfd_openseg() in src/backend/storage/smgr.c
			 */
			path = relpath(entry[i].node);
			if (segno > 0)
			{
				fullpath = malloc(strlen(path) + 12);
				if (!fullpath)
				{
					printf("Can't allocate memory for data file path.\n");
					exit(1);
				}
				sprintf(fullpath, "%s.%u", path, segno);
				free(path);
			}
			else
				fullpath = path;

			fd = open(fullpath, O_RDONLY | PG_BINARY);
#ifdef DEBUG
			open_errno = errno;
			printf("%s(): opening '%s'\n", __func__, fullpath);
#endif
			free(fullpath);
			if (fd == -1)
			{
				/*
				 * Even if open() returns error, continue to read ahead.
				 * We assume that the data file has not been created yet.
				 */
#ifdef DEBUG
				printf("%s(): failed to open data file '%s'\n", __func__,
					strerror(open_errno));
#endif
				last_entry = entry[i];
				continue;
			}
		}

#ifdef HAVE_DECL_POSIX_FADVISE
		/* Read ahead with posix_fadvise() */
		if (fd != -1)
		{
#ifdef DEBUG
				printf("%s(): reading\n", __func__);
#endif
			/* Even if posix_fadvise() returns error, continue to read ahead. */
			posix_fadvise(fd, entry[i].blkno % RELSEG_SIZE, BLCKSZ,
				POSIX_FADV_WILLNEED);
		}
#endif

		/* Store pageinfo to skip duplicate pages. */
		last_entry = entry[i];
	}
}

/*
 * qsort() comparator for two pageinfo objects
 *
 * Returns 1 when l > r, 0 when l == r and -1 when l < r. The keys are
 * compared in the following order; the first one that differs decides:
 *    1. node.spcNode
 *    2. node.dbNode
 *    3. node.relNode
 *    4. blkno
 *    5. xrecoff
 */
static int
pageinfo_compare(const void *l, const void *r)
{
	const pageinfo *a = (const pageinfo *) l;
	const pageinfo *b = (const pageinfo *) r;

	if (a->node.spcNode != b->node.spcNode)
		return (a->node.spcNode > b->node.spcNode) ? 1 : -1;

	if (a->node.dbNode != b->node.dbNode)
		return (a->node.dbNode > b->node.dbNode) ? 1 : -1;

	if (a->node.relNode != b->node.relNode)
		return (a->node.relNode > b->node.relNode) ? 1 : -1;

	if (a->blkno != b->blkno)
		return (a->blkno > b->blkno) ? 1 : -1;

	if (a->xrecoff != b->xrecoff)
		return (a->xrecoff > b->xrecoff) ? 1 : -1;

	/* Every key matches: the two pageinfos are the same. */
	return 0;
}

/*
 * Dump all pageinfos stored in entry (DEBUG builds only).
 *
 * str is a label that is printed in front of every dumped entry, followed by
 * the index of the entry in the table.
 */
#ifdef DEBUG
static void
pageinfo_dump_all_entry(const char *str)
{
	uint32 i;
	char buff[1024];

	for (i = 0; i < entry_used; i++)
	{
		/* Bounded formatting: an over-long label is truncated, not overrun. */
		snprintf(buff, sizeof(buff), "%s entry[%u]", str, i);
		pageinfo_dump_pageinfo(buff, entry + i);
	}
}
#endif

/*
 * Dump a single pageinfo to stdout (DEBUG builds only).
 *
 * str is a label printed in front of the fields. The output has the form
 *    label: { { spcNode, dbNode, relNode }, blkno, xrecoff, has_fpw }
 *
 * The numeric fields are printed with %u: xrecoff is a uint32, and the
 * RelFileNode members and blkno are unsigned as well in the PostgreSQL
 * headers, so %d would show large values as negative numbers.
 */
#ifdef DEBUG
static void
pageinfo_dump_pageinfo(const char *str, pageinfo *info)
{
	printf("%s: { { %u, %u, %u }, %u, %u, %s }\n", str,
		info->node.spcNode, info->node.dbNode, info->node.relNode,
		info->blkno, info->xrecoff, info->has_fpw ? "true" : "false");
}
#endif

