/*-------------------------------------------------------------------------
 *
 * walreceiver.c
 *
 * The WAL receiver process (walreceiver) is new as of Postgres 8.4. It 
 * focuses on receiving xlog records from walsender on the primary server
 * and writing them to the disk.
 *
 * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/postmaster/walreceiver.c,v 1.4 2008/01/01 19:45:51 momjian Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <unistd.h>

#include "access/xlog_internal.h"
#include "libpq-fe.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "postmaster/walreceiver.h"
#include "storage/ipc.h"
#include "storage/pmsignal.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "utils/resowner.h"

/*
 * Global state
 *
 * am_walreceiver is set to true by WalReceiverMain() once it has decided
 * that this process will actually act as the walreceiver.
 */
bool	am_walreceiver	= false;

/*
 * Private state
 *
 * walrcv_conn is the libpq connection to walsender on the primary.  It is
 * opened in WalReceiverMain() and closed by the WalRcvCloseConn() exit
 * callback.
 *
 * replicated_lsn is the xlog position of the most recent records written to
 * disk by the main loop; WalRcvNormalExit() reports it in the log.
 */
static PGconn *walrcv_conn;
static XLogRecPtr replicated_lsn = {0, 0};

/*
 * Flags set by interrupt handlers for later service in the main loop.
 *
 * shutdown_requested is set by WalRcvShutdownHandler() when SIGTERM arrives
 * inside a critical section; the main loop checks it after each write.
 */
static volatile sig_atomic_t shutdown_requested = false;

/* Signal handlers */
static void WalRcvShutdownHandler(SIGNAL_ARGS);
static void WalRcvQuickDieHandler(SIGNAL_ARGS);

/* Prototypes for private functions */
static void WalRcvNormalExit(void);
/* on_proc_exit callback; arguments are (exit code, callback argument) */
static void WalRcvCloseConn(int, Datum);

/*
 * Main entry point for walreceiver process
 *
 * Returns immediately when XLogReceivingActive() is false.  Otherwise this
 * sets up signal handling, a resource owner and a private memory context,
 * connects to walsender on the primary, adopts the primary's timeline ID,
 * and then loops: receive xlog records, write them to the disk, and report
 * the written position back to walsender.
 *
 * The loop is left only when PQpeekXLog() reports a negative length: -1 is
 * treated as a normal end of replication, anything else as a FATAL error.
 * Either way the process exits from here rather than returning.
 */
void
WalReceiverMain(void)
{
	char *buf;
	int len = 0;
	TimeLineID tli = 0;
	XLogRecPtr recptr = {0, 0};
	MemoryContext walreceiver_context;

	/*
	 * Do nothing if no walreceiver needed
	 */
	if (!XLogReceivingActive())
		return;

	am_walreceiver = true;

	/*
	 * If possible, make this process a group leader, so that the postmaster
	 * can signal any child processes too.	(walreceiver probably never has
	 * any child processes, but for consistency we make all postmaster child
	 * processes do this.)
	 */
#ifdef HAVE_SETSID
	if (setsid() < 0)
		elog(FATAL, "setsid() failed: %m");
#endif

	/*
	 * Properly accept or ignore signals the postmaster might send us
	 */
	pqsignal(SIGHUP, SIG_IGN);
	pqsignal(SIGINT, SIG_IGN);
	pqsignal(SIGTERM, WalRcvShutdownHandler);	/* request shutdown */
	pqsignal(SIGQUIT, WalRcvQuickDieHandler);	/* hard crash time */
	pqsignal(SIGALRM, SIG_IGN);
	pqsignal(SIGPIPE, SIG_IGN);
	pqsignal(SIGUSR1, SIG_IGN);
	pqsignal(SIGUSR2, SIG_IGN);

	/*
	 * Reset some signals that are accepted by postmaster but not here
	 */
	pqsignal(SIGCHLD, SIG_DFL);
	pqsignal(SIGTTIN, SIG_DFL);
	pqsignal(SIGTTOU, SIG_DFL);
	pqsignal(SIGCONT, SIG_DFL);
	pqsignal(SIGWINCH, SIG_DFL);

	/* We allow SIGQUIT (quickdie) at all times */
#ifdef HAVE_SIGPROCMASK
	sigdelset(&BlockSig, SIGQUIT);
#else
	BlockSig &= ~(sigmask(SIGQUIT));
#endif

	/*
	 * Create a resource owner to keep track of our resources (not clear that
	 * we need this, but may as well have one).
	 */
	CurrentResourceOwner = ResourceOwnerCreate(NULL, "Wal Receiver");

	/*
	 * Create a memory context that we will do all our work in.  We do this so
	 * that we can reset the context during error recovery and thereby avoid
	 * possible memory leaks.  Formerly this code just ran in
	 * TopMemoryContext, but resetting that would be a really bad idea.
	 */
	walreceiver_context = AllocSetContextCreate(TopMemoryContext,
											  "Wal Receiver",
											  ALLOCSET_DEFAULT_MINSIZE,
											  ALLOCSET_DEFAULT_INITSIZE,
											  ALLOCSET_DEFAULT_MAXSIZE);
	MemoryContextSwitchTo(walreceiver_context);

	/*
	 * Unblock signals (they were blocked when the postmaster forked us)
	 */
	PG_SETMASK(&UnBlockSig);

	/*
	 * Set up walreceiver-exit callback for closing the connection to
	 * walsender.  This is registered before the connection exists so that
	 * every exit path after this point goes through the same cleanup.
	 */
	on_proc_exit(WalRcvCloseConn, 0);

	/*
	 * Set up a connection for replication
	 */
	walrcv_conn = PQsetdbLogin(replicationPrimaryHost,
							   replicationPrimaryPort,
							   NULL, NULL, "walsender",
							   NULL, NULL);

	if (PQstatus(walrcv_conn) != CONNECTION_OK)
	{
		/*
		 * PQhost()/PQport() can return NULL (for instance when the
		 * connection object itself could not be allocated), and passing
		 * NULL for %s is not portable, so substitute a placeholder.
		 */
		const char *primary_host = PQhost(walrcv_conn);
		const char *primary_port = PQport(walrcv_conn);

		ereport(FATAL,
				(errmsg("Could not connect to the primary (%s:%s): %s",
						primary_host ? primary_host : "(unknown)",
						primary_port ? primary_port : "(unknown)",
						PQerrorMessage(walrcv_conn))));
	}

	/*
	 * We can query the primary database freely after the setup of connection
	 * (PQsetdbLogin) before sending a replication message (first PQpeekXLog).
	 * Maybe an additional authentication for replication should be here in
	 * the future.
	 */

	/*
	 * Send a replication message and get the timeline of the running primary.
	 */
	buf = PQpeekXLog(walrcv_conn,
					 &tli,
					 &recptr.xlogid,
					 &recptr.xrecoff,
					 &len, 0);

	/*
	 * The reply to the first message is expected to carry only a valid
	 * timeline ID and no xlog data.  Anything else (an error or end-of-stream
	 * length, unexpected data, or a zero timeline) means we failed to learn
	 * the primary's timeline.
	 */
	if (len != 0 || tli <= 0)
		ereport(FATAL,
				(errmsg("Could not get the timeline ID of the primary: %s",
						PQerrorMessage(walrcv_conn))));

	ThisTimeLineID = tli;

	PQmarkConsumed(walrcv_conn);

	/*
	 * Start archiver to archive the received xlog file segment.  There is
	 * nothing to wake up unless archiving is actually enabled.
	 */
	if (XLogArchivingActive())
		SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);

	for (;;)
	{
		/*
		 * Emergency bailout if postmaster has died.  This is to avoid the
		 * necessity for manual cleanup of all postmaster children.
		 */
		if (!PostmasterIsAlive(true))
			exit(1);

		/*
		 * Exit walreceiver if we're not in recovery. This should not happen
		 * because the startup process requests postmaster to terminate
		 * walreceiver before becoming non recovery mode. But, cross-check
		 * the status here.
		 */
		if (!IsRecoveryProcessingMode())
			ereport(FATAL,
					(errmsg("cannot perform replication not during recovery")));

		buf = PQpeekXLog(walrcv_conn,
						 &tli,
						 &recptr.xlogid,
						 &recptr.xrecoff,
						 &len, 0);

		if (len < 0)	/* end-of-replication or error */
			break;

		/*
		 * A synchronization mode varies according to the timing of sending
		 * commit message to walsender. For example, the following timings
		 * can be considered: we send commit message after
		 *
		 *    1) receiving xlog records and locate them on only memory
		 *    2) writing them to the disk
		 *    3) fsyncing them to the disk
		 *    4) redoing them
		 *    ...etc
		 *
		 * Now, only mode 2) is supported. In mode 2), xlog records might
		 * disappear if the standby falls down before writing them to the
		 * disk *sector* (which means fsyncing). But, in synchronous
		 * replication case, since such missing records are guaranteed to be
		 * on the alive primary, a transaction is not lost in the whole
		 * system (i.e. the standby can recover all transactions from xlog
		 * records on the primary). If the primary also falls down
		 * concurrently, since the commit reply has not been returned to a
		 * client yet, it doesn't matter if xlog records under writing
		 * disappear.
		 */
		if (buf)
		{
			bool use_existent;
			int startoffset;
			char activitymsg[40];

			START_CRIT_SECTION();

			/*
			 * Write the received xlog records to the disk.
			 *
			 * recptr is the position just past the received data, so the
			 * write starts len bytes before it.  An end offset of zero
			 * within the segment means the data ends exactly at the segment
			 * boundary, hence the substitution of XLogSegSize.
			 */
			XLogSegSwitchNeeded(recptr, len, &use_existent);
			startoffset = recptr.xrecoff % XLogSegSize;
			startoffset = ((startoffset == 0) ? XLogSegSize : startoffset) - len;
			XLogWriteSome(buf, len, startoffset);

			END_CRIT_SECTION();

			/*
			 * Save the current xlog position as a replicated one in order to
			 * report it when exiting walreceiver normally.
			 */
			replicated_lsn = recptr;

			/* Report replication progress in PS display */
			snprintf(activitymsg, sizeof(activitymsg), "replicated to %X/%X",
					 recptr.xlogid, recptr.xrecoff);
			set_ps_display(activitymsg, false);

			/*
			 * Exit walreceiver normally if shutdown is requested during
			 * writing the xlog records to the disk.  (The SIGTERM handler
			 * only sets the flag while we are inside the critical section.)
			 */
			if (shutdown_requested)
				WalRcvNormalExit();

			PQmarkConsumed(walrcv_conn);

			/*
			 * Send commit message to walsender.
			 */
			if (PQputXLogRecPtr(walrcv_conn, recptr.xlogid, recptr.xrecoff) == -1)
				ereport(FATAL,
						(errmsg("Could not send xlog position to the primary: %s",
								PQerrorMessage(walrcv_conn))));
		}
	}

	if (len == -1)	/* end-of-replication */
		WalRcvNormalExit();
	else	/* error */
		ereport(FATAL,
				(errmsg("unexpected EOF on replication connection: %s",
						PQerrorMessage(walrcv_conn))));
}

/*
 * Exit walreceiver normally
 *
 * Logs the last replicated xlog position (replicated_lsn) at LOG level and
 * then terminates the process through proc_exit(0).
 *
 * Besides the main loop, this is also called directly from
 * WalRcvShutdownHandler(), i.e. possibly in signal-handler context.
 */
static void
WalRcvNormalExit(void)
{
	ereport(LOG,
			(errmsg("replication done at %X/%X",
					replicated_lsn.xlogid, replicated_lsn.xrecoff)));
	proc_exit(0);	
}

/*
 * Close a connection to walsender
 *
 * Registered with on_proc_exit() by WalReceiverMain().  Both arguments (the
 * exit code and the callback argument) are unused.
 *
 * The callback is registered before the connection is opened, so
 * walrcv_conn may still hold its initial NULL value when this runs.
 * NOTE(review): this relies on PQfinish() accepting NULL -- confirm against
 * the libpq documentation.
 */
static void
WalRcvCloseConn(int code, Datum arg)
{
	PQfinish(walrcv_conn);
}

/* --------------------------------
 *		signal handler routines
 * --------------------------------
 */

/*
 * SIGTERM: exit normally, or set a flag to do so later
 *
 * When CritSectionCount is zero, WalRcvNormalExit() is called directly from
 * the handler.  Otherwise only shutdown_requested is set, and
 * WalReceiverMain() acts on it after leaving its critical section.
 *
 * NOTE(review): WalRcvNormalExit() calls ereport() and proc_exit() from
 * signal-handler context; presumably this relies on whatever was interrupted
 * outside a critical section being safe to abandon -- confirm.
 */
static void
WalRcvShutdownHandler(SIGNAL_ARGS)
{
	if (CritSectionCount == 0)
		WalRcvNormalExit();
	
	/*
	 * Delay shutdown if we are inside a critical section.
	 */
	shutdown_requested = true;
}

/* 
 * SIGQUIT: exit quickly
 * 
 * Some backend has bought the farm,
 * so we need to stop what we're doing and exit.
 *
 * The handler first installs the BlockSig signal mask and then leaves the
 * process with exit(2), without going through proc_exit().
 */
static void WalRcvQuickDieHandler(SIGNAL_ARGS)
{
	/* Switch to the BlockSig mask while we are on our way out */
	PG_SETMASK(&BlockSig);
	
	/*
	 * DO NOT proc_exit() -- we're here because shared memory may be
	 * corrupted, so we don't want to try to clean up our transaction. Just
	 * nail the windows shut and get out of town.
	 *
	 * Note we do exit(2) not exit(0).	This is to force the postmaster into a
	 * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
	 * backend.  This is necessary precisely because we don't clean up our
	 * shared memory state.
	 */
	exit(2);
}
