diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 49547ee..b651858 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2884,6 +2884,21 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-enable-hashagg-disk" xreflabel="enable_hashagg_disk">
+      <term><varname>enable_hashagg_disk</varname> (<type>boolean</type>)
+      <indexterm>
+       <primary><varname>enable_hashagg_disk</> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Enables or disables the query planner's use of hashed aggregation plan
+        types when the planner expects the hash table size to exceed
+        <varname>work_mem</varname>. The default is <literal>on</>.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-enable-hashjoin" xreflabel="enable_hashjoin">
       <term><varname>enable_hashjoin</varname> (<type>boolean</type>)
       <indexterm>
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 781a736..ca9f026 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -78,6 +78,8 @@ static void show_merge_append_keys(MergeAppendState *mstate, List *ancestors,
 					   ExplainState *es);
 static void show_agg_keys(AggState *astate, List *ancestors,
 			  ExplainState *es);
+static void show_agg_batching(AggState *astate, List *ancestors,
+			  ExplainState *es);
 static void show_group_keys(GroupState *gstate, List *ancestors,
 				ExplainState *es);
 static void show_sort_group_keys(PlanState *planstate, const char *qlabel,
@@ -1391,6 +1393,7 @@ ExplainNode(PlanState *planstate, List *ancestors,
 										   planstate, es);
 			break;
 		case T_Agg:
+			show_agg_batching((AggState *) planstate, ancestors, es);
 			show_agg_keys((AggState *) planstate, ancestors, es);
 			show_upper_qual(plan->qual, "Filter", planstate, ancestors, es);
 			if (plan->qual)
@@ -1790,6 +1793,37 @@ show_agg_keys(AggState *astate, List *ancestors,
 }
 
 /*
+ * Show the batching info for an Agg node (number of batches, smallest and
+ * largest batch size, and the share of tuples that had to be written to
+ * and re-read from batch files). Shown only for EXPLAIN ANALYZE of hashed
+ * aggregation.
+ */
+static void
+show_agg_batching(AggState *astate, List *ancestors,
+			  ExplainState *es)
+{
+	Agg		   *plan = (Agg *) astate->ss.ps.plan;
+
+	if ((es->analyze) && (plan->aggstrategy == AGG_HASHED))
+	{
+		if (es->format == EXPLAIN_FORMAT_TEXT)
+		{
+			appendStringInfoSpaces(es->str, es->indent * 2);
+			appendStringInfo(es->str, "Batch Count: %d  Original: %d  Smallest: %ldkB  Largest: %ldkB  Rescanned: %.0f%%\n",
+							 astate->nbatch, astate->nbatch_original,
+							 (long) (astate->batch_min_size / 1024),
+							 (long) (astate->batch_max_size / 1024),
+							 (astate->ntuples_scanned > 0) ?
+							 (astate->ntuples_rescanned * 100.0 / astate->ntuples_scanned) : 0);
+		}
+		else
+		{
+			ExplainPropertyLong("Batch Count", astate->nbatch, es);
+			ExplainPropertyLong("Batch Count Original", astate->nbatch_original, es);
+			ExplainPropertyLong("Batch Smallest", astate->batch_min_size / 1024, es);
+			ExplainPropertyLong("Batch Largest", astate->batch_max_size / 1024, es);
+			ExplainPropertyLong("Batch Rescan Rate",
+								(astate->ntuples_scanned > 0) ?
+								(astate->ntuples_rescanned * 100) / astate->ntuples_scanned : 0,
+								es);
+		}
+	}
+}
+
+/*
  * Show the grouping keys for a Group node.
  */
 static void
diff --git a/src/backend/executor/nodeAgg.c b/src/backend/executor/nodeAgg.c
index 6455864..d0e30b1 100644
--- a/src/backend/executor/nodeAgg.c
+++ b/src/backend/executor/nodeAgg.c
@@ -96,6 +96,8 @@
 
 #include "postgres.h"
 
+#include <limits.h>
+
 #include "access/htup_details.h"
 #include "catalog/objectaccess.h"
 #include "catalog/pg_aggregate.h"
@@ -108,6 +110,7 @@
 #include "optimizer/tlist.h"
 #include "parser/parse_agg.h"
 #include "parser/parse_coerce.h"
+#include "storage/buffile.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
@@ -115,7 +118,11 @@
 #include "utils/syscache.h"
 #include "utils/tuplesort.h"
 #include "utils/datum.h"
+#include "utils/dynahash.h"
 
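+/* bounds on the number of batches (partitions) used by disk-based hash aggregation */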
+#define HASH_DISK_MIN_PARTITIONS		2
+#define HASH_DISK_DEFAULT_PARTITIONS	4
+#define HASH_DISK_MAX_PARTITIONS		256
 
 /*
  * AggStatePerAggData - per-aggregate working state for the Agg scan
@@ -310,7 +317,6 @@ typedef struct AggHashEntryData
 
 }	AggHashEntryData;	/* VARIABLE LENGTH STRUCT */
 
-
 static void initialize_aggregates(AggState *aggstate,
 					  AggStatePerAgg peragg,
 					  AggStatePerGroup pergroup);
@@ -332,22 +338,44 @@ static Bitmapset *find_unaggregated_cols(AggState *aggstate);
 static bool find_unaggregated_cols_walker(Node *node, Bitmapset **colnos);
 static void build_hash_table(AggState *aggstate, Size tuple_width);
 static AggHashEntry lookup_hash_entry(AggState *aggstate,
-				  TupleTableSlot *inputslot);
+					uint32 hashvalue, TupleTableSlot *inputslot);
+static void create_hash_entry(AggState *aggstate, AggHashEntry entry);
 static TupleTableSlot *agg_retrieve_direct(AggState *aggstate);
-static void agg_fill_hash_table(AggState *aggstate);
+static bool agg_fill_hash_table(AggState *aggstate);
 static TupleTableSlot *agg_retrieve_hash_table(AggState *aggstate);
 static Datum GetAggInitVal(Datum textInitVal, Oid transtype);
 
 static uint32 compute_hash_value(AggState * aggstate, TupleTableSlot * slot);
 static uint32 compute_bucket(AggState * aggstate, uint32 hashvalue);
+static uint32 compute_batchno(AggState * aggstate, uint32 hashvalue);
 static bool groups_match(AggState * aggstate, TupleTableSlot *slot, AggHashEntry entry);
 static void increase_nbuckets(AggState * aggstate);
+static void increase_nbatches(AggState * aggstate);
 static char * chunk_alloc(AggHashTable htab, int size);
 static void reset_hash_table(AggHashTable htab);
 
+static int choose_nbatch(AggState *aggstate, int nbuckets, Size tuple_width);
+static void init_batch_files(AggState * aggstate);
+static void close_batch_files(AggState * aggstate);
+static void reinit_batch_files(AggState * aggstate);
+
 static void IteratorReset(AggHashTable htab);
 static AggHashEntry IteratorGetNext(AggHashTable htab);
 
+static TupleTableSlot *read_saved_tuple(BufFile *file, uint32 *hashvalue,
+				 TupleTableSlot *tupleSlot);
+static AggHashEntry read_saved_group(AggState * aggstate, BufFile *file,
+				 AggHashEntry entry);
+
+static void save_tuple(AggState *aggstate, int batchno, TupleTableSlot *slot,
+		   uint32 hashvalue);
+static void save_group(AggState *aggstate, int batchno, AggHashEntry entry);
+
+static bool batching_supported(AggState * aggstate);
+
 /*
  * The size of the chunks for dense allocation. This needs to be >8kB
  * because the default (and only) memory context implementation uses
@@ -392,6 +420,7 @@ typedef struct AggHashTableData
 	int	nentries;		/* number of hash table entries */
 	int	nbuckets;		/* current number of buckets */
 	int	nbuckets_max;	/* max number of buckets */
+	int	nbuckets_bits;	/* bits for nbuckets_max (used for batching) */
 
 	/* items copied from the TupleHashTable, because we still need them */
 	MemoryContext	tmpctx;		/* short-lived memory context (hash/eq funcs) */
@@ -412,6 +441,7 @@ typedef struct AggHashTableData
 	 */
 	HashChunk		cur_chunk;
 	AggHashEntry	cur_entry;
+	int				niterated;	/* number of entries returned by the iterator */
 
 	/* list of chunks with dense-packed entries / minimal tuples */
 	HashChunk		chunks_hash;
@@ -1027,6 +1057,7 @@ build_hash_table(AggState *aggstate, Size tuple_width)
 	/* we assume 1024 buckets (i.e. 8kB of memory) is minimum */
 	int nbuckets     = 1024;
 	int nbuckets_max = 1024;
+	int nbuckets_bits = 10;
 
 	Assert(node->aggstrategy == AGG_HASHED);
 	Assert(node->numGroups > 0);
@@ -1076,7 +1107,10 @@ build_hash_table(AggState *aggstate, Size tuple_width)
 	 *     save a bit of memory by that (although not much).
 	 */
 	while (nbuckets_max * groupsize <= work_mem * 1024L)
+	{
 		nbuckets_max *= 2;
+		nbuckets_bits += 1;
+	}
 
 	/*
 	 * Update the initial number of buckets to match expected number of groups,
@@ -1096,12 +1130,15 @@ build_hash_table(AggState *aggstate, Size tuple_width)
 	htab = (AggHashTable)MemoryContextAllocZero(aggstate->aggcontext,
 											sizeof(AggHashTableData));
 
+	htab->niterated = 0;
+
 	/* TODO create a memory context for the hash table */
-	htab->htabctx = AllocSetContextCreate(aggstate->aggcontext,
+	htab->htabctx = AllocSetContextCreateTracked(aggstate->aggcontext,
 											"HashAggHashTable",
 											ALLOCSET_DEFAULT_MINSIZE,
 											ALLOCSET_DEFAULT_INITSIZE,
-											ALLOCSET_DEFAULT_MAXSIZE);
+											ALLOCSET_DEFAULT_MAXSIZE,
+											true);
 
 	/* buckets are just pointers to AggHashEntryData structures */
 	htab->buckets = (AggHashEntry*)MemoryContextAllocZero(htab->htabctx,
@@ -1115,6 +1152,7 @@ build_hash_table(AggState *aggstate, Size tuple_width)
 
 	htab->nbuckets = nbuckets;
 	htab->nbuckets_max = nbuckets_max;
+	htab->nbuckets_bits = nbuckets_bits;
 	htab->nentries = 0;
 	htab->slot = NULL;
 	htab->numCols = node->numCols;
@@ -1198,15 +1236,14 @@ hash_agg_entry_size(int numAggs)
  * When called, CurrentMemoryContext should be the per-query context.
  */
 static AggHashEntry
-lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot)
+lookup_hash_entry(AggState *aggstate, uint32 hashvalue,
+TupleTableSlot *inputslot)
 {
 
 	AggHashEntry entry = NULL;
-	uint32		hashvalue;
 	uint32		bucketno;
 	MinimalTuple mintuple;
 
-	hashvalue = compute_hash_value(aggstate, inputslot);
 	bucketno = compute_bucket(aggstate, hashvalue);
 
 	entry = aggstate->hashtable->buckets[bucketno];
@@ -1223,9 +1260,11 @@ lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot)
 		entry = entry->next;
 	}
 
-	/* There's not a maching entry in the bucket, so create a new one and
-	 * copy in data both for the aggregates, and the MinimalTuple containing
-	 * keys for the group columns. */
+	/*
+	 * There's no matching entry in the bucket, so create a new one and
+	 * copy in the data, both for the aggregates and for the MinimalTuple
+	 * containing the grouping keys.
+	 */
 	if (entry == NULL)
 	{
 
@@ -1265,16 +1304,48 @@ lookup_hash_entry(AggState *aggstate, TupleTableSlot *inputslot)
 
 		aggstate->hashtable->nentries += 1;
 
-	}
+		/* once we exceed 1 entry / bucket, increase number of buckets */
+		if (aggstate->hashtable->nentries > aggstate->hashtable->nbuckets)
+			increase_nbuckets(aggstate);
 
-	/* once we exceed 1 entry / bucket, increase number of buckets */
-	if (aggstate->hashtable->nentries > aggstate->hashtable->nbuckets)
-		increase_nbuckets(aggstate);
+	}
 
 	return entry;
 }
 
 /*
+ * Create a new entry in the hash table, containing the provided data.
+ * This assumes there is no matching entry yet (this is not checked, and
+ * it's up to the caller not to break that assumption).
+ *
+ * This is used when adding entries with aggregate states read from
+ * a batch file.
+ */
+static void
+create_hash_entry(AggState *aggstate, AggHashEntry entry)
+{
+	AggHashEntry entry_new
+		= (AggHashEntry) chunk_alloc(aggstate->hashtable,
+					aggstate->hashtable->entrysize + entry->tuple->t_len);
+
+	AggHashTable htab = aggstate->hashtable;
+
+	int bucketno = compute_bucket(aggstate, entry->hashvalue);
+
+	Assert((bucketno >= 0) && (bucketno < htab->nbuckets));
+	Assert(aggstate->cur_batch == compute_batchno(aggstate, entry->hashvalue));
+
+	memcpy(entry_new, entry, htab->entrysize);
+
+	entry_new->tuple = (MinimalTuple)((char*)entry_new + htab->entrysize);
+
+	memcpy(entry_new->tuple, entry->tuple, entry->tuple->t_len);
+
+	entry_new->next = htab->buckets[bucketno];
+	htab->buckets[bucketno] = entry_new;
+
+}
+
+/*
  * ExecAgg -
  *
  *	  ExecAgg receives tuples from its outer subplan and aggregates over
@@ -1318,9 +1389,16 @@ ExecAgg(AggState *node)
 	/* Dispatch based on strategy */
 	if (((Agg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED)
 	{
-		if (!node->table_filled)
-			agg_fill_hash_table(node);
-		return agg_retrieve_hash_table(node);
+		TupleTableSlot *slot = NULL;
+
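+		/*
+		 * Loop until we either find a group to return, or run out of
+		 * batches to process.
+		 */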
+		while (slot == NULL)
+		{
+			if (!node->table_filled)
+				if (! agg_fill_hash_table(node))
+					break;	/* no more batches to process */
+			slot = agg_retrieve_hash_table(node);
+		}
+		return slot;
 	}
 	else
 		return agg_retrieve_direct(node);
@@ -1536,13 +1614,16 @@ agg_retrieve_direct(AggState *aggstate)
 /*
  * ExecAgg for hashed case: phase 1, read input and build hash table
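+ *
+ * With batching, tuples that hash into a later batch are written into a
+ * temporary file instead of being added to the hash table, and this
+ * function is called once per batch: the first call reads tuples from the
+ * outer plan, the following calls read back the tuples and partially
+ * aggregated groups saved for the current batch. Returns true when a
+ * batch was loaded, false when there are no more batches to process.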
  */
-static void
+static bool
 agg_fill_hash_table(AggState *aggstate)
 {
 	PlanState  *outerPlan;
 	ExprContext *tmpcontext;
 	AggHashEntry entry;
-	TupleTableSlot *outerslot;
+	TupleTableSlot *outerslot = NULL;
+	int64		ntuples = 0;
+	Size		allocated;
+	BufFile	   *infile = NULL;
 
 	/*
 	 * get state info from node
@@ -1551,33 +1632,172 @@ agg_fill_hash_table(AggState *aggstate)
 	/* tmpcontext is the per-input-tuple expression context */
 	tmpcontext = aggstate->tmpcontext;
 
+	/* if there are no more batches, we're done */
+	if (aggstate->cur_batch == aggstate->nbatch)
+	{
+		aggstate->agg_done = true;
+		return false;
+	}
+
+	/* if not the first time through, reinitialize */
+	if (aggstate->cur_batch > 0)
+	{
+
+		BufFile *file_groups = aggstate->batched_groups[aggstate->cur_batch];
+
+		/* entry buffer reused by all the read_saved_group calls, to minimize
+		 * palloc overhead (the last call frees it when it hits end of file) */
+		AggHashEntry entry = (AggHashEntry)palloc0(aggstate->hashtable->entrysize);
+
+		/* reset the hash table (free the chunks, zero buckets, ...) */
+		reset_hash_table(aggstate->hashtable);
+
+		/* read all the aggregate states and either insert them into the
+		 * hash table, or move them to the proper batch */
+		BufFileSeek(file_groups, 0, 0L, SEEK_SET);
+
+		while ((entry = read_saved_group(aggstate, file_groups, entry)) != NULL)
+		{
+			/* XXX hashjoin uses a single call to compute both bucket
+			 * and batch, maybe we could do the same here (and pass
+			 * bucketno to create_hash_entry) */
+			int batchno = compute_batchno(aggstate, entry->hashvalue);
+
+			if (batchno == aggstate->cur_batch)
+				/* keep in the current batch */
+				create_hash_entry(aggstate, entry);
+			else
+				/* move to the proper batch */
+				save_group(aggstate, batchno, entry);
+		}
+
+		/* we're done with the temp file */
+		BufFileClose(file_groups);
+		aggstate->batched_groups[aggstate->cur_batch] = NULL;
+
+		/* prepare to read the saved tuples */
+		BufFileSeek(aggstate->batched_tuples[aggstate->cur_batch], 0, 0L, SEEK_SET);
+		infile = aggstate->batched_tuples[aggstate->cur_batch];
+	}
+
 	/*
 	 * Process each outer-plan tuple, and then fetch the next one, until we
 	 * exhaust the outer plan.
 	 */
 	for (;;)
 	{
-		outerslot = ExecProcNode(outerPlan);
-		if (TupIsNull(outerslot))
-			break;
-		/* set up for advance_aggregates call */
-		tmpcontext->ecxt_outertuple = outerslot;
 
-		/* Find or build hashtable entry for this tuple's group */
-		entry = lookup_hash_entry(aggstate, outerslot);
+		uint32 hashvalue;
+		int batchno = 0;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/*
+		 * In the first batch, read tuples from the outer plan; in the
+		 * following batches, read back the tuples saved for this batch.
+		 */
+		if (aggstate->cur_batch == 0)
+		{
+			outerslot = ExecProcNode(outerPlan);
+
+			if (TupIsNull(outerslot))
+				break;
+
+			/* copy the tuple descriptor for the following batches */
+			if (aggstate->tupdesc == NULL)
+			{
+				MemoryContext oldContext = MemoryContextSwitchTo(aggstate->aggcontext);
+				aggstate->tupdesc = CreateTupleDescCopy(outerslot->tts_tupleDescriptor);
+				MemoryContextSwitchTo(oldContext);
+			}
+
+			hashvalue = compute_hash_value(aggstate, outerslot);
+			aggstate->ntuples_scanned += 1;
+		}
+		else
+		{
+
+			if (outerslot == NULL)
+				outerslot = MakeSingleTupleTableSlot(aggstate->tupdesc);
+
+			outerslot = read_saved_tuple(infile, &hashvalue, outerslot);
+			if (TupIsNull(outerslot))
+			{
+				BufFileClose(infile);
+				aggstate->batched_tuples[aggstate->cur_batch] = NULL;
+				break;
+			}
+
+			aggstate->ntuples_rescanned += 1;
+		}
+
+		ntuples++;
+
+		batchno = compute_batchno(aggstate, hashvalue);
+
+		Assert(batchno >= aggstate->cur_batch);
+
+		if (batchno == aggstate->cur_batch) {
+
+			/* Find or build hashtable entry for this tuple's group */
+			entry = lookup_hash_entry(aggstate, hashvalue, outerslot);
+
+			/* set up for advance_aggregates call */
+			tmpcontext->ecxt_outertuple = outerslot;
 
-		/* Advance the aggregates */
-		advance_aggregates(aggstate, entry->pergroup);
+			/* Advance the aggregates */
+			advance_aggregates(aggstate, entry->pergroup);
 
-		/* Reset per-input-tuple context after each tuple */
-		ResetExprContext(tmpcontext);
+			/* Reset per-input-tuple context after each tuple */
+			ResetExprContext(tmpcontext);
+
+			/* have we exceeded work_mem? if yes, increase number of batches
+			 * 
+			 * FIXME This uses htabctx, which is OK for states using
+			 *       pass-by-value types, but it's not really correct
+			 *       in general (use aggcontext instead).
+			 */
+			if (MemoryContextGetAllocated(aggstate->hashtable->htabctx, true) >= work_mem * 1024L)
+				increase_nbatches(aggstate);
+
+		} else {
+
+			/* the tuple belongs to a later batch - save it to its batch file */
+			save_tuple(aggstate, batchno, outerslot, hashvalue);
+
+		}
 	}
 
+	allocated = MemoryContextGetAllocated(aggstate->hashtable->htabctx, true);
+
+	/* keep track of the largest/smallest batch size */
+	if (aggstate->cur_batch == 0)
+	{
+		aggstate->batch_min_size = allocated;
+		aggstate->batch_max_size = allocated;
+	}
+	else
+	{
+		if (allocated < aggstate->batch_min_size)
+			aggstate->batch_min_size = allocated;
+		if (allocated > aggstate->batch_max_size)
+			aggstate->batch_max_size = allocated;
+	}
+
+	/* if we're in the first batch, and there were 0 tuples, we're done */
+	if ((aggstate->cur_batch == 0) && (aggstate->ntuples_scanned == 0))
+	{
+		aggstate->agg_done = true;
+		return false;
+	}
+
+	/* the hash table is filled, and we're ready for the next batch */
 	aggstate->table_filled = true;
+	aggstate->cur_batch += 1;
 
 	/* Initialize for iteration through the table (first bucket / entry) */
 	IteratorReset(aggstate->hashtable);
 
+	/* ready to return groups from this hash table */
+	return true;
+
 }
 
 /*
@@ -1620,6 +1840,8 @@ agg_retrieve_hash_table(AggState *aggstate)
 		 */
 		ResetExprContext(econtext);
 
+		htab->niterated += 1;
+
 		/*
 		* Store the copied first input tuple in the tuple table slot reserved
 		* for it, so that it can be used in ExecProject.
@@ -1677,7 +1899,8 @@ agg_retrieve_hash_table(AggState *aggstate)
 
 	}
 
-	aggstate->agg_done = true;
+	/* No more entries in hashtable, so done with this batch */
+	aggstate->table_filled = false;
 
 	/* No more groups */
 	return NULL;
@@ -1739,11 +1962,11 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	 * recover no-longer-wanted space.
 	 */
 	aggstate->aggcontext =
-		AllocSetContextCreate(CurrentMemoryContext,
+		AllocSetContextCreateTracked(CurrentMemoryContext,
 							  "AggContext",
 							  ALLOCSET_DEFAULT_MINSIZE,
 							  ALLOCSET_DEFAULT_INITSIZE,
-							  ALLOCSET_DEFAULT_MAXSIZE);
+							  ALLOCSET_DEFAULT_MAXSIZE, true);
 
 	/*
 	 * tuple table initialization
@@ -1842,10 +2065,29 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 
 	if (node->aggstrategy == AGG_HASHED)
 	{
+		MemoryContext oldContext;
+
 		build_hash_table(aggstate, outerPlan->plan_width);
 		aggstate->table_filled = false;
+
+		aggstate->tupdesc = NULL;
+		aggstate->nbatch = choose_nbatch(aggstate, aggstate->hashtable->nbuckets, outerPlan->plan_width);
+		aggstate->nbatch_original = aggstate->nbatch;
+		aggstate->cur_batch = 0;
+
+		/* initialize temporary files for batched tuples/groups */
+		init_batch_files(aggstate);
+
+		/* explain (analyze) counters */
+		aggstate->ntuples_scanned = 0;
+		aggstate->ntuples_rescanned = 0;
+
 		/* Compute the columns we actually need to hash on */
 		aggstate->hash_needed = find_hash_columns(aggstate);
+
+		/* prime with initial work item to read from outer plan */
+		oldContext = MemoryContextSwitchTo(aggstate->aggcontext);
+		MemoryContextSwitchTo(oldContext);
 	}
 	else
 	{
@@ -2198,6 +2440,18 @@ ExecInitAgg(Agg *node, EState *estate, int eflags)
 	/* Update numaggs to match number of unique aggregates found */
 	aggstate->numaggs = aggno + 1;
 
+	/* check whether we can do batching */
+	aggstate->batching_enabled = batching_supported(aggstate);
+
+	/*
+	 * If in hashed mode, with no batching, disable nbuckets_max limit,
+	 * because if we're gonna exhaust memory, there's not much
+	 * difference between doing that fast and slow. It's equally bad
+	 * either way :-/
+	 */
+	if ((node->aggstrategy == AGG_HASHED) && (! aggstate->batching_enabled))
+		aggstate->hashtable->nbuckets_max = INT_MAX/2;
+
 	return aggstate;
 }
 
@@ -2245,6 +2499,10 @@ ExecEndAgg(AggState *node)
 	/* clean up tuple table */
 	ExecClearTuple(node->ss.ss_ScanTupleSlot);
 
+	/* properly close the batch files (in batching mode) */
+	if (node->nbatch != 0)
+		close_batch_files(node);
+
 	MemoryContextDelete(node->aggcontext);
 
 	outerPlan = outerPlanState(node);
@@ -2264,22 +2522,24 @@ ExecReScanAgg(AggState *node)
 	if (((Agg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED)
 	{
 		/*
-		 * In the hashed case, if we haven't yet built the hash table then we
-		 * can just return; nothing done yet, so nothing to undo. If subnode's
-		 * chgParam is not NULL then it will be re-scanned by ExecProcNode,
-		 * else no reason to re-scan it at all.
+		 * In the hashed case, if we haven't done any execution work yet, we
+		 * can just return; nothing to undo. If subnode's chgParam is not NULL
+		 * then it will be re-scanned by ExecProcNode, else no reason to
+		 * re-scan it at all.
 		 */
-		if (!node->table_filled)
-			return;
+		/*
+		 * FIXME maybe it was a good idea to have hash_init_state ...
+		 *
+		 * if (node->hash_init_state)
+		 *     return;
+		 */
 
 		/*
-		 * If we do have the hash table and the subplan does not have any
-		 * parameter changes, then we can just rescan the existing hash table;
-		 * no need to build it again.
+		 * If we do have the hash table, it never went to disk, and the
+		 * subplan does not have any parameter changes, then we can just
+		 * rescan the existing hash table; no need to build it again.
 		 */
-		if (node->ss.ps.lefttree->chgParam == NULL)
+		if (node->ss.ps.lefttree->chgParam == NULL && (node->nbatch == 1))
 		{
 			IteratorReset(node->hashtable);
+			node->table_filled = true;
 			return;
 		}
 	}
@@ -2318,10 +2578,43 @@ ExecReScanAgg(AggState *node)
 
 	if (((Agg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED)
 	{
-		Plan * outerPlan = outerPlan((Agg *) node->ss.ps.plan);
+		Plan *outerPlan = outerPlan((Agg *) node->ss.ps.plan);
+
 		/* Rebuild an empty hash table */
 		build_hash_table(node, outerPlan->plan_width);
 		node->table_filled = false;
+		node->tupdesc = NULL;
+
+		/* reset the current batch, but remember the nbatch */
+		node->cur_batch = 0;
+
+		/* XXX The way we work with the temporary files right now is that
+		 * on rescan we throw them away and start over. The problem is
+		 * that when the rescan triggers somewhere after the initial
+		 * batch and before completing all the batches, we don't know
+		 * which groups/tuples were already moved (copied) to the following
+		 * batches (so we can't just move them again). Also, we close the
+		 * files as soon as we complete reading them.
+		 *
+		 * We could however improve this by keeping the files open until
+		 * ExecEndAgg, remembering which tuples / groups we've already
+		 * moved to the appropriate batch (a batchno/tupleno pair should
+		 * be enough), and only moving the tuples after that point.
+		 *
+		 * The problem is with the initial batch, which by default exists
+		 * only in memory. We could serialize it to a file once it's
+		 * complete (only the groups, which should be less than work_mem),
+		 * but that's likely to impact even plans that don't require the
+		 * rescan, and it's not clear whether we can know in advance that
+		 * a rescan is likely to happen.
+		 */
+
+		/* reinitialize the files with batched tuples/groups */
+		reinit_batch_files(node);
+
+		/* explain (analyze) counters */
+		node->ntuples_scanned = 0;
+		node->ntuples_rescanned = 0;
 	}
 	else
 	{
@@ -2649,9 +2942,6 @@ increase_nbuckets(AggState * aggstate)
 		/* position within the buffer (up to chunk->used) */
 		size_t idx = 0;
 
-		/* we have a whole number of entries */
-		Assert(chunk->used % htab->entrysize == 0);
-
 		/* process all tuples stored in this chunk (and then free it) */
 		while (idx < chunk->used)
 		{
@@ -2675,6 +2965,120 @@ increase_nbuckets(AggState * aggstate)
 
 }
 
+
+/*
+ * Increase the number of batches - we double it by default, but we may
+ * grow faster if needed. Unlike increasing the number of buckets, this
+ * moves roughly half of the entries out to batch files (when doubling
+ * the number of batches).
+ *
+ * We keep the number of buckets etc. because we expect the table to
+ * grow further.
+ */
+static void
+increase_nbatches(AggState * aggstate)
+{
+
+	HashChunk chunk, chunk_prev;
+	AggHashTable htab = aggstate->hashtable;
+	int i, nbatch_old = aggstate->nbatch;
+	MemoryContext oldctx;
+
+	/* remember the old chunks (and reset to NULL, to allocate new ones) */
+	HashChunk oldchunks = htab->chunks_hash;
+	htab->chunks_hash = NULL;
+
+	aggstate->nbatch      *= 2;
+	aggstate->hashtable->nentries = 0;
+
+	oldctx = MemoryContextSwitchTo(aggstate->aggcontext);
+
+	/* initialize appropriate number of temporary files */
+	if (aggstate->nbatch == 2)
+	{
+		aggstate->batched_groups = (BufFile**)palloc0(2*sizeof(BufFile*));
+		aggstate->batched_tuples = (BufFile**)palloc0(2*sizeof(BufFile*));
+	}
+	else
+	{
+		aggstate->batched_groups = (BufFile**)repalloc(aggstate->batched_groups, aggstate->nbatch * sizeof(BufFile*));
+		aggstate->batched_tuples = (BufFile**)repalloc(aggstate->batched_tuples, aggstate->nbatch * sizeof(BufFile*));
+	}
+
+	for (i = nbatch_old; i < aggstate->nbatch; i++)
+	{
+		aggstate->batched_groups[i] = BufFileCreateTemp(false);
+		aggstate->batched_tuples[i] = BufFileCreateTemp(false);
+	}
+
+	MemoryContextSwitchTo(oldctx);
+
+	/* reset the buckets (we'll rebuild them from scratch) */
+	memset(htab->buckets, 0, htab->nbuckets * sizeof(AggHashEntry));
+
+	chunk = oldchunks;
+	while (chunk != NULL)
+	{
+
+		/* position within the buffer (up to chunk->used) */
+		size_t idx = 0;
+
+		/* current chunk (so that we can pfree it at the end) */
+		chunk_prev = chunk;
+
+		/* process all tuples stored in this chunk (and then free it) */
+		while (idx < chunk->used)
+		{
+
+			AggHashEntry entry = (AggHashEntry)(chunk->data + idx);
+
+			/* this already uses the updated nbatch values */
+			int batchno = compute_batchno(aggstate, entry->hashvalue);
+
+			Assert(batchno >= aggstate->cur_batch);
+
+			if (batchno == aggstate->cur_batch) {
+
+				/*
+				 * If the batch number is still cur_batch, copy it to
+				 * a new chunk, and put it into the proper bucket.
+				 */
+
+				int bucketno = compute_bucket(aggstate, entry->hashvalue);
+
+				AggHashEntry entry_new = (AggHashEntry)chunk_alloc(htab,
+								htab->entrysize + entry->tuple->t_len);
+
+				memcpy(entry_new, entry, htab->entrysize + entry->tuple->t_len);
+
+				entry_new->tuple = (MinimalTuple)((char*)entry_new + htab->entrysize);
+
+				/* fine, just put the entry into the bucket */
+				entry_new->next = htab->buckets[bucketno];
+				htab->buckets[bucketno] = entry_new;
+
+				aggstate->hashtable->nentries += 1;
+
+			} else {
+
+				/* different batch - save the group */
+				save_group(aggstate, batchno, entry);
+
+			}
+
+			/* bytes occupied in memory: entry overhead + actual tuple length */
+			idx += htab->entrysize + entry->tuple->t_len;
+
+		}
+
+		/* proceed to the next chunk */
+		chunk = chunk->next;
+
+		pfree(chunk_prev);
+	}
+
+}
+
 static
 char * chunk_alloc(AggHashTable htab, int size)
 {
@@ -2804,6 +3208,7 @@ AggHashEntry IteratorGetNext(AggHashTable htab)
 	 * So compute how many bytes we need to skip to the next entry.
 	 */
 	entry = htab->cur_entry;
+
 	len = entry->tuple->t_len + htab->entrysize;
 
 	/*
@@ -2826,6 +3231,166 @@ AggHashEntry IteratorGetNext(AggHashTable htab)
 
 }
 
+
+/*
+ * save_tuple
+ *
+ * The tuple belongs to a batch other than the current one, so it cannot
+ * be added to the hash table now. Save it into the appropriate batch file,
+ * to be aggregated when that batch is processed.
+ */
+static void
+save_tuple(AggState *aggstate, int batchno, TupleTableSlot *slot,
+		   uint32 hashvalue)
+{
+	MinimalTuple		 tuple;
+	BufFile				*file;
+	int					 written;
+
+	Assert(batchno > aggstate->cur_batch);
+
+	file = aggstate->batched_tuples[batchno];
+
+	tuple = ExecFetchSlotMinimalTuple(slot);
+
+	written = BufFileWrite(file, (void *) &hashvalue, sizeof(uint32));
+	if (written != sizeof(uint32))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to HashAgg temporary file: %m")));
+
+	written = BufFileWrite(file, (void *) tuple, tuple->t_len);
+	if (written != tuple->t_len)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to HashAgg temporary file: %m")));
+}
+
+
+/*
+ * save_group
+ *
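+ *		Serialize a group - the fixed-size part of the hash entry (which
+ *		includes the aggregate states) followed by its MinimalTuple - into
+ *		the temporary file of the batch the group belongs to, so that it
+ *		can be reloaded when that batch is processed.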
+ */
+static void
+save_group(AggState *aggstate, int batchno, AggHashEntry entry)
+{
+	MinimalTuple		 tuple;
+	BufFile				*file;
+	int					 written;
+
+	Assert(batchno > aggstate->cur_batch);
+
+	file = aggstate->batched_groups[batchno];
+	tuple = entry->tuple;
+
+	written = BufFileWrite(file, (void *) entry, aggstate->hashtable->entrysize);
+	if (written != aggstate->hashtable->entrysize)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to HashAgg temporary file: %m")));
+
+	written = BufFileWrite(file, (void *) tuple, tuple->t_len);
+	if (written != tuple->t_len)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not write to HashAgg temporary file: %m")));
+}
+
+
+/*
+ * read_saved_tuple
+ *		read the next tuple from a batch file.  Return NULL if no more.
+ *
+ * On success, *hashvalue is set to the tuple's hash value, and the tuple
+ * itself is stored in the given slot.
+ *
+ * Copied with minor modifications from ExecHashJoinGetSavedTuple.
+ */
+static TupleTableSlot *
+read_saved_tuple(BufFile *file, uint32 *hashvalue, TupleTableSlot *tupleSlot)
+{
+	uint32		header[2];
+	size_t		nread;
+	MinimalTuple tuple;
+
+	/*
+	 * Since both the hash value and the MinimalTuple length word are uint32,
+	 * we can read them both in one BufFileRead() call without any type
+	 * cheating.
+	 */
+	nread = BufFileRead(file, (void *) header, sizeof(header));
+	if (nread == 0)				/* end of file */
+	{
+		ExecClearTuple(tupleSlot);
+		return NULL;
+	}
+	if (nread != sizeof(header))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from HashAgg temporary file: %m")));
+	*hashvalue = header[0];
+	tuple = (MinimalTuple) palloc(header[1]);
+	tuple->t_len = header[1];
+	nread = BufFileRead(file,
+						(void *) ((char *) tuple + sizeof(uint32)),
+						header[1] - sizeof(uint32));
+	if (nread != header[1] - sizeof(uint32))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from HashAgg temporary file: %m")));
+	return ExecStoreMinimalTuple(tuple, tupleSlot, true);
+}
+
+/*
+ * read_saved_group
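+ *		Read the next serialized group from a batch file into the entry
+ *		buffer provided by the caller, allocating a fresh MinimalTuple for
+ *		the grouping keys. Returns NULL at end of file, in which case the
+ *		entry buffer is freed.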
+ */
+static AggHashEntry
+read_saved_group(AggState * aggstate, BufFile *file, AggHashEntry entry)
+{
+	uint32		tlen;
+	size_t		nread;
+
+	/* we know the size of the entry, we don't know the tuple size yet */
+
+	Assert(entry != NULL);
+
+	/* always release the tuple (well, maybe we could keep track of the
+	 * allocated space and reuse the tuple buffer) */
+	if (entry->tuple != NULL)
+		pfree(entry->tuple);
+
+	nread = BufFileRead(file, (void *)entry, aggstate->hashtable->entrysize);
+	if (nread == 0)				/* end of file */
+	{
+		pfree(entry);
+		return NULL;
+	}
+
+	if (nread != aggstate->hashtable->entrysize)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from HashAgg temporary file: %m")));
+
+	/* now, we need to read the actual tuple - first, read the length */
+	nread = BufFileRead(file, (void *)&tlen, sizeof(uint32));
+	if (nread != sizeof(uint32))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from HashAgg temporary file: %m")));
+
+	/* now, allocate space for the tuple and read the rest */
+	entry->tuple = (MinimalTuple) palloc(tlen);
+	entry->tuple->t_len = tlen;
+	nread = BufFileRead(file,
+						(void *) ((char *) entry->tuple + sizeof(uint32)),
+						tlen - sizeof(uint32));
+	if (nread != tlen - sizeof(uint32))
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not read from HashAgg temporary file: %m")));
+
+	return entry;
+}
+
 /*
  * Resets the contents of the hash table - removes all the entries and
  * tuples, but keeps the 'size' of the hash table (nbuckets).
@@ -2858,3 +3423,114 @@ void reset_hash_table(AggHashTable htab) {
 								htab->nbuckets * sizeof(AggHashEntry));
 
 }
+
+static uint32
+compute_batchno(AggState * aggstate, uint32 hashvalue)
+{
+	if ((! aggstate->batching_enabled) || (aggstate->nbatch == 1))
+		return 0;
+	else
+		/*
+		 * When computing the batch number, skip the bits that might be
+		 * used for buckets.
+		 * 
+		 * XXX We should probably make sure that we don't exceed the 32
+		 *     bits we have available in the hash. This is pretty much
+		 *     the same issue as in hash join.
+		 */
+		return (hashvalue >> aggstate->hashtable->nbuckets_bits) & (aggstate->nbatch - 1);
+}
+
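+/*
+ * batching_supported
+ *		Check whether the aggregate states can be written out to batch
+ *		files, i.e. whether it is safe to enable disk-based batching.
+ */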
+static bool
+batching_supported(AggState * aggstate)
+{
+	int aggno;
+
+	/*
+	 * Check that all the aggregates use a by-value transition state - the
+	 * batch files store the states as raw bytes, so by-reference states
+	 * cannot be serialized this way.
+	 */
+	for (aggno = 0; aggno < aggstate->numaggs; aggno++)
+		if (! aggstate->peragg[aggno].transtypeByVal)
+			return false;
+
+	return true;
+}
+
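+/*
+ * init_batch_files
+ *		Allocate the arrays of batch files and create a temporary file for
+ *		each batch except the first one (batch 0 is processed directly
+ *		from the outer plan, so it needs no file).
+ */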
+static void
+init_batch_files(AggState * aggstate)
+{
+
+	int i;
+
+	if (aggstate->nbatch > 1) {
+		aggstate->batched_groups = (BufFile**)palloc0(aggstate->nbatch * sizeof(BufFile*));
+		aggstate->batched_tuples = (BufFile**)palloc0(aggstate->nbatch * sizeof(BufFile*));
+	}
+
+	for (i = 1; i < aggstate->nbatch; i++) {
+		aggstate->batched_groups[i] = BufFileCreateTemp(false);
+		aggstate->batched_tuples[i] = BufFileCreateTemp(false);
+	}
+
+}
+
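+/*
+ * close_batch_files
+ *		Close any batch files that are still open. Files are closed and
+ *		set to NULL as each batch completes, so some may already be gone.
+ */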
+static void
+close_batch_files(AggState * aggstate)
+{
+	int i;
+
+	for (i = 1; i < aggstate->nbatch; i++) {
+
+		/* if we're halfway through the batches, the files might be
+		 * already closed (and set to NULL) */
+		if (aggstate->batched_groups[i] != NULL)
+			BufFileClose(aggstate->batched_groups[i]);
+
+		if (aggstate->batched_tuples[i] != NULL)
+			BufFileClose(aggstate->batched_tuples[i]);
+
+		aggstate->batched_groups[i] = NULL;
+		aggstate->batched_tuples[i] = NULL;
+
+	}
+}
+
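+/*
+ * reinit_batch_files
+ *		Throw away and recreate the batch files; used on rescan, when the
+ *		batches have to be rebuilt from scratch.
+ */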
+static void
+reinit_batch_files(AggState * aggstate)
+{
+	int i;
+
+	/* make sure all the files are properly closed */
+	close_batch_files(aggstate);
+
+	/* reinit all the files (skip the first one, which is batchno=0) */
+	for (i = 1; i < aggstate->nbatch; i++)
+	{
+		aggstate->batched_groups[i] = BufFileCreateTemp(false);
+		aggstate->batched_tuples[i] = BufFileCreateTemp(false);
+	}
+}
+
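+/*
+ * choose_nbatch
+ *		Estimate the initial number of batches needed to keep the hash
+ *		table within work_mem, based on the planner's estimate of the
+ *		number of groups and the expected per-group size.
+ */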
+static int
+choose_nbatch(AggState *aggstate, int nbuckets, Size tuple_width)
+{
+	Agg		   *node = (Agg *) aggstate->ss.ps.plan;
+
+	/* estimated per-group size, plus totals for the buckets and the groups */
+	Size		groupsize, bucketssize, groupssize;
+	int nbatch = 1;
+
+	Assert(node->aggstrategy == AGG_HASHED);
+	Assert(node->numGroups > 0);
+
+	/* XXX see how build_hash_table estimates entrysize and groupsize */
+	groupsize = MAXALIGN(sizeof(AggHashEntryData) +
+			(aggstate->numaggs - 1) * sizeof(AggStatePerGroupData)) +
+			MAXALIGN(sizeof(MinimalTupleData)) + tuple_width;
+
+	bucketssize = nbuckets * sizeof(AggHashEntry);
+	groupssize = groupsize * node->numGroups;
+
+	/* double nbatch until we're expected to fit in work_mem */
+	while (groupssize / nbatch + bucketssize >= work_mem * 1024L)
+		nbatch *= 2;
+
+	return nbatch;
+
+}
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 0cdb790..926abad 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -113,6 +113,7 @@ bool		enable_bitmapscan = true;
 bool		enable_tidscan = true;
 bool		enable_sort = true;
 bool		enable_hashagg = true;
+bool		enable_hashagg_disk = true;
 bool		enable_nestloop = true;
 bool		enable_material = true;
 bool		enable_mergejoin = true;
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index e1480cd..7b8135d 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -2741,7 +2741,8 @@ choose_hashed_grouping(PlannerInfo *root,
 	/* plus the per-hash-entry overhead */
 	hashentrysize += hash_agg_entry_size(agg_costs->numAggs);
 
-	if (hashentrysize * dNumGroups > work_mem * 1024L)
+	if (!enable_hashagg_disk &&
+		hashentrysize * dNumGroups > work_mem * 1024L)
 		return false;
 
 	/*
@@ -2907,7 +2908,8 @@ choose_hashed_distinct(PlannerInfo *root,
 	/* plus the per-hash-entry overhead */
 	hashentrysize += hash_agg_entry_size(0);
 
-	if (hashentrysize * dNumDistinctRows > work_mem * 1024L)
+	if (!enable_hashagg_disk &&
+		hashentrysize * dNumDistinctRows > work_mem * 1024L)
 		return false;
 
 	/*
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 8c57803..5128e20 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -749,6 +749,15 @@ static struct config_bool ConfigureNamesBool[] =
 		NULL, NULL, NULL
 	},
 	{
+		{"enable_hashagg_disk", PGC_USERSET, QUERY_TUNING_METHOD,
+			gettext_noop("Enables the planner's use of disk-based hashed aggregation plans."),
+			NULL
+		},
+		&enable_hashagg_disk,
+		true,
+		NULL, NULL, NULL
+	},
+	{
 		{"enable_material", PGC_USERSET, QUERY_TUNING_METHOD,
 			gettext_noop("Enables the planner's use of materialization."),
 			NULL
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index df98b02..8f5b73b 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -266,6 +266,7 @@
 
 #enable_bitmapscan = on
 #enable_hashagg = on
+#enable_hashagg_disk = on
 #enable_hashjoin = on
 #enable_indexscan = on
 #enable_indexonlyscan = on
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index a70b296..97034f1 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -634,7 +634,7 @@ MemoryContextCreate(NodeTag tag, Size size,
 	 */
 	if (track_mem)
 	{
-		node->accounting = (MemoryAccounting)MemoryContextAlloc(TopMemoryContext,
+		node->accounting = (MemoryAccounting)MemoryContextAllocZero(TopMemoryContext,
 												sizeof(MemoryAccountingData));
 		if (parent)
 			node->accounting->parent = parent->accounting;
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 995389b..f2286fe 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -22,7 +22,7 @@
 #include "utils/reltrigger.h"
 #include "utils/sortsupport.h"
 #include "utils/tuplestore.h"
-
+#include "storage/buffile.h"
 
 /* ----------------
  *	  IndexInfo information
@@ -1725,6 +1725,31 @@ typedef struct AggState
 	List	   *hash_needed;	/* list of columns needed in hash table */
 	bool		table_filled;	/* hash table filled yet? */
 	AggHashTable	hashtable;	/* instance of the simple hash table */
+	TupleDesc	tupdesc;		/* input tuple descriptor (kept for reading batch files) */
+
+	/* simple batching */
+	bool		batching_enabled;	/* can we serialize group states? */
+	int			nbatch;				/* number of batches */
+	int			nbatch_bits;		/* number of hash bits used for the batch number */
+	int			nbatch_original;	/* number of batches (original) */
+	int			cur_batch;			/* current batch (up to nbatch-1) */
+
+	/* temporary files with data for further batches */
+	BufFile		**batched_groups;	/* serialized aggregate states */
+	BufFile		**batched_tuples;	/* serialized tuples */
+
+	/* counters for explain (analyze) */
+	Size		batch_min_size;		/* minimum batch size (bytes) */
+	Size		batch_max_size;		/* maximum batch size (bytes) */
+
+	/*
+	 * These two counters show how many times tuples were saved to and
+	 * read back from batch files. With no batching, rescanned = 0; with a
+	 * single level of batching, rescanned/scanned < 1.0; with multi-level
+	 * batching, rescanned/scanned may exceed 1.0.
+	 */
+	int64		ntuples_scanned;	/* number of input tuples scanned */
+	int64		ntuples_rescanned;	/* number of tuples saved/read */
 
 } AggState;
 
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 75e2afb..d363e65 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -57,6 +57,7 @@ extern bool enable_bitmapscan;
 extern bool enable_tidscan;
 extern bool enable_sort;
 extern bool enable_hashagg;
+extern bool enable_hashagg_disk;
 extern bool enable_nestloop;
 extern bool enable_material;
 extern bool enable_mergejoin;
diff --git a/src/test/regress/expected/rangefuncs.out b/src/test/regress/expected/rangefuncs.out
index 774e75e..e88c83c 100644
--- a/src/test/regress/expected/rangefuncs.out
+++ b/src/test/regress/expected/rangefuncs.out
@@ -3,6 +3,7 @@ SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%';
 ----------------------+---------
  enable_bitmapscan    | on
  enable_hashagg       | on
+ enable_hashagg_disk  | on
  enable_hashjoin      | on
  enable_indexonlyscan | on
  enable_indexscan     | on
@@ -12,7 +13,7 @@ SELECT name, setting FROM pg_settings WHERE name LIKE 'enable%';
  enable_seqscan       | on
  enable_sort          | on
  enable_tidscan       | on
-(11 rows)
+(12 rows)
 
 CREATE TABLE foo2(fooid int, f2 int);
 INSERT INTO foo2 VALUES(1, 11);
