diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 3505236..85bec20 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -114,6 +114,8 @@
 #include "utils/rel.h"
 #include "utils/tuplesort.h"
 
+#include "utils/builtins.h"
+
 
 /* sort-type codes for sort__start probes */
 #define HEAP_SORT		0
@@ -273,6 +275,8 @@ struct Tuplesortstate
 	int			memtupcount;	/* number of tuples currently present */
 	int			memtupsize;		/* allocated length of memtuples array */
 
+	bool		hasnulls_initial;	/* any NULLs in the first key col? */
+
 	/*
 	 * While building initial runs, this is the current output run number
 	 * (starting at 0).  Afterwards, it is the number of initial runs we made.
@@ -341,6 +345,8 @@ struct Tuplesortstate
 	TupleDesc	tupDesc;
 	ScanKey		scanKeys;		/* array of length nKeys */
 
+	int (*comparator) (Datum, Datum);
+
 	/*
 	 * These variables are specific to the CLUSTER case; they are set by
 	 * tuplesort_begin_cluster.  Note CLUSTER also uses tupDesc and
@@ -459,6 +465,11 @@ static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK);
 static void markrunend(Tuplesortstate *state, int tapenum);
 static int comparetup_heap(const SortTuple *a, const SortTuple *b,
 				Tuplesortstate *state);
+static int comparetup_heap_1key_asc_nonulls(const SortTuple *a, const SortTuple *b,
+				Tuplesortstate *state);
+static int comparetup_heap_1key_asc_nonulls_btint4cmp(const SortTuple *a, const SortTuple *b,
+				Tuplesortstate *state);
+static int compare_int4(Datum first, Datum second);
 static void copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup);
 static void writetup_heap(Tuplesortstate *state, int tapenum,
 			  SortTuple *stup);
@@ -551,6 +562,7 @@ tuplesort_begin_common(int workMem, bool randomAccess)
 	state->availMem = state->allowedMem;
 	state->sortcontext = sortcontext;
 	state->tapeset = NULL;
+	state->hasnulls_initial = false;
 
 	state->memtupcount = 0;
 	state->memtupsize = 1024;	/* initial guess */
@@ -1247,11 +1259,41 @@ tuplesort_performsort(Tuplesortstate *state)
 			 * amount of memory.  Just qsort 'em and we're done.
 			 */
 			if (state->memtupcount > 1)
+			{
+				int	(*comparetup) (const SortTuple *a, const SortTuple *b,
+								   Tuplesortstate *state);
+
+				/* If there is only one key col, the input contains no NULLs,
+				 * and it's ascending, we can use a fastpath version of
+				 * comparetup_heap that skips the over those things.
+				 */
+				if (state->comparetup == comparetup_heap &&
+					state->nKeys == 1 &&
+					!(state->scanKeys[0].sk_flags & SK_BT_DESC) &&
+					!state->hasnulls_initial)
+				{
+					/*
+					 * furthermore, if the comparison function happens to be
+					 * btint4cmp, we have an additional fastpath implementation
+					 * for that that skips the FunctionCallInvoke overhead
+					 */
+					if (state->scanKeys[0].sk_func.fn_addr == btint4cmp)
+					{
+						state->comparator = compare_int4;
+						comparetup = comparetup_heap_1key_asc_nonulls_btint4cmp;
+					}
+					else
+						comparetup = comparetup_heap_1key_asc_nonulls;
+				}
+				else
+					comparetup = state->comparetup;
+
 				qsort_arg((void *) state->memtuples,
 						  state->memtupcount,
 						  sizeof(SortTuple),
-						  (qsort_arg_comparator) state->comparetup,
+						  (qsort_arg_comparator) comparetup,
 						  (void *) state);
+			}
 			state->current = 0;
 			state->eof_reached = false;
 			state->markpos_offset = 0;
@@ -2713,6 +2755,44 @@ ApplySortFunction(FmgrInfo *sortFunction, int sortFlags, Oid collation,
  */
 
 static int
+compare_int4(Datum first, Datum second)
+{
+	int32           a = DatumGetInt32(first);
+	int32           b = DatumGetInt32(second);
+	if (a > b)
+		return 1;
+	else if (a == b)
+		return 0;
+	else
+		return -1;
+}
+
+static int
+comparetup_heap_1key_asc_nonulls(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+{
+	ScanKey		scanKey = state->scanKeys;
+
+	/* Allow interrupting long sorts */
+	CHECK_FOR_INTERRUPTS();
+
+	/* Compare the sort key */
+	return DatumGetInt32(myFunctionCall2Coll(&scanKey->sk_func,
+											 scanKey->sk_collation,
+											 a->datum1, b->datum1));
+}
+
+
+static int
+comparetup_heap_1key_asc_nonulls_btint4cmp(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
+{
+	/* Allow interrupting long sorts */
+	CHECK_FOR_INTERRUPTS();
+
+	/* Compare the sort key */
+	return state->comparator(a->datum1, b->datum1);
+}
+
+static int
 comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
 {
 	ScanKey		scanKey = state->scanKeys;
@@ -2784,6 +2864,10 @@ copytup_heap(Tuplesortstate *state, SortTuple *stup, void *tup)
 								state->scanKeys[0].sk_attno,
 								state->tupDesc,
 								&stup->isnull1);
+
+	/* update hasnulls_initial flag */
+	if (stup->isnull1)
+		state->hasnulls_initial = true;
 }
 
 static void
diff --git a/src/port/qsort_arg.c b/src/port/qsort_arg.c
index 28d1894..0fa0d75 100644
--- a/src/port/qsort_arg.c
+++ b/src/port/qsort_arg.c
@@ -99,6 +99,124 @@ med3(char *a, char *b, char *c, qsort_arg_comparator cmp, void *arg)
 		: (cmp(b, c, arg) > 0 ? b : (cmp(a, c, arg) < 0 ? a : c));
 }
 
+
+/*
+ * A specialized version for es == sizeof(SortTuple). We only use use the
+ * SortTuple struct here to get its size and alignment.
+ */
+typedef uintptr_t Datum; /* copied from postgres.h */
+typedef struct
+{
+	void	   *tuple;			/* the tuple proper */
+	Datum		datum1;			/* value of first key column */
+	bool		isnull1;		/* is first key column NULL? */
+	int32		tupindex;		/* see notes above */
+} SortTuple;
+
+
+#define swap_SortTuple(a, b)							\
+	do {												\
+		SortTuple tmp;									\
+		tmp = *(SortTuple *) (a);						\
+		*(SortTuple *) (a) = *(SortTuple *) (b);		\
+		*(SortTuple *) (b) = tmp;						\
+	} while(0)
+
+static void
+qsort_arg_SortTuple(void *a, size_t n, qsort_arg_comparator cmp, void *arg)
+{
+	size_t		es = sizeof(SortTuple);
+	char	   *pa,
+			   *pb,
+			   *pc,
+			   *pd,
+			   *pl,
+			   *pm,
+			   *pn;
+	int			d,
+				r,
+				swaptype,
+				presorted;
+
+loop:SWAPINIT(a, es);
+	if (n < 7)
+	{
+		for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+			for (pl = pm; pl > (char *) a && cmp(pl - es, pl, arg) > 0;
+				 pl -= es)
+				swap_SortTuple(pl, pl - es);
+		return;
+	}
+	presorted = 1;
+	for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
+	{
+		if (cmp(pm - es, pm, arg) > 0)
+		{
+			presorted = 0;
+			break;
+		}
+	}
+	if (presorted)
+		return;
+	pm = (char *) a + (n / 2) * es;
+	if (n > 7)
+	{
+		pl = (char *) a;
+		pn = (char *) a + (n - 1) * es;
+		if (n > 40)
+		{
+			d = (n / 8) * es;
+			pl = med3(pl, pl + d, pl + 2 * d, cmp, arg);
+			pm = med3(pm - d, pm, pm + d, cmp, arg);
+			pn = med3(pn - 2 * d, pn - d, pn, cmp, arg);
+		}
+		pm = med3(pl, pm, pn, cmp, arg);
+	}
+	swap_SortTuple(a, pm);
+	pa = pb = (char *) a + es;
+	pc = pd = (char *) a + (n - 1) * es;
+	for (;;)
+	{
+		while (pb <= pc && (r = cmp(pb, a, arg)) <= 0)
+		{
+			if (r == 0)
+			{
+				swap_SortTuple(pa, pb);
+				pa += es;
+			}
+			pb += es;
+		}
+		while (pb <= pc && (r = cmp(pc, a, arg)) >= 0)
+		{
+			if (r == 0)
+			{
+				swap_SortTuple(pc, pd);
+				pd -= es;
+			}
+			pc -= es;
+		}
+		if (pb > pc)
+			break;
+		swap_SortTuple(pb, pc);
+		pb += es;
+		pc -= es;
+	}
+	pn = (char *) a + n * es;
+	r = Min(pa - (char *) a, pb - pa);
+	vecswap(a, pb - r, r);
+	r = Min(pd - pc, pn - pd - es);
+	vecswap(pb, pn - r, r);
+	if ((r = pb - pa) > es)
+		qsort_arg_SortTuple(a, r / es, cmp, arg);
+	if ((r = pd - pc) > es)
+	{
+		/* Iterate rather than recurse to save stack space */
+		a = pn - r;
+		n = r / es;
+		goto loop;
+	}
+}
+
 void
 qsort_arg(void *a, size_t n, size_t es, qsort_arg_comparator cmp, void *arg)
 {
@@ -114,6 +232,12 @@ qsort_arg(void *a, size_t n, size_t es, qsort_arg_comparator cmp, void *arg)
 				swaptype,
 				presorted;
 
+	if (es == sizeof(SortTuple))
+	{
+		qsort_arg_SortTuple(a, n, cmp, arg);
+		return;
+	}
+
 loop:SWAPINIT(a, es);
 	if (n < 7)
 	{
