From 0d811c05cc7e944bc7901b3d1504a816c5b6a35b Mon Sep 17 00:00:00 2001
From: Amit Khandekar <amitdkhan.pg@gmail.com>
Date: Thu, 10 Dec 2020 21:44:06 +0800
Subject: [PATCH 1/2] Speed up xor'ing of two gist index signatures for
 tsvectors

In hemdistsign(), rather than applying the XOR operator to one char
value at a time, apply it to 64-bit chunks. And since the chunks are
64-bit, use pg_popcount64() on each of them. I have checked that the
two bitvector pointer arguments of hemdistsign() are not always 64-bit
aligned. So process the leading and trailing bytes char-by-char,
leaving the middle 64-bit chunks for pg_popcount64().

This results in a speed-up in GiST index creation for tsvectors. With
the default siglen (124), the speed-up is 12-20%. With siglen=700, it
is 30-50%. So with longer signature lengths, we get a higher percentage
speed-up.
---
 src/backend/utils/adt/tsgistidx.c | 15 ++--------
 src/include/port/pg_bitutils.h    |  2 ++
 src/port/pg_bitutils.c            | 47 +++++++++++++++++++++++++++++++
 3 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/src/backend/utils/adt/tsgistidx.c b/src/backend/utils/adt/tsgistidx.c
index a601965bd8..fb8f34723d 100644
--- a/src/backend/utils/adt/tsgistidx.c
+++ b/src/backend/utils/adt/tsgistidx.c
@@ -486,20 +486,11 @@ sizebitvec(BITVECP sign, int siglen)
 	return pg_popcount(sign, siglen);
 }
 
-static int
+static inline int
 hemdistsign(BITVECP a, BITVECP b, int siglen)
 {
-	int			i,
-				diff,
-				dist = 0;
-
-	LOOPBYTE(siglen)
-	{
-		diff = (unsigned char) (a[i] ^ b[i]);
-		/* Using the popcount functions here isn't likely to win */
-		dist += pg_number_of_ones[diff];
-	}
-	return dist;
+	return pg_xorcount((const unsigned char *) a, (const unsigned char *) b,
+					   siglen);
 }
 
 static int
diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 887e782911..174df28e66 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -213,6 +213,8 @@ extern int	(*pg_popcount64) (uint64 word);
 
 /* Count the number of one-bits in a byte array */
 extern uint64 pg_popcount(const char *buf, int bytes);
+/* Count the number of one-bits in the XOR of two byte arrays */
+extern uint64 pg_xorcount(const unsigned char *a, const unsigned char *b, int bytes);
 
 /*
  * Rotate the bits of "word" to the right by n bits.
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 392fbd3384..cb2f5f5f0b 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -319,3 +319,50 @@ pg_popcount(const char *buf, int bytes)
 
 	return popcnt;
 }
+
+/*
+ * pg_xorcount
+ *		Count the number of 1-bits in a XOR b, over "bytes" bytes of each.
+ */
+uint64
+pg_xorcount(const unsigned char *a, const unsigned char *b, int bytes)
+{
+	uint64		popcnt = 0;
+	int			i = 0;
+
+#if SIZEOF_VOID_P >= 8
+	const unsigned char *a_aligned = (const unsigned char *) TYPEALIGN(8, a);
+	const unsigned char *b_aligned = (const unsigned char *) TYPEALIGN(8, b);
+
+	/*
+	 * The 64-bit path is usable only if a and b are mis-aligned by the same
+	 * number of bytes; otherwise i stays 0 and the last loop does all bytes.
+	 */
+	if (b_aligned - b == a_aligned - a)
+	{
+		int			unaligned_bytes = a_aligned - a;
+		uint64	   *aint64 = (uint64*) a_aligned;
+		uint64	   *bint64 = (uint64*) b_aligned;
+		int			nelem;
+
+		/* Process leading bytes up to where the aligned bytes start */
+		unaligned_bytes = Min(unaligned_bytes, bytes);
+		for (i = 0; i < unaligned_bytes; i++)
+			popcnt += pg_number_of_ones[a[i] ^ b[i]];
+
+		/* XOR the whole 64-bit chunks and popcount each result */
+		nelem = (bytes - unaligned_bytes)/sizeof(uint64);
+		for (i = 0; i < nelem; i++)
+			popcnt += pg_popcount64(aint64[i] ^ bint64[i]);
+
+		/* Set i to the byte offset just past the last 64-bit chunk */
+		i = unaligned_bytes + nelem*sizeof(uint64);
+	}
+#endif
+
+	/* Process the bytes not handled above, one byte at a time */
+	for (; i < bytes; i++)
+		popcnt += pg_number_of_ones[a[i] ^ b[i]];
+
+	return popcnt;
+}
-- 
2.17.1

