From 1295f4d6eedabec1d850893d3bc86180bd33c932 Mon Sep 17 00:00:00 2001
From: Nathan Bossart <nathandbossart@gmail.com>
Date: Fri, 19 Aug 2022 10:41:07 -0700
Subject: [PATCH v1 1/1] Use ARM Advanced SIMD intrinsic functions in
 pg_lfind32().

Use ARM Advanced SIMD intrinsic functions to speed up the search,
where available.  Otherwise, use a simple 'for' loop as before.  As
with b6ef167, this speeds up XidInMVCCSnapshot(), but any uses of
pg_lfind32() will also benefit.

Author: Nathan Bossart
---
 config/c-compiler.m4        | 25 +++++++++++++++++++++++
 configure                   | 40 +++++++++++++++++++++++++++++++++++++
 configure.ac                |  2 ++
 src/include/pg_config.h.in  |  3 +++
 src/include/port/pg_lfind.h | 35 ++++++++++++++++++++++++++++++++
 src/include/port/simd.h     |  4 ++++
 6 files changed, 109 insertions(+)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 69efc5bb10..e8931d7059 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -650,3 +650,28 @@ if test x"$Ac_cachevar" = x"yes"; then
 fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
+
+# PGAC_ARM_ADVANCED_SIMD_INTRINSICS
+# ---------------------------------
+# Check if the compiler supports the vdupq_n_u32, vld1q_u32, vceqq_u32,
+# vorrq_u32, and vmaxvq_u32 intrinsic functions.  These instructions were first
+# introduced in ARMv7.
+AC_DEFUN([PGAC_ARM_ADVANCED_SIMD_INTRINSICS],
+[AC_CACHE_CHECK([for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32],
+pgac_cv_arm_advanced_simd_intrinsics,
+[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_neon.h>],
+  [unsigned int val[[]] = {1, 2, 3, 4, 5, 6, 7, 8};
+   uint32x4_t keys = vdupq_n_u32(7);
+   uint32x4_t vals1 = vld1q_u32(val);
+   uint32x4_t vals2 = vld1q_u32(&val[[4]]);
+   uint32x4_t tmp1 = vceqq_u32(keys, vals1);
+   uint32x4_t tmp2 = vceqq_u32(keys, vals2);
+   uint32x4_t result = vorrq_u32(tmp1, tmp2);
+   /* return computed value to prevent the above from being optimized away */
+   return vmaxvq_u32(result) != 0;])],
+[pgac_cv_arm_advanced_simd_intrinsics=yes],
+[pgac_cv_arm_advanced_simd_intrinsics=no])])
+if test x"$pgac_cv_arm_advanced_simd_intrinsics" = xyes ; then
+AC_DEFINE(USE_ARM_ADVANCED_SIMD_INTRINSICS, 1,
+          [Define to 1 to use ARM Advanced SIMD (Neon) intrinsics.])
+fi])# PGAC_ARM_ADVANCED_SIMD_INTRINSICS
diff --git a/configure b/configure
index b28fccbc47..0924e5ae8f 100755
--- a/configure
+++ b/configure
@@ -18230,6 +18230,46 @@ $as_echo "slicing-by-8" >&6; }
 fi
 
 
+# Check for ARM Advanced SIMD intrinsics.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32" >&5
+$as_echo_n "checking for vdupq_n_u32, vld1q_u32, vceqq_u32, vorrq_u32, and vmaxvq_u32... " >&6; }
+if ${pgac_cv_arm_advanced_simd_intrinsics+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_neon.h>
+int
+main ()
+{
+unsigned int val[] = {1, 2, 3, 4, 5, 6, 7, 8};
+   uint32x4_t keys = vdupq_n_u32(7);
+   uint32x4_t vals1 = vld1q_u32(val);
+   uint32x4_t vals2 = vld1q_u32(&val[4]);
+   uint32x4_t tmp1 = vceqq_u32(keys, vals1);
+   uint32x4_t tmp2 = vceqq_u32(keys, vals2);
+   uint32x4_t result = vorrq_u32(tmp1, tmp2);
+   /* return computed value to prevent the above from being optimized away */
+   return vmaxvq_u32(result) != 0;
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_advanced_simd_intrinsics=yes
+else
+  pgac_cv_arm_advanced_simd_intrinsics=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_advanced_simd_intrinsics" >&5
+$as_echo "$pgac_cv_arm_advanced_simd_intrinsics" >&6; }
+if test x"$pgac_cv_arm_advanced_simd_intrinsics" = xyes ; then
+
+$as_echo "#define USE_ARM_ADVANCED_SIMD_INTRINSICS 1" >>confdefs.h
+
+fi
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/configure.ac b/configure.ac
index dd368290a6..62d8b4abda 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2172,6 +2172,8 @@ else
 fi
 AC_SUBST(PG_CRC32C_OBJS)
 
+# Check for ARM Advanced SIMD intrinsics.
+PGAC_ARM_ADVANCED_SIMD_INTRINSICS
 
 # Select semaphore implementation type.
 if test "$PORTNAME" != "win32"; then
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 26eb6a2dfe..8d04f31fff 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -702,6 +702,9 @@
 /* Define to 1 if strerror_r() returns int. */
 #undef STRERROR_R_INT
 
+/* Define to 1 to use ARM Advanced SIMD (Neon) intrinsics. */
+#undef USE_ARM_ADVANCED_SIMD_INTRINSICS
+
 /* Define to 1 to use ARMv8 CRC Extension. */
 #undef USE_ARMV8_CRC32C
 
diff --git a/src/include/port/pg_lfind.h b/src/include/port/pg_lfind.h
index fb125977b2..51315a4fb3 100644
--- a/src/include/port/pg_lfind.h
+++ b/src/include/port/pg_lfind.h
@@ -82,6 +82,41 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 	}
 #endif							/* USE_SSE2 */
 
+#ifdef USE_ARM_ADVANCED_SIMD_INTRINSICS
+
+	/*
+	 * A 16-byte register only has four 4-byte lanes. For better
+	 * instruction-level parallelism, each loop iteration operates on a block
+	 * of four registers.
+	 */
+	const		uint32x4_t keys = vdupq_n_u32(key); /* load 4 copies of key */
+	uint32		iterations = nelem & ~0xF;  /* round down to multiple of 16 */
+
+	for (i = 0; i < iterations; i += 16)
+	{
+		/* load the next block into 4 registers holding 4 values each */
+		const		uint32x4_t vals1 = vld1q_u32((const uint32 *) & base[i]);
+		const		uint32x4_t vals2 = vld1q_u32((const uint32 *) & base[i + 4]);
+		const		uint32x4_t vals3 = vld1q_u32((const uint32 *) & base[i + 8]);
+		const		uint32x4_t vals4 = vld1q_u32((const uint32 *) & base[i + 12]);
+
+		/* compare each value to the key */
+		const		uint32x4_t result1 = vceqq_u32(keys, vals1);
+		const		uint32x4_t result2 = vceqq_u32(keys, vals2);
+		const		uint32x4_t result3 = vceqq_u32(keys, vals3);
+		const		uint32x4_t result4 = vceqq_u32(keys, vals4);
+
+		/* combine the results into a single variable */
+		const		uint32x4_t tmp1 = vorrq_u32(result1, result2);
+		const		uint32x4_t tmp2 = vorrq_u32(result3, result4);
+		const		uint32x4_t result = vorrq_u32(tmp1, tmp2);
+
+		/* see if there was a match */
+		if (vmaxvq_u32(result) != 0)
+			return true;
+	}
+#endif							/* USE_ARM_ADVANCED_SIMD_INTRINSICS */
+
 	/* Process the remaining elements one at a time. */
 	for (; i < nelem; i++)
 	{
diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index a571e79f57..5eee1c944f 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -27,4 +27,8 @@
 #define USE_SSE2
 #endif
 
+#ifdef USE_ARM_ADVANCED_SIMD_INTRINSICS
+#include <arm_neon.h>
+#endif
+
 #endif							/* SIMD_H */
-- 
2.25.1