From 415e29b2fc2427e1355f5d4aade8fcbd50483a7f Mon Sep 17 00:00:00 2001
From: John Naylor <john.naylor@postgresql.org>
Date: Tue, 31 Mar 2026 17:40:38 +0700
Subject: [PATCH v4] Compute CRC32C on ARM using the Crypto Extension where
 available

---
 config/c-compiler.m4              |  41 ++++++++++
 configure                         |  67 +++++++++++++++-
 configure.ac                      |  13 ++-
 meson.build                       |  33 ++++++++
 src/include/pg_config.h.in        |   3 +
 src/include/port/pg_crc32c.h      |  22 ++++--
 src/port/meson.build              |   1 +
 src/port/pg_crc32c_armv8.c        | 127 ++++++++++++++++++++++++++++++
 src/port/pg_crc32c_armv8_choose.c |  38 ++++++++-
 9 files changed, 334 insertions(+), 11 deletions(-)

diff --git a/config/c-compiler.m4 b/config/c-compiler.m4
index 629572ee350..f8a9a69f20a 100644
--- a/config/c-compiler.m4
+++ b/config/c-compiler.m4
@@ -759,6 +759,47 @@ fi
 undefine([Ac_cachevar])dnl
 ])# PGAC_ARMV8_CRC32C_INTRINSICS
 
+# PGAC_ARM_PLMULL
+# ---------------------------
+# Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
+# instructions used for vectorized CRC.
+#
+# If the instructions are supported, sets pgac_arm_pmull.
+AC_DEFUN([PGAC_ARM_PLMULL],
+[define([Ac_cachevar], [AS_TR_SH([pgac_cv_arm_pmull_$1])])dnl
+AC_CACHE_CHECK([for pmull and pmull2], [Ac_cachevar],
+[pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS"
+AC_LINK_IFELSE([AC_LANG_PROGRAM([#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t  a;
+uint64x2_t  b;
+uint64x2_t  c;
+uint64x2_t  r1;
+uint64x2_t  r2;
+
+  #if defined(__has_attribute) && __has_attribute (target)
+  __attribute__((target("+crypto")))
+  #endif
+  static int pmull_test(void)
+  {
+    __asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+    __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+    r1 = veorq_u64(r1, r2);
+    /* return computed value, to prevent the above being optimized away */
+    return (int) vgetq_lane_u64(r1, 0);
+  }],
+  [return pmull_test();])],
+  [Ac_cachevar=yes],
+  [Ac_cachevar=no])
+CFLAGS="$pgac_save_CFLAGS"])
+if test x"$Ac_cachevar" = x"yes"; then
+  pgac_arm_pmull=yes
+fi
+undefine([Ac_cachevar])dnl
+])# PGAC_ARM_PLMULL
+
 # PGAC_LOONGARCH_CRC32C_INTRINSICS
 # ---------------------------
 # Check if the compiler supports the LoongArch CRCC instructions, using
diff --git a/configure b/configure
index fe22bc71d0c..a1ed54d2439 100755
--- a/configure
+++ b/configure
@@ -18314,7 +18314,7 @@ $as_echo "SSE 4.2 with runtime check" >&6; }
 
 $as_echo "#define USE_ARMV8_CRC32C 1" >>confdefs.h
 
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+      PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
       { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions" >&5
 $as_echo "ARMv8 CRC instructions" >&6; }
     else
@@ -18399,6 +18399,61 @@ if test x"$pgac_cv_avx512_pclmul_intrinsics" = x"yes"; then
   pgac_avx512_pclmul_intrinsics=yes
 fi
 
+else
+  if test x"$host_cpu" = x"aarch64"; then
+    { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmull and pmull2" >&5
+$as_echo_n "checking for pmull and pmull2... " >&6; }
+if ${pgac_cv_arm_pmull_+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  pgac_save_CFLAGS=$CFLAGS
+CFLAGS="$pgac_save_CFLAGS"
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t  a;
+uint64x2_t  b;
+uint64x2_t  c;
+uint64x2_t  r1;
+uint64x2_t  r2;
+
+  #if defined(__has_attribute) && __has_attribute (target)
+  __attribute__((target("+crypto")))
+  #endif
+  static int pmull_test(void)
+  {
+    __asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+    __asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+    r1 = veorq_u64(r1, r2);
+    /* return computed value, to prevent the above being optimized away */
+    return (int) vgetq_lane_u64(r1, 0);
+  }
+int
+main ()
+{
+return pmull_test();
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_c_try_link "$LINENO"; then :
+  pgac_cv_arm_pmull_=yes
+else
+  pgac_cv_arm_pmull_=no
+fi
+rm -f core conftest.err conftest.$ac_objext \
+    conftest$ac_exeext conftest.$ac_ext
+CFLAGS="$pgac_save_CFLAGS"
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_arm_pmull_" >&5
+$as_echo "$pgac_cv_arm_pmull_" >&6; }
+if test x"$pgac_cv_arm_pmull_" = x"yes"; then
+  pgac_arm_pmull=yes
+fi
+
+  fi
 fi
 
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking for vectorized CRC-32C" >&5
@@ -18410,8 +18465,16 @@ $as_echo "#define USE_AVX512_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: AVX-512 with runtime check" >&5
 $as_echo "AVX-512 with runtime check" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
+  if test x"$pgac_arm_pmull" = x"yes"; then
+
+$as_echo "#define USE_PMULL_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h
+
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: CRYPTO PMULL with runtime check" >&5
+$as_echo "CRYPTO PMULL with runtime check" >&6; }
+  else
+    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
 $as_echo "none" >&6; }
+  fi
 fi
 
 # Select semaphore implementation type.
diff --git a/configure.ac b/configure.ac
index 6873b7546dd..afea6118eb1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2268,7 +2268,7 @@ else
   else
     if test x"$USE_ARMV8_CRC32C" = x"1"; then
       AC_DEFINE(USE_ARMV8_CRC32C, 1, [Define to 1 to use ARMv8 CRC Extension.])
-      PG_CRC32C_OBJS="pg_crc32c_armv8.o"
+      PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_armv8_choose.o"
       AC_MSG_RESULT(ARMv8 CRC instructions)
     else
       if test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"1"; then
@@ -2295,6 +2295,10 @@ AC_SUBST(PG_CRC32C_OBJS)
 #
 if test x"$host_cpu" = x"x86_64"; then
   PGAC_AVX512_PCLMUL_INTRINSICS()
+else
+  if test x"$host_cpu" = x"aarch64"; then
+    PGAC_ARM_PLMULL()
+  fi
 fi
 
 AC_MSG_CHECKING([for vectorized CRC-32C])
@@ -2302,7 +2306,12 @@ if test x"$pgac_avx512_pclmul_intrinsics" = x"yes"; then
   AC_DEFINE(USE_AVX512_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use AVX-512 CRC algorithms with a runtime check.])
   AC_MSG_RESULT(AVX-512 with runtime check)
 else
-  AC_MSG_RESULT(none)
+  if test x"$pgac_arm_pmull" = x"yes"; then
+    AC_DEFINE(USE_PMULL_CRC32C_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Arm PMULL CRC algorithms with a runtime check.])
+    AC_MSG_RESULT(CRYPTO PMULL with runtime check)
+  else
+    AC_MSG_RESULT(none)
+  fi
 fi
 
 # Select semaphore implementation type.
diff --git a/meson.build b/meson.build
index 6bc74c2ba79..8736ad90b24 100644
--- a/meson.build
+++ b/meson.build
@@ -2720,6 +2720,39 @@ int main(void)
     have_optimized_crc = true
   endif
 
+    # Check if the compiler supports Arm CRYPTO PMULL (carryless multiplication)
+    # instructions used for vectorized CRC.
+    prog = '''
+#include <arm_acle.h>
+#include <arm_neon.h>
+uint64x2_t	a;
+uint64x2_t	b;
+uint64x2_t	c;
+
+#if defined(__has_attribute) && __has_attribute (target)
+__attribute__((target("+crypto")))
+#endif
+int main(void)
+{
+    uint64x2_t	r1;
+    uint64x2_t	r2;
+
+	__asm("pmull  %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r1), "+w"(c):"w"(a), "w"(b));
+	__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r2), "+w"(c):"w"(a), "w"(b));
+
+	r1 = veorq_u64(r1, r2);
+	/* return computed value, to prevent the above being optimized away */
+	return (int) vgetq_lane_u64(r1, 0);
+}
+'''
+
+  if cc.links(prog,
+      name: 'PMULL CRC32C',
+      args: test_c_args)
+    # Use ARM CRYPTO Extension, with runtime check
+    cdata.set('USE_PMULL_CRC32C_WITH_RUNTIME_CHECK', 1)
+  endif
+
 elif host_cpu == 'loongarch64'
 
   prog = '''
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index d8d61918aff..dbc97c565a3 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -729,6 +729,9 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
+/* Define to 1 to use Arm PMULL CRC algorithms with a runtime check. */
+#undef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 1f8e837d119..10518614664 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -111,13 +111,22 @@ extern pg_crc32c pg_comp_crc32c_avx512(pg_crc32c crc, const void *data, size_t l
 #endif
 
 #elif defined(USE_ARMV8_CRC32C)
-/* Use ARMv8 CRC Extension instructions. */
-
+/*
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions.
+ * The ARMv8 CRC instructions need no runtime check in this build, so for
+ * constant-length inputs (assumed to be small) we call pg_comp_crc32c_armv8
+ * directly and avoid an indirect function call; otherwise we go through the
+ * function pointer, which may have selected the PMULL implementation.
+ */
 #define COMP_CRC32C(crc, data, len)							\
-	((crc) = pg_comp_crc32c_armv8((crc), (data), (len)))
+	((crc) = __builtin_constant_p(len) ? 					\
+		pg_comp_crc32c_armv8((crc), (data), (len)) : 		\
+		pg_comp_crc32c((crc), (data), (len)))
 #define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
 
+extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #elif defined(USE_LOONGARCH_CRC32C)
 /* Use LoongArch CRCC instructions. */
@@ -131,8 +140,8 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 #elif defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
- * Use ARMv8 instructions, but perform a runtime check first
- * to check that they are available.
+ * Use either ARMv8 CRC Extension or CRYPTO Extension (PMULL) instructions,
+ * but perform a runtime check first to check that they are available.
  */
 #define COMP_CRC32C(crc, data, len) \
 	((crc) = pg_comp_crc32c((crc), (data), (len)))
@@ -141,6 +150,9 @@ extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_
 extern pg_crc32c pg_comp_crc32c_sb8(pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c (*pg_comp_crc32c) (pg_crc32c crc, const void *data, size_t len);
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+extern pg_crc32c pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len);
+#endif
 
 #else
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index d55cb0424f3..922b3f64676 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -93,6 +93,7 @@ replace_funcs_pos = [
   # arm / aarch64
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 'crc'],
+  ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C'],
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
diff --git a/src/port/pg_crc32c_armv8.c b/src/port/pg_crc32c_armv8.c
index 9ca0f728d39..aa0089a431c 100644
--- a/src/port/pg_crc32c_armv8.c
+++ b/src/port/pg_crc32c_armv8.c
@@ -20,6 +20,10 @@
 #include <arm_acle.h>
 #endif
 
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+#include <arm_neon.h>
+#endif
+
 #include "port/pg_crc32c.h"
 
 pg_crc32c
@@ -77,3 +81,126 @@ pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len)
 
 	return crc;
 }
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+
+/*
+ * Note: There is no copyright notice in the following generated code.
+ *
+ * We have modified the output to
+ *   - match our function declaration
+ *   - match whitespace to our project style
+ *   - be more friendly for pgindent
+ */
+
+/* Generated by https://github.com/corsix/fast-crc32/ using: */
+/* ./generate -i neon -p crc32c -a v4e */
+/* MIT licensed */
+
+pg_attribute_target("+crypto")
+static inline
+uint64x2_t
+clmul_lo_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+__asm("pmull %0.1q, %2.1d, %3.1d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+pg_attribute_target("+crypto")
+static inline
+uint64x2_t
+clmul_hi_e(uint64x2_t a, uint64x2_t b, uint64x2_t c)
+{
+	uint64x2_t	r;
+
+__asm("pmull2 %0.1q, %2.2d, %3.2d\neor %0.16b, %0.16b, %1.16b\n":"=w"(r), "+w"(c):"w"(a), "w"(b));
+	return r;
+}
+
+pg_attribute_target("+crypto")
+pg_crc32c
+pg_comp_crc32c_pmull(pg_crc32c crc, const void *data, size_t len)
+{
+	/* adjust names to match generated code */
+	pg_crc32c	crc0 = crc;
+	const char *buf = data;
+
+	/* align to 16 bytes */
+	for (; len && ((uintptr_t) buf & 7); --len)
+	{
+		crc0 = __crc32cb(crc0, *buf++);
+	}
+	if (((uintptr_t) buf & 8) && len >= 8)
+	{
+		crc0 = __crc32cd(crc0, *(const uint64_t *) buf);
+		buf += 8;
+		len -= 8;
+	}
+
+	if (len >= 64)
+	{
+		const char *end = buf + len;
+		const char *limit = buf + len - 64;
+
+		/* First vector chunk. */
+		uint64x2_t	x0 = vld1q_u64((const uint64_t *) buf),
+					y0;
+		uint64x2_t	x1 = vld1q_u64((const uint64_t *) (buf + 16)),
+					y1;
+		uint64x2_t	x2 = vld1q_u64((const uint64_t *) (buf + 32)),
+					y2;
+		uint64x2_t	x3 = vld1q_u64((const uint64_t *) (buf + 48)),
+					y3;
+		uint64x2_t	k;
+
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x740eef02, 0x9e4addf8};
+
+			k = vld1q_u64(k_);
+		}
+
+		/*
+		 * pgindent complained of unmatched parens upstream:
+		 *
+		 * x0 = veorq_u64((uint64x2_t) {crc0, 0}, x0);
+		 */
+		x0 = veorq_u64((uint64x2_t) vsetq_lane_u64(crc0, vdupq_n_u64(0), 0), x0);
+		buf += 64;
+
+		/* Main loop. */
+		while (buf <= limit)
+		{
+			y0 = clmul_lo_e(x0, k, vld1q_u64((const uint64_t *) buf)), x0 = clmul_hi_e(x0, k, y0);
+			y1 = clmul_lo_e(x1, k, vld1q_u64((const uint64_t *) (buf + 16))), x1 = clmul_hi_e(x1, k, y1);
+			y2 = clmul_lo_e(x2, k, vld1q_u64((const uint64_t *) (buf + 32))), x2 = clmul_hi_e(x2, k, y2);
+			y3 = clmul_lo_e(x3, k, vld1q_u64((const uint64_t *) (buf + 48))), x3 = clmul_hi_e(x3, k, y3);
+			buf += 64;
+		}
+
+		/* Reduce x0 ... x3 to just x0. */
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0xf20c0dfe, 0x493c7d27};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x1), x0 = clmul_hi_e(x0, k, y0);
+		y2 = clmul_lo_e(x2, k, x3), x2 = clmul_hi_e(x2, k, y2);
+		{
+			static const uint64_t pg_attribute_aligned(16) k_[] = {0x3da6d0cb, 0xba4fc28e};
+
+			k = vld1q_u64(k_);
+		}
+		y0 = clmul_lo_e(x0, k, x2), x0 = clmul_hi_e(x0, k, y0);
+
+		/* Reduce 128 bits to 32 bits, and multiply by x^32. */
+		crc0 = __crc32cd(0, vgetq_lane_u64(x0, 0));
+		crc0 = __crc32cd(crc0, vgetq_lane_u64(x0, 1));
+		len = end - buf;
+	}
+
+	return pg_comp_crc32c_armv8(crc0, buf, len);
+}
+
+#endif
diff --git a/src/port/pg_crc32c_armv8_choose.c b/src/port/pg_crc32c_armv8_choose.c
index a1f0e540c6b..1640958d9d1 100644
--- a/src/port/pg_crc32c_armv8_choose.c
+++ b/src/port/pg_crc32c_armv8_choose.c
@@ -108,6 +108,27 @@ pg_crc32c_armv8_available(void)
 #endif
 }
 
+static inline bool
+pg_pmull_available(void)
+{
+#if defined(__aarch64__) && defined(HWCAP_PMULL)
+
+#ifdef HAVE_ELF_AUX_INFO
+	unsigned long value;
+
+	return elf_aux_info(AT_HWCAP, &value, sizeof(value)) == 0 &&
+		(value & HWCAP_PMULL) != 0;
+#elif defined(HAVE_GETAUXVAL)
+	return (getauxval(AT_HWCAP) & HWCAP_PMULL) != 0;
+#else
+	return false;
+#endif
+
+#else
+	return false;
+#endif
+}
+
 /*
  * This gets called on the first call. It replaces the function pointer
  * so that subsequent calls are routed directly to the chosen implementation.
@@ -115,10 +136,23 @@ pg_crc32c_armv8_available(void)
 static pg_crc32c
 pg_comp_crc32c_choose(pg_crc32c crc, const void *data, size_t len)
 {
+	/*
+	 * Set the fallback first.  The #ifdef guard is needed because
+	 * pg_comp_crc32c_sb8 is only declared in builds that use a runtime
+	 * check (USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK).
+	 */
+#ifdef USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK
+	pg_comp_crc32c = pg_comp_crc32c_sb8;
+#endif
+
 	if (pg_crc32c_armv8_available())
+	{
 		pg_comp_crc32c = pg_comp_crc32c_armv8;
-	else
-		pg_comp_crc32c = pg_comp_crc32c_sb8;
+
+#ifdef USE_PMULL_CRC32C_WITH_RUNTIME_CHECK
+		if (pg_pmull_available())
+			pg_comp_crc32c = pg_comp_crc32c_pmull;
+#endif
+	}
 
 	return pg_comp_crc32c(crc, data, len);
 }
-- 
2.53.0

