From 1464c22da33117900341496d03b92dec5be2a62a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Thu, 22 Jul 2021 02:05:06 +1200
Subject: [PATCH 1/2] XXX Make SIMD code more platform neutral.

Move SIMD code into pg_utf8_simd.c to experiment with the idea of a
shared implementation across architectures.  Introduce pg_u8x16_t to
abstract the vector type.

XXX Experiment grade code only
---
 configure                                     |  18 +--
 configure.ac                                  |  18 +--
 src/include/pg_config.h.in                    |  14 +-
 src/include/port/pg_utf8.h                    |  10 +-
 src/port/Makefile                             |   8 +-
 ...g_utf8_sse42_choose.c => pg_utf8_choose.c} |  10 +-
 src/port/{pg_utf8_sse42.c => pg_utf8_simd.c}  | 148 +++++++++---------
 7 files changed, 114 insertions(+), 112 deletions(-)
 rename src/port/{pg_utf8_sse42_choose.c => pg_utf8_choose.c} (88%)
 rename src/port/{pg_utf8_sse42.c => pg_utf8_simd.c} (76%)

diff --git a/configure b/configure
index 30969840b1..df546b641c 100755
--- a/configure
+++ b/configure
@@ -18442,13 +18442,13 @@ fi
 #
 # You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
 # in the template or configure command line.
-if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+if test x"$USE_SIMD_UTF8" = x"" && test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
   if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
-    USE_SSE42_UTF8=1
+    USE_SIMD_UTF8=1
   else
     # the CPUID instruction is needed for the runtime check.
     if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+      USE_SIMD_UTF8_WITH_RUNTIME_CHECK=1
     else
       # fall back to algorithm which doesn't require any special
       # CPU support.
@@ -18461,19 +18461,19 @@ fi
 # Note: We need the fallback for error handling in all builds.
 { $as_echo "$as_me:${as_lineno-$LINENO}: checking which UTF-8 validator to use" >&5
 $as_echo_n "checking which UTF-8 validator to use... " >&6; }
-if test x"$USE_SSE42_UTF8" = x"1"; then
+if test x"$USE_SIMD_UTF8" = x"1"; then
 
-$as_echo "#define USE_SSE42_UTF8 1" >>confdefs.h
+$as_echo "#define USE_SIMD_UTF8 1" >>confdefs.h
 
-  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+  PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o"
   { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2" >&5
 $as_echo "SSE 4.2" >&6; }
 else
-  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+  if test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
 
-$as_echo "#define USE_SSE42_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
+$as_echo "#define USE_SIMD_UTF8_WITH_RUNTIME_CHECK 1" >>confdefs.h
 
-    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+    PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o pg_utf8_choose.o"
     { $as_echo "$as_me:${as_lineno-$LINENO}: result: SSE 4.2 with runtime check" >&5
 $as_echo "SSE 4.2 with runtime check" >&6; }
   else
diff --git a/configure.ac b/configure.ac
index 5e2b4717c1..1606a80fb7 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2217,13 +2217,13 @@ AC_SUBST(PG_CRC32C_OBJS)
 #
 # You can override this logic by setting the appropriate USE_*_UTF8 flag to 1
 # in the template or configure command line.
-if test x"$USE_SSE42_UTF8" = x"" && test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
+if test x"$USE_SIMD_UTF8" = x"" && test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"" && test x"$USE_FALLBACK_UTF8" = x""; then
   if test x"$pgac_sse42_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then
-    USE_SSE42_UTF8=1
+    USE_SIMD_UTF8=1
   else
     # the CPUID instruction is needed for the runtime check.
     if test x"$pgac_sse42_intrinsics" = x"yes" && (test x"$pgac_cv__get_cpuid" = x"yes" || test x"$pgac_cv__cpuid" = x"yes"); then
-      USE_SSE42_UTF8_WITH_RUNTIME_CHECK=1
+      USE_SIMD_UTF8_WITH_RUNTIME_CHECK=1
     else
       # fall back to algorithm which doesn't require any special
       # CPU support.
@@ -2235,14 +2235,14 @@ fi
 # Set PG_UTF8_OBJS appropriately depending on the selected implementation.
 # Note: We need the fallback for error handling in all builds.
 AC_MSG_CHECKING([which UTF-8 validator to use])
-if test x"$USE_SSE42_UTF8" = x"1"; then
-  AC_DEFINE(USE_SSE42_UTF8, 1, [Define to 1 use Intel SSE 4.2 instructions.])
-  PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o"
+if test x"$USE_SIMD_UTF8" = x"1"; then
+  AC_DEFINE(USE_SIMD_UTF8, 1, [Define to 1 to use Intel SSE 4.2 instructions.])
+  PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o"
   AC_MSG_RESULT(SSE 4.2)
 else
-  if test x"$USE_SSE42_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
-    AC_DEFINE(USE_SSE42_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 instructions with a runtime check.])
-    PG_UTF8_OBJS="pg_utf8_sse42.o pg_utf8_fallback.o pg_utf8_sse42_choose.o"
+  if test x"$USE_SIMD_UTF8_WITH_RUNTIME_CHECK" = x"1"; then
+    AC_DEFINE(USE_SIMD_UTF8_WITH_RUNTIME_CHECK, 1, [Define to 1 to use Intel SSE 4.2 instructions with a runtime check.])
+    PG_UTF8_OBJS="pg_utf8_simd.o pg_utf8_fallback.o pg_utf8_choose.o"
     AC_MSG_RESULT(SSE 4.2 with runtime check)
   else
     AC_DEFINE(USE_FALLBACK_UTF8, 1, [Define to 1 to use Intel SSE 4.2 instructions with a runtime check.])
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index 9d5e1efda9..f1456553e6 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -904,6 +904,9 @@
 /* Define to 1 to build with BSD Authentication support. (--with-bsd-auth) */
 #undef USE_BSD_AUTH
 
+/* Define to 1 to use the fallback UTF-8 validator written in C. */
+#undef USE_FALLBACK_UTF8
+
 /* Define to build with ICU support. (--with-icu) */
 #undef USE_ICU
 
@@ -932,14 +935,11 @@
 /* Define to 1 to build with PAM support. (--with-pam) */
 #undef USE_PAM
 
-/* Define to 1 to use the fallback UTF-8 validator written in C. */
-#undef USE_FALLBACK_UTF8
-
-/* Define to 1 use the UTF-8 validator written with Intel SSE instructions. */
-#undef USE_SSE42_UTF8
+/* Define to 1 to use Intel SSE 4.2 instructions. */
+#undef USE_SIMD_UTF8
 
-/* Define to 1 use the UTF-8 validator written with Intel SSE instructions with runtime check. */
-#undef USE_SSE42_UTF8_WITH_RUNTIME_CHECK
+/* Define to 1 to use Intel SSE 4.2 instructions with a runtime check. */
+#undef USE_SIMD_UTF8_WITH_RUNTIME_CHECK
 
 /* Define to 1 to use software CRC-32C implementation (slicing-by-8). */
 #undef USE_SLICING_BY_8_CRC32C
diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h
index dc38369a31..a9f2b9f15b 100644
--- a/src/include/port/pg_utf8.h
+++ b/src/include/port/pg_utf8.h
@@ -15,14 +15,14 @@
 #define PG_UTF8_H
 
 
-#if defined(USE_SSE42_UTF8)
+#if defined(USE_SIMD_UTF8)
 /* Use Intel SSE4.2 instructions. */
 #define UTF8_VERIFYSTR(s, len) \
-	pg_validate_utf8_sse42((s), (len))
+	pg_validate_utf8_simd((s), (len))
 
-extern int	pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int	pg_validate_utf8_simd(const unsigned char *s, int len);
 
-#elif defined(USE_SSE42_UTF8_WITH_RUNTIME_CHECK)
+#elif defined(USE_SIMD_UTF8_WITH_RUNTIME_CHECK)
 /*
  * Use Intel SSE 4.2 instructions, but perform a runtime check first
  * to check that they are available.
@@ -31,7 +31,7 @@ extern int	pg_validate_utf8_sse42(const unsigned char *s, int len);
 	pg_validate_utf8((s), (len))
 
 extern int	(*pg_validate_utf8) (const unsigned char *s, int len);
-extern int	pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int	pg_validate_utf8_simd(const unsigned char *s, int len);
 
 #else
 #define UTF8_VERIFYSTR(s, len) \
diff --git a/src/port/Makefile b/src/port/Makefile
index 04838b0ab2..893fcd7d59 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -90,10 +90,10 @@ libpgport.a: $(OBJS)
 thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
 thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS)
 
-# all versions of pg_utf8_sse42.o need CFLAGS_SSE42
-pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
-pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
-pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+# all versions of pg_utf8_simd.o need CFLAGS_SSE42
+pg_utf8_simd.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_simd_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_simd_srv.o: CFLAGS+=$(CFLAGS_SSE42)
 
 # all versions of pg_crc32c_sse42.o need CFLAGS_SSE42
 pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
diff --git a/src/port/pg_utf8_sse42_choose.c b/src/port/pg_utf8_choose.c
similarity index 88%
rename from src/port/pg_utf8_sse42_choose.c
rename to src/port/pg_utf8_choose.c
index ff6120be2b..140c0dce7b 100644
--- a/src/port/pg_utf8_sse42_choose.c
+++ b/src/port/pg_utf8_choose.c
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
- * pg_utf8_sse42_choose.c
- *	  Choose between Intel SSE 4.2 and fallback implementation.
+ * pg_utf8_choose.c
+ *	  Choose between SSE 4.2 and fallback implementation.
  *
  * On first call, checks if the CPU we're running on supports Intel SSE
  * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
@@ -30,7 +30,7 @@
 #include "port/pg_utf8.h"
 
 static bool
-pg_utf8_sse42_available(void)
+pg_utf8_simd_available(void)
 {
 	/* To save from checking every SSE2 intrinsic, insist on 64-bit. */
 #ifdef __x86_64__
@@ -57,8 +57,8 @@ pg_utf8_sse42_available(void)
 static int
 pg_validate_utf8_choose(const unsigned char *s, int len)
 {
-	if (pg_utf8_sse42_available())
-		pg_validate_utf8 = pg_validate_utf8_sse42;
+	if (pg_utf8_simd_available())
+		pg_validate_utf8 = pg_validate_utf8_simd;
 	else
 		pg_validate_utf8 = pg_validate_utf8_fallback;
 
diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_simd.c
similarity index 76%
rename from src/port/pg_utf8_sse42.c
rename to src/port/pg_utf8_simd.c
index cd050ec2bf..7ca9060e3a 100644
--- a/src/port/pg_utf8_sse42.c
+++ b/src/port/pg_utf8_simd.c
@@ -1,6 +1,6 @@
 /*-------------------------------------------------------------------------
  *
- * pg_utf8_sse42.c
+ * pg_utf8_simd.c
  *	  Validate UTF-8 using Intel SSE 4.2 instructions.
  *
  * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  src/port/pg_utf8_sse42.c
+ *	  src/port/pg_utf8_simd.c
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,8 @@
 
 #include "port/pg_utf8.h"
 
+typedef __m128i pg_u8x16_t;
+
 /*
  * This module is based on the paper "Validating UTF-8 In Less Than One
  * Instruction Per Byte" by John Keiser and Daniel Lemire, arXiv:2010.03090
@@ -184,48 +186,48 @@
 #define vset(...)		_mm_setr_epi8(__VA_ARGS__)
 
 /* return a zeroed register */
-static inline const __m128i
+static inline const pg_u8x16_t
 vzero()
 {
 	return _mm_setzero_si128();
 }
 
 /* perform an unaligned load from memory into a register */
-static inline const __m128i
+static inline const pg_u8x16_t
 vload(const unsigned char *raw_input)
 {
-	return _mm_loadu_si128((const __m128i *) raw_input);
+	return _mm_loadu_si128((const pg_u8x16_t *) raw_input);
 }
 
 /* return a vector with each 8-bit lane populated with the input scalar */
-static inline __m128i
+static inline pg_u8x16_t
 splat(char byte)
 {
 	return _mm_set1_epi8(byte);
 }
 
 /* perform signed greater-than on all 8-bit lanes */
-static inline __m128i
-greater_than(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+greater_than(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 	return _mm_cmpgt_epi8(v1, v2);
 }
 
 /* bitwise vector operations */
-static inline __m128i
-bitwise_and(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_and(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 	return _mm_and_si128(v1, v2);
 }
 
-static inline __m128i
-bitwise_or(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_or(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 	return _mm_or_si128(v1, v2);
 }
 
-static inline __m128i
-bitwise_xor(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+bitwise_xor(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 	return _mm_xor_si128(v1, v2);
 }
@@ -235,8 +237,8 @@ bitwise_xor(const __m128i v1, const __m128i v2)
  * on overflow, stop at zero. Useful for emulating unsigned
  * comparison.
  */
-static inline __m128i
-saturating_sub(const __m128i v1, const __m128i v2)
+static inline pg_u8x16_t
+saturating_sub(const pg_u8x16_t v1, const pg_u8x16_t v2)
 {
 	return _mm_subs_epu8(v1, v2);
 }
@@ -247,11 +249,11 @@ saturating_sub(const __m128i v1, const __m128i v2)
  * There is no intrinsic to do this on 8-bit lanes, so shift right in each
  * 16-bit lane then apply a mask in each 8-bit lane shifted the same amount.
  */
-static inline __m128i
-shift_right(const __m128i v, const int n)
+static inline pg_u8x16_t
+shift_right(const pg_u8x16_t v, const int n)
 {
-	const		__m128i shift16 = _mm_srli_epi16(v, n);
-	const		__m128i mask = splat(0xFF >> n);
+	const		pg_u8x16_t shift16 = _mm_srli_epi16(v, n);
+	const		pg_u8x16_t mask = splat(0xFF >> n);
 
 	return bitwise_and(shift16, mask);
 }
@@ -266,30 +268,30 @@ shift_right(const __m128i v, const int n)
  * The third argument to the intrinsic must be a numeric constant, so
  * we must have separate functions for different shift amounts.
  */
-static inline __m128i
-prev1(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev1(pg_u8x16_t prev, pg_u8x16_t input)
 {
-	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 1);
+	return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 1);
 }
 
-static inline __m128i
-prev2(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev2(pg_u8x16_t prev, pg_u8x16_t input)
 {
-	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 2);
+	return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 2);
 }
 
-static inline __m128i
-prev3(__m128i prev, __m128i input)
+static inline pg_u8x16_t
+prev3(pg_u8x16_t prev, pg_u8x16_t input)
 {
-	return _mm_alignr_epi8(input, prev, sizeof(__m128i) - 3);
+	return _mm_alignr_epi8(input, prev, sizeof(pg_u8x16_t) - 3);
 }
 
 /*
  * For each 8-bit lane in the input, use that value as an index
  * into the lookup vector as if it were a 16-element byte array.
  */
-static inline __m128i
-lookup(const __m128i input, const __m128i lookup)
+static inline pg_u8x16_t
+lookup(const pg_u8x16_t input, const pg_u8x16_t lookup)
 {
 	return _mm_shuffle_epi8(lookup, input);
 }
@@ -298,28 +300,28 @@ lookup(const __m128i input, const __m128i lookup)
  * Return a vector with lanes non-zero where we have either errors, or
  * two or more continuations in a row.
  */
-static inline __m128i
-check_special_cases(const __m128i prev, const __m128i input)
+static inline pg_u8x16_t
+check_special_cases(const pg_u8x16_t prev, const pg_u8x16_t input)
 {
-	const		__m128i byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
-	const		__m128i byte_1_low_table = vset(BYTE_1_LOW_TABLE);
-	const		__m128i byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
+	const		pg_u8x16_t byte_1_high_table = vset(BYTE_1_HIGH_TABLE);
+	const		pg_u8x16_t byte_1_low_table = vset(BYTE_1_LOW_TABLE);
+	const		pg_u8x16_t byte_2_high_table = vset(BYTE_2_HIGH_TABLE);
 
 	/*
 	 * To classify the first byte in each chunk we need to have the last byte
 	 * from the previous chunk.
 	 */
-	const		__m128i input_shift1 = prev1(prev, input);
+	const		pg_u8x16_t input_shift1 = prev1(prev, input);
 
 	/* put the relevant nibbles into their own bytes in their own registers */
-	const		__m128i byte_1_high = shift_right(input_shift1, 4);
-	const		__m128i byte_1_low = bitwise_and(input_shift1, splat(0x0F));
-	const		__m128i byte_2_high = shift_right(input, 4);
+	const		pg_u8x16_t byte_1_high = shift_right(input_shift1, 4);
+	const		pg_u8x16_t byte_1_low = bitwise_and(input_shift1, splat(0x0F));
+	const		pg_u8x16_t byte_2_high = shift_right(input, 4);
 
 	/* lookup the possible errors for each set of nibbles */
-	const		__m128i lookup_1_high = lookup(byte_1_high, byte_1_high_table);
-	const		__m128i lookup_1_low = lookup(byte_1_low, byte_1_low_table);
-	const		__m128i lookup_2_high = lookup(byte_2_high, byte_2_high_table);
+	const		pg_u8x16_t lookup_1_high = lookup(byte_1_high, byte_1_high_table);
+	const		pg_u8x16_t lookup_1_low = lookup(byte_1_low, byte_1_low_table);
+	const		pg_u8x16_t lookup_2_high = lookup(byte_2_high, byte_2_high_table);
 
 	/*
 	 * AND all the lookups together. At this point, non-zero lanes in the
@@ -331,7 +333,7 @@ check_special_cases(const __m128i prev, const __m128i input)
 	 *
 	 * 3. the third continuation byte of a 4-byte character
 	 */
-	const		__m128i temp = bitwise_and(lookup_1_high, lookup_1_low);
+	const		pg_u8x16_t temp = bitwise_and(lookup_1_high, lookup_1_low);
 
 	return bitwise_and(temp, lookup_2_high);
 }
@@ -340,22 +342,22 @@ check_special_cases(const __m128i prev, const __m128i input)
  * Return a vector with lanes set to TWO_CONTS where we expect to find two
  * continuations in a row. These are valid only within 3- and 4-byte sequences.
  */
-static inline __m128i
-check_multibyte_lengths(const __m128i prev, const __m128i input)
+static inline pg_u8x16_t
+check_multibyte_lengths(const pg_u8x16_t prev, const pg_u8x16_t input)
 {
 	/*
 	 * Populate registers that contain the input shifted right by 2 and 3
 	 * bytes, filling in the left lanes from the previous input.
 	 */
-	const		__m128i input_shift2 = prev2(prev, input);
-	const		__m128i input_shift3 = prev3(prev, input);
+	const		pg_u8x16_t input_shift2 = prev2(prev, input);
+	const		pg_u8x16_t input_shift3 = prev3(prev, input);
 
 	/*
 	 * Constants for comparison. Any 3-byte lead is greater than
 	 * MAX_TWO_BYTE_LEAD, etc.
 	 */
-	const		__m128i max_lead2 = splat(MAX_TWO_BYTE_LEAD);
-	const		__m128i max_lead3 = splat(MAX_THREE_BYTE_LEAD);
+	const		pg_u8x16_t max_lead2 = splat(MAX_TWO_BYTE_LEAD);
+	const		pg_u8x16_t max_lead3 = splat(MAX_THREE_BYTE_LEAD);
 
 	/*
 	 * Look in the shifted registers for 3- or 4-byte leads. There is no
@@ -363,17 +365,17 @@ check_multibyte_lengths(const __m128i prev, const __m128i input)
 	 * signed comparison with zero. Any non-zero bytes in the result represent
 	 * valid leads.
 	 */
-	const		__m128i is_third_byte = saturating_sub(input_shift2, max_lead2);
-	const		__m128i is_fourth_byte = saturating_sub(input_shift3, max_lead3);
+	const		pg_u8x16_t is_third_byte = saturating_sub(input_shift2, max_lead2);
+	const		pg_u8x16_t is_fourth_byte = saturating_sub(input_shift3, max_lead3);
 
 	/* OR them together for easier comparison */
-	const		__m128i temp = bitwise_or(is_third_byte, is_fourth_byte);
+	const		pg_u8x16_t temp = bitwise_or(is_third_byte, is_fourth_byte);
 
 	/*
 	 * Set all bits in each 8-bit lane if the result is greater than zero.
 	 * Signed arithmetic is okay because the values are small.
 	 */
-	const		__m128i must23 = greater_than(temp, vzero());
+	const		pg_u8x16_t must23 = greater_than(temp, vzero());
 
 	/*
 	 * We want to compare with the result of check_special_cases() so apply a
@@ -385,20 +387,20 @@ check_multibyte_lengths(const __m128i prev, const __m128i input)
 
 /* set bits in the error vector where we find invalid UTF-8 input */
 static inline void
-check_utf8_bytes(const __m128i prev, const __m128i input, __m128i * error)
+check_utf8_bytes(const pg_u8x16_t prev, const pg_u8x16_t input, pg_u8x16_t * error)
 {
-	const		__m128i special_cases = check_special_cases(prev, input);
-	const		__m128i expect_two_conts = check_multibyte_lengths(prev, input);
+	const		pg_u8x16_t special_cases = check_special_cases(prev, input);
+	const		pg_u8x16_t expect_two_conts = check_multibyte_lengths(prev, input);
 
 	/* If the two cases are identical, this will be zero. */
-	const		__m128i result = bitwise_xor(expect_two_conts, special_cases);
+	const		pg_u8x16_t result = bitwise_xor(expect_two_conts, special_cases);
 
 	*error = bitwise_or(*error, result);
 }
 
 /* return false if a register is zero, true otherwise */
 static inline bool
-to_bool(const __m128i v)
+to_bool(const pg_u8x16_t v)
 {
 	/*
 	 * _mm_testz_si128 returns 1 if the bitwise AND of the two arguments is
@@ -409,25 +411,25 @@ to_bool(const __m128i v)
 
 /* set bits in the error vector where bytes in the input are zero */
 static inline void
-check_for_zeros(const __m128i v, __m128i * error)
+check_for_zeros(const pg_u8x16_t v, pg_u8x16_t * error)
 {
-	const		__m128i cmp = _mm_cmpeq_epi8(v, vzero());
+	const		pg_u8x16_t cmp = _mm_cmpeq_epi8(v, vzero());
 
 	*error = bitwise_or(*error, cmp);
 }
 
 /* vector version of IS_HIGHBIT_SET() */
 static inline bool
-is_highbit_set(const __m128i v)
+is_highbit_set(const pg_u8x16_t v)
 {
 	return _mm_movemask_epi8(v) != 0;
 }
 
 /* return non-zero if the input terminates with an incomplete code point */
-static inline __m128i
-is_incomplete(const __m128i v)
+static inline pg_u8x16_t
+is_incomplete(const pg_u8x16_t v)
 {
-	const		__m128i max_array =
+	const		pg_u8x16_t max_array =
 	vset(0xFF, 0xFF, 0xFF, 0xFF,
 		 0xFF, 0xFF, 0xFF, 0xFF,
 		 0xFF, 0xFF, 0xFF, 0xFF,
@@ -440,20 +442,20 @@ is_incomplete(const __m128i v)
  * See the comment in common/wchar.c under "multibyte sequence validators".
  */
 int
-pg_validate_utf8_sse42(const unsigned char *s, int len)
+pg_validate_utf8_simd(const unsigned char *s, int len)
 {
 	const unsigned char *start = s;
 	const int	orig_len = len;
-	__m128i		error = vzero();
-	__m128i		prev = vzero();
-	__m128i		prev_incomplete = vzero();
-	__m128i		input;
+	pg_u8x16_t		error = vzero();
+	pg_u8x16_t		prev = vzero();
+	pg_u8x16_t		prev_incomplete = vzero();
+	pg_u8x16_t		input;
 
 	/*
 	 * NB: This check must be strictly greater-than, otherwise an invalid byte
 	 * at the end might not get detected.
 	 */
-	while (len > sizeof(__m128i))
+	while (len > sizeof(pg_u8x16_t))
 	{
 		input = vload(s);
 
@@ -474,8 +476,8 @@ pg_validate_utf8_sse42(const unsigned char *s, int len)
 		}
 
 		prev = input;
-		s += sizeof(__m128i);
-		len -= sizeof(__m128i);
+		s += sizeof(pg_u8x16_t);
+		len -= sizeof(pg_u8x16_t);
 	}
 
 	/*
-- 
2.30.2