From d577ef9d9755e7ca4d3722c1a044381a81d66244 Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.mshk@gmail.com>
Date: Mon, 24 Oct 2022 14:07:09 +0900
Subject: [PATCH v27 1/9] Introduce helper SIMD functions for small byte arrays

vector8_min - helper for emulating ">=" semantics

vector8_highbit_mask - used to turn the result of a vector
comparison into a bitmask

Masahiko Sawada

Reviewed by Nathan Bossart, additional adjustments by me
Discussion: https://www.postgresql.org/message-id/CAD21AoDap240WDDdUDE0JMpCmuMMnGajrKrkCRxM7zn9Xk3JRA%40mail.gmail.com
---
 src/include/port/simd.h | 47 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/src/include/port/simd.h b/src/include/port/simd.h
index c836360d4b..350e2caaea 100644
--- a/src/include/port/simd.h
+++ b/src/include/port/simd.h
@@ -79,6 +79,7 @@ static inline bool vector8_has_le(const Vector8 v, const uint8 c);
 static inline bool vector8_is_highbit_set(const Vector8 v);
 #ifndef USE_NO_SIMD
 static inline bool vector32_is_highbit_set(const Vector32 v);
+static inline uint32 vector8_highbit_mask(const Vector8 v);
 #endif
 
 /* arithmetic operations */
@@ -96,6 +97,7 @@ static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
  */
 #ifndef USE_NO_SIMD
 static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
+static inline Vector8 vector8_min(const Vector8 v1, const Vector8 v2);
 static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
 #endif
 
@@ -299,6 +301,36 @@ vector32_is_highbit_set(const Vector32 v)
 }
 #endif							/* ! USE_NO_SIMD */
 
+/*
+ * Return a bitmask formed from the high-bit of each element.
+ */
+#ifndef USE_NO_SIMD
+static inline uint32
+vector8_highbit_mask(const Vector8 v)
+{
+#ifdef USE_SSE2
+	return (uint32) _mm_movemask_epi8(v);
+#elif defined(USE_NEON)
+	/*
+	 * Note: There is a faster way to do this, but it returns a uint64 and
+	 * and if the caller wanted to extract the bit position using CTZ,
+	 * it would have to divide that result by 4.
+	 */
+	static const uint8 mask[16] = {
+		1 << 0, 1 << 1, 1 << 2, 1 << 3,
+		1 << 4, 1 << 5, 1 << 6, 1 << 7,
+		1 << 0, 1 << 1, 1 << 2, 1 << 3,
+		1 << 4, 1 << 5, 1 << 6, 1 << 7,
+	  };
+
+	uint8x16_t masked = vandq_u8(vld1q_u8(mask), (uint8x16_t) vshrq_n_s8(v, 7));
+	uint8x16_t maskedhi = vextq_u8(masked, masked, 8);
+
+	return (uint32) vaddvq_u16((uint16x8_t) vzip1q_u8(masked, maskedhi));
+#endif
+}
+#endif							/* ! USE_NO_SIMD */
+
 /*
  * Return the bitwise OR of the inputs
  */
@@ -372,4 +404,19 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
 }
 #endif							/* ! USE_NO_SIMD */
 
+/*
+ * Given two vectors, return a vector with the minimum element of each.
+ */
+#ifndef USE_NO_SIMD
+static inline Vector8
+vector8_min(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+	return _mm_min_epu8(v1, v2);
+#elif defined(USE_NEON)
+	return vminq_u8(v1, v2);
+#endif
+}
+#endif							/* ! USE_NO_SIMD */
+
 #endif							/* SIMD_H */
-- 
2.39.1