From b2329478b5331e2aa9942c7ee4e23e3bfa871c1d Mon Sep 17 00:00:00 2001 From: YANG Xudong Date: Fri, 16 Jun 2023 09:22:08 +0800 Subject: [PATCH v2] Add loongarch native checksum implementation. --- config/c-compiler.m4 | 34 ++++++++++++++++ configure | 73 ++++++++++++++++++++++++++++++---- configure.ac | 33 +++++++++++---- meson.build | 24 +++++++++++ src/include/pg_config.h.in | 3 ++ src/include/port/pg_crc32c.h | 9 +++++ src/port/meson.build | 3 ++ src/port/pg_crc32c_loongarch.c | 72 +++++++++++++++++++++++++++++++++ 8 files changed, 236 insertions(+), 15 deletions(-) create mode 100644 src/port/pg_crc32c_loongarch.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5be8f0f08d..eb3af009c4 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -661,3 +661,37 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS + +# PGAC_LOONGARCH_CRC32C_INTRINSICS +# --------------------------- +# Check if the compiler supports the LoongArch CRCC instructions, using +# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, +# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w +# intrinsic functions. +# +# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics, +# and CFLAGS_CRC. +AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics_$1])])dnl +AC_CACHE_CHECK( + [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=$1], + [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([], + [unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_CRC="$1" + pgac_loongarch_crc32c_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_LOONGARCH_CRC32C_INTRINSICS diff --git a/configure b/configure index 1b415142d1..7c60a26c11 100755 --- a/configure +++ b/configure @@ -18114,6 +18114,51 @@ fi fi +# Check for LoongArch CRC intrinsics to do CRC calculations. +# +# Check if __builtin_loongarch_crcc_* intrinsics can be used +# with the default compiler flags. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=" >&5 +$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=... " >&6; } +if ${pgac_cv_loongarch_crc32c_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_loongarch_crc32c_intrinsics_=yes +else + pgac_cv_loongarch_crc32c_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics_" >&5 +$as_echo "$pgac_cv_loongarch_crc32c_intrinsics_" >&6; } +if test x"$pgac_cv_loongarch_crc32c_intrinsics_" = x"yes"; then + CFLAGS_CRC="" + pgac_loongarch_crc32c_intrinsics=yes +fi + + # Select CRC-32C implementation. @@ -18132,7 +18177,7 @@ fi # # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -18150,10 +18195,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + # LoongArch CRCC instructions. + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -18194,12 +18244,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } else + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + +$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 +$as_echo "LoongArch CRCC instructions" >&6; } + else $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } + fi fi fi fi diff --git a/configure.ac b/configure.ac index 09558ada0f..d44e9f9ef8 100644 --- a/configure.ac +++ b/configure.ac @@ -2126,6 +2126,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc]) fi +# Check for LoongArch CRC intrinsics to do CRC calculations. +# +# Check if __builtin_loongarch_crcc_* intrinsics can be used +# with the default compiler flags. +PGAC_LOONGARCH_CRC32C_INTRINSICS([]) + AC_SUBST(CFLAGS_CRC) # Select CRC-32C implementation. @@ -2144,7 +2150,7 @@ AC_SUBST(CFLAGS_CRC) # # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -2162,10 +2168,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + # LoongArch CRCC instructions. + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -2193,9 +2204,15 @@ else PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) + else + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) + fi fi fi fi diff --git a/meson.build b/meson.build index 82f2782673..77a9b985df 100644 --- a/meson.build +++ b/meson.build @@ -2073,6 +2073,30 @@ int main(void) cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1) have_optimized_crc = true endif + +elif host_cpu == 'loongarch64' + + prog = ''' +int main(void) +{ + unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + + /* return computed value, to prevent the above being optimized away */ + return crc == 0; +} +''' + + if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w', + args: test_c_args) + # Use LoongArch CRC instruction unconditionally + cdata.set('USE_LOONGARCH_CRC32C', 1) + have_optimized_crc = true + endif + endif if not have_optimized_crc diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 6d572c3820..1f253566e4 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -724,6 +724,9 @@ /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */ #undef USE_LLVM +/* Define to 1 to use LoongArch CRCC instructions. */ +#undef USE_LOONGARCH_CRC32C + /* Define to 1 to build with LZ4 support. (--with-lz4) */ #undef USE_LZ4 diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 7f8779261c..d085f1dc00 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#elif defined(USE_LOONGARCH_CRC32C) +/* Use LoongArch CRCC instructions. */ + +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c_loongarch((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); + #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* diff --git a/src/port/meson.build b/src/port/meson.build index 24416b9bfc..3e77b2493a 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -92,6 +92,9 @@ replace_funcs_pos = [ ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], + # loongarch + ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], + # generic fallback ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c new file mode 100644 index 0000000000..b54fe12ff5 --- /dev/null +++ b/src/port/pg_crc32c_loongarch.c @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_loongarch.c + * Compute CRC-32C checksum using LoongArch CRCC instructions + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_crc32c_loongarch.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "port/pg_crc32c.h" + +pg_crc32c +pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len) +{ + const unsigned char *p = data; + const unsigned char *pend = p + len; + + /* + * Aligned memory access is significantly faster. + * Process leading bytes so that the loop below starts with a pointer aligned to eight bytes. + */ + if (!PointerIsAligned(p, uint16) && + p + 1 <= pend) + { + crc = __builtin_loongarch_crcc_w_b_w(*p, crc); + p += 1; + } + if (!PointerIsAligned(p, uint32) && + p + 2 <= pend) + { + crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc); + p += 2; + } + if (!PointerIsAligned(p, uint64) && + p + 4 <= pend) + { + crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc); + p += 4; + } + + /* Process eight bytes at a time, as far as we can. */ + while (p + 8 <= pend) + { + crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc); + p += 8; + } + + /* Process remaining 0-7 bytes. */ + if (p + 4 <= pend) + { + crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc); + p += 4; + } + if (p + 2 <= pend) + { + crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc); + p += 2; + } + if (p < pend) + { + crc = __builtin_loongarch_crcc_w_b_w(*p, crc); + } + + return crc; +} -- 2.41.0