From ced3f65d7445bdcca0628c4f5073b5657a81cd28 Mon Sep 17 00:00:00 2001 From: YANG Xudong Date: Tue, 8 Aug 2023 10:41:58 +0800 Subject: [PATCH v3] Add loongarch native checksum implementation. --- config/c-compiler.m4 | 29 ++++++++++++++ configure | 69 ++++++++++++++++++++++++++++---- configure.ac | 33 +++++++++++---- meson.build | 24 +++++++++++ src/include/pg_config.h.in | 3 ++ src/include/port/pg_crc32c.h | 9 +++++ src/port/meson.build | 3 ++ src/port/pg_crc32c_loongarch.c | 73 ++++++++++++++++++++++++++++++++++ 8 files changed, 228 insertions(+), 15 deletions(-) create mode 100644 src/port/pg_crc32c_loongarch.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5be8f0f08d..7777ad6e90 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -661,3 +661,32 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS + +# PGAC_LOONGARCH_CRC32C_INTRINSICS +# --------------------------- +# Check if the compiler supports the LoongArch CRCC instructions, using +# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, +# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w +# intrinsic functions. +# +# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics. +AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics])])dnl +AC_CACHE_CHECK( + [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w], + [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([], + [unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_loongarch_crc32c_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_LOONGARCH_CRC32C_INTRINSICS diff --git a/configure b/configure index 963fbbcf1e..c5eb2814f1 100755 --- a/configure +++ b/configure @@ -18047,6 +18047,47 @@ fi fi +# Check for LoongArch CRC intrinsics to do CRC calculations. +# +# Check if __builtin_loongarch_crcc_* intrinsics can be used +# with the default compiler flags. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w" >&5 +$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w... " >&6; } +if ${pgac_cv_loongarch_crc32c_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +int +main () +{ +unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_loongarch_crc32c_intrinsics=yes +else + pgac_cv_loongarch_crc32c_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics" >&5 +$as_echo "$pgac_cv_loongarch_crc32c_intrinsics" >&6; } +if test x"$pgac_cv_loongarch_crc32c_intrinsics" = x"yes"; then + pgac_loongarch_crc32c_intrinsics=yes +fi + + # Select CRC-32C implementation. @@ -18065,7 +18106,7 @@ fi # # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -18083,10 +18124,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + # LoongArch CRCC instructions. + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -18127,12 +18173,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } else + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + +$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 +$as_echo "LoongArch CRCC instructions" >&6; } + else $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } + fi fi fi fi diff --git a/configure.ac b/configure.ac index 5153b8b3fd..3011b8ea34 100644 --- a/configure.ac +++ b/configure.ac @@ -2099,6 +2099,12 @@ if test x"$pgac_armv8_crc32c_intrinsics" != x"yes"; then PGAC_ARMV8_CRC32C_INTRINSICS([-march=armv8-a+crc]) fi +# Check for LoongArch CRC intrinsics to do CRC calculations. +# +# Check if __builtin_loongarch_crcc_* intrinsics can be used +# with the default compiler flags. 
+PGAC_LOONGARCH_CRC32C_INTRINSICS() + AC_SUBST(CFLAGS_CRC) # Select CRC-32C implementation. @@ -2117,7 +2123,7 @@ AC_SUBST(CFLAGS_CRC) # # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -2135,10 +2141,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + # LoongArch CRCC instructions. + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -2166,9 +2177,15 @@ else PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) + else + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) + fi fi fi fi diff --git a/meson.build b/meson.build index 0a11efc97a..2acb204003 100644 --- a/meson.build +++ b/meson.build @@ -2065,6 +2065,30 @@ int main(void) cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1) have_optimized_crc = true endif + +elif host_cpu == 'loongarch64' + + prog = ''' +int main(void) +{ + unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + + /* return computed value, to prevent the above being optimized away */ + return crc == 0; +} +''' + + if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w', + args: test_c_args) + # Use LoongArch CRC instruction unconditionally + cdata.set('USE_LOONGARCH_CRC32C', 1) + have_optimized_crc = true + endif + endif if not have_optimized_crc diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index ee209d6d70..d8a2985567 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -714,6 +714,9 @@ /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */ #undef USE_LLVM +/* Define to 1 to use LoongArch CRCC instructions. 
*/
+#undef USE_LOONGARCH_CRC32C
+
 /* Define to 1 to build with LZ4 support. (--with-lz4) */
 #undef USE_LZ4
 
diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h
index 7f8779261c..d085f1dc00 100644
--- a/src/include/port/pg_crc32c.h
+++ b/src/include/port/pg_crc32c.h
@@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le
 
 extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len);
 
+#elif defined(USE_LOONGARCH_CRC32C)
+/* Use LoongArch CRCC instructions. */
+
+#define COMP_CRC32C(crc, data, len) \
+	((crc) = pg_comp_crc32c_loongarch((crc), (data), (len)))
+#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF)
+
+extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len);
+
 #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK)
 
 /*
diff --git a/src/port/meson.build b/src/port/meson.build
index 9d0cd93c43..deb354418d 100644
--- a/src/port/meson.build
+++ b/src/port/meson.build
@@ -92,6 +92,9 @@ replace_funcs_pos = [
   ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
   ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'],
 
+  # loongarch
+  ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'],
+
   # generic fallback
   ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'],
 ]
diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c
new file mode 100644
index 0000000000..2897920800
--- /dev/null
+++ b/src/port/pg_crc32c_loongarch.c
@@ -0,0 +1,73 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_crc32c_loongarch.c
+ *	  Compute CRC-32C checksum using LoongArch CRCC instructions
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_crc32c_loongarch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "c.h"
+
+#include "port/pg_crc32c.h"
+
+pg_crc32c
+pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len)
+{
+	const unsigned char *p = data;
+	const unsigned char *pend = p + len;
+
+	/*
+	 * LoongArch desktop and server chips support unaligned memory access by
+	 * default, but aligned access is significantly faster. Process leading
+	 * bytes so the loop below starts with a pointer aligned to eight bytes.
+	 */
+	if (!PointerIsAligned(p, uint16) &&
+		p + 1 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+		p += 1;
+	}
+	if (!PointerIsAligned(p, uint32) &&
+		p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (!PointerIsAligned(p, uint64) &&
+		p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+
+	/* Process eight bytes at a time, as far as we can. */
+	while (p + 8 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc);
+		p += 8;
+	}
+
+	/* Process remaining 0-7 bytes. */
+	if (p + 4 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc);
+		p += 4;
+	}
+	if (p + 2 <= pend)
+	{
+		crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc);
+		p += 2;
+	}
+	if (p < pend)
+	{
+		crc = __builtin_loongarch_crcc_w_b_w(*p, crc);
+	}
+
+	return crc;
+}
-- 
2.41.0
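
For reviewers without LoongArch hardware, here is a minimal standalone sketch (not part of the patch) of the CRC-32C semantics the new code plugs into: PostgreSQL's INIT_CRC32C starts at 0xFFFFFFFF, COMP_CRC32C accumulates, and FIN_CRC32C XORs with 0xFFFFFFFF. The __builtin_loongarch_crcc_* intrinsics only exist when targeting loongarch64, so the hypothetical crc32c_byte() helper below substitutes a portable bit-at-a-time loop over the reversed Castagnoli polynomial 0x82F63B78 and checks the standard test vector crc32c("123456789") = 0xE3069283. Any correct pg_comp_crc32c_* implementation, including pg_comp_crc32c_loongarch, must agree with it.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* One CRC-32C step per byte, reversed polynomial 0x82F63B78 (illustrative). */
static uint32_t
crc32c_byte(uint32_t crc, uint8_t b)
{
	crc ^= b;
	for (int i = 0; i < 8; i++)
		crc = (crc >> 1) ^ (0x82F63B78U & (0U - (crc & 1)));
	return crc;
}

int
main(void)
{
	const char *msg = "123456789";
	uint32_t	crc = 0xFFFFFFFFU;	/* mirrors INIT_CRC32C */

	for (size_t i = 0; i < strlen(msg); i++)
		crc = crc32c_byte(crc, (uint8_t) msg[i]);

	crc ^= 0xFFFFFFFFU;				/* mirrors FIN_CRC32C */
	printf("crc32c(\"123456789\") = 0x%08" PRIX32 " (expect 0xE3069283)\n", crc);
	return crc == 0xE3069283U ? 0 : 1;
}

On a loongarch64 build, swapping the crc32c_byte() call for crc = __builtin_loongarch_crcc_w_b_w((uint8_t) msg[i], crc), with the same operand order the patch uses, should print the same value; that is a cheap way to confirm the instruction computes the same polynomial the slicing-by-8 fallback does.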