From 6ccee980a42f1706094a51fa146c55edc3adfbb1 Mon Sep 17 00:00:00 2001 From: YANG Xudong Date: Thu, 8 Jun 2023 13:22:03 +0800 Subject: [PATCH] Add loongarch native checksum implementation. --- config/c-compiler.m4 | 34 ++++++++++ configure | 118 +++++++++++++++++++++++++++++++-- configure.ac | 39 ++++++++--- meson.build | 29 ++++++++ src/include/pg_config.h.in | 3 + src/include/port/pg_crc32c.h | 9 +++ src/port/Makefile | 5 ++ src/port/meson.build | 3 + src/port/pg_crc32c_loongarch.c | 72 ++++++++++++++++++++ 9 files changed, 297 insertions(+), 15 deletions(-) create mode 100644 src/port/pg_crc32c_loongarch.c diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 5be8f0f08d..eb3af009c4 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -661,3 +661,37 @@ if test x"$Ac_cachevar" = x"yes"; then fi undefine([Ac_cachevar])dnl ])# PGAC_ARMV8_CRC32C_INTRINSICS + +# PGAC_LOONGARCH_CRC32C_INTRINSICS +# --------------------------- +# Check if the compiler supports the LoongArch CRCC instructions, using +# __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, +# __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w +# intrinsic functions. +# +# If the intrinsics are supported, sets pgac_loongarch_crc32c_intrinsics, +# and CFLAGS_CRC. +AC_DEFUN([PGAC_LOONGARCH_CRC32C_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_loongarch_crc32c_intrinsics_$1])])dnl +AC_CACHE_CHECK( + [for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=$1], + [Ac_cachevar], +[pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS $1" +AC_LINK_IFELSE([AC_LANG_PROGRAM([], + [unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0;])], + [Ac_cachevar=yes], + [Ac_cachevar=no]) +CFLAGS="$pgac_save_CFLAGS"]) +if test x"$Ac_cachevar" = x"yes"; then + CFLAGS_CRC="$1" + pgac_loongarch_crc32c_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_LOONGARCH_CRC32C_INTRINSICS diff --git a/configure b/configure index 1b415142d1..c5d8c2e4f6 100755 --- a/configure +++ b/configure @@ -18116,6 +18116,96 @@ fi +# Check for LoongArch CRC intrinsics to do CRC calculations. +# +# Check if __builtin_loongarch_crcc_* intrinsics can be used +# with the default compiler flags. +# CFLAGS_CRC is set if the extra flag is required. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=" >&5 +$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=... " >&6; } +if ${pgac_cv_loongarch_crc32c_intrinsics_+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS " +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_loongarch_crc32c_intrinsics_=yes +else + pgac_cv_loongarch_crc32c_intrinsics_=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics_" >&5 +$as_echo "$pgac_cv_loongarch_crc32c_intrinsics_" >&6; } +if test x"$pgac_cv_loongarch_crc32c_intrinsics_" = x"yes"; then + CFLAGS_CRC="" + pgac_loongarch_crc32c_intrinsics=yes +fi + +if test x"$pgac_loongarch_crc32c_intrinsics" != x"yes"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=-march=loongarch64" >&5 +$as_echo_n "checking for __builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w and __builtin_loongarch_crcc_w_d_w with CFLAGS=-march=loongarch64... " >&6; } +if ${pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64+:} false; then : + $as_echo_n "(cached) " >&6 +else + pgac_save_CFLAGS=$CFLAGS +CFLAGS="$pgac_save_CFLAGS -march=loongarch64" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + /* return computed value, to prevent the above being optimized away */ + return crc == 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64=yes +else + pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +CFLAGS="$pgac_save_CFLAGS" +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" >&5 +$as_echo "$pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" >&6; } +if test x"$pgac_cv_loongarch_crc32c_intrinsics__march_loongarch64" = x"yes"; then + CFLAGS_CRC="-march=loongarch64" + pgac_loongarch_crc32c_intrinsics=yes +fi + +fi + + + # Select CRC-32C implementation. # # If we are targeting a processor that has Intel SSE 4.2 instructions, we can @@ -18132,7 +18222,7 @@ fi # # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -18150,10 +18240,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + # LoongArch CRCC instructions. + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -18194,12 +18289,21 @@ $as_echo "#define USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK 1" >>confdefs.h { $as_echo "$as_me:${as_lineno-$LINENO}: result: ARMv8 CRC instructions with runtime check" >&5 $as_echo "ARMv8 CRC instructions with runtime check" >&6; } else + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + +$as_echo "#define USE_LOONGARCH_CRC32C 1" >>confdefs.h + + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: LoongArch CRCC instructions" >&5 +$as_echo "LoongArch CRCC instructions" >&6; } + else $as_echo "#define USE_SLICING_BY_8_CRC32C 1" >>confdefs.h - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: slicing-by-8" >&5 $as_echo "slicing-by-8" >&6; } + fi fi fi fi diff --git a/configure.ac b/configure.ac index 09558ada0f..5dcb1c4ab9 100644 --- a/configure.ac +++ b/configure.ac @@ -2128,6 +2128,18 @@ fi AC_SUBST(CFLAGS_CRC) +# Check for LoongArch CRC intrinsics to do CRC calculations. +# +# Check if __builtin_loongarch_crcc_* intrinsics can be used +# with the default compiler flags. +# CFLAGS_CRC is set if the extra flag is required. +PGAC_LOONGARCH_CRC32C_INTRINSICS([]) +if test x"$pgac_loongarch_crc32c_intrinsics" != x"yes"; then + PGAC_LOONGARCH_CRC32C_INTRINSICS([-march=loongarch64]) +fi + +AC_SUBST(CFLAGS_CRC) + # Select CRC-32C implementation. # # If we are targeting a processor that has Intel SSE 4.2 instructions, we can @@ -2144,7 +2156,7 @@ AC_SUBST(CFLAGS_CRC) # # You can override this logic by setting the appropriate USE_*_CRC32 flag to 1 # in the template or configure command line. -if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x""; then +if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && test x"$USE_SSE42_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_ARMV8_CRC32C" = x"" && test x"$USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK" = x"" && test x"$USE_LOONGARCH_CRC32C" = x""; then # Use Intel SSE 4.2 if available. if test x"$pgac_sse42_crc32_intrinsics" = x"yes" && test x"$SSE4_2_TARGETED" = x"1" ; then USE_SSE42_CRC32C=1 @@ -2162,10 +2174,15 @@ if test x"$USE_SLICING_BY_8_CRC32C" = x"" && test x"$USE_SSE42_CRC32C" = x"" && if test x"$pgac_armv8_crc32c_intrinsics" = x"yes"; then USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK=1 else - # fall back to slicing-by-8 algorithm, which doesn't require any - # special CPU support. - USE_SLICING_BY_8_CRC32C=1 - fi + if test x"$pgac_loongarch_crc32c_intrinsics" = x"yes"; then + # LoongArch CRCC instructions. + USE_LOONGARCH_CRC32C=1 + else + # fall back to slicing-by-8 algorithm, which doesn't require any + # special CPU support. + USE_SLICING_BY_8_CRC32C=1 + fi + fi fi fi fi @@ -2193,9 +2210,15 @@ else PG_CRC32C_OBJS="pg_crc32c_armv8.o pg_crc32c_sb8.o pg_crc32c_armv8_choose.o" AC_MSG_RESULT(ARMv8 CRC instructions with runtime check) else - AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) - PG_CRC32C_OBJS="pg_crc32c_sb8.o" - AC_MSG_RESULT(slicing-by-8) + if test x"$USE_LOONGARCH_CRC32C" = x"1"; then + AC_DEFINE(USE_LOONGARCH_CRC32C, 1, [Define to 1 to use LoongArch CRCC instructions.]) + PG_CRC32C_OBJS="pg_crc32c_loongarch.o" + AC_MSG_RESULT(LoongArch CRCC instructions) + else + AC_DEFINE(USE_SLICING_BY_8_CRC32C, 1, [Define to 1 to use software CRC-32C implementation (slicing-by-8).]) + PG_CRC32C_OBJS="pg_crc32c_sb8.o" + AC_MSG_RESULT(slicing-by-8) + fi fi fi fi diff --git a/meson.build b/meson.build index 16b2e86646..5de8a85c94 100644 --- a/meson.build +++ b/meson.build @@ -2073,6 +2073,35 @@ int main(void) cdata.set('USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK', 1) have_optimized_crc = true endif + +elif host_cpu == 'loongarch64' + + prog = ''' +int main(void) +{ + unsigned int crc = 0; + crc = __builtin_loongarch_crcc_w_b_w(0, crc); + crc = __builtin_loongarch_crcc_w_h_w(0, crc); + crc = __builtin_loongarch_crcc_w_w_w(0, crc); + crc = __builtin_loongarch_crcc_w_d_w(0, crc); + + /* return computed value, to prevent the above being optimized away */ + return crc == 0; +} +''' + + if cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w without -march=loongarch64', + args: test_c_args) + # Use LoongArch CRC instruction unconditionally + cdata.set('USE_LOONGARCH_CRC32C', 1) + have_optimized_crc = true + elif cc.links(prog, name: '__builtin_loongarch_crcc_w_b_w, __builtin_loongarch_crcc_w_h_w, __builtin_loongarch_crcc_w_w_w, and __builtin_loongarch_crcc_w_d_w with -march=loongarch64', + args: test_c_args + ['-march=loongarch64']) + # Use LoongArch CRC instruction unconditionally + cdata.set('USE_LOONGARCH_CRC32C', 1) + have_optimized_crc = true + endif + endif if not have_optimized_crc diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 6d572c3820..1f253566e4 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -724,6 +724,9 @@ /* Define to 1 to build with LLVM based JIT support. (--with-llvm) */ #undef USE_LLVM +/* Define to 1 to use LoongArch CRCC instructions. */ +#undef USE_LOONGARCH_CRC32C + /* Define to 1 to build with LZ4 support. (--with-lz4) */ #undef USE_LZ4 diff --git a/src/include/port/pg_crc32c.h b/src/include/port/pg_crc32c.h index 7f8779261c..d085f1dc00 100644 --- a/src/include/port/pg_crc32c.h +++ b/src/include/port/pg_crc32c.h @@ -58,6 +58,15 @@ extern pg_crc32c pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t le extern pg_crc32c pg_comp_crc32c_armv8(pg_crc32c crc, const void *data, size_t len); +#elif defined(USE_LOONGARCH_CRC32C) +/* Use LoongArch CRCC instructions. */ + +#define COMP_CRC32C(crc, data, len) \ + ((crc) = pg_comp_crc32c_loongarch((crc), (data), (len))) +#define FIN_CRC32C(crc) ((crc) ^= 0xFFFFFFFF) + +extern pg_crc32c pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len); + #elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK) || defined(USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK) /* diff --git a/src/port/Makefile b/src/port/Makefile index 711f59e32b..1933b0e750 100644 --- a/src/port/Makefile +++ b/src/port/Makefile @@ -98,6 +98,11 @@ pg_crc32c_armv8.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_shlib.o: CFLAGS+=$(CFLAGS_CRC) pg_crc32c_armv8_srv.o: CFLAGS+=$(CFLAGS_CRC) +# all versions of pg_crc32c_loongarch.o need CFLAGS_CRC +pg_crc32c_loongarch.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_loongarch_shlib.o: CFLAGS+=$(CFLAGS_CRC) +pg_crc32c_loongarch_srv.o: CFLAGS+=$(CFLAGS_CRC) + # # Shared library versions of object files # diff --git a/src/port/meson.build b/src/port/meson.build index 24416b9bfc..3e77b2493a 100644 --- a/src/port/meson.build +++ b/src/port/meson.build @@ -92,6 +92,9 @@ replace_funcs_pos = [ ['pg_crc32c_armv8_choose', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], ['pg_crc32c_sb8', 'USE_ARMV8_CRC32C_WITH_RUNTIME_CHECK'], + # loongarch + ['pg_crc32c_loongarch', 'USE_LOONGARCH_CRC32C'], + # generic fallback ['pg_crc32c_sb8', 'USE_SLICING_BY_8_CRC32C'], ] diff --git a/src/port/pg_crc32c_loongarch.c b/src/port/pg_crc32c_loongarch.c new file mode 100644 index 0000000000..b54fe12ff5 --- /dev/null +++ b/src/port/pg_crc32c_loongarch.c @@ -0,0 +1,72 @@ +/*------------------------------------------------------------------------- + * + * pg_crc32c_loongarch.c + * Compute CRC-32C checksum using LoongArch CRCC instructions + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/port/pg_crc32c_loongarch.c + * + *------------------------------------------------------------------------- + */ +#include "c.h" + +#include "port/pg_crc32c.h" + +pg_crc32c +pg_comp_crc32c_loongarch(pg_crc32c crc, const void *data, size_t len) +{ + const unsigned char *p = data; + const unsigned char *pend = p + len; + + /* + * Aligned memory access is significantly faster. + * Process leading bytes so that the loop below starts with a pointer aligned to eight bytes. + */ + if (!PointerIsAligned(p, uint16) && + p + 1 <= pend) + { + crc = __builtin_loongarch_crcc_w_b_w(*p, crc); + p += 1; + } + if (!PointerIsAligned(p, uint32) && + p + 2 <= pend) + { + crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc); + p += 2; + } + if (!PointerIsAligned(p, uint64) && + p + 4 <= pend) + { + crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc); + p += 4; + } + + /* Process eight bytes at a time, as far as we can. */ + while (p + 8 <= pend) + { + crc = __builtin_loongarch_crcc_w_d_w(*(uint64 *) p, crc); + p += 8; + } + + /* Process remaining 0-7 bytes. */ + if (p + 4 <= pend) + { + crc = __builtin_loongarch_crcc_w_w_w(*(uint32 *) p, crc); + p += 4; + } + if (p + 2 <= pend) + { + crc = __builtin_loongarch_crcc_w_h_w(*(uint16 *) p, crc); + p += 2; + } + if (p < pend) + { + crc = __builtin_loongarch_crcc_w_b_w(*p, crc); + } + + return crc; +} -- 2.41.0