From f9b62118024e018e50d58c99718d9500ab4c5f8a Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Fri, 20 Feb 2026 14:33:27 -0600 Subject: [PATCH v15 1/2] Remove uses of popcount builtins. This commit replaces the implementations of pg_popcount{32,64} with branchless ones in plain C. Newer versions of popular compilers will automatically replace these with more sophisticated population count instructions if possible, leaving us little reason to continue using the builtins. This also allows us to remove the remaining architecture-specific implementations of pg_popcount64(), which were only used by the corresponding architecture-specific implementations of pg_popcount() and pg_popcount_masked(). Since this commit removes the only uses of the popcount builtins, we can remove the corresponding configuration checks, too. Suggested-by: John Naylor Reviewed-by: John Naylor Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- config/c-compiler.m4 | 26 +++++++ configure | 119 +++++++++++++-------------------- configure.ac | 23 +++---- meson.build | 43 +++++++----- src/include/pg_config.h.in | 7 +- src/include/port/pg_bitutils.h | 56 +++++++--------- src/port/pg_bitutils.c | 4 +- src/port/pg_popcount_aarch64.c | 19 +----- src/port/pg_popcount_x86.c | 27 ++------ 9 files changed, 143 insertions(+), 181 deletions(-) diff --git a/config/c-compiler.m4 b/config/c-compiler.m4 index 1509dbfa2ab..93fd6b3b617 100644 --- a/config/c-compiler.m4 +++ b/config/c-compiler.m4 @@ -743,6 +743,32 @@ fi undefine([Ac_cachevar])dnl ])# PGAC_XSAVE_INTRINSICS +# PGAC_X86_POPCNT_INTRINSICS +# ---------------------- +# Check if the compiler supports the x86 POPCNT instructions, using the +# _popcnt64 intrinsic function. +# +# If the instrinsic is supported, sets pgac_x86_popcnt_intrinsics +AC_DEFUN([PGAC_X86_POPCNT_INTRINSICS], +[define([Ac_cachevar], [AS_TR_SH([pgac_cv_x86_popcnt_intrinsics])])dnl +AC_CACHE_CHECK([for _popcnt64], [Ac_cachevar], +[AC_LINK_IFELSE([AC_LANG_PROGRAM([#include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("popcnt"))) + #endif + static int x86_popcnt_test(void) + { + return _popcnt64(0); + }], + [return x86_popcnt_test();])], + [Ac_cachevar=yes], + [Ac_cachevar=no])]) +if test x"$Ac_cachevar" = x"yes"; then + pgac_x86_popcnt_intrinsics=yes +fi +undefine([Ac_cachevar])dnl +])# PGAC_X86_POPCNT_INTRINSICS + # PGAC_AVX512_POPCNT_INTRINSICS # ----------------------------- # Check if the compiler supports the AVX-512 popcount instructions using the diff --git a/configure b/configure index e1a08129974..a37b8d9676c 100755 --- a/configure +++ b/configure @@ -15256,40 +15256,6 @@ fi case $host_cpu in - x86_64) - # On x86_64, check if we can compile a popcntq instruction - { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether assembler supports x86_64 popcntq" >&5 -$as_echo_n "checking whether assembler supports x86_64 popcntq... " >&6; } -if ${pgac_cv_have_x86_64_popcntq+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -main () -{ -long long x = 1; long long r; - __asm__ __volatile__ (" popcntq %1,%0\n" : "=q"(r) : "rm"(x)); - ; - return 0; -} -_ACEOF -if ac_fn_c_try_compile "$LINENO"; then : - pgac_cv_have_x86_64_popcntq=yes -else - pgac_cv_have_x86_64_popcntq=no -fi -rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_have_x86_64_popcntq" >&5 -$as_echo "$pgac_cv_have_x86_64_popcntq" >&6; } - if test x"$pgac_cv_have_x86_64_popcntq" = xyes ; then - -$as_echo "#define HAVE_X86_64_POPCNTQ 1" >>confdefs.h - - fi - ;; ppc*|powerpc*) # On PPC, check if compiler accepts "i"(x) when __builtin_constant_p(x). { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether __builtin_constant_p(x) implies \"i\"(x) acceptance" >&5 @@ -15836,44 +15802,6 @@ cat >>confdefs.h <<_ACEOF #define HAVE__BUILTIN_CTZ 1 _ACEOF -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5 -$as_echo_n "checking for __builtin_popcount... " >&6; } -if ${pgac_cv__builtin_popcount+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -call__builtin_popcount(unsigned int x) -{ - return __builtin_popcount(x); -} -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__builtin_popcount=yes -else - pgac_cv__builtin_popcount=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_popcount" >&5 -$as_echo "$pgac_cv__builtin_popcount" >&6; } -if test x"${pgac_cv__builtin_popcount}" = xyes ; then - -cat >>confdefs.h <<_ACEOF -#define HAVE__BUILTIN_POPCOUNT 1 -_ACEOF - fi # __builtin_frame_address may draw a diagnostic for non-constant argument, # so it needs a different test function. @@ -17721,6 +17649,53 @@ $as_echo "#define HAVE_XSAVE_INTRINSICS 1" >>confdefs.h fi +# Check for x86 POPCNT intrinsics +# +if test x"$host_cpu" = x"x86_64"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _popcnt64" >&5 +$as_echo_n "checking for _popcnt64... " >&6; } +if ${pgac_cv_x86_popcnt_intrinsics+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + #if defined(__has_attribute) && __has_attribute (target) + __attribute__((target("popcnt"))) + #endif + static int x86_popcnt_test(void) + { + return _popcnt64(0); + } +int +main () +{ +return x86_popcnt_test(); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + pgac_cv_x86_popcnt_intrinsics=yes +else + pgac_cv_x86_popcnt_intrinsics=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv_x86_popcnt_intrinsics" >&5 +$as_echo "$pgac_cv_x86_popcnt_intrinsics" >&6; } +if test x"$pgac_cv_x86_popcnt_intrinsics" = x"yes"; then + pgac_x86_popcnt_intrinsics=yes +fi + + if test x"$pgac_x86_popcnt_intrinsics" = x"yes"; then + +$as_echo "#define HAVE_X86_POPCNT_INTRINSICS 1" >>confdefs.h + + fi +fi + # Check for AVX-512 popcount intrinsics # if test x"$host_cpu" = x"x86_64"; then diff --git a/configure.ac b/configure.ac index cc85c233c03..1594ee802b8 100644 --- a/configure.ac +++ b/configure.ac @@ -1746,19 +1746,6 @@ AC_CHECK_TYPES([struct option], [], [], #endif]) case $host_cpu in - x86_64) - # On x86_64, check if we can compile a popcntq instruction - AC_CACHE_CHECK([whether assembler supports x86_64 popcntq], - [pgac_cv_have_x86_64_popcntq], - [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], - [long long x = 1; long long r; - __asm__ __volatile__ (" popcntq %1,%0\n" : "=q"(r) : "rm"(x));])], - [pgac_cv_have_x86_64_popcntq=yes], - [pgac_cv_have_x86_64_popcntq=no])]) - if test x"$pgac_cv_have_x86_64_popcntq" = xyes ; then - AC_DEFINE(HAVE_X86_64_POPCNTQ, 1, [Define to 1 if the assembler supports X86_64's POPCNTQ instruction.]) - fi - ;; ppc*|powerpc*) # On PPC, check if compiler accepts "i"(x) when __builtin_constant_p(x). AC_CACHE_CHECK([whether __builtin_constant_p(x) implies "i"(x) acceptance], @@ -1851,7 +1838,6 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap64], [long int x]) # We assume that we needn't test all widths of these explicitly: PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x]) PGAC_CHECK_BUILTIN_FUNC([__builtin_ctz], [unsigned int x]) -PGAC_CHECK_BUILTIN_FUNC([__builtin_popcount], [unsigned int x]) # __builtin_frame_address may draw a diagnostic for non-constant argument, # so it needs a different test function. PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0]) @@ -2128,6 +2114,15 @@ if test x"$pgac_xsave_intrinsics" = x"yes"; then AC_DEFINE(HAVE_XSAVE_INTRINSICS, 1, [Define to 1 if you have XSAVE intrinsics.]) fi +# Check for x86 POPCNT intrinsics +# +if test x"$host_cpu" = x"x86_64"; then + PGAC_X86_POPCNT_INTRINSICS() + if test x"$pgac_x86_popcnt_intrinsics" = x"yes"; then + AC_DEFINE(HAVE_X86_POPCNT_INTRINSICS, 1, [Define to 1 if you have x86 POPCNT intrinsics.]) + fi +fi + # Check for AVX-512 popcount intrinsics # if test x"$host_cpu" = x"x86_64"; then diff --git a/meson.build b/meson.build index 055e96315d0..b44cfd3993a 100644 --- a/meson.build +++ b/meson.build @@ -2006,7 +2006,6 @@ builtins = [ 'ctz', 'constant_p', 'frame_address', - 'popcount', 'unreachable', ] @@ -2377,6 +2376,31 @@ int main(void) endif +############################################################### +# Check for the availability of x86 POPCNT intrinsics. +############################################################### + +if host_cpu == 'x86_64' + + prog = ''' +#include + +#if defined(__has_attribute) && __has_attribute (target) +__attribute__((target("popcnt"))) +#endif +int main(void) +{ + return _popcnt64(0); +} +''' + + if cc.links(prog, name: 'x86 POPCNT intrinsics', args: test_c_args) + cdata.set('HAVE_X86_POPCNT_INTRINSICS', 1) + endif + +endif + + ############################################################### # Check for the availability of AVX-512 popcount intrinsics. ############################################################### @@ -2641,22 +2665,7 @@ endif # Other CPU specific stuff ############################################################### -if host_cpu == 'x86_64' - - if cc.get_id() == 'msvc' - cdata.set('HAVE_X86_64_POPCNTQ', 1) - elif cc.compiles(''' - void main(void) - { - long long x = 1; long long r; - __asm__ __volatile__ (" popcntq %1,%0\n" : "=q"(r) : "rm"(x)); - }''', - name: '@0@: popcntq instruction'.format(host_cpu), - args: test_c_args) - cdata.set('HAVE_X86_64_POPCNTQ', 1) - endif - -elif host_cpu == 'ppc' or host_cpu == 'ppc64' +if host_cpu == 'ppc' or host_cpu == 'ppc64' # Check if compiler accepts "i"(x) when __builtin_constant_p(x). if cdata.has('HAVE__BUILTIN_CONSTANT_P') if cc.compiles(''' diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 3824a5571bb..2f6d291d9b2 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -493,8 +493,8 @@ /* Define to 1 if you have the `X509_get_signature_info' function. */ #undef HAVE_X509_GET_SIGNATURE_INFO -/* Define to 1 if the assembler supports X86_64's POPCNTQ instruction. */ -#undef HAVE_X86_64_POPCNTQ +/* Define to 1 if you have x86 POPCNT intrinsics. */ +#undef HAVE_X86_POPCNT_INTRINSICS /* Define to 1 if you have the header file. */ #undef HAVE_XLOCALE_H @@ -526,9 +526,6 @@ /* Define to 1 if your compiler understands __builtin_$op_overflow. */ #undef HAVE__BUILTIN_OP_OVERFLOW -/* Define to 1 if your compiler understands __builtin_popcount. */ -#undef HAVE__BUILTIN_POPCOUNT - /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 789663edd93..53df0594823 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -279,7 +279,7 @@ pg_ceil_log2_64(uint64 num) extern uint64 pg_popcount_portable(const char *buf, int bytes); extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask); -#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) +#if defined(HAVE_X86_POPCNT_INTRINSICS) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) /* * Attempt to use specialized CPU instructions, but perform a runtime check * first. @@ -297,51 +297,41 @@ extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mas /* * pg_popcount32 * Return the number of 1 bits set in word + * + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. + * + * Note that newer versions of popular compilers will automatically replace + * this with a special popcount instruction if possible, so there isn't much + * reason to use builtin functions or intrinsics. */ static inline int pg_popcount32(uint32 word) { -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ + word -= (word >> 1) & 0x55555555; + word = (word & 0x33333333) + ((word >> 2) & 0x33333333); + return (((word + (word >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; } /* * pg_popcount64 * Return the number of 1 bits set in word + * + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. + * + * Note that newer versions of popular compilers will automatically replace + * this with a special popcount instruction if possible, so there isn't much + * reason to use builtin functions or intrinsics. */ static inline int pg_popcount64(uint64 word) { -#ifdef HAVE__BUILTIN_POPCOUNT -#if SIZEOF_LONG == 8 - return __builtin_popcountl(word); -#elif SIZEOF_LONG_LONG == 8 - return __builtin_popcountll(word); -#else -#error "cannot find integer of the same size as uint64_t" -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ + word -= (word >> 1) & UINT64CONST(0x5555555555555555); + word = (word & UINT64CONST(0x3333333333333333)) + + ((word >> 2) & UINT64CONST(0x3333333333333333)); + word = (word + (word >> 4)) & UINT64CONST(0xf0f0f0f0f0f0f0f); + return (word * UINT64CONST(0x101010101010101)) >> 56; } /* diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index 49b130f1306..71327810119 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -162,7 +162,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) return popcnt; } -#if !defined(HAVE_X86_64_POPCNTQ) && !defined(USE_NEON) +#if !defined(HAVE_X86_POPCNT_INTRINSICS) && !defined(USE_NEON) /* * When special CPU instructions are not available, there's no point in using @@ -191,4 +191,4 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) return pg_popcount_masked_portable(buf, bytes, mask); } -#endif /* ! HAVE_X86_64_POPCNTQ && ! USE_NEON */ +#endif /* ! HAVE_X86_POPCNT_INTRINSICS && ! USE_NEON */ diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index f474ef45510..74f71593721 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -291,21 +291,6 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) #endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount64_neon - * Return number of 1 bits in word - */ -static inline int -pg_popcount64_neon(uint64 word) -{ - /* - * For some compilers, __builtin_popcountl() already emits Neon - * instructions. The line below should compile to the same code on those - * systems. - */ - return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word))); -} - /* * pg_popcount_neon * Returns number of 1 bits in buf @@ -373,7 +358,7 @@ pg_popcount_neon(const char *buf, int bytes) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64_neon(*((const uint64 *) buf)); + popcnt += pg_popcount64(*((const uint64 *) buf)); buf += sizeof(uint64); } @@ -455,7 +440,7 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64); + popcnt += pg_popcount64(*((const uint64 *) buf) & mask64); buf += sizeof(uint64); } diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 6bce089432f..45ade1ee37b 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -12,7 +12,7 @@ */ #include "c.h" -#ifdef HAVE_X86_64_POPCNTQ +#ifdef HAVE_X86_POPCNT_INTRINSICS #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT) #include @@ -314,28 +314,12 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount64_sse42 - * Return the number of 1 bits set in word - */ -static inline int -pg_popcount64_sse42(uint64 word) -{ -#ifdef _MSC_VER - return __popcnt64(word); -#else - uint64 res; - -__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif -} - /* * pg_popcount_sse42 * Returns the number of 1-bits in buf */ pg_attribute_no_sanitize_alignment() +pg_attribute_target("popcnt") static uint64 pg_popcount_sse42(const char *buf, int bytes) { @@ -344,7 +328,7 @@ pg_popcount_sse42(const char *buf, int bytes) while (bytes >= 8) { - popcnt += pg_popcount64_sse42(*words++); + popcnt += pg_popcount64(*words++); bytes -= 8; } @@ -362,6 +346,7 @@ pg_popcount_sse42(const char *buf, int bytes) * Returns the number of 1-bits in buf after applying the mask to each byte */ pg_attribute_no_sanitize_alignment() +pg_attribute_target("popcnt") static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask) { @@ -371,7 +356,7 @@ pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask) while (bytes >= 8) { - popcnt += pg_popcount64_sse42(*words++ & maskv); + popcnt += pg_popcount64(*words++ & maskv); bytes -= 8; } @@ -384,4 +369,4 @@ pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask) return popcnt; } -#endif /* HAVE_X86_64_POPCNTQ */ +#endif /* HAVE_X86_POPCNT_INTRINSICS */ -- 2.50.1 (Apple Git-155)