From a497ba6958c3ef273a4496b1012e4d9e4beb51e4 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Fri, 23 Jan 2026 17:31:20 -0600 Subject: [PATCH v10 2/3] Remove specialized word-length popcount implementations. The uses of these functions do not justify the level of micro-optimization we've done and may even hurt performance in some cases (e.g., due to using function pointers). This commit removes all architecture-specific implementations of pg_popcount{32,64}() and converts the portable ones to inlined functions in pg_bitutils.h. These inlined versions should produce the same code as before (but inlined), so in theory this is a net gain for many machines. As an exception, for x86-64/gcc without sse4.2/popcnt, we use a plain C version to ensure inlining because __builtin_popcount() and __builtin_popcountl() generate function calls for that configuration. Our tests indicate this is still a net win. Suggested-by: John Naylor Reviewed-by: John Naylor Reviewed-by: Greg Burd Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- configure | 38 -------------------- configure.ac | 1 - meson.build | 1 - src/include/pg_config.h.in | 3 -- src/include/port/pg_bitutils.h | 59 +++++++++++++++++------------- src/port/pg_bitutils.c | 65 ++-------------------------------- src/port/pg_popcount_aarch64.c | 23 +++--------- src/port/pg_popcount_x86.c | 43 +--------------------- 8 files changed, 41 insertions(+), 192 deletions(-) diff --git a/configure b/configure index ba293931878..623aa397fae 100755 --- a/configure +++ b/configure @@ -15920,44 +15920,6 @@ cat >>confdefs.h <<_ACEOF #define HAVE__BUILTIN_CTZ 1 _ACEOF -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for __builtin_popcount" >&5 -$as_echo_n "checking for __builtin_popcount... " >&6; } -if ${pgac_cv__builtin_popcount+:} false; then : - $as_echo_n "(cached) " >&6 -else - cat confdefs.h - <<_ACEOF >conftest.$ac_ext -/* end confdefs.h. */ - -int -call__builtin_popcount(unsigned int x) -{ - return __builtin_popcount(x); -} -int -main () -{ - - ; - return 0; -} -_ACEOF -if ac_fn_c_try_link "$LINENO"; then : - pgac_cv__builtin_popcount=yes -else - pgac_cv__builtin_popcount=no -fi -rm -f core conftest.err conftest.$ac_objext \ - conftest$ac_exeext conftest.$ac_ext -fi -{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $pgac_cv__builtin_popcount" >&5 -$as_echo "$pgac_cv__builtin_popcount" >&6; } -if test x"${pgac_cv__builtin_popcount}" = xyes ; then - -cat >>confdefs.h <<_ACEOF -#define HAVE__BUILTIN_POPCOUNT 1 -_ACEOF - fi # __builtin_frame_address may draw a diagnostic for non-constant argument, # so it needs a different test function. diff --git a/configure.ac b/configure.ac index 412fe358a2f..04c6a75bff7 100644 --- a/configure.ac +++ b/configure.ac @@ -1853,7 +1853,6 @@ PGAC_CHECK_BUILTIN_FUNC([__builtin_bswap64], [long int x]) # We assume that we needn't test all widths of these explicitly: PGAC_CHECK_BUILTIN_FUNC([__builtin_clz], [unsigned int x]) PGAC_CHECK_BUILTIN_FUNC([__builtin_ctz], [unsigned int x]) -PGAC_CHECK_BUILTIN_FUNC([__builtin_popcount], [unsigned int x]) # __builtin_frame_address may draw a diagnostic for non-constant argument, # so it needs a different test function. PGAC_CHECK_BUILTIN_FUNC_PTR([__builtin_frame_address], [0]) diff --git a/meson.build b/meson.build index 0722b16927e..c607d8ac69a 100644 --- a/meson.build +++ b/meson.build @@ -2004,7 +2004,6 @@ builtins = [ 'ctz', 'constant_p', 'frame_address', - 'popcount', 'unreachable', ] diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c089f2252c3..301328b8cd3 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -530,9 +530,6 @@ /* Define to 1 if your compiler understands __builtin_$op_overflow. */ #undef HAVE__BUILTIN_OP_OVERFLOW -/* Define to 1 if your compiler understands __builtin_popcount. */ -#undef HAVE__BUILTIN_POPCOUNT - /* Define to 1 if your compiler understands __builtin_types_compatible_p. */ #undef HAVE__BUILTIN_TYPES_COMPATIBLE_P diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 20c11b79c61..c9b1f5f17dc 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -276,47 +276,56 @@ pg_ceil_log2_64(uint64 num) return pg_leftmost_one_pos64(num - 1) + 1; } -extern int pg_popcount32_portable(uint32 word); -extern int pg_popcount64_portable(uint64 word); extern uint64 pg_popcount_portable(const char *buf, int bytes); extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask); -#ifdef HAVE_X86_64_POPCNTQ +#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) /* - * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check + * Attempt to use specialized CPU instructions, but perform a runtime check * first. */ -extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); -extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); -#elif defined(USE_NEON) -/* Use the Neon version of pg_popcount{32,64} without function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); - -/* - * We can try to use an SVE-optimized pg_popcount() on some systems For that, - * we do use a function pointer. - */ -#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK -extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); -extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); -#else -extern uint64 pg_popcount_optimized(const char *buf, int bytes); -extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); -#endif - #else /* Use a portable implementation -- no need for a function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); extern uint64 pg_popcount_optimized(const char *buf, int bytes); extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); #endif +/* + * pg_popcount32 + * Return the number of 1 bits set in word + * + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. + */ +static inline int +pg_popcount32(uint32 word) +{ + word -= (word >> 1) & 0x55555555; + word = (word & 0x33333333) + ((word >> 2) & 0x33333333); + return (((word + (word >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +} + +/* + * pg_popcount64 + * Return the number of 1 bits set in word + * + * Adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. + */ +static inline int +pg_popcount64(uint64 word) +{ + word -= (word >> 1) & UINT64CONST(0x5555555555555555); + word = (word & UINT64CONST(0x3333333333333333)) + + ((word >> 2) & UINT64CONST(0x3333333333333333)); + word = (word + (word >> 4)) & UINT64CONST(0xf0f0f0f0f0f0f0f); + return (word * UINT64CONST(0x101010101010101)) >> 56; +} + /* * Returns the number of 1-bits in buf. * diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index bec06c06fc3..49b130f1306 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -96,56 +96,6 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; -/* - * pg_popcount32_portable - * Return the number of 1 bits set in word - */ -int -pg_popcount32_portable(uint32 word) -{ -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ -} - -/* - * pg_popcount64_portable - * Return the number of 1 bits set in word - */ -int -pg_popcount64_portable(uint64 word) -{ -#ifdef HAVE__BUILTIN_POPCOUNT -#if SIZEOF_LONG == 8 - return __builtin_popcountl(word); -#elif SIZEOF_LONG_LONG == 8 - return __builtin_popcountll(word); -#else -#error "cannot find integer of the same size as uint64_t" -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ -} - /* * pg_popcount_portable * Returns the number of 1-bits in buf @@ -163,7 +113,7 @@ pg_popcount_portable(const char *buf, int bytes) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++); + popcnt += pg_popcount64(*words++); bytes -= 8; } @@ -197,7 +147,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++ & maskv); + popcnt += pg_popcount64(*words++ & maskv); bytes -= 8; } @@ -220,17 +170,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) * actual external functions. The compiler should be able to inline the * portable versions here. */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount32_portable(word); -} - -int -pg_popcount64(uint64 word) -{ - return pg_popcount64_portable(word); -} /* * pg_popcount_optimized diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index ba57f2cd4bd..357e938549f 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -291,28 +291,13 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) #endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount32 - * Return number of 1 bits in word - */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount64((uint64) word); -} - /* * pg_popcount64 * Return number of 1 bits in word */ -int -pg_popcount64(uint64 word) +static inline int +pg_popcount64_neon(uint64 word) { - /* - * For some compilers, __builtin_popcountl() already emits Neon - * instructions. The line below should compile to the same code on those - * systems. - */ return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word))); } @@ -383,7 +368,7 @@ pg_popcount_neon(const char *buf, int bytes) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64(*((const uint64 *) buf)); + popcnt += pg_popcount64_neon(*((const uint64 *) buf)); buf += sizeof(uint64); } @@ -465,7 +450,7 @@ pg_popcount_masked_neon(const char *buf, int bytes, bits8 mask) */ for (; bytes >= sizeof(uint64); bytes -= sizeof(uint64)) { - popcnt += pg_popcount64(*((const uint64 *) buf) & mask64); + popcnt += pg_popcount64_neon(*((const uint64 *) buf) & mask64); buf += sizeof(uint64); } diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 7aebf69898b..6bce089432f 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -36,8 +36,6 @@ * operation, but in practice this is close enough, and "sse42" seems easier to * follow than "popcnt" for these names. */ -static inline int pg_popcount32_sse42(uint32 word); -static inline int pg_popcount64_sse42(uint64 word); static uint64 pg_popcount_sse42(const char *buf, int bytes); static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask); @@ -55,12 +53,8 @@ static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); * what the current CPU supports) and then will call the pointer to fulfill the * caller's request. */ -static int pg_popcount32_choose(uint32 word); -static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); -int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; -int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; @@ -157,7 +151,7 @@ pg_popcount_avx512_available(void) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ /* - * These functions get called on the first call to pg_popcount32 etc. + * These functions get called on the first call to pg_popcount(), etc. * They detect whether we can use the asm implementations, and replace * the function pointers so that subsequent calls are routed directly to * the chosen implementation. @@ -167,15 +161,11 @@ choose_popcount_functions(void) { if (pg_popcount_sse42_available()) { - pg_popcount32 = pg_popcount32_sse42; - pg_popcount64 = pg_popcount64_sse42; pg_popcount_optimized = pg_popcount_sse42; pg_popcount_masked_optimized = pg_popcount_masked_sse42; } else { - pg_popcount32 = pg_popcount32_portable; - pg_popcount64 = pg_popcount64_portable; pg_popcount_optimized = pg_popcount_portable; pg_popcount_masked_optimized = pg_popcount_masked_portable; } @@ -189,20 +179,6 @@ choose_popcount_functions(void) #endif } -static int -pg_popcount32_choose(uint32 word) -{ - choose_popcount_functions(); - return pg_popcount32(word); -} - -static int -pg_popcount64_choose(uint64 word) -{ - choose_popcount_functions(); - return pg_popcount64(word); -} - static uint64 pg_popcount_choose(const char *buf, int bytes) { @@ -338,23 +314,6 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount32_sse42 - * Return the number of 1 bits set in word - */ -static inline int -pg_popcount32_sse42(uint32 word) -{ -#ifdef _MSC_VER - return __popcnt(word); -#else - uint32 res; - -__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif -} - /* * pg_popcount64_sse42 * Return the number of 1 bits set in word -- 2.50.1 (Apple Git-155)