From 258be25552ecce2a4fca86c071500d9596f861fe Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Fri, 23 Jan 2026 17:31:20 -0600 Subject: [PATCH v8 2/3] Remove specialized word-length popcount implementations. The uses of these functions do not justify the level of micro-optimization we've done and may even hurt performance in some cases (e.g., due to using function pointers). This commit removes all architecture-specific implementations of pg_popcount{32,64}() and converts the portable ones to inlined functions in pg_bitutils.h. These inlined versions should produce the same code as before (but inlined), so in theory this is a net gain for many machines. As an exception, for x86-64/gcc without sse4.2/popcnt, we use a plain C version to ensure inlining because __builtin_popcount() and __builtin_popcountl() generate function calls for that configuration. Our tests indicate this is still a net win. Suggested-by: John Naylor Reviewed-by: John Naylor Reviewed-by: Greg Burd Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- src/include/port/pg_bitutils.h | 83 ++++++++++++++++++++++++---------- src/port/pg_bitutils.c | 65 +------------------------- src/port/pg_popcount_aarch64.c | 25 ---------- src/port/pg_popcount_x86.c | 43 +----------------- 4 files changed, 63 insertions(+), 153 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index c3049d71894..08b9abf5fe7 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -276,46 +276,83 @@ pg_ceil_log2_64(uint64 num) return pg_leftmost_one_pos64(num - 1) + 1; } -extern int pg_popcount32_portable(uint32 word); -extern int pg_popcount64_portable(uint64 word); extern uint64 pg_popcount_portable(const char *buf, int bytes); extern uint64 pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask); -#ifdef HAVE_X86_64_POPCNTQ +#if defined(HAVE_X86_64_POPCNTQ) || defined(USE_SVE_POPCNT_WITH_RUNTIME_CHECK) /* - * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check + * Attempt to use specialized CPU instructions, but perform a runtime check * first. */ -extern PGDLLIMPORT int (*pg_popcount32) (uint32 word); -extern PGDLLIMPORT int (*pg_popcount64) (uint64 word); extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); -#elif defined(USE_NEON) -/* Use the Neon version of pg_popcount{32,64} without function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); - -/* - * We can try to use an SVE-optimized pg_popcount() on some systems For that, - * we do use a function pointer. - */ -#ifdef USE_SVE_POPCNT_WITH_RUNTIME_CHECK -extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes); -extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask); #else +/* Use a portable implementation -- no need for a function pointer. */ extern uint64 pg_popcount_optimized(const char *buf, int bytes); extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); + #endif +/* + * pg_popcount32 + * Return the number of 1 bits set in word + * + * Plain C version adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. + */ +static inline int +pg_popcount32(uint32 word) +{ + /* + * On x86, gcc generates a function call for this built-in unless the + * popcnt instruction is available, so we use the plain C version in that + * case to ensure inlining. + */ +#if defined(HAVE__BUILTIN_POPCOUNT) && (defined(__POPCNT__) || !defined(__x86_64__)) + return __builtin_popcount(word); +#elif defined(_MSC_VER) + return __popcnt(word); #else -/* Use a portable implementation -- no need for a function pointer. */ -extern int pg_popcount32(uint32 word); -extern int pg_popcount64(uint64 word); -extern uint64 pg_popcount_optimized(const char *buf, int bytes); -extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask); + word -= (word >> 1) & 0x55555555; + word = (word & 0x33333333) + ((word >> 2) & 0x33333333); + return ((word + (word >> 4) & 0xf0f0f0f) * 0x1010101) >> 24; +#endif +} +/* + * pg_popcount64 + * Return the number of 1 bits set in word + * + * Plain C version adapted from + * https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel. + */ +static inline int +pg_popcount64(uint64 word) +{ + /* + * On x86, gcc generates a function call for this built-in unless the + * popcnt instruction is available, so we use the plain C version in that + * case to ensure inlining. + */ +#if defined(HAVE__BUILTIN_POPCOUNT) && (defined(__POPCNT__) || !defined(__x86_64__)) +#if SIZEOF_LONG == 8 + return __builtin_popcountl(word); +#elif SIZEOF_LONG_LONG == 8 + return __builtin_popcountll(word); +#else +#error "cannot find integer of the same size as uint64_t" #endif +#elif defined(_MSC_VER) + return __popcnt64(word); +#else + word -= (word >> 1) & UINT64CONST(0x5555555555555555); + word = (word & UINT64CONST(0x3333333333333333)) + + ((word >> 2) & UINT64CONST(0x3333333333333333)); + word = (word + (word >> 4)) & UINT64CONST(0xf0f0f0f0f0f0f0f); + return (word * UINT64CONST(0x101010101010101)) >> 56; +#endif +} /* * Returns the number of 1-bits in buf. diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index bec06c06fc3..49b130f1306 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -96,56 +96,6 @@ const uint8 pg_number_of_ones[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 }; -/* - * pg_popcount32_portable - * Return the number of 1 bits set in word - */ -int -pg_popcount32_portable(uint32 word) -{ -#ifdef HAVE__BUILTIN_POPCOUNT - return __builtin_popcount(word); -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ -} - -/* - * pg_popcount64_portable - * Return the number of 1 bits set in word - */ -int -pg_popcount64_portable(uint64 word) -{ -#ifdef HAVE__BUILTIN_POPCOUNT -#if SIZEOF_LONG == 8 - return __builtin_popcountl(word); -#elif SIZEOF_LONG_LONG == 8 - return __builtin_popcountll(word); -#else -#error "cannot find integer of the same size as uint64_t" -#endif -#else /* !HAVE__BUILTIN_POPCOUNT */ - int result = 0; - - while (word != 0) - { - result += pg_number_of_ones[word & 255]; - word >>= 8; - } - - return result; -#endif /* HAVE__BUILTIN_POPCOUNT */ -} - /* * pg_popcount_portable * Returns the number of 1-bits in buf @@ -163,7 +113,7 @@ pg_popcount_portable(const char *buf, int bytes) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++); + popcnt += pg_popcount64(*words++); bytes -= 8; } @@ -197,7 +147,7 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) while (bytes >= 8) { - popcnt += pg_popcount64_portable(*words++ & maskv); + popcnt += pg_popcount64(*words++ & maskv); bytes -= 8; } @@ -220,17 +170,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) * actual external functions. The compiler should be able to inline the * portable versions here. */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount32_portable(word); -} - -int -pg_popcount64(uint64 word) -{ - return pg_popcount64_portable(word); -} /* * pg_popcount_optimized diff --git a/src/port/pg_popcount_aarch64.c b/src/port/pg_popcount_aarch64.c index ba57f2cd4bd..74f71593721 100644 --- a/src/port/pg_popcount_aarch64.c +++ b/src/port/pg_popcount_aarch64.c @@ -291,31 +291,6 @@ pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mask) #endif /* ! USE_SVE_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount32 - * Return number of 1 bits in word - */ -int -pg_popcount32(uint32 word) -{ - return pg_popcount64((uint64) word); -} - -/* - * pg_popcount64 - * Return number of 1 bits in word - */ -int -pg_popcount64(uint64 word) -{ - /* - * For some compilers, __builtin_popcountl() already emits Neon - * instructions. The line below should compile to the same code on those - * systems. - */ - return vaddv_u8(vcnt_u8(vld1_u8((const uint8 *) &word))); -} - /* * pg_popcount_neon * Returns number of 1 bits in buf diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 7aebf69898b..6bce089432f 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -36,8 +36,6 @@ * operation, but in practice this is close enough, and "sse42" seems easier to * follow than "popcnt" for these names. */ -static inline int pg_popcount32_sse42(uint32 word); -static inline int pg_popcount64_sse42(uint64 word); static uint64 pg_popcount_sse42(const char *buf, int bytes); static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask); @@ -55,12 +53,8 @@ static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask); * what the current CPU supports) and then will call the pointer to fulfill the * caller's request. */ -static int pg_popcount32_choose(uint32 word); -static int pg_popcount64_choose(uint64 word); static uint64 pg_popcount_choose(const char *buf, int bytes); static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask); -int (*pg_popcount32) (uint32 word) = pg_popcount32_choose; -int (*pg_popcount64) (uint64 word) = pg_popcount64_choose; uint64 (*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose; uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose; @@ -157,7 +151,7 @@ pg_popcount_avx512_available(void) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ /* - * These functions get called on the first call to pg_popcount32 etc. + * These functions get called on the first call to pg_popcount(), etc. * They detect whether we can use the asm implementations, and replace * the function pointers so that subsequent calls are routed directly to * the chosen implementation. @@ -167,15 +161,11 @@ choose_popcount_functions(void) { if (pg_popcount_sse42_available()) { - pg_popcount32 = pg_popcount32_sse42; - pg_popcount64 = pg_popcount64_sse42; pg_popcount_optimized = pg_popcount_sse42; pg_popcount_masked_optimized = pg_popcount_masked_sse42; } else { - pg_popcount32 = pg_popcount32_portable; - pg_popcount64 = pg_popcount64_portable; pg_popcount_optimized = pg_popcount_portable; pg_popcount_masked_optimized = pg_popcount_masked_portable; } @@ -189,20 +179,6 @@ choose_popcount_functions(void) #endif } -static int -pg_popcount32_choose(uint32 word) -{ - choose_popcount_functions(); - return pg_popcount32(word); -} - -static int -pg_popcount64_choose(uint64 word) -{ - choose_popcount_functions(); - return pg_popcount64(word); -} - static uint64 pg_popcount_choose(const char *buf, int bytes) { @@ -338,23 +314,6 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask) #endif /* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */ -/* - * pg_popcount32_sse42 - * Return the number of 1 bits set in word - */ -static inline int -pg_popcount32_sse42(uint32 word) -{ -#ifdef _MSC_VER - return __popcnt(word); -#else - uint32 res; - -__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc"); - return (int) res; -#endif -} - /* * pg_popcount64_sse42 * Return the number of 1 bits set in word -- 2.50.1 (Apple Git-155)