From bcc9f40bc11e22477d1d0597baa9c2de27d8cd29 Mon Sep 17 00:00:00 2001
From: Nathan Bossart
Date: Wed, 14 Jan 2026 11:54:54 -0600
Subject: [PATCH v1 2/3] Move x86 popcount code to pg_popcount_x86_64.c.

This moves the SSE4.2 popcount implementations to the recently
renamed file for x86-64-specific popcount code.
---
 src/include/port/pg_bitutils.h |  20 ++-
 src/port/pg_bitutils.c         | 263 +------------------------------
 src/port/pg_popcount_x86_64.c  | 264 +++++++++++++++++++++++++++++++--
 3 files changed, 267 insertions(+), 280 deletions(-)

diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h
index 8ed12f7a021..c9c508d4ba3 100644
--- a/src/include/port/pg_bitutils.h
+++ b/src/include/port/pg_bitutils.h
@@ -307,23 +307,21 @@ pg_ceil_log2_64(uint64 num)
 #define POPCNT_AARCH64 1
 #endif
 
+extern int	pg_popcount32_slow(uint32 word);
+extern int	pg_popcount64_slow(uint64 word);
+extern uint64 pg_popcount_slow(const char *buf, int bytes);
+extern uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
+
 #ifdef TRY_POPCNT_X86_64
-/* Attempt to use the POPCNT instruction, but perform a runtime check first */
+/*
+ * Attempt to use SSE4.2 or AVX-512 instructions, but perform a runtime check
+ * first.
+ */
 extern PGDLLIMPORT int (*pg_popcount32) (uint32 word);
 extern PGDLLIMPORT int (*pg_popcount64) (uint64 word);
 extern PGDLLIMPORT uint64 (*pg_popcount_optimized) (const char *buf, int bytes);
 extern PGDLLIMPORT uint64 (*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask);
 
-/*
- * We can also try to use the AVX-512 popcount instruction on some systems.
- * The implementation of that is located in its own file.
- */
-#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
-extern bool pg_popcount_avx512_available(void);
-extern uint64 pg_popcount_avx512(const char *buf, int bytes);
-extern uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
-#endif
-
 #elif POPCNT_AARCH64
 /* Use the Neon version of pg_popcount{32,64} without function pointer. */
 extern int	pg_popcount32(uint32 word);
diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c
index 7875bbb0f4b..9f9f90ddd4d 100644
--- a/src/port/pg_bitutils.c
+++ b/src/port/pg_bitutils.c
@@ -12,13 +12,6 @@
  */
 #include "c.h"
 
-#ifdef HAVE__GET_CPUID
-#include <cpuid.h>
-#endif
-#ifdef HAVE__CPUID
-#include <intrin.h>
-#endif
-
 #include "port/pg_bitutils.h"
 
 
@@ -103,257 +96,11 @@ const uint8 pg_number_of_ones[256] = {
 	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
 };
 
-/*
- * If we are building the Neon versions, we don't need the "slow" fallbacks.
- */
-#ifndef POPCNT_AARCH64
-static inline int pg_popcount32_slow(uint32 word);
-static inline int pg_popcount64_slow(uint64 word);
-static uint64 pg_popcount_slow(const char *buf, int bytes);
-static uint64 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask);
-#endif
-
-#ifdef TRY_POPCNT_X86_64
-static bool pg_popcount_available(void);
-static int	pg_popcount32_choose(uint32 word);
-static int	pg_popcount64_choose(uint64 word);
-static uint64 pg_popcount_choose(const char *buf, int bytes);
-static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
-static inline int pg_popcount32_fast(uint32 word);
-static inline int pg_popcount64_fast(uint64 word);
-static uint64 pg_popcount_fast(const char *buf, int bytes);
-static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
-
-int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
-int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
-uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
-uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
-#endif							/* TRY_POPCNT_X86_64 */
-
-#ifdef TRY_POPCNT_X86_64
-
-/*
- * Return true if CPUID indicates that the POPCNT instruction is available.
- */
-static bool
-pg_popcount_available(void)
-{
-	unsigned int exx[4] = {0, 0, 0, 0};
-
-#if defined(HAVE__GET_CPUID)
-	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
-#elif defined(HAVE__CPUID)
-	__cpuid(exx, 1);
-#else
-#error cpuid instruction not available
-#endif
-
-	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
-}
-
-/*
- * These functions get called on the first call to pg_popcount32 etc.
- * They detect whether we can use the asm implementations, and replace
- * the function pointers so that subsequent calls are routed directly to
- * the chosen implementation.
- */
-static inline void
-choose_popcount_functions(void)
-{
-	if (pg_popcount_available())
-	{
-		pg_popcount32 = pg_popcount32_fast;
-		pg_popcount64 = pg_popcount64_fast;
-		pg_popcount_optimized = pg_popcount_fast;
-		pg_popcount_masked_optimized = pg_popcount_masked_fast;
-	}
-	else
-	{
-		pg_popcount32 = pg_popcount32_slow;
-		pg_popcount64 = pg_popcount64_slow;
-		pg_popcount_optimized = pg_popcount_slow;
-		pg_popcount_masked_optimized = pg_popcount_masked_slow;
-	}
-
-#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
-	if (pg_popcount_avx512_available())
-	{
-		pg_popcount_optimized = pg_popcount_avx512;
-		pg_popcount_masked_optimized = pg_popcount_masked_avx512;
-	}
-#endif
-}
-
-static int
-pg_popcount32_choose(uint32 word)
-{
-	choose_popcount_functions();
-	return pg_popcount32(word);
-}
-
-static int
-pg_popcount64_choose(uint64 word)
-{
-	choose_popcount_functions();
-	return pg_popcount64(word);
-}
-
-static uint64
-pg_popcount_choose(const char *buf, int bytes)
-{
-	choose_popcount_functions();
-	return pg_popcount_optimized(buf, bytes);
-}
-
-static uint64
-pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
-{
-	choose_popcount_functions();
-	return pg_popcount_masked(buf, bytes, mask);
-}
-
-/*
- * pg_popcount32_fast
- *		Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount32_fast(uint32 word)
-{
-#ifdef _MSC_VER
-	return __popcnt(word);
-#else
-	uint32		res;
-
-__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
-	return (int) res;
-#endif
-}
-
-/*
- * pg_popcount64_fast
- *		Return the number of 1 bits set in word
- */
-static inline int
-pg_popcount64_fast(uint64 word)
-{
-#ifdef _MSC_VER
-	return __popcnt64(word);
-#else
-	uint64		res;
-
-__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
-	return (int) res;
-#endif
-}
-
-/*
- * pg_popcount_fast
- *		Returns the number of 1-bits in buf
- */
-static uint64
-pg_popcount_fast(const char *buf, int bytes)
-{
-	uint64		popcnt = 0;
-
-#if SIZEOF_VOID_P >= 8
-	/* Process in 64-bit chunks if the buffer is aligned. */
-	if (buf == (const char *) TYPEALIGN(8, buf))
-	{
-		const uint64 *words = (const uint64 *) buf;
-
-		while (bytes >= 8)
-		{
-			popcnt += pg_popcount64_fast(*words++);
-			bytes -= 8;
-		}
-
-		buf = (const char *) words;
-	}
-#else
-	/* Process in 32-bit chunks if the buffer is aligned. */
-	if (buf == (const char *) TYPEALIGN(4, buf))
-	{
-		const uint32 *words = (const uint32 *) buf;
-
-		while (bytes >= 4)
-		{
-			popcnt += pg_popcount32_fast(*words++);
-			bytes -= 4;
-		}
-
-		buf = (const char *) words;
-	}
-#endif
-
-	/* Process any remaining bytes */
-	while (bytes--)
-		popcnt += pg_number_of_ones[(unsigned char) *buf++];
-
-	return popcnt;
-}
-
-/*
- * pg_popcount_masked_fast
- *		Returns the number of 1-bits in buf after applying the mask to each byte
- */
-static uint64
-pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
-{
-	uint64		popcnt = 0;
-
-#if SIZEOF_VOID_P >= 8
-	/* Process in 64-bit chunks if the buffer is aligned */
-	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
-
-	if (buf == (const char *) TYPEALIGN(8, buf))
-	{
-		const uint64 *words = (const uint64 *) buf;
-
-		while (bytes >= 8)
-		{
-			popcnt += pg_popcount64_fast(*words++ & maskv);
-			bytes -= 8;
-		}
-
-		buf = (const char *) words;
-	}
-#else
-	/* Process in 32-bit chunks if the buffer is aligned. */
-	uint32		maskv = ~((uint32) 0) / 0xFF * mask;
-
-	if (buf == (const char *) TYPEALIGN(4, buf))
-	{
-		const uint32 *words = (const uint32 *) buf;
-
-		while (bytes >= 4)
-		{
-			popcnt += pg_popcount32_fast(*words++ & maskv);
-			bytes -= 4;
-		}
-
-		buf = (const char *) words;
-	}
-#endif
-
-	/* Process any remaining bytes */
-	while (bytes--)
-		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
-
-	return popcnt;
-}
-
-#endif							/* TRY_POPCNT_X86_64 */
-
-/*
- * If we are building the Neon versions, we don't need the "slow" fallbacks.
- */
-#ifndef POPCNT_AARCH64
-
 /*
  * pg_popcount32_slow
  *		Return the number of 1 bits set in word
  */
-static inline int
+int
 pg_popcount32_slow(uint32 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -375,7 +122,7 @@ pg_popcount32_slow(uint32 word)
  * pg_popcount64_slow
  *		Return the number of 1 bits set in word
  */
-static inline int
+int
 pg_popcount64_slow(uint64 word)
 {
 #ifdef HAVE__BUILTIN_POPCOUNT
@@ -403,7 +150,7 @@ pg_popcount64_slow(uint64 word)
  * pg_popcount_slow
  *		Returns the number of 1-bits in buf
  */
-static uint64
+uint64
 pg_popcount_slow(const char *buf, int bytes)
 {
 	uint64		popcnt = 0;
@@ -449,7 +196,7 @@ pg_popcount_slow(const char *buf, int bytes)
  * pg_popcount_masked_slow
  *		Returns the number of 1-bits in buf after applying the mask to each byte
  */
-static uint64
+uint64
 pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
 {
 	uint64		popcnt = 0;
@@ -495,8 +242,6 @@ pg_popcount_masked_slow(const char *buf, int bytes, bits8 mask)
 	return popcnt;
 }
 
-#endif							/* ! POPCNT_AARCH64 */
-
 #if !defined(TRY_POPCNT_X86_64) && !defined(POPCNT_AARCH64)
 
 /*
diff --git a/src/port/pg_popcount_x86_64.c b/src/port/pg_popcount_x86_64.c
index 453c7a06ce9..f8643642613 100644
--- a/src/port/pg_popcount_x86_64.c
+++ b/src/port/pg_popcount_x86_64.c
@@ -12,26 +12,74 @@
  */
 #include "c.h"
 
-#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+#include "port/pg_bitutils.h"
+
+#ifdef TRY_POPCNT_X86_64
 
 #if defined(HAVE__GET_CPUID) || defined(HAVE__GET_CPUID_COUNT)
 #include <cpuid.h>
 #endif
 
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 #include <immintrin.h>
+#endif
 
 #if defined(HAVE__CPUID) || defined(HAVE__CPUIDEX)
 #include <intrin.h>
 #endif
 
-#include "port/pg_bitutils.h"
+/*
+ * The SSE4.2 versions are built regardless of whether we are building the
+ * AVX-512 versions.
+ */
+static inline int pg_popcount32_fast(uint32 word);
+static inline int pg_popcount64_fast(uint64 word);
+static uint64 pg_popcount_fast(const char *buf, int bytes);
+static uint64 pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask);
 
 /*
- * It's probably unlikely that TRY_POPCNT_X86_64 won't be set if we are able to
- * use AVX-512 intrinsics, but we check it anyway to be sure. We piggy-back on
- * the function pointers that are only used when TRY_POPCNT_X86_64 is set.
+ * These are the AVX-512 implementations of the popcount functions.
  */
-#ifdef TRY_POPCNT_X86_64
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+static uint64 pg_popcount_avx512(const char *buf, int bytes);
+static uint64 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask);
+#endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * The function pointers are initially set to "choose" functions. These
+ * functions will first set the pointers to the right implementations (based on
+ * what the current CPU supports) and then will call the pointer to fulfill the
+ * caller's request.
+ */
+static int	pg_popcount32_choose(uint32 word);
+static int	pg_popcount64_choose(uint64 word);
+static uint64 pg_popcount_choose(const char *buf, int bytes);
+static uint64 pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask);
+int			(*pg_popcount32) (uint32 word) = pg_popcount32_choose;
+int			(*pg_popcount64) (uint64 word) = pg_popcount64_choose;
+uint64		(*pg_popcount_optimized) (const char *buf, int bytes) = pg_popcount_choose;
+uint64		(*pg_popcount_masked_optimized) (const char *buf, int bytes, bits8 mask) = pg_popcount_masked_choose;
+
+/*
+ * Return true if CPUID indicates that the POPCNT instruction is available.
+ */
+static bool
+pg_popcount_available(void)
+{
+	unsigned int exx[4] = {0, 0, 0, 0};
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &exx[0], &exx[1], &exx[2], &exx[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(exx, 1);
+#else
+#error cpuid instruction not available
+#endif
+
+	return (exx[2] & (1 << 23)) != 0;	/* POPCNT */
+}
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
 
 /*
  * Does CPUID say there's support for XSAVE instructions?
@@ -94,7 +142,7 @@ avx512_popcnt_available(void)
  * Returns true if the CPU supports the instructions required for the AVX-512
  * pg_popcount() implementation.
  */
-bool
+static bool
 pg_popcount_avx512_available(void)
 {
 	return xsave_available() &&
@@ -102,12 +150,77 @@ pg_popcount_avx512_available(void)
 		avx512_popcnt_available();
 }
 
+#endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * These functions get called on the first call to pg_popcount32 etc.
+ * They detect whether we can use the asm implementations, and replace
+ * the function pointers so that subsequent calls are routed directly to
+ * the chosen implementation.
+ */
+static inline void
+choose_popcount_functions(void)
+{
+	if (pg_popcount_available())
+	{
+		pg_popcount32 = pg_popcount32_fast;
+		pg_popcount64 = pg_popcount64_fast;
+		pg_popcount_optimized = pg_popcount_fast;
+		pg_popcount_masked_optimized = pg_popcount_masked_fast;
+	}
+	else
+	{
+		pg_popcount32 = pg_popcount32_slow;
+		pg_popcount64 = pg_popcount64_slow;
+		pg_popcount_optimized = pg_popcount_slow;
+		pg_popcount_masked_optimized = pg_popcount_masked_slow;
+	}
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+	if (pg_popcount_avx512_available())
+	{
+		pg_popcount_optimized = pg_popcount_avx512;
+		pg_popcount_masked_optimized = pg_popcount_masked_avx512;
+	}
+#endif
+}
+
+static int
+pg_popcount32_choose(uint32 word)
+{
+	choose_popcount_functions();
+	return pg_popcount32(word);
+}
+
+static int
+pg_popcount64_choose(uint64 word)
+{
+	choose_popcount_functions();
+	return pg_popcount64(word);
+}
+
+static uint64
+pg_popcount_choose(const char *buf, int bytes)
+{
+	choose_popcount_functions();
+	return pg_popcount_optimized(buf, bytes);
+}
+
+static uint64
+pg_popcount_masked_choose(const char *buf, int bytes, bits8 mask)
+{
+	choose_popcount_functions();
+	return pg_popcount_masked(buf, bytes, mask);
+}
+
+#ifdef USE_AVX512_POPCNT_WITH_RUNTIME_CHECK
+
 /*
  * pg_popcount_avx512
  *		Returns the number of 1-bits in buf
  */
 pg_attribute_target("avx512vpopcntdq,avx512bw")
-uint64
+static uint64
 pg_popcount_avx512(const char *buf, int bytes)
 {
 	__m512i		val,
@@ -163,7 +276,7 @@ pg_popcount_avx512(const char *buf, int bytes)
  *		Returns the number of 1-bits in buf after applying the mask to each byte
  */
 pg_attribute_target("avx512vpopcntdq,avx512bw")
-uint64
+static uint64
 pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
 {
 	__m512i		val,
@@ -219,5 +332,136 @@ pg_popcount_masked_avx512(const char *buf, int bytes, bits8 mask)
 	return _mm512_reduce_add_epi64(accum);
 }
 
-#endif							/* TRY_POPCNT_X86_64 */
 #endif							/* USE_AVX512_POPCNT_WITH_RUNTIME_CHECK */
+
+/*
+ * pg_popcount32_fast
+ *		Return the number of 1 bits set in word
+ */
+static inline int
+pg_popcount32_fast(uint32 word)
+{
+#ifdef _MSC_VER
+	return __popcnt(word);
+#else
+	uint32		res;
+
+__asm__ __volatile__(" popcntl %1,%0\n":"=q"(res):"rm"(word):"cc");
+	return (int) res;
+#endif
+}
+
+/*
+ * pg_popcount64_fast
+ *		Return the number of 1 bits set in word
+ */
+static inline int
+pg_popcount64_fast(uint64 word)
+{
+#ifdef _MSC_VER
+	return __popcnt64(word);
+#else
+	uint64		res;
+
+__asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc");
+	return (int) res;
+#endif
+}
+
+/*
+ * pg_popcount_fast
+ *		Returns the number of 1-bits in buf
+ */
+static uint64
+pg_popcount_fast(const char *buf, int bytes)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned. */
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_fast(*words++);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_fast(*words++);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++];
+
+	return popcnt;
+}
+
+/*
+ * pg_popcount_masked_fast
+ *		Returns the number of 1-bits in buf after applying the mask to each byte
+ */
+static uint64
+pg_popcount_masked_fast(const char *buf, int bytes, bits8 mask)
+{
+	uint64		popcnt = 0;
+
+#if SIZEOF_VOID_P >= 8
+	/* Process in 64-bit chunks if the buffer is aligned */
+	uint64		maskv = ~UINT64CONST(0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(8, buf))
+	{
+		const uint64 *words = (const uint64 *) buf;
+
+		while (bytes >= 8)
+		{
+			popcnt += pg_popcount64_fast(*words++ & maskv);
+			bytes -= 8;
+		}
+
+		buf = (const char *) words;
+	}
+#else
+	/* Process in 32-bit chunks if the buffer is aligned. */
+	uint32		maskv = ~((uint32) 0) / 0xFF * mask;
+
+	if (buf == (const char *) TYPEALIGN(4, buf))
+	{
+		const uint32 *words = (const uint32 *) buf;
+
+		while (bytes >= 4)
+		{
+			popcnt += pg_popcount32_fast(*words++ & maskv);
+			bytes -= 4;
+		}
+
+		buf = (const char *) words;
+	}
+#endif
+
+	/* Process any remaining bytes */
+	while (bytes--)
+		popcnt += pg_number_of_ones[(unsigned char) *buf++ & mask];
+
+	return popcnt;
+}
+
+#endif							/* TRY_POPCNT_X86_64 */
-- 
2.50.1 (Apple Git-155)
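
For readers who have not looked at pg_bitutils.c before: the dispatch scheme this patch consolidates into pg_popcount_x86_64.c is an ordinary lazily-initialized function pointer. A minimal, self-contained sketch of the same "choose" pattern is shown below; the names (my_popcount, FORCE_SLOW_POPCOUNT, popcount_fast, popcount_slow) are illustrative only and are not part of the patch, and an environment variable stands in for the CPUID probe.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* portable bit-twiddling fallback, analogous to the _slow functions */
static int
popcount_slow(uint64_t w)
{
	int			n = 0;

	for (; w != 0; w >>= 1)
		n += (int) (w & 1);
	return n;
}

/* "fast" path; the compiler builtin stands in for the POPCNT instruction */
static int
popcount_fast(uint64_t w)
{
#ifdef __GNUC__
	return __builtin_popcountll(w);
#else
	return popcount_slow(w);
#endif
}

static int	popcount_choose(uint64_t w);

/* callers always go through this pointer; the first call hits the chooser */
static int	(*my_popcount) (uint64_t w) = popcount_choose;

static int
popcount_choose(uint64_t w)
{
	/* stand-in for the CPUID probe: pick an implementation exactly once */
	if (getenv("FORCE_SLOW_POPCOUNT") == NULL)
		my_popcount = popcount_fast;
	else
		my_popcount = popcount_slow;

	/* forward the original request through the rewritten pointer */
	return my_popcount(w);
}

int
main(void)
{
	/* 0xF0F0F0F0F0F0F0F0 has 32 bits set */
	printf("%d\n", my_popcount(UINT64_C(0xF0F0F0F0F0F0F0F0)));
	return 0;
}

After the first call the pointer aims directly at the selected implementation, so later calls pay no dispatch cost beyond the indirect call; this is why pg_popcount32() and friends are exposed as function pointers on x86-64 in pg_bitutils.h.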