From 19404ae038c6fa678c41a2b4db62c9b885896c18 Mon Sep 17 00:00:00 2001 From: Nathan Bossart Date: Thu, 22 Jan 2026 11:33:56 -0600 Subject: [PATCH v8 1/3] Remove some unnecessary optimizations in popcount code. Over the past few releases, we've added a huge amount of complexity to our popcount implementations. Commits fbe327e5b4, 79e232ca01, 8c6653516c, and 25dc485074 did some preliminary refactoring, but many opportunities remain. In particular, if we disclaim interest in micro-optimizing this code for 32-bit builds and in unproven alignment checks, we can remove a decent chunk of code. This commit does the following: * Removes the code in pg_popcount() and pg_popcount_masked() that sets the function pointer threshold based on SIZEOF_VOID_P. Consequently, 32-bit builds should follow the inline path for inputs of 4-7 bytes instead of calling pg_popcount_optimized() (which is probably just calling pg_popcount_portable(), anyway). While it is possible that this results in a small regression for those inputs on 32-bit builds, it seems unlikely to produce noticeable performance differences on those machines. Furthermore, I found no evidence of benchmarks for this area of code for 32-bit builds. * Removes the 32-bit optimizations in pg_popcount_portable() and pg_popcount_masked_portable(). This means that 32-bit builds instead use a simple while loop. As above, we are not too concerned about regressions on 32-bit machines. * Removes 32-bit optimizations in pg_popcount_x86.c. This is dead code because everything in this file is only compiled when HAVE_X86_64_POPCNTQ is defined, and that macro is only defined for x86-64. * Removes alignment checks in pg_popcount_sse42() and pg_popcount_masked_sse42(). These are unnecessary for x86, and it's unclear whether they make any meaningful performance difference. Since we allow misaligned accesses now, this commit also adds pg_attribute_no_sanitize_alignment() to these functions. 
Suggested-by: John Naylor Reviewed-by: John Naylor Discussion: https://postgr.es/m/CANWCAZY7R%2Biy%2Br9YM_sySNydHzNqUirx1xk0tB3ej5HO62GdgQ%40mail.gmail.com --- src/include/port/pg_bitutils.h | 24 +----------- src/port/pg_bitutils.c | 30 --------------- src/port/pg_popcount_x86.c | 67 ++++++---------------------------- 3 files changed, 14 insertions(+), 107 deletions(-) diff --git a/src/include/port/pg_bitutils.h b/src/include/port/pg_bitutils.h index 35761f509ec..c3049d71894 100644 --- a/src/include/port/pg_bitutils.h +++ b/src/include/port/pg_bitutils.h @@ -329,17 +329,7 @@ extern uint64 pg_popcount_masked_optimized(const char *buf, int bytes, bits8 mas static inline uint64 pg_popcount(const char *buf, int bytes) { - /* - * We set the threshold to the point at which we'll first use special - * instructions in the optimized version. - */ -#if SIZEOF_VOID_P >= 8 - int threshold = 8; -#else - int threshold = 4; -#endif - - if (bytes < threshold) + if (bytes < 8) { uint64 popcnt = 0; @@ -360,17 +350,7 @@ pg_popcount(const char *buf, int bytes) static inline uint64 pg_popcount_masked(const char *buf, int bytes, bits8 mask) { - /* - * We set the threshold to the point at which we'll first use special - * instructions in the optimized version. - */ -#if SIZEOF_VOID_P >= 8 - int threshold = 8; -#else - int threshold = 4; -#endif - - if (bytes < threshold) + if (bytes < 8) { uint64 popcnt = 0; diff --git a/src/port/pg_bitutils.c b/src/port/pg_bitutils.c index ffda75825e5..bec06c06fc3 100644 --- a/src/port/pg_bitutils.c +++ b/src/port/pg_bitutils.c @@ -167,20 +167,6 @@ pg_popcount_portable(const char *buf, int bytes) bytes -= 8; } - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. 
*/ - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_portable(*words++); - bytes -= 4; - } - buf = (const char *) words; } #endif @@ -215,22 +201,6 @@ pg_popcount_masked_portable(const char *buf, int bytes, bits8 mask) bytes -= 8; } - buf = (const char *) words; - } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - uint32 maskv = ~((uint32) 0) / 0xFF * mask; - - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_portable(*words++ & maskv); - bytes -= 4; - } - buf = (const char *) words; } #endif diff --git a/src/port/pg_popcount_x86.c b/src/port/pg_popcount_x86.c index 245f0167d00..7aebf69898b 100644 --- a/src/port/pg_popcount_x86.c +++ b/src/port/pg_popcount_x86.c @@ -376,40 +376,20 @@ __asm__ __volatile__(" popcntq %1,%0\n":"=q"(res):"rm"(word):"cc"); * pg_popcount_sse42 * Returns the number of 1-bits in buf */ +pg_attribute_no_sanitize_alignment() static uint64 pg_popcount_sse42(const char *buf, int bytes) { uint64 popcnt = 0; + const uint64 *words = (const uint64 *) buf; -#if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned. */ - if (buf == (const char *) TYPEALIGN(8, buf)) + while (bytes >= 8) { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_sse42(*words++); - bytes -= 8; - } - - buf = (const char *) words; + popcnt += pg_popcount64_sse42(*words++); + bytes -= 8; } -#else - /* Process in 32-bit chunks if the buffer is aligned. 
*/ - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - while (bytes >= 4) - { - popcnt += pg_popcount32_sse42(*words++); - bytes -= 4; - } - - buf = (const char *) words; - } -#endif + buf = (const char *) words; /* Process any remaining bytes */ while (bytes--) @@ -422,44 +402,21 @@ pg_popcount_sse42(const char *buf, int bytes) * pg_popcount_masked_sse42 * Returns the number of 1-bits in buf after applying the mask to each byte */ +pg_attribute_no_sanitize_alignment() static uint64 pg_popcount_masked_sse42(const char *buf, int bytes, bits8 mask) { uint64 popcnt = 0; - -#if SIZEOF_VOID_P >= 8 - /* Process in 64-bit chunks if the buffer is aligned */ uint64 maskv = ~UINT64CONST(0) / 0xFF * mask; + const uint64 *words = (const uint64 *) buf; - if (buf == (const char *) TYPEALIGN(8, buf)) + while (bytes >= 8) { - const uint64 *words = (const uint64 *) buf; - - while (bytes >= 8) - { - popcnt += pg_popcount64_sse42(*words++ & maskv); - bytes -= 8; - } - - buf = (const char *) words; + popcnt += pg_popcount64_sse42(*words++ & maskv); + bytes -= 8; } -#else - /* Process in 32-bit chunks if the buffer is aligned. */ - uint32 maskv = ~((uint32) 0) / 0xFF * mask; - - if (buf == (const char *) TYPEALIGN(4, buf)) - { - const uint32 *words = (const uint32 *) buf; - - while (bytes >= 4) - { - popcnt += pg_popcount32_sse42(*words++ & maskv); - bytes -= 4; - } - buf = (const char *) words; - } -#endif + buf = (const char *) words; /* Process any remaining bytes */ while (bytes--) -- 2.50.1 (Apple Git-155)