diff --git a/Include/internal/pycore_byteswap.h b/Include/internal/pycore_bitutils.h similarity index 59% rename from Include/internal/pycore_byteswap.h rename to Include/internal/pycore_bitutils.h index 5e64704a004..36ffe23b9ff 100644 --- a/Include/internal/pycore_byteswap.h +++ b/Include/internal/pycore_bitutils.h @@ -1,4 +1,6 @@ -/* Bytes swap functions, reverse order of bytes: +/* Bit and bytes utilities. + + Bytes swap functions, reverse order of bytes: - _Py_bswap16(uint16_t) - _Py_bswap32(uint32_t) @@ -82,6 +84,53 @@ _Py_bswap64(uint64_t word) } +// Population count: count the number of 1's in 'x' +// (number of bits set to 1), also known as the hamming weight. +// +// Implementation note. CPUID is not used, to test if x86 POPCNT instruction +// can be used, to keep the implementation simple. For example, Visual Studio +// __popcnt() is not used this reason. The clang and GCC builtin function can +// use the x86 POPCNT instruction if the target architecture has SSE4a or +// newer. +static inline int +_Py_popcount32(uint32_t x) +{ +#if (defined(__clang__) || defined(__GNUC__)) + +#if SIZEOF_INT >= 4 + Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int)); + return __builtin_popcount(x); +#else + // The C standard guarantees that unsigned long will always be big enough + // to hold a uint32_t value without losing information. + Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long)); + return __builtin_popcountl(x); +#endif + +#else + // 32-bit SWAR (SIMD Within A Register) popcount + + // Binary: 0 1 0 1 ... + const uint32_t M1 = 0x55555555; + // Binary: 00 11 00 11. .. + const uint32_t M2 = 0x33333333; + // Binary: 0000 1111 0000 1111 ... + const uint32_t M4 = 0x0F0F0F0F; + // 256**4 + 256**3 + 256**2 + 256**1 + const uint32_t SUM = 0x01010101; + + // Put count of each 2 bits into those 2 bits + x = x - ((x >> 1) & M1); + // Put count of each 4 bits into those 4 bits + x = (x & M2) + ((x >> 2) & M2); + // Put count of each 8 bits into those 8 bits + x = (x + (x >> 4)) & M4; + // Sum of the 4 byte counts + return (uint32_t)((uint64_t)x * (uint64_t)SUM) >> 24; +#endif +} + + #ifdef __cplusplus } #endif diff --git a/Makefile.pre.in b/Makefile.pre.in index 5a18704e441..b115e7fc01f 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -1121,7 +1121,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_abstract.h \ $(srcdir)/Include/internal/pycore_accu.h \ $(srcdir)/Include/internal/pycore_atomic.h \ - $(srcdir)/Include/internal/pycore_byteswap.h \ + $(srcdir)/Include/internal/pycore_bitutils.h \ $(srcdir)/Include/internal/pycore_bytes_methods.h \ $(srcdir)/Include/internal/pycore_call.h \ $(srcdir)/Include/internal/pycore_ceval.h \ diff --git a/Modules/_ctypes/cfield.c b/Modules/_ctypes/cfield.c index 32a2beeb744..3a9b7119201 100644 --- a/Modules/_ctypes/cfield.c +++ b/Modules/_ctypes/cfield.c @@ -1,5 +1,5 @@ #include "Python.h" -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() #include #ifdef MS_WIN32 diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 5f217dcb897..6d5af5917f1 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -12,7 +12,7 @@ #define PY_SSIZE_T_CLEAN #include "Python.h" -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() #include "pycore_initconfig.h" // _Py_GetConfigsAsDict() #include "pycore_hashtable.h" // _Py_hashtable_new() #include "pycore_gc.h" // PyGC_Head @@ -63,6 +63,45 @@ test_bswap(PyObject *self, PyObject *Py_UNUSED(args)) } +static int +check_popcount(uint32_t x, int expected) +{ + // Use volatile to prevent the compiler to optimize out the whole test + volatile uint32_t u = x; + int bits = _Py_popcount32(u); + if (bits != expected) { + PyErr_Format(PyExc_AssertionError, + "_Py_popcount32(%lu) returns %i, expected %i", + (unsigned long)x, bits, expected); + return -1; + } + return 0; +} + + +static PyObject* +test_popcount(PyObject *self, PyObject *Py_UNUSED(args)) +{ +#define CHECK(X, RESULT) \ + do { \ + if (check_popcount(X, RESULT) < 0) { \ + return NULL; \ + } \ + } while (0) + + CHECK(0, 0); + CHECK(1, 1); + CHECK(0x08080808, 4); + CHECK(0x10101010, 4); + CHECK(0x10204080, 4); + CHECK(0xDEADCAFE, 22); + CHECK(0xFFFFFFFF, 32); + Py_RETURN_NONE; + +#undef CHECK +} + + #define TO_PTR(ch) ((void*)(uintptr_t)ch) #define FROM_PTR(ptr) ((uintptr_t)ptr) #define VALUE(key) (1 + ((int)(key) - 'a')) @@ -157,6 +196,7 @@ static PyMethodDef TestMethods[] = { {"get_configs", get_configs, METH_NOARGS}, {"get_recursion_depth", get_recursion_depth, METH_NOARGS}, {"test_bswap", test_bswap, METH_NOARGS}, + {"test_popcount", test_popcount, METH_NOARGS}, {"test_hashtable", test_hashtable, METH_NOARGS}, {NULL, NULL} /* sentinel */ }; diff --git a/Modules/sha256module.c b/Modules/sha256module.c index 8edb1d53828..261f9daee28 100644 --- a/Modules/sha256module.c +++ b/Modules/sha256module.c @@ -17,7 +17,7 @@ /* SHA objects */ #include "Python.h" -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() #include "structmember.h" // PyMemberDef #include "hashlib.h" #include "pystrhex.h" diff --git a/Modules/sha512module.c b/Modules/sha512module.c index 561ef8ef0e8..aa2aeedcc6c 100644 --- a/Modules/sha512module.c +++ b/Modules/sha512module.c @@ -17,7 +17,7 @@ /* SHA objects */ #include "Python.h" -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() #include "structmember.h" // PyMemberDef #include "hashlib.h" #include "pystrhex.h" diff --git a/Objects/longobject.c b/Objects/longobject.c index 0b209a403c4..ce10c4f6658 100644 --- a/Objects/longobject.c +++ b/Objects/longobject.c @@ -3,8 +3,9 @@ /* XXX The functional organization of this file is terrible */ #include "Python.h" -#include "pycore_interp.h" // _PY_NSMALLPOSINTS -#include "pycore_pystate.h" // _Py_IsMainInterpreter() +#include "pycore_bitutils.h" // _Py_popcount32() +#include "pycore_interp.h" // _PY_NSMALLPOSINTS +#include "pycore_pystate.h" // _Py_IsMainInterpreter() #include "longintrepr.h" #include @@ -5307,12 +5308,10 @@ int_bit_length_impl(PyObject *self) static int popcount_digit(digit d) { - /* 32bit SWAR popcount. */ - uint32_t u = d; - u -= (u >> 1) & 0x55555555U; - u = (u & 0x33333333U) + ((u >> 2) & 0x33333333U); - u = (u + (u >> 4)) & 0x0f0f0f0fU; - return (uint32_t)(u * 0x01010101U) >> 24; + // digit can be larger than uint32_t, but only PyLong_SHIFT bits + // of it will be ever used. + Py_BUILD_ASSERT(PyLong_SHIFT <= 32); + return _Py_popcount32((uint32_t)d); } /*[clinic input] diff --git a/Objects/stringlib/codecs.h b/Objects/stringlib/codecs.h index 9b2a29ba3b8..197605b012e 100644 --- a/Objects/stringlib/codecs.h +++ b/Objects/stringlib/codecs.h @@ -4,7 +4,7 @@ # error "codecs.h is specific to Unicode" #endif -#include "pycore_byteswap.h" // _Py_bswap32() +#include "pycore_bitutils.h" // _Py_bswap32() /* Mask to quickly check whether a C 'long' contains a non-ASCII, UTF8-encoded char. */ diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index b6b0cf3e991..8d5f99f8336 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -170,7 +170,7 @@ - + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 10dfffba611..7bc9f8f1664 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -201,7 +201,7 @@ Include - + Include diff --git a/Python/hamt.c b/Python/hamt.c index 8801c5ea418..e272e8808fd 100644 --- a/Python/hamt.c +++ b/Python/hamt.c @@ -1,5 +1,6 @@ #include "Python.h" +#include "pycore_bitutils.h" // _Py_popcount32 #include "pycore_hamt.h" #include "pycore_object.h" // _PyObject_GC_TRACK() #include // offsetof() @@ -433,30 +434,10 @@ hamt_bitpos(int32_t hash, uint32_t shift) return (uint32_t)1 << hamt_mask(hash, shift); } -static inline uint32_t -hamt_bitcount(uint32_t i) -{ - /* We could use native popcount instruction but that would - require to either add configure flags to enable SSE4.2 - support or to detect it dynamically. Otherwise, we have - a risk of CPython not working properly on older hardware. - - In practice, there's no observable difference in - performance between using a popcount instruction or the - following fallback code. - - The algorithm is copied from: - https://graphics.stanford.edu/~seander/bithacks.html - */ - i = i - ((i >> 1) & 0x55555555); - i = (i & 0x33333333) + ((i >> 2) & 0x33333333); - return (((i + (i >> 4)) & 0xF0F0F0F) * 0x1010101) >> 24; -} - static inline uint32_t hamt_bitindex(uint32_t bitmap, uint32_t bit) { - return hamt_bitcount(bitmap & (bit - 1)); + return (uint32_t)_Py_popcount32(bitmap & (bit - 1)); } @@ -820,7 +801,7 @@ hamt_node_bitmap_assoc(PyHamtNode_Bitmap *self, else { /* There was no key before with the same (shift,hash). */ - uint32_t n = hamt_bitcount(self->b_bitmap); + uint32_t n = (uint32_t)_Py_popcount32(self->b_bitmap); if (n >= 16) { /* When we have a situation where we want to store more