diff --git a/Include/internal/pycore_cpuinfo.h b/Include/internal/pycore_cpuinfo.h index 27b4bc0fad8638d..c6ac446c2fc1358 100644 --- a/Include/internal/pycore_cpuinfo.h +++ b/Include/internal/pycore_cpuinfo.h @@ -9,15 +9,25 @@ extern "C" { # error "this header requires Py_BUILD_CORE define" #endif -#include - typedef struct { - bool sse, sse2, sse3, sse41, sse42, avx, avx2, avx512vbmi; - bool done; -} _py_cpu_simd_flags; + /* Streaming SIMD Extensions */ + uint8_t sse: 1; + uint8_t sse2: 1; + uint8_t sse3: 1; + uint8_t sse41: 1; // SSE4.1 + uint8_t sse42: 1; // SSE4.2 + + /* Advanced Vector Extensions */ + uint8_t avx: 1; + uint8_t avx2: 1; + uint8_t avx512vbmi: 1; // AVX-512 Vector Byte Manipulation Instructions + + uint8_t done; // indicate whether the structure was filled or not +} py_cpu_simd_flags; +/* Detect the available SIMD features on this machine. */ extern void -_Py_detect_cpu_simd_features(_py_cpu_simd_flags *flags); +_Py_detect_cpu_simd_features(py_cpu_simd_flags *flags); #ifdef __cplusplus } diff --git a/Python/cpuinfo.c b/Python/cpuinfo.c index 28ad48ab52bd73c..8b7b24c6fd75607 100644 --- a/Python/cpuinfo.c +++ b/Python/cpuinfo.c @@ -1,18 +1,25 @@ /* - * Naive CPU SIMD features detection. + * Python CPU SIMD features detection. * - * See Modules/black2module.c. + * See https://en.wikipedia.org/wiki/CPUID for details. */ #include "Python.h" #include "pycore_cpuinfo.h" -#include +#define CPUID_REG(ARG) ARG +/* + * For simplicity, we only enable SIMD instructions for Intel CPUs, + * even though we could support ARM NEON and POWER. + */ #if defined(__x86_64__) && defined(__GNUC__) # include #elif defined(_M_X64) # include +#else +# undef CPUID_REG +# define CPUID_REG(ARG) Py_UNUSED(ARG) #endif // AVX2 cannot be compiled on macOS ARM64 (yet it can be compiled on x86_64). @@ -24,6 +31,15 @@ # undef CAN_COMPILE_SIMD_AVX512_VBMI_INSTRUCTIONS #endif +/* + * The macros below describe masks to apply on CPUID output registers. + * + * Each macro is of the form [REGISTER][PAGE]_[FEATURE] where + * + * - REGISTER is either EBX, ECX or EDX, + * - PAGE is either 1 or 7 depending, and + * - FEATURE is an SIMD instruction set. + */ #define EDX1_SSE (1 << 25) // sse, EDX, page 1, bit 25 #define EDX1_SSE2 (1 << 26) // sse2, EDX, page 1, bit 26 #define ECX1_SSE3 (1 << 9) // sse3, ECX, page 1, bit 0 @@ -33,78 +49,106 @@ #define EBX7_AVX2 (1 << 5) // avx2, EBX, page 7, bit 5 #define ECX7_AVX512_VBMI (1 << 1) // avx512-vbmi, ECX, page 7, bit 1 -void -_Py_detect_cpu_simd_features(_py_cpu_simd_flags *flags) -{ - if (flags->done) { - return; - } +#define CHECK_CPUID_REGISTER(REGISTER, MASK) ((REGISTER) & (MASK)) == 0 ? 0 : 1 - int eax1 = 0, ebx1 = 0, ecx1 = 0, edx1 = 0; - int eax7 = 0, ebx7 = 0, ecx7 = 0, edx7 = 0; +/* + * Indicate whether the CPUID input EAX=1 may be needed to + * detect SIMD basic features (e.g., SSE). + */ +#if defined(CAN_COMPILE_SIMD_SSE_INSTRUCTIONS) \ + || defined(CAN_COMPILE_SIMD_SSE2_INSTRUCTIONS) \ + || defined(CAN_COMPILE_SIMD_SSE3_INSTRUCTIONS) \ + || defined(CAN_COMPILE_SIMD_SSE4_1_INSTRUCTIONS) \ + || defined(CAN_COMPILE_SIMD_SSE4_2_INSTRUCTIONS) \ + || defined(CAN_COMPILE_SIMD_AVX_INSTRUCTIONS) +# define MAY_DETECT_CPUID_SIMD_FEATURES +#endif + +/* + * Indicate whether the CPUID input EAX=7 may be needed to + * detect SIMD extended features (e.g., AVX2 or AVX-512). + */ +#if defined(CAN_COMPILE_SIMD_AVX2_INSTRUCTIONS) \ + || defined(CAN_COMPILE_SIMD_AVX512_VBMI_INSTRUCTIONS) +# define MAY_DETECT_CPUID_SIMD_EXTENDED_FEATURES +#endif + +static inline void +get_cpuid_info(int32_t level /* input eax */, + int32_t count /* input ecx */, + int32_t *CPUID_REG(eax), + int32_t *CPUID_REG(ebx), + int32_t *CPUID_REG(ecx), + int32_t *CPUID_REG(edx)) +{ #if defined(__x86_64__) && defined(__GNUC__) - __cpuid_count(1, 0, eax1, ebx1, ecx1, edx1); - __cpuid_count(7, 0, eax7, ebx7, ecx7, edx7); + __cpuid_count(level, count, *eax, *ebx, *ecx, *edx); #elif defined(_M_X64) - int info1[4] = {0}; - __cpuidex(info1, 1, 0); - eax1 = info1[0]; - ebx1 = info1[1]; - ecx1 = info1[2]; - edx1 = info1[3]; - - int info7[4] = {0}; - __cpuidex(info7, 7, 0); - eax7 = info7[0]; - ebx7 = info7[1]; - ecx7 = info7[2]; - edx7 = info7[3]; -#else - // use (void) expressions to avoid warnings - (void) eax1; (void) ebx1; (void) ecx1; (void) edx1; - (void) eax7; (void) ebx7; (void) ecx7; (void) edx7; + int32_t info[4] = {0}; + __cpuidex(info, page, count); + *eax = info[0]; + *ebx = info[1]; + *ecx = info[2]; + *edx = info[3]; #endif +} +/* Processor Info and Feature Bits (EAX=1, ECX=0). */ +static inline void +detect_cpu_simd_features(py_cpu_simd_flags *flags) +{ + int32_t eax = 0, ebx = 0, ecx = 0, edx = 0; + get_cpuid_info(1, 0, &eax, &ebx, &ecx, &edx); #ifdef CAN_COMPILE_SIMD_SSE_INSTRUCTIONS - flags->sse = (edx1 & EDX1_SSE) != 0; -#else - flags->sse = false; + flags->sse = CHECK_CPUID_REGISTER(edx, EDX1_SSE); #endif #ifdef CAN_COMPILE_SIMD_SSE2_INSTRUCTIONS - flags->sse2 = (edx1 & EDX1_SSE2) != 0; -#else - flags->sse2 = false; + flags->sse2 = CHECK_CPUID_REGISTER(edx, EDX1_SSE2); #endif #ifdef CAN_COMPILE_SIMD_SSE3_INSTRUCTIONS - flags->sse3 = (ecx1 & ECX1_SSE3) != 0; - #else + flags->sse3 = CHECK_CPUID_REGISTER(ecx, ECX1_SSE3); #endif - flags->sse3 = false; #ifdef CAN_COMPILE_SIMD_SSE4_1_INSTRUCTIONS - flags->sse41 = (ecx1 & ECX1_SSE4_1) != 0; -#else - flags->sse41 = false; + flags->sse41 = CHECK_CPUID_REGISTER(ecx, ECX1_SSE4_1); #endif #ifdef CAN_COMPILE_SIMD_SSE4_2_INSTRUCTIONS - flags->sse42 = (ecx1 & ECX1_SSE4_2) != 0; -#else - flags->sse42 = false; + flags->sse42 = CHECK_CPUID_REGISTER(ecx, ECX1_SSE4_2); #endif #ifdef CAN_COMPILE_SIMD_AVX_INSTRUCTIONS - flags->avx = (ecx1 & ECX1_AVX) != 0; -#else - flags->avx = false; + flags->avx = CHECK_CPUID_REGISTER(ecx, ECX1_AVX); #endif +} + +/* Extended feature bits (EAX=7, ECX=0). */ +static inline void +detect_cpu_simd_extended_features(py_cpu_simd_flags *flags) +{ + int32_t eax = 0, ebx = 0, ecx = 0, edx = 0; + get_cpuid_info(7, 0, &eax, &ebx, &ecx, &edx); #ifdef CAN_COMPILE_SIMD_AVX2_INSTRUCTIONS - flags->avx2 = (ebx7 & EBX7_AVX2) != 0; -#else - flags->avx2 = false; + flags->avx2 = CHECK_CPUID_REGISTER(ebx, EBX7_AVX2); #endif #ifdef CAN_COMPILE_SIMD_AVX512_VBMI_INSTRUCTIONS - flags->avx512vbmi = (ecx7 & ECX7_AVX512_VBMI) != 0; -#else - flags->avx512vbmi = false; + flags->avx512vbmi = CHECK_CPUID_REGISTER(ecx, ECX7_AVX512_VBMI); #endif +} - flags->done = true; +void +_Py_detect_cpu_simd_features(py_cpu_simd_flags *flags) +{ + if (flags->done) { + return; + } +#ifdef MAY_DETECT_CPUID_SIMD_FEATURES + detect_cpu_simd_features(flags); +#else + flags->sse = flags->sse2 = flags->sse3 = flags->sse41 = flags->sse42 = 0; + flags->avx = 0; +#endif +#ifdef MAY_DETECT_CPUID_SIMD_EXTENDED_FEATURES + detect_cpu_simd_extended_features(flags); +#else + flags->avx2 = flags->avx512vbmi = 0; +#endif + flags->done = 1; }