-
-
Notifications
You must be signed in to change notification settings - Fork 34.5k
Expand file tree
/
Copy pathcpuinfo.c
More file actions
597 lines (544 loc) · 22.4 KB
/
cpuinfo.c
File metadata and controls
597 lines (544 loc) · 22.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
/*
* Python CPU SIMD features detection.
*
* See https://en.wikipedia.org/wiki/CPUID for details.
*/
#include "Python.h"
#include "pycore_cpuinfo.h"
#include <stdint.h> // UINT32_C()
/* CPUID input and output registers are 32-bit unsigned integers */
#define CPUID_REG uint32_t
/*
 * Check one or more CPUID register bits.
 *
 * Evaluate to 1 if *all* the bits of MASK are set in REG, and to 0 otherwise.
 * Both arguments are fully parenthesized but are evaluated more than once,
 * so they must not have side effects.
 */
#define CPUID_CHECK_REG(REG, MASK)  ((((REG) & (MASK)) == (MASK)) ? 1 : 0)
// For now, we only try to enable SIMD instructions for x86-64 Intel CPUs.
// In the future, we should carefully enable support for ARM NEON and POWER
// as well as AMD.
#if defined(__x86_64__) && defined(__GNUC__)
# include <cpuid.h> // __cpuid_count()
# define HAS_CPUID_SUPPORT
# define HAS_XGETBV_SUPPORT
#elif defined(_M_X64)
# include <immintrin.h> // _xgetbv()
# define HAS_XGETBV_SUPPORT
# include <intrin.h> // __cpuidex()
# define HAS_CPUID_SUPPORT
#else
# undef HAS_CPUID_SUPPORT
# undef HAS_XGETBV_SUPPORT
#endif
// Below, we declare macros for guarding the detection of SSE, AVX/AVX2
// and AVX-512 instructions. If the compiler does not even recognize the
// corresponding flags or if we are not on a 64-bit platform, we do not
// even try to inspect the output of CPUID for those specific features.
#ifdef HAS_CPUID_SUPPORT
#if defined(CAN_COMPILE_SIMD_SSE_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_SSE2_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_SSE3_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_SSSE3_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_SSE4_1_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_SSE4_2_INSTRUCTIONS) \
// macros above should be sorted in alphabetical order
# define SIMD_SSE_INSTRUCTIONS_DETECTION_GUARD
#endif
#if defined(CAN_COMPILE_SIMD_AVX_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX_IFMA_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX_NE_CONVERT_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX_VNNI_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX_VNNI_INT8_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX_VNNI_INT16_INSTRUCTIONS) \
// macros above should be sorted in alphabetical order
# define SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD
#endif
#if defined(CAN_COMPILE_SIMD_AVX2_INSTRUCTIONS) \
// macros above should be sorted in alphabetical order
# define SIMD_AVX2_INSTRUCTIONS_DETECTION_GUARD
#endif
#if defined(CAN_COMPILE_SIMD_AVX512_BITALG_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_BW_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_CD_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_DQ_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_ER_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_F_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_IFMA_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_PF_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_VBMI_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_VBMI2_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_VL_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_VNNI_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_VP2INTERSECT_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_VPOPCNTDQ_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_4FMAPS_INSTRUCTIONS) \
|| defined(CAN_COMPILE_SIMD_AVX512_4VNNIW_INSTRUCTIONS) \
// macros above should be sorted in alphabetical order
# define SIMD_AVX512_INSTRUCTIONS_DETECTION_GUARD
#endif
#endif // HAS_CPUID_SUPPORT
// On macOS, checking the XCR0 register is NOT a guaranteed way
// to ensure the usability of AVX-512. As such, we disable the
// entire set of AVX-512 instructions.
//
// See https://stackoverflow.com/a/72523150/9579194.
#if defined(__APPLE__)
# undef SIMD_AVX512_INSTRUCTIONS_DETECTION_GUARD
// Additionally, AVX2 cannot be compiled on macOS ARM64 (yet it can be
// compiled on x86_64). However, since autoconf incorrectly assumes that it
// can when compiling a universal2 binary, we disable the detection of AVX
// and AVX2 on such builds.
# if defined(__aarch64__) || defined(__arm64__)
# undef SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD
# undef SIMD_AVX2_INSTRUCTIONS_DETECTION_GUARD
# endif
#endif
// Below, we declare macros indicating how CPUID can be called at runtime,
// so that we only call CPUID with specific inputs when needed.
#if defined(SIMD_SSE_INSTRUCTIONS_DETECTION_GUARD) \
|| defined(SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD)
/* Indicate that cpuid should be called once with EAX=1 and ECX=0. */
# define SHOULD_PARSE_CPUID_L1
#endif
#if defined(SIMD_AVX2_INSTRUCTIONS_DETECTION_GUARD) \
|| defined(SIMD_AVX512_INSTRUCTIONS_DETECTION_GUARD)
/* Indicate that cpuid should be called once with EAX=7 and ECX=0. */
# define SHOULD_PARSE_CPUID_L7
# define SHOULD_PARSE_CPUID_L7S0
#endif
#if defined(SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD)
/* Indicate that cpuid should be called once with EAX=7 and ECX=1. */
# define SHOULD_PARSE_CPUID_L7
# define SHOULD_PARSE_CPUID_L7S1
#endif
/*
* The macros below describe masks to apply on CPUID output registers.
*
* Each macro is of the form <REGISTER>_L<LEAF>[S<SUBLEAF>]_<FEATURE>,
* where <> (resp. []) denotes a required (resp. optional) group and:
*
* - REGISTER is EAX, EBX, ECX or EDX,
* - LEAF is the initial value of the EAX register (1 or 7),
* - SUBLEAF is the initial value of the ECX register (omitted if 0), and
* - FEATURE is a SIMD feature (with one or more specialized instructions).
*
* For maintainability, the flags are ordered by registers, leafs, subleafs,
* and bits. See https://en.wikipedia.org/wiki/CPUID for the values.
*
* Note 1: The LEAF is also called the 'page' or the 'level'.
* Note 2: The SUBLEAF is also referred to as the 'count'.
*/
/* CPUID (LEAF=1, SUBLEAF=0) [ECX] */
#define ECX_L1_SSE3 (UINT32_C(1) << 0) // 0x00000001
#define ECX_L1_PCLMULQDQ (UINT32_C(1) << 1) // 0x00000002
#define ECX_L1_SSSE3 (UINT32_C(1) << 9) // 0x00000200
#define ECX_L1_FMA (UINT32_C(1) << 12) // 0x00001000
#define ECX_L1_SSE4_1 (UINT32_C(1) << 19) // 0x00080000
#define ECX_L1_SSE4_2 (UINT32_C(1) << 20) // 0x00100000
#define ECX_L1_POPCNT (UINT32_C(1) << 23) // 0x00800000
#define ECX_L1_XSAVE (UINT32_C(1) << 26) // 0x04000000
#define ECX_L1_OSXSAVE (UINT32_C(1) << 27) // 0x08000000
#define ECX_L1_AVX (UINT32_C(1) << 28) // 0x10000000
/* CPUID (LEAF=1, SUBLEAF=0) [EDX] */
#define EDX_L1_CMOV (UINT32_C(1) << 15) // 0x00008000
#define EDX_L1_SSE (UINT32_C(1) << 25) // 0x02000000
#define EDX_L1_SSE2 (UINT32_C(1) << 26) // 0x04000000
/* CPUID (LEAF=7, SUBLEAF=0) [EBX] */
#define EBX_L7_AVX2 (UINT32_C(1) << 5) // 0x00000020
#define EBX_L7_AVX512_F (UINT32_C(1) << 16) // 0x00010000
#define EBX_L7_AVX512_DQ (UINT32_C(1) << 17) // 0x00020000
#define EBX_L7_AVX512_IFMA (UINT32_C(1) << 21) // 0x00200000
#define EBX_L7_AVX512_PF (UINT32_C(1) << 26) // 0x04000000
#define EBX_L7_AVX512_ER (UINT32_C(1) << 27) // 0x08000000
#define EBX_L7_AVX512_CD (UINT32_C(1) << 28) // 0x10000000
#define EBX_L7_AVX512_BW (UINT32_C(1) << 30) // 0x40000000
#define EBX_L7_AVX512_VL (UINT32_C(1) << 31) // 0x80000000
/* CPUID (LEAF=7, SUBLEAF=0) [ECX] */
#define ECX_L7_AVX512_VBMI (UINT32_C(1) << 1) // 0x00000002
#define ECX_L7_AVX512_VBMI2 (UINT32_C(1) << 6) // 0x00000040
#define ECX_L7_AVX512_VNNI (UINT32_C(1) << 11) // 0x00000800
#define ECX_L7_AVX512_BITALG (UINT32_C(1) << 12) // 0x00001000
#define ECX_L7_AVX512_VPOPCNTDQ (UINT32_C(1) << 14) // 0x00004000
/* CPUID (LEAF=7, SUBLEAF=0) [EDX] */
#define EDX_L7_AVX512_4VNNIW (UINT32_C(1) << 2) // 0x00000004
#define EDX_L7_AVX512_4FMAPS (UINT32_C(1) << 3) // 0x00000008
#define EDX_L7_AVX512_VP2INTERSECT (UINT32_C(1) << 8) // 0x00000100
/* CPUID (LEAF=7, SUBLEAF=1) [EAX] */
#define EAX_L7S1_AVX_VNNI (UINT32_C(1) << 4) // 0x00000010
#define EAX_L7S1_AVX_IFMA (UINT32_C(1) << 23) // 0x00800000
/* CPUID (LEAF=7, SUBLEAF=1) [EDX] */
#define EDX_L7S1_AVX_VNNI_INT8 (UINT32_C(1) << 4) // 0x00000010
#define EDX_L7S1_AVX_NE_CONVERT (UINT32_C(1) << 5) // 0x00000020
#define EDX_L7S1_AVX_VNNI_INT16 (UINT32_C(1) << 10) // 0x00000400
/*
 * Call __cpuid_count() or equivalent and get
 * its EAX, EBX, ECX and EDX output registers.
 *
 * 'level' is the CPUID leaf (input EAX) and 'count' is the CPUID subleaf
 * (input ECX). The four output pointers must be valid; they are always
 * written to.
 *
 * If CPUID is not supported, registers are set to 0.
 */
static inline void
get_cpuid_info(uint32_t level /* input eax */,
               uint32_t count /* input ecx */,
               CPUID_REG *eax, CPUID_REG *ebx, CPUID_REG *ecx, CPUID_REG *edx)
{
    *eax = *ebx = *ecx = *edx = 0;  // ensure the output to be initialized
#if defined(HAS_CPUID_SUPPORT) && defined(__x86_64__) && defined(__GNUC__)
    __cpuid_count(level, count, *eax, *ebx, *ecx, *edx);
#elif defined(HAS_CPUID_SUPPORT) && defined(_M_X64)
    // __cpuidex() is declared with an 'int[4]' output buffer and 'int'
    // inputs, so use a buffer of the exact expected type and convert
    // explicitly instead of passing a pointer to unsigned integers.
    int info[4] = {0, 0, 0, 0};
    __cpuidex(info, (int)level, (int)count);
    *eax = (CPUID_REG)info[0];
    *ebx = (CPUID_REG)info[1];
    *ecx = (CPUID_REG)info[2];
    *edx = (CPUID_REG)info[3];
#else
    (void)level;
    (void)count;
#endif
}
/* XSAVE state components (XCR0 control register) */
#define XCR0_SSE (UINT32_C(1) << 1) // 0x00000002
#define XCR0_AVX (UINT32_C(1) << 2) // 0x00000004
#define XCR0_AVX512_OPMASK (UINT32_C(1) << 5) // 0x00000020
#define XCR0_AVX512_ZMM_HI256 (UINT32_C(1) << 6) // 0x00000040
#define XCR0_AVX512_HI16_ZMM (UINT32_C(1) << 7) // 0x00000080
/*
 * Read the extended control register selected by 'index' and return its
 * 64-bit value. Only index 0 (XCR0) is accepted.
 *
 * When neither compiler-specific implementation is compiled in, 0 is
 * returned.
 */
static inline uint64_t
get_xgetbv(uint32_t index)
{
    assert(index == 0);  // only XCR0 is supported for now
#if defined(HAS_CPUID_SUPPORT) && defined(__x86_64__) && defined(__GNUC__)
    uint32_t lo = 0;
    uint32_t hi = 0;
    // XGETBV takes the register index in ECX and outputs EDX:EAX.
    __asm__ __volatile__("xgetbv" : "=a" (lo), "=d" (hi) : "c" (index));
    uint64_t value = (uint64_t)hi;
    value <<= 32;
    value |= lo;
    return value;
#elif defined(HAS_CPUID_SUPPORT) && defined(_M_X64)
    return (uint64_t)_xgetbv(index);
#else
    (void)index;
    return 0;
#endif
}
/*
 * Highest Function Parameter and Manufacturer ID (LEAF=0, SUBLEAF=0).
 *
 * Return the EAX output of CPUID leaf 0; the other three output
 * registers are fetched but ignored.
 */
static inline uint32_t
detect_cpuid_maxleaf(void)
{
    CPUID_REG regs[4] = {0, 0, 0, 0};  // EAX, EBX, ECX, EDX
    get_cpuid_info(0, 0, &regs[0], &regs[1], &regs[2], &regs[3]);
    return regs[0];
}
/*
 * Processor Info and Feature Bits (LEAF=1, SUBLEAF=0).
 *
 * Store in 'flags' the result of CPUID_CHECK_REG() for each feature mask
 * looked up in the ECX and EDX output registers supplied by the caller.
 *
 * The SSE and AVX members are only written when both the corresponding
 * detection guard and CAN_COMPILE_SIMD_*_INSTRUCTIONS macro are defined.
 * The remaining members (cmov, fma, popcnt, pclmulqdq, xsave, os_xsave)
 * are written whenever HAS_CPUID_SUPPORT is defined. Members that are
 * not written here are left untouched.
 */
static inline void
detect_cpuid_features(py_cpuid_features *flags, CPUID_REG ecx, CPUID_REG edx)
{
// Keep the ordering and newlines as they are declared in the structure.
#ifdef SIMD_SSE_INSTRUCTIONS_DETECTION_GUARD
// SSE family: SSE and SSE2 are read from EDX, the later ones from ECX.
#ifdef CAN_COMPILE_SIMD_SSE_INSTRUCTIONS
flags->sse = CPUID_CHECK_REG(edx, EDX_L1_SSE);
#endif
#ifdef CAN_COMPILE_SIMD_SSE2_INSTRUCTIONS
flags->sse2 = CPUID_CHECK_REG(edx, EDX_L1_SSE2);
#endif
#ifdef CAN_COMPILE_SIMD_SSE3_INSTRUCTIONS
flags->sse3 = CPUID_CHECK_REG(ecx, ECX_L1_SSE3);
#endif
#ifdef CAN_COMPILE_SIMD_SSSE3_INSTRUCTIONS
flags->ssse3 = CPUID_CHECK_REG(ecx, ECX_L1_SSSE3);
#endif
#ifdef CAN_COMPILE_SIMD_SSE4_1_INSTRUCTIONS
flags->sse41 = CPUID_CHECK_REG(ecx, ECX_L1_SSE4_1);
#endif
#ifdef CAN_COMPILE_SIMD_SSE4_2_INSTRUCTIONS
flags->sse42 = CPUID_CHECK_REG(ecx, ECX_L1_SSE4_2);
#endif
#endif // SIMD_SSE_INSTRUCTIONS_DETECTION_GUARD
#ifdef SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD
#ifdef CAN_COMPILE_SIMD_AVX_INSTRUCTIONS
flags->avx = CPUID_CHECK_REG(ecx, ECX_L1_AVX);
#endif
#endif // SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD
#ifdef HAS_CPUID_SUPPORT
// These members do not depend on any CAN_COMPILE_SIMD_* macro.
flags->cmov = CPUID_CHECK_REG(edx, EDX_L1_CMOV);
flags->fma = CPUID_CHECK_REG(ecx, ECX_L1_FMA);
flags->popcnt = CPUID_CHECK_REG(ecx, ECX_L1_POPCNT);
flags->pclmulqdq = CPUID_CHECK_REG(ecx, ECX_L1_PCLMULQDQ);
flags->xsave = CPUID_CHECK_REG(ecx, ECX_L1_XSAVE);
flags->os_xsave = CPUID_CHECK_REG(ecx, ECX_L1_OSXSAVE);
#endif
}
/*
 * Extended Feature Bits (LEAF=7, SUBLEAF=0).
 *
 * Store in 'flags' the result of CPUID_CHECK_REG() for the AVX2 and
 * AVX-512 feature masks looked up in the EBX, ECX and EDX output
 * registers supplied by the caller.
 *
 * A member is only written when both its detection guard
 * (SIMD_AVX2_INSTRUCTIONS_DETECTION_GUARD or
 * SIMD_AVX512_INSTRUCTIONS_DETECTION_GUARD) and its own
 * CAN_COMPILE_SIMD_*_INSTRUCTIONS macro are defined; otherwise it is
 * left untouched.
 */
static inline void
detect_cpuid_extended_features_L7S0(py_cpuid_features *flags,
CPUID_REG ebx, CPUID_REG ecx, CPUID_REG edx)
{
(void)ebx, (void)ecx, (void)edx; // to suppress unused warnings
// Keep the ordering and newlines as they are declared in the structure.
#ifdef SIMD_AVX2_INSTRUCTIONS_DETECTION_GUARD
#ifdef CAN_COMPILE_SIMD_AVX2_INSTRUCTIONS
flags->avx2 = CPUID_CHECK_REG(ebx, EBX_L7_AVX2);
#endif
#endif // SIMD_AVX2_INSTRUCTIONS_DETECTION_GUARD
#ifdef SIMD_AVX512_INSTRUCTIONS_DETECTION_GUARD
// AVX-512 members: the register holding each bit (EBX, ECX or EDX)
// is given by the prefix of the mask name.
#ifdef CAN_COMPILE_SIMD_AVX512_F_INSTRUCTIONS
flags->avx512_f = CPUID_CHECK_REG(ebx, EBX_L7_AVX512_F);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_CD_INSTRUCTIONS
flags->avx512_cd = CPUID_CHECK_REG(ebx, EBX_L7_AVX512_CD);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_ER_INSTRUCTIONS
flags->avx512_er = CPUID_CHECK_REG(ebx, EBX_L7_AVX512_ER);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_PF_INSTRUCTIONS
flags->avx512_pf = CPUID_CHECK_REG(ebx, EBX_L7_AVX512_PF);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_4FMAPS_INSTRUCTIONS
flags->avx512_4fmaps = CPUID_CHECK_REG(edx, EDX_L7_AVX512_4FMAPS);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_4VNNIW_INSTRUCTIONS
flags->avx512_4vnniw = CPUID_CHECK_REG(edx, EDX_L7_AVX512_4VNNIW);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_VPOPCNTDQ_INSTRUCTIONS
flags->avx512_vpopcntdq = CPUID_CHECK_REG(ecx, ECX_L7_AVX512_VPOPCNTDQ);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_VL_INSTRUCTIONS
flags->avx512_vl = CPUID_CHECK_REG(ebx, EBX_L7_AVX512_VL);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_DQ_INSTRUCTIONS
flags->avx512_dq = CPUID_CHECK_REG(ebx, EBX_L7_AVX512_DQ);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_BW_INSTRUCTIONS
flags->avx512_bw = CPUID_CHECK_REG(ebx, EBX_L7_AVX512_BW);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_IFMA_INSTRUCTIONS
flags->avx512_ifma = CPUID_CHECK_REG(ebx, EBX_L7_AVX512_IFMA);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_VBMI_INSTRUCTIONS
flags->avx512_vbmi = CPUID_CHECK_REG(ecx, ECX_L7_AVX512_VBMI);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_VNNI_INSTRUCTIONS
flags->avx512_vnni = CPUID_CHECK_REG(ecx, ECX_L7_AVX512_VNNI);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_VBMI2_INSTRUCTIONS
flags->avx512_vbmi2 = CPUID_CHECK_REG(ecx, ECX_L7_AVX512_VBMI2);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_BITALG_INSTRUCTIONS
flags->avx512_bitalg = CPUID_CHECK_REG(ecx, ECX_L7_AVX512_BITALG);
#endif
#ifdef CAN_COMPILE_SIMD_AVX512_VP2INTERSECT_INSTRUCTIONS
flags->avx512_vp2intersect = CPUID_CHECK_REG(edx, EDX_L7_AVX512_VP2INTERSECT);
#endif
#endif // SIMD_AVX512_INSTRUCTIONS_DETECTION_GUARD
}
/*
 * Extended Feature Bits (LEAF=7, SUBLEAF=1).
 *
 * Store in 'flags' the result of CPUID_CHECK_REG() for the AVX extension
 * masks (NE-CONVERT, IFMA, VNNI, VNNI-INT8, VNNI-INT16) looked up in the
 * EAX and EDX output registers supplied by the caller. The EBX and ECX
 * values are accepted but not inspected.
 *
 * A member is only written when SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD
 * and its own CAN_COMPILE_SIMD_*_INSTRUCTIONS macro are both defined;
 * otherwise it is left untouched.
 */
static inline void
detect_cpuid_extended_features_L7S1(py_cpuid_features *flags,
CPUID_REG eax,
CPUID_REG ebx,
CPUID_REG ecx,
CPUID_REG edx)
{
(void)eax, (void)ebx, (void)ecx, (void)edx; // to suppress unused warnings
// Keep the ordering and newlines as they are declared in the structure.
#ifdef SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD
#ifdef CAN_COMPILE_SIMD_AVX_NE_CONVERT_INSTRUCTIONS
flags->avx_ne_convert = CPUID_CHECK_REG(edx, EDX_L7S1_AVX_NE_CONVERT);
#endif
#ifdef CAN_COMPILE_SIMD_AVX_IFMA_INSTRUCTIONS
flags->avx_ifma = CPUID_CHECK_REG(eax, EAX_L7S1_AVX_IFMA);
#endif
#ifdef CAN_COMPILE_SIMD_AVX_VNNI_INSTRUCTIONS
flags->avx_vnni = CPUID_CHECK_REG(eax, EAX_L7S1_AVX_VNNI);
#endif
#ifdef CAN_COMPILE_SIMD_AVX_VNNI_INT8_INSTRUCTIONS
flags->avx_vnni_int8 = CPUID_CHECK_REG(edx, EDX_L7S1_AVX_VNNI_INT8);
#endif
#ifdef CAN_COMPILE_SIMD_AVX_VNNI_INT16_INSTRUCTIONS
flags->avx_vnni_int16 = CPUID_CHECK_REG(edx, EDX_L7S1_AVX_VNNI_INT16);
#endif
#endif // SIMD_AVX_INSTRUCTIONS_DETECTION_GUARD
}
/*
 * Fill the 'xcr0_*' members of 'flags' from the XCR0 control register.
 *
 * XCR0 is only read when 'flags->os_xsave' is nonzero; otherwise the
 * checks are performed against a value of 0. Nothing is written when
 * HAS_XGETBV_SUPPORT is not defined.
 */
static inline void
detect_cpuid_xsave_state(py_cpuid_features *flags)
{
    // Keep the ordering and newlines as they are declared in the structure.
#ifdef HAS_XGETBV_SUPPORT
    uint64_t xcr0 = 0;
    if (flags->os_xsave) {
        xcr0 = get_xgetbv(0);
    }

    flags->xcr0_sse = CPUID_CHECK_REG(xcr0, XCR0_SSE);

    flags->xcr0_avx = CPUID_CHECK_REG(xcr0, XCR0_AVX);

    flags->xcr0_avx512_opmask = CPUID_CHECK_REG(xcr0, XCR0_AVX512_OPMASK);
    flags->xcr0_avx512_zmm_hi256 = CPUID_CHECK_REG(xcr0, XCR0_AVX512_ZMM_HI256);
    flags->xcr0_avx512_hi16_zmm = CPUID_CHECK_REG(xcr0, XCR0_AVX512_HI16_ZMM);
#endif
}
/*
 * Mark 'flags' as ready. The structure must not already be marked
 * ready (checked by an assertion).
 */
static inline void
cpuid_features_finalize(py_cpuid_features *flags)
{
    assert(flags->ready == 0);

    /*
     * Extension point: flags depending on other flags should be adjusted
     * here at runtime, so that no illegal instruction error can be caused
     * by an inconsistent set of flags.
     */

    flags->ready = 1;
}
/*
 * Check the consistency of a set of CPUID flags.
 *
 * Return 0 if 'flags' is marked ready and consistent, and -1 otherwise.
 *
 * The flags are inconsistent if any AVX-512 extension is reported while
 * the AVX-512 Foundation (AVX-512/F) is not. A set of flags without any
 * AVX-512 support at all is valid.
 */
static inline int
cpuid_features_validate(const py_cpuid_features *flags)
{
    if (flags->ready != 1) {
        return -1;
    }

    // AVX-512/F is required to support any other AVX-512 instruction set
    uint8_t avx512_require_f = (
        // newlines are placed according to processor generations
        flags->avx512_cd ||
        flags->avx512_er || flags->avx512_pf ||
        flags->avx512_4fmaps || flags->avx512_4vnniw ||
        flags->avx512_vpopcntdq ||
        flags->avx512_vl || flags->avx512_dq || flags->avx512_bw ||
        flags->avx512_ifma || flags->avx512_vbmi ||
        flags->avx512_vnni ||
        flags->avx512_vbmi2 || flags->avx512_bitalg ||
        flags->avx512_vp2intersect
    );

    // Only reject the flags when an extension is present *without* its
    // foundation; CPUs without any AVX-512 support must be accepted.
    if (avx512_require_f && !flags->avx512_f) {
        return -1;
    }

    return 0;
}
/*
 * Return 1 if cpuid_features_validate() accepts 'flags', and 0 otherwise.
 */
int
_Py_cpuid_check_features(const py_cpuid_features *flags)
{
    // The validator signals a failure with a negative value.
    if (cpuid_features_validate(flags) < 0) {
        return 0;
    }
    return 1;
}
/*
 * Apply a 1-parameter macro MACRO(FLAG) on all members
 * of a 'py_cpuid_features' object ('ready' is omitted).
 *
 * Each member name is expanded in turn as 'MACRO(name);' inside a single
 * 'do { ... } while (0)' statement, so MACRO(name) must expand to
 * something that is valid when followed by a semicolon. MACRO may
 * contain a 'return' statement, in which case the remaining members
 * are skipped.
 *
 * The empty continuation lines separate groups of related members.
 * This macro is undefined once the functions using it have been defined.
 */
#define CPUID_APPLY_MACRO(MACRO) \
do { \
MACRO(sse); \
MACRO(sse2); \
MACRO(sse3); \
MACRO(ssse3); \
MACRO(sse41); \
MACRO(sse42); \
\
MACRO(avx); \
MACRO(avx_ifma); \
MACRO(avx_ne_convert); \
\
MACRO(avx_vnni); \
MACRO(avx_vnni_int8); \
MACRO(avx_vnni_int16); \
\
MACRO(avx2); \
\
MACRO(avx512_f); \
MACRO(avx512_cd); \
\
MACRO(avx512_er); \
MACRO(avx512_pf); \
\
MACRO(avx512_4fmaps); \
MACRO(avx512_4vnniw); \
\
MACRO(avx512_vpopcntdq); \
\
MACRO(avx512_vl); \
MACRO(avx512_dq); \
MACRO(avx512_bw); \
\
MACRO(avx512_ifma); \
MACRO(avx512_vbmi); \
\
MACRO(avx512_vnni); \
\
MACRO(avx512_vbmi2); \
MACRO(avx512_bitalg); \
\
MACRO(avx512_vp2intersect); \
\
MACRO(cmov); \
MACRO(fma); \
MACRO(popcnt); \
MACRO(pclmulqdq); \
\
MACRO(xsave); \
MACRO(os_xsave); \
\
MACRO(xcr0_sse); \
MACRO(xcr0_avx); \
MACRO(xcr0_avx512_opmask); \
MACRO(xcr0_avx512_zmm_hi256); \
MACRO(xcr0_avx512_hi16_zmm); \
} while (0)
/*
 * Set to 0 every member of 'flags' enumerated by CPUID_APPLY_MACRO().
 * The 'ready' member is not modified.
 */
void
_Py_cpuid_disable_features(py_cpuid_features *flags)
{
#define CPUID_CLEAR_FEATURE(MEMBER)     flags->MEMBER = 0
    CPUID_APPLY_MACRO(CPUID_CLEAR_FEATURE);
#undef CPUID_CLEAR_FEATURE
}
int
_Py_cpuid_has_features(const py_cpuid_features *actual,
const py_cpuid_features *expect)
{
if (!actual->ready || !expect->ready) {
return 0;
}
#define CPUID_CHECK_FEATURE(FLAG) \
do { \
if (expect->FLAG && !actual->FLAG) { \
return 0; \
} \
} while (0)
CPUID_APPLY_MACRO(CPUID_CHECK_FEATURE);
#undef CPUID_CHECK_FEATURE
return 1;
}
int
_Py_cpuid_match_features(const py_cpuid_features *actual,
const py_cpuid_features *expect)
{
if (!actual->ready || !expect->ready) {
return 0;
}
#define CPUID_MATCH_FEATURE(FLAG) \
do { \
if (expect->FLAG != actual->FLAG) { \
return 0; \
} \
} while (0)
CPUID_APPLY_MACRO(CPUID_MATCH_FEATURE);
#undef CPUID_MATCH_FEATURE
return 1;
}
#undef CPUID_APPLY_MACRO
/*
 * Detect the CPU features at runtime and store them in 'flags'.
 *
 * Nothing is done if 'flags' is already marked ready. Otherwise, all
 * features are first disabled, then the CPUID leaves selected at compile
 * time by the SHOULD_PARSE_CPUID_* macros are queried (provided that the
 * CPU reports a sufficiently high maximum leaf), and 'flags' is marked
 * ready. If the resulting flags fail validation, all features are
 * disabled again.
 *
 * Without CPUID support, 'flags' is marked ready with all features
 * disabled.
 */
void
_Py_cpuid_detect_features(py_cpuid_features *flags)
{
    if (flags->ready) {
        return;
    }
    _Py_cpuid_disable_features(flags);

#ifndef HAS_CPUID_SUPPORT
    flags->ready = 1;
#else
    uint32_t maxleaf = detect_cpuid_maxleaf();
    (void)maxleaf;  // to suppress unused warnings

    CPUID_REG eax = 0, ebx = 0, ecx = 0, edx = 0;
    (void)eax, (void)ebx, (void)ecx, (void)edx;  // to suppress unused warnings

#if defined(SHOULD_PARSE_CPUID_L1)
    if (maxleaf >= 1) {
        eax = ebx = ecx = edx = 0;
        get_cpuid_info(1, 0, &eax, &ebx, &ecx, &edx);
        detect_cpuid_features(flags, ecx, edx);
        // The XSAVE state is only inspected if 'os_xsave' was detected.
        if (flags->os_xsave) {
            detect_cpuid_xsave_state(flags);
        }
    }
#endif  // SHOULD_PARSE_CPUID_L1

#if defined(SHOULD_PARSE_CPUID_L7)
    if (maxleaf >= 7) {
#if defined(SHOULD_PARSE_CPUID_L7S0)
        eax = ebx = ecx = edx = 0;
        get_cpuid_info(7, 0, &eax, &ebx, &ecx, &edx);
        detect_cpuid_extended_features_L7S0(flags, ebx, ecx, edx);
#endif
#if defined(SHOULD_PARSE_CPUID_L7S1)
        eax = ebx = ecx = edx = 0;
        get_cpuid_info(7, 1, &eax, &ebx, &ecx, &edx);
        detect_cpuid_extended_features_L7S1(flags, eax, ebx, ecx, edx);
#endif
    }
#endif  // SHOULD_PARSE_CPUID_L7

    cpuid_features_finalize(flags);
    if (cpuid_features_validate(flags) < 0) {
        _Py_cpuid_disable_features(flags);
    }
#endif  // !HAS_CPUID_SUPPORT
}