Skip to content

Commit fbf3dd7

Browse files
author
peng.li24
committed
ci: detect AVX-512 usability at cmake time; fix DISPATCH ifdef guards
Root cause: Azure CI VMs expose avx512f in CPUID but the hypervisor traps ZMM instruction execution. -mavx512f makes GCC compile AVX-512 code (including avx512_loops.h template specializations without runtime guards) → SIGILL at test startup. Fix 1 — CMakeLists.txt: use check_cxx_source_runs to probe whether a ZMM instruction actually executes. Only adds -mavx512f/-mprefer-vector-width=256 when the probe succeeds. On Azure CI runners the probe gets SIGILL (exit 132) → NUMPYCPP_AVX512_WORKS=FALSE → no -mavx512f → no SIGILL. On real AVX-512 hardware the probe succeeds → SVML + wide AVX-512 loop paths are enabled as before. Fix 2 — svml_bridge.h: wrap all DISPATCH_F64/F32 macro bodies and call sites in #ifdef __AVX512F__ so the dispatchers compile cleanly when -mavx512f is absent. Without -mavx512f: always scalar npy_* path (still bit-exact — numpy uses the same scalar path on non-AVX-512).
1 parent fde9e28 commit fbf3dd7

2 files changed

Lines changed: 55 additions & 14 deletions

File tree

numpy/detail/svml_bridge.h

Lines changed: 27 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -261,31 +261,42 @@ inline float atan2_npy_f32(float y, float x) {
261261
#undef NUMPY_NPY_F32
262262

263263
// ============================================================================
264-
// Dispatchers — select SVML (AVX-512) or npy_* (scalar) at runtime
264+
// Dispatchers — select SVML (AVX-512) or npy_* (scalar) at runtime.
265+
//
266+
// SVML call sites are guarded by #ifdef __AVX512F__ so the dispatchers
267+
// compile cleanly when -mavx512f is absent (e.g. cloud CI runners where
268+
// CPUID reports avx512f but the hypervisor traps ZMM execution).
269+
// Without -mavx512f: always use the scalar npy_* path — still bit-exact.
265270
// ============================================================================
266271

272+
#ifdef __AVX512F__
267273
#define DISPATCH_F64(name) \
268274
inline double name##_f64(double x) { \
269-
if (cpu_has_avx512f()) { \
270-
return name##_svml_f64(x); \
271-
} \
275+
if (cpu_has_avx512f()) return name##_svml_f64(x); \
272276
return name##_npy_f64(x); \
273277
}
274-
275278
#define DISPATCH_F32(name) \
276279
inline float name##_f32(float x) { \
277-
if (cpu_has_avx512f()) { \
278-
return name##_svml_f32(x); \
279-
} \
280+
if (cpu_has_avx512f()) return name##_svml_f32(x); \
280281
return name##_npy_f32(x); \
281282
}
283+
#else
284+
#define DISPATCH_F64(name) \
285+
inline double name##_f64(double x) { return name##_npy_f64(x); }
286+
#define DISPATCH_F32(name) \
287+
inline float name##_f32(float x) { return name##_npy_f32(x); }
288+
#endif
282289

283290
DISPATCH_F64(exp)
284291
DISPATCH_F64(log)
285292
// sin_f64: custom — SVML scalar broadcast path loses signed zero (sin(-0)→+0).
286293
// IEEE 754 requires sin(±0) = ±0; preserve sign of zero explicitly.
287294
inline double sin_f64(double x) {
295+
#ifdef __AVX512F__
288296
double r = cpu_has_avx512f() ? sin_svml_f64(x) : sin_npy_f64(x);
297+
#else
298+
double r = sin_npy_f64(x);
299+
#endif
289300
if (__builtin_expect(x == 0.0 && r == 0.0, 0)) return x; // ±0 → ±0
290301
return r;
291302
}
@@ -326,19 +337,27 @@ inline float cos_f32(float x) { return cos_npy_f32(x); }
326337

327338
// pow / atan2 dispatchers
328339
inline double pow_f64(double x, double e) {
340+
#ifdef __AVX512F__
329341
if (cpu_has_avx512f()) return pow_svml_f64(x, e);
342+
#endif
330343
return pow_npy_f64(x, e);
331344
}
332345
inline float pow_f32(float x, float e) {
346+
#ifdef __AVX512F__
333347
if (cpu_has_avx512f()) return pow_svml_f32(x, e);
348+
#endif
334349
return pow_npy_f32(x, e);
335350
}
336351
inline double atan2_f64(double y, double x) {
352+
#ifdef __AVX512F__
337353
if (cpu_has_avx512f()) return atan2_svml_f64(y, x);
354+
#endif
338355
return atan2_npy_f64(y, x);
339356
}
340357
inline float atan2_f32(float y, float x) {
358+
#ifdef __AVX512F__
341359
if (cpu_has_avx512f()) return atan2_svml_f32(y, x);
360+
#endif
342361
return atan2_npy_f32(y, x);
343362
}
344363

tests/CMakeLists.txt

Lines changed: 28 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -53,6 +53,23 @@ find_package(Eigen3 CONFIG QUIET)
5353
# OpenMP (optional — einsum uses #pragma omp only when _OPENMP is defined)
5454
find_package(OpenMP)
5555

56+
# AVX-512 runtime probe — compile AND run a tiny binary that executes a ZMM
57+
# instruction. check_cxx_source_runs returns TRUE only if the binary exits 0,
58+
# so it correctly distinguishes machines where CPUID says avx512f but the
59+
# hypervisor traps ZMM execution (e.g. some Azure VM SKUs used by GitHub Actions).
60+
include(CheckCXXSourceRuns)
61+
set(CMAKE_REQUIRED_FLAGS "-mavx512f -mfma")
62+
check_cxx_source_runs("
63+
#include <immintrin.h>
64+
int main() {
65+
__m512d a = _mm512_set1_pd(2.0);
66+
__m512d b = _mm512_sqrt_pd(a);
67+
(void)b;
68+
return 0;
69+
}
70+
" NUMPYCPP_AVX512_WORKS)
71+
unset(CMAKE_REQUIRED_FLAGS)
72+
5673
# ---- Python extension module -------------------------------------------------
5774
pybind11_add_module(numpycpp MODULE module.cpp)
5875

@@ -69,12 +86,7 @@ endif()
6986
target_compile_options(numpycpp PRIVATE
7087
-O2
7188
-ffp-contract=off # no implicit FMA for a+b*c (keeps Cody-Waite exact)
72-
-msse4.1 -mavx512f -mfma # -mavx512f needed to compile AVX-512 intrinsics
73-
-mprefer-vector-width=256 # prevent auto-vectorizer from emitting 512-bit (ZMM)
74-
# instructions globally; Azure VMs expose avx512f in
75-
# cpuinfo but may have ZMM state disabled by hypervisor
76-
# → explicit __attribute__((target("avx512f"))) + runtime
77-
# __builtin_cpu_supports guard handles AVX-512 safely
89+
-msse4.1 -mfma # baseline SSE4.1 + FMA
7890
# disable builtin replacements so our calls go through SVML/npy_math paths
7991
-fno-builtin-exp -fno-builtin-log -fno-builtin-sin
8092
-fno-builtin-cos -fno-builtin-tan -fno-builtin-pow
@@ -83,6 +95,16 @@ target_compile_options(numpycpp PRIVATE
8395
-fno-builtin-atan -fno-builtin-exp2
8496
-fno-builtin-cbrt -fno-builtin-expm1 -fno-builtin-log1p
8597
)
98+
if(NUMPYCPP_AVX512_WORKS)
99+
# AVX-512 is actually executable (not just advertised in CPUID).
100+
# Enable SVML vector path + wide AVX-512 loops — these are the same
101+
# paths numpy uses, giving bit-exact results.
102+
target_compile_options(numpycpp PRIVATE
103+
-mavx512f -mprefer-vector-width=256)
104+
message(STATUS "AVX-512: usable — SVML + wide-loop path enabled")
105+
else()
106+
message(STATUS "AVX-512: not usable (missing or hypervisor-trapped) — scalar npy_* path (still bit-exact)")
107+
endif()
86108

87109
if(OpenMP_CXX_FOUND)
88110
target_link_libraries(numpycpp PRIVATE OpenMP::OpenMP_CXX)

0 commit comments

Comments
 (0)