ci: detect AVX-512 usability at cmake time; fix DISPATCH ifdef guards

peng.li24 · peng.li24 · commit fbf3dd7ba6c6 · 2026-06-06T23:02:30.000+08:00
Root cause: Azure CI VMs expose avx512f in CPUID but the hypervisor
traps ZMM instruction execution. -mavx512f makes GCC compile AVX-512
code (including avx512_loops.h template specializations without runtime
guards) → SIGILL at test startup.

Fix 1 — CMakeLists.txt: use check_cxx_source_runs to probe whether a
ZMM instruction actually executes. -mavx512f and -mprefer-vector-width=256
are added only when the probe succeeds. On Azure CI runners the probe is
killed by SIGILL (exit 132) → NUMPYCPP_AVX512_WORKS=FALSE → no -mavx512f →
no SIGILL. On real AVX-512 hardware the probe succeeds → SVML + wide
AVX-512 loop paths are enabled as before.

Fix 2 — svml_bridge.h: wrap all DISPATCH_F64/F32 macro bodies and
call sites in #ifdef __AVX512F__ so the dispatchers compile cleanly
when -mavx512f is absent. Without -mavx512f: always scalar npy_* path
(still bit-exact — numpy uses the same scalar path on non-AVX-512).
diff --git a/numpy/detail/svml_bridge.h b/numpy/detail/svml_bridge.h
@@ -261,31 +261,42 @@ inline float atan2_npy_f32(float y, float x) {
 #undef NUMPY_NPY_F32
 
 // ============================================================================
-// Dispatchers — select SVML (AVX-512) or npy_* (scalar) at runtime
+// Dispatchers — select SVML (AVX-512) or npy_* (scalar) at runtime.
+//
+// SVML call sites are guarded by #ifdef __AVX512F__ so the dispatchers
+// compile cleanly when -mavx512f is absent (e.g. cloud CI runners where
+// CPUID reports avx512f but the hypervisor traps ZMM execution).
+// Without -mavx512f: always use the scalar npy_* path — still bit-exact.
 // ============================================================================
 
+#ifdef __AVX512F__
 #define DISPATCH_F64(name)                                              \
     inline double name##_f64(double x) {                                 \
-        if (cpu_has_avx512f()) {                                         \
-            return name##_svml_f64(x);                                   \
-        }                                                                \
+        if (cpu_has_avx512f()) return name##_svml_f64(x);               \
         return name##_npy_f64(x);                                        \
     }
-
 #define DISPATCH_F32(name)                                              \
     inline float name##_f32(float x) {                                   \
-        if (cpu_has_avx512f()) {                                         \
-            return name##_svml_f32(x);                                   \
-        }                                                                \
+        if (cpu_has_avx512f()) return name##_svml_f32(x);               \
         return name##_npy_f32(x);                                        \
     }
+#else
+#define DISPATCH_F64(name)                                              \
+    inline double name##_f64(double x) { return name##_npy_f64(x); }
+#define DISPATCH_F32(name)                                              \
+    inline float name##_f32(float x)  { return name##_npy_f32(x); }
+#endif
 
 DISPATCH_F64(exp)
 DISPATCH_F64(log)
 // sin_f64: custom — SVML scalar broadcast path loses signed zero (sin(-0)→+0).
 // IEEE 754 requires sin(±0) = ±0; preserve sign of zero explicitly.
 inline double sin_f64(double x) {
+#ifdef __AVX512F__
     double r = cpu_has_avx512f() ? sin_svml_f64(x) : sin_npy_f64(x);
+#else
+    double r = sin_npy_f64(x);
+#endif
     if (__builtin_expect(x == 0.0 && r == 0.0, 0)) return x;  // ±0 → ±0
     return r;
 }
@@ -326,19 +337,27 @@ inline float cos_f32(float x)  { return cos_npy_f32(x); }
 
 // pow / atan2 dispatchers
 inline double pow_f64(double x, double e) {
+#ifdef __AVX512F__
     if (cpu_has_avx512f()) return pow_svml_f64(x, e);
+#endif
     return pow_npy_f64(x, e);
 }
 inline float pow_f32(float x, float e) {
+#ifdef __AVX512F__
     if (cpu_has_avx512f()) return pow_svml_f32(x, e);
+#endif
     return pow_npy_f32(x, e);
 }
 inline double atan2_f64(double y, double x) {
+#ifdef __AVX512F__
     if (cpu_has_avx512f()) return atan2_svml_f64(y, x);
+#endif
     return atan2_npy_f64(y, x);
 }
 inline float atan2_f32(float y, float x) {
+#ifdef __AVX512F__
     if (cpu_has_avx512f()) return atan2_svml_f32(y, x);
+#endif
     return atan2_npy_f32(y, x);
 }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -53,6 +53,23 @@ find_package(Eigen3 CONFIG QUIET)
 # OpenMP (optional — einsum uses #pragma omp only when _OPENMP is defined)
 find_package(OpenMP)
 
+# AVX-512 runtime probe: build AND run a tiny program that must execute a ZMM
+# instruction; TRUE only if it exits 0, so a hypervisor that advertises avx512f
+# in CPUID but traps ZMM execution (some Azure VM SKUs) is detected as unusable.
+include(CheckCXXSourceRuns)
+include(CMakePushCheckState)
+cmake_push_check_state()   # save caller's CMAKE_REQUIRED_* instead of clobbering
+set(CMAKE_REQUIRED_FLAGS "-mavx512f -mfma")
+check_cxx_source_runs("
+    #include <immintrin.h>
+    int main() {
+        volatile double in = 2.0;  /* volatile in/out: sqrt cannot be folded or dropped */
+        volatile double out = _mm512_reduce_add_pd(_mm512_sqrt_pd(_mm512_set1_pd(in)));
+        return out > 0.0 ? 0 : 1;
+    }
+" NUMPYCPP_AVX512_WORKS)
+cmake_pop_check_state()
+
 # ---- Python extension module -------------------------------------------------
 pybind11_add_module(numpycpp MODULE module.cpp)
 
@@ -69,12 +86,7 @@ endif()
 target_compile_options(numpycpp PRIVATE
     -O2
     -ffp-contract=off          # no implicit FMA for a+b*c (keeps Cody-Waite exact)
-    -msse4.1 -mavx512f -mfma  # -mavx512f needed to compile AVX-512 intrinsics
-    -mprefer-vector-width=256  # prevent auto-vectorizer from emitting 512-bit (ZMM)
-                               # instructions globally; Azure VMs expose avx512f in
-                               # cpuinfo but may have ZMM state disabled by hypervisor
-                               # → explicit __attribute__((target("avx512f"))) + runtime
-                               # __builtin_cpu_supports guard handles AVX-512 safely
+    -msse4.1 -mfma             # baseline SSE4.1 + FMA
     # disable builtin replacements so our calls go through SVML/npy_math paths
     -fno-builtin-exp   -fno-builtin-log   -fno-builtin-sin
     -fno-builtin-cos   -fno-builtin-tan   -fno-builtin-pow
@@ -83,6 +95,16 @@ target_compile_options(numpycpp PRIVATE
     -fno-builtin-atan  -fno-builtin-exp2
     -fno-builtin-cbrt  -fno-builtin-expm1 -fno-builtin-log1p
 )
+if(NUMPYCPP_AVX512_WORKS)
+    # AVX-512 is actually executable (not just advertised in CPUID).
+    # Enable SVML vector path + wide AVX-512 loops — these are the same
+    # paths numpy uses, giving bit-exact results.
+    target_compile_options(numpycpp PRIVATE
+        -mavx512f -mprefer-vector-width=256)
+    message(STATUS "AVX-512: usable — SVML + wide-loop path enabled")
+else()
+    message(STATUS "AVX-512: not usable (missing or hypervisor-trapped) — scalar npy_* path (still bit-exact)")
+endif()
 
 if(OpenMP_CXX_FOUND)
     target_link_libraries(numpycpp PRIVATE OpenMP::OpenMP_CXX)