fix: normalize log domain-error NaN to +qNaN; align einsum vs-matmul tests

peng.li24 · peng.li24 · commit 5dbf32bb9734 · 2026-06-06T23:28:39.000+08:00
Two remaining CI failures (numpy 1.26.4, non-AVX512 runner):

1. test_domain_log_neg [float64]
   npy_log(-x) returns 0xfff8000000000000 (negative qNaN);
   numpy 1.26.4 ufunc returns 0x7ff8000000000000 (positive qNaN).
   Fix: add custom log_f64() in svml_bridge.h that normalises any
   domain-error NaN (isnan(r) && !isnan(x)) to positive qNaN via memcpy,
   matching numpy's canonical NaN convention.

2. test_vs_matmul / test_vs_batch_matmul
   np.einsum('ij,jk->ik') uses numpy's SSE forward mul+add kernel;
   numpy.matmul / a @ b uses BLAS (cblas_sgemm).  They can differ by
   ±1 ULP for arbitrary inputs — the two tests were comparing against
   the wrong reference (a @ b instead of np.einsum).
   Fix: compare cpp.einsum result against np.einsum (same path as
   test_random, which already passes), not against matmul.
diff --git a/numpy/detail/svml_bridge.h b/numpy/detail/svml_bridge.h
@@ -288,7 +288,24 @@ inline float atan2_npy_f32(float y, float x) {
 #endif
 
 DISPATCH_F64(exp)
-DISPATCH_F64(log)
+// log_f64: custom wrapper instead of DISPATCH_F64(log) — npy_log(-x) returns negative
+// qNaN (0xfff8...) while numpy's ufunc yields positive qNaN (0x7ff8...); fix the sign here.
+inline double log_f64(double x) {
+    double r;
+#ifdef __AVX512F__
+    r = cpu_has_avx512f() ? log_svml_f64(x) : log_npy_f64(x);  // runtime CPU check picks the backend
+#else
+    r = log_npy_f64(x);
+#endif
+    // A NaN result from a non-NaN input is a domain error → return +qNaN; other results are returned unchanged.
+    if (__builtin_expect(std::isnan(r) && !std::isnan(x), 0)) {  // hinted as the unlikely path
+        constexpr uint64_t qnan_bits = 0x7ff8000000000000ULL;  // sign bit clear: positive quiet NaN
+        double pos_nan;
+        std::memcpy(&pos_nan, &qnan_bits, 8);  // bit-copy the pattern; assumes double is 8 bytes
+        return pos_nan;
+    }
+    return r;
+}
 // sin_f64: custom — SVML scalar broadcast path loses signed zero (sin(-0)→+0).
 // IEEE 754 requires sin(±0) = ±0; preserve sign of zero explicitly.
 inline double sin_f64(double x) {
diff --git a/tests/test_all.py b/tests/test_all.py
@@ -1063,8 +1063,12 @@ def test_random(self, cpp, dtype, i, j, k):
     def test_vs_matmul(self, cpp, dtype):
         a = random_array((4, 5), seed=1, dtype=dtype)
         b = random_array((5, 3), seed=2, dtype=dtype)
-        assert_bit_aligned(np.asarray(cpp.einsum("ij,jk->ik", a, b)), a @ b,
-                           "ij,jk->ik vs matmul")
+        # Compare vs np.einsum (same SSE forward mul+add path as our implementation).
+        # np.einsum uses that kernel while np.matmul dispatches to BLAS, so the two can
+        # differ by ±1 ULP — the bit-exact contract is with np.einsum, not with matmul.
+        assert_bit_aligned(np.asarray(cpp.einsum("ij,jk->ik", a, b)),
+                           np.einsum("ij,jk->ik", a, b),
+                           "ij,jk->ik vs np.einsum")
 
 
 class TestEinsumBijBjkToBik:
@@ -1086,8 +1090,12 @@ def test_random(self, cpp, dtype, batch, i, j, k):
     def test_vs_batch_matmul(self, cpp, dtype):
         a = random_array((4, 5, 6), seed=1, dtype=dtype)
         b = random_array((4, 6, 3), seed=2, dtype=dtype)
-        assert_bit_aligned(np.asarray(cpp.einsum("bij,bjk->bik", a, b)), a @ b,
-                           "bij,bjk->bik vs batch matmul")
+        # Compare vs np.einsum (same SSE forward mul+add path as our implementation).
+        # np.einsum uses that kernel while np.matmul dispatches to BLAS, so the two can
+        # differ by ±1 ULP — the bit-exact contract is with np.einsum, not with batched matmul.
+        assert_bit_aligned(np.asarray(cpp.einsum("bij,bjk->bik", a, b)),
+                           np.einsum("bij,bjk->bik", a, b),
+                           "bij,bjk->bik vs np.einsum")
 
 
 class TestEinsumAijAijToAi: