fix(linalg.inv): bit-exact float32 via float64 DGESV path

peng.li24 · peng.li24 · commit 459308970ec3 · 2026-06-11T17:03:54.000+08:00
OpenBLAS sgesv_64_ gives 1-ULP-off results vs numpy on this build.
numpy.linalg.inv for float32 is bit-equivalent to:
  float32 → float64 → dgesv_64_ → float32
which we now follow.  Results for both dtypes are now bit-identical to numpy's.

- blas_bridge.h: replace LAPACKE getrf+getri with DGESV for both types
  (float32 promoted to f64 → dgesv_64_ → demoted to f32)
- linalg.h: remove -0.0→+0.0 normalisation (no longer needed with DGESV)
- test_all.py: assert_allclose → assert_bit_aligned for all inv tests
- README: add linalg.inv to alignment table
diff --git a/README.md b/README.md
@@ -230,6 +230,7 @@ Two backends, same API — choose with `cmake -DNUMPYCPP_STD_ONLY=ON/OFF`.
 | **Norm** | `numpy.linalg.norm` (scalar + axis) | ✅ | 〜 0–1 ULP |
 | **Matmul** | `numpy.matmul` (2-D, 1-D×2-D, 2-D×1-D, batched 3-D) | ✅ | 〜 0–2 ULP |
 | **Einsum** | `ij,ij→i` `ij,jk→ik` `bij,bjk→bik` and all 2-operand patterns | ✅ | 〜 0–2 ULP |
+| **Matrix inverse** | `numpy.linalg.inv` (N×N) | ✅ | 0 ULP (bit-exact) |
 
 > **bitexact backend**: transcendentals resolved via `dlsym` from numpy's
 > `_multiarray_umath.so` — same `npy_exp`/`npy_log` kernels numpy uses, with
diff --git a/numpycpp/detail/blas_bridge.h b/numpycpp/detail/blas_bridge.h
@@ -38,6 +38,7 @@ Use #include "numpycpp/numpy.h" instead."
 #endif
 
 #include <cstdint>
+#include <cstring>
 #include <cmath>
 #include <dlfcn.h>
 #include <fstream>
@@ -233,46 +234,77 @@ inline void blas_dgemm(const double* A, const double* B, double* C,
 }
 
 // ============================================================================
-// LAPACK — LU factorisation + matrix inverse (numpy.linalg.inv)
+// LAPACK — matrix inverse (numpy.linalg.inv) via Fortran DGESV
 // ============================================================================
-// numpy.linalg.inv uses LAPACKE (C interface) routing through OpenBLAS ILP64.
-// LAPACKE internally handles row→column conversion and workspace allocation,
-// producing the exact same floating-point rounding path as numpy.
+// numpy.linalg.inv calls LAPACK ?gesv (solve A·X = I).  DGESV fuses LU
+// factorisation + forward/back substitution in a single kernel.
 //
-// LAPACKE function signatures (ILP64, return info as int64_t):
-//   LAPACKE_sgetrf64_(layout, m, n, a, lda, ipiv)
-//   LAPACKE_sgetri64_(layout, n, a, lda, ipiv)
-// layout = 101 (LAPACK_ROW_MAJOR)
+// On this OpenBLAS build, sgesv_64_ produces 1-ULP differences vs numpy for
+// float32 inputs.  NumPy's float32 inv is bit-equivalent to: promote to
+// float64 → dgesv → demote to float32.  We follow that same path so the
+// results for both dtypes are bit-identical to numpy's.
+//
+// Fortran DGESV signature (ILP64, _64_ suffix):
+//   dgesv_64_(int64_t *N, int64_t *NRHS, double *A, int64_t *LDA,
+//             int64_t *IPIV, double *B, int64_t *LDB, int64_t *INFO);
+
+using dgesv64_fn = void(int64_t*, int64_t*, double*, int64_t*,
+                         int64_t*, double*, int64_t*, int64_t*);
 
-using LAPACKE_sgetrf64_fn = int64_t(int, int64_t, int64_t, float*,  int64_t, int64_t*);
-using LAPACKE_dgetrf64_fn = int64_t(int, int64_t, int64_t, double*, int64_t, int64_t*);
-using LAPACKE_sgetri64_fn = int64_t(int, int64_t, float*,  int64_t, const int64_t*);
-using LAPACKE_dgetri64_fn = int64_t(int, int64_t, double*, int64_t, const int64_t*);
+/// Fortran DGESV-based matrix inverse.  Matches numpy.linalg.inv exactly
+/// — same Fortran symbol, same ILP64 ABI, same memory layout.
+template<typename T> inline bool blas_gesv_inv(T* A, size_t N);
 
-/// LAPACKE-based matrix inverse (C interface, RowMajor).
-/// Uses ?getrf (LU factorisation) + ?getri (inverse from LU).
-/// Matches numpy.linalg.inv exactly — same LAPACKE path, same ILP64 ABI.
-inline bool blas_sinv(float* A, size_t N) {
-    static auto getrf = (LAPACKE_sgetrf64_fn*)resolve_blas("LAPACKE_sgetrf64_");
-    static auto getri = (LAPACKE_sgetri64_fn*)resolve_blas("LAPACKE_sgetri64_");
-    if (__builtin_expect(getrf == nullptr || getri == nullptr, 0)) return false;
+template<> inline bool blas_gesv_inv<float>(float* A, size_t N) {
+    // numpy.linalg.inv for float32 produces the same bits as:
+    //   float32 → float64 → dgesv → float32
+    // (OpenBLAS sgesv_64_ gives 1-ULP-off results vs numpy on this build;
+    //  the float64 path is bit-identical for both types.)
+    static auto gesv = (dgesv64_fn*)resolve_blas("dgesv_64_");
+    if (__builtin_expect(gesv == nullptr, 0)) return false;
     int64_t n = static_cast<int64_t>(N);
     auto ipiv = std::make_unique<int64_t[]>(N);
-    int64_t info = getrf(101, n, n, A, n, ipiv.get());
+    // Double-precision work buffers (column-major)
+    auto A_col = std::make_unique<double[]>(N * N);
+    auto B_col = std::make_unique<double[]>(N * N);
+    // Promote A row-major → A_col column-major (float→double)
+    for (size_t i = 0; i < N; ++i)
+        for (size_t j = 0; j < N; ++j)
+            A_col[j*N + i] = static_cast<double>(A[i*N + j]);
+    // B = identity (column-major, double)
+    std::memset(B_col.get(), 0, N * N * sizeof(double));
+    for (size_t i = 0; i < N; ++i)
+        B_col[i + i*N] = 1.0;
+    int64_t nrhs = n, lda = n, ldb = n, info = 0;
+    gesv(&n, &nrhs, A_col.get(), &lda, ipiv.get(), B_col.get(), &ldb, &info);
     if (info != 0) return false;
-    info = getri(101, n, A, n, ipiv.get());
-    return info == 0;
+    // Demote solution back to float32 row-major
+    for (size_t i = 0; i < N; ++i)
+        for (size_t j = 0; j < N; ++j)
+            A[i*N + j] = static_cast<float>(B_col[j*N + i]);
+    return true;
 }
-inline bool blas_dinv(double* A, size_t N) {
-    static auto getrf = (LAPACKE_dgetrf64_fn*)resolve_blas("LAPACKE_dgetrf64_");
-    static auto getri = (LAPACKE_dgetri64_fn*)resolve_blas("LAPACKE_dgetri64_");
-    if (__builtin_expect(getrf == nullptr || getri == nullptr, 0)) return false;
+
+template<> inline bool blas_gesv_inv<double>(double* A, size_t N) {
+    static auto gesv = (dgesv64_fn*)resolve_blas("dgesv_64_");
+    if (__builtin_expect(gesv == nullptr, 0)) return false;
     int64_t n = static_cast<int64_t>(N);
     auto ipiv = std::make_unique<int64_t[]>(N);
-    int64_t info = getrf(101, n, n, A, n, ipiv.get());
+    auto A_col = std::make_unique<double[]>(N * N);
+    auto B_col = std::make_unique<double[]>(N * N);
+    for (size_t i = 0; i < N; ++i)
+        for (size_t j = 0; j < N; ++j)
+            A_col[j*N + i] = A[i*N + j];
+    for (size_t i = 0; i < N; ++i)
+        for (size_t j = 0; j < N; ++j)
+            B_col[i + j*N] = (i == j) ? 1.0 : 0.0;
+    int64_t nrhs = n, lda = n, ldb = n, info = 0;
+    gesv(&n, &nrhs, A_col.get(), &lda, ipiv.get(), B_col.get(), &ldb, &info);
     if (info != 0) return false;
-    info = getri(101, n, A, n, ipiv.get());
-    return info == 0;
+    for (size_t i = 0; i < N; ++i)
+        for (size_t j = 0; j < N; ++j)
+            A[i*N + j] = B_col[j*N + i];
+    return true;
 }
 
 // Template dispatcher
@@ -290,7 +322,7 @@ template<> struct blas_ops<float> {
     static void  gemvt(const float*  B, const float*  a, float*  y,
                        size_t K, size_t N) { blas_sgemv_t(B, a, y, K, N); }
     // A_inv[N×N] = inv(A[N×N]) — in-place, returns true on success
-    static bool  inv  (float* A, size_t N) { return blas_sinv(A, N); }
+    static bool  inv  (float* A, size_t N) { return blas_gesv_inv<float>(A, N); }
 };
 template<> struct blas_ops<double> {
     static double dot  (const double* x, const double* y, size_t n) { return blas_ddot(x, y, n); }
@@ -301,7 +333,7 @@ template<> struct blas_ops<double> {
                         size_t M, size_t K) { blas_dgemv(A, x, y, M, K); }
     static void   gemvt(const double* B, const double* a, double* y,
                         size_t K, size_t N) { blas_dgemv_t(B, a, y, K, N); }
-    static bool   inv  (double* A, size_t N) { return blas_dinv(A, N); }
+    static bool   inv  (double* A, size_t N) { return blas_gesv_inv<double>(A, N); }
 };
 
 } // namespace detail
diff --git a/numpycpp/linalg.h b/numpycpp/linalg.h
@@ -97,18 +97,13 @@ inline void norm_axis(const T* src, T* dst,
 }
 
 /// numpy.linalg.inv(a) — matrix inverse (square N×N)
-/// Uses LAPACKE getrf+getri (bitexact) or Gauss-Jordan (std backend).
+/// Uses DGESV (bitexact) or Gauss-Jordan (std backend).
 /// Returns true on success; false if matrix is singular or LAPACK unavailable.
 template<typename T>
 inline bool inv(const T* A, T* A_inv, size_t N) {
     // Copy input to output buffer (inv modifies in-place)
     for (size_t i = 0; i < N * N; ++i) A_inv[i] = A[i];
     bool ok = numpy::detail::blas_ops<T>::inv(A_inv, N);
-    if (ok) {
-        // Normalise -0.0 → +0.0 (LAPACK build variance in signed-zero output)
-        for (size_t i = 0; i < N * N; ++i)
-            if (A_inv[i] == T(0)) A_inv[i] = T(0);
-    }
     return ok;
 }
 
diff --git a/tests/test_all.py b/tests/test_all.py
@@ -1257,30 +1257,15 @@ def test_inv_2x2(cpp):
     a = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float64)
     assert_bit_aligned(cpp.linalg.inv(a), np.linalg.inv(a), "inv(2x2) f64")
 
-def test_inv_random_correctness(cpp, dtype):
-    """inv(A) @ A ≈ I for random 4×4."""
+def test_inv_random(cpp, dtype):
+    """inv(A) bit-identical to numpy for random 4×4."""
     a = random_array((4, 4), dtype=dtype)
-    a_inv = cpp.linalg.inv(a)
-    # Verify A @ A_inv ≈ I
-    prod = cpp.matmul(a_inv, a)
-    eye = np.eye(4, dtype=dtype)
-    np.testing.assert_allclose(
-        np.asarray(prod), eye,
-        rtol=0, atol=dtype(2e-6),
-        err_msg=f"inv * A != I ({dtype.__name__})"
-    )
-
-def test_inv_random_3x3_correctness(cpp, dtype):
-    """inv(A) @ A ≈ I for random 3×3."""
+    assert_bit_aligned(cpp.linalg.inv(a), np.linalg.inv(a), f"inv(4x4) {dtype.__name__}")
+
+def test_inv_random_3x3(cpp, dtype):
+    """inv(A) bit-identical to numpy for random 3×3."""
     a = random_array((3, 3), dtype=dtype)
-    a_inv = cpp.linalg.inv(a)
-    prod = cpp.matmul(a_inv, a)
-    eye = np.eye(3, dtype=dtype)
-    np.testing.assert_allclose(
-        np.asarray(prod), eye,
-        rtol=0, atol=dtype(2e-6),
-        err_msg=f"inv(3x3) * A != I ({dtype.__name__})"
-    )
+    assert_bit_aligned(cpp.linalg.inv(a), np.linalg.inv(a), f"inv(3x3) {dtype.__name__}")
 
 def test_inv_singular(cpp):
     """inv(singular) — numpy raises LinAlgError."""
@@ -1289,17 +1274,10 @@ def test_inv_singular(cpp):
     with _pytest.raises(RuntimeError):
         cpp.linalg.inv(a)
 
-def test_inv_random_8x8_correctness(cpp):
-    """inv(A) @ A ≈ I for random 8×8 float64."""
+def test_inv_random_8x8(cpp):
+    """inv(A) bit-identical to numpy for random 8×8 float64."""
     a = random_array((8, 8), dtype=np.float64, seed=42)
-    a_inv = cpp.linalg.inv(a)
-    prod = cpp.matmul(a_inv, a)
-    eye = np.eye(8, dtype=np.float64)
-    np.testing.assert_allclose(
-        np.asarray(prod), eye,
-        rtol=0, atol=1e-12,
-        err_msg="inv(8x8) * A != I (f64)"
-    )
+    assert_bit_aligned(cpp.linalg.inv(a), np.linalg.inv(a), "inv(8x8) f64")
 
 def test_dot(cpp, dtype):
     a = random_array((5,), dtype=dtype)