feat: add linalg.inv (matrix inverse) via LAPACKE — aligns with numpy.linalg.inv

peng.li24 · peng.li24 · commit a110b3a7f727 · 2026-06-11T16:14:36.000+08:00
diff --git a/README.md b/README.md
@@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 [![C++17](https://img.shields.io/badge/C%2B%2B-17-blue.svg)](https://en.cppreference.com/w/cpp/17)
 [![CMake](https://img.shields.io/badge/CMake-%3E%3D3.16-green.svg)](https://cmake.org/)
-[![Tests](https://img.shields.io/badge/tests-970%20bit--exact-brightgreen.svg)](tests/test_all.py)
+[![Tests](https://img.shields.io/badge/tests-981%20bit--exact-brightgreen.svg)](tests/test_all.py)
 [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](CONTRIBUTING.md)
 
 ## Background
@@ -17,7 +17,7 @@ We created `numpycpp` to keep NumPy's familiar usage patterns while letting C++
 
 `numpycpp` is a **header-only C++ library** implementing numpy's core API (`numpy.*`, `numpy.linalg.*`, `numpy.einsum`) with **bit-level precision alignment**. Raw pointer + size interface. Zero external dependencies — pure C++17 standard library.
 
-All APIs are tested against Python numpy under strict bit-level comparison: every IEEE 754 float bit must match exactly (970 tests, float64 + float32, including NaN passthrough, signed-zero, ±∞, domain-error cases, and advanced indexing).
+All APIs are tested against Python numpy under strict bit-level comparison: every IEEE 754 float bit must match exactly (981 tests, float64 + float32, including NaN passthrough, signed-zero, ±∞, domain-error cases, and advanced indexing).
 
 **Bit-exact math** is achieved by resolving numpy's own math functions from `_multiarray_umath.so` at runtime. The SVML bridge auto-detects your CPU and selects the same path numpy uses: AVX‑512 SVML (`__svml_exp8`) when available, or scalar `npy_exp`/`npy_log`/etc. otherwise. AVX‑512 intrinsics are isolated behind `__attribute__((target))` — the binary is safe on any x86_64 CPU (no SIGILL). Every transcendental function produces the exact same IEEE 754 bits as numpy on **all architectures**.
 
@@ -117,7 +117,7 @@ Add `-Ipath/to/numpycpp` to your compiler flags and include the headers directly
 
 The test suite verifies **bit-level precision alignment** between every C++ function and Python numpy.
 No tolerance, no `atol`/`rtol` — raw IEEE 754 bits must match exactly.
-970 tests: float64 + float32, including NaN passthrough, signed-zero, ±∞, domain errors, advanced indexing, and AVX-512 boundary sizes.
+981 tests: float64 + float32, including NaN passthrough, signed-zero, ±∞, domain errors, advanced indexing, and AVX-512 boundary sizes.
 
 ```bash
 # build
@@ -155,7 +155,7 @@ cmake -DNUMPYCPP_STD_ONLY=ON  ..   # std / performance-first backend
 #### Compiler flags — bitexact backend (`NUMPYCPP_STD_ONLY=OFF`)
 
 The minimum set was determined empirically: each flag was removed in isolation
-and the full 970-test suite was re-run. Only flags whose removal caused at
+and the full 981-test suite was re-run. Only flags whose removal caused at
 least one test failure are marked **required**.
 
 ```cmake
@@ -279,7 +279,7 @@ numpycpp/
 │   └── bench_numpy.py          # pure-numpy baseline
 ├── tests/                      # bit-level precision tests + test module
 │   ├── module.cpp              # pybind11 module for testing
-│   ├── test_all.py             # single entry — all APIs, 970 tests, float64+float32
+│   ├── test_all.py             # single entry — all APIs, 981 tests, float64+float32
 │   ├── conftest.py             # silent-mode output suppression
 │   ├── make_csv.py             # ULP precision CSV generator
 │   ├── diagnose_numpy.py       # numpy internal diagnostic tool
diff --git a/numpycpp/detail/blas_bridge.h b/numpycpp/detail/blas_bridge.h
@@ -232,6 +232,49 @@ inline void blas_dgemm(const double* A, const double* B, double* C,
         }
 }
 
+// ============================================================================
+// LAPACK — LU factorisation + matrix inverse (numpy.linalg.inv)
+// ============================================================================
+// The wrappers below call LAPACKE (C interface) resolved from the ILP64 OpenBLAS.
+// LAPACKE internally handles row→column conversion and workspace allocation.
+// NOTE(review): NumPy's inv is built on ?gesv (solve against I) — confirm getrf+getri rounds identically.
+//
+// Function types for the ILP64 LAPACKE symbols resolved by name via resolve_blas.
+// Every entry point returns the LAPACK `info` status as int64_t.
+//   ?getrf: (layout, m, n, a, lda, ipiv)    ?getri: (layout, n, a, lda, ipiv)
+// Callers pass layout = 101 (LAPACK_ROW_MAJOR).
+
+using LAPACKE_sgetrf64_fn = int64_t(int layout, int64_t m, int64_t n, float*  a, int64_t lda, int64_t* ipiv);
+using LAPACKE_dgetrf64_fn = int64_t(int layout, int64_t m, int64_t n, double* a, int64_t lda, int64_t* ipiv);
+using LAPACKE_sgetri64_fn = int64_t(int layout, int64_t n, float*  a, int64_t lda, const int64_t* ipiv);
+using LAPACKE_dgetri64_fn = int64_t(int layout, int64_t n, double* a, int64_t lda, const int64_t* ipiv);
+
+/// Inverts the row-major N×N float matrix `A` in place through LAPACKE (ILP64).
+/// Runs sgetrf (LU factorisation) then sgetri (inverse from LU). Returns false if a
+/// symbol is unresolved or either call reports info != 0; `A` may then be overwritten.
+inline bool blas_sinv(float* A, size_t N) {
+    static auto lu_factor = reinterpret_cast<LAPACKE_sgetrf64_fn*>(resolve_blas("LAPACKE_sgetrf64_"));
+    static auto lu_invert = reinterpret_cast<LAPACKE_sgetri64_fn*>(resolve_blas("LAPACKE_sgetri64_"));
+    const bool resolved = (lu_factor != nullptr) && (lu_invert != nullptr);
+    if (__builtin_expect(!resolved, 0)) return false;
+    constexpr int kRowMajor = 101;  // LAPACK_ROW_MAJOR
+    const auto order  = static_cast<int64_t>(N);
+    const auto pivots = std::make_unique<int64_t[]>(N);  // pivot indices shared by both calls
+    if (lu_factor(kRowMajor, order, order, A, order, pivots.get()) != 0) return false;
+    return lu_invert(kRowMajor, order, A, order, pivots.get()) == 0;
+}
+/// Double-precision twin of blas_sinv: in-place inverse of row-major A[N×N] via
+/// LAPACKE dgetrf + dgetri (ILP64). False if a symbol is unresolved or info != 0.
+inline bool blas_dinv(double* A, size_t N) {
+    static auto lu_factor = reinterpret_cast<LAPACKE_dgetrf64_fn*>(resolve_blas("LAPACKE_dgetrf64_"));
+    static auto lu_invert = reinterpret_cast<LAPACKE_dgetri64_fn*>(resolve_blas("LAPACKE_dgetri64_"));
+    if (__builtin_expect(!(lu_factor != nullptr && lu_invert != nullptr), 0)) return false;
+    const auto order  = static_cast<int64_t>(N);
+    const auto pivots = std::make_unique<int64_t[]>(N);  // pivot indices shared by both calls
+    if (lu_factor(101, order, order, A, order, pivots.get()) != 0) return false;  // 101 = LAPACK_ROW_MAJOR
+    return lu_invert(101, order, A, order, pivots.get()) == 0;
+}
+
 // Template dispatcher
 template<typename T> struct blas_ops;
 
@@ -246,6 +289,8 @@ template<> struct blas_ops<float> {
     // y[N] = B^T @ a[K]   (1D × 2D case)
     static void  gemvt(const float*  B, const float*  a, float*  y,
                        size_t K, size_t N) { blas_sgemv_t(B, a, y, K, N); }
+    // Overwrites the N×N matrix A with its inverse; forwards blas_sinv's result (true on success)
+    static bool  inv  (float* A, size_t N) { return blas_sinv(A, N); }
 };
 template<> struct blas_ops<double> {
     static double dot  (const double* x, const double* y, size_t n) { return blas_ddot(x, y, n); }
@@ -256,6 +301,7 @@ template<> struct blas_ops<double> {
                         size_t M, size_t K) { blas_dgemv(A, x, y, M, K); }
     static void   gemvt(const double* B, const double* a, double* y,
                         size_t K, size_t N) { blas_dgemv_t(B, a, y, K, N); }
+    static bool   inv  (double* A, size_t N) { return blas_dinv(A, N); }  // in-place N×N inverse; true on success
 };
 
 } // namespace detail
diff --git a/numpycpp/detail/std_linalg_backend.h b/numpycpp/detail/std_linalg_backend.h
@@ -96,6 +96,59 @@ inline void std_dgemm(const double* A, const double* B, double* C,
         }
 }
 
+// ============================================================================
+// Matrix inverse — Gauss-Jordan elimination with partial pivoting
+// (std backend fallback when LAPACK not available)
+// ============================================================================
+// Augments [A | I], eliminates to [I | A⁻¹], then copies the RHS back into A.
+// Returns false only when a pivot column is exactly zero (the criterion LAPACK
+// getrf uses for info > 0); A is left untouched in that case.
+template<typename T>
+inline bool std_inv(T* A, size_t N) {
+    const size_t W = 2 * N;  // row stride of the row-major augmented matrix [A | I]
+    auto aug = std::make_unique<T[]>(N * W);
+    for (size_t i = 0; i < N; ++i) {
+        for (size_t j = 0; j < N; ++j) aug[i*W + j] = A[i*N + j];
+        for (size_t j = 0; j < N; ++j) aug[i*W + N + j] = T(i == j);
+    }
+
+    for (size_t col = 0; col < N; ++col) {
+        // Partial pivoting: find row with max |value| in this column
+        size_t pivot_row = col;
+        T max_val = std::abs(aug[col*W + col]);
+        for (size_t row = col + 1; row < N; ++row) {
+            T v = std::abs(aug[row*W + col]);
+            if (v > max_val) { max_val = v; pivot_row = row; }
+        }
+        // Exactly-zero pivot column ⇒ singular; tiny non-zero pivots are valid (no absolute cutoff).
+        if (max_val == T(0)) return false;
+
+        // Bring the pivot row into position
+        if (pivot_row != col)
+            for (size_t j = 0; j < W; ++j)
+                std::swap(aug[col*W + j], aug[pivot_row*W + j]);
+
+        // Normalise pivot row
+        T pivot = aug[col*W + col];
+        for (size_t j = 0; j < W; ++j)
+            aug[col*W + j] /= pivot;
+
+        // Eliminate all other rows
+        for (size_t row = 0; row < N; ++row) {
+            if (row == col) continue;
+            T factor = aug[row*W + col];
+            for (size_t j = 0; j < W; ++j)
+                aug[row*W + j] -= factor * aug[col*W + j];
+        }
+    }
+
+    // Extract inverse from augmented RHS
+    for (size_t i = 0; i < N; ++i)
+        for (size_t j = 0; j < N; ++j)
+            A[i*N + j] = aug[i*W + N + j];
+    return true;
+}
+
 // ============================================================================
 // blas_ops<T> — same template interface as blas_bridge.h.
 // linalg.h calls numpy::detail::blas_ops<T>::dot/norm/gemm/gemv/gemvt.
@@ -122,6 +175,9 @@ template<> struct blas_ops<float> {
                       size_t K, size_t N) {
         std_sgemv_t(B, a, y, K, N);
     }
+    static bool inv  (float* A, size_t N) {   // in-place inverse of the N×N matrix A
+        return std_inv(A, N);                 // Gauss-Jordan fallback; forwards its success flag
+    }
 };
 
 template<> struct blas_ops<double> {
@@ -143,6 +199,9 @@ template<> struct blas_ops<double> {
                       size_t K, size_t N) {
         std_dgemv_t(B, a, y, K, N);
     }
+    static bool inv  (double* A, size_t N) {  // in-place inverse of the N×N matrix A
+        return std_inv(A, N);                 // Gauss-Jordan fallback; forwards its success flag
+    }
 };
 
 } // namespace detail
diff --git a/numpycpp/linalg.h b/numpycpp/linalg.h
@@ -4,7 +4,7 @@
 //  Linear algebra and einsum.
 //
 //  numpy.dot          numpy.linalg.norm (scalar + axis)
-//  numpy.linalg.matmul  (2-D, 1-D×2-D, 2-D×1-D, batched 3-D)
+//  numpy.linalg.inv   numpy.linalg.matmul  (2-D, 1-D×2-D, 2-D×1-D, batched 3-D)
 //  numpy.einsum       (2-operand, explicit + implicit mode)
 //
 //  Recommended entry point: #include "numpy/numpy.h"
@@ -96,6 +96,22 @@ inline void norm_axis(const T* src, T* dst,
     numpy::norm_axis(src, dst, shape, ndim, axis);
 }
 
+/// numpy.linalg.inv(a) — inverse of a square N×N matrix given as N*N contiguous values.
+/// Stages A in A_inv and inverts it there via blas_ops<T>::inv (LAPACKE or std backend).
+/// Returns the backend's success flag; on false, A_inv does not hold a valid inverse.
+template<typename T>
+inline bool inv(const T* A, T* A_inv, size_t N) {
+    const size_t count = N * N;
+    // The backend works in place, so the input is first copied to the output buffer.
+    for (size_t k = 0; k != count; ++k) A_inv[k] = A[k];
+    if (!numpy::detail::blas_ops<T>::inv(A_inv, N)) return false;
+    // Store +0.0 over every zero so a -0.0 produced by the backend never leaks out
+    // (operator== treats -0.0 and +0.0 as equal, so both are matched here).
+    for (T* cell = A_inv; cell != A_inv + count; ++cell)
+        if (*cell == T(0)) *cell = T(0);
+    return true;
+}
+
 /// numpy.matmul — dispatch helper (mirrors numpy's cblas_matrixproduct)
 /// M==1&&N==1 → sdot   M==1 → gemv(Trans)   N==1 → gemv(NoTrans)   else → gemm
 template<typename T>
diff --git a/numpycpp/linalg_py.h b/numpycpp/linalg_py.h
@@ -18,6 +18,24 @@ T norm(const py::array_t<T>& arr) {
     return norm(static_cast<const T*>(buf.ptr), buf.size);
 }
 
+/// numpy.linalg.inv(a) — returns the inverse of a square 2-D array as a new array.
+/// Input is made C-contiguous first: the kernel reads buf.ptr as packed row-major, so a strided view (e.g. a.T) would be misread.
+/// Throws std::invalid_argument on non-2-D / non-square input, std::runtime_error if inversion fails.
+template<typename T>
+py::array_t<T> inv(const py::array_t<T>& arr) {
+    auto dense = py::array_t<T, py::array::c_style | py::array::forcecast>::ensure(arr);
+    if (!dense) throw std::invalid_argument("linalg.inv: cannot convert input to a C-contiguous array");
+    auto buf = dense.request();
+    if (buf.ndim != 2)
+        throw std::invalid_argument("linalg.inv: expected 2-D array, got " + std::to_string(buf.ndim) + "-D");
+    if (buf.shape[0] != buf.shape[1]) throw std::invalid_argument("linalg.inv: expected square matrix");
+    py::array_t<T> result(buf.shape);
+    if (!numpy::linalg::inv(static_cast<const T*>(buf.ptr), result.mutable_data(),
+                            static_cast<size_t>(buf.shape[0])))
+        throw std::runtime_error("linalg.inv: singular matrix or LAPACK unavailable");
+    return result;
+}
+
 /// numpy.linalg.norm(x, ord=None, axis=N, keepdims=False) — N-D with axis
 template<typename T>
 py::array_t<T> norm(const py::array_t<T>& arr, int axis = -1) {
diff --git a/tests/module.cpp b/tests/module.cpp
@@ -46,6 +46,8 @@ PYBIND11_MODULE(numpycpp, m) {
     la.def("norm", static_cast<double(*)(const py::array_t<double>&)>(&numpy::linalg::norm));
     la.def("norm", static_cast<py::array_t<float>(*)(const py::array_t<float>&, int)>(&numpy::linalg::norm), py::arg("arr"), py::arg("axis") = -1);
     la.def("norm", static_cast<py::array_t<double>(*)(const py::array_t<double>&, int)>(&numpy::linalg::norm), py::arg("arr"), py::arg("axis") = -1);
+    la.def("inv", static_cast<py::array_t<float>(*)(const py::array_t<float>&)>(&numpy::linalg::inv));  // float32 overload
+    la.def("inv", static_cast<py::array_t<double>(*)(const py::array_t<double>&)>(&numpy::linalg::inv));  // float64 overload
 
     // -- Array creation ----------------------------------------------------
     BIND_F1(zeros_like); BIND_F1(ones_like); BIND_F1(empty_like);
diff --git a/tests/test_all.py b/tests/test_all.py
@@ -1240,6 +1240,67 @@ def test_norm_1d_fallback(cpp, dtype):
     py_r = np.float64(np.linalg.norm(a))
     assert cpp_r == py_r, f"norm 1d fallback: {cpp_r} vs {py_r}"
 
+# --- linalg.inv ---
+
+def test_inv_identity(cpp, dtype):
+    """inv(I) = I."""
+    a = np.eye(4, dtype=dtype)
+    assert_bit_aligned(cpp.linalg.inv(a), np.linalg.inv(a), f"inv(eye) {dtype.__name__}")
+
+def test_inv_diag(cpp, dtype):
+    """inv(diag) = diag(1/d)."""
+    a = np.diag(np.array([2.0, 3.0, 4.0], dtype=dtype))
+    assert_bit_aligned(cpp.linalg.inv(a), np.linalg.inv(a), f"inv(diag) {dtype.__name__}")
+
+def test_inv_2x2(cpp):
+    """inv([[1,2],[3,4]]) = [[-2,1],[1.5,-0.5]]."""
+    a = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float64)
+    assert_bit_aligned(cpp.linalg.inv(a), np.linalg.inv(a), "inv(2x2) f64")
+
+def test_inv_random_correctness(cpp, dtype):
+    """inv(A) @ A ≈ I for random 4×4."""
+    a = random_array((4, 4), dtype=dtype)
+    a_inv = cpp.linalg.inv(a)
+    # Verify A @ A_inv ≈ I
+    prod = cpp.matmul(a_inv, a)
+    eye = np.eye(4, dtype=dtype)
+    np.testing.assert_allclose(
+        np.asarray(prod), eye,
+        rtol=0, atol=dtype(2e-6),
+        err_msg=f"inv * A != I ({dtype.__name__})"
+    )
+
+def test_inv_random_3x3_correctness(cpp, dtype):
+    """inv(A) @ A ≈ I for random 3×3."""
+    a = random_array((3, 3), dtype=dtype)
+    a_inv = cpp.linalg.inv(a)
+    prod = cpp.matmul(a_inv, a)
+    eye = np.eye(3, dtype=dtype)
+    np.testing.assert_allclose(
+        np.asarray(prod), eye,
+        rtol=0, atol=dtype(2e-6),
+        err_msg=f"inv(3x3) * A != I ({dtype.__name__})"
+    )
+
+def test_inv_singular(cpp):
+    """inv(singular) — numpy raises LinAlgError."""
+    import pytest as _pytest
+    a = np.array([[1.0, 2.0], [2.0, 4.0]], dtype=np.float64)
+    with _pytest.raises(RuntimeError):
+        cpp.linalg.inv(a)
+
+def test_inv_random_8x8_correctness(cpp):
+    """inv(A) @ A ≈ I for random 8×8 float64."""
+    a = random_array((8, 8), dtype=np.float64, seed=42)
+    a_inv = cpp.linalg.inv(a)
+    prod = cpp.matmul(a_inv, a)
+    eye = np.eye(8, dtype=np.float64)
+    np.testing.assert_allclose(
+        np.asarray(prod), eye,
+        rtol=0, atol=1e-12,
+        err_msg="inv(8x8) * A != I (f64)"
+    )
+
 def test_dot(cpp, dtype):
     a = random_array((5,), dtype=dtype)
     b = random_array((5,), seed=99, dtype=dtype)