2828// sdot_64_(n*, x*, incx*, y*, incy*) → float (return in xmm0)
2929// ddot_64_(n*, x*, incx*, y*, incy*) → double (return in xmm0)
3030//
31- // Fallback (if OpenBLAS not discovered): sequential accumulation.
31+ // No fallback: if OpenBLAS ILP64 symbols are not found, throws
32+ // std::runtime_error immediately. Silent failures are forbidden.
3233
3334#pragma once
3435
@@ -41,6 +42,7 @@ Use #include "numpycpp/numpy.h" instead."
4142#include <cstring>
4243#include <cmath>
4344#include <memory>
45+ #include <stdexcept>
4446#include <dlfcn.h>
4547#include <fstream>
4648#include <string>
@@ -50,40 +52,63 @@ namespace detail {
5052
5153inline void* g_blas_handle = nullptr;
5254
53- inline const char* find_openblas_path() {
55+ inline const std::string& find_openblas_path() {
5456 static std::string path;
5557 static bool tried = false;
56- if (tried) return path.empty() ? nullptr : path.c_str();
58+ if (tried) {
59+ if (path.empty())
60+ throw std::runtime_error(
61+ " OpenBLAS ILP64 library not found in /proc/self/maps. "
62+ " Import numpy (or another package that loads libopenblas64) "
63+ "before calling numpycpp BLAS/LAPACK functions.");
64+ return path;
65+ }
5766 tried = true ;
5867
5968 std::ifstream maps (" /proc/self/maps" );
6069 std::string line;
6170 while (std::getline(maps, line)) {
62- if (line.find(" libopenblas" ) != std::string::npos &&
63- line.find(" .so" ) != std::string::npos) {
71+ // Match only ILP64 OpenBLAS (e.g. libopenblas64_p-*.so).
72+ // scipy's LP64 libopenblas**p**-*.so does NOT have _64_ symbols
73+ // (dgesv_64_, sdot_64_, etc.), so picking it causes silent failures.
74+ if (line.find (" libopenblas64" ) != std::string::npos &&
75+ line.find (" .so" ) != std::string::npos) {
6476 auto pos = line.rfind (' /' );
6577 auto start = line.rfind (' ' , pos);
6678 if (start != std::string::npos && pos != std::string::npos) {
6779 path = line.substr (start + 1 );
68- // trim trailing whitespace / newline
6980 while (!path.empty () && (path.back () == ' \n ' || path.back () == ' \r '
7081 || path.back () == ' ' ))
7182 path.pop_back ();
72- break ;
83+ return path ;
7384 }
7485 }
7586 }
76- // If not found yet, allow retry on next call (OpenBLAS may be loaded later)
77- if (path.empty()) tried = false;
78- return path.empty() ? nullptr : path.c_str();
87+ // Not found — throw immediately. No retry, no fallback.
88+ throw std::runtime_error (
89+ " OpenBLAS ILP64 library (libopenblas64) not found in /proc/self/maps. "
90+ " Import numpy before calling numpycpp BLAS/LAPACK functions." );
7991}
8092
8193inline void * resolve_blas (const char * sym) {
8294 if (!g_blas_handle) {
83- const char* path = find_openblas_path();
84- if (path) g_blas_handle = dlopen(path, RTLD_NOLOAD | RTLD_LAZY);
95+ const std::string& path = find_openblas_path (); // throws if not found
96+ void * h = dlopen (path.c_str (), RTLD_NOLOAD | RTLD_LAZY );
97+ if (!h)
98+ throw std::runtime_error (std::string (" dlopen failed for " ) + path +
99+ " : " + (dlerror () ? dlerror () : " unknown" ));
100+ // Validate the symbol exists before caching the handle
101+ if (!dlsym (h, sym))
102+ throw std::runtime_error (std::string (" BLAS symbol '" ) + sym +
103+ " ' not found in " + path +
104+ " (ILP64 ABI mismatch? Check libopenblas64_p-*.so)" );
105+ g_blas_handle = h;
85106 }
86- return g_blas_handle ? dlsym(g_blas_handle, sym) : nullptr;
107+ void * fn = dlsym (g_blas_handle, sym);
108+ if (!fn)
109+ throw std::runtime_error (std::string (" BLAS symbol '" ) + sym +
110+ " ' disappeared from cached handle" );
111+ return fn;
87112}
88113
89114// ILP64 Fortran function types (all int args are int64_t by pointer)
@@ -108,28 +133,15 @@ using cblas_dgemm64_fn = void(int, int, int,
108133 double , double *, int64_t );
109134
110135inline float blas_sdot (const float * x, const float * y, size_t n) {
111- static sdot64_fn* fn = nullptr;
112- if (!fn) fn = (sdot64_fn*)resolve_blas(" sdot_64_" );
113- if (__builtin_expect(fn != nullptr, 1)) {
114- const int64_t ni = static_cast<int64_t>(n), inc = 1;
115- return fn(&ni, x, &inc, y, &inc);
116- }
117- // Fallback: sequential accumulation
118- float r = 0.0f;
119- for (size_t i = 0; i < n; ++i) r += x[i] * y[i];
120- return r;
136+ static sdot64_fn* fn = (sdot64_fn*)resolve_blas (" sdot_64_" );
137+ const int64_t ni = static_cast <int64_t >(n), inc = 1 ;
138+ return fn (&ni, x, &inc, y, &inc);
121139}
122140
123141inline double blas_ddot (const double * x, const double * y, size_t n) {
124- static ddot64_fn* fn = nullptr;
125- if (!fn) fn = (ddot64_fn*)resolve_blas(" ddot_64_" );
126- if (__builtin_expect(fn != nullptr, 1)) {
127- const int64_t ni = static_cast<int64_t>(n), inc = 1;
128- return fn(&ni, x, &inc, y, &inc);
129- }
130- double r = 0.0;
131- for (size_t i = 0; i < n; ++i) r += x[i] * y[i];
132- return r;
142+ static ddot64_fn* fn = (ddot64_fn*)resolve_blas (" ddot_64_" );
143+ const int64_t ni = static_cast <int64_t >(n), inc = 1 ;
144+ return fn (&ni, x, &inc, y, &inc);
133145}
134146
135147// cblas_sgemv64_ / cblas_dgemv64_ — matrix-vector, ILP64
@@ -145,103 +157,46 @@ using cblas_dgemv64_fn = void(int, int, int64_t, int64_t,
145157
146158// y[M] = A[M×K] @ x[K] — 2D × 1D case
147159inline void blas_sgemv (const float * A, const float * x, float * y, size_t M, size_t K) {
148- static cblas_sgemv64_fn* fn = nullptr;
149- if (!fn) fn = (cblas_sgemv64_fn*)resolve_blas(" cblas_sgemv64_" );
150- if (__builtin_expect(fn != nullptr, 1)) {
151- fn(101, 111, (int64_t)M, (int64_t)K, 1.0f, A, (int64_t)K,
152- x, 1, 0.0f, y, 1);
153- return;
154- }
155- for (size_t i = 0; i < M; ++i) {
156- float s = 0.0f;
157- for (size_t k = 0; k < K; ++k) s += A[i*K+k] * x[k];
158- y[i] = s;
159- }
160+ static cblas_sgemv64_fn* fn = (cblas_sgemv64_fn*)resolve_blas (" cblas_sgemv64_" );
161+ fn (101 , 111 , (int64_t )M, (int64_t )K, 1 .0f , A, (int64_t )K,
162+ x, 1 , 0 .0f , y, 1 );
160163}
161164inline void blas_dgemv (const double * A, const double * x, double * y, size_t M, size_t K) {
162- static cblas_dgemv64_fn* fn = nullptr;
163- if (!fn) fn = (cblas_dgemv64_fn*)resolve_blas(" cblas_dgemv64_" );
164- if (__builtin_expect(fn != nullptr, 1)) {
165- fn(101, 111, (int64_t)M, (int64_t)K, 1.0, A, (int64_t)K,
166- x, 1, 0.0, y, 1);
167- return;
168- }
169- for (size_t i = 0; i < M; ++i) {
170- double s = 0.0;
171- for (size_t k = 0; k < K; ++k) s += A[i*K+k] * x[k];
172- y[i] = s;
173- }
165+ static cblas_dgemv64_fn* fn = (cblas_dgemv64_fn*)resolve_blas (" cblas_dgemv64_" );
166+ fn (101 , 111 , (int64_t )M, (int64_t )K, 1.0 , A, (int64_t )K,
167+ x, 1 , 0.0 , y, 1 );
174168}
175169
176170// y[N] = B^T[K×N] @ a[K] — 1D × 2D case (Trans=112)
177171inline void blas_sgemv_t (const float * B, const float * a, float * y, size_t K, size_t N) {
178- static cblas_sgemv64_fn* fn = nullptr;
179- if (!fn) fn = (cblas_sgemv64_fn*)resolve_blas(" cblas_sgemv64_" );
180- if (__builtin_expect(fn != nullptr, 1)) {
181- fn(101, 112, (int64_t)K, (int64_t)N, 1.0f, B, (int64_t)N,
182- a, 1, 0.0f, y, 1);
183- return;
184- }
185- for (size_t j = 0; j < N; ++j) {
186- float s = 0.0f;
187- for (size_t k = 0; k < K; ++k) s += B[k*N+j] * a[k];
188- y[j] = s;
189- }
172+ static cblas_sgemv64_fn* fn = (cblas_sgemv64_fn*)resolve_blas (" cblas_sgemv64_" );
173+ fn (101 , 112 , (int64_t )K, (int64_t )N, 1 .0f , B, (int64_t )N,
174+ a, 1 , 0 .0f , y, 1 );
190175}
191176inline void blas_dgemv_t (const double * B, const double * a, double * y, size_t K, size_t N) {
192- static cblas_dgemv64_fn* fn = nullptr;
193- if (!fn) fn = (cblas_dgemv64_fn*)resolve_blas(" cblas_dgemv64_" );
194- if (__builtin_expect(fn != nullptr, 1)) {
195- fn(101, 112, (int64_t)K, (int64_t)N, 1.0, B, (int64_t)N,
196- a, 1, 0.0, y, 1);
197- return;
198- }
199- for (size_t j = 0; j < N; ++j) {
200- double s = 0.0;
201- for (size_t k = 0; k < K; ++k) s += B[k*N+j] * a[k];
202- y[j] = s;
203- }
177+ static cblas_dgemv64_fn* fn = (cblas_dgemv64_fn*)resolve_blas (" cblas_dgemv64_" );
178+ fn (101 , 112 , (int64_t )K, (int64_t )N, 1.0 , B, (int64_t )N,
179+ a, 1 , 0.0 , y, 1 );
204180}
205181
206182// C = A @ B (all row-major) C[M×N] = A[M×K] @ B[K×N]
207183// Uses cblas_sgemm64_ — same kernel numpy.matmul calls → 0 ULP by construction.
208184inline void blas_sgemm (const float * A, const float * B, float * C,
209185 size_t M, size_t K, size_t N) {
210- static cblas_sgemm64_fn* fn = nullptr;
211- if (!fn) fn = (cblas_sgemm64_fn*)resolve_blas(" cblas_sgemm64_" );
212- if (__builtin_expect(fn != nullptr, 1)) {
213- fn(101, 111, 111, // RowMajor, NoTrans, NoTrans
214- (int64_t)M, (int64_t)N, (int64_t)K,
215- 1.0f, A, (int64_t)K, B, (int64_t)N,
216- 0.0f, C, (int64_t)N);
217- return;
218- }
219- // Fallback (no OpenBLAS): naive triple loop — not bit-exact but always correct
220- for (size_t i = 0; i < M; ++i)
221- for (size_t j = 0; j < N; ++j) {
222- float s = 0.0f;
223- for (size_t k = 0; k < K; ++k) s += A[i*K+k] * B[k*N+j];
224- C[i*N+j] = s;
225- }
186+ static cblas_sgemm64_fn* fn = (cblas_sgemm64_fn*)resolve_blas (" cblas_sgemm64_" );
187+ fn (101 , 111 , 111 , // RowMajor, NoTrans, NoTrans
188+ (int64_t )M, (int64_t )N, (int64_t )K,
189+ 1 .0f , A, (int64_t )K, B, (int64_t )N,
190+ 0 .0f , C, (int64_t )N);
226191}
227192
228193inline void blas_dgemm (const double * A, const double * B, double * C,
229194 size_t M, size_t K, size_t N) {
230- static cblas_dgemm64_fn* fn = nullptr;
231- if (!fn) fn = (cblas_dgemm64_fn*)resolve_blas(" cblas_dgemm64_" );
232- if (__builtin_expect(fn != nullptr, 1)) {
233- fn(101, 111, 111,
234- (int64_t)M, (int64_t)N, (int64_t)K,
235- 1.0, A, (int64_t)K, B, (int64_t)N,
236- 0.0, C, (int64_t)N);
237- return;
238- }
239- for (size_t i = 0; i < M; ++i)
240- for (size_t j = 0; j < N; ++j) {
241- double s = 0.0;
242- for (size_t k = 0; k < K; ++k) s += A[i*K+k] * B[k*N+j];
243- C[i*N+j] = s;
244- }
195+ static cblas_dgemm64_fn* fn = (cblas_dgemm64_fn*)resolve_blas (" cblas_dgemm64_" );
196+ fn (101 , 111 , 111 ,
197+ (int64_t )M, (int64_t )N, (int64_t )K,
198+ 1.0 , A, (int64_t )K, B, (int64_t )N,
199+ 0.0 , C, (int64_t )N);
245200}
246201
247202// ============================================================================
@@ -264,16 +219,15 @@ using dgesv64_fn = void(int64_t*, int64_t*, double*, int64_t*,
264219
265220// / Fortran DGESV-based matrix inverse. Matches numpy.linalg.inv exactly
266221// / — same Fortran symbol, same ILP64 ABI, same memory layout.
267- template<typename T> inline bool blas_gesv_inv(T* A, size_t N);
222+ // / Throws std::runtime_error on singular matrix (LAPACK info != 0).
223+ template <typename T> inline void blas_gesv_inv (T* A, size_t N);
268224
269- template<> inline bool blas_gesv_inv<float>(float* A, size_t N) {
225+ template <> inline void blas_gesv_inv<float >(float * A, size_t N) {
270226 // numpy.linalg.inv for float32 produces the same bits as:
271227 // float32 → float64 → dgesv → float32
272228 // (OpenBLAS sgesv_64_ gives 1-ULP-off results vs numpy on this build;
273229 // the float64 path is bit-identical for both types.)
274- static dgesv64_fn* gesv = nullptr;
275- if (!gesv) gesv = (dgesv64_fn*)resolve_blas(" dgesv_64_" );
276- if (__builtin_expect(gesv == nullptr, 0)) return false;
230+ static dgesv64_fn* gesv = (dgesv64_fn*)resolve_blas (" dgesv_64_" );
277231 int64_t n = static_cast <int64_t >(N);
278232 auto ipiv = std::make_unique<int64_t []>(N);
279233 // Double-precision work buffers (column-major)
@@ -289,18 +243,17 @@ template<> inline bool blas_gesv_inv<float>(float* A, size_t N) {
289243 B_col[i + i*N] = 1.0 ;
290244 int64_t nrhs = n, lda = n, ldb = n, info = 0 ;
291245 gesv (&n, &nrhs, A_col.get (), &lda, ipiv.get (), B_col.get (), &ldb, &info);
292- if (info != 0) return false;
246+ if (info != 0 )
247+ throw std::runtime_error (" linalg.inv: singular matrix (DGESV info=" +
248+ std::to_string (info) + " )" );
293249 // Demote solution back to float32 row-major
294250 for (size_t i = 0 ; i < N; ++i)
295251 for (size_t j = 0 ; j < N; ++j)
296252 A[i*N + j] = static_cast <float >(B_col[j*N + i]);
297- return true;
298253}
299254
300- template<> inline bool blas_gesv_inv<double>(double* A, size_t N) {
301- static dgesv64_fn* gesv = nullptr;
302- if (!gesv) gesv = (dgesv64_fn*)resolve_blas(" dgesv_64_" );
303- if (__builtin_expect(gesv == nullptr, 0)) return false;
255+ template <> inline void blas_gesv_inv<double >(double * A, size_t N) {
256+ static dgesv64_fn* gesv = (dgesv64_fn*)resolve_blas (" dgesv_64_" );
304257 int64_t n = static_cast <int64_t >(N);
305258 auto ipiv = std::make_unique<int64_t []>(N);
306259 auto A_col = std::make_unique<double []>(N * N);
@@ -313,11 +266,12 @@ template<> inline bool blas_gesv_inv<double>(double* A, size_t N) {
313266 B_col[i + j*N] = (i == j) ? 1.0 : 0.0 ;
314267 int64_t nrhs = n, lda = n, ldb = n, info = 0 ;
315268 gesv (&n, &nrhs, A_col.get (), &lda, ipiv.get (), B_col.get (), &ldb, &info);
316- if (info != 0) return false;
269+ if (info != 0 )
270+ throw std::runtime_error (" linalg.inv: singular matrix (DGESV info=" +
271+ std::to_string (info) + " )" );
317272 for (size_t i = 0 ; i < N; ++i)
318273 for (size_t j = 0 ; j < N; ++j)
319274 A[i*N + j] = B_col[j*N + i];
320- return true;
321275}
322276
323277// Template dispatcher
@@ -328,14 +282,11 @@ template<> struct blas_ops<float> {
328282 static float norm (const float * x, size_t n) { return std::sqrt (blas_sdot (x, x, n)); }
329283 static void gemm (const float * A, const float * B, float * C,
330284 size_t M, size_t K, size_t N) { blas_sgemm (A, B, C, M, K, N); }
331- // y[M] = A[M×K] @ x[K]
332285 static void gemv (const float * A, const float * x, float * y,
333286 size_t M, size_t K) { blas_sgemv (A, x, y, M, K); }
334- // y[N] = B^T @ a[K] (1D × 2D case)
335287 static void gemvt (const float * B, const float * a, float * y,
336288 size_t K, size_t N) { blas_sgemv_t (B, a, y, K, N); }
337- // A_inv[N×N] = inv(A[N×N]) — in-place, returns true on success
338- static bool inv (float* A, size_t N) { return blas_gesv_inv<float>(A, N); }
289+ static void inv (float * A, size_t N) { blas_gesv_inv<float >(A, N); }
339290};
340291template <> struct blas_ops <double > {
341292 static double dot (const double * x, const double * y, size_t n) { return blas_ddot (x, y, n); }
@@ -346,7 +297,7 @@ template<> struct blas_ops<double> {
346297 size_t M, size_t K) { blas_dgemv (A, x, y, M, K); }
347298 static void gemvt (const double * B, const double * a, double * y,
348299 size_t K, size_t N) { blas_dgemv_t (B, a, y, K, N); }
349- static bool inv (double* A, size_t N) { return blas_gesv_inv<double>(A, N); }
300+ static void inv (double * A, size_t N) { blas_gesv_inv<double >(A, N); }
350301};
351302
352303} // namespace detail
0 commit comments