@@ -73,6 +73,8 @@ inline const char* find_openblas_path() {
7373 }
7474 }
7575 }
76+ // If not found yet, allow retry on next call (OpenBLAS may be loaded later)
77+ if (path.empty()) tried = false;
7678 return path.empty() ? nullptr : path.c_str();
7779}
7880
@@ -106,7 +108,8 @@ using cblas_dgemm64_fn = void(int, int, int,
106108 double, double*, int64_t);
107109
108110inline float blas_sdot(const float* x, const float* y, size_t n) {
109- static auto fn = (sdot64_fn*)resolve_blas(" sdot_64_" );
111+ static sdot64_fn* fn = nullptr;
112+ if (!fn) fn = (sdot64_fn*)resolve_blas(" sdot_64_" );
110113 if (__builtin_expect(fn != nullptr, 1)) {
111114 const int64_t ni = static_cast<int64_t>(n), inc = 1;
112115 return fn(&ni, x, &inc, y, &inc);
@@ -118,7 +121,8 @@ inline float blas_sdot(const float* x, const float* y, size_t n) {
118121}
119122
120123inline double blas_ddot(const double* x, const double* y, size_t n) {
121- static auto fn = (ddot64_fn*)resolve_blas(" ddot_64_" );
124+ static ddot64_fn* fn = nullptr;
125+ if (!fn) fn = (ddot64_fn*)resolve_blas(" ddot_64_" );
122126 if (__builtin_expect(fn != nullptr, 1)) {
123127 const int64_t ni = static_cast<int64_t>(n), inc = 1;
124128 return fn(&ni, x, &inc, y, &inc);
@@ -141,7 +145,8 @@ using cblas_dgemv64_fn = void(int, int, int64_t, int64_t,
141145
142146// y[M] = A[M×K] @ x[K] — 2D × 1D case
143147inline void blas_sgemv(const float* A, const float* x, float* y, size_t M, size_t K) {
144- static auto fn = (cblas_sgemv64_fn*)resolve_blas(" cblas_sgemv64_" );
148+ static cblas_sgemv64_fn* fn = nullptr;
149+ if (!fn) fn = (cblas_sgemv64_fn*)resolve_blas(" cblas_sgemv64_" );
145150 if (__builtin_expect(fn != nullptr, 1)) {
146151 fn(101, 111, (int64_t)M, (int64_t)K, 1.0f, A, (int64_t)K,
147152 x, 1, 0.0f, y, 1);
@@ -154,7 +159,8 @@ inline void blas_sgemv(const float* A, const float* x, float* y, size_t M, size_
154159 }
155160}
156161inline void blas_dgemv(const double* A, const double* x, double* y, size_t M, size_t K) {
157- static auto fn = (cblas_dgemv64_fn*)resolve_blas(" cblas_dgemv64_" );
162+ static cblas_dgemv64_fn* fn = nullptr;
163+ if (!fn) fn = (cblas_dgemv64_fn*)resolve_blas(" cblas_dgemv64_" );
158164 if (__builtin_expect(fn != nullptr, 1)) {
159165 fn(101, 111, (int64_t)M, (int64_t)K, 1.0, A, (int64_t)K,
160166 x, 1, 0.0, y, 1);
@@ -169,7 +175,8 @@ inline void blas_dgemv(const double* A, const double* x, double* y, size_t M, si
169175
170176// y[N] = B^T[K×N] @ a[K] — 1D × 2D case (Trans=112)
171177inline void blas_sgemv_t(const float* B, const float* a, float* y, size_t K, size_t N) {
172- static auto fn = (cblas_sgemv64_fn*)resolve_blas(" cblas_sgemv64_" );
178+ static cblas_sgemv64_fn* fn = nullptr;
179+ if (!fn) fn = (cblas_sgemv64_fn*)resolve_blas(" cblas_sgemv64_" );
173180 if (__builtin_expect(fn != nullptr, 1)) {
174181 fn(101, 112, (int64_t)K, (int64_t)N, 1.0f, B, (int64_t)N,
175182 a, 1, 0.0f, y, 1);
@@ -182,7 +189,8 @@ inline void blas_sgemv_t(const float* B, const float* a, float* y, size_t K, siz
182189 }
183190}
184191inline void blas_dgemv_t(const double* B, const double* a, double* y, size_t K, size_t N) {
185- static auto fn = (cblas_dgemv64_fn*)resolve_blas(" cblas_dgemv64_" );
192+ static cblas_dgemv64_fn* fn = nullptr;
193+ if (!fn) fn = (cblas_dgemv64_fn*)resolve_blas(" cblas_dgemv64_" );
186194 if (__builtin_expect(fn != nullptr, 1)) {
187195 fn(101, 112, (int64_t)K, (int64_t)N, 1.0, B, (int64_t)N,
188196 a, 1, 0.0, y, 1);
@@ -199,7 +207,8 @@ inline void blas_dgemv_t(const double* B, const double* a, double* y, size_t K,
199207// Uses cblas_sgemm64_ — same kernel numpy.matmul calls → 0 ULP by construction.
200208inline void blas_sgemm(const float* A, const float* B, float* C,
201209 size_t M, size_t K, size_t N) {
202- static auto fn = (cblas_sgemm64_fn*)resolve_blas(" cblas_sgemm64_" );
210+ static cblas_sgemm64_fn* fn = nullptr;
211+ if (!fn) fn = (cblas_sgemm64_fn*)resolve_blas(" cblas_sgemm64_" );
203212 if (__builtin_expect(fn != nullptr, 1)) {
204213 fn(101, 111, 111, // RowMajor, NoTrans, NoTrans
205214 (int64_t)M, (int64_t)N, (int64_t)K,
@@ -218,7 +227,8 @@ inline void blas_sgemm(const float* A, const float* B, float* C,
218227
219228inline void blas_dgemm(const double* A, const double* B, double* C,
220229 size_t M, size_t K, size_t N) {
221- static auto fn = (cblas_dgemm64_fn*)resolve_blas(" cblas_dgemm64_" );
230+ static cblas_dgemm64_fn* fn = nullptr;
231+ if (!fn) fn = (cblas_dgemm64_fn*)resolve_blas(" cblas_dgemm64_" );
222232 if (__builtin_expect(fn != nullptr, 1)) {
223233 fn(101, 111, 111,
224234 (int64_t)M, (int64_t)N, (int64_t)K,
@@ -261,7 +271,8 @@ template<> inline bool blas_gesv_inv<float>(float* A, size_t N) {
261271 // float32 → float64 → dgesv → float32
262272 // (OpenBLAS sgesv_64_ gives 1-ULP-off results vs numpy on this build;
263273 // the float64 path is bit-identical for both types.)
264- static auto gesv = (dgesv64_fn*)resolve_blas(" dgesv_64_" );
274+ static dgesv64_fn* gesv = nullptr;
275+ if (!gesv) gesv = (dgesv64_fn*)resolve_blas(" dgesv_64_" );
265276 if (__builtin_expect(gesv == nullptr, 0)) return false;
266277 int64_t n = static_cast<int64_t>(N);
267278 auto ipiv = std::make_unique<int64_t[]>(N);
@@ -287,7 +298,8 @@ template<> inline bool blas_gesv_inv<float>(float* A, size_t N) {
287298}
288299
289300template<> inline bool blas_gesv_inv<double>(double* A, size_t N) {
290- static auto gesv = (dgesv64_fn*)resolve_blas(" dgesv_64_" );
301+ static dgesv64_fn* gesv = nullptr;
302+ if (!gesv) gesv = (dgesv64_fn*)resolve_blas(" dgesv_64_" );
291303 if (__builtin_expect(gesv == nullptr, 0)) return false;
292304 int64_t n = static_cast<int64_t>(N);
293305 auto ipiv = std::make_unique<int64_t[]>(N);
0 commit comments