feat: add cbrt, expm1, log1p — bit-exact with numpy via SVML bridge

peng.li24 · peng.li24 · commit 49647b7aa457 · 2026-06-03T10:26:54.000+08:00
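On AVX-512 the SVML bridge auto-dispatches to
__svml_cbrt8/__svml_expm18/__svml_log1p8 (f64) and
__svml_cbrtf16/__svml_expm1f16/__svml_log1pf16 (f32); otherwise it
falls back to the dlsym-resolved npy_* scalars. Same pattern as the
existing tan/asin/acos/atan/log10/log2/exp2 entries.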

- numpy/svml_bridge.h: SVML + npy_* fallback + dispatchers + svml_impl
- numpy/core.h: cbrt/expm1/log1p array functions (NUMPY_UNROLL4)
- pycpp/core_py.h: DEF_ELEMWISE wrappers
- tests/module.cpp: BIND_F1 bindings
- tests/test_all.py: 24 new tests (3 funcs × 4 sizes × 2 dtypes)
- README.md, .github/workflows/ci.yml: test count 476 → 500
- tests/Makefile, README.md build flags:
  -fno-builtin-cbrt -fno-builtin-expm1 -fno-builtin-log1p

Verified: 0 of 100000 random inputs differ from numpy for all 3 funcs,
in both f64 and f32.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -8,7 +8,7 @@ on:
     branches: [master]
 
 jobs:
-  # ---- Test: build module + run 476 precision tests --------------------------
+  # ---- Test: build module + run 500 precision tests --------------------------
   test:
     runs-on: ubuntu-22.04
     steps:
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ We created `numpycpp` to keep NumPy's familiar usage patterns while letting C++
 
 `numpycpp` is a **header-only C++ library** implementing numpy's core API (`numpy.*`, `numpy.linalg.*`, `numpy.einsum`) with **bit-level precision alignment**. Raw pointer + size interface. Zero external dependencies — pure C++17 standard library.
 
-All APIs are tested against Python numpy under strict bit-level comparison: every IEEE 754 float bit must match exactly (476 tests, float64 + float32).
+All APIs are tested against Python numpy under strict bit-level comparison: every IEEE 754 float bit must match exactly (500 tests, float64 + float32).
 
 **Bit-exact math** is achieved by resolving numpy's own math functions from `_multiarray_umath.so` at runtime. The SVML bridge auto-detects your CPU and selects the same path numpy uses: AVX‑512 SVML (`__svml_exp8`) when available, or scalar `npy_exp`/`npy_log`/etc. otherwise. AVX‑512 intrinsics are isolated behind `__attribute__((target))` — the binary is safe on any x86_64 CPU (no SIGILL). Every transcendental function produces the exact same IEEE 754 bits as numpy on **all architectures**.
 
@@ -89,12 +89,12 @@ Add `-Ipath/to/numpycpp` to your compiler flags and include the headers directly
 ### Testing
 
 The test suite verifies **bit-level precision alignment** between every C++ function and Python numpy.
-No tolerance, no `atol`/`rtol` — raw IEEE 754 bits must match exactly. 476 tests, float64 + float32.
+No tolerance, no `atol`/`rtol` — raw IEEE 754 bits must match exactly. 500 tests, float64 + float32.
 
 ```bash
 cd tests
 make                    # compile C++ test module
-make test               # run all 476 tests (silent mode: only failures print)
+make test               # run all 500 tests (silent mode: only failures print)
 ```
 
 To run with verbose output:
@@ -118,7 +118,9 @@ CXXFLAGS ?= -std=c++17 -O2 -fPIC -fopenmp                \
             -fno-builtin-sqrt   -fno-builtin-atan2         \
             -fno-builtin-log2   -fno-builtin-log10         \
             -fno-builtin-asin   -fno-builtin-acos          \
-            -fno-builtin-atan   -fno-builtin-exp2
+            -fno-builtin-atan   -fno-builtin-exp2         \
+            -fno-builtin-cbrt   -fno-builtin-expm1      \
+            -fno-builtin-log1p
 LDFLAGS   = -shared -ldl
 ```
 
@@ -142,7 +144,7 @@ LDFLAGS   = -shared -ldl
 ### Alignment status
 
 The table below reflects the current bit-level parity between `numpycpp` C++ and Python numpy.
-All 476 tests pass under strict IEEE 754 bit comparison (float64 + float32).
+All 500 tests pass under strict IEEE 754 bit comparison (float64 + float32).
 
 ✅ = bit-exact on ALL architectures (SVML bridge with runtime CPU dispatch).
 
@@ -158,7 +160,7 @@ All 476 tests pass under strict IEEE 754 bit comparison (float64 + float32).
 | Setops / interp   | ✅ | ✅ | isin, intersect1d, interp, safe_divide |
 | Access / convert  | ✅ | ✅ | array_get, asarray, to_vector |
 | **Math — element-wise** (sqrt, abs, sign, clip, round, floor, ceil, degrees, radians) | ✅ | ✅ | Pure C++, no libm dependency |
-| **Math — transcendental** (exp, log, sin, cos, tan, asin, acos, atan, log10, log2, exp2) | ✅ | ✅ | npy_* scalar functions via dlsym, bit-exact on all archs |
+| **Math — transcendental** (exp, log, sin, cos, tan, asin, acos, atan, log10, log2, exp2, cbrt, expm1, log1p) | ✅ | ✅ | dlsym npy_* or SVML via bridge, bit-exact on all archs |
 | **Math — power**   | ✅ | ✅ | npy_pow / npy_powf via SVML bridge |
 | **Math — hypot**   | ✅ | ✅ | std::hypot — bit-exact (numpy matches libm) |
 | **Math — atan2**   | ✅ | ✅ | npy_atan2 / npy_atan2f via SVML bridge |
@@ -190,7 +192,7 @@ numpycpp/
 │   └── einsum_py.h
 ├── tests/              # bit-level precision tests + test module
 │   ├── module.cpp      # pybind11 module for testing
-│   ├── test_all.py     # single entry — all APIs, 476 tests, float64+float32
+│   ├── test_all.py     # single entry — all APIs, 500 tests, float64+float32
 │   ├── conftest.py     # silent-mode output suppression
 │   └── Makefile
 ├── CMakeLists.txt      # build & .deb packaging
diff --git a/numpy/core.h b/numpy/core.h
@@ -113,6 +113,24 @@ inline void tan(const T* src, T* dst, size_t n) {
     NUMPY_UNROLL4(i, dst[i] = detail::tan(src[i]));
 }
 
+/// numpy.cbrt(x, /, out=None, *, where=True, ...)
+template<typename T>
+inline void cbrt(const T* src, T* dst, size_t n) {
+    NUMPY_UNROLL4(i, dst[i] = detail::cbrt(src[i]));
+}
+
+/// numpy.expm1(x, /, out=None, *, where=True, ...)
+template<typename T>
+inline void expm1(const T* src, T* dst, size_t n) {
+    NUMPY_UNROLL4(i, dst[i] = detail::expm1(src[i]));
+}
+
+/// numpy.log1p(x, /, out=None, *, where=True, ...)
+template<typename T>
+inline void log1p(const T* src, T* dst, size_t n) {
+    NUMPY_UNROLL4(i, dst[i] = detail::log1p(src[i]));
+}
+
 /// numpy.power(x1, x2, /, out=None, *, where=True, ...)
 template<typename T>
 inline void power(const T* src, T* dst, size_t n, T exponent) {
diff --git a/numpy/svml_bridge.h b/numpy/svml_bridge.h
@@ -125,6 +125,9 @@ NUMPY_SVML_F64(atan,  "__svml_atan8",  "npy_atan")
 NUMPY_SVML_F64(log10, "__svml_log108", "npy_log10")
 NUMPY_SVML_F64(log2,  "__svml_log28",  "npy_log2")
 NUMPY_SVML_F64(exp2,  "__svml_exp28",  "npy_exp2")
+NUMPY_SVML_F64(cbrt,  "__svml_cbrt8",  "npy_cbrt")
+NUMPY_SVML_F64(expm1, "__svml_expm18", "npy_expm1")
+NUMPY_SVML_F64(log1p, "__svml_log1p8", "npy_log1p")
 
 NUMPY_SVML_F32(tan,   "__svml_tanf16",  "npy_tanf")
 NUMPY_SVML_F32(asin,  "__svml_asinf16", "npy_asinf")
@@ -133,6 +136,9 @@ NUMPY_SVML_F32(atan,  "__svml_atanf16", "npy_atanf")
 NUMPY_SVML_F32(log10, "__svml_log10f16","npy_log10f")
 NUMPY_SVML_F32(log2,  "__svml_log2f16", "npy_log2f")
 NUMPY_SVML_F32(exp2,  "__svml_exp2f16", "npy_exp2f")
+NUMPY_SVML_F32(cbrt,  "__svml_cbrtf16", "npy_cbrtf")
+NUMPY_SVML_F32(expm1, "__svml_expm1f16","npy_expm1f")
+NUMPY_SVML_F32(log1p, "__svml_log1pf16","npy_log1pf")
 
 // pow / atan2 — SVML 2-arg
 __attribute__((target("avx512f")))
@@ -195,6 +201,9 @@ NUMPY_NPY_F64(atan,  std::atan(x))
 NUMPY_NPY_F64(log10, std::log10(x))
 NUMPY_NPY_F64(log2,  std::log2(x))
 NUMPY_NPY_F64(exp2,  std::exp2(x))
+NUMPY_NPY_F64(cbrt,  std::cbrt(x))
+NUMPY_NPY_F64(expm1, std::expm1(x))
+NUMPY_NPY_F64(log1p, std::log1p(x))
 
 // f32: fallback via numpy's own polynomial approximations
 // f32 exp/log/sin/cos: numpy's own polynomial approximations (npy_math_float.h)
@@ -211,6 +220,9 @@ NUMPY_NPY_F32(atan,  std::atan(x))
 NUMPY_NPY_F32(log10, std::log10(x))
 NUMPY_NPY_F32(log2,  std::log2(x))
 NUMPY_NPY_F32(exp2,  std::exp2(x))
+NUMPY_NPY_F32(cbrt,  std::cbrt(x))
+NUMPY_NPY_F32(expm1, std::expm1(x))
+NUMPY_NPY_F32(log1p, std::log1p(x))
 
 // hypot — numpy matches libm bit-exact for both f32 and f64
 inline double hypot_f64(double x, double y) { return std::hypot(x, y); }
@@ -271,13 +283,19 @@ DISPATCH_F64(atan)
 DISPATCH_F64(log10)
 DISPATCH_F64(log2)
 DISPATCH_F64(exp2)
+DISPATCH_F64(cbrt)
+DISPATCH_F64(expm1)
+DISPATCH_F64(log1p)
 DISPATCH_F32(tan)
 DISPATCH_F32(asin)
 DISPATCH_F32(acos)
 DISPATCH_F32(atan)
 DISPATCH_F32(log10)
 DISPATCH_F32(log2)
 DISPATCH_F32(exp2)
+DISPATCH_F32(cbrt)
+DISPATCH_F32(expm1)
+DISPATCH_F32(log1p)
 
 // f32 exp/log/sin/cos: numpy uses its own polynomial approximations
 // (npy_math_float.h), NOT SVML. These are bit-exact on all architectures.
@@ -328,6 +346,9 @@ template<> struct svml_impl<T> {                                     \
     static T log10(T x){ return log10_##suff(x); }                   \
     static T log2(T x) { return log2_##suff(x); }                    \
     static T exp2(T x) { return exp2_##suff(x); }                    \
+    static T cbrt(T x) { return cbrt_##suff(x); }                    \
+    static T expm1(T x){ return expm1_##suff(x); }                   \
+    static T log1p(T x){ return log1p_##suff(x); }                   \
     static T sqrt(T x) { return sqrt_##suff(x); }                    \
     static T pow(T x, T e)    { return pow_##suff(x, e); }           \
     static T atan2(T y, T x)  { return atan2_##suff(y, x); }         \
@@ -353,6 +374,9 @@ NUMPY_SVML_D1(atan)
 NUMPY_SVML_D1(log10)
 NUMPY_SVML_D1(log2)
 NUMPY_SVML_D1(exp2)
+NUMPY_SVML_D1(cbrt)
+NUMPY_SVML_D1(expm1)
+NUMPY_SVML_D1(log1p)
 NUMPY_SVML_D1(sqrt)
 #undef NUMPY_SVML_D1
 
diff --git a/pycpp/core_py.h b/pycpp/core_py.h
@@ -282,6 +282,12 @@ DEF_ELEMWISE(sin)
 DEF_ELEMWISE(cos)
 /// numpy.tan(x, /, out=None, *, where=True, ...)
 DEF_ELEMWISE(tan)
+/// numpy.cbrt(x, /, out=None, *, where=True, ...)
+DEF_ELEMWISE(cbrt)
+/// numpy.expm1(x, /, out=None, *, where=True, ...)
+DEF_ELEMWISE(expm1)
+/// numpy.log1p(x, /, out=None, *, where=True, ...)
+DEF_ELEMWISE(log1p)
 /// numpy.log10(x, /, out=None, *, where=True, ...)
 DEF_ELEMWISE(log10)
 /// numpy.log2(x, /, out=None, *, where=True, ...)
diff --git a/tests/Makefile b/tests/Makefile
@@ -10,7 +10,8 @@ CXXFLAGS ?= -std=c++17 -O2 -fPIC -fopenmp -ffp-contract=off \
 	-fno-builtin-cos -fno-builtin-tan -fno-builtin-pow \
 	-fno-builtin-sqrt -fno-builtin-atan2 -fno-builtin-log2 \
 	-fno-builtin-log10 -fno-builtin-asin -fno-builtin-acos \
-	-fno-builtin-atan -fno-builtin-exp2
+	-fno-builtin-atan -fno-builtin-exp2 \
+	-fno-builtin-cbrt -fno-builtin-expm1 -fno-builtin-log1p
 INCLUDES  = -I.. -I../pycpp $(shell python3 -m pybind11 --includes) $(shell pkg-config --cflags eigen3 2>/dev/null || echo)
 LDFLAGS   = -shared -ldl
 
diff --git a/tests/module.cpp b/tests/module.cpp
@@ -76,6 +76,7 @@ PYBIND11_MODULE(numpycpp, m) {
     // -- Element-wise math -------------------------------------------------
     BIND_F1(sqrt); BIND_F1(abs); BIND_F1(exp); BIND_F1(log);
     BIND_F1(sin); BIND_F1(cos); BIND_F1(tan);
+    BIND_F1(cbrt); BIND_F1(expm1); BIND_F1(log1p);
     BIND_F1(log10); BIND_F1(log2); BIND_F1(arcsin); BIND_F1(arccos); BIND_F1(arctan);
     BIND_F1(round); BIND_F1(floor); BIND_F1(ceil);
     BIND_F1(degrees); BIND_F1(radians); BIND_F1(sign);
diff --git a/tests/test_all.py b/tests/test_all.py
@@ -155,6 +155,9 @@ def dtype(request):
     ("sin",        np.sin,        None,                               [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),
     ("cos",        np.cos,        None,                               [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),
     ("tan",        np.tan,        lambda a: a * 0.5,                  [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),
+    ("cbrt",       np.cbrt,       None,                               [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),
+    ("expm1",      np.expm1,      lambda a: a * 2.0,                  [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),
+    ("log1p",      np.log1p,      lambda a: np.abs(a) + 0.1,          [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),
     ("log10",      np.log10,      lambda a: np.abs(a) + 0.1,          [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),
     ("log2",       np.log2,       lambda a: np.abs(a) + 0.1,          [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),
     ("arcsin",     np.arcsin,     lambda a: np.clip(a * 0.5, -1, 1),  [(100, 42), (1000, 7), (10000, 7), (100000, 7)]),