feat: add np.cumsum, np.squeeze, dtype-flexible creation templates

peng.li24 · peng.li24 · commit 8745fdac8b8b · 2026-06-03T00:28:50.000+08:00
- Add cumsum to core.h (native 1D cumulative sum)
- Add cumsum, squeeze pycpp wrappers to core_py.h
- Add zeros_t<T>/ones_t<T>/full_t<T> template creation wrappers
  for dtype flexibility (pybind11 modules can bind e.g. zeros_f32)
- Wire bindings in module.cpp
- Add tests: test_cumsum (3 cases), test_squeeze (3 cases)
- Test count: 466 → 468
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -8,7 +8,7 @@ on:
     branches: [master]
 
 jobs:
-  # ---- Test: build module + run 460 precision tests --------------------------
+  # ---- Test: build module + run 468 precision tests --------------------------
   test:
     runs-on: ubuntu-22.04
     steps:
diff --git a/README.md b/README.md
@@ -15,7 +15,7 @@ We created `numpycpp` to keep NumPy's familiar usage patterns while letting C++
 
 `numpycpp` is a **header-only C++ library** implementing numpy's core API (`numpy.*`, `numpy.linalg.*`, `numpy.einsum`) with **bit-level precision alignment**. Raw pointer + size interface. Zero external dependencies — pure C++17 standard library.
 
-All APIs are tested against Python numpy under strict bit-level comparison: every IEEE 754 float bit must match exactly (460 tests, float64 + float32).
+All APIs are tested against Python numpy under strict bit-level comparison: every IEEE 754 float bit must match exactly (468 tests, float64 + float32).
 
 **Bit-exact math** is achieved by resolving numpy's own math functions from `_multiarray_umath.so` at runtime. The SVML bridge auto-detects your CPU and selects the same path numpy uses: AVX‑512 SVML (`__svml_exp8`) when available, or scalar `npy_exp`/`npy_log`/etc. otherwise. AVX‑512 intrinsics are isolated behind `__attribute__((target))` — the binary is safe on any x86_64 CPU (no SIGILL). Every transcendental function produces the exact same IEEE 754 bits as numpy on **all architectures**.
 
@@ -89,12 +89,12 @@ Add `-Ipath/to/numpycpp` to your compiler flags and include the headers directly
 ### Testing
 
 The test suite verifies **bit-level precision alignment** between every C++ function and Python numpy.
-No tolerance, no `atol`/`rtol` — raw IEEE 754 bits must match exactly. 460 tests, float64 + float32.
+No tolerance, no `atol`/`rtol` — raw IEEE 754 bits must match exactly. 468 tests, float64 + float32.
 
 ```bash
 cd tests
 make                    # compile C++ test module
-make test               # run all 460 tests (silent mode: only failures print)
+make test               # run all 468 tests (silent mode: only failures print)
 ```
 
 To run with verbose output:
@@ -142,7 +142,7 @@ LDFLAGS   = -shared -ldl
 ### Alignment status
 
 The table below reflects the current bit-level parity between `numpycpp` C++ and Python numpy.
-All 460 tests pass under strict IEEE 754 bit comparison (float64 + float32).
+All 468 tests pass under strict IEEE 754 bit comparison (float64 + float32).
 
 ✅ = bit-exact on ALL architectures (SVML bridge with runtime CPU dispatch).
 
@@ -189,7 +189,7 @@ numpycpp/
 │   └── einsum_py.h
 ├── tests/              # bit-level precision tests + test module
 │   ├── module.cpp      # pybind11 module for testing
-│   ├── test_all.py     # single entry — all APIs, 460 tests, float64+float32
+│   ├── test_all.py     # single entry — all APIs, 468 tests, float64+float32
 │   ├── conftest.py     # silent-mode output suppression
 │   └── Makefile
 ├── CMakeLists.txt      # build & .deb packaging
diff --git a/numpy/core.h b/numpy/core.h
@@ -738,6 +738,17 @@ inline void unwrap(const T* src, T* dst, size_t n, T discont = T(M_PI)) {
     }
 }
 
+/// numpy.cumsum(a, axis=None, dtype=None, out=None)
+/// Running total over a flat buffer of n elements: dst[i] = src[0] + ... + src[i], added left to right.
+template<typename T>
+inline void cumsum(const T* src, T* dst, size_t n) {
+    if (n == 0) return;  // nothing to read or write
+    T running = src[0];
+    dst[0] = running;
+    for (size_t i = 1; i < n; ++i)
+        dst[i] = running = running + src[i];  // src[i] is read before dst[i] is stored
+}
+
 // ============================================================================
 // astype conversions
 // ============================================================================
diff --git a/pycpp/core_py.h b/pycpp/core_py.h
@@ -54,6 +54,7 @@ py::array_t<T> empty_like(const py::array_t<T>& arr) {
 }
 
 /// numpy.zeros(shape, dtype=float, order='C', *, like=None)
+/// NOTE: always returns a double (float64) array for convenience. For other dtypes, use zeros_t<T>(shape).
 inline py::array_t<double> zeros(const std::vector<py::ssize_t>& shape) {
     py::array_t<double> result(shape);
     zeros_like(static_cast<double*>(result.request().ptr), result.request().size);
@@ -74,6 +75,29 @@ inline py::array_t<double> full(const std::vector<py::ssize_t>& shape, double fi
     return result;
 }
 
+// Template counterparts — dtype-flexible creation for pybind11 modules
+// that need float32 or other dtypes. Bind as e.g. m.def("zeros_f32", &numpy::zeros_t<float>);
+template<typename T>
+py::array_t<T> zeros_t(const std::vector<py::ssize_t>& shape) {
+    py::array_t<T> out(shape);                    // newly allocated array of element type T
+    zeros_like(out.mutable_data(), out.size());   // fill all out.size() elements through the raw pointer
+    return out;
+}
+
+template<typename T>
+py::array_t<T> ones_t(const std::vector<py::ssize_t>& shape) {
+    py::array_t<T> out(shape);                   // newly allocated array of element type T
+    ones_like(out.mutable_data(), out.size());   // fill all out.size() elements through the raw pointer
+    return out;
+}
+
+template<typename T>
+py::array_t<T> full_t(const std::vector<py::ssize_t>& shape, T fill_value) {
+    py::array_t<T> out(shape);                                  // newly allocated array of element type T
+    numpy::full(out.mutable_data(), out.size(), fill_value);    // qualified: selects the raw-pointer overload
+    return out;
+}
+
 // Bool specializations
 // NOTE: _bool suffix — dtype-specific wrappers; pybind11 cannot deduce template
 // argument from a Python dtype keyword, so each dtype needs its own binding.
@@ -913,6 +937,27 @@ inline py::array_t<double> unwrap(const py::array_t<double>& arr, double discont
     return result;
 }
 
+/// numpy.cumsum(a, axis=None) — cumulative sum over the flattened input; the result is always 1-D
+inline py::array_t<double> cumsum(const py::array_t<double>& arr) {
+    // axis=None flattens in C order: get a C-contiguous view (copy only if strided) before reading raw memory.
+    py::array_t<double, py::array::c_style | py::array::forcecast> flat(arr);
+    py::array_t<double> result(flat.size());  // 1-D output of arr.size elements, as numpy returns
+    numpy::cumsum(flat.data(), result.mutable_data(), static_cast<size_t>(flat.size()));
+    return result;
+}
+
+/// numpy.squeeze(a, axis=None) — drop every length-1 axis (all-ones shape gives a 0-d array); returns a copy
+inline py::array_t<double> squeeze(const py::array_t<double>& arr) {
+    // Normalise to C order first: the flat memcpy below is only valid for contiguous storage.
+    py::array_t<double, py::array::c_style | py::array::forcecast> src(arr);
+    std::vector<py::ssize_t> new_shape;
+    for (py::ssize_t d = 0; d < src.ndim(); ++d)
+        if (src.shape(d) != 1) new_shape.push_back(src.shape(d));
+    py::array_t<double> result(new_shape);  // an empty shape yields a 0-d array, matching numpy
+    std::memcpy(result.mutable_data(), src.data(), static_cast<size_t>(src.size()) * sizeof(double));
+    return result;
+}
+
 /// numpy.intersect1d(ar1, ar2, assume_unique=False, return_indices=False)
 inline py::array_t<double> intersect1d(const py::array_t<double>& a, const py::array_t<double>& b) {
     auto ba = a.request(), bb = b.request();
diff --git a/tests/module.cpp b/tests/module.cpp
@@ -218,6 +218,8 @@ PYBIND11_MODULE(numpycpp, m) {
     m.def("intersect1d", static_cast<py::array_t<double>(*)(const py::array_t<double>&, const py::array_t<double>&)>(&numpy::intersect1d));
 	m.def("flatnonzero", static_cast<py::array_t<py::ssize_t>(*)(const py::array_t<double>&)>(&numpy::flatnonzero));
 	m.def("unwrap", static_cast<py::array_t<double>(*)(const py::array_t<double>&, double)>(&numpy::unwrap), py::arg("arr"), py::arg("discont") = M_PI);
+	m.def("cumsum", static_cast<py::array_t<double>(*)(const py::array_t<double>&)>(&numpy::cumsum));
+	m.def("squeeze", static_cast<py::array_t<double>(*)(const py::array_t<double>&)>(&numpy::squeeze));
 
     // -- Interpolation -----------------------------------------------------
     m.def("interp", static_cast<py::array_t<double>(*)(const py::array_t<double>&, const py::array_t<double>&, const py::array_t<double>&)>(&numpy::interp));
diff --git a/tests/test_all.py b/tests/test_all.py
@@ -710,6 +710,22 @@ def test_unwrap(cpp):
     a2 = np.array([0.0, 2.5, 5.0, -2.5, -5.0]) * np.pi
     assert_bit_aligned(cpp.unwrap(a2), np.unwrap(a2), "unwrap_large")
 
+def test_cumsum(cpp):
+    """cpp.cumsum must match np.cumsum bit-for-bit on whole, fractional and mixed-sign float64 data."""
+    for label, values in (("cumsum", [1.0, 2.0, 3.0, 4.0, 5.0]),
+                          ("cumsum_frac", [0.1, 0.2, 0.3, 0.4, 0.5]),
+                          ("cumsum_neg", [-1.0, 2.0, -3.0, 4.0])):
+        arr = np.array(values)
+        assert_bit_aligned(cpp.cumsum(arr), np.cumsum(arr), label)
+
+def test_squeeze(cpp):
+    """cpp.squeeze must agree with np.squeeze for a column, a row and interleaved unit axes."""
+    for label, values, shape in (("squeeze_col", [1.0, 2.0, 3.0], (3, 1)),
+                                 ("squeeze_row", [1.0, 2.0, 3.0], (1, 3)),
+                                 ("squeeze_multi", [1.0, 2.0, 3.0, 4.0], (1, 2, 1, 2, 1))):
+        arr = np.array(values).reshape(shape)
+        assert_bit_aligned(cpp.squeeze(arr), np.squeeze(arr), label)
+
 def test_intersect1d(cpp):
     a, b = np.array([1.0, 2.0, 3.0, 4.0]), np.array([3.0, 4.0, 5.0, 6.0])
     cpp_r = np.sort(np.asarray(cpp.intersect1d(a, b)))