Skip to content

Commit c984454

Browse files
author
peng.li24
committed
refactor: unroll comparisons/logical/is*/where/flip/astype; extract axis_reduce_impl fiber skeleton (-63 lines in core.h); update .gitignore
1 parent 9b1a2dc commit c984454

2 files changed

Lines changed: 66 additions & 113 deletions

File tree

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,3 +39,11 @@ compile_commands.json
3939
# Packaging
4040
*.deb
4141
*.rpm
42+
43+
# CMake packaging artifacts (in-source builds)
44+
CPackConfig.cmake
45+
CPackSourceConfig.cmake
46+
_CPack_Packages/
47+
install_manifest.txt
48+
numpycpp-config-version.cmake
49+
numpycpp-config.cmake

numpy/core.h

Lines changed: 58 additions & 113 deletions
Original file line numberDiff line numberDiff line change
@@ -333,43 +333,37 @@ inline T var(const T* data, size_t n) {
333333
/// numpy.greater(x1, x2, /, out=None, *, where=True, ...)
334334
template<typename T>
335335
inline void greater(const T* src, bool* dst, size_t n, T threshold) {
336-
for (size_t i = 0; i < n; ++i) dst[i] = (src[i] > threshold);
336+
NUMPY_UNROLL4(i, dst[i] = (src[i] > threshold));
337337
}
338-
339338
/// numpy.less(x1, x2, /, out=None, *, where=True, ...)
340339
template<typename T>
341340
inline void less(const T* src, bool* dst, size_t n, T threshold) {
342-
for (size_t i = 0; i < n; ++i) dst[i] = (src[i] < threshold);
341+
NUMPY_UNROLL4(i, dst[i] = (src[i] < threshold));
343342
}
344-
345343
/// numpy.equal(x1, x2, /, out=None, *, where=True, ...)
346344
template<typename T>
347345
inline void equal(const T* src, bool* dst, size_t n, T val) {
348-
for (size_t i = 0; i < n; ++i) dst[i] = (src[i] == val);
346+
NUMPY_UNROLL4(i, dst[i] = (src[i] == val));
349347
}
350-
351348
/// numpy.greater_equal(x1, x2, /, out=None, *, where=True, ...)
352349
template<typename T>
353350
inline void greater_equal(const T* src, bool* dst, size_t n, T threshold) {
354-
for (size_t i = 0; i < n; ++i) dst[i] = (src[i] >= threshold);
351+
NUMPY_UNROLL4(i, dst[i] = (src[i] >= threshold));
355352
}
356-
357353
/// numpy.less_equal(x1, x2, /, out=None, *, where=True, ...)
358354
template<typename T>
359355
inline void less_equal(const T* src, bool* dst, size_t n, T threshold) {
360-
for (size_t i = 0; i < n; ++i) dst[i] = (src[i] <= threshold);
356+
NUMPY_UNROLL4(i, dst[i] = (src[i] <= threshold));
361357
}
362-
363358
/// numpy.not_equal(x1, x2, /, out=None, *, where=True, ...) — scalar variant
364359
template<typename T>
365360
inline void not_equal_scalar(const T* src, bool* dst, size_t n, T val) {
366-
for (size_t i = 0; i < n; ++i) dst[i] = (src[i] != val);
361+
NUMPY_UNROLL4(i, dst[i] = (src[i] != val));
367362
}
368-
369363
/// numpy.not_equal(x1, x2, /, out=None, *, where=True, ...) — array variant
370364
template<typename T>
371365
inline void not_equal_array(const T* a, const T* b, bool* dst, size_t n) {
372-
for (size_t i = 0; i < n; ++i) dst[i] = (a[i] != b[i]);
366+
NUMPY_UNROLL4(i, dst[i] = (a[i] != b[i]));
373367
}
374368

375369
// ============================================================================
@@ -378,22 +372,19 @@ inline void not_equal_array(const T* a, const T* b, bool* dst, size_t n) {
378372

379373
/// numpy.logical_and(x1, x2, /, out=None, *, where=True, ...)
380374
inline void logical_and(const bool* a, const bool* b, bool* dst, size_t n) {
381-
for (size_t i = 0; i < n; ++i) dst[i] = a[i] && b[i];
375+
NUMPY_UNROLL4(i, dst[i] = a[i] && b[i]);
382376
}
383-
384377
/// numpy.logical_or(x1, x2, /, out=None, *, where=True, ...)
385378
inline void logical_or(const bool* a, const bool* b, bool* dst, size_t n) {
386-
for (size_t i = 0; i < n; ++i) dst[i] = a[i] || b[i];
379+
NUMPY_UNROLL4(i, dst[i] = a[i] || b[i]);
387380
}
388-
389381
/// numpy.logical_not(x, /, out=None, *, where=True, ...)
390382
inline void logical_not(const bool* src, bool* dst, size_t n) {
391-
for (size_t i = 0; i < n; ++i) dst[i] = !src[i];
383+
NUMPY_UNROLL4(i, dst[i] = !src[i]);
392384
}
393-
394385
/// numpy.logical_xor(x1, x2, /, out=None, *, where=True, ...)
395386
inline void logical_xor(const bool* a, const bool* b, bool* dst, size_t n) {
396-
for (size_t i = 0; i < n; ++i) dst[i] = a[i] ^ b[i];
387+
NUMPY_UNROLL4(i, dst[i] = a[i] ^ b[i]);
397388
}
398389

399390
// ============================================================================
@@ -403,19 +394,17 @@ inline void logical_xor(const bool* a, const bool* b, bool* dst, size_t n) {
403394
/// numpy.isnan(x, /, out=None, *, where=True, ...)
404395
template<typename T>
405396
inline void isnan(const T* src, bool* dst, size_t n) {
406-
for (size_t i = 0; i < n; ++i) dst[i] = std::isnan(src[i]);
397+
NUMPY_UNROLL4(i, dst[i] = std::isnan(src[i]));
407398
}
408-
409399
/// numpy.isinf(x, /, out=None, *, where=True, ...)
410400
template<typename T>
411401
inline void isinf(const T* src, bool* dst, size_t n) {
412-
for (size_t i = 0; i < n; ++i) dst[i] = std::isinf(src[i]);
402+
NUMPY_UNROLL4(i, dst[i] = std::isinf(src[i]));
413403
}
414-
415404
/// numpy.isfinite(x, /, out=None, *, where=True, ...)
416405
template<typename T>
417406
inline void isfinite(const T* src, bool* dst, size_t n) {
418-
for (size_t i = 0; i < n; ++i) dst[i] = std::isfinite(src[i]);
407+
NUMPY_UNROLL4(i, dst[i] = std::isfinite(src[i]));
419408
}
420409

421410
// ============================================================================
@@ -425,37 +414,32 @@ inline void isfinite(const T* src, bool* dst, size_t n) {
425414
/// numpy.arctan2(x1, x2, /, out=None, *, where=True, ...) — array-array
426415
template<typename T>
427416
inline void arctan2_array(const T* a, const T* b, T* dst, size_t n) {
428-
for (size_t i = 0; i < n; ++i) dst[i] = svml::atan2(a[i], b[i]);
417+
NUMPY_UNROLL4(i, dst[i] = svml::atan2(a[i], b[i]));
429418
}
430-
431419
/// numpy.arctan2(x1, x2, /, out=None, *, where=True, ...) — array-scalar
432420
template<typename T>
433421
inline void arctan2_scalar(const T* src, T* dst, size_t n, T b) {
434-
for (size_t i = 0; i < n; ++i) dst[i] = svml::atan2(src[i], b);
422+
NUMPY_UNROLL4(i, dst[i] = svml::atan2(src[i], b));
435423
}
436-
437424
/// numpy.maximum(x1, x2, /, out=None, *, where=True, ...) — array-array
438425
template<typename T>
439426
inline void maximum_array(const T* a, const T* b, T* dst, size_t n) {
440-
for (size_t i = 0; i < n; ++i) dst[i] = std::max(a[i], b[i]);
427+
NUMPY_UNROLL4(i, dst[i] = std::max(a[i], b[i]));
441428
}
442-
443429
/// numpy.maximum(x1, x2, /, out=None, *, where=True, ...) — scalar variant
444430
template<typename T>
445431
inline void maximum_scalar(const T* src, T* dst, size_t n, T b) {
446-
for (size_t i = 0; i < n; ++i) dst[i] = std::max(src[i], b);
432+
NUMPY_UNROLL4(i, dst[i] = std::max(src[i], b));
447433
}
448-
449434
/// numpy.minimum(x1, x2, /, out=None, *, where=True, ...) — array-array
450435
template<typename T>
451436
inline void minimum_array(const T* a, const T* b, T* dst, size_t n) {
452-
for (size_t i = 0; i < n; ++i) dst[i] = std::min(a[i], b[i]);
437+
NUMPY_UNROLL4(i, dst[i] = std::min(a[i], b[i]));
453438
}
454-
455439
/// numpy.minimum(x1, x2, /, out=None, *, where=True, ...) — scalar variant
456440
template<typename T>
457441
inline void minimum_scalar(const T* src, T* dst, size_t n, T b) {
458-
for (size_t i = 0; i < n; ++i) dst[i] = std::min(src[i], b);
442+
NUMPY_UNROLL4(i, dst[i] = std::min(src[i], b));
459443
}
460444

461445
// ============================================================================
@@ -536,13 +520,12 @@ inline void concatenate(const T* const* arrays, T* dst, const size_t* sizes, siz
536520
/// numpy.where(condition, x, y) — scalar x, y
537521
template<typename T>
538522
inline void where_scalar(const bool* cond, T* dst, size_t n, T x, T y) {
539-
for (size_t i = 0; i < n; ++i) dst[i] = cond[i] ? x : y;
523+
NUMPY_UNROLL4(i, dst[i] = cond[i] ? x : y);
540524
}
541-
542525
/// numpy.where(condition, x, y) — array x, y
543526
template<typename T>
544527
inline void where_array(const bool* cond, T* dst, size_t n, const T* x, const T* y) {
545-
for (size_t i = 0; i < n; ++i) dst[i] = cond[i] ? x[i] : y[i];
528+
NUMPY_UNROLL4(i, dst[i] = cond[i] ? x[i] : y[i]);
546529
}
547530

548531
/// numpy.transpose(a, axes=None) — 2D only
@@ -604,15 +587,13 @@ inline void roll(const T* src, T* dst, size_t n, ptrdiff_t shift) {
604587
shift = shift % static_cast<ptrdiff_t>(n);
605588
if (shift < 0) shift += static_cast<ptrdiff_t>(n);
606589
size_t s = static_cast<size_t>(shift);
607-
for (size_t i = 0; i < n; ++i)
608-
dst[(i + s) % n] = src[i];
590+
NUMPY_UNROLL4(i, dst[(i + s) % n] = src[i]);
609591
}
610592

611593
/// numpy.flip(m, axis=None)
612594
template<typename T>
613595
inline void flip(const T* src, T* dst, size_t n) {
614-
for (size_t i = 0; i < n; ++i)
615-
dst[i] = src[n - 1 - i];
596+
NUMPY_UNROLL4(i, dst[i] = src[n - 1 - i]);
616597
}
617598

618599
/// numpy.repeat(a, repeats, axis=None)
@@ -733,72 +714,74 @@ inline double safe_divide(double a, double b, double default_val = 0.0) {
733714
/// ndarray.astype(dtype, order='K', casting='unsafe', subok=True, copy=True)
734715
template<typename Tout, typename Tin>
735716
inline void astype(const Tin* src, Tout* dst, size_t n) {
736-
for (size_t i = 0; i < n; ++i) dst[i] = static_cast<Tout>(src[i]);
717+
NUMPY_UNROLL4(i, dst[i] = static_cast<Tout>(src[i]));
737718
}
738719

739720
/// float64 → float32 → float64 roundtrip (for precision testing)
740721
inline void truncate_to_float32(const double* src, double* dst, size_t n) {
741-
for (size_t i = 0; i < n; ++i) {
742-
float tmp = static_cast<float>(src[i]);
743-
dst[i] = static_cast<double>(tmp);
744-
}
722+
NUMPY_UNROLL4(i, { float tmp = static_cast<float>(src[i]);
723+
dst[i] = static_cast<double>(tmp); });
745724
}
746725

747726
// ============================================================================
748-
// mean_axis: T in → double out (matching numpy dtype promotion)
727+
// Axis reductions — shared fiber-skeleton for mean_axis, norm_axis, etc.
749728
// ============================================================================
750729

751-
/// ndarray.mean(axis=N) — N-D, T in → T out (matches numpy: preserves input dtype)
752-
template<typename T>
753-
inline void mean_axis(const T* src, T* dst, const ptrdiff_t* shape, int ndim, int axis) {
730+
/// Shared axis-reduction skeleton: stride setup, fiber loop, coordinate decomp.
731+
/// Calls op(in_base, out_base, axis_stride, axis_size, buf) per fiber.
732+
template<typename T, typename F>
733+
inline void axis_reduce_impl(const T* src, T* dst, const ptrdiff_t* shape, int ndim,
734+
int axis, F&& op) {
754735
if (ndim == 0) return;
755736
if (axis < 0) axis += ndim;
756737
ptrdiff_t axis_size = shape[axis];
757738
if (axis_size == 0) return;
758739

759-
// Strides for C-contiguous layout
760740
std::vector<ptrdiff_t> stride(ndim);
761741
stride[ndim - 1] = 1;
762742
for (int d = ndim - 2; d >= 0; --d)
763743
stride[d] = stride[d + 1] * shape[d + 1];
764744

765745
ptrdiff_t axis_stride = stride[axis];
766746

767-
// Number of fibers = product of all non-axis dimensions
768747
ptrdiff_t n_fibers = 1;
769748
for (int d = 0; d < ndim; ++d)
770749
if (d != axis) n_fibers *= shape[d];
771750

772-
// Output stride for the flattened 1D dst
773751
std::vector<ptrdiff_t> out_shape(shape, shape + ndim);
774-
out_shape[axis] = 1; // reduced dimension
752+
out_shape[axis] = 1;
775753
std::vector<ptrdiff_t> out_stride(ndim);
776754
out_stride[ndim - 1] = 1;
777755
for (int d = ndim - 2; d >= 0; --d)
778756
out_stride[d] = out_stride[d + 1] * out_shape[d + 1];
779757

780-
// Temporary buffer for pairwise sum along fiber
781-
std::vector<T> fiber_buf(static_cast<size_t>(axis_size));
758+
std::vector<T> buf(static_cast<size_t>(axis_size));
782759

783760
for (ptrdiff_t f = 0; f < n_fibers; ++f) {
784-
ptrdiff_t rem = f;
785-
ptrdiff_t in_base = 0, out_base = 0;
761+
ptrdiff_t rem = f, in_base = 0, out_base = 0;
786762
for (int d = ndim - 1; d >= 0; --d) {
787763
if (d == axis) continue;
788764
ptrdiff_t idx = rem % shape[d];
789765
rem /= shape[d];
790766
in_base += idx * stride[d];
791767
out_base += idx * out_stride[d];
792768
}
793-
794-
for (ptrdiff_t i = 0; i < axis_size; ++i)
795-
fiber_buf[static_cast<size_t>(i)] = src[in_base + i * axis_stride];
796-
797-
T sum = pairwise_sum(fiber_buf.data(), static_cast<size_t>(axis_size));
798-
dst[out_base] = sum / static_cast<T>(axis_size);
769+
op(in_base, out_base, axis_stride, axis_size, buf.data());
799770
}
800771
}
801772

773+
/// ndarray.mean(axis=N) — N-D, T in → T out
774+
template<typename T>
775+
inline void mean_axis(const T* src, T* dst, const ptrdiff_t* shape, int ndim, int axis) {
776+
axis_reduce_impl<T>(src, dst, shape, ndim, axis,
777+
[&](ptrdiff_t ib, ptrdiff_t ob, ptrdiff_t as, ptrdiff_t n, T* buf) {
778+
for (ptrdiff_t i = 0; i < n; ++i)
779+
buf[static_cast<size_t>(i)] = src[ib + i * as];
780+
T sum = pairwise_sum(buf, static_cast<size_t>(n));
781+
dst[ob] = sum / static_cast<T>(n);
782+
});
783+
}
784+
802785
// ============================================================================
803786
// norm, dot — used by linalg
804787
// ============================================================================
@@ -828,53 +811,15 @@ inline T dot(const T* a, const T* b, size_t n) {
828811
/// numpy.linalg.norm(x, ord=None, axis=N, keepdims=False) — N-D
829812
template<typename T>
830813
inline void norm_axis(const T* src, T* dst, const ptrdiff_t* shape, int ndim, int axis) {
831-
if (ndim == 0) return;
832-
if (axis < 0) axis += ndim;
833-
ptrdiff_t axis_size = shape[axis];
834-
if (axis_size == 0) return;
835-
836-
// Strides for C-contiguous layout
837-
std::vector<ptrdiff_t> stride(ndim);
838-
stride[ndim - 1] = 1;
839-
for (int d = ndim - 2; d >= 0; --d)
840-
stride[d] = stride[d + 1] * shape[d + 1];
841-
842-
ptrdiff_t axis_stride = stride[axis];
843-
844-
// Number of fibers
845-
ptrdiff_t n_fibers = 1;
846-
for (int d = 0; d < ndim; ++d)
847-
if (d != axis) n_fibers *= shape[d];
848-
849-
// Output stride (same pattern as mean_axis: reduced axis has size 1)
850-
std::vector<ptrdiff_t> out_shape(shape, shape + ndim);
851-
out_shape[axis] = 1;
852-
std::vector<ptrdiff_t> out_stride(ndim);
853-
out_stride[ndim - 1] = 1;
854-
for (int d = ndim - 2; d >= 0; --d)
855-
out_stride[d] = out_stride[d + 1] * out_shape[d + 1];
856-
857-
// Temporary buffer for pairwise sum along fiber
858-
std::vector<T> fiber_buf(static_cast<size_t>(axis_size));
859-
860-
for (ptrdiff_t f = 0; f < n_fibers; ++f) {
861-
ptrdiff_t rem = f;
862-
ptrdiff_t in_base = 0, out_base = 0;
863-
for (int d = ndim - 1; d >= 0; --d) {
864-
if (d == axis) continue;
865-
ptrdiff_t idx = rem % shape[d];
866-
rem /= shape[d];
867-
in_base += idx * stride[d];
868-
out_base += idx * out_stride[d];
869-
}
870-
871-
for (ptrdiff_t i = 0; i < axis_size; ++i) {
872-
T v = src[in_base + i * axis_stride];
873-
fiber_buf[static_cast<size_t>(i)] = v * v;
874-
}
875-
T sum = pairwise_sum(fiber_buf.data(), static_cast<size_t>(axis_size));
876-
dst[out_base] = std::sqrt(sum);
877-
}
814+
axis_reduce_impl<T>(src, dst, shape, ndim, axis,
815+
[&](ptrdiff_t ib, ptrdiff_t ob, ptrdiff_t as, ptrdiff_t n, T* buf) {
816+
for (ptrdiff_t i = 0; i < n; ++i) {
817+
T v = src[ib + i * as];
818+
buf[static_cast<size_t>(i)] = v * v;
819+
}
820+
T sum = pairwise_sum(buf, static_cast<size_t>(n));
821+
dst[ob] = std::sqrt(sum);
822+
});
878823
}
879824

880825
} // namespace numpy

0 commit comments

Comments
 (0)