@@ -333,43 +333,37 @@ inline T var(const T* data, size_t n) {
333333// / numpy.greater(x1, x2, /, out=None, *, where=True, ...)
334334template <typename T>
335335inline void greater (const T* src, bool * dst, size_t n, T threshold) {
336- for ( size_t i = 0 ; i < n; ++i) dst[i] = (src[i] > threshold);
336+ NUMPY_UNROLL4 (i, dst[i] = (src[i] > threshold) );
337337}
338-
339338// / numpy.less(x1, x2, /, out=None, *, where=True, ...)
340339template <typename T>
341340inline void less (const T* src, bool * dst, size_t n, T threshold) {
342- for ( size_t i = 0 ; i < n; ++i) dst[i] = (src[i] < threshold);
341+ NUMPY_UNROLL4 (i, dst[i] = (src[i] < threshold) );
343342}
344-
345343// / numpy.equal(x1, x2, /, out=None, *, where=True, ...)
346344template <typename T>
347345inline void equal (const T* src, bool * dst, size_t n, T val) {
348- for ( size_t i = 0 ; i < n; ++i) dst[i] = (src[i] == val);
346+ NUMPY_UNROLL4 (i, dst[i] = (src[i] == val) );
349347}
350-
351348// / numpy.greater_equal(x1, x2, /, out=None, *, where=True, ...)
352349template <typename T>
353350inline void greater_equal (const T* src, bool * dst, size_t n, T threshold) {
354- for ( size_t i = 0 ; i < n; ++i) dst[i] = (src[i] >= threshold);
351+ NUMPY_UNROLL4 (i, dst[i] = (src[i] >= threshold) );
355352}
356-
357353// / numpy.less_equal(x1, x2, /, out=None, *, where=True, ...)
358354template <typename T>
359355inline void less_equal (const T* src, bool * dst, size_t n, T threshold) {
360- for ( size_t i = 0 ; i < n; ++i) dst[i] = (src[i] <= threshold);
356+ NUMPY_UNROLL4 (i, dst[i] = (src[i] <= threshold) );
361357}
362-
363358// / numpy.not_equal(x1, x2, /, out=None, *, where=True, ...) — scalar variant
364359template <typename T>
365360inline void not_equal_scalar (const T* src, bool * dst, size_t n, T val) {
366- for ( size_t i = 0 ; i < n; ++i) dst[i] = (src[i] != val);
361+ NUMPY_UNROLL4 (i, dst[i] = (src[i] != val) );
367362}
368-
369363// / numpy.not_equal(x1, x2, /, out=None, *, where=True, ...) — array variant
370364template <typename T>
371365inline void not_equal_array (const T* a, const T* b, bool * dst, size_t n) {
372- for ( size_t i = 0 ; i < n; ++i) dst[i] = (a[i] != b[i]);
366+ NUMPY_UNROLL4 (i, dst[i] = (a[i] != b[i]) );
373367}
374368
375369// ============================================================================
@@ -378,22 +372,19 @@ inline void not_equal_array(const T* a, const T* b, bool* dst, size_t n) {
378372
379373// / numpy.logical_and(x1, x2, /, out=None, *, where=True, ...)
380374inline void logical_and (const bool * a, const bool * b, bool * dst, size_t n) {
381- for ( size_t i = 0 ; i < n; ++i) dst[i] = a[i] && b[i];
375+ NUMPY_UNROLL4 (i, dst[i] = a[i] && b[i]) ;
382376}
383-
384377// / numpy.logical_or(x1, x2, /, out=None, *, where=True, ...)
385378inline void logical_or (const bool * a, const bool * b, bool * dst, size_t n) {
386- for ( size_t i = 0 ; i < n; ++i) dst[i] = a[i] || b[i];
379+ NUMPY_UNROLL4 (i, dst[i] = a[i] || b[i]) ;
387380}
388-
389381// / numpy.logical_not(x, /, out=None, *, where=True, ...)
390382inline void logical_not (const bool * src, bool * dst, size_t n) {
391- for ( size_t i = 0 ; i < n; ++i) dst[i] = !src[i];
383+ NUMPY_UNROLL4 (i, dst[i] = !src[i]) ;
392384}
393-
394385// / numpy.logical_xor(x1, x2, /, out=None, *, where=True, ...)
395386inline void logical_xor (const bool * a, const bool * b, bool * dst, size_t n) {
396- for ( size_t i = 0 ; i < n; ++i) dst[i] = a[i] ^ b[i];
387+ NUMPY_UNROLL4 (i, dst[i] = a[i] ^ b[i]) ;
397388}
398389
399390// ============================================================================
@@ -403,19 +394,17 @@ inline void logical_xor(const bool* a, const bool* b, bool* dst, size_t n) {
403394// / numpy.isnan(x, /, out=None, *, where=True, ...)
404395template <typename T>
405396inline void isnan (const T* src, bool * dst, size_t n) {
406- for ( size_t i = 0 ; i < n; ++i) dst[i] = std::isnan (src[i]);
397+ NUMPY_UNROLL4 (i, dst[i] = std::isnan (src[i]) );
407398}
408-
409399// / numpy.isinf(x, /, out=None, *, where=True, ...)
410400template <typename T>
411401inline void isinf (const T* src, bool * dst, size_t n) {
412- for ( size_t i = 0 ; i < n; ++i) dst[i] = std::isinf (src[i]);
402+ NUMPY_UNROLL4 (i, dst[i] = std::isinf (src[i]) );
413403}
414-
415404// / numpy.isfinite(x, /, out=None, *, where=True, ...)
416405template <typename T>
417406inline void isfinite (const T* src, bool * dst, size_t n) {
418- for ( size_t i = 0 ; i < n; ++i) dst[i] = std::isfinite (src[i]);
407+ NUMPY_UNROLL4 (i, dst[i] = std::isfinite (src[i]) );
419408}
420409
421410// ============================================================================
@@ -425,37 +414,32 @@ inline void isfinite(const T* src, bool* dst, size_t n) {
425414// / numpy.arctan2(x1, x2, /, out=None, *, where=True, ...) — array-array
426415template <typename T>
427416inline void arctan2_array (const T* a, const T* b, T* dst, size_t n) {
428- for ( size_t i = 0 ; i < n; ++i) dst[i] = svml::atan2 (a[i], b[i]);
417+ NUMPY_UNROLL4 (i, dst[i] = svml::atan2 (a[i], b[i]) );
429418}
430-
431419// / numpy.arctan2(x1, x2, /, out=None, *, where=True, ...) — array-scalar
432420template <typename T>
433421inline void arctan2_scalar (const T* src, T* dst, size_t n, T b) {
434- for ( size_t i = 0 ; i < n; ++i) dst[i] = svml::atan2 (src[i], b);
422+ NUMPY_UNROLL4 (i, dst[i] = svml::atan2 (src[i], b) );
435423}
436-
437424// / numpy.maximum(x1, x2, /, out=None, *, where=True, ...) — array-array
438425template <typename T>
439426inline void maximum_array (const T* a, const T* b, T* dst, size_t n) {
440- for ( size_t i = 0 ; i < n; ++i) dst[i] = std::max (a[i], b[i]);
427+ NUMPY_UNROLL4 (i, dst[i] = std::max (a[i], b[i]) );
441428}
442-
443429// / numpy.maximum(x1, x2, /, out=None, *, where=True, ...) — scalar variant
444430template <typename T>
445431inline void maximum_scalar (const T* src, T* dst, size_t n, T b) {
446- for ( size_t i = 0 ; i < n; ++i) dst[i] = std::max (src[i], b);
432+ NUMPY_UNROLL4 (i, dst[i] = std::max (src[i], b) );
447433}
448-
449434// / numpy.minimum(x1, x2, /, out=None, *, where=True, ...) — array-array
450435template <typename T>
451436inline void minimum_array (const T* a, const T* b, T* dst, size_t n) {
452- for ( size_t i = 0 ; i < n; ++i) dst[i] = std::min (a[i], b[i]);
437+ NUMPY_UNROLL4 (i, dst[i] = std::min (a[i], b[i]) );
453438}
454-
455439// / numpy.minimum(x1, x2, /, out=None, *, where=True, ...) — scalar variant
456440template <typename T>
457441inline void minimum_scalar (const T* src, T* dst, size_t n, T b) {
458- for ( size_t i = 0 ; i < n; ++i) dst[i] = std::min (src[i], b);
442+ NUMPY_UNROLL4 (i, dst[i] = std::min (src[i], b) );
459443}
460444
461445// ============================================================================
@@ -536,13 +520,12 @@ inline void concatenate(const T* const* arrays, T* dst, const size_t* sizes, siz
536520// / numpy.where(condition, x, y) — scalar x, y
537521template <typename T>
538522inline void where_scalar (const bool * cond, T* dst, size_t n, T x, T y) {
539- for ( size_t i = 0 ; i < n; ++i) dst[i] = cond[i] ? x : y;
523+ NUMPY_UNROLL4 (i, dst[i] = cond[i] ? x : y) ;
540524}
541-
542525// / numpy.where(condition, x, y) — array x, y
543526template <typename T>
544527inline void where_array (const bool * cond, T* dst, size_t n, const T* x, const T* y) {
545- for ( size_t i = 0 ; i < n; ++i) dst[i] = cond[i] ? x[i] : y[i];
528+ NUMPY_UNROLL4 (i, dst[i] = cond[i] ? x[i] : y[i]) ;
546529}
547530
548531// / numpy.transpose(a, axes=None) — 2D only
@@ -604,15 +587,13 @@ inline void roll(const T* src, T* dst, size_t n, ptrdiff_t shift) {
604587 shift = shift % static_cast <ptrdiff_t >(n);
605588 if (shift < 0 ) shift += static_cast <ptrdiff_t >(n);
606589 size_t s = static_cast <size_t >(shift);
607- for (size_t i = 0 ; i < n; ++i)
608- dst[(i + s) % n] = src[i];
590+ NUMPY_UNROLL4 (i, dst[(i + s) % n] = src[i]);
609591}
610592
611593// / numpy.flip(m, axis=None)
612594template <typename T>
613595inline void flip (const T* src, T* dst, size_t n) {
614- for (size_t i = 0 ; i < n; ++i)
615- dst[i] = src[n - 1 - i];
596+ NUMPY_UNROLL4 (i, dst[i] = src[n - 1 - i]);
616597}
617598
618599// / numpy.repeat(a, repeats, axis=None)
@@ -733,72 +714,74 @@ inline double safe_divide(double a, double b, double default_val = 0.0) {
733714// / ndarray.astype(dtype, order='K', casting='unsafe', subok=True, copy=True)
734715template <typename Tout, typename Tin>
735716inline void astype (const Tin* src, Tout* dst, size_t n) {
736- for ( size_t i = 0 ; i < n; ++i) dst[i] = static_cast <Tout>(src[i]);
717+ NUMPY_UNROLL4 (i, dst[i] = static_cast <Tout>(src[i]) );
737718}
738719
739720// / float64 → float32 → float64 roundtrip (for precision testing)
740721inline void truncate_to_float32 (const double * src, double * dst, size_t n) {
741- for (size_t i = 0 ; i < n; ++i) {
742- float tmp = static_cast <float >(src[i]);
743- dst[i] = static_cast <double >(tmp);
744- }
722+ NUMPY_UNROLL4 (i, { float tmp = static_cast <float >(src[i]);
723+ dst[i] = static_cast <double >(tmp); });
745724}
746725
747726// ============================================================================
748- // mean_axis: T in → double out (matching numpy dtype promotion)
727+ // Axis reductions — shared fiber-skeleton for mean_axis, norm_axis, etc.
749728// ============================================================================
750729
751- // / ndarray.mean(axis=N) — N-D, T in → T out (matches numpy: preserves input dtype)
752- template <typename T>
753- inline void mean_axis (const T* src, T* dst, const ptrdiff_t * shape, int ndim, int axis) {
730+ // / Shared axis-reduction skeleton: stride setup, fiber loop, coordinate decomp.
731+ // / Calls op(in_base, out_base, axis_stride, axis_size, buf) per fiber.
732+ template <typename T, typename F>
733+ inline void axis_reduce_impl (const T* src, T* dst, const ptrdiff_t * shape, int ndim,
734+ int axis, F&& op) {
754735 if (ndim == 0 ) return ;
755736 if (axis < 0 ) axis += ndim;
756737 ptrdiff_t axis_size = shape[axis];
757738 if (axis_size == 0 ) return ;
758739
759- // Strides for C-contiguous layout
760740 std::vector<ptrdiff_t > stride (ndim);
761741 stride[ndim - 1 ] = 1 ;
762742 for (int d = ndim - 2 ; d >= 0 ; --d)
763743 stride[d] = stride[d + 1 ] * shape[d + 1 ];
764744
765745 ptrdiff_t axis_stride = stride[axis];
766746
767- // Number of fibers = product of all non-axis dimensions
768747 ptrdiff_t n_fibers = 1 ;
769748 for (int d = 0 ; d < ndim; ++d)
770749 if (d != axis) n_fibers *= shape[d];
771750
772- // Output stride for the flattened 1D dst
773751 std::vector<ptrdiff_t > out_shape (shape, shape + ndim);
774- out_shape[axis] = 1 ; // reduced dimension
752+ out_shape[axis] = 1 ;
775753 std::vector<ptrdiff_t > out_stride (ndim);
776754 out_stride[ndim - 1 ] = 1 ;
777755 for (int d = ndim - 2 ; d >= 0 ; --d)
778756 out_stride[d] = out_stride[d + 1 ] * out_shape[d + 1 ];
779757
780- // Temporary buffer for pairwise sum along fiber
781- std::vector<T> fiber_buf (static_cast <size_t >(axis_size));
758+ std::vector<T> buf (static_cast <size_t >(axis_size));
782759
783760 for (ptrdiff_t f = 0 ; f < n_fibers; ++f) {
784- ptrdiff_t rem = f;
785- ptrdiff_t in_base = 0 , out_base = 0 ;
761+ ptrdiff_t rem = f, in_base = 0 , out_base = 0 ;
786762 for (int d = ndim - 1 ; d >= 0 ; --d) {
787763 if (d == axis) continue ;
788764 ptrdiff_t idx = rem % shape[d];
789765 rem /= shape[d];
790766 in_base += idx * stride[d];
791767 out_base += idx * out_stride[d];
792768 }
793-
794- for (ptrdiff_t i = 0 ; i < axis_size; ++i)
795- fiber_buf[static_cast <size_t >(i)] = src[in_base + i * axis_stride];
796-
797- T sum = pairwise_sum (fiber_buf.data (), static_cast <size_t >(axis_size));
798- dst[out_base] = sum / static_cast <T>(axis_size);
769+ op (in_base, out_base, axis_stride, axis_size, buf.data ());
799770 }
800771}
801772
773+ // / ndarray.mean(axis=N) — N-D, T in → T out
774+ template <typename T>
775+ inline void mean_axis (const T* src, T* dst, const ptrdiff_t * shape, int ndim, int axis) {
776+ axis_reduce_impl<T>(src, dst, shape, ndim, axis,
777+ [&](ptrdiff_t ib, ptrdiff_t ob, ptrdiff_t as, ptrdiff_t n, T* buf) {
778+ for (ptrdiff_t i = 0 ; i < n; ++i)
779+ buf[static_cast <size_t >(i)] = src[ib + i * as];
780+ T sum = pairwise_sum (buf, static_cast <size_t >(n));
781+ dst[ob] = sum / static_cast <T>(n);
782+ });
783+ }
784+
802785// ============================================================================
803786// norm, dot — used by linalg
804787// ============================================================================
@@ -828,53 +811,15 @@ inline T dot(const T* a, const T* b, size_t n) {
828811// / numpy.linalg.norm(x, ord=None, axis=N, keepdims=False) — N-D
829812template <typename T>
830813inline void norm_axis (const T* src, T* dst, const ptrdiff_t * shape, int ndim, int axis) {
831- if (ndim == 0 ) return ;
832- if (axis < 0 ) axis += ndim;
833- ptrdiff_t axis_size = shape[axis];
834- if (axis_size == 0 ) return ;
835-
836- // Strides for C-contiguous layout
837- std::vector<ptrdiff_t > stride (ndim);
838- stride[ndim - 1 ] = 1 ;
839- for (int d = ndim - 2 ; d >= 0 ; --d)
840- stride[d] = stride[d + 1 ] * shape[d + 1 ];
841-
842- ptrdiff_t axis_stride = stride[axis];
843-
844- // Number of fibers
845- ptrdiff_t n_fibers = 1 ;
846- for (int d = 0 ; d < ndim; ++d)
847- if (d != axis) n_fibers *= shape[d];
848-
849- // Output stride (same pattern as mean_axis: reduced axis has size 1)
850- std::vector<ptrdiff_t > out_shape (shape, shape + ndim);
851- out_shape[axis] = 1 ;
852- std::vector<ptrdiff_t > out_stride (ndim);
853- out_stride[ndim - 1 ] = 1 ;
854- for (int d = ndim - 2 ; d >= 0 ; --d)
855- out_stride[d] = out_stride[d + 1 ] * out_shape[d + 1 ];
856-
857- // Temporary buffer for pairwise sum along fiber
858- std::vector<T> fiber_buf (static_cast <size_t >(axis_size));
859-
860- for (ptrdiff_t f = 0 ; f < n_fibers; ++f) {
861- ptrdiff_t rem = f;
862- ptrdiff_t in_base = 0 , out_base = 0 ;
863- for (int d = ndim - 1 ; d >= 0 ; --d) {
864- if (d == axis) continue ;
865- ptrdiff_t idx = rem % shape[d];
866- rem /= shape[d];
867- in_base += idx * stride[d];
868- out_base += idx * out_stride[d];
869- }
870-
871- for (ptrdiff_t i = 0 ; i < axis_size; ++i) {
872- T v = src[in_base + i * axis_stride];
873- fiber_buf[static_cast <size_t >(i)] = v * v;
874- }
875- T sum = pairwise_sum (fiber_buf.data (), static_cast <size_t >(axis_size));
876- dst[out_base] = std::sqrt (sum);
877- }
814+ axis_reduce_impl<T>(src, dst, shape, ndim, axis,
815+ [&](ptrdiff_t ib, ptrdiff_t ob, ptrdiff_t as, ptrdiff_t n, T* buf) {
816+ for (ptrdiff_t i = 0 ; i < n; ++i) {
817+ T v = src[ib + i * as];
818+ buf[static_cast <size_t >(i)] = v * v;
819+ }
820+ T sum = pairwise_sum (buf, static_cast <size_t >(n));
821+ dst[ob] = std::sqrt (sum);
822+ });
878823}
879824
880825} // namespace numpy
0 commit comments