From cb24d65c2587bb29b9ed467a3b45b1928fa1b29d Mon Sep 17 00:00:00 2001 From: Prakhar Deep Date: Thu, 2 Apr 2026 14:11:40 +0530 Subject: [PATCH 1/4] enable MSVC cl.exe build changes as per c++ 14 --- .github/workflows/windows.yml | 74 ++- .../xsimd/arch/common/xsimd_common_memory.hpp | 3 + include/xsimd/arch/xsimd_neon.hpp | 557 +++++++++++++++++- include/xsimd/arch/xsimd_neon64.hpp | 288 ++++++++- 4 files changed, 872 insertions(+), 50 deletions(-) diff --git a/.github/workflows/windows.yml b/.github/workflows/windows.yml index 3fae59208..b76a22a14 100644 --- a/.github/workflows/windows.yml +++ b/.github/workflows/windows.yml @@ -85,7 +85,7 @@ jobs: run: ./_build/test/test_xsimd build-windows-arm64: - name: 'MSVC arm64' + name: 'MSVC ARM64' defaults: run: shell: bash {0} @@ -94,7 +94,7 @@ jobs: - name: Setup compiler uses: ilammy/msvc-dev-cmd@v1 with: - arch: amd64 + arch: arm64 - name: Setup Ninja run: | python3 -m pip install --upgrade pip setuptools wheel @@ -107,3 +107,73 @@ jobs: run: cmake --build _build - name: Testing xsimd run: ./_build/test/test_xsimd + + build-windows-arm64-msys2-clang: + name: 'MSYS2 CLANG ARM64' + runs-on: windows-11-arm + defaults: + run: + shell: msys2 {0} + steps: + - name: Setup MSYS2 with Clang (ARM64) + uses: msys2/setup-msys2@v2 + with: + msystem: CLANGARM64 + update: true + path-type: minimal + pacboy: >- + cc:p + cmake:p + ninja:p + - name: Checkout xsimd + uses: actions/checkout@v4 + - name: Configure + run: | + cmake -B _build \ + -DBUILD_TESTS=ON \ + -DDOWNLOAD_DOCTEST=ON \ + -DBUILD_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -G Ninja + - name: Build + run: cmake --build _build + - name: Testing xsimd + run: ./_build/test/test_xsimd + + build-windows-arm64-clang: + name: 'LLVM CLANG ARM64' + defaults: + run: + shell: bash {0} + runs-on: windows-11-arm + steps: + - name: Install LLVM/Clang for Windows ARM64 + shell: pwsh + run: | + winget install --id LLVM.LLVM --accept-source-agreements 
--accept-package-agreements --silent + # Add LLVM bin directory to PATH for subsequent steps + echo "C:\Program Files\LLVM\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Setup Ninja + run: | + python3 -m pip install --upgrade pip setuptools wheel + python3 -m pip install ninja + - name: Checkout xsimd + uses: actions/checkout@v4 + - name: Verify clang-cl version + run: clang-cl --version + - name: Configure + run: | + cmake -B _build \ + -DCMAKE_C_COMPILER=clang-cl \ + -DCMAKE_CXX_COMPILER=clang-cl \ + -DBUILD_TESTS=ON \ + -DDOWNLOAD_DOCTEST=ON \ + -DBUILD_BENCHMARK=ON \ + -DBUILD_EXAMPLES=ON \ + -DCMAKE_BUILD_TYPE=Release \ + -G Ninja + - name: Build + run: cmake --build _build + - name: Testing xsimd + run: ./_build/test/test_xsimd \ No newline at end of file diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index 6a301dd44..f80a1927a 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -71,6 +71,9 @@ namespace xsimd for (size_t i = 0; i < sizeof...(Is); ++i) if ((bitmask >> i) & 1u) std::swap(mask_buffer[inserted++], mask_buffer[i]); + // Fill remaining (don't-care) tail positions with index 0. 
+ for (size_t i = inserted; i < sizeof...(Is); ++i) + mask_buffer[i] = 0; return batch::load_aligned(&mask_buffer[0]); } } diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 4af19a650..648d8a7dd 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -222,51 +222,33 @@ namespace xsimd * comparison dispatchers * **************************/ + // On MSVC ARM64, all NEON types are the same __n128 type, so we can't specialize + // We use a function-based approach instead template - struct comp_return_type_impl; - - template <> - struct comp_return_type_impl + struct comp_return_type_impl { - using type = uint8x16_t; + using type = T; }; +#if !defined(_MSC_VER) || !defined(_M_ARM64) template <> struct comp_return_type_impl { using type = uint8x16_t; }; - template <> - struct comp_return_type_impl - { - using type = uint16x8_t; - }; - template <> struct comp_return_type_impl { using type = uint16x8_t; }; - template <> - struct comp_return_type_impl - { - using type = uint32x4_t; - }; - template <> struct comp_return_type_impl { using type = uint32x4_t; }; - template <> - struct comp_return_type_impl - { - using type = uint64x2_t; - }; - template <> struct comp_return_type_impl { @@ -278,6 +260,7 @@ namespace xsimd { using type = uint32x4_t; }; +#endif template using comp_return_type = typename comp_return_type_impl::type; @@ -305,6 +288,313 @@ namespace xsimd = std::enable_if_t<(std::is_integral::value && sizeof(T) != 8) || std::is_same::value, int>; } +#if defined(_MSC_VER) && defined(_M_ARM64) + // ----------------------------------------------------------------------- + // C++14-compatible dispatch helpers for MSVC ARM64. + // + // On MSVC ARM64, all NEON types are the same underlying type (__n128), + // so overload resolution on NEON types does not work and the existing + // std::tuple-based dispatcher cannot be used. The original workaround + // used `if constexpr` (C++17). 
The helpers below replace that with + // std::enable_if overloads, which are valid C++14. + // + // Each helper is a function template parameterised on the *element* type + // T. The correct intrinsic is selected at compile time via enable_if on + // sizeof(T) and std::is_unsigned / std::is_floating_point. + // ----------------------------------------------------------------------- + namespace detail { + + // -- load (for set) -- + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_load(const T* d) noexcept { return vld1q_u8(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_load(const T* d) noexcept { return vld1q_s8(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_load(const T* d) noexcept { return vld1q_u16(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_load(const T* d) noexcept { return vld1q_s16(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_load(const T* d) noexcept { return vld1q_u32(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, __n128>::type + msvc_arm64_load(const T* d) noexcept { return vld1q_s32(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_load(const T* d) noexcept { return vld1q_u64(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_load(const T* d) noexcept { return vld1q_s64(reinterpret_cast(d)); } + + // -- load_u (for set) -- loads from unsigned element array + template + XSIMD_INLINE typename std::enable_if::type + msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u8(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::type + 
msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u16(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::type + msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u32(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::type + msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u64(reinterpret_cast(d)); } + + // -- eq for batch_bool (unsigned comparison by size) -- + template + XSIMD_INLINE typename std::enable_if::type + msvc_arm64_eq_bool(__n128 a, __n128 b) noexcept { return vceqq_u8(a,b); } + template + XSIMD_INLINE typename std::enable_if::type + msvc_arm64_eq_bool(__n128 a, __n128 b) noexcept { return vceqq_u16(a,b); } + template + XSIMD_INLINE typename std::enable_if::type + msvc_arm64_eq_bool(__n128 a, __n128 b) noexcept { return vceqq_u32(a,b); } + + } // namespace detail (MSVC ARM64 helpers) + +// Macro to generate C++14 enable_if dispatch overloads for a full binary op +// (all 9 NEON element types: u8,s8,u16,s16,u32,s32,u64,s64,f32). 
+#define XSIMD_MSVC_ARM64_BINARY_FULL(fname, u8fn, s8fn, u16fn, s16fn, u32fn, s32fn, u64fn, s64fn, f32fn) \ + namespace detail { \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u8fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return s8fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u16fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return s16fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u32fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return s32fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return f32fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u64fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return s64fn(a,b); } \ + } + +// Macro for binary ops excluding int64 (u8,s8,u16,s16,u32,s32,f32). 
+#define XSIMD_MSVC_ARM64_BINARY_EX64(fname, u8fn, s8fn, u16fn, s16fn, u32fn, s32fn, f32fn) \ + namespace detail { \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u8fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return s8fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u16fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return s16fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u32fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return s32fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return f32fn(a,b); } \ + } + +// Macro for unsigned-only binary ops excluding int64 (u8,u16,u32). +#define XSIMD_MSVC_ARM64_BINARY_UINT_EX64(fname, u8fn, u16fn, u32fn) \ + namespace detail { \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u8fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u16fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a, __n128 b) noexcept { return u32fn(a,b); } \ + } + +// Macro for unary ops excluding int64 (u8,s8,u16,s16,u32,s32,f32). 
+#define XSIMD_MSVC_ARM64_UNARY_EX64(fname, u8fn, s8fn, u16fn, s16fn, u32fn, s32fn, f32fn) \ + namespace detail { \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a) noexcept { return u8fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a) noexcept { return s8fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a) noexcept { return u16fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a) noexcept { return s16fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a) noexcept { return u32fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, __n128>::type \ + fname(__n128 a) noexcept { return s32fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 a) noexcept { return f32fn(a); } \ + } + +// Macro for select (ternary: cond, a, b) — all 9 types. 
+#define XSIMD_MSVC_ARM64_SELECT_FULL(fname, u8fn, s8fn, u16fn, s16fn, u32fn, s32fn, u64fn, s64fn, f32fn) \ + namespace detail { \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return u8fn(c,a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return s8fn(c,a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return u16fn(c,a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return s16fn(c,a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return u32fn(c,a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return s32fn(c,a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return f32fn(c,a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return u64fn(c,a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::value, __n128>::type \ + fname(__n128 c, __n128 a, __n128 b) noexcept { return s64fn(c,a,b); } \ + } + +// Macro for bitwise ops on batch_bool (unsigned only, all sizes). 
+#define XSIMD_MSVC_ARM64_BINARY_UINT_ALL(fname, u8fn, u16fn, u32fn, u64fn) \ + namespace detail { \ + template \ + XSIMD_INLINE typename std::enable_if::type \ + fname##_bool(__n128 a, __n128 b) noexcept { return u8fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::type \ + fname##_bool(__n128 a, __n128 b) noexcept { return u16fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::type \ + fname##_bool(__n128 a, __n128 b) noexcept { return u32fn(a,b); } \ + template \ + XSIMD_INLINE typename std::enable_if::type \ + fname##_bool(__n128 a, __n128 b) noexcept { return u64fn(a,b); } \ + } + +// Macro for bitwise unary ops on batch_bool (unsigned only, all sizes). +#define XSIMD_MSVC_ARM64_UNARY_UINT_ALL(fname, u8fn, u16fn, u32fn, u64fn) \ + namespace detail { \ + template \ + XSIMD_INLINE typename std::enable_if::type \ + fname##_bool(__n128 a) noexcept { return u8fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::type \ + fname##_bool(__n128 a) noexcept { return u16fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::type \ + fname##_bool(__n128 a) noexcept { return u32fn(a); } \ + template \ + XSIMD_INLINE typename std::enable_if::type \ + fname##_bool(__n128 a) noexcept { return u64fn(a); } \ + } + +// Generate all dispatch helpers used by the MSVC ARM64 paths below. 
+XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_add, vaddq_u8, vaddq_s8, vaddq_u16, vaddq_s16, vaddq_u32, vaddq_s32, vaddq_u64, vaddq_s64, vaddq_f32) +XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_sadd, vqaddq_u8, vqaddq_s8, vqaddq_u16, vqaddq_s16, vqaddq_u32, vqaddq_s32, vqaddq_u64, vqaddq_s64, vaddq_f32) +XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_sub, vsubq_u8, vsubq_s8, vsubq_u16, vsubq_s16, vsubq_u32, vsubq_s32, vsubq_u64, vsubq_s64, vsubq_f32) +XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_ssub, vqsubq_u8, vqsubq_s8, vqsubq_u16, vqsubq_s16, vqsubq_u32, vqsubq_s32, vqsubq_u64, vqsubq_s64, vsubq_f32) +XSIMD_MSVC_ARM64_BINARY_EX64(msvc_arm64_mul, vmulq_u8, vmulq_s8, vmulq_u16, vmulq_s16, vmulq_u32, vmulq_s32, vmulq_f32) +XSIMD_MSVC_ARM64_BINARY_UINT_EX64(msvc_arm64_avg, vhaddq_u8, vhaddq_u16, vhaddq_u32) +XSIMD_MSVC_ARM64_BINARY_UINT_EX64(msvc_arm64_avgr, vrhaddq_u8, vrhaddq_u16, vrhaddq_u32) +XSIMD_MSVC_ARM64_BINARY_EX64(msvc_arm64_eq, vceqq_u8, vceqq_s8, vceqq_u16, vceqq_s16, vceqq_u32, vceqq_s32, vceqq_f32) +XSIMD_MSVC_ARM64_BINARY_EX64(msvc_arm64_lt, vcltq_u8, vcltq_s8, vcltq_u16, vcltq_s16, vcltq_u32, vcltq_s32, vcltq_f32) +XSIMD_MSVC_ARM64_BINARY_EX64(msvc_arm64_le, vcleq_u8, vcleq_s8, vcleq_u16, vcleq_s16, vcleq_u32, vcleq_s32, vcleq_f32) +XSIMD_MSVC_ARM64_BINARY_EX64(msvc_arm64_gt, vcgtq_u8, vcgtq_s8, vcgtq_u16, vcgtq_s16, vcgtq_u32, vcgtq_s32, vcgtq_f32) +XSIMD_MSVC_ARM64_BINARY_EX64(msvc_arm64_ge, vcgeq_u8, vcgeq_s8, vcgeq_u16, vcgeq_s16, vcgeq_u32, vcgeq_s32, vcgeq_f32) +XSIMD_MSVC_ARM64_BINARY_EX64(msvc_arm64_min, vminq_u8, vminq_s8, vminq_u16, vminq_s16, vminq_u32, vminq_s32, vminq_f32) +XSIMD_MSVC_ARM64_BINARY_EX64(msvc_arm64_max, vmaxq_u8, vmaxq_s8, vmaxq_u16, vmaxq_s16, vmaxq_u32, vmaxq_s32, vmaxq_f32) +namespace detail { + XSIMD_INLINE __n128 msvc_arm64_abs_u8(__n128 a) noexcept { return a; } + XSIMD_INLINE __n128 msvc_arm64_abs_u16(__n128 a) noexcept { return a; } + XSIMD_INLINE __n128 msvc_arm64_abs_u32(__n128 a) noexcept { return a; } +} 
+XSIMD_MSVC_ARM64_UNARY_EX64(msvc_arm64_abs, msvc_arm64_abs_u8, vabsq_s8, msvc_arm64_abs_u16, vabsq_s16, msvc_arm64_abs_u32, vabsq_s32, vabsq_f32) + +// bitwise ops on batch +XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_and, vandq_u8, vandq_u8, vandq_u16, vandq_u16, vandq_u32, vandq_u32, vandq_u64, vandq_u64, vandq_u8) +XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_or, vorrq_u8, vorrq_u8, vorrq_u16, vorrq_u16, vorrq_u32, vorrq_u32, vorrq_u64, vorrq_u64, vorrq_u8) +XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_xor, veorq_u8, veorq_u8, veorq_u16, veorq_u16, veorq_u32, veorq_u32, veorq_u64, veorq_u64, veorq_u8) +XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_andn, vbicq_u8, vbicq_u8, vbicq_u16, vbicq_u16, vbicq_u32, vbicq_u32, vbicq_u64, vbicq_u64, vbicq_u8) +// bitwise ops on batch_bool +XSIMD_MSVC_ARM64_BINARY_UINT_ALL(msvc_arm64_and, vandq_u8, vandq_u16, vandq_u32, vandq_u64) +XSIMD_MSVC_ARM64_BINARY_UINT_ALL(msvc_arm64_or, vorrq_u8, vorrq_u16, vorrq_u32, vorrq_u64) +XSIMD_MSVC_ARM64_BINARY_UINT_ALL(msvc_arm64_xor, veorq_u8, veorq_u16, veorq_u32, veorq_u64) +XSIMD_MSVC_ARM64_BINARY_UINT_ALL(msvc_arm64_andn, vbicq_u8, vbicq_u16, vbicq_u32, vbicq_u64) +namespace detail { + // On MSVC ARM64 all NEON types are __n128, so vmvnq_u32 works for any lane width. 
+ XSIMD_INLINE __n128 msvc_arm64_not_u64_impl(__n128 a) noexcept { return vmvnq_u32(a); } +} +XSIMD_MSVC_ARM64_UNARY_UINT_ALL(msvc_arm64_not, vmvnq_u8, vmvnq_u16, vmvnq_u32, msvc_arm64_not_u64_impl) + +// select +XSIMD_MSVC_ARM64_SELECT_FULL(msvc_arm64_select, vbslq_u8, vbslq_s8, vbslq_u16, vbslq_s16, vbslq_u32, vbslq_s32, vbslq_u64, vbslq_s64, vbslq_f32) + +// rotate_left (N is a compile-time constant) +namespace detail { + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_u8(a, a, N); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_s8(a, a, N); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_u16(a, a, N % 8); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_s16(a, a, N % 8); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_u32(a, a, N % 4); } + template + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_s32(a, a, N % 4); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_f32(a, a, N % 4); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_u64(a, a, N % 2); } + template + XSIMD_INLINE typename std::enable_if::value, __n128>::type + msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_s64(a, a, N % 2); } +} // namespace detail +#endif + /************* * broadcast * *************/ @@ -370,21 +660,38 @@ namespace xsimd template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, Args... 
args) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + alignas(16) T data[] = { static_cast(args)... }; + return detail::msvc_arm64_load(data); +#else return xsimd::types::detail::neon_vector_type { args... }; +#endif } template = 0> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + using unsigned_type = as_unsigned_integer_t; + alignas(16) unsigned_type data[] = { static_cast(args ? -1LL : 0LL)... }; + return detail::msvc_arm64_load_u(data); +#else using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; return register_type { static_cast(args ? -1LL : 0LL)... }; +#endif } template XSIMD_INLINE batch set(batch const&, requires_arch, float f0, float f1, float f2, float f3) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64, use load from array instead of brace initialization + alignas(16) float data[] = { f0, f1, f2, f3 }; + return vld1q_f32(data); +#else return float32x4_t { f0, f1, f2, f3 }; +#endif } template @@ -392,16 +699,30 @@ namespace xsimd std::complex c0, std::complex c1, std::complex c2, std::complex c3) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64, use load from array instead of brace initialization + alignas(16) float real_data[] = { c0.real(), c1.real(), c2.real(), c3.real() }; + alignas(16) float imag_data[] = { c0.imag(), c1.imag(), c2.imag(), c3.imag() }; + return batch, A>(vld1q_f32(real_data), vld1q_f32(imag_data)); +#else return batch, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() }, float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() }); +#endif } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... 
args) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64, use load from array instead of brace initialization + using unsigned_type = as_unsigned_integer_t; + alignas(16) unsigned_type data[] = { static_cast(args ? -1LL : 0LL)... }; + return vld1q_u32(data); +#else using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; return register_type { static_cast(args ? -1LL : 0LL)... }; +#endif } /************* @@ -417,7 +738,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { - return vandq_s8(reinterpret_cast(arg.data), vdupq_n_s8(1)); + return vandq_s8(vreinterpretq_s8_u8(arg.data), vdupq_n_s8(1)); } template = 0> @@ -429,7 +750,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { - return vandq_s16(reinterpret_cast(arg.data), vdupq_n_s16(1)); + return vandq_s16(vreinterpretq_s16_u16(arg.data), vdupq_n_s16(1)); } template = 0> @@ -441,7 +762,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { - return vandq_s32(reinterpret_cast(arg.data), vdupq_n_s32(1)); + return vandq_s32(vreinterpretq_s32_u32(arg.data), vdupq_n_s32(1)); } template = 0> @@ -453,7 +774,7 @@ namespace xsimd template = 0> XSIMD_INLINE batch from_bool(batch_bool const& arg, requires_arch) noexcept { - return vandq_s64(reinterpret_cast(arg.data), vdupq_n_s64(1)); + return vandq_s64(vreinterpretq_s64_u64(arg.data), vdupq_n_s64(1)); } template @@ -581,7 +902,7 @@ namespace xsimd XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { auto vmem = load_unaligned((unsigned char const*)mem, convert {}, A {}); - return { 0 - vmem.data }; + return { vsubq_u8(vdupq_n_u8(0), vmem.data) }; } template = 0> XSIMD_INLINE batch_bool load_aligned(bool const* mem, batch_bool t, requires_arch r) noexcept @@ -593,7 +914,7 @@ namespace xsimd 
XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { uint16x8_t vmem = vmovl_u8(vld1_u8((unsigned char const*)mem)); - return { 0 - vmem }; + return { vsubq_u16(vdupq_n_u16(0), vmem) }; } template = 0> @@ -606,7 +927,7 @@ namespace xsimd XSIMD_INLINE batch_bool load_unaligned(bool const* mem, batch_bool, requires_arch) noexcept { uint8x8_t tmp = vreinterpret_u8_u32(vset_lane_u32(*(unsigned int*)mem, vdup_n_u32(0), 0)); - return { 0 - vmovl_u16(vget_low_u16(vmovl_u8(tmp))) }; + return { vsubq_u32(vdupq_n_u32(0), vmovl_u16(vget_low_u16(vmovl_u8(tmp)))) }; } template = 0> @@ -851,12 +1172,17 @@ namespace xsimd * add * *******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT(vaddq, detail::identity_return_type) WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type) +#endif template = 0> XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_add(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16, @@ -864,49 +1190,67 @@ namespace xsimd wrap::vaddq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } /******* * avg * *******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type) +#endif template ::value && sizeof(T) != 8)>> XSIMD_INLINE batch avg(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_avg(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::neon_dispatcher_impl::binary dispatcher = { std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } /******** * avgr * 
********/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type) +#endif template ::value && sizeof(T) != 8)>> XSIMD_INLINE batch avgr(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_avgr(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::neon_dispatcher_impl::binary dispatcher = { std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } /******** * sadd * ********/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT(vqaddq, detail::identity_return_type) +#endif template = 0> XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_sadd(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16, @@ -914,18 +1258,24 @@ namespace xsimd wrap::vaddq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } /******* * sub * *******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT(vsubq, detail::identity_return_type) WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type) +#endif template = 0> XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_sub(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16, @@ -933,17 +1283,23 @@ namespace xsimd wrap::vsubq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } /******** * ssub * 
********/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT(vqsubq, detail::identity_return_type) +#endif template = 0> XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_ssub(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::neon_dispatcher::binary dispatcher = { std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16, @@ -951,24 +1307,31 @@ namespace xsimd wrap::vsubq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } /******* * mul * *******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type) WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type) +#endif template = 0> XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_mul(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::excluding_int64_dispatcher::binary dispatcher = { std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16, wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } /******* @@ -1010,29 +1373,39 @@ namespace xsimd * eq * ******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type) WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type) +#endif template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_eq(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16, 
wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_eq_bool(lhs, rhs); +#else using register_type = typename batch_bool::register_type; using dispatcher_type = detail::neon_comp_dispatcher_impl::binary; const dispatcher_type dispatcher = { std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } template = 0> @@ -1095,18 +1468,24 @@ namespace xsimd * lt * ******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type) WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type) +#endif template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_lt(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16, wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } template = 0> @@ -1128,18 +1507,24 @@ namespace xsimd * le * ******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type) WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type) +#endif template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_le(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vcleq_u8, 
wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16, wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } template = 0> @@ -1164,18 +1549,24 @@ namespace xsimd } } +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type) WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type) +#endif template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_gt(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16, wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } template = 0> @@ -1197,18 +1588,24 @@ namespace xsimd * ge * ******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type) WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type) +#endif template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_ge(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::excluding_int64_comp_dispatcher::binary dispatcher = { std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16, wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } template = 0> @@ -1232,7 +1629,9 @@ namespace xsimd * bitwise_and * ***************/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT(vandq, detail::identity_return_type) +#endif namespace detail { @@ -1242,6 +1641,7 @@ namespace xsimd vreinterpretq_u32_f32(rhs))); } +#if 
!defined(_MSC_VER) || !defined(_M_ARM64) template V bitwise_and_neon(V const& lhs, V const& rhs) { @@ -1252,27 +1652,39 @@ namespace xsimd }; return dispatcher.apply(lhs, rhs); } +#endif } template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64 all NEON types are __n128; vandq_u32 works for all. + return vandq_u32(lhs, rhs); +#else using register_type = typename batch::register_type; return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); +#endif } template = 0> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return vandq_u32(lhs, rhs); +#else using register_type = typename batch_bool::register_type; return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); +#endif } /************** * bitwise_or * **************/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT(vorrq, detail::identity_return_type) +#endif namespace detail { @@ -1282,6 +1694,7 @@ namespace xsimd vreinterpretq_u32_f32(rhs))); } +#if !defined(_MSC_VER) || !defined(_M_ARM64) template XSIMD_INLINE V bitwise_or_neon(V const& lhs, V const& rhs) noexcept { @@ -1292,27 +1705,38 @@ namespace xsimd }; return dispatcher.apply(lhs, rhs); } +#endif } template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return vorrq_u32(lhs, rhs); +#else using register_type = typename batch::register_type; return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); +#endif } template = 0> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return vorrq_u32(lhs, rhs); +#else using register_type = typename batch_bool::register_type; return 
detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); +#endif } /*************** * bitwise_xor * ***************/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT(veorq, detail::identity_return_type) +#endif namespace detail { @@ -1322,6 +1746,7 @@ namespace xsimd vreinterpretq_u32_f32(rhs))); } +#if !defined(_MSC_VER) || !defined(_M_ARM64) template XSIMD_INLINE V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept { @@ -1332,20 +1757,29 @@ namespace xsimd }; return dispatcher.apply(lhs, rhs); } +#endif } template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return veorq_u32(lhs, rhs); +#else using register_type = typename batch::register_type; return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); +#endif } template = 0> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return veorq_u32(lhs, rhs); +#else using register_type = typename batch_bool::register_type; return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); +#endif } /******* @@ -1362,7 +1796,9 @@ namespace xsimd * bitwise_not * ***************/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_UNARY_INT_EXCLUDING_64(vmvnq) +#endif namespace detail { @@ -1371,6 +1807,7 @@ namespace xsimd return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg))); } +#if !defined(_MSC_VER) || !defined(_M_ARM64) template XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept { @@ -1382,27 +1819,39 @@ namespace xsimd }; return dispatcher.apply(arg); } +#endif } template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64 all NEON types are __n128; vmvnq_u32 works for all. 
+ return vmvnq_u32(arg); +#else using register_type = typename batch::register_type; return detail::bitwise_not_neon(register_type(arg)); +#endif } template = 0> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return vmvnq_u32(arg); +#else using register_type = typename batch_bool::register_type; return detail::bitwise_not_neon(register_type(arg)); +#endif } /****************** * bitwise_andnot * ******************/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT(vbicq, detail::identity_return_type) +#endif namespace detail { @@ -1411,6 +1860,7 @@ namespace xsimd return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } +#if !defined(_MSC_VER) || !defined(_M_ARM64) template XSIMD_INLINE V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept { @@ -1421,38 +1871,54 @@ namespace xsimd }; return dispatcher.apply(lhs, rhs); } +#endif } template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64 all NEON types are __n128; vbicq_u32 works for all. 
+ return vbicq_u32(lhs, rhs); +#else using register_type = typename batch::register_type; return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); +#endif } template = 0> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return vbicq_u32(lhs, rhs); +#else using register_type = typename batch_bool::register_type; return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); +#endif } /******* * min * *******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type) WRAP_BINARY_FLOAT(vminq, detail::identity_return_type) +#endif template = 0> XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_min(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::excluding_int64_dispatcher::binary dispatcher = { std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16, wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32) }; return dispatcher.apply(register_type(lhs), register_type(rhs)); +#endif } template = 0> @@ -1465,18 +1931,24 @@ namespace xsimd * max * *******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type) WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type) +#endif template = 0> XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_max(lhs, rhs); +#else using register_type = typename batch::register_type; const detail::excluding_int64_dispatcher::binary dispatcher = { std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16, wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32) }; return dispatcher.apply(register_type(lhs), 
register_type(rhs)); +#endif } template = 0> @@ -1489,6 +1961,7 @@ namespace xsimd * abs * *******/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) namespace wrap { XSIMD_INLINE int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); } @@ -1496,6 +1969,7 @@ namespace xsimd XSIMD_INLINE int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); } } WRAP_UNARY_FLOAT(vabsq) +#endif namespace detail { @@ -1518,12 +1992,16 @@ namespace xsimd template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_abs(arg); +#else using register_type = typename batch::register_type; const detail::excluding_int64_dispatcher::unary dispatcher = { std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16, detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32) }; return dispatcher.apply(register_type(arg)); +#endif } /******** @@ -1843,6 +2321,7 @@ namespace xsimd * select * **********/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) namespace wrap { XSIMD_INLINE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); } @@ -1855,6 +2334,7 @@ namespace xsimd XSIMD_INLINE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); } XSIMD_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); } } +#endif namespace detail { @@ -1883,6 +2363,9 @@ namespace xsimd template = 0> XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_select(cond, a, b); +#else using bool_register_type = typename batch_bool::register_type; using register_type = typename batch::register_type; const detail::neon_select_dispatcher dispatcher = { @@ -1891,6 +2374,7 @@ namespace xsimd wrap::vbslq_f32) }; return 
dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b)); +#endif } template = 0> @@ -2861,6 +3345,7 @@ namespace xsimd * bitwise_cast * ****************/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) #define WRAP_CAST(SUFFIX, TYPE) \ namespace wrap \ { \ @@ -2913,6 +3398,7 @@ namespace xsimd WRAP_CAST(f32, float32x4_t) #undef WRAP_CAST +#endif namespace detail { @@ -2973,6 +3459,10 @@ namespace xsimd template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64, all NEON types are __n128, so just return the argument + return arg.data; +#else const detail::neon_bitwise_caster caster = { std::make_tuple( detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16, @@ -3006,6 +3496,7 @@ namespace xsimd using src_register_type = typename batch::register_type; using dst_register_type = typename batch::register_type; return caster.apply(src_register_type(arg)); +#endif } /********* @@ -3087,6 +3578,7 @@ namespace xsimd /**************** * rotate_left * ****************/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) namespace wrap { template @@ -3108,10 +3600,14 @@ namespace xsimd template XSIMD_INLINE float32x4_t rotate_left_f32(float32x4_t a, float32x4_t b) noexcept { return vextq_f32(a, b, N); } } +#endif template = 0> XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_rotate_left(a); +#else using register_type = typename batch::register_type; // Adding modulo to avoid warning. 
const detail::neon_dispatcher::binary dispatcher = { @@ -3120,6 +3616,7 @@ namespace xsimd wrap::rotate_left_f32) }; return dispatcher.apply(register_type(a), register_type(a)); +#endif } } diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp index 7a5263fb1..656185292 100644 --- a/include/xsimd/arch/xsimd_neon64.hpp +++ b/include/xsimd/arch/xsimd_neon64.hpp @@ -12,6 +12,8 @@ #ifndef XSIMD_NEON64_HPP #define XSIMD_NEON64_HPP +#include +#include #include #include #include @@ -117,16 +119,19 @@ namespace xsimd template XSIMD_INLINE batch set(batch const&, requires_arch, double d0, double d1) noexcept { - return float64x2_t { d0, d1 }; + alignas(16) double data[] = { d0, d1 }; + return vld1q_f64(data); } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, bool b0, bool b1) noexcept { - using register_type = typename batch_bool::register_type; using unsigned_type = as_unsigned_integer_t; - return register_type { static_cast(b0 ? -1LL : 0LL), - static_cast(b1 ? -1LL : 0LL) }; + alignas(16) unsigned_type data[] = { + static_cast(b0 ? -1LL : 0LL), + static_cast(b1 ? 
-1LL : 0LL) + }; + return vld1q_u64(data); } /************* @@ -145,7 +150,11 @@ namespace xsimd #if defined(__clang__) || defined(__GNUC__) #define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16)) #elif defined(_MSC_VER) +#if defined(_M_ARM64) +#define xsimd_aligned_load(inst, type, expr) inst((type)expr) +#else #define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128) +#endif #else #define xsimd_aligned_load(inst, type, expr) inst((type)expr) #endif @@ -194,18 +203,18 @@ namespace xsimd ****************/ template - XSIMD_INLINE batch, A> load_complex_aligned(std::complex const* mem, convert>, requires_arch) noexcept + XSIMD_INLINE batch<::std::complex, A> load_complex_aligned(::std::complex const* mem, convert<::std::complex>, requires_arch) noexcept { using real_batch = batch; const double* buf = reinterpret_cast(mem); float64x2x2_t tmp = vld2q_f64(buf); real_batch real = tmp.val[0], imag = tmp.val[1]; - return batch, A> { real, imag }; + return batch<::std::complex, A> { real, imag }; } template - XSIMD_INLINE batch, A> load_complex_unaligned(std::complex const* mem, convert> cvt, requires_arch) noexcept + XSIMD_INLINE batch<::std::complex, A> load_complex_unaligned(::std::complex const* mem, convert<::std::complex> cvt, requires_arch) noexcept { return load_complex_aligned(mem, cvt, A {}); } @@ -215,7 +224,7 @@ namespace xsimd *****************/ template - XSIMD_INLINE void store_complex_aligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + XSIMD_INLINE void store_complex_aligned(::std::complex* dst, batch<::std::complex, A> const& src, requires_arch) noexcept { float64x2x2_t tmp; tmp.val[0] = src.real(); @@ -225,7 +234,7 @@ namespace xsimd } template - XSIMD_INLINE void store_complex_unaligned(std::complex* dst, batch, A> const& src, requires_arch) noexcept + XSIMD_INLINE void store_complex_unaligned(::std::complex* dst, batch<::std::complex, A> const& src, requires_arch) noexcept { 
store_complex_aligned(dst, src, A {}); } @@ -725,7 +734,7 @@ namespace xsimd template XSIMD_INLINE batch reciprocal(const batch& x, - kernel::requires_arch) noexcept + requires_arch) noexcept { return vrecpeq_f64(x); } @@ -794,7 +803,9 @@ namespace xsimd // Wrap reducer intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq + // On MSVC ARM64, skip these wrappers since all types are __n128 +#if !defined(_MSC_VER) || !defined(_M_ARM64) #define WRAP_REDUCER_INT_EXCLUDING_64(OP) \ namespace wrap \ { \ @@ -850,9 +861,16 @@ namespace xsimd return ::OP##_f64(a); \ } \ } +#else +// On MSVC ARM64, skip wrapper macros +#define WRAP_REDUCER_INT_EXCLUDING_64(OP) +#define WRAP_REDUCER_INT(OP) +#define WRAP_REDUCER_FLOAT(OP) +#endif namespace detail { +#if !defined(_MSC_VER) || !defined(_M_ARM64) template struct reducer_return_type_impl; @@ -929,6 +947,7 @@ namespace xsimd uint32x4_t, int32x4_t, uint64x2_t, int64x2_t, float32x4_t, float64x2_t>; +#endif template using enable_neon64_type_t = std::enable_if_t::value || std::is_same::value || std::is_same::value, int>; @@ -938,12 +957,112 @@ namespace xsimd * reduce_add * **************/ +#if defined(_MSC_VER) && defined(_M_ARM64) + namespace detail { + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_u8(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_s8(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_u16(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_s16(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_u32(a); } + template + XSIMD_INLINE typename std::enable_if::value && 
!std::is_floating_point::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_s32(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_f32(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_u64(a); } + template + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_s64(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_add(__n128 a) noexcept { return vaddvq_f64(a); } + + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return vmaxvq_u8(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return vmaxvq_s8(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return vmaxvq_u16(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return vmaxvq_s16(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return vmaxvq_u32(a); } + template + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return vmaxvq_s32(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return vmaxvq_f32(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); } + template + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { 
return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_max(__n128 a) noexcept { return vmaxvq_f64(a); } + + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return vminvq_u8(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return vminvq_s8(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return vminvq_u16(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return vminvq_s16(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return vminvq_u32(a); } + template + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return vminvq_s32(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return vminvq_f32(a); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1)); } + template + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); } + template + XSIMD_INLINE typename std::enable_if::value, T>::type + msvc_arm64_reduce_min(__n128 a) noexcept { return vminvq_f64(a); } + } // namespace detail +#endif + WRAP_REDUCER_INT(vaddvq) WRAP_REDUCER_FLOAT(vaddvq) template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_reduce_add(arg); 
+#else using register_type = typename batch::register_type; const detail::neon_reducer_dispatcher::unary dispatcher = { std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16, @@ -951,6 +1070,7 @@ namespace xsimd wrap::vaddvq_f32, wrap::vaddvq_f64) }; return dispatcher.apply(register_type(arg)); +#endif } /************** @@ -960,6 +1080,7 @@ namespace xsimd WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq) WRAP_REDUCER_FLOAT(vmaxvq) +#if !defined(_MSC_VER) || !defined(_M_ARM64) namespace wrap { XSIMD_INLINE uint64_t vmaxvq_u64(uint64x2_t a) noexcept @@ -972,10 +1093,14 @@ namespace xsimd return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); } } +#endif template = 0> XSIMD_INLINE typename batch::value_type reduce_max(batch const& arg, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_reduce_max(arg); +#else using register_type = typename batch::register_type; const detail::neon_reducer_dispatcher::unary dispatcher = { std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16, @@ -983,6 +1108,7 @@ namespace xsimd wrap::vmaxvq_f32, wrap::vmaxvq_f64) }; return dispatcher.apply(register_type(arg)); +#endif } /************** @@ -992,6 +1118,7 @@ namespace xsimd WRAP_REDUCER_INT_EXCLUDING_64(vminvq) WRAP_REDUCER_FLOAT(vminvq) +#if !defined(_MSC_VER) || !defined(_M_ARM64) namespace wrap { XSIMD_INLINE uint64_t vminvq_u64(uint64x2_t a) noexcept @@ -1004,10 +1131,14 @@ namespace xsimd return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1)); } } +#endif template = 0> XSIMD_INLINE typename batch::value_type reduce_min(batch const& arg, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return detail::msvc_arm64_reduce_min(arg); +#else using register_type = typename batch::register_type; const detail::neon_reducer_dispatcher::unary dispatcher = { std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16, @@ -1015,6 +1146,7 
@@ namespace xsimd wrap::vminvq_f32, wrap::vminvq_f64) }; return dispatcher.apply(register_type(arg)); +#endif } #undef WRAP_REDUCER_INT_EXCLUDING_64 @@ -1260,6 +1392,7 @@ namespace xsimd * bitwise_cast * ****************/ +#if !defined(_MSC_VER) || !defined(_M_ARM64) #define WRAP_CAST(SUFFIX, TYPE) \ namespace wrap \ { \ @@ -1284,10 +1417,15 @@ namespace xsimd WRAP_CAST(f32, float32x4_t) #undef WRAP_CAST +#endif template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64, all NEON types are __n128; reinterpret is a no-op. + return arg.data; +#else using caster_type = detail::bitwise_caster_impl::register_type; return caster.apply(register_type(arg)); +#endif } +#if !defined(_MSC_VER) || !defined(_M_ARM64) namespace detail { template @@ -1320,10 +1460,15 @@ namespace xsimd } }; } +#endif template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // On MSVC ARM64, all NEON types are __n128; reinterpret is a no-op. + return arg.data; +#else using caster_type = detail::bitwise_caster_neon64::register_type; using dst_register_type = typename batch::register_type; return caster.apply(src_register_type(arg)); +#endif } template @@ -1378,14 +1524,31 @@ namespace xsimd XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // MSVC ARM64: vqtbl1q_* are macro-based and conflict with our wrapper usage. + // Use the two-table lookup (vtbl2_u8) on low/high halves. 
+ uint8x8x2_t tbl = { vget_low_u8(self), vget_high_u8(self) }; + uint8x8_t lo = vtbl2_u8(tbl, vget_low_u8(idx)); + uint8x8_t hi = vtbl2_u8(tbl, vget_high_u8(idx)); + return vcombine_u8(lo, hi); +#else return vqtbl1q_u8(self, idx); +#endif } template XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + // Same approach as above but for signed payload. + uint8x8x2_t tbl = { vreinterpret_u8_s8(vget_low_s8(self)), vreinterpret_u8_s8(vget_high_s8(self)) }; + uint8x8_t lo = vtbl2_u8(tbl, vget_low_u8(idx)); + uint8x8_t hi = vtbl2_u8(tbl, vget_high_u8(idx)); + return vreinterpretq_s8_u8(vcombine_u8(lo, hi)); +#else return vqtbl1q_s8(self, idx); +#endif } template @@ -1395,9 +1558,26 @@ namespace xsimd { using batch_type = batch; using index_type = batch; +#if defined(_MSC_VER) && defined(_M_ARM64) + batch_type self_bytes = batch_type(vreinterpretq_u8_u16(self)); + constexpr std::size_t lanes = batch::size; + constexpr std::size_t elem_bytes = sizeof(uint16_t); + alignas(A::alignment()) uint16_t idx_in[lanes]; + idx.store_aligned(&idx_in[0]); + alignas(A::alignment()) uint8_t idx_out[batch_type::size]; + for (std::size_t j = 0; j < lanes; ++j) + { + std::size_t base = static_cast(idx_in[j]) * elem_bytes; + for (std::size_t k = 0; k < elem_bytes; ++k) + idx_out[j * elem_bytes + k] = static_cast(base + k); + } + index_type indices = index_type::load_aligned(&idx_out[0]); + return vreinterpretq_u16_u8(swizzle(self_bytes, indices, neon64 {})); +#else return vreinterpretq_u16_u8(swizzle(batch_type(vreinterpretq_u8_u16(self)), index_type(vreinterpretq_u8_u16(idx * 0x0202 + 0x0100)), neon64 {})); +#endif } template @@ -1415,9 +1595,26 @@ namespace xsimd { using batch_type = batch; using index_type = batch; +#if defined(_MSC_VER) && defined(_M_ARM64) + batch_type self_bytes = batch_type(vreinterpretq_u8_u32(self)); + constexpr std::size_t lanes = batch::size; + constexpr std::size_t elem_bytes = 
sizeof(uint32_t); + alignas(A::alignment()) uint32_t idx_in[lanes]; + idx.store_aligned(&idx_in[0]); + alignas(A::alignment()) uint8_t idx_out[batch_type::size]; + for (std::size_t j = 0; j < lanes; ++j) + { + std::size_t base = static_cast(idx_in[j]) * elem_bytes; + for (std::size_t k = 0; k < elem_bytes; ++k) + idx_out[j * elem_bytes + k] = static_cast(base + k); + } + index_type indices = index_type::load_aligned(&idx_out[0]); + return vreinterpretq_u32_u8(swizzle(self_bytes, indices, neon64 {})); +#else return vreinterpretq_u32_u8(swizzle(batch_type(vreinterpretq_u8_u32(self)), index_type(vreinterpretq_u8_u32(idx * 0x04040404 + 0x03020100)), neon64 {})); +#endif } template @@ -1435,9 +1632,26 @@ namespace xsimd { using batch_type = batch; using index_type = batch; +#if defined(_MSC_VER) && defined(_M_ARM64) + batch_type self_bytes = batch_type(vreinterpretq_u8_u64(self)); + constexpr std::size_t lanes = batch::size; + constexpr std::size_t elem_bytes = sizeof(uint64_t); + alignas(A::alignment()) uint64_t idx_in[lanes]; + idx.store_aligned(&idx_in[0]); + alignas(A::alignment()) uint8_t idx_out[batch_type::size]; + for (std::size_t j = 0; j < lanes; ++j) + { + std::size_t base = static_cast(idx_in[j]) * elem_bytes; + for (std::size_t k = 0; k < elem_bytes; ++k) + idx_out[j * elem_bytes + k] = static_cast(base + k); + } + index_type indices_batch = index_type::load_aligned(&idx_out[0]); + return vreinterpretq_u64_u8(swizzle(self_bytes, indices_batch, neon64 {})); +#else return vreinterpretq_u64_u8(swizzle(batch_type(vreinterpretq_u8_u64(self)), index_type(vreinterpretq_u8_u64(idx * 0x0808080808080808ull + 0x0706050403020100ull)), neon64 {})); +#endif } template @@ -1521,7 +1735,11 @@ namespace xsimd batch_constant idx, requires_arch) noexcept { +#if defined(_MSC_VER) && defined(_M_ARM64) + return swizzle(self, batch(idx), neon64 {}); +#else return vqtbl1q_u8(self, batch(idx)); +#endif } template idx, requires_arch) noexcept { +#if defined(_MSC_VER) && 
defined(_M_ARM64) + return swizzle(self, batch(idx), neon64 {}); +#else return vqtbl1q_s8(self, batch(idx)); +#endif } template @@ -1569,6 +1791,35 @@ namespace xsimd return vreinterpretq_s32_s8(swizzle(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index(idx), A())); } +#if defined(_MSC_VER) && defined(_M_ARM64) + template + XSIMD_INLINE batch swizzle(batch const& self, + batch_constant, + requires_arch) noexcept + { + static_assert(batch::size == 2, "neon64 uint64 batch must have size 2"); + uint64_t in[2]; + uint64_t out[2]; + self.store_unaligned(in); + out[0] = in[V0]; + out[1] = in[V1]; + return batch::load_unaligned(out); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, + batch_constant, + requires_arch) noexcept + { + static_assert(batch::size == 2, "neon64 int64 batch must have size 2"); + int64_t in[2]; + int64_t out[2]; + self.store_unaligned(in); + out[0] = in[V0]; + out[1] = in[V1]; + return batch::load_unaligned(out); + } +#else template XSIMD_INLINE batch swizzle(batch const& self, batch_constant idx, @@ -1586,6 +1837,7 @@ namespace xsimd using batch_type = batch; return vreinterpretq_s64_s8(swizzle(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index(idx), A())); } +#endif template XSIMD_INLINE batch swizzle(batch const& self, @@ -1606,19 +1858,19 @@ namespace xsimd } template - XSIMD_INLINE batch, A> swizzle(batch, A> const& self, - batch_constant idx, - requires_arch) noexcept + XSIMD_INLINE batch<::std::complex, A> swizzle(batch<::std::complex, A> const& self, + batch_constant idx, + requires_arch) noexcept { - return batch>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); + return batch<::std::complex>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); } template - XSIMD_INLINE batch, A> swizzle(batch, A> const& self, - batch_constant idx, - requires_arch) noexcept + XSIMD_INLINE batch<::std::complex, A> swizzle(batch<::std::complex, A> const& self, + batch_constant idx, + 
requires_arch) noexcept { - return batch>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); + return batch<::std::complex>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); } /********* From 6437cdd5812ae4a2a59607f75c733d165bd2ea8b Mon Sep 17 00:00:00 2001 From: Prakhar Deep Date: Thu, 2 Apr 2026 14:16:25 +0530 Subject: [PATCH 2/4] enable MSVC cl.exe build changes --- include/xsimd/arch/xsimd_neon.hpp | 36 +++---------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 648d8a7dd..5373e47e9 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -289,22 +289,7 @@ namespace xsimd } #if defined(_MSC_VER) && defined(_M_ARM64) - // ----------------------------------------------------------------------- - // C++14-compatible dispatch helpers for MSVC ARM64. - // - // On MSVC ARM64, all NEON types are the same underlying type (__n128), - // so overload resolution on NEON types does not work and the existing - // std::tuple-based dispatcher cannot be used. The original workaround - // used `if constexpr` (C++17). The helpers below replace that with - // std::enable_if overloads, which are valid C++14. - // - // Each helper is a function template parameterised on the *element* type - // T. The correct intrinsic is selected at compile time via enable_if on - // sizeof(T) and std::is_unsigned / std::is_floating_point. 
- // ----------------------------------------------------------------------- namespace detail { - - // -- load (for set) -- template XSIMD_INLINE typename std::enable_if::value, __n128>::type msvc_arm64_load(const T* d) noexcept { return vld1q_u8(reinterpret_cast(d)); } @@ -330,7 +315,6 @@ namespace xsimd XSIMD_INLINE typename std::enable_if::value, __n128>::type msvc_arm64_load(const T* d) noexcept { return vld1q_s64(reinterpret_cast(d)); } - // -- load_u (for set) -- loads from unsigned element array template XSIMD_INLINE typename std::enable_if::type msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u8(reinterpret_cast(d)); } @@ -344,7 +328,6 @@ namespace xsimd XSIMD_INLINE typename std::enable_if::type msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u64(reinterpret_cast(d)); } - // -- eq for batch_bool (unsigned comparison by size) -- template XSIMD_INLINE typename std::enable_if::type msvc_arm64_eq_bool(__n128 a, __n128 b) noexcept { return vceqq_u8(a,b); } @@ -355,10 +338,8 @@ namespace xsimd XSIMD_INLINE typename std::enable_if::type msvc_arm64_eq_bool(__n128 a, __n128 b) noexcept { return vceqq_u32(a,b); } - } // namespace detail (MSVC ARM64 helpers) + } -// Macro to generate C++14 enable_if dispatch overloads for a full binary op -// (all 9 NEON element types: u8,s8,u16,s16,u32,s32,u64,s64,f32). #define XSIMD_MSVC_ARM64_BINARY_FULL(fname, u8fn, s8fn, u16fn, s16fn, u32fn, s32fn, u64fn, s64fn, f32fn) \ namespace detail { \ template \ @@ -390,7 +371,6 @@ namespace xsimd fname(__n128 a, __n128 b) noexcept { return s64fn(a,b); } \ } -// Macro for binary ops excluding int64 (u8,s8,u16,s16,u32,s32,f32). #define XSIMD_MSVC_ARM64_BINARY_EX64(fname, u8fn, s8fn, u16fn, s16fn, u32fn, s32fn, f32fn) \ namespace detail { \ template \ @@ -416,7 +396,6 @@ namespace xsimd fname(__n128 a, __n128 b) noexcept { return f32fn(a,b); } \ } -// Macro for unsigned-only binary ops excluding int64 (u8,u16,u32). 
#define XSIMD_MSVC_ARM64_BINARY_UINT_EX64(fname, u8fn, u16fn, u32fn) \ namespace detail { \ template \ @@ -430,7 +409,6 @@ namespace xsimd fname(__n128 a, __n128 b) noexcept { return u32fn(a,b); } \ } -// Macro for unary ops excluding int64 (u8,s8,u16,s16,u32,s32,f32). #define XSIMD_MSVC_ARM64_UNARY_EX64(fname, u8fn, s8fn, u16fn, s16fn, u32fn, s32fn, f32fn) \ namespace detail { \ template \ @@ -456,7 +434,6 @@ namespace xsimd fname(__n128 a) noexcept { return f32fn(a); } \ } -// Macro for select (ternary: cond, a, b) — all 9 types. #define XSIMD_MSVC_ARM64_SELECT_FULL(fname, u8fn, s8fn, u16fn, s16fn, u32fn, s32fn, u64fn, s64fn, f32fn) \ namespace detail { \ template \ @@ -488,7 +465,6 @@ namespace xsimd fname(__n128 c, __n128 a, __n128 b) noexcept { return s64fn(c,a,b); } \ } -// Macro for bitwise ops on batch_bool (unsigned only, all sizes). #define XSIMD_MSVC_ARM64_BINARY_UINT_ALL(fname, u8fn, u16fn, u32fn, u64fn) \ namespace detail { \ template \ @@ -505,7 +481,6 @@ namespace xsimd fname##_bool(__n128 a, __n128 b) noexcept { return u64fn(a,b); } \ } -// Macro for bitwise unary ops on batch_bool (unsigned only, all sizes). #define XSIMD_MSVC_ARM64_UNARY_UINT_ALL(fname, u8fn, u16fn, u32fn, u64fn) \ namespace detail { \ template \ @@ -522,7 +497,6 @@ namespace xsimd fname##_bool(__n128 a) noexcept { return u64fn(a); } \ } -// Generate all dispatch helpers used by the MSVC ARM64 paths below. 
XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_add, vaddq_u8, vaddq_s8, vaddq_u16, vaddq_s16, vaddq_u32, vaddq_s32, vaddq_u64, vaddq_s64, vaddq_f32) XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_sadd, vqaddq_u8, vqaddq_s8, vqaddq_u16, vqaddq_s16, vqaddq_u32, vqaddq_s32, vqaddq_u64, vqaddq_s64, vaddq_f32) XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_sub, vsubq_u8, vsubq_s8, vsubq_u16, vsubq_s16, vsubq_u32, vsubq_s32, vsubq_u64, vsubq_s64, vsubq_f32) @@ -544,26 +518,22 @@ namespace detail { } XSIMD_MSVC_ARM64_UNARY_EX64(msvc_arm64_abs, msvc_arm64_abs_u8, vabsq_s8, msvc_arm64_abs_u16, vabsq_s16, msvc_arm64_abs_u32, vabsq_s32, vabsq_f32) -// bitwise ops on batch XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_and, vandq_u8, vandq_u8, vandq_u16, vandq_u16, vandq_u32, vandq_u32, vandq_u64, vandq_u64, vandq_u8) XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_or, vorrq_u8, vorrq_u8, vorrq_u16, vorrq_u16, vorrq_u32, vorrq_u32, vorrq_u64, vorrq_u64, vorrq_u8) XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_xor, veorq_u8, veorq_u8, veorq_u16, veorq_u16, veorq_u32, veorq_u32, veorq_u64, veorq_u64, veorq_u8) XSIMD_MSVC_ARM64_BINARY_FULL(msvc_arm64_andn, vbicq_u8, vbicq_u8, vbicq_u16, vbicq_u16, vbicq_u32, vbicq_u32, vbicq_u64, vbicq_u64, vbicq_u8) -// bitwise ops on batch_bool + XSIMD_MSVC_ARM64_BINARY_UINT_ALL(msvc_arm64_and, vandq_u8, vandq_u16, vandq_u32, vandq_u64) XSIMD_MSVC_ARM64_BINARY_UINT_ALL(msvc_arm64_or, vorrq_u8, vorrq_u16, vorrq_u32, vorrq_u64) XSIMD_MSVC_ARM64_BINARY_UINT_ALL(msvc_arm64_xor, veorq_u8, veorq_u16, veorq_u32, veorq_u64) XSIMD_MSVC_ARM64_BINARY_UINT_ALL(msvc_arm64_andn, vbicq_u8, vbicq_u16, vbicq_u32, vbicq_u64) namespace detail { - // On MSVC ARM64 all NEON types are __n128, so vmvnq_u32 works for any lane width. 
XSIMD_INLINE __n128 msvc_arm64_not_u64_impl(__n128 a) noexcept { return vmvnq_u32(a); } } XSIMD_MSVC_ARM64_UNARY_UINT_ALL(msvc_arm64_not, vmvnq_u8, vmvnq_u16, vmvnq_u32, msvc_arm64_not_u64_impl) -// select XSIMD_MSVC_ARM64_SELECT_FULL(msvc_arm64_select, vbslq_u8, vbslq_s8, vbslq_u16, vbslq_s16, vbslq_u32, vbslq_s32, vbslq_u64, vbslq_s64, vbslq_f32) -// rotate_left (N is a compile-time constant) namespace detail { template XSIMD_INLINE typename std::enable_if::value, __n128>::type @@ -592,7 +562,7 @@ namespace detail { template XSIMD_INLINE typename std::enable_if::value, __n128>::type msvc_arm64_rotate_left(__n128 a) noexcept { return vextq_s64(a, a, N % 2); } -} // namespace detail +} #endif /************* From 7e3fe1afca90de8475e411d6b48419838edef622 Mon Sep 17 00:00:00 2001 From: Prakhar Deep Date: Thu, 2 Apr 2026 15:19:22 +0530 Subject: [PATCH 3/4] fix for clang build failure --- include/xsimd/arch/xsimd_neon.hpp | 130 ++++++++++++++-------------- include/xsimd/arch/xsimd_neon64.hpp | 42 ++++----- 2 files changed, 86 insertions(+), 86 deletions(-) diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 5373e47e9..1f022746e 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -1,4 +1,4 @@ -/*************************************************************************** +/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * @@ -230,7 +230,7 @@ namespace xsimd using type = T; }; -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) template <> struct comp_return_type_impl { @@ -288,7 +288,7 @@ namespace xsimd = std::enable_if_t<(std::is_integral::value && sizeof(T) != 8) || std::is_same::value, int>; } -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && 
!defined(__clang__) namespace detail { template XSIMD_INLINE typename std::enable_if::value, __n128>::type @@ -630,7 +630,7 @@ namespace detail { template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, Args... args) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) alignas(16) T data[] = { static_cast(args)... }; return detail::msvc_arm64_load(data); #else @@ -641,7 +641,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) using unsigned_type = as_unsigned_integer_t; alignas(16) unsigned_type data[] = { static_cast(args ? -1LL : 0LL)... }; return detail::msvc_arm64_load_u(data); @@ -655,7 +655,7 @@ namespace detail { template XSIMD_INLINE batch set(batch const&, requires_arch, float f0, float f1, float f2, float f3) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64, use load from array instead of brace initialization alignas(16) float data[] = { f0, f1, f2, f3 }; return vld1q_f32(data); @@ -669,7 +669,7 @@ namespace detail { std::complex c0, std::complex c1, std::complex c2, std::complex c3) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64, use load from array instead of brace initialization alignas(16) float real_data[] = { c0.real(), c1.real(), c2.real(), c3.real() }; alignas(16) float imag_data[] = { c0.imag(), c1.imag(), c2.imag(), c3.imag() }; @@ -683,7 +683,7 @@ namespace detail { template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... 
args) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64, use load from array instead of brace initialization using unsigned_type = as_unsigned_integer_t; alignas(16) unsigned_type data[] = { static_cast(args ? -1LL : 0LL)... }; @@ -1142,7 +1142,7 @@ namespace detail { * add * *******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT(vaddq, detail::identity_return_type) WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type) #endif @@ -1150,7 +1150,7 @@ namespace detail { template = 0> XSIMD_INLINE batch add(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_add(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1167,14 +1167,14 @@ namespace detail { * avg * *******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type) #endif template ::value && sizeof(T) != 8)>> XSIMD_INLINE batch avg(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_avg(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1189,14 +1189,14 @@ namespace detail { * avgr * ********/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type) #endif template ::value && sizeof(T) != 8)>> XSIMD_INLINE batch avgr(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && 
defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_avgr(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1211,14 +1211,14 @@ namespace detail { * sadd * ********/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT(vqaddq, detail::identity_return_type) #endif template = 0> XSIMD_INLINE batch sadd(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_sadd(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1235,7 +1235,7 @@ namespace detail { * sub * *******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT(vsubq, detail::identity_return_type) WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type) #endif @@ -1243,7 +1243,7 @@ namespace detail { template = 0> XSIMD_INLINE batch sub(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_sub(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1260,14 +1260,14 @@ namespace detail { * ssub * ********/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT(vqsubq, detail::identity_return_type) #endif template = 0> XSIMD_INLINE batch ssub(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_ssub(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1284,7 +1284,7 @@ namespace detail { * mul * *******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if 
!defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type) WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type) #endif @@ -1292,7 +1292,7 @@ namespace detail { template = 0> XSIMD_INLINE batch mul(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_mul(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1343,7 +1343,7 @@ namespace detail { * eq * ******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type) WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type) #endif @@ -1351,7 +1351,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool eq(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_eq(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1366,7 +1366,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool eq(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_eq_bool(lhs, rhs); #else using register_type = typename batch_bool::register_type; @@ -1438,7 +1438,7 @@ namespace detail { * lt * ******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type) WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type) #endif @@ -1446,7 +1446,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool lt(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if 
defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_lt(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1477,7 +1477,7 @@ namespace detail { * le * ******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type) WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type) #endif @@ -1485,7 +1485,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool le(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_le(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1519,7 +1519,7 @@ namespace detail { } } -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type) WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type) #endif @@ -1527,7 +1527,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool gt(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_gt(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1558,7 +1558,7 @@ namespace detail { * ge * ******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type) WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type) #endif @@ -1566,7 +1566,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool ge(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && 
defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_ge(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1599,7 +1599,7 @@ namespace detail { * bitwise_and * ***************/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT(vandq, detail::identity_return_type) #endif @@ -1611,7 +1611,7 @@ namespace detail { vreinterpretq_u32_f32(rhs))); } -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) template V bitwise_and_neon(V const& lhs, V const& rhs) { @@ -1628,7 +1628,7 @@ namespace detail { template = 0> XSIMD_INLINE batch bitwise_and(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64 all NEON types are __n128; vandq_u32 works for all. return vandq_u32(lhs, rhs); #else @@ -1640,7 +1640,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool bitwise_and(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return vandq_u32(lhs, rhs); #else using register_type = typename batch_bool::register_type; @@ -1652,7 +1652,7 @@ namespace detail { * bitwise_or * **************/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT(vorrq, detail::identity_return_type) #endif @@ -1664,7 +1664,7 @@ namespace detail { vreinterpretq_u32_f32(rhs))); } -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) template XSIMD_INLINE V bitwise_or_neon(V const& lhs, V const& rhs) noexcept { @@ -1681,7 +1681,7 @@ namespace detail { template = 0> XSIMD_INLINE batch bitwise_or(batch const& lhs, batch const& rhs, 
requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return vorrq_u32(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1692,7 +1692,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return vorrq_u32(lhs, rhs); #else using register_type = typename batch_bool::register_type; @@ -1704,7 +1704,7 @@ namespace detail { * bitwise_xor * ***************/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT(veorq, detail::identity_return_type) #endif @@ -1716,7 +1716,7 @@ namespace detail { vreinterpretq_u32_f32(rhs))); } -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) template XSIMD_INLINE V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept { @@ -1733,7 +1733,7 @@ namespace detail { template = 0> XSIMD_INLINE batch bitwise_xor(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return veorq_u32(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1744,7 +1744,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return veorq_u32(lhs, rhs); #else using register_type = typename batch_bool::register_type; @@ -1766,7 +1766,7 @@ namespace detail { * bitwise_not * ***************/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || 
defined(__clang__) WRAP_UNARY_INT_EXCLUDING_64(vmvnq) #endif @@ -1777,7 +1777,7 @@ namespace detail { return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg))); } -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) template XSIMD_INLINE V bitwise_not_neon(V const& arg) noexcept { @@ -1795,7 +1795,7 @@ namespace detail { template = 0> XSIMD_INLINE batch bitwise_not(batch const& arg, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64 all NEON types are __n128; vmvnq_u32 works for all. return vmvnq_u32(arg); #else @@ -1807,7 +1807,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& arg, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return vmvnq_u32(arg); #else using register_type = typename batch_bool::register_type; @@ -1819,7 +1819,7 @@ namespace detail { * bitwise_andnot * ******************/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT(vbicq, detail::identity_return_type) #endif @@ -1830,7 +1830,7 @@ namespace detail { return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); } -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) template XSIMD_INLINE V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept { @@ -1847,7 +1847,7 @@ namespace detail { template = 0> XSIMD_INLINE batch bitwise_andnot(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64 all NEON types are __n128; vbicq_u32 works for all. 
return vbicq_u32(lhs, rhs); #else @@ -1859,7 +1859,7 @@ namespace detail { template = 0> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& lhs, batch_bool const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return vbicq_u32(lhs, rhs); #else using register_type = typename batch_bool::register_type; @@ -1871,7 +1871,7 @@ namespace detail { * min * *******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type) WRAP_BINARY_FLOAT(vminq, detail::identity_return_type) #endif @@ -1879,7 +1879,7 @@ namespace detail { template = 0> XSIMD_INLINE batch min(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_min(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1901,7 +1901,7 @@ namespace detail { * max * *******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type) WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type) #endif @@ -1909,7 +1909,7 @@ namespace detail { template = 0> XSIMD_INLINE batch max(batch const& lhs, batch const& rhs, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_max(lhs, rhs); #else using register_type = typename batch::register_type; @@ -1931,7 +1931,7 @@ namespace detail { * abs * *******/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) namespace wrap { XSIMD_INLINE int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); } @@ -1962,7 +1962,7 @@ 
namespace detail { template = 0> XSIMD_INLINE batch abs(batch const& arg, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_abs(arg); #else using register_type = typename batch::register_type; @@ -2291,7 +2291,7 @@ namespace detail { * select * **********/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) namespace wrap { XSIMD_INLINE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); } @@ -2333,7 +2333,7 @@ namespace detail { template = 0> XSIMD_INLINE batch select(batch_bool const& cond, batch const& a, batch const& b, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_select(cond, a, b); #else using bool_register_type = typename batch_bool::register_type; @@ -3315,7 +3315,7 @@ namespace detail { * bitwise_cast * ****************/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) #define WRAP_CAST(SUFFIX, TYPE) \ namespace wrap \ { \ @@ -3429,7 +3429,7 @@ namespace detail { template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64, all NEON types are __n128, so just return the argument return arg.data; #else @@ -3548,7 +3548,7 @@ namespace detail { /**************** * rotate_left * ****************/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) namespace wrap { template @@ -3575,7 +3575,7 @@ namespace detail { template = 0> XSIMD_INLINE batch rotate_left(batch const& a, requires_arch) noexcept { -#if defined(_MSC_VER) && 
defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_rotate_left(a); #else using register_type = typename batch::register_type; diff --git a/include/xsimd/arch/xsimd_neon64.hpp b/include/xsimd/arch/xsimd_neon64.hpp index 656185292..041b12d92 100644 --- a/include/xsimd/arch/xsimd_neon64.hpp +++ b/include/xsimd/arch/xsimd_neon64.hpp @@ -1,4 +1,4 @@ -/*************************************************************************** +/*************************************************************************** * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * * Martin Renou * * Copyright (c) QuantStack * @@ -805,7 +805,7 @@ namespace xsimd // - OP: intrinsics name prefix, e.g., vorrq // On MSVC ARM64, skip these wrappers since all types are __n128 -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) #define WRAP_REDUCER_INT_EXCLUDING_64(OP) \ namespace wrap \ { \ @@ -870,7 +870,7 @@ namespace xsimd namespace detail { -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) template struct reducer_return_type_impl; @@ -957,7 +957,7 @@ namespace xsimd * reduce_add * **************/ -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) namespace detail { template XSIMD_INLINE typename std::enable_if::value, T>::type @@ -1060,7 +1060,7 @@ namespace xsimd template = 0> XSIMD_INLINE typename batch::value_type reduce_add(batch const& arg, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_reduce_add(arg); #else using register_type = typename batch::register_type; @@ -1080,7 +1080,7 @@ namespace xsimd WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq) WRAP_REDUCER_FLOAT(vmaxvq) -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if 
!defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) namespace wrap { XSIMD_INLINE uint64_t vmaxvq_u64(uint64x2_t a) noexcept @@ -1098,7 +1098,7 @@ namespace xsimd template = 0> XSIMD_INLINE typename batch::value_type reduce_max(batch const& arg, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_reduce_max(arg); #else using register_type = typename batch::register_type; @@ -1118,7 +1118,7 @@ namespace xsimd WRAP_REDUCER_INT_EXCLUDING_64(vminvq) WRAP_REDUCER_FLOAT(vminvq) -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) namespace wrap { XSIMD_INLINE uint64_t vminvq_u64(uint64x2_t a) noexcept @@ -1136,7 +1136,7 @@ namespace xsimd template = 0> XSIMD_INLINE typename batch::value_type reduce_min(batch const& arg, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return detail::msvc_arm64_reduce_min(arg); #else using register_type = typename batch::register_type; @@ -1392,7 +1392,7 @@ namespace xsimd * bitwise_cast * ****************/ -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) #define WRAP_CAST(SUFFIX, TYPE) \ namespace wrap \ { \ @@ -1422,7 +1422,7 @@ namespace xsimd template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64, all NEON types are __n128; reinterpret is a no-op. 
return arg.data; #else @@ -1442,7 +1442,7 @@ namespace xsimd #endif } -#if !defined(_MSC_VER) || !defined(_M_ARM64) +#if !defined(_MSC_VER) || !defined(_M_ARM64) || defined(__clang__) namespace detail { template @@ -1465,7 +1465,7 @@ namespace xsimd template XSIMD_INLINE batch bitwise_cast(batch const& arg, batch const&, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // On MSVC ARM64, all NEON types are __n128; reinterpret is a no-op. return arg.data; #else @@ -1524,7 +1524,7 @@ namespace xsimd XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // MSVC ARM64: vqtbl1q_* are macro-based and conflict with our wrapper usage. // Use the two-table lookup (vtbl2_u8) on low/high halves. uint8x8x2_t tbl = { vget_low_u8(self), vget_high_u8(self) }; @@ -1540,7 +1540,7 @@ namespace xsimd XSIMD_INLINE batch swizzle(batch const& self, batch idx, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) // Same approach as above but for signed payload. 
uint8x8x2_t tbl = { vreinterpret_u8_s8(vget_low_s8(self)), vreinterpret_u8_s8(vget_high_s8(self)) }; uint8x8_t lo = vtbl2_u8(tbl, vget_low_u8(idx)); @@ -1558,7 +1558,7 @@ namespace xsimd { using batch_type = batch; using index_type = batch; -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) batch_type self_bytes = batch_type(vreinterpretq_u8_u16(self)); constexpr std::size_t lanes = batch::size; constexpr std::size_t elem_bytes = sizeof(uint16_t); @@ -1595,7 +1595,7 @@ namespace xsimd { using batch_type = batch; using index_type = batch; -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) batch_type self_bytes = batch_type(vreinterpretq_u8_u32(self)); constexpr std::size_t lanes = batch::size; constexpr std::size_t elem_bytes = sizeof(uint32_t); @@ -1632,7 +1632,7 @@ namespace xsimd { using batch_type = batch; using index_type = batch; -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) batch_type self_bytes = batch_type(vreinterpretq_u8_u64(self)); constexpr std::size_t lanes = batch::size; constexpr std::size_t elem_bytes = sizeof(uint64_t); @@ -1735,7 +1735,7 @@ namespace xsimd batch_constant idx, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return swizzle(self, batch(idx), neon64 {}); #else return vqtbl1q_u8(self, batch(idx)); @@ -1748,7 +1748,7 @@ namespace xsimd batch_constant idx, requires_arch) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) return swizzle(self, batch(idx), neon64 {}); #else return vqtbl1q_s8(self, batch(idx)); @@ -1791,7 +1791,7 @@ namespace xsimd return vreinterpretq_s32_s8(swizzle(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index(idx), A())); } -#if defined(_MSC_VER) && defined(_M_ARM64) 
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) template XSIMD_INLINE batch swizzle(batch const& self, batch_constant, From 7337ff85552abb290706ad86b76fe043b63848b7 Mon Sep 17 00:00:00 2001 From: Prakhar Deep Date: Fri, 3 Apr 2026 17:30:20 +0530 Subject: [PATCH 4/4] removed unnecessary macro --- .../xsimd/arch/common/xsimd_common_memory.hpp | 3 +- include/xsimd/arch/xsimd_neon.hpp | 125 ++++++++---------- 2 files changed, 59 insertions(+), 69 deletions(-) diff --git a/include/xsimd/arch/common/xsimd_common_memory.hpp b/include/xsimd/arch/common/xsimd_common_memory.hpp index f80a1927a..1877fcd97 100644 --- a/include/xsimd/arch/common/xsimd_common_memory.hpp +++ b/include/xsimd/arch/common/xsimd_common_memory.hpp @@ -72,8 +72,7 @@ namespace xsimd if ((bitmask >> i) & 1u) std::swap(mask_buffer[inserted++], mask_buffer[i]); // Fill remaining (don't-care) tail positions with index 0. - for (size_t i = inserted; i < sizeof...(Is); ++i) - mask_buffer[i] = 0; + std::fill(mask_buffer + inserted, mask_buffer + sizeof...(Is), IT(0)); return batch::load_aligned(&mask_buffer[0]); } } diff --git a/include/xsimd/arch/xsimd_neon.hpp b/include/xsimd/arch/xsimd_neon.hpp index 1f022746e..447e370fb 100644 --- a/include/xsimd/arch/xsimd_neon.hpp +++ b/include/xsimd/arch/xsimd_neon.hpp @@ -255,6 +255,12 @@ namespace xsimd using type = uint64x2_t; }; + template <> + struct comp_return_type_impl + { + using type = uint64x2_t; + }; + template <> struct comp_return_type_impl { @@ -290,43 +296,9 @@ namespace xsimd #if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) namespace detail { - template - XSIMD_INLINE typename std::enable_if::value, __n128>::type - msvc_arm64_load(const T* d) noexcept { return vld1q_u8(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::value, __n128>::type - msvc_arm64_load(const T* d) noexcept { return vld1q_s8(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::value, __n128>::type - 
msvc_arm64_load(const T* d) noexcept { return vld1q_u16(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::value, __n128>::type - msvc_arm64_load(const T* d) noexcept { return vld1q_s16(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::value, __n128>::type - msvc_arm64_load(const T* d) noexcept { return vld1q_u32(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, __n128>::type - msvc_arm64_load(const T* d) noexcept { return vld1q_s32(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::value, __n128>::type - msvc_arm64_load(const T* d) noexcept { return vld1q_u64(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::value, __n128>::type - msvc_arm64_load(const T* d) noexcept { return vld1q_s64(reinterpret_cast(d)); } - - template - XSIMD_INLINE typename std::enable_if::type - msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u8(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::type - msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u16(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::type - msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u32(reinterpret_cast(d)); } - template - XSIMD_INLINE typename std::enable_if::type - msvc_arm64_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u64(reinterpret_cast(d)); } + // msvc_arm64_load / msvc_arm64_load_u have been superseded by the + // cross-platform detail::neon_load / detail::neon_load_u helpers + // defined below (outside this block). They are no longer used here. template XSIMD_INLINE typename std::enable_if::type @@ -565,6 +537,51 @@ namespace detail { } #endif + namespace detail + { + // Cross-platform helpers: load a NEON register from an aligned array. 
+ // On GCC/Clang the return type is the specific NEON vector type; + // on MSVC ARM64 all NEON types are __n128, so the same code works. + template + XSIMD_INLINE typename std::enable_if::value, uint8x16_t>::type + neon_load(const T* d) noexcept { return vld1q_u8(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, int8x16_t>::type + neon_load(const T* d) noexcept { return vld1q_s8(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, uint16x8_t>::type + neon_load(const T* d) noexcept { return vld1q_u16(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, int16x8_t>::type + neon_load(const T* d) noexcept { return vld1q_s16(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, uint32x4_t>::type + neon_load(const T* d) noexcept { return vld1q_u32(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value && !std::is_floating_point::value, int32x4_t>::type + neon_load(const T* d) noexcept { return vld1q_s32(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, uint64x2_t>::type + neon_load(const T* d) noexcept { return vld1q_u64(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::value, int64x2_t>::type + neon_load(const T* d) noexcept { return vld1q_s64(reinterpret_cast(d)); } + + // Load the unsigned-integer representation of T from an aligned array. 
+ template + XSIMD_INLINE typename std::enable_if::type + neon_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u8(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::type + neon_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u16(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::type + neon_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u32(reinterpret_cast(d)); } + template + XSIMD_INLINE typename std::enable_if::type + neon_load_u(const as_unsigned_integer_t* d) noexcept { return vld1q_u64(reinterpret_cast(d)); } + } + /************* * broadcast * *************/ @@ -630,38 +647,25 @@ namespace detail { template = 0> XSIMD_INLINE batch set(batch const&, requires_arch, Args... args) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) + // Use load-from-array on all platforms: avoids brace-init of NEON types + // (which MSVC ARM64 does not support) while remaining portable. alignas(16) T data[] = { static_cast(args)... }; - return detail::msvc_arm64_load(data); -#else - return xsimd::types::detail::neon_vector_type { args... }; -#endif + return detail::neon_load(data); } template = 0> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) using unsigned_type = as_unsigned_integer_t; alignas(16) unsigned_type data[] = { static_cast(args ? -1LL : 0LL)... }; - return detail::msvc_arm64_load_u(data); -#else - using register_type = typename batch_bool::register_type; - using unsigned_type = as_unsigned_integer_t; - return register_type { static_cast(args ? -1LL : 0LL)... 
}; -#endif + return detail::neon_load_u(data); } template XSIMD_INLINE batch set(batch const&, requires_arch, float f0, float f1, float f2, float f3) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) - // On MSVC ARM64, use load from array instead of brace initialization alignas(16) float data[] = { f0, f1, f2, f3 }; return vld1q_f32(data); -#else - return float32x4_t { f0, f1, f2, f3 }; -#endif } template @@ -669,30 +673,17 @@ namespace detail { std::complex c0, std::complex c1, std::complex c2, std::complex c3) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) - // On MSVC ARM64, use load from array instead of brace initialization alignas(16) float real_data[] = { c0.real(), c1.real(), c2.real(), c3.real() }; alignas(16) float imag_data[] = { c0.imag(), c1.imag(), c2.imag(), c3.imag() }; return batch, A>(vld1q_f32(real_data), vld1q_f32(imag_data)); -#else - return batch, A>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() }, - float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() }); -#endif } template XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Args... args) noexcept { -#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) - // On MSVC ARM64, use load from array instead of brace initialization using unsigned_type = as_unsigned_integer_t; alignas(16) unsigned_type data[] = { static_cast(args ? -1LL : 0LL)... }; return vld1q_u32(data); -#else - using register_type = typename batch_bool::register_type; - using unsigned_type = as_unsigned_integer_t; - return register_type { static_cast(args ? -1LL : 0LL)... }; -#endif } /*************