From 10ea6df45e4f34c37548ef0cd6f6d5f4525032cb Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 17:24:23 +0300 Subject: [PATCH 01/19] =?UTF-8?q?Add=20design=20spec=20for=20SQ8=E2=86=94F?= =?UTF-8?q?P16=20ARM=20SIMD=20kernels=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stacked on PR #970 (MOD-14954 x86 kernels). Mirrors x86 structure onto NEON_HP / SVE / SVE2 tiers. Zero CMake changes; reuses existing ARM TU compile flags. Scalar fallback already on main serves as reference. Bakes in PR #970 review lessons (assert(dim>=16), 4-accumulator ILP, formula anchor, load_unaligned metadata, dispatcher-routed tier-walk tests). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../specs/2026-05-28-arm-sq8-fp16-design.md | 354 ++++++++++++++++++ 1 file changed, 354 insertions(+) create mode 100644 docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md diff --git a/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md b/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md new file mode 100644 index 000000000..f4188d38b --- /dev/null +++ b/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md @@ -0,0 +1,354 @@ +# SQ8↔FP16 ARM SIMD Distance Kernels — Design Spec + +- **Ticket**: [MOD-14972](https://redislabs.atlassian.net/browse/MOD-14972) +- **Branch**: `dor-forer-sq8-fp16-arm-kernels-mod-14972` +- **Base**: `dor-forer-sq8-fp16-x86-kernels-mod-14954` (PR #970) — stacked +- **Sibling**: MOD-14954 / PR #970 delivers x86 SIMD kernels (AVX-512, AVX2, SSE4) for the same operation + +## Goal + +Add SQ8↔FP16 SIMD distance kernels for IP and L2 on the ARM ISA tiers (NEON_HP, SVE, SVE2). FP16 is the query data type; SQ8 is the stored vector representation. Match the contract and structure of the x86 kernels delivered in PR #970 so dispatch tables, metadata layout, and acceptance criteria stay symmetric across architectures. + +The scalar fallback (`SQ8_FP16_InnerProduct`, `SQ8_FP16_L2Sqr`, `SQ8_FP16_Cosine` in `src/VecSim/spaces/IP/IP.cpp` and `src/VecSim/spaces/L2/L2.cpp`) already exists on `main`. This spec does not modify it; it serves as the reference implementation for all platforms. + +## Algebraic identity (shared with x86 PR + SQ8_FP32 sister) + +``` +IP(x, y) ≈ min · y_sum + delta · Σ(q_i · y_i) +L2(x, y) = x_sum_sq + y_sum_sq - 2 · IP(x, y) +``` + +Hot loop accumulates `Σ(q_i · y_i)` only. No per-element dequantization. FP16 query lanes are widened to FP32 per SIMD chunk; everything in the hot loop is FP32. + +## Metadata layout + +``` +SQ8 storage (pVect1): [uint8 × dim] [min_val] [delta] [x_sum] [x_sum_squares] +FP16 query (pVect2): [float16 × dim] [y_sum] [y_sum_squares] +``` + +Both metadata trailers are FP32 scalars. Storage metadata is not 4-byte aligned whenever `dim % 4 != 0`; query metadata is not 4-byte aligned whenever `dim` is odd. The blanket rule: every FP32 metadata read uses the global `load_unaligned` helper, matching scalar `_Impl` in `IP.cpp` / `L2.cpp`. `sq8` namespace constants: `MIN_VAL`, `DELTA`, `SUM_QUERY`, `SUM_SQUARES`, `SUM_SQUARES_QUERY`. + +## File layout + +``` +src/VecSim/spaces/IP/ + IP_NEON_SQ8_FP16.h (new) + IP_SVE_SQ8_FP16.h (new) — also #included from SVE2.cpp +src/VecSim/spaces/L2/ + L2_NEON_SQ8_FP16.h (new) + L2_SVE_SQ8_FP16.h (new) — also #included from SVE2.cpp +src/VecSim/spaces/functions/ + NEON_HP.cpp (+ Choose_SQ8_FP16_{IP,L2,Cosine}_implementation_NEON_HP) + NEON_HP.h (+ 3 declarations) + SVE.cpp (+ Choose_SQ8_FP16_*_implementation_SVE) + SVE.h (+ 3 declarations) + SVE2.cpp (+ Choose_SQ8_FP16_*_implementation_SVE2; owns its own chooser symbols; instantiates SVE kernel templates under SVE2 compile flags) + SVE2.h (+ 3 declarations) +src/VecSim/spaces/ + IP_space.cpp (2 dispatcher block edits: IP, Cosine) + L2_space.cpp (1 dispatcher block edit) +``` + +**Zero CMake changes.** Existing TU flags carry exactly what we need: + +| TU | Flags | +|----|-------| +| `NEON_HP.cpp` | `-march=armv8.2-a+fp16fml` (covers fp16 cvt + fma) | +| `SVE.cpp` | `-march=armv8-a+sve` (SVE includes f16↔f32 cvt) | +| `SVE2.cpp` | `-march=armv9-a+sve2` | + +## Dispatcher tier order + +Same precedence as existing SQ8_FP32 ARM dispatch: + +```cpp +#ifdef OPT_SVE2 + if (features.sve2 && dim >= 16) { + return Choose_SQ8_FP16_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve && dim >= 16) { + return Choose_SQ8_FP16_IP_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON_HP + if (features.asimdhp && dim >= 16) { + return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim); + } +#endif +// dim < 16 or no ARM SIMD → scalar fallback (existing return at function tail) +``` + +The `dim >= 16` guard in the dispatcher is what lets each SIMD kernel hold an internal `assert(dim >= 16)` as a real precondition. Edge cases for `dim < 16` are routed to scalar. + +## NEON kernel design + +### Header: `IP_NEON_SQ8_FP16.h` + +Template signature mirrors SQ8_FP32 NEON sister: + +```cpp +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, size_t dimension); +``` + +Hot loop — 16 lanes per iteration, 4 FP32 accumulators: + +```cpp +// SQ8 load: 16 × uint8 → 4 × float32x4_t +uint8x16_t v1_u8 = vld1q_u8(pVect1); +uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8)); +uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8)); +float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo))); +float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo))); +float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi))); +float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi))); + +// FP16 query load: 16 × f16 → 4 × float32x4_t via vcvt_f32_f16 +float16x8_t q_lo = vld1q_f16(pVect2); +float16x8_t q_hi = vld1q_f16(pVect2 + 8); +float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo)); +float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo)); +float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi)); +float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi)); + +// 4-accumulator FMA +sum0 = vfmaq_f32(sum0, v1_0, v2_0); +sum1 = vfmaq_f32(sum1, v1_1, v2_1); +sum2 = vfmaq_f32(sum2, v1_2, v2_2); +sum3 = vfmaq_f32(sum3, v1_3, v2_3); +``` + +Residual ladder (`dim % 16`, residual 0..15): + +- **`residual >= 8`**: one 8-lane safe load each side — `vld1_u8` (8 bytes) for SQ8 and `vld1q_f16` (8 × FP16 = 16 bytes, fits before query metadata) for FP16. Convert + FMA. Remaining `residual - 8` lanes handled scalar. +- **`residual < 8`**: full scalar residual loop using `vecsim_types::FP16_to_FP32`. + +Rationale: a 16-byte SQ8 load (`vld1q_u8`) or a 16-byte FP16 load (`vld1q_f16` past the 8-lane boundary) on a residual < 8 would overread past valid query data into metadata — `y_sum` is only 4 bytes for IP and `y_sum_sq` adds 4 more for L2, not enough headroom for an 8-lane FP16 load. + +Final reduction: `vaddvq_f32(sum0 + sum1 + sum2 + sum3)`, then return `min_val * y_sum + delta * quantized_dot`. + +`assert(dim >= 16)` at the top. + +### Header: `L2_NEON_SQ8_FP16.h` + +Calls `SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(...)` to compute raw IP, then returns `x_sum_sq + y_sum_sq - 2.0f * ip`. Mirrors `L2_NEON_SQ8_FP32.h` exactly. + +### Wrapper symbols (NEON_HP.cpp) + +```cpp +dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) { + dist_func_t ret; + CHOOSE_IMPLEMENTATION(ret, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP); + return ret; +} +// L2 + Cosine identical shape (Cosine reuses IP wrapper per repo convention) +``` + +## SVE kernel design + +### Header: `IP_SVE_SQ8_FP16.h` + +Template signature mirrors SVE SQ8_FP32 sister: + +```cpp +template +float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension); +``` + +Inner step (one SVE vector width `svcntw()` lanes of FP32): + +```cpp +svbool_t pg = svptrue_b32(); +// SQ8: zero-extend uint8 → uint32 (predicated b32 load) +svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); +svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); +// FP16: load chunk fp16 lanes, widen to fp32 +svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk)); +svfloat16_t q_h = svld1_f16(pg16, pVect2 + offset); +svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h); // verify exact ACLE/packing during impl +sum = svmla_f32_x(pg, sum, v1_f, v2_f); +offset += chunk; +``` + +**ACLE caveat**: exact f16→f32 widening intrinsic and lane packing — confirm `svcvt_f32_f16_x(pg, q_h)` compiles cleanly against the loaded `svfloat16_t`. If lane packing needs an unpack/interleave step, verify against `IP_SVE_FP16.h`. + +4 accumulators `sum0..sum3`; main loop processes 4 chunks via 4 `InnerProductStep` calls. `partial_chunk` template branch handles `dim % chunk` via `svwhilelt_b32`. + +Inactive-lane discipline on the partial path: the predicated `svld1_f16` / `svld1ub_u32` cover lane *liveness*, but the final reduction with `svaddv_f32(svptrue_b32(), ...)` walks *all* lanes. To keep inactive lanes from contributing garbage, the partial step uses the zeroing form `svmla_f32_z(pg_partial, sum0, v1_f, v2_f)` (matches `IP_SVE_SQ8_FP32.h` partial-chunk pattern). Alternative: reduce only active lanes via `svaddv_f32(pg_partial, sum0)` for the partial-step accumulator, then sum into the main reduction. The `_z` form is the simpler choice and is what the SQ8_FP32 SVE sister already does. + +Predicate widths on the partial path: FP32 math (load/widen/mla) uses a `b32` predicate sized to `remaining` 32-bit lanes (`svwhilelt_b32(0, remaining)`); the FP16 query load needs its own `b16` predicate sized to the same `remaining` half lanes (`svwhilelt_b16(0, remaining)`) since `svld1_f16` is governed by a 16-bit predicate. SQ8 load via `svld1ub_u32` is governed by the `b32` predicate (it widens uint8 → uint32 lanewise). + +Final reduction: `svaddv_f32(svptrue_b32(), sum0 + sum1 + sum2 + sum3)`. + +### Header: `L2_SVE_SQ8_FP16.h` + +Calls `SQ8_FP16_InnerProductSIMD_SVE_IMP(...)` then returns `x_sum_sq + y_sum_sq - 2.0f * ip`. Mirrors `L2_SVE_SQ8_FP32.h`. + +### Wrapper symbols + +`SVE.cpp`: + +```cpp +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) { + dist_func_t ret; + CHOOSE_SVE_IMPLEMENTATION(ret, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); + return ret; +} +// L2 + Cosine identical shape +``` + +`SVE2.cpp`: + +```cpp +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" + +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) { + dist_func_t ret; + CHOOSE_SVE_IMPLEMENTATION(ret, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); + return ret; +} +// L2 + Cosine identical shape +``` + +SVE2 owns its own chooser symbols (does **not** call the SVE chooser); template instantiated under SVE2 compile flags. + +## Tests + +### Class + +Branch base is PR #970. During implementation, verify whether the base branch already exposes `SQ8_FP16_SpacesOptimizationTest` (extend) or only `SQ8_FP16_NoOptimizationSpacesTest` (add the optimization class here mirroring `SQ8_FP32_SpacesOptimizationTest`). + +### Tier-walk pattern + +Per-tier `if (features.)` block; **unset higher flag** after each block so the next tier is exercised on hosts that support multiple ISAs. Do not use `GTEST_SKIP()` here — it would abort the entire walk. + +```cpp +auto expected = SQ8_FP16_InnerProduct; // scalar reference + +#ifdef OPT_SVE2 + if (features.sve2) { + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &features); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE2(dim)) + << "SVE2 dispatch mismatch"; + ASSERT_NEAR(arch_opt_func(v1, v2, dim), expected(v1, v2, dim), 0.01); + features.sve2 = 0; // exercise next tier + } +#endif +#ifdef OPT_SVE + if (features.sve) { /* same shape */ features.sve = 0; } +#endif +#ifdef OPT_NEON_HP + if (features.asimdhp) { /* same shape */ features.asimdhp = 0; } +#endif +// final fallback assertion: IP_SQ8_FP16_GetDistFunc(...) == SQ8_FP16_InnerProduct (scalar) +``` + +Three dispatch entry points exercised per tier: `IP_SQ8_FP16_GetDistFunc`, `L2_SQ8_FP16_GetDistFunc`, `Cosine_SQ8_FP16_GetDistFunc`. + +### Scalar-fallback tests + +`GetDistFuncSQ8FP16Asymmetric` — currently asserts `dim=128` returns scalar; that assertion breaks once SIMD dispatch lands. Change to `dim=15` (below the `dim >= 16` SIMD threshold). Add a small `dim=0` (empty) scalar-fallback assertion to cover the Jira "empty" edge case. + +### Dim parameterization + +Base branch already has both parameterized suites against `SQ8_FP16_SpacesOptimizationTest`: +- `SQ8_FP16_SIMD` — `testing::Range(16UL, 33UL)` (dims 16..32; residual + threshold boundaries) +- `SQ8_FP16_SIMD_HighDim` — `64, 128, 256, 512, 1024` (multi-iteration main loop) + +Both suites pick up the ARM tier-walk additions automatically since the test class body is what's extended. No new instantiation needed. + +### Tier coverage report + +`SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` (test_spaces.cpp) currently reports only x86 tiers. Extend it with ARM tier entries (SVE2 / SVE / NEON_HP) so an ARM-only SIMD host reports its exercised tiers instead of going silent. + +## Microbench + +`tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` already registers x86 ISA benchmarks. Add ARM registrations under `#ifdef OPT_*` guards using the existing `bm_spaces.h` macros: + +```cpp +#ifdef CPU_FEATURES_ARCH_AARCH64 + cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; + bool sve2_supported = opt.sve2; + bool sve_supported = opt.sve; + bool neon_hp_supported = opt.asimdhp; +#ifdef OPT_SVE2 + INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); + INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); +#endif +#ifdef OPT_SVE + INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); + INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); +#endif +#ifdef OPT_NEON_HP + INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); + INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 +``` + +Verify exact `cpu_features` helper names against the x86 sister block already in `bm_spaces_sq8_fp16.cpp` (e.g. `GetX86Info`). + +`bm_spaces_sq8_fp16` and `bm_spaces_sq8_fp32` are separate executables; the per-ISA throughput comparison requested by Jira is done by running both benches and comparing matched ISA rows. + +## Acceptance criteria (Jira MOD-14972 → spec mapping) + +| Jira requirement | Where this spec delivers it | +|------------------|------------------------------| +| Kernels: IP + L2 for NEON | NEON_HP TU hosts kernel headers + chooser symbols | +| Kernels: IP + L2 for SVE | SVE TU hosts kernel headers + chooser symbols | +| Kernels: IP + L2 for SVE2 | SVE2 TU includes SVE headers, instantiates templates under SVE2 flags | +| Scalar fallback (reference for all platforms) | Already present in `IP.cpp` / `L2.cpp`; unchanged | +| FP16 query → FP32 per SIMD chunk | `vcvt_f32_f16` (NEON), `svcvt_f32_f16_x` (SVE) | +| FP32 metadata + correction terms | `load_unaligned` for all FP32 trailer scalars | +| Wire into dispatch table per ISA flag | `IP_space.cpp` (2 blocks), `L2_space.cpp` (1 block), `OPT_SVE2/SVE/NEON_HP` | +| Unit tests vs. scalar reference per ISA | Tier-walk in `SQ8_FP16_SpacesOptimizationTest` | +| Edge cases (empty, dim-alignment boundaries) | `dim=0` + `dim=15` scalar tests; `dim=16..32` SIMD boundary param suite | +| Microbench per ISA throughput vs. SQ8↔FP32 | ARM registrations in `bm_spaces_sq8_fp16.cpp`; matched-ISA comparison vs. `bm_spaces_sq8_fp32` | + +## Diff size estimate + +| Area | Files | LoC (rough) | +|------|-------|-------------| +| Kernel headers | 4 new | ~600 | +| Dispatcher TU additions | NEON_HP.cpp/h, SVE.cpp/h, SVE2.cpp/h | ~80 | +| Dispatcher wiring | IP_space.cpp, L2_space.cpp | ~45 | +| Tests | test_spaces.cpp | ~80 | +| Bench | bm_spaces_sq8_fp16.cpp | ~25 | +| CMakeLists.txt | none | 0 | +| **Total** | **~10 files** | **~830** | + +## PR mechanics + +- **Branch**: `dor-forer-sq8-fp16-arm-kernels-mod-14972` +- **Base branch**: `dor-forer-sq8-fp16-x86-kernels-mod-14954` (PR #970) +- **PR target**: opens against PR #970 head; retarget to `main` once #970 merges +- **Commit prefix**: `[MOD-14972]` (matches repo convention) +- **PR title**: `Add SQ8↔FP16 ARM SIMD distance kernels [MOD-14972]` + +## Verification gates before opening PR + +1. **x86 host build clean** — verifies generic dispatch and tests remain clean; ARM kernels require ARM build or cross-compile, so the kernels themselves are not exercised here. +2. **ARM host build + unit tests** — NEON_HP / SVE / SVE2 paths exercised. Requires coordination with the user for ARM hardware or a cross-compile setup. +3. **ASan clean** on every host that runs unit tests. +4. **Microbench compiles + runs on ARM host.** + +## Out of scope (deferred, separate PRs) + +- Dispatcher-routed edge-case tests (`ZeroQueryTest`, `ConstantStorageTest`, `MixedSignQueryTest`) — they currently bypass the dispatcher and call scalar directly; cross-arch debt, also PR #970 H1. +- Multi-accumulator ILP tuning beyond the 4-accumulator baseline established here. +- Unrelated x86 review-feedback fixes (M1–M4, H1–H2 on x86 files from PR #970 review). This ARM PR will modify some files that PR #970 also touches (dispatchers, test class, bench), but only with ARM-relevant additions — x86 review fixes land in #970. + +## Inheritance from PR #970 review findings + +The following lessons from the PR #970 review are baked into this design so they do not need to be re-flagged on ARM kernels: + +- `assert(dim >= 16)` at the top of every kernel template (paired with dispatcher `dim >= 16` guard). +- 4-accumulator ILP in both NEON and SVE hot loops. +- Algebraic-identity formula anchor comment at the top of each kernel header. +- `load_unaligned` for all FP32 metadata reads (matches scalar). +- Dispatcher-routed tier-walk test pattern (no scalar-bypass). +- Per-ISA microbench registration alongside SQ8↔FP32 sister for direct comparison. From c061da9075f1b82b8d181c39f2e1502d14b3ae92 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 17:37:01 +0300 Subject: [PATCH 02/19] =?UTF-8?q?Add=20implementation=20plan=20for=20SQ8?= =?UTF-8?q?=E2=86=94FP16=20ARM=20SIMD=20kernels=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 14 bite-sized tasks following the spec at 2026-05-28-arm-sq8-fp16-design.md. Each task ends in a commit; assistant runs tests/ASan/benchmarks after the user confirms each ARM build cycle. Zero CMake changes; PR stacks on #970. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../plans/2026-05-28-arm-sq8-fp16-kernels.md | 1195 +++++++++++++++++ 1 file changed, 1195 insertions(+) create mode 100644 docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md diff --git a/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md b/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md new file mode 100644 index 000000000..2759ba046 --- /dev/null +++ b/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md @@ -0,0 +1,1195 @@ +# SQ8↔FP16 ARM SIMD Distance Kernels — Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add SQ8↔FP16 asymmetric distance kernels (IP, L2, Cosine) for ARM ISA tiers — NEON_HP, SVE, SVE2 — plugged into the existing dispatcher. Mirrors the x86 work delivered in PR #970. + +**Architecture:** Header-only SIMD kernel templates (one per metric × ISA), instantiated via the existing `CHOOSE_IMPLEMENTATION` / `CHOOSE_SVE_IMPLEMENTATION` macros inside ISA-specific TUs (`NEON_HP.cpp`, `SVE.cpp`, `SVE2.cpp`). Wiring lives in `IP_space.cpp` and `L2_space.cpp` under a `#ifdef CPU_FEATURES_ARCH_AARCH64` block that parallels the existing x86 block. L2 reuses the IP `_IMP` template via the algebraic identity `L2² = x_sum_sq + y_sum_sq − 2·IP`. Scalar fallback already on `main` is unchanged and stays as the reference for every tier. + +**Tech Stack:** C++20, ARM NEON intrinsics (`arm_neon.h`), ARM SVE/SVE2 intrinsics (`arm_sve.h`), GoogleTest, Google Benchmark, cpu_features. + +**Branch:** `dor-forer-sq8-fp16-arm-kernels-mod-14972` (stacked on PR #970 / `dor-forer-sq8-fp16-x86-kernels-mod-14954`). + +**Build / test loop:** The user runs `make build` (per project memory). After each build cycle confirmed, the assistant runs `make unit_test` / ASan / benchmarks on the appropriate host (ARM hardware or cross-compile/qemu — coordinate with user). Each task ends in a commit; commits are pushed only when explicitly requested. + +**Spec:** [`docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md`](../specs/2026-05-28-arm-sq8-fp16-design.md) + +--- + +## File Structure + +### Files created + +| Path | Responsibility | +|------|----------------| +| `src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h` | NEON IP kernel template (`SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP` + thin wrappers) | +| `src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h` | NEON L2 kernel template (calls NEON IP impl, applies L2 identity) | +| `src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h` | SVE IP kernel template (`SQ8_FP16_InnerProductSIMD_SVE_IMP` + wrappers); also `#include`d from SVE2.cpp | +| `src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h` | SVE L2 kernel template; also `#include`d from SVE2.cpp | + +### Files modified + +| Path | Change | +|------|--------| +| `src/VecSim/spaces/functions/NEON_HP.h` | +3 chooser declarations (IP, L2, Cosine) | +| `src/VecSim/spaces/functions/NEON_HP.cpp` | +#include kernel headers; +3 chooser definitions | +| `src/VecSim/spaces/functions/SVE.h` | +3 chooser declarations | +| `src/VecSim/spaces/functions/SVE.cpp` | +#include kernel headers; +3 chooser definitions | +| `src/VecSim/spaces/functions/SVE2.h` | +3 chooser declarations | +| `src/VecSim/spaces/functions/SVE2.cpp` | +#include SVE kernel headers; +3 chooser definitions (own symbols, templates instantiated under SVE2 compile flags) | +| `src/VecSim/spaces/IP_space.cpp` | +#ifdef AArch64 block in `IP_SQ8_FP16_GetDistFunc` and `Cosine_SQ8_FP16_GetDistFunc` (2 dispatcher blocks) | +| `src/VecSim/spaces/L2_space.cpp` | +#ifdef AArch64 block in `L2_SQ8_FP16_GetDistFunc` (1 dispatcher block) | +| `tests/unit/test_spaces.cpp` | retarget `GetDistFuncSQ8FP16Asymmetric` to dim=15; add dim=0 test; extend the three `SQ8_FP16_SpacesOptimizationTest` test bodies with ARM tier walks; extend `SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` with AArch64 tier reporting | +| `tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` | +AArch64 `cpu_features` block; +ARM ISA benchmark registrations | + +### Files NOT modified + +`src/VecSim/spaces/CMakeLists.txt` — zero CMake changes. Existing TU flags (`-march=armv8.2-a+fp16fml` for NEON_HP, `-march=armv8-a+sve` for SVE, `-march=armv9-a+sve2` for SVE2) already carry everything the new kernels need. + +--- + +## Task 1: Retarget the scalar-fallback dispatcher test + +**Why first:** Builds and runs on x86 today, has nothing to do with the ARM kernels, and tightens the contract the rest of the plan relies on (the dispatcher returns scalar for `dim < 16`). + +**Files:** +- Modify: `tests/unit/test_spaces.cpp` — locate test named `GetDistFuncSQ8FP16Asymmetric` (added by PR #970; currently asserts `dim=128` returns the scalar fallback) + +- [ ] **Step 1: Locate the existing test** + +Run: +```bash +grep -n 'GetDistFuncSQ8FP16Asymmetric' tests/unit/test_spaces.cpp +``` +Expected: one or more line hits pointing at the `TEST(...)` block. + +- [ ] **Step 2: Modify the test to cover dim=0 and dim=15 instead of dim=128** + +Replace the body of the existing `TEST(..., GetDistFuncSQ8FP16Asymmetric)` so it walks two below-threshold dims and asserts the scalar fallback for each of L2 / IP / Cosine. Drop in this exact body (rename the test fixture symbol to match what is already there if it differs): + +```cpp +TEST_F(SpacesTest, GetDistFuncSQ8FP16Asymmetric) { + // SQ8 storage with FP16 query (asymmetric) - should return SQ8_FP16 functions. + // Per-ISA dispatcher walk coverage lives in the SQ8_FP16 SpacesOptimizationTest below. + // + // Walk two below-threshold dims (0 and 15) so the assertions hold regardless of which + // SIMD tiers the host advertises: dim < 16 must always short-circuit to scalar fallback. + // The template-mapping form (spaces::GetDistFunc) and the direct + // *_SQ8_FP16_GetDistFunc form must agree for every dim, and both must match the scalar + // reference at sub-threshold dims. + for (size_t dim : {static_cast(0), static_cast(15)}) { + auto l2_func = spaces::GetDistFunc(VecSimMetric_L2, dim, nullptr); + auto ip_func = spaces::GetDistFunc(VecSimMetric_IP, dim, nullptr); + auto cosine_func = + spaces::GetDistFunc(VecSimMetric_Cosine, dim, nullptr); + + ASSERT_EQ(l2_func, L2_SQ8_FP16_GetDistFunc(dim, nullptr)) + << "Template mapping disagrees with direct dispatcher for L2 at dim=" << dim; + ASSERT_EQ(ip_func, IP_SQ8_FP16_GetDistFunc(dim, nullptr)) + << "Template mapping disagrees with direct dispatcher for IP at dim=" << dim; + ASSERT_EQ(cosine_func, Cosine_SQ8_FP16_GetDistFunc(dim, nullptr)) + << "Template mapping disagrees with direct dispatcher for Cosine at dim=" << dim; + + ASSERT_EQ(l2_func, SQ8_FP16_L2Sqr) + << "dim=" << dim << " must short-circuit to scalar L2 fallback"; + ASSERT_EQ(ip_func, SQ8_FP16_InnerProduct) + << "dim=" << dim << " must short-circuit to scalar IP fallback"; + ASSERT_EQ(cosine_func, SQ8_FP16_Cosine) + << "dim=" << dim << " must short-circuit to scalar Cosine fallback"; + } +} +``` + +- [ ] **Step 3: User builds** + +Ask the user to run `make build` (their normal x86 build is sufficient — this test is host-agnostic). + +- [ ] **Step 4: Run the test** + +Run: +```bash +./bin//unit_tests --gtest_filter='SpacesTest.GetDistFuncSQ8FP16Asymmetric' +``` +(Use `find bin -name unit_tests -type f` if the host-triple subdir is unknown.) +Expected: PASS. + +- [ ] **Step 5: Commit** + +```bash +git add tests/unit/test_spaces.cpp +git commit -m "Retarget SQ8↔FP16 scalar-fallback dispatcher test to dim=0/15 [MOD-14972]" +``` + +--- + +## Task 2: NEON IP kernel header + +**Files:** +- Create: `src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h` + +- [ ] **Step 1: Author the kernel file** + +Create exactly this file (modeled on `IP_NEON_SQ8_FP32.h` + the NEON FP16 widening pattern from `IP_NEON_FP16.h`): + +```cpp +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include +#include + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity: + * + * IP(x, y) = sum(x_i * y_i) + * ~= sum((min + delta * q_i) * y_i) + * = min * y_sum + delta * sum(q_i * y_i) + * + * The hot loop only accumulates sum(q_i * y_i) - no per-element dequantisation. + * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk. + */ + +// Helper: 16 lanes per call, four FP32 accumulators (one per quarter). +static inline void +SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2, + float32x4_t &sum0, float32x4_t &sum1, + float32x4_t &sum2, float32x4_t &sum3) { + // SQ8 storage: 16 * uint8 -> 4 * float32x4_t + uint8x16_t v1_u8 = vld1q_u8(pVect1); + uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8)); + uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8)); + float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo))); + float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo))); + float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi))); + float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi))); + + // FP16 query: 16 * f16 -> 4 * float32x4_t via vcvt_f32_f16 + const float16_t *q = reinterpret_cast(pVect2); + float16x8_t q_lo = vld1q_f16(q); + float16x8_t q_hi = vld1q_f16(q + 8); + float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo)); + float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo)); + float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi)); + float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi)); + + sum0 = vfmaq_f32(sum0, v1_0, v2_0); + sum1 = vfmaq_f32(sum1, v1_1, v2_1); + sum2 = vfmaq_f32(sum2, v1_2, v2_2); + sum3 = vfmaq_f32(sum3, v1_3, v2_3); + + pVect1 += 16; + pVect2 += 16; +} + +// pVect1v = SQ8 storage, pVect2v = FP16 query +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16"); + + const uint8_t *pVect1 = static_cast(pVect1v); // SQ8 storage + const float16 *pVect2 = static_cast(pVect2v); // FP16 query + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + const size_t num_of_chunks = dimension / 16; + for (size_t i = 0; i < num_of_chunks; i++) { + SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3); + } + + // Residual handling: dim % 16 lanes. + // residual >= 8: one safe 8-lane SQ8 + 8-lane FP16 load (FP16 trailer is wide enough). + // residual < 8: scalar-only - a 4-lane FP16 load would overread y_sum metadata. + constexpr unsigned char r = residual; + if constexpr (r >= 8) { + uint8x8_t v1_u8 = vld1_u8(pVect1); + uint16x8_t v1_u16 = vmovl_u8(v1_u8); + float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_u16))); + float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_u16))); + float16x8_t q_h = vld1q_f16(reinterpret_cast(pVect2)); + float32x4_t v2_a = vcvt_f32_f16(vget_low_f16(q_h)); + float32x4_t v2_b = vcvt_f32_f16(vget_high_f16(q_h)); + sum0 = vfmaq_f32(sum0, v1_a, v2_a); + sum1 = vfmaq_f32(sum1, v1_b, v2_b); + pVect1 += 8; + pVect2 += 8; + } + // Lane-by-lane scalar for the final 0..7 (residual % 8) elements. + constexpr unsigned char tail = r & 0x7; + float scalar_dot = 0.0f; + for (unsigned char k = 0; k < tail; ++k) { + scalar_dot += static_cast(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]); + } + + // Reduce the four NEON accumulators. + float32x4_t sum_lo = vaddq_f32(sum0, sum1); + float32x4_t sum_hi = vaddq_f32(sum2, sum3); + float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot; + + // Metadata loads - use load_unaligned because odd dim leaves trailers unaligned. + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float min_val = + load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = + load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum = + load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template +float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + return 1.0f - + SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); +} + +template +float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { + // Cosine = 1 - IP (vectors are pre-normalised); reuses the IP wrapper. + return SQ8_FP16_InnerProductSIMD16_NEON_HP(pVect1v, pVect2v, dimension); +} +``` + +- [ ] **Step 2: Header-only smoke (no build yet)** + +Run: +```bash +grep -n 'load_unaligned\|FP16_to_FP32' src/VecSim/spaces/space_includes.h \ + src/VecSim/spaces/IP/IP.cpp src/VecSim/types/float16.h 2>/dev/null +``` +Expected: confirm the global `load_unaligned` is reachable through `space_includes.h` (matches the include path used by `IP_NEON_SQ8_FP32.h`) and `FP16_to_FP32` is reachable through `VecSim/types/float16.h`. If either include is missing, add it. + +- [ ] **Step 3: Commit** + +```bash +git add src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h +git commit -m "Add NEON_HP SQ8↔FP16 IP kernel header [MOD-14972]" +``` + +--- + +## Task 3: NEON L2 kernel header + +**Files:** +- Create: `src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h` + +- [ ] **Step 1: Author the kernel file** + +```cpp +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity: + * + * ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2) + * = x_sum_squares - 2 * IP(x, y) + y_sum_squares + * + * IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32. + */ + +template // 0..15 +float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = + SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float x_sum_sq = + load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + + const uint8_t *query_meta_bytes = reinterpret_cast( + static_cast(pVect2v) + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} +``` + +- [ ] **Step 2: Commit** + +```bash +git add src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h +git commit -m "Add NEON_HP SQ8↔FP16 L2 kernel header [MOD-14972]" +``` + +--- + +## Task 4: NEON_HP dispatcher TU additions + +**Files:** +- Modify: `src/VecSim/spaces/functions/NEON_HP.h` — add 3 declarations +- Modify: `src/VecSim/spaces/functions/NEON_HP.cpp` — add 3 chooser definitions + +- [ ] **Step 1: Add chooser declarations to NEON_HP.h** + +In `src/VecSim/spaces/functions/NEON_HP.h`, inside `namespace spaces { ... }`, append these three declarations alongside the existing `Choose_FP16_*_implementation_NEON_HP`: + +```cpp +dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim); +``` + +- [ ] **Step 2: Add chooser definitions to NEON_HP.cpp** + +In `src/VecSim/spaces/functions/NEON_HP.cpp`, add the kernel `#include`s alongside the existing FP16 includes: + +```cpp +#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_NEON_SQ8_FP16.h" +``` + +Then inside `namespace spaces { ... }` (between `#include "implementation_chooser.h"` and `#include "implementation_chooser_cleanup.h"`), append: + +```cpp +dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_NEON_HP); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_NEON_HP); + return ret_dist_func; +} +``` + +- [ ] **Step 3: Commit** + +```bash +git add src/VecSim/spaces/functions/NEON_HP.h src/VecSim/spaces/functions/NEON_HP.cpp +git commit -m "Wire NEON_HP SQ8↔FP16 choosers [MOD-14972]" +``` + +--- + +## Task 5: NEON_HP dispatcher wiring in IP_space.cpp + L2_space.cpp + +**Files:** +- Modify: `src/VecSim/spaces/IP_space.cpp` — `IP_SQ8_FP16_GetDistFunc` + `Cosine_SQ8_FP16_GetDistFunc` +- Modify: `src/VecSim/spaces/L2_space.cpp` — `L2_SQ8_FP16_GetDistFunc` + +Each of those three `_GetDistFunc` functions currently has an `#ifdef CPU_FEATURES_ARCH_X86_64` block with an early `if (dim < 16) return ret_dist_func;` guard followed by per-tier dispatch. We append an `#ifdef CPU_FEATURES_ARCH_AARCH64` block with the matching shape. Only NEON_HP is wired in this task; SVE/SVE2 land in a later task. + +- [ ] **Step 1: Confirm the #include for NEON_HP.h is present** + +Run: +```bash +grep -n 'functions/NEON_HP.h' src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp +``` +Expected: both files already `#include "VecSim/spaces/functions/NEON_HP.h"`. If a file is missing it, add the include. + +- [ ] **Step 2: Wire IP_SQ8_FP16_GetDistFunc** + +In `src/VecSim/spaces/IP_space.cpp`, locate `IP_SQ8_FP16_GetDistFunc`. After the closing `#endif // x86_64`, insert a parallel AArch64 block immediately before the trailing `return ret_dist_func;`: + +```cpp +#ifdef CPU_FEATURES_ARCH_AARCH64 + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_NEON_HP + if (features.asimdhp) { + // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers + // leave *alignment untouched on ARM tiers. The corresponding tests assert + // 0xFF passthrough on the scalar path and do not assert any non-zero value here. + return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim); + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 +``` + +- [ ] **Step 3: Wire Cosine_SQ8_FP16_GetDistFunc** + +In the same file, locate `Cosine_SQ8_FP16_GetDistFunc`. Insert the same block, swapping `Choose_SQ8_FP16_IP_implementation_NEON_HP` for `Choose_SQ8_FP16_Cosine_implementation_NEON_HP`. + +- [ ] **Step 4: Wire L2_SQ8_FP16_GetDistFunc** + +In `src/VecSim/spaces/L2_space.cpp`, locate `L2_SQ8_FP16_GetDistFunc`. Insert the same block, swapping the call for `Choose_SQ8_FP16_L2_implementation_NEON_HP`. + +- [ ] **Step 5: User builds** + +Ask the user to run `make build` — first time the new NEON_HP TU additions compile. If they have ARM hardware or a cross-compile target, that build path; otherwise the x86 build must at least confirm the new headers don't accidentally break non-ARM compilation (the new headers are only `#include`d from `NEON_HP.cpp`, which is excluded on non-ARM hosts, so x86 builds should be clean). + +- [ ] **Step 6: Commit** + +```bash +git add src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp +git commit -m "Dispatch SQ8↔FP16 to NEON_HP tier on AArch64 [MOD-14972]" +``` + +--- + +## Task 6: Extend `SQ8_FP16_SpacesOptimizationTest` with NEON_HP tier-walk + +**Files:** +- Modify: `tests/unit/test_spaces.cpp` — three test bodies (`SQ8_FP16_L2SqrTest`, `SQ8_FP16_InnerProductTest`, `SQ8_FP16_CosineTest`) + +After the existing `#ifdef OPT_SSE4` block in each test, append: + +- [ ] **Step 1: Add NEON_HP tier to L2 test** + +In `SQ8_FP16_L2SqrTest`, immediately after the closing `#endif` that follows the SSE4 block and before `// Scalar fallback`: + +```cpp +#ifdef OPT_NEON_HP + if (optimization.asimdhp) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "NEON_HP with dim " << dim; + optimization.asimdhp = 0; + } +#endif +``` + +- [ ] **Step 2: Add NEON_HP tier to IP test** + +In `SQ8_FP16_InnerProductTest`, append the same block but swap `L2_SQ8_FP16_GetDistFunc` → `IP_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_L2_implementation_NEON_HP` → `Choose_SQ8_FP16_IP_implementation_NEON_HP`. + +- [ ] **Step 3: Add NEON_HP tier to Cosine test** + +In `SQ8_FP16_CosineTest`, append the same block with `Cosine_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_Cosine_implementation_NEON_HP`. + +- [ ] **Step 4: Confirm the include path for the NEON_HP chooser declarations** + +Run: +```bash +grep -n 'functions/NEON_HP.h' tests/unit/test_spaces.cpp +``` +Expected: include present. If not, add `#include "VecSim/spaces/functions/NEON_HP.h"` near the other space-function includes at the top of the file. + +- [ ] **Step 5: User builds (ARM target)** + +Ask the user to run `make build` for an ARM target (hardware or cross-compile). On x86 the new test code is gated by `#ifdef OPT_NEON_HP` and stays inert. + +- [ ] **Step 6: Run NEON_HP tests** + +Once the ARM build is reported clean, run: +```bash +./bin//unit_tests --gtest_filter='SQ8_FP16_*Test*' +``` +Expected: all parametrized cases PASS, including the dims-16..32 and high-dim suites. + +- [ ] **Step 7: Commit** + +```bash +git add tests/unit/test_spaces.cpp +git commit -m "Extend SQ8↔FP16 tier-walk tests with NEON_HP [MOD-14972]" +``` + +--- + +## Task 7: SVE IP kernel header + +**Files:** +- Create: `src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h` + +- [ ] **Step 1: Author the kernel file** + +Modeled on `IP_SVE_SQ8_FP32.h`. The shape: an `InnerProductStep` helper that consumes `chunk = svcntw()` FP32 lanes per call (FP16 query loaded under a `b16` predicate, SQ8 storage under a `b32` predicate that drives uint8→uint32 widening), then a templated `_IMP` over ``. + +```cpp +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include +#include + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity: + * + * IP(x, y) ~= min * y_sum + delta * sum(q_i * y_i) + * + * Hot loop accumulates sum(q_i * y_i) only; FP16 query lanes are widened to FP32 + * inside each step via svcvt_f32_f16_x. Metadata loads use load_unaligned. + */ + +// Helper: one SVE-vector-width-of-FP32 step. +// chunk = svcntw() - number of FP32 lanes per step. +// pg = svptrue_b32() - predicate for FP32 lanes. +static inline void +SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset, + svfloat32_t &sum, svbool_t pg, size_t chunk) { + // SQ8 -> uint32 (widen on load), then to FP32. + svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); + svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); + + // FP16 query -> FP32. svld1_f16 uses a b16 predicate sized to `chunk` half lanes. + svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk)); + svfloat16_t q_h = + svld1_f16(pg16, reinterpret_cast(pVect2) + offset); + svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h); + + sum = svmla_f32_x(pg, sum, v1_f, v2_f); + offset += chunk; +} + +// pVect1v = SQ8 storage, pVect2v = FP16 query +template +float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16"); + + const uint8_t *pVect1 = static_cast(pVect1v); + const float16 *pVect2 = static_cast(pVect2v); + size_t offset = 0; + svbool_t pg = svptrue_b32(); + const size_t chunk = svcntw(); + + svfloat32_t sum0 = svdup_f32(0.0f); + svfloat32_t sum1 = svdup_f32(0.0f); + svfloat32_t sum2 = svdup_f32(0.0f); + svfloat32_t sum3 = svdup_f32(0.0f); + + // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero - + // the final reduction below walks all lanes via svptrue_b32(). + if constexpr (partial_chunk) { + size_t remaining = dimension % chunk; + if (remaining > 0) { + svbool_t pg_partial = + svwhilelt_b32(uint32_t(0), uint32_t(remaining)); + svbool_t pg16_partial = + svwhilelt_b16(uint32_t(0), uint32_t(remaining)); + svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset); + svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); + svfloat16_t q_h = svld1_f16( + pg16_partial, reinterpret_cast(pVect2) + offset); + svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h); + sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f); + offset += remaining; + } + } + + // Main loop: 4 chunks per iteration via 4 accumulators. + const size_t chunk_size = 4 * chunk; + const size_t number_of_chunks = + (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; + for (size_t i = 0; i < number_of_chunks; i++) { + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk); + } + + // Additional steps 0..3. + if constexpr (additional_steps > 0) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); + if constexpr (additional_steps > 1) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk); + if constexpr (additional_steps > 2) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); + + svfloat32_t sum = svadd_f32_x(pg, sum0, sum1); + sum = svadd_f32_x(pg, sum, sum2); + sum = svadd_f32_x(pg, sum, sum3); + float quantized_dot = svaddv_f32(pg, sum); + + // Metadata loads - unaligned because odd dim leaves trailers unaligned. + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float min_val = + load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = + load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const uint8_t *query_meta_bytes = reinterpret_cast( + static_cast(pVect2v) + dimension); + const float y_sum = + load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template +float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, + size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP( + pVect1v, pVect2v, dimension); +} + +template +float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return SQ8_FP16_InnerProductSIMD_SVE( + pVect1v, pVect2v, dimension); +} +``` + +**Note for the implementer:** `svcvt_f32_f16_x(pg, q_h)` widens *the lower half of `q_h`'s lanes* to FP32 (one widening, b32-predicated). If the ACLE on the target toolchain rejects this pairing (e.g. ARM RVCT vs LLVM disagreement), verify the FP16->FP32 widening sequence against the actual ARM build output and adjust as needed (potential alternatives: explicit `svunpklo_*` unpack-then-widen, or operating on the lower half lanes by reinterpretation). Commit only after the build is clean. Do not blindly copy `IP_SVE_FP16.h`'s pattern - that file accumulates in FP16 and is not a direct widening reference. + +- [ ] **Step 2: Commit** + +```bash +git add src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h +git commit -m "Add SVE SQ8↔FP16 IP kernel header [MOD-14972]" +``` + +--- + +## Task 8: SVE L2 kernel header + +**Files:** +- Create: `src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h` + +- [ ] **Step 1: Author the kernel file** + +```cpp +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * SVE SQ8<->FP16 L2 squared distance: + * ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares + * IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32. + */ + +template +float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP( + pVect1v, pVect2v, dimension); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float x_sum_sq = + load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + const uint8_t *query_meta_bytes = reinterpret_cast( + static_cast(pVect2v) + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} +``` + +- [ ] **Step 2: Commit** + +```bash +git add src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h +git commit -m "Add SVE SQ8↔FP16 L2 kernel header [MOD-14972]" +``` + +--- + +## Task 9: SVE + SVE2 dispatcher TU additions + +**Files:** +- Modify: `src/VecSim/spaces/functions/SVE.h` — +3 declarations +- Modify: `src/VecSim/spaces/functions/SVE.cpp` — +#includes; +3 chooser definitions +- Modify: `src/VecSim/spaces/functions/SVE2.h` — +3 declarations +- Modify: `src/VecSim/spaces/functions/SVE2.cpp` — +#includes; +3 chooser definitions (own symbols, template instantiated under SVE2 flags) + +- [ ] **Step 1: Declarations in SVE.h** + +Inside `namespace spaces { ... }`, alongside the existing `Choose_SQ8_FP32_*_SVE` declarations: + +```cpp +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE(size_t dim); +``` + +- [ ] **Step 2: Definitions in SVE.cpp** + +Add includes alongside the existing SQ8_FP32 includes: + +```cpp +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" +``` + +Inside `namespace spaces { ... }` (between `implementation_chooser.h` and `implementation_chooser_cleanup.h`), append: + +```cpp +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw); + return ret_dist_func; +} +``` + +- [ ] **Step 3: Declarations in SVE2.h** + +```cpp +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim); +``` + +- [ ] **Step 4: Definitions in SVE2.cpp** + +Add includes alongside the existing SQ8_FP32 includes — note the SVE header is included from SVE2 (SVE2 instantiates the template under SVE2 compile flags): + +```cpp +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE +``` + +Inside `namespace spaces { ... }`, append: + +```cpp +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw); + return ret_dist_func; +} +``` + +- [ ] **Step 5: Commit** + +```bash +git add src/VecSim/spaces/functions/SVE.h src/VecSim/spaces/functions/SVE.cpp \ + src/VecSim/spaces/functions/SVE2.h src/VecSim/spaces/functions/SVE2.cpp +git commit -m "Wire SVE/SVE2 SQ8↔FP16 choosers [MOD-14972]" +``` + +--- + +## Task 10: SVE + SVE2 dispatcher wiring in IP_space.cpp + L2_space.cpp + +The NEON_HP block added in Task 5 lives inside `#ifdef CPU_FEATURES_ARCH_AARCH64`. Extend the same block in all three `_GetDistFunc` functions with SVE2 and SVE tiers — ordered SVE2 → SVE → NEON_HP, matching every other SQ8/FP32 dispatcher in the file. + +**Files:** +- Modify: `src/VecSim/spaces/IP_space.cpp` (two functions) +- Modify: `src/VecSim/spaces/L2_space.cpp` (one function) + +- [ ] **Step 1: Confirm the SVE/SVE2 dispatcher includes are present** + +Run: +```bash +grep -n 'functions/SVE\.h\|functions/SVE2\.h' src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp +``` +Expected: both files already include both headers. If not, add them. + +- [ ] **Step 2: Extend IP_SQ8_FP16_GetDistFunc** + +Inside the AArch64 block of `IP_SQ8_FP16_GetDistFunc`, after the `if (dim < 16) return ret_dist_func;` guard and **before** the existing `#ifdef OPT_NEON_HP`, prepend: + +```cpp +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_FP16_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_FP16_IP_implementation_SVE(dim); + } +#endif +``` + +(SVE/SVE2 paths don't compute alignment hints — the SVE vector width is runtime-variable, so the SQ8_FP32 sister doesn't set `*alignment` here either. Mirror that.) + +- [ ] **Step 3: Extend Cosine_SQ8_FP16_GetDistFunc** + +Same as Step 2, with `Cosine` in the chooser names. + +- [ ] **Step 4: Extend L2_SQ8_FP16_GetDistFunc** + +Same as Step 2, with `L2` in the chooser names. + +- [ ] **Step 5: User builds (ARM target)** + +Ask user to run `make build` for an ARM target. + +- [ ] **Step 6: Commit** + +```bash +git add src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp +git commit -m "Dispatch SQ8↔FP16 to SVE/SVE2 tiers on AArch64 [MOD-14972]" +``` + +--- + +## Task 11: Extend `SQ8_FP16_SpacesOptimizationTest` with SVE2 + SVE tier-walks + +**Files:** +- Modify: `tests/unit/test_spaces.cpp` — the same three test bodies extended in Task 6 + +For each test (L2, IP, Cosine), inside the existing `#ifdef CPU_FEATURES_ARCH_AARCH64` region (which currently holds only NEON_HP from Task 6), **prepend** SVE2 and SVE blocks so the dispatch-precedence order is SVE2 → SVE → NEON_HP. If the existing NEON_HP block is not yet inside an AArch64 outer ifdef, wrap all three together. + +- [ ] **Step 1: Wrap and extend the L2 test** + +Replace the NEON_HP-only AArch64 block in `SQ8_FP16_L2SqrTest` with: + +```cpp +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_HP + if (optimization.asimdhp) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "NEON_HP with dim " << dim; + optimization.asimdhp = 0; + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 +``` + +- [ ] **Step 2: Same for IP test** + +Replicate the block in `SQ8_FP16_InnerProductTest` with `IP_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_IP_implementation_`. + +- [ ] **Step 3: Same for Cosine test** + +Replicate with `Cosine_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_Cosine_implementation_`. + +- [ ] **Step 4: User builds** + +ARM target build. + +- [ ] **Step 5: Run the optimization tests** + +```bash +./bin//unit_tests --gtest_filter='SQ8_FP16_SpacesOptimizationTest.*' +``` +Expected: all parametrized cases PASS — dims 16..32 + high-dim suite (64..1024) — exercising whichever ARM tiers the host advertises. + +- [ ] **Step 6: Commit** + +```bash +git add tests/unit/test_spaces.cpp +git commit -m "Extend SQ8↔FP16 tier-walk tests with SVE/SVE2 [MOD-14972]" +``` + +--- + +## Task 12: Extend `SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` with ARM rows + +**Files:** +- Modify: `tests/unit/test_spaces.cpp` — `TEST(SQ8_FP16_SIMD_TierCoverage, ReportTiersExercised)` + +The existing test body has an outer `#ifdef CPU_FEATURES_ARCH_X86_64` block that loops over each x86 tier and logs presence to stderr. Add a sibling `#ifdef CPU_FEATURES_ARCH_AARCH64` block with the same shape. + +- [ ] **Step 1: Append the AArch64 reporting block** + +Locate the trailing `#endif // CPU_FEATURES_ARCH_X86_64` and immediately after, insert: + +```cpp +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (opt.sve2) { + std::cerr << "[SQ8_FP16] SVE2 tier exercised\n"; + any_simd = true; + } else { + std::cerr << "[SQ8_FP16] SVE2 tier NOT exercised on this host\n"; + } +#endif +#ifdef OPT_SVE + if (opt.sve) { + std::cerr << "[SQ8_FP16] SVE tier exercised\n"; + any_simd = true; + } else { + std::cerr << "[SQ8_FP16] SVE tier NOT exercised on this host\n"; + } +#endif +#ifdef OPT_NEON_HP + if (opt.asimdhp) { + std::cerr << "[SQ8_FP16] NEON_HP tier exercised\n"; + any_simd = true; + } else { + std::cerr << "[SQ8_FP16] NEON_HP tier NOT exercised on this host\n"; + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 +``` + +(The trailing `if (!any_simd) { GTEST_SKIP() << ...; }` already at the bottom of the existing test handles the all-quiet case across both archs.) + +- [ ] **Step 2: Build + run on an ARM host** + +Ask the user to build for ARM, then run: +```bash +./bin//unit_tests --gtest_filter='SQ8_FP16_SIMD_TierCoverage.*' +``` +Expected: stderr shows at least one ARM tier marked "exercised", test PASS. + +- [ ] **Step 3: Commit** + +```bash +git add tests/unit/test_spaces.cpp +git commit -m "Report ARM tiers in SQ8↔FP16 tier-coverage test [MOD-14972]" +``` + +--- + +## Task 13: Microbench AArch64 block + +**Files:** +- Modify: `tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` + +The existing file already opens `#ifdef CPU_FEATURES_ARCH_X86_64` and pulls `cpu_features::X86Features opt = cpu_features::GetX86Info().features;`. Add the parallel AArch64 block at the end of that `#endif // CPU_FEATURES_ARCH_X86_64`. + +- [ ] **Step 1: Append the AArch64 bench block** + +After the closing `#endif // CPU_FEATURES_ARCH_X86_64` (or after the last x86 `INITIALIZE_BENCHMARKS_SET_*` macro if no such comment exists), insert: + +```cpp +#ifdef CPU_FEATURES_ARCH_AARCH64 +cpu_features::Aarch64Features arm_opt = cpu_features::GetAarch64Info().features; + +#ifdef OPT_SVE2 +bool sve2_supported = arm_opt.sve2; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); +#endif + +#ifdef OPT_SVE +bool sve_supported = arm_opt.sve; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); +#endif + +#ifdef OPT_NEON_HP +bool neon_hp_supported = arm_opt.asimdhp; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, + neon_hp_supported); +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 +``` + +Verify the exact `cpu_features` helper name during build. If the toolchain uses `Aarch64Info` vs `Aarch64Features` vs `ArmFeatures`, adjust to match the sister x86 block. + +- [ ] **Step 2: Update the file-header comment** + +The current file-header comment (around the top) ends with `ARM kernels land via MOD-14972.` — change that line to `ARM kernels (NEON_HP / SVE / SVE2) are registered below.` so the doc stays accurate. + +- [ ] **Step 3: User builds (ARM target)** + +- [ ] **Step 4: Run the bench on ARM** + +```bash +./bin//bm_spaces_sq8_fp16 --benchmark_filter='SQ8_FP16_.*(SVE2|SVE|NEON_HP)' +``` +Expected: per-ISA throughput rows for L2, IP, Cosine. If no rows match, list all benchmarks first with `--benchmark_list_tests` to see the exact generated names, then adjust the regex. + +- [ ] **Step 5: Side-by-side compare against SQ8_FP32** + +```bash +./bin//bm_spaces_sq8_fp32 --benchmark_filter='SQ8_FP32_.*(SVE2|SVE|NEON)' +``` +Compare matched-ISA rows manually. Acceptance per Jira: per-ISA throughput data captured. + +- [ ] **Step 6: Commit** + +```bash +git add tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp +git commit -m "Register ARM SQ8↔FP16 microbenchmarks [MOD-14972]" +``` + +--- + +## Task 14: ASan + final pre-PR verification + +- [ ] **Step 1: Full unit-test pass on ARM host (no filter)** + +```bash +./bin//unit_tests +``` +Expected: all tests PASS. + +- [ ] **Step 2: ASan build + run** + +Ask user to run `make build SAN=address` (or the repo's equivalent — verify against `Makefile`). After confirmed: + +```bash +./bin/-asan/unit_tests --gtest_filter='SQ8_FP16_*' +``` +Expected: zero ASan reports; all SQ8_FP16 tests PASS. + +- [ ] **Step 3: x86 sanity build** + +User runs `make build` on x86 (no ARM target). Confirms the new test extensions and dispatcher AArch64 ifdefs stay inert on x86 and the build is clean. + +- [ ] **Step 4: Push branch (ASK USER FIRST)** + +Pushes are user-gated. Confirm with the user before running: + +```bash +git push -u origin dor-forer-sq8-fp16-arm-kernels-mod-14972 +``` + +- [ ] **Step 5: Open PR against PR #970 (ASK USER FIRST)** + +PR creation is user-gated. Confirm with the user before running: + +```bash +gh pr create \ + --base dor-forer-sq8-fp16-x86-kernels-mod-14954 \ + --title 'Add SQ8↔FP16 ARM SIMD distance kernels [MOD-14972]' \ + --body "$(cat <<'EOF' +## Summary + +- Add asymmetric SQ8↔FP16 distance kernels (IP, L2, Cosine) for ARM NEON_HP, SVE, SVE2 tiers +- Wire kernels into the existing dispatcher (`IP_space.cpp`, `L2_space.cpp`) +- Extend `SQ8_FP16_SpacesOptimizationTest` and `SQ8_FP16_SIMD_TierCoverage` with ARM tiers +- Register per-ISA microbenchmarks for cross-arch throughput comparison + +Stacked on PR #970 (MOD-14954 x86 kernels); retarget to `main` once #970 merges. + +Spec: `docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md` + +## Test plan + +- [ ] Unit tests on ARM host pass — `SQ8_FP16_SpacesOptimizationTest` (dims 16..32 + 64..1024), `SQ8_FP16_SIMD_TierCoverage`, `GetDistFuncSQ8FP16Asymmetric` +- [ ] ASan build on ARM host clean across SQ8_FP16 tests +- [ ] x86 build remains clean (new AArch64 dispatcher block + tests stay inert) +- [ ] Microbench output captured for SVE2 / SVE / NEON_HP, compared against matched SQ8_FP32 ARM rows +EOF +)" +``` + +- [ ] **Step 6: Retarget once #970 merges (ASK USER FIRST)** + +When PR #970 lands on `main`, change this PR's base to `main`: + +```bash +gh pr edit --base main +``` + +--- + +## Self-review checklist + +- [x] **Spec coverage:** every requirement in `2026-05-28-arm-sq8-fp16-design.md` is covered: + - Kernel headers (4 new): Tasks 2, 3, 7, 8 + - Wrapper symbols: Tasks 4 (NEON_HP), 9 (SVE/SVE2) + - Dispatcher wiring: Tasks 5 (NEON_HP), 10 (SVE/SVE2) + - Tier-walk tests: Tasks 6 (NEON_HP), 11 (SVE/SVE2) + - TierCoverage report: Task 12 + - Scalar-fallback edge tests (dim=0, dim=15): Task 1 + - Microbench: Task 13 + - ASan + verification: Task 14 +- [x] **No CMake changes** — confirmed in file structure table. +- [x] **Zero placeholders** — every code block is concrete; ambiguous spots (SVE FP16 widening ACLE) are called out with the fallback strategy spelled in-task. +- [x] **Type/symbol consistency:** + - NEON kernel template names: `SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP` / `…NEON_HP` / `SQ8_FP16_L2SqrSIMD16_NEON_HP` / `SQ8_FP16_CosineSIMD16_NEON_HP` — match across kernel header, NEON_HP chooser, dispatcher call, and test. + - SVE kernel template names: `SQ8_FP16_InnerProductSIMD_SVE_IMP` / `…SVE` / `SQ8_FP16_L2SqrSIMD_SVE` / `SQ8_FP16_CosineSIMD_SVE` — match across kernel header, SVE chooser, SVE2 chooser, dispatcher call, and test. + - Chooser symbol names: `Choose_SQ8_FP16_{IP,L2,Cosine}_implementation_{NEON_HP,SVE,SVE2}` — match across `.h` declarations, `.cpp` definitions, dispatcher calls, tests, and bench. + - Test fixture: `SQ8_FP16_SpacesOptimizationTest` already exists on base (PR #970); we extend the three test methods inside it, no rename. + +--- + +## Execution Handoff + +Plan complete and saved to `docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md`. Two execution options: + +**1. Subagent-Driven (recommended)** — I dispatch a fresh subagent per task, review between tasks, fast iteration. + +**2. Inline Execution** — Execute tasks in this session using executing-plans, batch execution with checkpoints. + +Which approach? From 4f0534c0d3bb6bfea2c3f54357775a365a2fcb85 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 17:54:33 +0300 Subject: [PATCH 03/19] =?UTF-8?q?Add=20NEON=5FHP=20SQ8=E2=86=94FP16=20IP?= =?UTF-8?q?=20kernel=20header=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h | 135 ++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h new file mode 100644 index 000000000..b1d26fec5 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include +#include + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity: + * + * IP(x, y) = sum(x_i * y_i) + * ~= sum((min + delta * q_i) * y_i) + * = min * y_sum + delta * sum(q_i * y_i) + * + * The hot loop only accumulates sum(q_i * y_i) - no per-element dequantisation. + * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk. + */ + +// Helper: 16 lanes per call, four FP32 accumulators (one per quarter). +static inline void +SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2, + float32x4_t &sum0, float32x4_t &sum1, + float32x4_t &sum2, float32x4_t &sum3) { + // SQ8 storage: 16 * uint8 -> 4 * float32x4_t + uint8x16_t v1_u8 = vld1q_u8(pVect1); + uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8)); + uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8)); + float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo))); + float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo))); + float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi))); + float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi))); + + // FP16 query: 16 * f16 -> 4 * float32x4_t via vcvt_f32_f16 + const float16_t *q = reinterpret_cast(pVect2); + float16x8_t q_lo = vld1q_f16(q); + float16x8_t q_hi = vld1q_f16(q + 8); + float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo)); + float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo)); + float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi)); + float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi)); + + sum0 = vfmaq_f32(sum0, v1_0, v2_0); + sum1 = vfmaq_f32(sum1, v1_1, v2_1); + sum2 = vfmaq_f32(sum2, v1_2, v2_2); + sum3 = vfmaq_f32(sum3, v1_3, v2_3); + + pVect1 += 16; + pVect2 += 16; +} + +// pVect1v = SQ8 storage, pVect2v = FP16 query +template // 0..15 +float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16"); + + const uint8_t *pVect1 = static_cast(pVect1v); // SQ8 storage + const float16 *pVect2 = static_cast(pVect2v); // FP16 query + + float32x4_t sum0 = vdupq_n_f32(0.0f); + float32x4_t sum1 = vdupq_n_f32(0.0f); + float32x4_t sum2 = vdupq_n_f32(0.0f); + float32x4_t sum3 = vdupq_n_f32(0.0f); + + const size_t num_of_chunks = dimension / 16; + for (size_t i = 0; i < num_of_chunks; i++) { + SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3); + } + + // Residual handling: dim % 16 lanes. + // residual >= 8: one safe 8-lane SQ8 + 8-lane FP16 load (FP16 trailer is wide enough). + // residual < 8: scalar-only - a 4-lane FP16 load would overread y_sum metadata. + constexpr unsigned char r = residual; + if constexpr (r >= 8) { + uint8x8_t v1_u8 = vld1_u8(pVect1); + uint16x8_t v1_u16 = vmovl_u8(v1_u8); + float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_u16))); + float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_u16))); + float16x8_t q_h = vld1q_f16(reinterpret_cast(pVect2)); + float32x4_t v2_a = vcvt_f32_f16(vget_low_f16(q_h)); + float32x4_t v2_b = vcvt_f32_f16(vget_high_f16(q_h)); + sum0 = vfmaq_f32(sum0, v1_a, v2_a); + sum1 = vfmaq_f32(sum1, v1_b, v2_b); + pVect1 += 8; + pVect2 += 8; + } + // Lane-by-lane scalar for the final 0..7 (residual % 8) elements. + constexpr unsigned char tail = r & 0x7; + float scalar_dot = 0.0f; + for (unsigned char k = 0; k < tail; ++k) { + scalar_dot += static_cast(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]); + } + + // Reduce the four NEON accumulators. + float32x4_t sum_lo = vaddq_f32(sum0, sum1); + float32x4_t sum_hi = vaddq_f32(sum2, sum3); + float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot; + + // Metadata loads - use load_unaligned because odd dim leaves trailers unaligned. + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float min_val = + load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = + load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum = + load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template +float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + return 1.0f - + SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); +} + +template +float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { + // Cosine = 1 - IP (vectors are pre-normalised); reuses the IP wrapper. + return SQ8_FP16_InnerProductSIMD16_NEON_HP(pVect1v, pVect2v, dimension); +} From d3c6415578e202eb79eb22d50019c084a1e240ec Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:01:04 +0300 Subject: [PATCH 04/19] =?UTF-8?q?Add=20NEON=5FHP=20SQ8=E2=86=94FP16=20L2?= =?UTF-8?q?=20kernel=20header=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h new file mode 100644 index 000000000..7bf5db986 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity: + * + * ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2) + * = x_sum_squares - 2 * IP(x, y) + y_sum_squares + * + * IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32. + */ + +template // 0..15 +float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = + SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float x_sum_sq = + load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + + const uint8_t *query_meta_bytes = reinterpret_cast( + static_cast(pVect2v) + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} From 69cee3d730bd808699db076d25ab3c7b14040ee8 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:05:16 +0300 Subject: [PATCH 05/19] =?UTF-8?q?Wire=20NEON=5FHP=20SQ8=E2=86=94FP16=20cho?= =?UTF-8?q?osers=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- src/VecSim/spaces/functions/NEON_HP.cpp | 20 ++++++++++++++++++++ src/VecSim/spaces/functions/NEON_HP.h | 4 ++++ 2 files changed, 24 insertions(+) diff --git a/src/VecSim/spaces/functions/NEON_HP.cpp b/src/VecSim/spaces/functions/NEON_HP.cpp index 2dea94934..20d93a517 100644 --- a/src/VecSim/spaces/functions/NEON_HP.cpp +++ b/src/VecSim/spaces/functions/NEON_HP.cpp @@ -10,6 +10,8 @@ #include "VecSim/spaces/L2/L2_NEON_FP16.h" #include "VecSim/spaces/IP/IP_NEON_FP16.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_NEON_SQ8_FP16.h" namespace spaces { @@ -27,6 +29,24 @@ dist_func_t Choose_FP16_IP_implementation_NEON_HP(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_NEON_HP); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_NEON_HP); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_HP.h b/src/VecSim/spaces/functions/NEON_HP.h index c65bd6948..889eb0919 100644 --- a/src/VecSim/spaces/functions/NEON_HP.h +++ b/src/VecSim/spaces/functions/NEON_HP.h @@ -16,4 +16,8 @@ dist_func_t Choose_FP16_IP_implementation_NEON_HP(size_t dim); dist_func_t Choose_FP16_L2_implementation_NEON_HP(size_t dim); +dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim); + } // namespace spaces From 1b36b38e12fdbdbe148d80cea8245f2915e8df2d Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:08:16 +0300 Subject: [PATCH 06/19] =?UTF-8?q?Dispatch=20SQ8=E2=86=94FP16=20to=20NEON?= =?UTF-8?q?=5FHP=20tier=20on=20AArch64=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- src/VecSim/spaces/IP_space.cpp | 26 ++++++++++++++++++++++++++ src/VecSim/spaces/L2_space.cpp | 13 +++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index b57971b60..92616f394 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -225,6 +225,19 @@ dist_func_t IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, #endif #endif // OPT_F16C #endif // x86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_NEON_HP + if (features.asimdhp) { + // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers + // leave *alignment untouched on ARM tiers. The corresponding tests assert + // 0xFF passthrough on the scalar path and do not assert any non-zero value here. + return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim); + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 return ret_dist_func; } @@ -274,6 +287,19 @@ dist_func_t Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm #endif #endif // OPT_F16C #endif // x86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_NEON_HP + if (features.asimdhp) { + // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers + // leave *alignment untouched on ARM tiers. The corresponding tests assert + // 0xFF passthrough on the scalar path and do not assert any non-zero value here. + return Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim); + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 return ret_dist_func; } diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 43020399f..995b4c4d6 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -156,6 +156,19 @@ dist_func_t L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, #endif #endif // OPT_F16C #endif // x86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 + if (dim < 16) { + return ret_dist_func; + } +#ifdef OPT_NEON_HP + if (features.asimdhp) { + // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers + // leave *alignment untouched on ARM tiers. The corresponding tests assert + // 0xFF passthrough on the scalar path and do not assert any non-zero value here. + return Choose_SQ8_FP16_L2_implementation_NEON_HP(dim); + } +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 return ret_dist_func; } From 1af48125a6fd032806db68af2dfb8b4795c6bba7 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:11:42 +0300 Subject: [PATCH 07/19] =?UTF-8?q?Extend=20SQ8=E2=86=94FP16=20tier-walk=20t?= =?UTF-8?q?ests=20with=20NEON=5FHP=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- tests/unit/test_spaces.cpp | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index 474ac5c75..d2c9386ac 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -3149,6 +3149,18 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) { #endif #endif // OPT_F16C +#ifdef OPT_NEON_HP + if (optimization.asimdhp) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "NEON_HP with dim " << dim; + optimization.asimdhp = 0; + } +#endif + unsigned char alignment = 0; arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_FP16_L2Sqr) @@ -3224,6 +3236,18 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) { #endif #endif // OPT_F16C +#ifdef OPT_NEON_HP + if (optimization.asimdhp) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_NEON_HP(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "NEON_HP with dim " << dim; + optimization.asimdhp = 0; + } +#endif + unsigned char alignment = 0; arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_FP16_InnerProduct) @@ -3299,6 +3323,18 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) { #endif #endif // OPT_F16C +#ifdef OPT_NEON_HP + if (optimization.asimdhp) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "NEON_HP with dim " << dim; + optimization.asimdhp = 0; + } +#endif + unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); ASSERT_EQ(arch_opt_func, SQ8_FP16_Cosine) From 0ce0bcebd31609601197f5d6aeeaaf3cea40f0c3 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:16:51 +0300 Subject: [PATCH 08/19] =?UTF-8?q?Add=20SVE=20SQ8=E2=86=94FP16=20IP=20kerne?= =?UTF-8?q?l=20header=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h | 133 +++++++++++++++++++++++++ 1 file changed, 133 insertions(+) create mode 100644 src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h new file mode 100644 index 000000000..36a7d18e6 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" +#include +#include + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity: + * + * IP(x, y) ~= min * y_sum + delta * sum(q_i * y_i) + * + * Hot loop accumulates sum(q_i * y_i) only; FP16 query lanes are widened to FP32 + * inside each step via svcvt_f32_f16_x. Metadata loads use load_unaligned. + */ + +// Helper: one SVE-vector-width-of-FP32 step. +// chunk = svcntw() - number of FP32 lanes per step. +// pg = svptrue_b32() - predicate for FP32 lanes. +static inline void +SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset, + svfloat32_t &sum, svbool_t pg, size_t chunk) { + // SQ8 -> uint32 (widen on load), then to FP32. + svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); + svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); + + // FP16 query -> FP32. svld1_f16 uses a b16 predicate sized to `chunk` half lanes. + svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk)); + svfloat16_t q_h = + svld1_f16(pg16, reinterpret_cast(pVect2) + offset); + svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h); + + sum = svmla_f32_x(pg, sum, v1_f, v2_f); + offset += chunk; +} + +// pVect1v = SQ8 storage, pVect2v = FP16 query +template +float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, + size_t dimension) { + assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16"); + + const uint8_t *pVect1 = static_cast(pVect1v); + const float16 *pVect2 = static_cast(pVect2v); + size_t offset = 0; + svbool_t pg = svptrue_b32(); + const size_t chunk = svcntw(); + + svfloat32_t sum0 = svdup_f32(0.0f); + svfloat32_t sum1 = svdup_f32(0.0f); + svfloat32_t sum2 = svdup_f32(0.0f); + svfloat32_t sum3 = svdup_f32(0.0f); + + // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero - + // the final reduction below walks all lanes via svptrue_b32(). + if constexpr (partial_chunk) { + size_t remaining = dimension % chunk; + if (remaining > 0) { + svbool_t pg_partial = + svwhilelt_b32(uint32_t(0), uint32_t(remaining)); + svbool_t pg16_partial = + svwhilelt_b16(uint32_t(0), uint32_t(remaining)); + svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset); + svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); + svfloat16_t q_h = svld1_f16( + pg16_partial, reinterpret_cast(pVect2) + offset); + svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h); + sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f); + offset += remaining; + } + } + + // Main loop: 4 chunks per iteration via 4 accumulators. + const size_t chunk_size = 4 * chunk; + const size_t number_of_chunks = + (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; + for (size_t i = 0; i < number_of_chunks; i++) { + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk); + } + + // Additional steps 0..3. + if constexpr (additional_steps > 0) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); + if constexpr (additional_steps > 1) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk); + if constexpr (additional_steps > 2) + SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); + + svfloat32_t sum = svadd_f32_x(pg, sum0, sum1); + sum = svadd_f32_x(pg, sum, sum2); + sum = svadd_f32_x(pg, sum, sum3); + float quantized_dot = svaddv_f32(pg, sum); + + // Metadata loads - unaligned because odd dim leaves trailers unaligned. + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float min_val = + load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = + load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const uint8_t *query_meta_bytes = reinterpret_cast( + static_cast(pVect2v) + dimension); + const float y_sum = + load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + + return min_val * y_sum + delta * quantized_dot; +} + +template +float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, + size_t dimension) { + return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP( + pVect1v, pVect2v, dimension); +} + +template +float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + return SQ8_FP16_InnerProductSIMD_SVE( + pVect1v, pVect2v, dimension); +} From eb4952a9de34756eae83d07d422d4b0e467cc238 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:20:41 +0300 Subject: [PATCH 09/19] =?UTF-8?q?Add=20SVE=20SQ8=E2=86=94FP16=20L2=20kerne?= =?UTF-8?q?l=20header=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h | 38 ++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h new file mode 100644 index 000000000..3c8e89ca6 --- /dev/null +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" +#include "VecSim/types/sq8.h" +#include "VecSim/types/float16.h" + +using sq8 = vecsim_types::sq8; +using float16 = vecsim_types::float16; + +/* + * SVE SQ8<->FP16 L2 squared distance: + * ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares + * IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32. + */ + +template +float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { + const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP( + pVect1v, pVect2v, dimension); + + const uint8_t *params_bytes = static_cast(pVect1v) + dimension; + const float x_sum_sq = + load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + const uint8_t *query_meta_bytes = reinterpret_cast( + static_cast(pVect2v) + dimension); + const float y_sum_sq = + load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); + + return x_sum_sq + y_sum_sq - 2.0f * ip; +} From fcb01bbc6e0abbcc14b366b9e3525ad5b25d80bc Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:24:14 +0300 Subject: [PATCH 10/19] =?UTF-8?q?Wire=20SVE/SVE2=20SQ8=E2=86=94FP16=20choo?= =?UTF-8?q?sers=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- src/VecSim/spaces/functions/SVE.cpp | 21 +++++++++++++++++++++ src/VecSim/spaces/functions/SVE.h | 4 ++++ src/VecSim/spaces/functions/SVE2.cpp | 20 ++++++++++++++++++++ src/VecSim/spaces/functions/SVE2.h | 4 ++++ 4 files changed, 49 insertions(+) diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index fde853db2..bd197c84c 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -25,6 +25,9 @@ #include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" #include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" + #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" #include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h" @@ -119,6 +122,24 @@ dist_func_t Choose_SQ8_FP32_L2_implementation_SVE(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) { diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index bd3bc97c3..43b3b22cd 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -33,6 +33,10 @@ dist_func_t Choose_SQ8_FP32_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_FP32_Cosine_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_FP32_L2_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE(size_t dim); + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim); diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 4215d79cf..4496c07e6 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -22,6 +22,8 @@ #include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE #include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE @@ -116,6 +118,24 @@ dist_func_t Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim) { return ret_dist_func; } +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw); + return ret_dist_func; +} + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim) { diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 04078a91e..2c1bfbac3 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -33,6 +33,10 @@ dist_func_t Choose_SQ8_FP32_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_FP32_Cosine_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim); + // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim); From 15fca694f344f502e0594c5bc70b85d5156c34d5 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:28:13 +0300 Subject: [PATCH 11/19] =?UTF-8?q?Dispatch=20SQ8=E2=86=94FP16=20to=20SVE/SV?= =?UTF-8?q?E2=20tiers=20on=20AArch64=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- src/VecSim/spaces/IP_space.cpp | 20 ++++++++++++++++++++ src/VecSim/spaces/L2_space.cpp | 10 ++++++++++ 2 files changed, 30 insertions(+) diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 92616f394..1930e64a2 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -229,6 +229,16 @@ dist_func_t IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, if (dim < 16) { return ret_dist_func; } +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_FP16_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_FP16_IP_implementation_SVE(dim); + } +#endif #ifdef OPT_NEON_HP if (features.asimdhp) { // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers @@ -291,6 +301,16 @@ dist_func_t Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm if (dim < 16) { return ret_dist_func; } +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_FP16_Cosine_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_FP16_Cosine_implementation_SVE(dim); + } +#endif #ifdef OPT_NEON_HP if (features.asimdhp) { // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 995b4c4d6..2e18920b3 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -160,6 +160,16 @@ dist_func_t L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, if (dim < 16) { return ret_dist_func; } +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_FP16_L2_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_FP16_L2_implementation_SVE(dim); + } +#endif #ifdef OPT_NEON_HP if (features.asimdhp) { // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers From 0fcd7d0975222744e5dfe5f1dabc395c231efed5 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:33:10 +0300 Subject: [PATCH 12/19] =?UTF-8?q?Extend=20SQ8=E2=86=94FP16=20tier-walk=20t?= =?UTF-8?q?ests=20with=20SVE/SVE2=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- tests/unit/test_spaces.cpp | 72 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index d2c9386ac..f7266dce4 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -3149,6 +3149,29 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) { #endif #endif // OPT_F16C +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif #ifdef OPT_NEON_HP if (optimization.asimdhp) { unsigned char alignment = 0; @@ -3160,6 +3183,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) { optimization.asimdhp = 0; } #endif +#endif // CPU_FEATURES_ARCH_AARCH64 unsigned char alignment = 0; arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); @@ -3236,6 +3260,29 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) { #endif #endif // OPT_F16C +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif #ifdef OPT_NEON_HP if (optimization.asimdhp) { unsigned char alignment = 0; @@ -3247,6 +3294,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) { optimization.asimdhp = 0; } #endif +#endif // CPU_FEATURES_ARCH_AARCH64 unsigned char alignment = 0; arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); @@ -3323,6 +3371,29 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) { #endif #endif // OPT_F16C +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif #ifdef OPT_NEON_HP if (optimization.asimdhp) { unsigned char alignment = 0; @@ -3334,6 +3405,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) { optimization.asimdhp = 0; } #endif +#endif // CPU_FEATURES_ARCH_AARCH64 unsigned char alignment = 0; arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); From 6a783f8ccb641d02d6d80a03500d1a3b265ff001 Mon Sep 17 00:00:00 2001 From: Dor Forer Date: Thu, 28 May 2026 18:41:59 +0300 Subject: [PATCH 13/19] =?UTF-8?q?Register=20ARM=20SQ8=E2=86=94FP16=20micro?= =?UTF-8?q?benchmarks=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Sonnet 4.6 --- .../spaces_benchmarks/bm_spaces_sq8_fp16.cpp | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp index ba3030064..9ec022e39 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp @@ -16,8 +16,7 @@ using float16 = vecsim_types::float16; /** * SQ8-to-FP16 benchmarks: SQ8 quantized storage with FP16 query. * Registers the naive (scalar) baseline plus per-ISA SIMD variants (x86: AVX-512 / AVX2+FMA / - * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels - * land via MOD-14972. + * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels (NEON_HP / SVE / SVE2) are registered below. */ class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture { protected: @@ -85,6 +84,29 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SSE4, 16, s #endif // OPT_F16C #endif // x86_64 +#ifdef CPU_FEATURES_ARCH_AARCH64 +cpu_features::Aarch64Features arm_opt = cpu_features::GetAarch64Info().features; + +#ifdef OPT_SVE2 +bool sve2_supported = arm_opt.sve2; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); +#endif + +#ifdef OPT_SVE +bool sve_supported = arm_opt.sve; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); +#endif + +#ifdef OPT_NEON_HP +bool neon_hp_supported = arm_opt.asimdhp; +INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, + neon_hp_supported); +#endif +#endif // CPU_FEATURES_ARCH_AARCH64 + // Naive (scalar) baseline — always registered as the comparison anchor. INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, InnerProduct, 16); From a2a1b24b1975e4c7a7e4f036eae756ad5739cf16 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 31 May 2026 07:47:32 +0000 Subject: [PATCH 14/19] =?UTF-8?q?Add=20missing=20alignment=3D0=20assertion?= =?UTF-8?q?s=20to=20SQ8=E2=86=94FP16=20ARM=20tier-walk=20tests=20[MOD-1497?= =?UTF-8?q?2]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 9 ARM tier blocks (L2/IP/Cosine × SVE2/SVE/NEON_HP) were missing ASSERT_EQ(alignment, 0) after each ASSERT_NEAR, unlike the SQ8_FP32 sister blocks which assert it. Adds the assertions to lock the contract that ARM tiers leave the caller's alignment value untouched. --- tests/unit/test_spaces.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index f7266dce4..ce8605565 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -3158,6 +3158,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim; optimization.sve2 = 0; } #endif @@ -3169,6 +3170,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim; optimization.sve = 0; } #endif @@ -3180,6 +3182,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "NEON_HP with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim; optimization.asimdhp = 0; } #endif @@ -3269,6 +3272,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim; optimization.sve2 = 0; } #endif @@ -3280,6 +3284,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim; optimization.sve = 0; } #endif @@ -3291,6 +3296,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "NEON_HP with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim; optimization.asimdhp = 0; } #endif @@ -3380,6 +3386,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "SVE2 with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim; optimization.sve2 = 0; } #endif @@ -3391,6 +3398,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "SVE with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim; optimization.sve = 0; } #endif @@ -3402,6 +3410,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) { << "Unexpected distance function chosen for dim " << dim; ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) << "NEON_HP with dim " << dim; + ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim; optimization.asimdhp = 0; } #endif From 284ad69889030ee178c394cb4e9236a83e867924 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 31 May 2026 08:40:11 +0000 Subject: [PATCH 15/19] =?UTF-8?q?Fix=20SVE=20SQ8=E2=86=94FP16=20kernel:=20?= =?UTF-8?q?use=20svzip1=20to=20correct=20FP16=E2=86=92FP32=20widening=20[M?= =?UTF-8?q?OD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit svcvt_f32_f16_x (FCVT) reads even-indexed FP16 elements: FP32[e] ← FP16[2e]. The step function loaded chunk consecutive FP16 values into positions 0..chunk-1, then passed them directly to svcvt_f32_f16_x, which picked positions 0,2,4,... and silently skipped positions 1,3,5,... For chunk=4 (128-bit SVE), only 2 of 4 FP16 values per step were used, producing wrong dot products. Fix: svzip1_f16(q_h, zeros) spreads values to even positions [v0,0,v1,0,...] so FCVT correctly reads v[0],v[1],v[2],... Applied to both the full step helper and the partial-chunk path. Discovered and fixed during ARM host verification (Task 14, MOD-14972). --- src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h index 36a7d18e6..d3213e3c3 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h @@ -35,11 +35,16 @@ SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); - // FP16 query -> FP32. svld1_f16 uses a b16 predicate sized to `chunk` half lanes. + // FP16 query -> FP32. + // svcvt_f32_f16_x (FCVT) reads even-indexed FP16 elements: FP32[e] <- FP16[2e]. + // Our chunk FP16 values land at consecutive positions 0..chunk-1 after the load. + // svzip1 interleaves them with zeros → positions 0,2,4,... hold the values, + // so FCVT correctly reads v[0], v[1], v[2], ... into FP32[0..chunk-1]. svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk)); svfloat16_t q_h = svld1_f16(pg16, reinterpret_cast(pVect2) + offset); - svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h); + svfloat16_t q_h_spread = svzip1_f16(q_h, svdup_f16(0.0f)); + svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h_spread); sum = svmla_f32_x(pg, sum, v1_f, v2_f); offset += chunk; @@ -75,7 +80,9 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); svfloat16_t q_h = svld1_f16( pg16_partial, reinterpret_cast(pVect2) + offset); - svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h); + // Same zip1 trick as the full step: spread values to even positions. + svfloat16_t q_h_spread = svzip1_f16(q_h, svdup_f16(0.0f)); + svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h_spread); sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f); offset += remaining; } From 3754f76e10dd340dcf71ca33711b714275b1bf3d Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 31 May 2026 12:43:00 +0000 Subject: [PATCH 16/19] =?UTF-8?q?Optimize=20ARM=20SQ8=E2=86=94FP16=20kerne?= =?UTF-8?q?ls=20and=20align=20with=20codebase=20conventions=20[MOD-14972]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SVE hot loop: replace svzip1_f16+svdup_f16+svwhilelt_b16 (4 ops) with svld1uh_u32 (1 op) — zero-extends each FP16 halfword into a 32-bit lane so svcvt_f32_f16_x reads the correct bits directly. Same fix applied to the partial-chunk path, which also drops the now-redundant pg16_partial predicate. Accumulator combine changed from svadd_f32_x to svadd_f32_z to match the SQ8_FP32 SVE sister. NEON residual: replace the single 8-lane block + up-to-7 software-scalar iterations with three independent 4-lane sub-steps (r>=4, r>=8, r>=12), leaving at most 3 elements for scalar — mirrors the SQ8_FP32 NEON sister exactly. Eliminates expensive vecsim_types::FP16_to_FP32 calls for residuals 4..15 (previously up to 7 software conversions per call). Both IP headers: remove assert()+ (no sister kernel uses them). Both L2 headers: drop redundant float16.h include and using declarations (arrive transitively through the included IP header). --- src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h | 64 +++++++++++++------------ src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h | 60 ++++++++--------------- src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h | 5 -- src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h | 5 -- 4 files changed, 52 insertions(+), 82 deletions(-) diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h index b1d26fec5..8b8d37c0f 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h @@ -11,19 +11,14 @@ #include "VecSim/types/sq8.h" #include "VecSim/types/float16.h" #include -#include using sq8 = vecsim_types::sq8; using float16 = vecsim_types::float16; /* - * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity: + * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity: + * IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i) * - * IP(x, y) = sum(x_i * y_i) - * ~= sum((min + delta * q_i) * y_i) - * = min * y_sum + delta * sum(q_i * y_i) - * - * The hot loop only accumulates sum(q_i * y_i) - no per-element dequantisation. * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk. */ @@ -32,7 +27,6 @@ static inline void SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2, float32x4_t &sum0, float32x4_t &sum1, float32x4_t &sum2, float32x4_t &sum3) { - // SQ8 storage: 16 * uint8 -> 4 * float32x4_t uint8x16_t v1_u8 = vld1q_u8(pVect1); uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8)); uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8)); @@ -41,7 +35,6 @@ SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2 float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi))); float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi))); - // FP16 query: 16 * f16 -> 4 * float32x4_t via vcvt_f32_f16 const float16_t *q = reinterpret_cast(pVect2); float16x8_t q_lo = vld1q_f16(q); float16x8_t q_hi = vld1q_f16(q + 8); @@ -59,14 +52,12 @@ SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2 pVect2 += 16; } -// pVect1v = SQ8 storage, pVect2v = FP16 query +// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher). template // 0..15 float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { - assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16"); - - const uint8_t *pVect1 = static_cast(pVect1v); // SQ8 storage - const float16 *pVect2 = static_cast(pVect2v); // FP16 query + const uint8_t *pVect1 = static_cast(pVect1v); + const float16 *pVect2 = static_cast(pVect2v); float32x4_t sum0 = vdupq_n_f32(0.0f); float32x4_t sum1 = vdupq_n_f32(0.0f); @@ -78,36 +69,48 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3); } - // Residual handling: dim % 16 lanes. - // residual >= 8: one safe 8-lane SQ8 + 8-lane FP16 load (FP16 trailer is wide enough). - // residual < 8: scalar-only - a 4-lane FP16 load would overread y_sum metadata. + // Residual: up to three independent 4-lane sub-steps, leaving at most 3 elements + // for scalar — mirrors the SQ8_FP32 NEON sister pattern. + // vld1_f16 (4 FP16 = 8 bytes) is safe for any residual: FP16 metadata follows + // the lane data so there is always enough headroom. constexpr unsigned char r = residual; - if constexpr (r >= 8) { + if constexpr (r >= 4) { uint8x8_t v1_u8 = vld1_u8(pVect1); - uint16x8_t v1_u16 = vmovl_u8(v1_u8); - float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_u16))); - float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_u16))); - float16x8_t q_h = vld1q_f16(reinterpret_cast(pVect2)); - float32x4_t v2_a = vcvt_f32_f16(vget_low_f16(q_h)); - float32x4_t v2_b = vcvt_f32_f16(vget_high_f16(q_h)); + float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); + float32x4_t v2_a = + vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); sum0 = vfmaq_f32(sum0, v1_a, v2_a); + pVect1 += 4; + pVect2 += 4; + } + if constexpr (r >= 8) { + uint8x8_t v1_u8 = vld1_u8(pVect1); + float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); + float32x4_t v2_b = + vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); sum1 = vfmaq_f32(sum1, v1_b, v2_b); - pVect1 += 8; - pVect2 += 8; + pVect1 += 4; + pVect2 += 4; + } + if constexpr (r >= 12) { + uint8x8_t v1_u8 = vld1_u8(pVect1); + float32x4_t v1_c = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); + float32x4_t v2_c = + vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); + sum2 = vfmaq_f32(sum2, v1_c, v2_c); + pVect1 += 4; + pVect2 += 4; } - // Lane-by-lane scalar for the final 0..7 (residual % 8) elements. - constexpr unsigned char tail = r & 0x7; + constexpr unsigned char tail = r & 3; float scalar_dot = 0.0f; for (unsigned char k = 0; k < tail; ++k) { scalar_dot += static_cast(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]); } - // Reduce the four NEON accumulators. float32x4_t sum_lo = vaddq_f32(sum0, sum1); float32x4_t sum_hi = vaddq_f32(sum2, sum3); float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot; - // Metadata loads - use load_unaligned because odd dim leaves trailers unaligned. const uint8_t *params_bytes = static_cast(pVect1v) + dimension; const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); @@ -130,6 +133,5 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect template float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Cosine = 1 - IP (vectors are pre-normalised); reuses the IP wrapper. return SQ8_FP16_InnerProductSIMD16_NEON_HP(pVect1v, pVect2v, dimension); } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h index d3213e3c3..6c7a52529 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h @@ -11,51 +11,36 @@ #include "VecSim/types/sq8.h" #include "VecSim/types/float16.h" #include -#include using sq8 = vecsim_types::sq8; using float16 = vecsim_types::float16; /* - * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity: + * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity: + * IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i) * - * IP(x, y) ~= min * y_sum + delta * sum(q_i * y_i) - * - * Hot loop accumulates sum(q_i * y_i) only; FP16 query lanes are widened to FP32 - * inside each step via svcvt_f32_f16_x. Metadata loads use load_unaligned. + * FP16 query lanes are widened to FP32 per step via svld1uh_u32 + svcvt_f32_f16_x. + * svld1uh_u32 zero-extends each FP16 halfword into a 32-bit lane so that + * svcvt_f32_f16_x reads the correct bits directly without any interleaving. */ // Helper: one SVE-vector-width-of-FP32 step. -// chunk = svcntw() - number of FP32 lanes per step. -// pg = svptrue_b32() - predicate for FP32 lanes. static inline void SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset, svfloat32_t &sum, svbool_t pg, size_t chunk) { - // SQ8 -> uint32 (widen on load), then to FP32. svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); - - // FP16 query -> FP32. - // svcvt_f32_f16_x (FCVT) reads even-indexed FP16 elements: FP32[e] <- FP16[2e]. - // Our chunk FP16 values land at consecutive positions 0..chunk-1 after the load. - // svzip1 interleaves them with zeros → positions 0,2,4,... hold the values, - // so FCVT correctly reads v[0], v[1], v[2], ... into FP32[0..chunk-1]. - svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk)); - svfloat16_t q_h = - svld1_f16(pg16, reinterpret_cast(pVect2) + offset); - svfloat16_t q_h_spread = svzip1_f16(q_h, svdup_f16(0.0f)); - svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h_spread); - + svuint32_t q_u32 = + svld1uh_u32(pg, reinterpret_cast(pVect2 + offset)); + svfloat32_t v2_f = svcvt_f32_f16_x(pg, svreinterpret_f16_u32(q_u32)); sum = svmla_f32_x(pg, sum, v1_f, v2_f); offset += chunk; } -// pVect1v = SQ8 storage, pVect2v = FP16 query +// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher). template float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension) { - assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16"); - const uint8_t *pVect1 = static_cast(pVect1v); const float16 *pVect2 = static_cast(pVect2v); size_t offset = 0; @@ -67,28 +52,23 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v svfloat32_t sum2 = svdup_f32(0.0f); svfloat32_t sum3 = svdup_f32(0.0f); - // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero - - // the final reduction below walks all lanes via svptrue_b32(). + // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero; + // the final reduction walks all lanes via svptrue_b32(). if constexpr (partial_chunk) { size_t remaining = dimension % chunk; if (remaining > 0) { - svbool_t pg_partial = - svwhilelt_b32(uint32_t(0), uint32_t(remaining)); - svbool_t pg16_partial = - svwhilelt_b16(uint32_t(0), uint32_t(remaining)); + svbool_t pg_partial = svwhilelt_b32(uint32_t(0), uint32_t(remaining)); svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset); svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); - svfloat16_t q_h = svld1_f16( - pg16_partial, reinterpret_cast(pVect2) + offset); - // Same zip1 trick as the full step: spread values to even positions. - svfloat16_t q_h_spread = svzip1_f16(q_h, svdup_f16(0.0f)); - svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h_spread); + svuint32_t q_u32 = svld1uh_u32( + pg_partial, reinterpret_cast(pVect2 + offset)); + svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, svreinterpret_f16_u32(q_u32)); sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f); offset += remaining; } } - // Main loop: 4 chunks per iteration via 4 accumulators. + // Main loop: 4 chunks per iteration, one chunk per accumulator. const size_t chunk_size = 4 * chunk; const size_t number_of_chunks = (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; @@ -99,7 +79,6 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk); } - // Additional steps 0..3. if constexpr (additional_steps > 0) SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); if constexpr (additional_steps > 1) @@ -107,12 +86,11 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v if constexpr (additional_steps > 2) SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); - svfloat32_t sum = svadd_f32_x(pg, sum0, sum1); - sum = svadd_f32_x(pg, sum, sum2); - sum = svadd_f32_x(pg, sum, sum3); + svfloat32_t sum = svadd_f32_z(pg, sum0, sum1); + sum = svadd_f32_z(pg, sum, sum2); + sum = svadd_f32_z(pg, sum, sum3); float quantized_dot = svaddv_f32(pg, sum); - // Metadata loads - unaligned because odd dim leaves trailers unaligned. const uint8_t *params_bytes = static_cast(pVect1v) + dimension; const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h index 7bf5db986..15cc40f6a 100644 --- a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h +++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h @@ -9,11 +9,6 @@ #pragma once #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" -#include "VecSim/types/sq8.h" -#include "VecSim/types/float16.h" - -using sq8 = vecsim_types::sq8; -using float16 = vecsim_types::float16; /* * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity: diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h index 3c8e89ca6..e3592c24e 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h @@ -9,11 +9,6 @@ #pragma once #include "VecSim/spaces/space_includes.h" #include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" -#include "VecSim/types/sq8.h" -#include "VecSim/types/float16.h" - -using sq8 = vecsim_types::sq8; -using float16 = vecsim_types::float16; /* * SVE SQ8<->FP16 L2 squared distance: From 10c03aae9540c6ce769bee9b2a28f10eaac8e59a Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 31 May 2026 13:03:53 +0000 Subject: [PATCH 17/19] Apply clang-format [MOD-14972] --- src/VecSim/batch_iterator.h | 2 +- src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h | 28 ++++++---------- src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h | 33 ++++++++----------- src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h | 10 +++--- src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h | 7 ++-- tests/benchmark/bm_vecsim_svs.h | 14 ++++---- .../spaces_benchmarks/bm_spaces_sq8_fp16.cpp | 5 +-- tests/benchmark/types_ranges.h | 12 ++++--- tests/unit/test_allocator.cpp | 4 +-- 9 files changed, 53 insertions(+), 62 deletions(-) diff --git a/src/VecSim/batch_iterator.h b/src/VecSim/batch_iterator.h index 9e2791130..466072f86 100644 --- a/src/VecSim/batch_iterator.h +++ b/src/VecSim/batch_iterator.h @@ -27,7 +27,7 @@ struct VecSimBatchIterator : public VecsimBaseObject { explicit VecSimBatchIterator(void *query_vector, void *tctx, std::shared_ptr allocator) : VecsimBaseObject(allocator), query_vector(query_vector), returned_results_count(0), - timeoutCtx(tctx) {}; + timeoutCtx(tctx){}; virtual inline const void *getQueryBlob() const { return query_vector; } diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h index 8b8d37c0f..a5c2465fc 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h @@ -23,10 +23,9 @@ using float16 = vecsim_types::float16; */ // Helper: 16 lanes per call, four FP32 accumulators (one per quarter). -static inline void -SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2, - float32x4_t &sum0, float32x4_t &sum1, - float32x4_t &sum2, float32x4_t &sum3) { +static inline void SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2, + float32x4_t &sum0, float32x4_t &sum1, + float32x4_t &sum2, float32x4_t &sum3) { uint8x16_t v1_u8 = vld1q_u8(pVect1); uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8)); uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8)); @@ -77,8 +76,7 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p if constexpr (r >= 4) { uint8x8_t v1_u8 = vld1_u8(pVect1); float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); - float32x4_t v2_a = - vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); + float32x4_t v2_a = vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); sum0 = vfmaq_f32(sum0, v1_a, v2_a); pVect1 += 4; pVect2 += 4; @@ -86,8 +84,7 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p if constexpr (r >= 8) { uint8x8_t v1_u8 = vld1_u8(pVect1); float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); - float32x4_t v2_b = - vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); + float32x4_t v2_b = vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); sum1 = vfmaq_f32(sum1, v1_b, v2_b); pVect1 += 4; pVect2 += 4; @@ -95,8 +92,7 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p if constexpr (r >= 12) { uint8x8_t v1_u8 = vld1_u8(pVect1); float32x4_t v1_c = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8)))); - float32x4_t v2_c = - vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); + float32x4_t v2_c = vcvt_f32_f16(vld1_f16(reinterpret_cast(pVect2))); sum2 = vfmaq_f32(sum2, v1_c, v2_c); pVect1 += 4; pVect2 += 4; @@ -112,14 +108,11 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot; const uint8_t *params_bytes = static_cast(pVect1v) + dimension; - const float min_val = - load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); - const float delta = - load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); const uint8_t *query_meta_bytes = reinterpret_cast(static_cast(pVect2v) + dimension); - const float y_sum = - load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); return min_val * y_sum + delta * quantized_dot; } @@ -127,8 +120,7 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p template float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); + return 1.0f - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); } template diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h index 6c7a52529..1408e0880 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h @@ -25,13 +25,12 @@ using float16 = vecsim_types::float16; */ // Helper: one SVE-vector-width-of-FP32 step. -static inline void -SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset, - svfloat32_t &sum, svbool_t pg, size_t chunk) { +static inline void SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, + size_t &offset, svfloat32_t &sum, svbool_t pg, + size_t chunk) { svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); - svuint32_t q_u32 = - svld1uh_u32(pg, reinterpret_cast(pVect2 + offset)); + svuint32_t q_u32 = svld1uh_u32(pg, reinterpret_cast(pVect2 + offset)); svfloat32_t v2_f = svcvt_f32_f16_x(pg, svreinterpret_f16_u32(q_u32)); sum = svmla_f32_x(pg, sum, v1_f, v2_f); offset += chunk; @@ -60,8 +59,8 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v svbool_t pg_partial = svwhilelt_b32(uint32_t(0), uint32_t(remaining)); svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset); svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); - svuint32_t q_u32 = svld1uh_u32( - pg_partial, reinterpret_cast(pVect2 + offset)); + svuint32_t q_u32 = + svld1uh_u32(pg_partial, reinterpret_cast(pVect2 + offset)); svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, svreinterpret_f16_u32(q_u32)); sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f); offset += remaining; @@ -92,27 +91,23 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v float quantized_dot = svaddv_f32(pg, sum); const uint8_t *params_bytes = static_cast(pVect1v) + dimension; - const float min_val = - load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); - const float delta = - load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); - const uint8_t *query_meta_bytes = reinterpret_cast( - static_cast(pVect2v) + dimension); - const float y_sum = - load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); + const float min_val = load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); + const float delta = load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); + const float y_sum = load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); return min_val * y_sum + delta * quantized_dot; } template -float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, - size_t dimension) { +float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP( pVect1v, pVect2v, dimension); } template float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - return SQ8_FP16_InnerProductSIMD_SVE( - pVect1v, pVect2v, dimension); + return SQ8_FP16_InnerProductSIMD_SVE(pVect1v, pVect2v, + dimension); } diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h index 15cc40f6a..70367d7fe 100644 --- a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h +++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h @@ -21,15 +21,13 @@ template // 0..15 float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { - const float ip = - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); + const float ip = SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); const uint8_t *params_bytes = static_cast(pVect1v) + dimension; - const float x_sum_sq = - load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); - const uint8_t *query_meta_bytes = reinterpret_cast( - static_cast(pVect2v) + dimension); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); const float y_sum_sq = load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h index e3592c24e..f70ef493d 100644 --- a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h +++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h @@ -22,10 +22,9 @@ float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t di pVect1v, pVect2v, dimension); const uint8_t *params_bytes = static_cast(pVect1v) + dimension; - const float x_sum_sq = - load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); - const uint8_t *query_meta_bytes = reinterpret_cast( - static_cast(pVect2v) + dimension); + const float x_sum_sq = load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); + const uint8_t *query_meta_bytes = + reinterpret_cast(static_cast(pVect2v) + dimension); const float y_sum_sq = load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); diff --git a/tests/benchmark/bm_vecsim_svs.h b/tests/benchmark/bm_vecsim_svs.h index 5acb882c0..b92cce5e0 100644 --- a/tests/benchmark/bm_vecsim_svs.h +++ b/tests/benchmark/bm_vecsim_svs.h @@ -466,17 +466,19 @@ void BM_VecSimSVS::RunGC(benchmark::State &st) { #define UNIT_AND_ITERATIONS Unit(benchmark::kMillisecond)->Iterations(2) #if HAVE_SVS_LVQ -#define QUANT_BITS_ARGS {VecSimSvsQuant_8, VecSimSvsQuant_4x8_LeanVec} +#define QUANT_BITS_ARGS \ + { VecSimSvsQuant_8, VecSimSvsQuant_4x8_LeanVec } #define COMPRESSED_TRAINING_THRESHOLD_ARGS \ - {static_cast(BM_VecSimGeneral::block_size), 5000, 10000} + { static_cast(BM_VecSimGeneral::block_size), 5000, 10000 } #define COMPRESSED_ASYNC_TRAINING_THRESHOLD_ARGS \ - {static_cast(BM_VecSimGeneral::block_size), 5000, 10000, 50000} + { static_cast(BM_VecSimGeneral::block_size), 5000, 10000, 50000 } #else -#define QUANT_BITS_ARGS {VecSimSvsQuant_8} +#define QUANT_BITS_ARGS \ + { VecSimSvsQuant_8 } // Using smaller training TH to avoid long test times without LVQ #define COMPRESSED_TRAINING_THRESHOLD_ARGS \ - {static_cast(BM_VecSimGeneral::block_size), 5000} + { static_cast(BM_VecSimGeneral::block_size), 5000 } #define COMPRESSED_ASYNC_TRAINING_THRESHOLD_ARGS \ - {static_cast(BM_VecSimGeneral::block_size), 5000} + { static_cast(BM_VecSimGeneral::block_size), 5000 } #endif diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp index 9ec022e39..cc5d040cb 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp @@ -16,7 +16,8 @@ using float16 = vecsim_types::float16; /** * SQ8-to-FP16 benchmarks: SQ8 quantized storage with FP16 query. * Registers the naive (scalar) baseline plus per-ISA SIMD variants (x86: AVX-512 / AVX2+FMA / - * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels (NEON_HP / SVE / SVE2) are registered below. + * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels (NEON_HP + * / SVE / SVE2) are registered below. */ class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture { protected: @@ -103,7 +104,7 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sv bool neon_hp_supported = arm_opt.asimdhp; INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, - neon_hp_supported); + neon_hp_supported); #endif #endif // CPU_FEATURES_ARCH_AARCH64 diff --git a/tests/benchmark/types_ranges.h b/tests/benchmark/types_ranges.h index 43abda8f0..deff4251c 100644 --- a/tests/benchmark/types_ranges.h +++ b/tests/benchmark/types_ranges.h @@ -11,9 +11,11 @@ #include #include "bm_definitions.h" -#define DEFAULT_RANGE_RADII {20, 35, 50} +#define DEFAULT_RANGE_RADII \ + { 20, 35, 50 } -#define DEFAULT_RANGE_EPSILONS {1, 10, 11} +#define DEFAULT_RANGE_EPSILONS \ + { 1, 10, 11 } // This template struct methods returns the default values for radii and epsilons // To specify different values for a certain type, use template specialization @@ -25,7 +27,8 @@ struct benchmark_range { // Larger Range query values are required for int8 wikipedia dataset. // Default values give 0 results -#define INT8_RANGE_RADII {50, 65, 80} +#define INT8_RANGE_RADII \ + { 50, 65, 80 } template <> struct benchmark_range { @@ -34,7 +37,8 @@ struct benchmark_range { }; // UINT8 ranges -#define UINT8_RANGE_RADII {4, 5, 7} +#define UINT8_RANGE_RADII \ + { 4, 5, 7 } template <> struct benchmark_range { diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp index 6aa4a0d0b..77db41684 100644 --- a/tests/unit/test_allocator.cpp +++ b/tests/unit/test_allocator.cpp @@ -33,7 +33,7 @@ struct ObjectWithSTL : public VecsimBaseObject { public: ObjectWithSTL(std::shared_ptr allocator) - : VecsimBaseObject(allocator), test_vec(allocator) {}; + : VecsimBaseObject(allocator), test_vec(allocator){}; }; struct NestedObject : public VecsimBaseObject { @@ -42,7 +42,7 @@ struct NestedObject : public VecsimBaseObject { public: NestedObject(std::shared_ptr allocator) - : VecsimBaseObject(allocator), stl_object(allocator), simpleObject(allocator) {}; + : VecsimBaseObject(allocator), stl_object(allocator), simpleObject(allocator){}; }; TEST_F(AllocatorTest, test_simple_object) { From 9741cfb62a92cf0b04b78d43c50a89789deed9e1 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 31 May 2026 13:20:58 +0000 Subject: [PATCH 18/19] Trim PR churn: remove docs, dispatcher comments, and test verbosity [MOD-14972] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove docs/superpowers/ design and plan files (~1550 lines); sister PR #970 removed its equivalent doc before merge. - Drop 5-line "No alignment write" prose comment from the three AArch64 NEON_HP dispatcher blocks; the sister SQ8_FP32 ARM dispatchers carry no such comment — the absent alignment write already encodes the intent. - Trim GetDistFuncSQ8FP16Asymmetric to a 7-line template-mapping check at dim=15, matching the shape of GetDistFuncSQ8Asymmetric (SQ8_FP32 sister). The scalar-fallback assertion it previously duplicated is already covered by the trailing block of SQ8_FP16_SpacesOptimizationTest. --- .../plans/2026-05-28-arm-sq8-fp16-kernels.md | 1195 ----------------- .../specs/2026-05-28-arm-sq8-fp16-design.md | 354 ----- src/VecSim/spaces/IP_space.cpp | 6 - src/VecSim/spaces/L2_space.cpp | 3 - 4 files changed, 1558 deletions(-) delete mode 100644 docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md delete mode 100644 docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md diff --git a/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md b/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md deleted file mode 100644 index 2759ba046..000000000 --- a/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md +++ /dev/null @@ -1,1195 +0,0 @@ -# SQ8↔FP16 ARM SIMD Distance Kernels — Implementation Plan - -> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. - -**Goal:** Add SQ8↔FP16 asymmetric distance kernels (IP, L2, Cosine) for ARM ISA tiers — NEON_HP, SVE, SVE2 — plugged into the existing dispatcher. Mirrors the x86 work delivered in PR #970. - -**Architecture:** Header-only SIMD kernel templates (one per metric × ISA), instantiated via the existing `CHOOSE_IMPLEMENTATION` / `CHOOSE_SVE_IMPLEMENTATION` macros inside ISA-specific TUs (`NEON_HP.cpp`, `SVE.cpp`, `SVE2.cpp`). Wiring lives in `IP_space.cpp` and `L2_space.cpp` under a `#ifdef CPU_FEATURES_ARCH_AARCH64` block that parallels the existing x86 block. L2 reuses the IP `_IMP` template via the algebraic identity `L2² = x_sum_sq + y_sum_sq − 2·IP`. Scalar fallback already on `main` is unchanged and stays as the reference for every tier. - -**Tech Stack:** C++20, ARM NEON intrinsics (`arm_neon.h`), ARM SVE/SVE2 intrinsics (`arm_sve.h`), GoogleTest, Google Benchmark, cpu_features. - -**Branch:** `dor-forer-sq8-fp16-arm-kernels-mod-14972` (stacked on PR #970 / `dor-forer-sq8-fp16-x86-kernels-mod-14954`). - -**Build / test loop:** The user runs `make build` (per project memory). After each build cycle confirmed, the assistant runs `make unit_test` / ASan / benchmarks on the appropriate host (ARM hardware or cross-compile/qemu — coordinate with user). Each task ends in a commit; commits are pushed only when explicitly requested. - -**Spec:** [`docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md`](../specs/2026-05-28-arm-sq8-fp16-design.md) - ---- - -## File Structure - -### Files created - -| Path | Responsibility | -|------|----------------| -| `src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h` | NEON IP kernel template (`SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP` + thin wrappers) | -| `src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h` | NEON L2 kernel template (calls NEON IP impl, applies L2 identity) | -| `src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h` | SVE IP kernel template (`SQ8_FP16_InnerProductSIMD_SVE_IMP` + wrappers); also `#include`d from SVE2.cpp | -| `src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h` | SVE L2 kernel template; also `#include`d from SVE2.cpp | - -### Files modified - -| Path | Change | -|------|--------| -| `src/VecSim/spaces/functions/NEON_HP.h` | +3 chooser declarations (IP, L2, Cosine) | -| `src/VecSim/spaces/functions/NEON_HP.cpp` | +#include kernel headers; +3 chooser definitions | -| `src/VecSim/spaces/functions/SVE.h` | +3 chooser declarations | -| `src/VecSim/spaces/functions/SVE.cpp` | +#include kernel headers; +3 chooser definitions | -| `src/VecSim/spaces/functions/SVE2.h` | +3 chooser declarations | -| `src/VecSim/spaces/functions/SVE2.cpp` | +#include SVE kernel headers; +3 chooser definitions (own symbols, templates instantiated under SVE2 compile flags) | -| `src/VecSim/spaces/IP_space.cpp` | +#ifdef AArch64 block in `IP_SQ8_FP16_GetDistFunc` and `Cosine_SQ8_FP16_GetDistFunc` (2 dispatcher blocks) | -| `src/VecSim/spaces/L2_space.cpp` | +#ifdef AArch64 block in `L2_SQ8_FP16_GetDistFunc` (1 dispatcher block) | -| `tests/unit/test_spaces.cpp` | retarget `GetDistFuncSQ8FP16Asymmetric` to dim=15; add dim=0 test; extend the three `SQ8_FP16_SpacesOptimizationTest` test bodies with ARM tier walks; extend `SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` with AArch64 tier reporting | -| `tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` | +AArch64 `cpu_features` block; +ARM ISA benchmark registrations | - -### Files NOT modified - -`src/VecSim/spaces/CMakeLists.txt` — zero CMake changes. Existing TU flags (`-march=armv8.2-a+fp16fml` for NEON_HP, `-march=armv8-a+sve` for SVE, `-march=armv9-a+sve2` for SVE2) already carry everything the new kernels need. - ---- - -## Task 1: Retarget the scalar-fallback dispatcher test - -**Why first:** Builds and runs on x86 today, has nothing to do with the ARM kernels, and tightens the contract the rest of the plan relies on (the dispatcher returns scalar for `dim < 16`). - -**Files:** -- Modify: `tests/unit/test_spaces.cpp` — locate test named `GetDistFuncSQ8FP16Asymmetric` (added by PR #970; currently asserts `dim=128` returns the scalar fallback) - -- [ ] **Step 1: Locate the existing test** - -Run: -```bash -grep -n 'GetDistFuncSQ8FP16Asymmetric' tests/unit/test_spaces.cpp -``` -Expected: one or more line hits pointing at the `TEST(...)` block. - -- [ ] **Step 2: Modify the test to cover dim=0 and dim=15 instead of dim=128** - -Replace the body of the existing `TEST(..., GetDistFuncSQ8FP16Asymmetric)` so it walks two below-threshold dims and asserts the scalar fallback for each of L2 / IP / Cosine. Drop in this exact body (rename the test fixture symbol to match what is already there if it differs): - -```cpp -TEST_F(SpacesTest, GetDistFuncSQ8FP16Asymmetric) { - // SQ8 storage with FP16 query (asymmetric) - should return SQ8_FP16 functions. - // Per-ISA dispatcher walk coverage lives in the SQ8_FP16 SpacesOptimizationTest below. - // - // Walk two below-threshold dims (0 and 15) so the assertions hold regardless of which - // SIMD tiers the host advertises: dim < 16 must always short-circuit to scalar fallback. - // The template-mapping form (spaces::GetDistFunc) and the direct - // *_SQ8_FP16_GetDistFunc form must agree for every dim, and both must match the scalar - // reference at sub-threshold dims. - for (size_t dim : {static_cast(0), static_cast(15)}) { - auto l2_func = spaces::GetDistFunc(VecSimMetric_L2, dim, nullptr); - auto ip_func = spaces::GetDistFunc(VecSimMetric_IP, dim, nullptr); - auto cosine_func = - spaces::GetDistFunc(VecSimMetric_Cosine, dim, nullptr); - - ASSERT_EQ(l2_func, L2_SQ8_FP16_GetDistFunc(dim, nullptr)) - << "Template mapping disagrees with direct dispatcher for L2 at dim=" << dim; - ASSERT_EQ(ip_func, IP_SQ8_FP16_GetDistFunc(dim, nullptr)) - << "Template mapping disagrees with direct dispatcher for IP at dim=" << dim; - ASSERT_EQ(cosine_func, Cosine_SQ8_FP16_GetDistFunc(dim, nullptr)) - << "Template mapping disagrees with direct dispatcher for Cosine at dim=" << dim; - - ASSERT_EQ(l2_func, SQ8_FP16_L2Sqr) - << "dim=" << dim << " must short-circuit to scalar L2 fallback"; - ASSERT_EQ(ip_func, SQ8_FP16_InnerProduct) - << "dim=" << dim << " must short-circuit to scalar IP fallback"; - ASSERT_EQ(cosine_func, SQ8_FP16_Cosine) - << "dim=" << dim << " must short-circuit to scalar Cosine fallback"; - } -} -``` - -- [ ] **Step 3: User builds** - -Ask the user to run `make build` (their normal x86 build is sufficient — this test is host-agnostic). - -- [ ] **Step 4: Run the test** - -Run: -```bash -./bin//unit_tests --gtest_filter='SpacesTest.GetDistFuncSQ8FP16Asymmetric' -``` -(Use `find bin -name unit_tests -type f` if the host-triple subdir is unknown.) -Expected: PASS. - -- [ ] **Step 5: Commit** - -```bash -git add tests/unit/test_spaces.cpp -git commit -m "Retarget SQ8↔FP16 scalar-fallback dispatcher test to dim=0/15 [MOD-14972]" -``` - ---- - -## Task 2: NEON IP kernel header - -**Files:** -- Create: `src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h` - -- [ ] **Step 1: Author the kernel file** - -Create exactly this file (modeled on `IP_NEON_SQ8_FP32.h` + the NEON FP16 widening pattern from `IP_NEON_FP16.h`): - -```cpp -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include "VecSim/types/sq8.h" -#include "VecSim/types/float16.h" -#include -#include - -using sq8 = vecsim_types::sq8; -using float16 = vecsim_types::float16; - -/* - * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity: - * - * IP(x, y) = sum(x_i * y_i) - * ~= sum((min + delta * q_i) * y_i) - * = min * y_sum + delta * sum(q_i * y_i) - * - * The hot loop only accumulates sum(q_i * y_i) - no per-element dequantisation. - * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk. - */ - -// Helper: 16 lanes per call, four FP32 accumulators (one per quarter). -static inline void -SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2, - float32x4_t &sum0, float32x4_t &sum1, - float32x4_t &sum2, float32x4_t &sum3) { - // SQ8 storage: 16 * uint8 -> 4 * float32x4_t - uint8x16_t v1_u8 = vld1q_u8(pVect1); - uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8)); - uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8)); - float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo))); - float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo))); - float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi))); - float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi))); - - // FP16 query: 16 * f16 -> 4 * float32x4_t via vcvt_f32_f16 - const float16_t *q = reinterpret_cast(pVect2); - float16x8_t q_lo = vld1q_f16(q); - float16x8_t q_hi = vld1q_f16(q + 8); - float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo)); - float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo)); - float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi)); - float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi)); - - sum0 = vfmaq_f32(sum0, v1_0, v2_0); - sum1 = vfmaq_f32(sum1, v1_1, v2_1); - sum2 = vfmaq_f32(sum2, v1_2, v2_2); - sum3 = vfmaq_f32(sum3, v1_3, v2_3); - - pVect1 += 16; - pVect2 += 16; -} - -// pVect1v = SQ8 storage, pVect2v = FP16 query -template // 0..15 -float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, - size_t dimension) { - assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16"); - - const uint8_t *pVect1 = static_cast(pVect1v); // SQ8 storage - const float16 *pVect2 = static_cast(pVect2v); // FP16 query - - float32x4_t sum0 = vdupq_n_f32(0.0f); - float32x4_t sum1 = vdupq_n_f32(0.0f); - float32x4_t sum2 = vdupq_n_f32(0.0f); - float32x4_t sum3 = vdupq_n_f32(0.0f); - - const size_t num_of_chunks = dimension / 16; - for (size_t i = 0; i < num_of_chunks; i++) { - SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3); - } - - // Residual handling: dim % 16 lanes. - // residual >= 8: one safe 8-lane SQ8 + 8-lane FP16 load (FP16 trailer is wide enough). - // residual < 8: scalar-only - a 4-lane FP16 load would overread y_sum metadata. - constexpr unsigned char r = residual; - if constexpr (r >= 8) { - uint8x8_t v1_u8 = vld1_u8(pVect1); - uint16x8_t v1_u16 = vmovl_u8(v1_u8); - float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_u16))); - float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_u16))); - float16x8_t q_h = vld1q_f16(reinterpret_cast(pVect2)); - float32x4_t v2_a = vcvt_f32_f16(vget_low_f16(q_h)); - float32x4_t v2_b = vcvt_f32_f16(vget_high_f16(q_h)); - sum0 = vfmaq_f32(sum0, v1_a, v2_a); - sum1 = vfmaq_f32(sum1, v1_b, v2_b); - pVect1 += 8; - pVect2 += 8; - } - // Lane-by-lane scalar for the final 0..7 (residual % 8) elements. - constexpr unsigned char tail = r & 0x7; - float scalar_dot = 0.0f; - for (unsigned char k = 0; k < tail; ++k) { - scalar_dot += static_cast(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]); - } - - // Reduce the four NEON accumulators. - float32x4_t sum_lo = vaddq_f32(sum0, sum1); - float32x4_t sum_hi = vaddq_f32(sum2, sum3); - float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot; - - // Metadata loads - use load_unaligned because odd dim leaves trailers unaligned. - const uint8_t *params_bytes = static_cast(pVect1v) + dimension; - const float min_val = - load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); - const float delta = - load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); - const uint8_t *query_meta_bytes = - reinterpret_cast(static_cast(pVect2v) + dimension); - const float y_sum = - load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); - - return min_val * y_sum + delta * quantized_dot; -} - -template -float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, - size_t dimension) { - return 1.0f - - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); -} - -template -float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Cosine = 1 - IP (vectors are pre-normalised); reuses the IP wrapper. - return SQ8_FP16_InnerProductSIMD16_NEON_HP(pVect1v, pVect2v, dimension); -} -``` - -- [ ] **Step 2: Header-only smoke (no build yet)** - -Run: -```bash -grep -n 'load_unaligned\|FP16_to_FP32' src/VecSim/spaces/space_includes.h \ - src/VecSim/spaces/IP/IP.cpp src/VecSim/types/float16.h 2>/dev/null -``` -Expected: confirm the global `load_unaligned` is reachable through `space_includes.h` (matches the include path used by `IP_NEON_SQ8_FP32.h`) and `FP16_to_FP32` is reachable through `VecSim/types/float16.h`. If either include is missing, add it. - -- [ ] **Step 3: Commit** - -```bash -git add src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h -git commit -m "Add NEON_HP SQ8↔FP16 IP kernel header [MOD-14972]" -``` - ---- - -## Task 3: NEON L2 kernel header - -**Files:** -- Create: `src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h` - -- [ ] **Step 1: Author the kernel file** - -```cpp -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" -#include "VecSim/types/sq8.h" -#include "VecSim/types/float16.h" - -using sq8 = vecsim_types::sq8; -using float16 = vecsim_types::float16; - -/* - * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity: - * - * ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2) - * = x_sum_squares - 2 * IP(x, y) + y_sum_squares - * - * IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32. - */ - -template // 0..15 -float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) { - const float ip = - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(pVect1v, pVect2v, dimension); - - const uint8_t *params_bytes = static_cast(pVect1v) + dimension; - const float x_sum_sq = - load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); - - const uint8_t *query_meta_bytes = reinterpret_cast( - static_cast(pVect2v) + dimension); - const float y_sum_sq = - load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); - - return x_sum_sq + y_sum_sq - 2.0f * ip; -} -``` - -- [ ] **Step 2: Commit** - -```bash -git add src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h -git commit -m "Add NEON_HP SQ8↔FP16 L2 kernel header [MOD-14972]" -``` - ---- - -## Task 4: NEON_HP dispatcher TU additions - -**Files:** -- Modify: `src/VecSim/spaces/functions/NEON_HP.h` — add 3 declarations -- Modify: `src/VecSim/spaces/functions/NEON_HP.cpp` — add 3 chooser definitions - -- [ ] **Step 1: Add chooser declarations to NEON_HP.h** - -In `src/VecSim/spaces/functions/NEON_HP.h`, inside `namespace spaces { ... }`, append these three declarations alongside the existing `Choose_FP16_*_implementation_NEON_HP`: - -```cpp -dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim); -dist_func_t Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim); -dist_func_t Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim); -``` - -- [ ] **Step 2: Add chooser definitions to NEON_HP.cpp** - -In `src/VecSim/spaces/functions/NEON_HP.cpp`, add the kernel `#include`s alongside the existing FP16 includes: - -```cpp -#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h" -#include "VecSim/spaces/L2/L2_NEON_SQ8_FP16.h" -``` - -Then inside `namespace spaces { ... }` (between `#include "implementation_chooser.h"` and `#include "implementation_chooser_cleanup.h"`), append: - -```cpp -dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_NEON_HP); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_NEON_HP); - return ret_dist_func; -} -``` - -- [ ] **Step 3: Commit** - -```bash -git add src/VecSim/spaces/functions/NEON_HP.h src/VecSim/spaces/functions/NEON_HP.cpp -git commit -m "Wire NEON_HP SQ8↔FP16 choosers [MOD-14972]" -``` - ---- - -## Task 5: NEON_HP dispatcher wiring in IP_space.cpp + L2_space.cpp - -**Files:** -- Modify: `src/VecSim/spaces/IP_space.cpp` — `IP_SQ8_FP16_GetDistFunc` + `Cosine_SQ8_FP16_GetDistFunc` -- Modify: `src/VecSim/spaces/L2_space.cpp` — `L2_SQ8_FP16_GetDistFunc` - -Each of those three `_GetDistFunc` functions currently has an `#ifdef CPU_FEATURES_ARCH_X86_64` block with an early `if (dim < 16) return ret_dist_func;` guard followed by per-tier dispatch. We append an `#ifdef CPU_FEATURES_ARCH_AARCH64` block with the matching shape. Only NEON_HP is wired in this task; SVE/SVE2 land in a later task. - -- [ ] **Step 1: Confirm the #include for NEON_HP.h is present** - -Run: -```bash -grep -n 'functions/NEON_HP.h' src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp -``` -Expected: both files already `#include "VecSim/spaces/functions/NEON_HP.h"`. If a file is missing it, add the include. - -- [ ] **Step 2: Wire IP_SQ8_FP16_GetDistFunc** - -In `src/VecSim/spaces/IP_space.cpp`, locate `IP_SQ8_FP16_GetDistFunc`. After the closing `#endif // x86_64`, insert a parallel AArch64 block immediately before the trailing `return ret_dist_func;`: - -```cpp -#ifdef CPU_FEATURES_ARCH_AARCH64 - if (dim < 16) { - return ret_dist_func; - } -#ifdef OPT_NEON_HP - if (features.asimdhp) { - // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers - // leave *alignment untouched on ARM tiers. The corresponding tests assert - // 0xFF passthrough on the scalar path and do not assert any non-zero value here. - return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim); - } -#endif -#endif // CPU_FEATURES_ARCH_AARCH64 -``` - -- [ ] **Step 3: Wire Cosine_SQ8_FP16_GetDistFunc** - -In the same file, locate `Cosine_SQ8_FP16_GetDistFunc`. Insert the same block, swapping `Choose_SQ8_FP16_IP_implementation_NEON_HP` for `Choose_SQ8_FP16_Cosine_implementation_NEON_HP`. - -- [ ] **Step 4: Wire L2_SQ8_FP16_GetDistFunc** - -In `src/VecSim/spaces/L2_space.cpp`, locate `L2_SQ8_FP16_GetDistFunc`. Insert the same block, swapping the call for `Choose_SQ8_FP16_L2_implementation_NEON_HP`. - -- [ ] **Step 5: User builds** - -Ask the user to run `make build` — first time the new NEON_HP TU additions compile. If they have ARM hardware or a cross-compile target, that build path; otherwise the x86 build must at least confirm the new headers don't accidentally break non-ARM compilation (the new headers are only `#include`d from `NEON_HP.cpp`, which is excluded on non-ARM hosts, so x86 builds should be clean). - -- [ ] **Step 6: Commit** - -```bash -git add src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp -git commit -m "Dispatch SQ8↔FP16 to NEON_HP tier on AArch64 [MOD-14972]" -``` - ---- - -## Task 6: Extend `SQ8_FP16_SpacesOptimizationTest` with NEON_HP tier-walk - -**Files:** -- Modify: `tests/unit/test_spaces.cpp` — three test bodies (`SQ8_FP16_L2SqrTest`, `SQ8_FP16_InnerProductTest`, `SQ8_FP16_CosineTest`) - -After the existing `#ifdef OPT_SSE4` block in each test, append: - -- [ ] **Step 1: Add NEON_HP tier to L2 test** - -In `SQ8_FP16_L2SqrTest`, immediately after the closing `#endif` that follows the SSE4 block and before `// Scalar fallback`: - -```cpp -#ifdef OPT_NEON_HP - if (optimization.asimdhp) { - unsigned char alignment = 0; - arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) - << "NEON_HP with dim " << dim; - optimization.asimdhp = 0; - } -#endif -``` - -- [ ] **Step 2: Add NEON_HP tier to IP test** - -In `SQ8_FP16_InnerProductTest`, append the same block but swap `L2_SQ8_FP16_GetDistFunc` → `IP_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_L2_implementation_NEON_HP` → `Choose_SQ8_FP16_IP_implementation_NEON_HP`. - -- [ ] **Step 3: Add NEON_HP tier to Cosine test** - -In `SQ8_FP16_CosineTest`, append the same block with `Cosine_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_Cosine_implementation_NEON_HP`. - -- [ ] **Step 4: Confirm the include path for the NEON_HP chooser declarations** - -Run: -```bash -grep -n 'functions/NEON_HP.h' tests/unit/test_spaces.cpp -``` -Expected: include present. If not, add `#include "VecSim/spaces/functions/NEON_HP.h"` near the other space-function includes at the top of the file. - -- [ ] **Step 5: User builds (ARM target)** - -Ask the user to run `make build` for an ARM target (hardware or cross-compile). On x86 the new test code is gated by `#ifdef OPT_NEON_HP` and stays inert. - -- [ ] **Step 6: Run NEON_HP tests** - -Once the ARM build is reported clean, run: -```bash -./bin//unit_tests --gtest_filter='SQ8_FP16_*Test*' -``` -Expected: all parametrized cases PASS, including the dims-16..32 and high-dim suites. - -- [ ] **Step 7: Commit** - -```bash -git add tests/unit/test_spaces.cpp -git commit -m "Extend SQ8↔FP16 tier-walk tests with NEON_HP [MOD-14972]" -``` - ---- - -## Task 7: SVE IP kernel header - -**Files:** -- Create: `src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h` - -- [ ] **Step 1: Author the kernel file** - -Modeled on `IP_SVE_SQ8_FP32.h`. The shape: an `InnerProductStep` helper that consumes `chunk = svcntw()` FP32 lanes per call (FP16 query loaded under a `b16` predicate, SQ8 storage under a `b32` predicate that drives uint8→uint32 widening), then a templated `_IMP` over ``. - -```cpp -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include "VecSim/types/sq8.h" -#include "VecSim/types/float16.h" -#include -#include - -using sq8 = vecsim_types::sq8; -using float16 = vecsim_types::float16; - -/* - * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity: - * - * IP(x, y) ~= min * y_sum + delta * sum(q_i * y_i) - * - * Hot loop accumulates sum(q_i * y_i) only; FP16 query lanes are widened to FP32 - * inside each step via svcvt_f32_f16_x. Metadata loads use load_unaligned. - */ - -// Helper: one SVE-vector-width-of-FP32 step. -// chunk = svcntw() - number of FP32 lanes per step. -// pg = svptrue_b32() - predicate for FP32 lanes. -static inline void -SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset, - svfloat32_t &sum, svbool_t pg, size_t chunk) { - // SQ8 -> uint32 (widen on load), then to FP32. - svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); - svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); - - // FP16 query -> FP32. svld1_f16 uses a b16 predicate sized to `chunk` half lanes. - svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk)); - svfloat16_t q_h = - svld1_f16(pg16, reinterpret_cast(pVect2) + offset); - svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h); - - sum = svmla_f32_x(pg, sum, v1_f, v2_f); - offset += chunk; -} - -// pVect1v = SQ8 storage, pVect2v = FP16 query -template -float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, - size_t dimension) { - assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16"); - - const uint8_t *pVect1 = static_cast(pVect1v); - const float16 *pVect2 = static_cast(pVect2v); - size_t offset = 0; - svbool_t pg = svptrue_b32(); - const size_t chunk = svcntw(); - - svfloat32_t sum0 = svdup_f32(0.0f); - svfloat32_t sum1 = svdup_f32(0.0f); - svfloat32_t sum2 = svdup_f32(0.0f); - svfloat32_t sum3 = svdup_f32(0.0f); - - // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero - - // the final reduction below walks all lanes via svptrue_b32(). - if constexpr (partial_chunk) { - size_t remaining = dimension % chunk; - if (remaining > 0) { - svbool_t pg_partial = - svwhilelt_b32(uint32_t(0), uint32_t(remaining)); - svbool_t pg16_partial = - svwhilelt_b16(uint32_t(0), uint32_t(remaining)); - svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset); - svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32); - svfloat16_t q_h = svld1_f16( - pg16_partial, reinterpret_cast(pVect2) + offset); - svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h); - sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f); - offset += remaining; - } - } - - // Main loop: 4 chunks per iteration via 4 accumulators. - const size_t chunk_size = 4 * chunk; - const size_t number_of_chunks = - (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size; - for (size_t i = 0; i < number_of_chunks; i++) { - SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); - SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk); - SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); - SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk); - } - - // Additional steps 0..3. - if constexpr (additional_steps > 0) - SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk); - if constexpr (additional_steps > 1) - SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk); - if constexpr (additional_steps > 2) - SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk); - - svfloat32_t sum = svadd_f32_x(pg, sum0, sum1); - sum = svadd_f32_x(pg, sum, sum2); - sum = svadd_f32_x(pg, sum, sum3); - float quantized_dot = svaddv_f32(pg, sum); - - // Metadata loads - unaligned because odd dim leaves trailers unaligned. - const uint8_t *params_bytes = static_cast(pVect1v) + dimension; - const float min_val = - load_unaligned(params_bytes + sq8::MIN_VAL * sizeof(float)); - const float delta = - load_unaligned(params_bytes + sq8::DELTA * sizeof(float)); - const uint8_t *query_meta_bytes = reinterpret_cast( - static_cast(pVect2v) + dimension); - const float y_sum = - load_unaligned(query_meta_bytes + sq8::SUM_QUERY * sizeof(float)); - - return min_val * y_sum + delta * quantized_dot; -} - -template -float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, - size_t dimension) { - return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP( - pVect1v, pVect2v, dimension); -} - -template -float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - return SQ8_FP16_InnerProductSIMD_SVE( - pVect1v, pVect2v, dimension); -} -``` - -**Note for the implementer:** `svcvt_f32_f16_x(pg, q_h)` widens *the lower half of `q_h`'s lanes* to FP32 (one widening, b32-predicated). If the ACLE on the target toolchain rejects this pairing (e.g. ARM RVCT vs LLVM disagreement), verify the FP16->FP32 widening sequence against the actual ARM build output and adjust as needed (potential alternatives: explicit `svunpklo_*` unpack-then-widen, or operating on the lower half lanes by reinterpretation). Commit only after the build is clean. Do not blindly copy `IP_SVE_FP16.h`'s pattern - that file accumulates in FP16 and is not a direct widening reference. - -- [ ] **Step 2: Commit** - -```bash -git add src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h -git commit -m "Add SVE SQ8↔FP16 IP kernel header [MOD-14972]" -``` - ---- - -## Task 8: SVE L2 kernel header - -**Files:** -- Create: `src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h` - -- [ ] **Step 1: Author the kernel file** - -```cpp -/* - * Copyright (c) 2006-Present, Redis Ltd. - * All rights reserved. - * - * Licensed under your choice of the Redis Source Available License 2.0 - * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the - * GNU Affero General Public License v3 (AGPLv3). - */ -#pragma once -#include "VecSim/spaces/space_includes.h" -#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" -#include "VecSim/types/sq8.h" -#include "VecSim/types/float16.h" - -using sq8 = vecsim_types::sq8; -using float16 = vecsim_types::float16; - -/* - * SVE SQ8<->FP16 L2 squared distance: - * ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares - * IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32. - */ - -template -float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) { - const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP( - pVect1v, pVect2v, dimension); - - const uint8_t *params_bytes = static_cast(pVect1v) + dimension; - const float x_sum_sq = - load_unaligned(params_bytes + sq8::SUM_SQUARES * sizeof(float)); - const uint8_t *query_meta_bytes = reinterpret_cast( - static_cast(pVect2v) + dimension); - const float y_sum_sq = - load_unaligned(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float)); - - return x_sum_sq + y_sum_sq - 2.0f * ip; -} -``` - -- [ ] **Step 2: Commit** - -```bash -git add src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h -git commit -m "Add SVE SQ8↔FP16 L2 kernel header [MOD-14972]" -``` - ---- - -## Task 9: SVE + SVE2 dispatcher TU additions - -**Files:** -- Modify: `src/VecSim/spaces/functions/SVE.h` — +3 declarations -- Modify: `src/VecSim/spaces/functions/SVE.cpp` — +#includes; +3 chooser definitions -- Modify: `src/VecSim/spaces/functions/SVE2.h` — +3 declarations -- Modify: `src/VecSim/spaces/functions/SVE2.cpp` — +#includes; +3 chooser definitions (own symbols, template instantiated under SVE2 flags) - -- [ ] **Step 1: Declarations in SVE.h** - -Inside `namespace spaces { ... }`, alongside the existing `Choose_SQ8_FP32_*_SVE` declarations: - -```cpp -dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim); -dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim); -dist_func_t Choose_SQ8_FP16_L2_implementation_SVE(size_t dim); -``` - -- [ ] **Step 2: Definitions in SVE.cpp** - -Add includes alongside the existing SQ8_FP32 includes: - -```cpp -#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" -#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" -``` - -Inside `namespace spaces { ... }` (between `implementation_chooser.h` and `implementation_chooser_cleanup.h`), append: - -```cpp -dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_FP16_L2_implementation_SVE(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw); - return ret_dist_func; -} -``` - -- [ ] **Step 3: Declarations in SVE2.h** - -```cpp -dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim); -dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim); -dist_func_t Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim); -``` - -- [ ] **Step 4: Definitions in SVE2.cpp** - -Add includes alongside the existing SQ8_FP32 includes — note the SVE header is included from SVE2 (SVE2 instantiates the template under SVE2 compile flags): - -```cpp -#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE -``` - -Inside `namespace spaces { ... }`, append: - -```cpp -dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw); - return ret_dist_func; -} - -dist_func_t Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim) { - dist_func_t ret_dist_func; - CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw); - return ret_dist_func; -} -``` - -- [ ] **Step 5: Commit** - -```bash -git add src/VecSim/spaces/functions/SVE.h src/VecSim/spaces/functions/SVE.cpp \ - src/VecSim/spaces/functions/SVE2.h src/VecSim/spaces/functions/SVE2.cpp -git commit -m "Wire SVE/SVE2 SQ8↔FP16 choosers [MOD-14972]" -``` - ---- - -## Task 10: SVE + SVE2 dispatcher wiring in IP_space.cpp + L2_space.cpp - -The NEON_HP block added in Task 5 lives inside `#ifdef CPU_FEATURES_ARCH_AARCH64`. Extend the same block in all three `_GetDistFunc` functions with SVE2 and SVE tiers — ordered SVE2 → SVE → NEON_HP, matching every other SQ8/FP32 dispatcher in the file. - -**Files:** -- Modify: `src/VecSim/spaces/IP_space.cpp` (two functions) -- Modify: `src/VecSim/spaces/L2_space.cpp` (one function) - -- [ ] **Step 1: Confirm the SVE/SVE2 dispatcher includes are present** - -Run: -```bash -grep -n 'functions/SVE\.h\|functions/SVE2\.h' src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp -``` -Expected: both files already include both headers. If not, add them. - -- [ ] **Step 2: Extend IP_SQ8_FP16_GetDistFunc** - -Inside the AArch64 block of `IP_SQ8_FP16_GetDistFunc`, after the `if (dim < 16) return ret_dist_func;` guard and **before** the existing `#ifdef OPT_NEON_HP`, prepend: - -```cpp -#ifdef OPT_SVE2 - if (features.sve2) { - return Choose_SQ8_FP16_IP_implementation_SVE2(dim); - } -#endif -#ifdef OPT_SVE - if (features.sve) { - return Choose_SQ8_FP16_IP_implementation_SVE(dim); - } -#endif -``` - -(SVE/SVE2 paths don't compute alignment hints — the SVE vector width is runtime-variable, so the SQ8_FP32 sister doesn't set `*alignment` here either. Mirror that.) - -- [ ] **Step 3: Extend Cosine_SQ8_FP16_GetDistFunc** - -Same as Step 2, with `Cosine` in the chooser names. - -- [ ] **Step 4: Extend L2_SQ8_FP16_GetDistFunc** - -Same as Step 2, with `L2` in the chooser names. - -- [ ] **Step 5: User builds (ARM target)** - -Ask user to run `make build` for an ARM target. - -- [ ] **Step 6: Commit** - -```bash -git add src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp -git commit -m "Dispatch SQ8↔FP16 to SVE/SVE2 tiers on AArch64 [MOD-14972]" -``` - ---- - -## Task 11: Extend `SQ8_FP16_SpacesOptimizationTest` with SVE2 + SVE tier-walks - -**Files:** -- Modify: `tests/unit/test_spaces.cpp` — the same three test bodies extended in Task 6 - -For each test (L2, IP, Cosine), inside the existing `#ifdef CPU_FEATURES_ARCH_AARCH64` region (which currently holds only NEON_HP from Task 6), **prepend** SVE2 and SVE blocks so the dispatch-precedence order is SVE2 → SVE → NEON_HP. If the existing NEON_HP block is not yet inside an AArch64 outer ifdef, wrap all three together. - -- [ ] **Step 1: Wrap and extend the L2 test** - -Replace the NEON_HP-only AArch64 block in `SQ8_FP16_L2SqrTest` with: - -```cpp -#ifdef CPU_FEATURES_ARCH_AARCH64 -#ifdef OPT_SVE2 - if (optimization.sve2) { - unsigned char alignment = 0; - arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE2(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) - << "SVE2 with dim " << dim; - optimization.sve2 = 0; - } -#endif -#ifdef OPT_SVE - if (optimization.sve) { - unsigned char alignment = 0; - arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) - << "SVE with dim " << dim; - optimization.sve = 0; - } -#endif -#ifdef OPT_NEON_HP - if (optimization.asimdhp) { - unsigned char alignment = 0; - arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization); - ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim)) - << "Unexpected distance function chosen for dim " << dim; - ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01) - << "NEON_HP with dim " << dim; - optimization.asimdhp = 0; - } -#endif -#endif // CPU_FEATURES_ARCH_AARCH64 -``` - -- [ ] **Step 2: Same for IP test** - -Replicate the block in `SQ8_FP16_InnerProductTest` with `IP_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_IP_implementation_`. - -- [ ] **Step 3: Same for Cosine test** - -Replicate with `Cosine_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_Cosine_implementation_`. - -- [ ] **Step 4: User builds** - -ARM target build. - -- [ ] **Step 5: Run the optimization tests** - -```bash -./bin//unit_tests --gtest_filter='SQ8_FP16_SpacesOptimizationTest.*' -``` -Expected: all parametrized cases PASS — dims 16..32 + high-dim suite (64..1024) — exercising whichever ARM tiers the host advertises. - -- [ ] **Step 6: Commit** - -```bash -git add tests/unit/test_spaces.cpp -git commit -m "Extend SQ8↔FP16 tier-walk tests with SVE/SVE2 [MOD-14972]" -``` - ---- - -## Task 12: Extend `SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` with ARM rows - -**Files:** -- Modify: `tests/unit/test_spaces.cpp` — `TEST(SQ8_FP16_SIMD_TierCoverage, ReportTiersExercised)` - -The existing test body has an outer `#ifdef CPU_FEATURES_ARCH_X86_64` block that loops over each x86 tier and logs presence to stderr. Add a sibling `#ifdef CPU_FEATURES_ARCH_AARCH64` block with the same shape. - -- [ ] **Step 1: Append the AArch64 reporting block** - -Locate the trailing `#endif // CPU_FEATURES_ARCH_X86_64` and immediately after, insert: - -```cpp -#ifdef CPU_FEATURES_ARCH_AARCH64 -#ifdef OPT_SVE2 - if (opt.sve2) { - std::cerr << "[SQ8_FP16] SVE2 tier exercised\n"; - any_simd = true; - } else { - std::cerr << "[SQ8_FP16] SVE2 tier NOT exercised on this host\n"; - } -#endif -#ifdef OPT_SVE - if (opt.sve) { - std::cerr << "[SQ8_FP16] SVE tier exercised\n"; - any_simd = true; - } else { - std::cerr << "[SQ8_FP16] SVE tier NOT exercised on this host\n"; - } -#endif -#ifdef OPT_NEON_HP - if (opt.asimdhp) { - std::cerr << "[SQ8_FP16] NEON_HP tier exercised\n"; - any_simd = true; - } else { - std::cerr << "[SQ8_FP16] NEON_HP tier NOT exercised on this host\n"; - } -#endif -#endif // CPU_FEATURES_ARCH_AARCH64 -``` - -(The trailing `if (!any_simd) { GTEST_SKIP() << ...; }` already at the bottom of the existing test handles the all-quiet case across both archs.) - -- [ ] **Step 2: Build + run on an ARM host** - -Ask the user to build for ARM, then run: -```bash -./bin//unit_tests --gtest_filter='SQ8_FP16_SIMD_TierCoverage.*' -``` -Expected: stderr shows at least one ARM tier marked "exercised", test PASS. - -- [ ] **Step 3: Commit** - -```bash -git add tests/unit/test_spaces.cpp -git commit -m "Report ARM tiers in SQ8↔FP16 tier-coverage test [MOD-14972]" -``` - ---- - -## Task 13: Microbench AArch64 block - -**Files:** -- Modify: `tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` - -The existing file already opens `#ifdef CPU_FEATURES_ARCH_X86_64` and pulls `cpu_features::X86Features opt = cpu_features::GetX86Info().features;`. Add the parallel AArch64 block at the end of that `#endif // CPU_FEATURES_ARCH_X86_64`. - -- [ ] **Step 1: Append the AArch64 bench block** - -After the closing `#endif // CPU_FEATURES_ARCH_X86_64` (or after the last x86 `INITIALIZE_BENCHMARKS_SET_*` macro if no such comment exists), insert: - -```cpp -#ifdef CPU_FEATURES_ARCH_AARCH64 -cpu_features::Aarch64Features arm_opt = cpu_features::GetAarch64Info().features; - -#ifdef OPT_SVE2 -bool sve2_supported = arm_opt.sve2; -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); -#endif - -#ifdef OPT_SVE -bool sve_supported = arm_opt.sve; -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); -#endif - -#ifdef OPT_NEON_HP -bool neon_hp_supported = arm_opt.asimdhp; -INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); -INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, - neon_hp_supported); -#endif -#endif // CPU_FEATURES_ARCH_AARCH64 -``` - -Verify the exact `cpu_features` helper name during build. If the toolchain uses `Aarch64Info` vs `Aarch64Features` vs `ArmFeatures`, adjust to match the sister x86 block. - -- [ ] **Step 2: Update the file-header comment** - -The current file-header comment (around the top) ends with `ARM kernels land via MOD-14972.` — change that line to `ARM kernels (NEON_HP / SVE / SVE2) are registered below.` so the doc stays accurate. - -- [ ] **Step 3: User builds (ARM target)** - -- [ ] **Step 4: Run the bench on ARM** - -```bash -./bin//bm_spaces_sq8_fp16 --benchmark_filter='SQ8_FP16_.*(SVE2|SVE|NEON_HP)' -``` -Expected: per-ISA throughput rows for L2, IP, Cosine. If no rows match, list all benchmarks first with `--benchmark_list_tests` to see the exact generated names, then adjust the regex. - -- [ ] **Step 5: Side-by-side compare against SQ8_FP32** - -```bash -./bin//bm_spaces_sq8_fp32 --benchmark_filter='SQ8_FP32_.*(SVE2|SVE|NEON)' -``` -Compare matched-ISA rows manually. Acceptance per Jira: per-ISA throughput data captured. - -- [ ] **Step 6: Commit** - -```bash -git add tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp -git commit -m "Register ARM SQ8↔FP16 microbenchmarks [MOD-14972]" -``` - ---- - -## Task 14: ASan + final pre-PR verification - -- [ ] **Step 1: Full unit-test pass on ARM host (no filter)** - -```bash -./bin//unit_tests -``` -Expected: all tests PASS. - -- [ ] **Step 2: ASan build + run** - -Ask user to run `make build SAN=address` (or the repo's equivalent — verify against `Makefile`). After confirmed: - -```bash -./bin/-asan/unit_tests --gtest_filter='SQ8_FP16_*' -``` -Expected: zero ASan reports; all SQ8_FP16 tests PASS. - -- [ ] **Step 3: x86 sanity build** - -User runs `make build` on x86 (no ARM target). Confirms the new test extensions and dispatcher AArch64 ifdefs stay inert on x86 and the build is clean. - -- [ ] **Step 4: Push branch (ASK USER FIRST)** - -Pushes are user-gated. Confirm with the user before running: - -```bash -git push -u origin dor-forer-sq8-fp16-arm-kernels-mod-14972 -``` - -- [ ] **Step 5: Open PR against PR #970 (ASK USER FIRST)** - -PR creation is user-gated. Confirm with the user before running: - -```bash -gh pr create \ - --base dor-forer-sq8-fp16-x86-kernels-mod-14954 \ - --title 'Add SQ8↔FP16 ARM SIMD distance kernels [MOD-14972]' \ - --body "$(cat <<'EOF' -## Summary - -- Add asymmetric SQ8↔FP16 distance kernels (IP, L2, Cosine) for ARM NEON_HP, SVE, SVE2 tiers -- Wire kernels into the existing dispatcher (`IP_space.cpp`, `L2_space.cpp`) -- Extend `SQ8_FP16_SpacesOptimizationTest` and `SQ8_FP16_SIMD_TierCoverage` with ARM tiers -- Register per-ISA microbenchmarks for cross-arch throughput comparison - -Stacked on PR #970 (MOD-14954 x86 kernels); retarget to `main` once #970 merges. - -Spec: `docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md` - -## Test plan - -- [ ] Unit tests on ARM host pass — `SQ8_FP16_SpacesOptimizationTest` (dims 16..32 + 64..1024), `SQ8_FP16_SIMD_TierCoverage`, `GetDistFuncSQ8FP16Asymmetric` -- [ ] ASan build on ARM host clean across SQ8_FP16 tests -- [ ] x86 build remains clean (new AArch64 dispatcher block + tests stay inert) -- [ ] Microbench output captured for SVE2 / SVE / NEON_HP, compared against matched SQ8_FP32 ARM rows -EOF -)" -``` - -- [ ] **Step 6: Retarget once #970 merges (ASK USER FIRST)** - -When PR #970 lands on `main`, change this PR's base to `main`: - -```bash -gh pr edit --base main -``` - ---- - -## Self-review checklist - -- [x] **Spec coverage:** every requirement in `2026-05-28-arm-sq8-fp16-design.md` is covered: - - Kernel headers (4 new): Tasks 2, 3, 7, 8 - - Wrapper symbols: Tasks 4 (NEON_HP), 9 (SVE/SVE2) - - Dispatcher wiring: Tasks 5 (NEON_HP), 10 (SVE/SVE2) - - Tier-walk tests: Tasks 6 (NEON_HP), 11 (SVE/SVE2) - - TierCoverage report: Task 12 - - Scalar-fallback edge tests (dim=0, dim=15): Task 1 - - Microbench: Task 13 - - ASan + verification: Task 14 -- [x] **No CMake changes** — confirmed in file structure table. -- [x] **Zero placeholders** — every code block is concrete; ambiguous spots (SVE FP16 widening ACLE) are called out with the fallback strategy spelled in-task. -- [x] **Type/symbol consistency:** - - NEON kernel template names: `SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP` / `…NEON_HP` / `SQ8_FP16_L2SqrSIMD16_NEON_HP` / `SQ8_FP16_CosineSIMD16_NEON_HP` — match across kernel header, NEON_HP chooser, dispatcher call, and test. - - SVE kernel template names: `SQ8_FP16_InnerProductSIMD_SVE_IMP` / `…SVE` / `SQ8_FP16_L2SqrSIMD_SVE` / `SQ8_FP16_CosineSIMD_SVE` — match across kernel header, SVE chooser, SVE2 chooser, dispatcher call, and test. - - Chooser symbol names: `Choose_SQ8_FP16_{IP,L2,Cosine}_implementation_{NEON_HP,SVE,SVE2}` — match across `.h` declarations, `.cpp` definitions, dispatcher calls, tests, and bench. - - Test fixture: `SQ8_FP16_SpacesOptimizationTest` already exists on base (PR #970); we extend the three test methods inside it, no rename. - ---- - -## Execution Handoff - -Plan complete and saved to `docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md`. Two execution options: - -**1. Subagent-Driven (recommended)** — I dispatch a fresh subagent per task, review between tasks, fast iteration. - -**2. Inline Execution** — Execute tasks in this session using executing-plans, batch execution with checkpoints. - -Which approach? diff --git a/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md b/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md deleted file mode 100644 index f4188d38b..000000000 --- a/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md +++ /dev/null @@ -1,354 +0,0 @@ -# SQ8↔FP16 ARM SIMD Distance Kernels — Design Spec - -- **Ticket**: [MOD-14972](https://redislabs.atlassian.net/browse/MOD-14972) -- **Branch**: `dor-forer-sq8-fp16-arm-kernels-mod-14972` -- **Base**: `dor-forer-sq8-fp16-x86-kernels-mod-14954` (PR #970) — stacked -- **Sibling**: MOD-14954 / PR #970 delivers x86 SIMD kernels (AVX-512, AVX2, SSE4) for the same operation - -## Goal - -Add SQ8↔FP16 SIMD distance kernels for IP and L2 on the ARM ISA tiers (NEON_HP, SVE, SVE2). FP16 is the query data type; SQ8 is the stored vector representation. Match the contract and structure of the x86 kernels delivered in PR #970 so dispatch tables, metadata layout, and acceptance criteria stay symmetric across architectures. - -The scalar fallback (`SQ8_FP16_InnerProduct`, `SQ8_FP16_L2Sqr`, `SQ8_FP16_Cosine` in `src/VecSim/spaces/IP/IP.cpp` and `src/VecSim/spaces/L2/L2.cpp`) already exists on `main`. This spec does not modify it; it serves as the reference implementation for all platforms. - -## Algebraic identity (shared with x86 PR + SQ8_FP32 sister) - -``` -IP(x, y) ≈ min · y_sum + delta · Σ(q_i · y_i) -L2(x, y) = x_sum_sq + y_sum_sq - 2 · IP(x, y) -``` - -Hot loop accumulates `Σ(q_i · y_i)` only. No per-element dequantization. FP16 query lanes are widened to FP32 per SIMD chunk; everything in the hot loop is FP32. - -## Metadata layout - -``` -SQ8 storage (pVect1): [uint8 × dim] [min_val] [delta] [x_sum] [x_sum_squares] -FP16 query (pVect2): [float16 × dim] [y_sum] [y_sum_squares] -``` - -Both metadata trailers are FP32 scalars. Storage metadata is not 4-byte aligned whenever `dim % 4 != 0`; query metadata is not 4-byte aligned whenever `dim` is odd. The blanket rule: every FP32 metadata read uses the global `load_unaligned` helper, matching scalar `_Impl` in `IP.cpp` / `L2.cpp`. `sq8` namespace constants: `MIN_VAL`, `DELTA`, `SUM_QUERY`, `SUM_SQUARES`, `SUM_SQUARES_QUERY`. - -## File layout - -``` -src/VecSim/spaces/IP/ - IP_NEON_SQ8_FP16.h (new) - IP_SVE_SQ8_FP16.h (new) — also #included from SVE2.cpp -src/VecSim/spaces/L2/ - L2_NEON_SQ8_FP16.h (new) - L2_SVE_SQ8_FP16.h (new) — also #included from SVE2.cpp -src/VecSim/spaces/functions/ - NEON_HP.cpp (+ Choose_SQ8_FP16_{IP,L2,Cosine}_implementation_NEON_HP) - NEON_HP.h (+ 3 declarations) - SVE.cpp (+ Choose_SQ8_FP16_*_implementation_SVE) - SVE.h (+ 3 declarations) - SVE2.cpp (+ Choose_SQ8_FP16_*_implementation_SVE2; owns its own chooser symbols; instantiates SVE kernel templates under SVE2 compile flags) - SVE2.h (+ 3 declarations) -src/VecSim/spaces/ - IP_space.cpp (2 dispatcher block edits: IP, Cosine) - L2_space.cpp (1 dispatcher block edit) -``` - -**Zero CMake changes.** Existing TU flags carry exactly what we need: - -| TU | Flags | -|----|-------| -| `NEON_HP.cpp` | `-march=armv8.2-a+fp16fml` (covers fp16 cvt + fma) | -| `SVE.cpp` | `-march=armv8-a+sve` (SVE includes f16↔f32 cvt) | -| `SVE2.cpp` | `-march=armv9-a+sve2` | - -## Dispatcher tier order - -Same precedence as existing SQ8_FP32 ARM dispatch: - -```cpp -#ifdef OPT_SVE2 - if (features.sve2 && dim >= 16) { - return Choose_SQ8_FP16_IP_implementation_SVE2(dim); - } -#endif -#ifdef OPT_SVE - if (features.sve && dim >= 16) { - return Choose_SQ8_FP16_IP_implementation_SVE(dim); - } -#endif -#ifdef OPT_NEON_HP - if (features.asimdhp && dim >= 16) { - return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim); - } -#endif -// dim < 16 or no ARM SIMD → scalar fallback (existing return at function tail) -``` - -The `dim >= 16` guard in the dispatcher is what lets each SIMD kernel hold an internal `assert(dim >= 16)` as a real precondition. Edge cases for `dim < 16` are routed to scalar. - -## NEON kernel design - -### Header: `IP_NEON_SQ8_FP16.h` - -Template signature mirrors SQ8_FP32 NEON sister: - -```cpp -template // 0..15 -float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, size_t dimension); -``` - -Hot loop — 16 lanes per iteration, 4 FP32 accumulators: - -```cpp -// SQ8 load: 16 × uint8 → 4 × float32x4_t -uint8x16_t v1_u8 = vld1q_u8(pVect1); -uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8)); -uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8)); -float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo))); -float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo))); -float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi))); -float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi))); - -// FP16 query load: 16 × f16 → 4 × float32x4_t via vcvt_f32_f16 -float16x8_t q_lo = vld1q_f16(pVect2); -float16x8_t q_hi = vld1q_f16(pVect2 + 8); -float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo)); -float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo)); -float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi)); -float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi)); - -// 4-accumulator FMA -sum0 = vfmaq_f32(sum0, v1_0, v2_0); -sum1 = vfmaq_f32(sum1, v1_1, v2_1); -sum2 = vfmaq_f32(sum2, v1_2, v2_2); -sum3 = vfmaq_f32(sum3, v1_3, v2_3); -``` - -Residual ladder (`dim % 16`, residual 0..15): - -- **`residual >= 8`**: one 8-lane safe load each side — `vld1_u8` (8 bytes) for SQ8 and `vld1q_f16` (8 × FP16 = 16 bytes, fits before query metadata) for FP16. Convert + FMA. Remaining `residual - 8` lanes handled scalar. -- **`residual < 8`**: full scalar residual loop using `vecsim_types::FP16_to_FP32`. - -Rationale: a 16-byte SQ8 load (`vld1q_u8`) or a 16-byte FP16 load (`vld1q_f16` past the 8-lane boundary) on a residual < 8 would overread past valid query data into metadata — `y_sum` is only 4 bytes for IP and `y_sum_sq` adds 4 more for L2, not enough headroom for an 8-lane FP16 load. - -Final reduction: `vaddvq_f32(sum0 + sum1 + sum2 + sum3)`, then return `min_val * y_sum + delta * quantized_dot`. - -`assert(dim >= 16)` at the top. - -### Header: `L2_NEON_SQ8_FP16.h` - -Calls `SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(...)` to compute raw IP, then returns `x_sum_sq + y_sum_sq - 2.0f * ip`. Mirrors `L2_NEON_SQ8_FP32.h` exactly. - -### Wrapper symbols (NEON_HP.cpp) - -```cpp -dist_func_t Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) { - dist_func_t ret; - CHOOSE_IMPLEMENTATION(ret, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP); - return ret; -} -// L2 + Cosine identical shape (Cosine reuses IP wrapper per repo convention) -``` - -## SVE kernel design - -### Header: `IP_SVE_SQ8_FP16.h` - -Template signature mirrors SVE SQ8_FP32 sister: - -```cpp -template -float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension); -``` - -Inner step (one SVE vector width `svcntw()` lanes of FP32): - -```cpp -svbool_t pg = svptrue_b32(); -// SQ8: zero-extend uint8 → uint32 (predicated b32 load) -svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset); -svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32); -// FP16: load chunk fp16 lanes, widen to fp32 -svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk)); -svfloat16_t q_h = svld1_f16(pg16, pVect2 + offset); -svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h); // verify exact ACLE/packing during impl -sum = svmla_f32_x(pg, sum, v1_f, v2_f); -offset += chunk; -``` - -**ACLE caveat**: exact f16→f32 widening intrinsic and lane packing — confirm `svcvt_f32_f16_x(pg, q_h)` compiles cleanly against the loaded `svfloat16_t`. If lane packing needs an unpack/interleave step, verify against `IP_SVE_FP16.h`. - -4 accumulators `sum0..sum3`; main loop processes 4 chunks via 4 `InnerProductStep` calls. `partial_chunk` template branch handles `dim % chunk` via `svwhilelt_b32`. - -Inactive-lane discipline on the partial path: the predicated `svld1_f16` / `svld1ub_u32` cover lane *liveness*, but the final reduction with `svaddv_f32(svptrue_b32(), ...)` walks *all* lanes. To keep inactive lanes from contributing garbage, the partial step uses the zeroing form `svmla_f32_z(pg_partial, sum0, v1_f, v2_f)` (matches `IP_SVE_SQ8_FP32.h` partial-chunk pattern). Alternative: reduce only active lanes via `svaddv_f32(pg_partial, sum0)` for the partial-step accumulator, then sum into the main reduction. The `_z` form is the simpler choice and is what the SQ8_FP32 SVE sister already does. - -Predicate widths on the partial path: FP32 math (load/widen/mla) uses a `b32` predicate sized to `remaining` 32-bit lanes (`svwhilelt_b32(0, remaining)`); the FP16 query load needs its own `b16` predicate sized to the same `remaining` half lanes (`svwhilelt_b16(0, remaining)`) since `svld1_f16` is governed by a 16-bit predicate. SQ8 load via `svld1ub_u32` is governed by the `b32` predicate (it widens uint8 → uint32 lanewise). - -Final reduction: `svaddv_f32(svptrue_b32(), sum0 + sum1 + sum2 + sum3)`. - -### Header: `L2_SVE_SQ8_FP16.h` - -Calls `SQ8_FP16_InnerProductSIMD_SVE_IMP(...)` then returns `x_sum_sq + y_sum_sq - 2.0f * ip`. Mirrors `L2_SVE_SQ8_FP32.h`. - -### Wrapper symbols - -`SVE.cpp`: - -```cpp -dist_func_t Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) { - dist_func_t ret; - CHOOSE_SVE_IMPLEMENTATION(ret, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); - return ret; -} -// L2 + Cosine identical shape -``` - -`SVE2.cpp`: - -```cpp -#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" - -dist_func_t Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) { - dist_func_t ret; - CHOOSE_SVE_IMPLEMENTATION(ret, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw); - return ret; -} -// L2 + Cosine identical shape -``` - -SVE2 owns its own chooser symbols (does **not** call the SVE chooser); template instantiated under SVE2 compile flags. - -## Tests - -### Class - -Branch base is PR #970. During implementation, verify whether the base branch already exposes `SQ8_FP16_SpacesOptimizationTest` (extend) or only `SQ8_FP16_NoOptimizationSpacesTest` (add the optimization class here mirroring `SQ8_FP32_SpacesOptimizationTest`). - -### Tier-walk pattern - -Per-tier `if (features.)` block; **unset higher flag** after each block so the next tier is exercised on hosts that support multiple ISAs. Do not use `GTEST_SKIP()` here — it would abort the entire walk. - -```cpp -auto expected = SQ8_FP16_InnerProduct; // scalar reference - -#ifdef OPT_SVE2 - if (features.sve2) { - arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &features); - ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE2(dim)) - << "SVE2 dispatch mismatch"; - ASSERT_NEAR(arch_opt_func(v1, v2, dim), expected(v1, v2, dim), 0.01); - features.sve2 = 0; // exercise next tier - } -#endif -#ifdef OPT_SVE - if (features.sve) { /* same shape */ features.sve = 0; } -#endif -#ifdef OPT_NEON_HP - if (features.asimdhp) { /* same shape */ features.asimdhp = 0; } -#endif -// final fallback assertion: IP_SQ8_FP16_GetDistFunc(...) == SQ8_FP16_InnerProduct (scalar) -``` - -Three dispatch entry points exercised per tier: `IP_SQ8_FP16_GetDistFunc`, `L2_SQ8_FP16_GetDistFunc`, `Cosine_SQ8_FP16_GetDistFunc`. - -### Scalar-fallback tests - -`GetDistFuncSQ8FP16Asymmetric` — currently asserts `dim=128` returns scalar; that assertion breaks once SIMD dispatch lands. Change to `dim=15` (below the `dim >= 16` SIMD threshold). Add a small `dim=0` (empty) scalar-fallback assertion to cover the Jira "empty" edge case. - -### Dim parameterization - -Base branch already has both parameterized suites against `SQ8_FP16_SpacesOptimizationTest`: -- `SQ8_FP16_SIMD` — `testing::Range(16UL, 33UL)` (dims 16..32; residual + threshold boundaries) -- `SQ8_FP16_SIMD_HighDim` — `64, 128, 256, 512, 1024` (multi-iteration main loop) - -Both suites pick up the ARM tier-walk additions automatically since the test class body is what's extended. No new instantiation needed. - -### Tier coverage report - -`SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` (test_spaces.cpp) currently reports only x86 tiers. Extend it with ARM tier entries (SVE2 / SVE / NEON_HP) so an ARM-only SIMD host reports its exercised tiers instead of going silent. - -## Microbench - -`tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` already registers x86 ISA benchmarks. Add ARM registrations under `#ifdef OPT_*` guards using the existing `bm_spaces.h` macros: - -```cpp -#ifdef CPU_FEATURES_ARCH_AARCH64 - cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; - bool sve2_supported = opt.sve2; - bool sve_supported = opt.sve; - bool neon_hp_supported = opt.asimdhp; -#ifdef OPT_SVE2 - INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); - INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported); -#endif -#ifdef OPT_SVE - INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); - INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported); -#endif -#ifdef OPT_NEON_HP - INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); - INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported); -#endif -#endif // CPU_FEATURES_ARCH_AARCH64 -``` - -Verify exact `cpu_features` helper names against the x86 sister block already in `bm_spaces_sq8_fp16.cpp` (e.g. `GetX86Info`). - -`bm_spaces_sq8_fp16` and `bm_spaces_sq8_fp32` are separate executables; the per-ISA throughput comparison requested by Jira is done by running both benches and comparing matched ISA rows. - -## Acceptance criteria (Jira MOD-14972 → spec mapping) - -| Jira requirement | Where this spec delivers it | -|------------------|------------------------------| -| Kernels: IP + L2 for NEON | NEON_HP TU hosts kernel headers + chooser symbols | -| Kernels: IP + L2 for SVE | SVE TU hosts kernel headers + chooser symbols | -| Kernels: IP + L2 for SVE2 | SVE2 TU includes SVE headers, instantiates templates under SVE2 flags | -| Scalar fallback (reference for all platforms) | Already present in `IP.cpp` / `L2.cpp`; unchanged | -| FP16 query → FP32 per SIMD chunk | `vcvt_f32_f16` (NEON), `svcvt_f32_f16_x` (SVE) | -| FP32 metadata + correction terms | `load_unaligned` for all FP32 trailer scalars | -| Wire into dispatch table per ISA flag | `IP_space.cpp` (2 blocks), `L2_space.cpp` (1 block), `OPT_SVE2/SVE/NEON_HP` | -| Unit tests vs. scalar reference per ISA | Tier-walk in `SQ8_FP16_SpacesOptimizationTest` | -| Edge cases (empty, dim-alignment boundaries) | `dim=0` + `dim=15` scalar tests; `dim=16..32` SIMD boundary param suite | -| Microbench per ISA throughput vs. SQ8↔FP32 | ARM registrations in `bm_spaces_sq8_fp16.cpp`; matched-ISA comparison vs. `bm_spaces_sq8_fp32` | - -## Diff size estimate - -| Area | Files | LoC (rough) | -|------|-------|-------------| -| Kernel headers | 4 new | ~600 | -| Dispatcher TU additions | NEON_HP.cpp/h, SVE.cpp/h, SVE2.cpp/h | ~80 | -| Dispatcher wiring | IP_space.cpp, L2_space.cpp | ~45 | -| Tests | test_spaces.cpp | ~80 | -| Bench | bm_spaces_sq8_fp16.cpp | ~25 | -| CMakeLists.txt | none | 0 | -| **Total** | **~10 files** | **~830** | - -## PR mechanics - -- **Branch**: `dor-forer-sq8-fp16-arm-kernels-mod-14972` -- **Base branch**: `dor-forer-sq8-fp16-x86-kernels-mod-14954` (PR #970) -- **PR target**: opens against PR #970 head; retarget to `main` once #970 merges -- **Commit prefix**: `[MOD-14972]` (matches repo convention) -- **PR title**: `Add SQ8↔FP16 ARM SIMD distance kernels [MOD-14972]` - -## Verification gates before opening PR - -1. **x86 host build clean** — verifies generic dispatch and tests remain clean; ARM kernels require ARM build or cross-compile, so the kernels themselves are not exercised here. -2. **ARM host build + unit tests** — NEON_HP / SVE / SVE2 paths exercised. Requires coordination with the user for ARM hardware or a cross-compile setup. -3. **ASan clean** on every host that runs unit tests. -4. **Microbench compiles + runs on ARM host.** - -## Out of scope (deferred, separate PRs) - -- Dispatcher-routed edge-case tests (`ZeroQueryTest`, `ConstantStorageTest`, `MixedSignQueryTest`) — they currently bypass the dispatcher and call scalar directly; cross-arch debt, also PR #970 H1. -- Multi-accumulator ILP tuning beyond the 4-accumulator baseline established here. -- Unrelated x86 review-feedback fixes (M1–M4, H1–H2 on x86 files from PR #970 review). This ARM PR will modify some files that PR #970 also touches (dispatchers, test class, bench), but only with ARM-relevant additions — x86 review fixes land in #970. - -## Inheritance from PR #970 review findings - -The following lessons from the PR #970 review are baked into this design so they do not need to be re-flagged on ARM kernels: - -- `assert(dim >= 16)` at the top of every kernel template (paired with dispatcher `dim >= 16` guard). -- 4-accumulator ILP in both NEON and SVE hot loops. -- Algebraic-identity formula anchor comment at the top of each kernel header. -- `load_unaligned` for all FP32 metadata reads (matches scalar). -- Dispatcher-routed tier-walk test pattern (no scalar-bypass). -- Per-ISA microbench registration alongside SQ8↔FP32 sister for direct comparison. diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index 1930e64a2..9366d3144 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -241,9 +241,6 @@ dist_func_t IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, #endif #ifdef OPT_NEON_HP if (features.asimdhp) { - // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers - // leave *alignment untouched on ARM tiers. The corresponding tests assert - // 0xFF passthrough on the scalar path and do not assert any non-zero value here. return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim); } #endif @@ -313,9 +310,6 @@ dist_func_t Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm #endif #ifdef OPT_NEON_HP if (features.asimdhp) { - // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers - // leave *alignment untouched on ARM tiers. The corresponding tests assert - // 0xFF passthrough on the scalar path and do not assert any non-zero value here. return Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim); } #endif diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp index 2e18920b3..7d65814e0 100644 --- a/src/VecSim/spaces/L2_space.cpp +++ b/src/VecSim/spaces/L2_space.cpp @@ -172,9 +172,6 @@ dist_func_t L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment, #endif #ifdef OPT_NEON_HP if (features.asimdhp) { - // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers - // leave *alignment untouched on ARM tiers. The corresponding tests assert - // 0xFF passthrough on the scalar path and do not assert any non-zero value here. return Choose_SQ8_FP16_L2_implementation_NEON_HP(dim); } #endif From e1647dcc158b9cdea13b59cfb3688663d099169c Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sun, 31 May 2026 13:34:54 +0000 Subject: [PATCH 19/19] Apply clang-format 18.1.8 (matches CI) [MOD-14972] --- src/VecSim/batch_iterator.h | 2 +- tests/benchmark/bm_vecsim_svs.h | 14 ++++++-------- tests/benchmark/types_ranges.h | 12 ++++-------- tests/unit/test_allocator.cpp | 4 ++-- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/src/VecSim/batch_iterator.h b/src/VecSim/batch_iterator.h index 466072f86..9e2791130 100644 --- a/src/VecSim/batch_iterator.h +++ b/src/VecSim/batch_iterator.h @@ -27,7 +27,7 @@ struct VecSimBatchIterator : public VecsimBaseObject { explicit VecSimBatchIterator(void *query_vector, void *tctx, std::shared_ptr allocator) : VecsimBaseObject(allocator), query_vector(query_vector), returned_results_count(0), - timeoutCtx(tctx){}; + timeoutCtx(tctx) {}; virtual inline const void *getQueryBlob() const { return query_vector; } diff --git a/tests/benchmark/bm_vecsim_svs.h b/tests/benchmark/bm_vecsim_svs.h index b92cce5e0..5acb882c0 100644 --- a/tests/benchmark/bm_vecsim_svs.h +++ b/tests/benchmark/bm_vecsim_svs.h @@ -466,19 +466,17 @@ void BM_VecSimSVS::RunGC(benchmark::State &st) { #define UNIT_AND_ITERATIONS Unit(benchmark::kMillisecond)->Iterations(2) #if HAVE_SVS_LVQ -#define QUANT_BITS_ARGS \ - { VecSimSvsQuant_8, VecSimSvsQuant_4x8_LeanVec } +#define QUANT_BITS_ARGS {VecSimSvsQuant_8, VecSimSvsQuant_4x8_LeanVec} #define COMPRESSED_TRAINING_THRESHOLD_ARGS \ - { static_cast(BM_VecSimGeneral::block_size), 5000, 10000 } + {static_cast(BM_VecSimGeneral::block_size), 5000, 10000} #define COMPRESSED_ASYNC_TRAINING_THRESHOLD_ARGS \ - { static_cast(BM_VecSimGeneral::block_size), 5000, 10000, 50000 } + {static_cast(BM_VecSimGeneral::block_size), 5000, 10000, 50000} #else -#define QUANT_BITS_ARGS \ - { VecSimSvsQuant_8 } +#define QUANT_BITS_ARGS {VecSimSvsQuant_8} // Using smaller training TH to avoid long test times without LVQ #define COMPRESSED_TRAINING_THRESHOLD_ARGS \ - { static_cast(BM_VecSimGeneral::block_size), 5000 } + {static_cast(BM_VecSimGeneral::block_size), 5000} #define COMPRESSED_ASYNC_TRAINING_THRESHOLD_ARGS \ - { static_cast(BM_VecSimGeneral::block_size), 5000 } + {static_cast(BM_VecSimGeneral::block_size), 5000} #endif diff --git a/tests/benchmark/types_ranges.h b/tests/benchmark/types_ranges.h index deff4251c..43abda8f0 100644 --- a/tests/benchmark/types_ranges.h +++ b/tests/benchmark/types_ranges.h @@ -11,11 +11,9 @@ #include #include "bm_definitions.h" -#define DEFAULT_RANGE_RADII \ - { 20, 35, 50 } +#define DEFAULT_RANGE_RADII {20, 35, 50} -#define DEFAULT_RANGE_EPSILONS \ - { 1, 10, 11 } +#define DEFAULT_RANGE_EPSILONS {1, 10, 11} // This template struct methods returns the default values for radii and epsilons // To specify different values for a certain type, use template specialization @@ -27,8 +25,7 @@ struct benchmark_range { // Larger Range query values are required for int8 wikipedia dataset. // Default values give 0 results -#define INT8_RANGE_RADII \ - { 50, 65, 80 } +#define INT8_RANGE_RADII {50, 65, 80} template <> struct benchmark_range { @@ -37,8 +34,7 @@ struct benchmark_range { }; // UINT8 ranges -#define UINT8_RANGE_RADII \ - { 4, 5, 7 } +#define UINT8_RANGE_RADII {4, 5, 7} template <> struct benchmark_range { diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp index 77db41684..6aa4a0d0b 100644 --- a/tests/unit/test_allocator.cpp +++ b/tests/unit/test_allocator.cpp @@ -33,7 +33,7 @@ struct ObjectWithSTL : public VecsimBaseObject { public: ObjectWithSTL(std::shared_ptr allocator) - : VecsimBaseObject(allocator), test_vec(allocator){}; + : VecsimBaseObject(allocator), test_vec(allocator) {}; }; struct NestedObject : public VecsimBaseObject { @@ -42,7 +42,7 @@ struct NestedObject : public VecsimBaseObject { public: NestedObject(std::shared_ptr allocator) - : VecsimBaseObject(allocator), stl_object(allocator), simpleObject(allocator){}; + : VecsimBaseObject(allocator), stl_object(allocator), simpleObject(allocator) {}; }; TEST_F(AllocatorTest, test_simple_object) {