From 10ea6df45e4f34c37548ef0cd6f6d5f4525032cb Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 17:24:23 +0300
Subject: [PATCH 01/19] =?UTF-8?q?Add=20design=20spec=20for=20SQ8=E2=86=94F?=
 =?UTF-8?q?P16=20ARM=20SIMD=20kernels=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stacked on PR #970 (MOD-14954 x86 kernels). Mirrors x86 structure
onto NEON_HP / SVE / SVE2 tiers. Zero CMake changes; reuses existing
ARM TU compile flags. Scalar fallback already on main serves as
reference. Bakes in PR #970 review lessons (assert(dim>=16),
4-accumulator ILP, formula anchor, load_unaligned<float> metadata,
dispatcher-routed tier-walk tests).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../specs/2026-05-28-arm-sq8-fp16-design.md   | 354 ++++++++++++++++++
 1 file changed, 354 insertions(+)
 create mode 100644 docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md
diff --git a/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md b/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md
new file mode 100644
index 000000000..f4188d38b
--- /dev/null
+++ b/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md
@@ -0,0 +1,354 @@
+# SQ8↔FP16 ARM SIMD Distance Kernels — Design Spec
+
+- **Ticket**: [MOD-14972](https://redislabs.atlassian.net/browse/MOD-14972)
+- **Branch**: `dor-forer-sq8-fp16-arm-kernels-mod-14972`
+- **Base**: `dor-forer-sq8-fp16-x86-kernels-mod-14954` (PR #970) — stacked
+- **Sibling**: MOD-14954 / PR #970 delivers x86 SIMD kernels (AVX-512, AVX2, SSE4) for the same operation
+
+## Goal
+
+Add SQ8↔FP16 SIMD distance kernels for IP and L2 on the ARM ISA tiers (NEON_HP, SVE, SVE2). FP16 is the query data type; SQ8 is the stored vector representation. Match the contract and structure of the x86 kernels delivered in PR #970 so dispatch tables, metadata layout, and acceptance criteria stay symmetric across architectures.
+
+The scalar fallback (`SQ8_FP16_InnerProduct`, `SQ8_FP16_L2Sqr`, `SQ8_FP16_Cosine` in `src/VecSim/spaces/IP/IP.cpp` and `src/VecSim/spaces/L2/L2.cpp`) already exists on `main`. This spec does not modify it; it serves as the reference implementation for all platforms.
+
+## Algebraic identity (shared with x86 PR + SQ8_FP32 sister)
+
+```
+IP(x, y) ≈ min · y_sum + delta · Σ(q_i · y_i)
+L2(x, y) = x_sum_sq + y_sum_sq - 2 · IP(x, y)
+```
+
+Hot loop accumulates `Σ(q_i · y_i)` only. No per-element dequantization. FP16 query lanes are widened to FP32 per SIMD chunk; everything in the hot loop is FP32.
+
+## Metadata layout
+
+```
+SQ8 storage (pVect1): [uint8 × dim] [min_val] [delta] [x_sum] [x_sum_squares]
+FP16 query   (pVect2): [float16 × dim] [y_sum] [y_sum_squares]
+```
+
+Both metadata trailers are FP32 scalars. Storage metadata is not 4-byte aligned whenever `dim % 4 != 0`; query metadata is not 4-byte aligned whenever `dim` is odd. The blanket rule: every FP32 metadata read uses the global `load_unaligned<float>` helper, matching scalar `_Impl` in `IP.cpp` / `L2.cpp`. `sq8` namespace constants: `MIN_VAL`, `DELTA`, `SUM_QUERY`, `SUM_SQUARES`, `SUM_SQUARES_QUERY`.
+
+## File layout
+
+```
+src/VecSim/spaces/IP/
+  IP_NEON_SQ8_FP16.h     (new)
+  IP_SVE_SQ8_FP16.h      (new) — also #included from SVE2.cpp
+src/VecSim/spaces/L2/
+  L2_NEON_SQ8_FP16.h     (new)
+  L2_SVE_SQ8_FP16.h      (new) — also #included from SVE2.cpp
+src/VecSim/spaces/functions/
+  NEON_HP.cpp            (+ Choose_SQ8_FP16_{IP,L2,Cosine}_implementation_NEON_HP)
+  NEON_HP.h              (+ 3 declarations)
+  SVE.cpp                (+ Choose_SQ8_FP16_*_implementation_SVE)
+  SVE.h                  (+ 3 declarations)
+  SVE2.cpp               (+ Choose_SQ8_FP16_*_implementation_SVE2; owns its own chooser symbols; instantiates SVE kernel templates under SVE2 compile flags)
+  SVE2.h                 (+ 3 declarations)
+src/VecSim/spaces/
+  IP_space.cpp           (2 dispatcher block edits: IP, Cosine)
+  L2_space.cpp           (1 dispatcher block edit)
+```
+
+**Zero CMake changes.** Existing TU flags carry exactly what we need:
+
+| TU | Flags |
+|----|-------|
+| `NEON_HP.cpp` | `-march=armv8.2-a+fp16fml` (covers fp16 cvt + fma) |
+| `SVE.cpp` | `-march=armv8-a+sve` (SVE includes f16↔f32 cvt) |
+| `SVE2.cpp` | `-march=armv9-a+sve2` |
+
+## Dispatcher tier order
+
+Same precedence as existing SQ8_FP32 ARM dispatch:
+
+```cpp
+#ifdef OPT_SVE2
+    if (features.sve2 && dim >= 16) {
+        return Choose_SQ8_FP16_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve && dim >= 16) {
+        return Choose_SQ8_FP16_IP_implementation_SVE(dim);
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp && dim >= 16) {
+        return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim);
+    }
+#endif
+// dim < 16 or no ARM SIMD → scalar fallback (existing return at function tail)
+```
+
+The `dim >= 16` guard in the dispatcher is what lets each SIMD kernel hold an internal `assert(dim >= 16)` as a real precondition. Edge cases for `dim < 16` are routed to scalar.
+
+## NEON kernel design
+
+### Header: `IP_NEON_SQ8_FP16.h`
+
+Template signature mirrors SQ8_FP32 NEON sister:
+
+```cpp
+template <unsigned char residual>   // 0..15
+float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, size_t dimension);
+```
+
+Hot loop — 16 lanes per iteration, 4 FP32 accumulators:
+
+```cpp
+// SQ8 load: 16 × uint8 → 4 × float32x4_t
+uint8x16_t v1_u8 = vld1q_u8(pVect1);
+uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8));
+uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8));
+float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo)));
+float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo)));
+float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi)));
+float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi)));
+
+// FP16 query load: 16 × f16 → 4 × float32x4_t via vcvt_f32_f16
+float16x8_t q_lo = vld1q_f16(pVect2);
+float16x8_t q_hi = vld1q_f16(pVect2 + 8);
+float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo));
+float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo));
+float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi));
+float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi));
+
+// 4-accumulator FMA
+sum0 = vfmaq_f32(sum0, v1_0, v2_0);
+sum1 = vfmaq_f32(sum1, v1_1, v2_1);
+sum2 = vfmaq_f32(sum2, v1_2, v2_2);
+sum3 = vfmaq_f32(sum3, v1_3, v2_3);
+```
+
+Residual ladder (`dim % 16`, residual 0..15):
+
+- **`residual >= 8`**: one 8-lane safe load each side — `vld1_u8` (8 bytes) for SQ8 and `vld1q_f16` (8 × FP16 = 16 bytes, fits before query metadata) for FP16. Convert + FMA. Remaining `residual - 8` lanes handled scalar.
+- **`residual < 8`**: full scalar residual loop using `vecsim_types::FP16_to_FP32`.
+
+Rationale: a 16-byte SQ8 load (`vld1q_u8`) or a 16-byte FP16 load (`vld1q_f16` past the 8-lane boundary) on a residual < 8 would overread past valid query data into metadata — `y_sum` is only 4 bytes for IP and `y_sum_sq` adds 4 more for L2, not enough headroom for an 8-lane FP16 load.
+
+Final reduction: `vaddvq_f32(sum0 + sum1 + sum2 + sum3)`, then return `min_val * y_sum + delta * quantized_dot`.
+
+`assert(dim >= 16)` at the top.
+
+### Header: `L2_NEON_SQ8_FP16.h`
+
+Calls `SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(...)` to compute raw IP, then returns `x_sum_sq + y_sum_sq - 2.0f * ip`. Mirrors `L2_NEON_SQ8_FP32.h` exactly.
+
+### Wrapper symbols (NEON_HP.cpp)
+
+```cpp
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret;
+    CHOOSE_IMPLEMENTATION(ret, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP);
+    return ret;
+}
+// L2 + Cosine identical shape (Cosine reuses IP wrapper per repo convention)
+```
+
+## SVE kernel design
+
+### Header: `IP_SVE_SQ8_FP16.h`
+
+Template signature mirrors SVE SQ8_FP32 sister:
+
+```cpp
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension);
+```
+
+Inner step (one SVE vector width `svcntw()` lanes of FP32):
+
+```cpp
+svbool_t pg = svptrue_b32();
+// SQ8: zero-extend uint8 → uint32 (predicated b32 load)
+svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
+svfloat32_t v1_f  = svcvt_f32_u32_x(pg, v1_u32);
+// FP16: load chunk fp16 lanes, widen to fp32
+svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk));
+svfloat16_t q_h = svld1_f16(pg16, pVect2 + offset);
+svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h);   // verify exact ACLE/packing during impl
+sum = svmla_f32_x(pg, sum, v1_f, v2_f);
+offset += chunk;
+```
+
+**ACLE caveat**: exact f16→f32 widening intrinsic and lane packing — confirm `svcvt_f32_f16_x(pg, q_h)` compiles cleanly against the loaded `svfloat16_t`. If lane packing needs an unpack/interleave step, verify against `IP_SVE_FP16.h`.
+
+4 accumulators `sum0..sum3`; main loop processes 4 chunks via 4 `InnerProductStep` calls. `partial_chunk` template branch handles `dim % chunk` via `svwhilelt_b32`.
+
+Inactive-lane discipline on the partial path: the predicated `svld1_f16` / `svld1ub_u32` cover lane *liveness*, but the final reduction with `svaddv_f32(svptrue_b32(), ...)` walks *all* lanes. To keep inactive lanes from contributing garbage, the partial step uses the zeroing form `svmla_f32_z(pg_partial, sum0, v1_f, v2_f)` (matches `IP_SVE_SQ8_FP32.h` partial-chunk pattern). Alternative: reduce only active lanes via `svaddv_f32(pg_partial, sum0)` for the partial-step accumulator, then sum into the main reduction. The `_z` form is the simpler choice and is what the SQ8_FP32 SVE sister already does.
+
+Predicate widths on the partial path: FP32 math (load/widen/mla) uses a `b32` predicate sized to `remaining` 32-bit lanes (`svwhilelt_b32(0, remaining)`); the FP16 query load needs its own `b16` predicate sized to the same `remaining` half lanes (`svwhilelt_b16(0, remaining)`) since `svld1_f16` is governed by a 16-bit predicate. SQ8 load via `svld1ub_u32` is governed by the `b32` predicate (it widens uint8 → uint32 lanewise).
+
+Final reduction: `svaddv_f32(svptrue_b32(), sum0 + sum1 + sum2 + sum3)`.
+
+### Header: `L2_SVE_SQ8_FP16.h`
+
+Calls `SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(...)` then returns `x_sum_sq + y_sum_sq - 2.0f * ip`. Mirrors `L2_SVE_SQ8_FP32.h`.
+
+### Wrapper symbols
+
+`SVE.cpp`:
+
+```cpp
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret;
+    CHOOSE_SVE_IMPLEMENTATION(ret, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
+    return ret;
+}
+// L2 + Cosine identical shape
+```
+
+`SVE2.cpp`:
+
+```cpp
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"  // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h"
+
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret;
+    CHOOSE_SVE_IMPLEMENTATION(ret, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
+    return ret;
+}
+// L2 + Cosine identical shape
+```
+
+SVE2 owns its own chooser symbols (does **not** call the SVE chooser); template instantiated under SVE2 compile flags.
+
+## Tests
+
+### Class
+
+Branch base is PR #970. During implementation, verify whether the base branch already exposes `SQ8_FP16_SpacesOptimizationTest` (extend) or only `SQ8_FP16_NoOptimizationSpacesTest` (add the optimization class here mirroring `SQ8_FP32_SpacesOptimizationTest`).
+
+### Tier-walk pattern
+
+Per-tier `if (features.<flag>)` block; **unset higher flag** after each block so the next tier is exercised on hosts that support multiple ISAs. Do not use `GTEST_SKIP()` here — it would abort the entire walk.
+
+```cpp
+auto expected = SQ8_FP16_InnerProduct;  // scalar reference
+
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &features);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE2(dim))
+            << "SVE2 dispatch mismatch";
+        ASSERT_NEAR(arch_opt_func(v1, v2, dim), expected(v1, v2, dim), 0.01);
+        features.sve2 = 0;   // exercise next tier
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) { /* same shape */ features.sve = 0; }
+#endif
+#ifdef OPT_NEON_HP
+    if (features.asimdhp) { /* same shape */ features.asimdhp = 0; }
+#endif
+// final fallback assertion: IP_SQ8_FP16_GetDistFunc(...) == SQ8_FP16_InnerProduct (scalar)
+```
+
+Three dispatch entry points exercised per tier: `IP_SQ8_FP16_GetDistFunc`, `L2_SQ8_FP16_GetDistFunc`, `Cosine_SQ8_FP16_GetDistFunc`.
+
+### Scalar-fallback tests
+
+`GetDistFuncSQ8FP16Asymmetric` — currently asserts `dim=128` returns scalar; that assertion breaks once SIMD dispatch lands. Change to `dim=15` (below the `dim >= 16` SIMD threshold). Add a small `dim=0` (empty) scalar-fallback assertion to cover the Jira "empty" edge case.
+
+### Dim parameterization
+
+Base branch already has both parameterized suites against `SQ8_FP16_SpacesOptimizationTest`:
+- `SQ8_FP16_SIMD` — `testing::Range(16UL, 33UL)` (dims 16..32; residual + threshold boundaries)
+- `SQ8_FP16_SIMD_HighDim` — `64, 128, 256, 512, 1024` (multi-iteration main loop)
+
+Both suites pick up the ARM tier-walk additions automatically since the test class body is what's extended. No new instantiation needed.
+
+### Tier coverage report
+
+`SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` (test_spaces.cpp) currently reports only x86 tiers. Extend it with ARM tier entries (SVE2 / SVE / NEON_HP) so an ARM-only SIMD host reports its exercised tiers instead of going silent.
+
+## Microbench
+
+`tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` already registers x86 ISA benchmarks. Add ARM registrations under `#ifdef OPT_*` guards using the existing `bm_spaces.h` macros:
+
+```cpp
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
+    bool sve2_supported    = opt.sve2;
+    bool sve_supported     = opt.sve;
+    bool neon_hp_supported = opt.asimdhp;
+#ifdef OPT_SVE2
+    INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
+    INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
+#endif
+#ifdef OPT_SVE
+    INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
+    INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
+#endif
+#ifdef OPT_NEON_HP
+    INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
+    INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+```
+
+Verify exact `cpu_features` helper names against the x86 sister block already in `bm_spaces_sq8_fp16.cpp` (e.g. `GetX86Info`).
+
+`bm_spaces_sq8_fp16` and `bm_spaces_sq8_fp32` are separate executables; the per-ISA throughput comparison requested by Jira is done by running both benches and comparing matched ISA rows.
+
+## Acceptance criteria (Jira MOD-14972 → spec mapping)
+
+| Jira requirement | Where this spec delivers it |
+|------------------|------------------------------|
+| Kernels: IP + L2 for NEON | NEON_HP TU hosts kernel headers + chooser symbols |
+| Kernels: IP + L2 for SVE | SVE TU hosts kernel headers + chooser symbols |
+| Kernels: IP + L2 for SVE2 | SVE2 TU includes SVE headers, instantiates templates under SVE2 flags |
+| Scalar fallback (reference for all platforms) | Already present in `IP.cpp` / `L2.cpp`; unchanged |
+| FP16 query → FP32 per SIMD chunk | `vcvt_f32_f16` (NEON), `svcvt_f32_f16_x` (SVE) |
+| FP32 metadata + correction terms | `load_unaligned<float>` for all FP32 trailer scalars |
+| Wire into dispatch table per ISA flag | `IP_space.cpp` (2 blocks), `L2_space.cpp` (1 block), `OPT_SVE2/SVE/NEON_HP` |
+| Unit tests vs. scalar reference per ISA | Tier-walk in `SQ8_FP16_SpacesOptimizationTest` |
+| Edge cases (empty, dim-alignment boundaries) | `dim=0` + `dim=15` scalar tests; `dim=16..32` SIMD boundary param suite |
+| Microbench per ISA throughput vs. SQ8↔FP32 | ARM registrations in `bm_spaces_sq8_fp16.cpp`; matched-ISA comparison vs. `bm_spaces_sq8_fp32` |
+
+## Diff size estimate
+
+| Area | Files | LoC (rough) |
+|------|-------|-------------|
+| Kernel headers | 4 new | ~600 |
+| Dispatcher TU additions | NEON_HP.cpp/h, SVE.cpp/h, SVE2.cpp/h | ~80 |
+| Dispatcher wiring | IP_space.cpp, L2_space.cpp | ~45 |
+| Tests | test_spaces.cpp | ~80 |
+| Bench | bm_spaces_sq8_fp16.cpp | ~25 |
+| CMakeLists.txt | none | 0 |
+| **Total** | **~10 files** | **~830** |
+
+## PR mechanics
+
+- **Branch**: `dor-forer-sq8-fp16-arm-kernels-mod-14972`
+- **Base branch**: `dor-forer-sq8-fp16-x86-kernels-mod-14954` (PR #970)
+- **PR target**: opens against PR #970 head; retarget to `main` once #970 merges
+- **Commit prefix**: `[MOD-14972]` (matches repo convention)
+- **PR title**: `Add SQ8↔FP16 ARM SIMD distance kernels [MOD-14972]`
+
+## Verification gates before opening PR
+
+1. **x86 host build clean** — verifies generic dispatch and tests remain clean; ARM kernels require ARM build or cross-compile, so the kernels themselves are not exercised here.
+2. **ARM host build + unit tests** — NEON_HP / SVE / SVE2 paths exercised. Requires coordination with the user for ARM hardware or a cross-compile setup.
+3. **ASan clean** on every host that runs unit tests.
+4. **Microbench compiles + runs on ARM host.**
+
+## Out of scope (deferred, separate PRs)
+
+- Dispatcher-routed edge-case tests (`ZeroQueryTest`, `ConstantStorageTest`, `MixedSignQueryTest`) — they currently bypass the dispatcher and call scalar directly; cross-arch debt, also PR #970 H1.
+- Multi-accumulator ILP tuning beyond the 4-accumulator baseline established here.
+- Unrelated x86 review-feedback fixes (M1–M4, H1–H2 on x86 files from PR #970 review). This ARM PR will modify some files that PR #970 also touches (dispatchers, test class, bench), but only with ARM-relevant additions — x86 review fixes land in #970.
+
+## Inheritance from PR #970 review findings
+
+The following lessons from the PR #970 review are baked into this design so they do not need to be re-flagged on ARM kernels:
+
+- `assert(dim >= 16)` at the top of every kernel template (paired with dispatcher `dim >= 16` guard).
+- 4-accumulator ILP in both NEON and SVE hot loops.
+- Algebraic-identity formula anchor comment at the top of each kernel header.
+- `load_unaligned<float>` for all FP32 metadata reads (matches scalar).
+- Dispatcher-routed tier-walk test pattern (no scalar-bypass).
+- Per-ISA microbench registration alongside SQ8↔FP32 sister for direct comparison.

From c061da9075f1b82b8d181c39f2e1502d14b3ae92 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 17:37:01 +0300
Subject: [PATCH 02/19] =?UTF-8?q?Add=20implementation=20plan=20for=20SQ8?=
 =?UTF-8?q?=E2=86=94FP16=20ARM=20SIMD=20kernels=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

14 bite-sized tasks following the spec at 2026-05-28-arm-sq8-fp16-design.md.
Each task ends in a commit; assistant runs tests/ASan/benchmarks after the
user confirms each ARM build cycle. Zero CMake changes; PR stacks on #970.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../plans/2026-05-28-arm-sq8-fp16-kernels.md  | 1195 +++++++++++++++++
 1 file changed, 1195 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md

diff --git a/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md b/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md
new file mode 100644
index 000000000..2759ba046
--- /dev/null
+++ b/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md
@@ -0,0 +1,1195 @@
+# SQ8↔FP16 ARM SIMD Distance Kernels — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Add SQ8↔FP16 asymmetric distance kernels (IP, L2, Cosine) for ARM ISA tiers — NEON_HP, SVE, SVE2 — plugged into the existing dispatcher. Mirrors the x86 work delivered in PR #970.
+
+**Architecture:** Header-only SIMD kernel templates (one per metric × ISA), instantiated via the existing `CHOOSE_IMPLEMENTATION` / `CHOOSE_SVE_IMPLEMENTATION` macros inside ISA-specific TUs (`NEON_HP.cpp`, `SVE.cpp`, `SVE2.cpp`). Wiring lives in `IP_space.cpp` and `L2_space.cpp` under a `#ifdef CPU_FEATURES_ARCH_AARCH64` block that parallels the existing x86 block. L2 reuses the IP `_IMP` template via the algebraic identity `L2² = x_sum_sq + y_sum_sq − 2·IP`. Scalar fallback already on `main` is unchanged and stays as the reference for every tier.
+
+**Tech Stack:** C++20, ARM NEON intrinsics (`arm_neon.h`), ARM SVE/SVE2 intrinsics (`arm_sve.h`), GoogleTest, Google Benchmark, cpu_features.
+
+**Branch:** `dor-forer-sq8-fp16-arm-kernels-mod-14972` (stacked on PR #970 / `dor-forer-sq8-fp16-x86-kernels-mod-14954`).
+
+**Build / test loop:** The user runs `make build` (per project memory). After each build cycle confirmed, the assistant runs `make unit_test` / ASan / benchmarks on the appropriate host (ARM hardware or cross-compile/qemu — coordinate with user). Each task ends in a commit; commits are pushed only when explicitly requested.
+
+**Spec:** [`docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md`](../specs/2026-05-28-arm-sq8-fp16-design.md)
+
+---
+
+## File Structure
+
+### Files created
+
+| Path | Responsibility |
+|------|----------------|
+| `src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h` | NEON IP kernel template (`SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP` + thin wrappers) |
+| `src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h` | NEON L2 kernel template (calls NEON IP impl, applies L2 identity) |
+| `src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h` | SVE IP kernel template (`SQ8_FP16_InnerProductSIMD_SVE_IMP` + wrappers); also `#include`d from SVE2.cpp |
+| `src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h` | SVE L2 kernel template; also `#include`d from SVE2.cpp |
+
+### Files modified
+
+| Path | Change |
+|------|--------|
+| `src/VecSim/spaces/functions/NEON_HP.h` | +3 chooser declarations (IP, L2, Cosine) |
+| `src/VecSim/spaces/functions/NEON_HP.cpp` | +#include kernel headers; +3 chooser definitions |
+| `src/VecSim/spaces/functions/SVE.h` | +3 chooser declarations |
+| `src/VecSim/spaces/functions/SVE.cpp` | +#include kernel headers; +3 chooser definitions |
+| `src/VecSim/spaces/functions/SVE2.h` | +3 chooser declarations |
+| `src/VecSim/spaces/functions/SVE2.cpp` | +#include SVE kernel headers; +3 chooser definitions (own symbols, templates instantiated under SVE2 compile flags) |
+| `src/VecSim/spaces/IP_space.cpp` | +#ifdef AArch64 block in `IP_SQ8_FP16_GetDistFunc` and `Cosine_SQ8_FP16_GetDistFunc` (2 dispatcher blocks) |
+| `src/VecSim/spaces/L2_space.cpp` | +#ifdef AArch64 block in `L2_SQ8_FP16_GetDistFunc` (1 dispatcher block) |
+| `tests/unit/test_spaces.cpp` | retarget `GetDistFuncSQ8FP16Asymmetric` to dim=15; add dim=0 test; extend the three `SQ8_FP16_SpacesOptimizationTest` test bodies with ARM tier walks; extend `SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` with AArch64 tier reporting |
+| `tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` | +AArch64 `cpu_features` block; +ARM ISA benchmark registrations |
+
+### Files NOT modified
+
+`src/VecSim/spaces/CMakeLists.txt` — zero CMake changes. Existing TU flags (`-march=armv8.2-a+fp16fml` for NEON_HP, `-march=armv8-a+sve` for SVE, `-march=armv9-a+sve2` for SVE2) already carry everything the new kernels need.
+
+---
+
+## Task 1: Retarget the scalar-fallback dispatcher test
+
+**Why first:** Builds and runs on x86 today, has nothing to do with the ARM kernels, and tightens the contract the rest of the plan relies on (the dispatcher returns scalar for `dim < 16`).
+
+**Files:**
+- Modify: `tests/unit/test_spaces.cpp` — locate test named `GetDistFuncSQ8FP16Asymmetric` (added by PR #970; currently asserts `dim=128` returns the scalar fallback)
+
+- [ ] **Step 1: Locate the existing test**
+
+Run:
+```bash
+grep -n 'GetDistFuncSQ8FP16Asymmetric' tests/unit/test_spaces.cpp
+```
+Expected: one or more line hits pointing at the `TEST(...)` block.
+
+- [ ] **Step 2: Modify the test to cover dim=0 and dim=15 instead of dim=128**
+
+Replace the body of the existing `TEST(..., GetDistFuncSQ8FP16Asymmetric)` so it walks two below-threshold dims and asserts the scalar fallback for each of L2 / IP / Cosine. Drop in this exact body (rename the test fixture symbol to match what is already there if it differs):
+
+```cpp
+TEST_F(SpacesTest, GetDistFuncSQ8FP16Asymmetric) {
+    // SQ8 storage with FP16 query (asymmetric) - should return SQ8_FP16 functions.
+    // Per-ISA dispatcher walk coverage lives in the SQ8_FP16 SpacesOptimizationTest below.
+    //
+    // Walk two below-threshold dims (0 and 15) so the assertions hold regardless of which
+    // SIMD tiers the host advertises: dim < 16 must always short-circuit to scalar fallback.
+    // The template-mapping form (spaces::GetDistFunc<sq8, float, float16>) and the direct
+    // *_SQ8_FP16_GetDistFunc form must agree for every dim, and both must match the scalar
+    // reference at sub-threshold dims.
+    for (size_t dim : {static_cast<size_t>(0), static_cast<size_t>(15)}) {
+        auto l2_func = spaces::GetDistFunc<sq8, float, float16>(VecSimMetric_L2, dim, nullptr);
+        auto ip_func = spaces::GetDistFunc<sq8, float, float16>(VecSimMetric_IP, dim, nullptr);
+        auto cosine_func =
+            spaces::GetDistFunc<sq8, float, float16>(VecSimMetric_Cosine, dim, nullptr);
+
+        ASSERT_EQ(l2_func, L2_SQ8_FP16_GetDistFunc(dim, nullptr))
+            << "Template mapping disagrees with direct dispatcher for L2 at dim=" << dim;
+        ASSERT_EQ(ip_func, IP_SQ8_FP16_GetDistFunc(dim, nullptr))
+            << "Template mapping disagrees with direct dispatcher for IP at dim=" << dim;
+        ASSERT_EQ(cosine_func, Cosine_SQ8_FP16_GetDistFunc(dim, nullptr))
+            << "Template mapping disagrees with direct dispatcher for Cosine at dim=" << dim;
+
+        ASSERT_EQ(l2_func, SQ8_FP16_L2Sqr)
+            << "dim=" << dim << " must short-circuit to scalar L2 fallback";
+        ASSERT_EQ(ip_func, SQ8_FP16_InnerProduct)
+            << "dim=" << dim << " must short-circuit to scalar IP fallback";
+        ASSERT_EQ(cosine_func, SQ8_FP16_Cosine)
+            << "dim=" << dim << " must short-circuit to scalar Cosine fallback";
+    }
+}
+```
+
+- [ ] **Step 3: User builds**
+
+Ask the user to run `make build` (their normal x86 build is sufficient — this test is host-agnostic).
+
+- [ ] **Step 4: Run the test**
+
+Run:
+```bash
+./bin/<host-triple>/unit_tests --gtest_filter='SpacesTest.GetDistFuncSQ8FP16Asymmetric'
+```
+(Use `find bin -name unit_tests -type f` if the host-triple subdir is unknown.)
+Expected: PASS.
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add tests/unit/test_spaces.cpp
+git commit -m "Retarget SQ8↔FP16 scalar-fallback dispatcher test to dim=0/15 [MOD-14972]"
+```
+
+---
+
+## Task 2: NEON IP kernel header
+
+**Files:**
+- Create: `src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h`
+
+- [ ] **Step 1: Author the kernel file**
+
+Create exactly this file (modeled on `IP_NEON_SQ8_FP32.h` + the NEON FP16 widening pattern from `IP_NEON_FP16.h`):
+
+```cpp
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include <arm_neon.h>
+#include <cassert>
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity:
+ *
+ *   IP(x, y) = sum(x_i * y_i)
+ *            ~= sum((min + delta * q_i) * y_i)
+ *            = min * y_sum + delta * sum(q_i * y_i)
+ *
+ * The hot loop only accumulates sum(q_i * y_i) - no per-element dequantisation.
+ * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk.
+ */
+
+// Helper: 16 lanes per call, four FP32 accumulators (one per quarter).
+static inline void
+SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
+                                  float32x4_t &sum0, float32x4_t &sum1,
+                                  float32x4_t &sum2, float32x4_t &sum3) {
+    // SQ8 storage: 16 * uint8 -> 4 * float32x4_t
+    uint8x16_t v1_u8 = vld1q_u8(pVect1);
+    uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8));
+    uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8));
+    float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo)));
+    float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo)));
+    float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi)));
+    float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi)));
+
+    // FP16 query: 16 * f16 -> 4 * float32x4_t via vcvt_f32_f16
+    const float16_t *q = reinterpret_cast<const float16_t *>(pVect2);
+    float16x8_t q_lo = vld1q_f16(q);
+    float16x8_t q_hi = vld1q_f16(q + 8);
+    float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo));
+    float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo));
+    float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi));
+    float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi));
+
+    sum0 = vfmaq_f32(sum0, v1_0, v2_0);
+    sum1 = vfmaq_f32(sum1, v1_1, v2_1);
+    sum2 = vfmaq_f32(sum2, v1_2, v2_2);
+    sum3 = vfmaq_f32(sum3, v1_3, v2_3);
+
+    pVect1 += 16;
+    pVect2 += 16;
+}
+
+// pVect1v = SQ8 storage, pVect2v = FP16 query
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v,
+                                              size_t dimension) {
+    assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16");
+
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v); // SQ8 storage
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v); // FP16 query
+
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+    float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+    const size_t num_of_chunks = dimension / 16;
+    for (size_t i = 0; i < num_of_chunks; i++) {
+        SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3);
+    }
+
+    // Residual handling: dim % 16 lanes.
+    // residual >= 8: one safe 8-lane SQ8 + 8-lane FP16 load (FP16 trailer is wide enough).
+    // residual <  8: scalar-only - a 4-lane FP16 load would overread y_sum metadata.
+    constexpr unsigned char r = residual;
+    if constexpr (r >= 8) {
+        uint8x8_t v1_u8 = vld1_u8(pVect1);
+        uint16x8_t v1_u16 = vmovl_u8(v1_u8);
+        float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_u16)));
+        float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_u16)));
+        float16x8_t q_h = vld1q_f16(reinterpret_cast<const float16_t *>(pVect2));
+        float32x4_t v2_a = vcvt_f32_f16(vget_low_f16(q_h));
+        float32x4_t v2_b = vcvt_f32_f16(vget_high_f16(q_h));
+        sum0 = vfmaq_f32(sum0, v1_a, v2_a);
+        sum1 = vfmaq_f32(sum1, v1_b, v2_b);
+        pVect1 += 8;
+        pVect2 += 8;
+    }
+    // Lane-by-lane scalar for the final 0..7 (residual % 8) elements.
+    constexpr unsigned char tail = r & 0x7;
+    float scalar_dot = 0.0f;
+    for (unsigned char k = 0; k < tail; ++k) {
+        scalar_dot += static_cast<float>(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]);
+    }
+
+    // Reduce the four NEON accumulators.
+    float32x4_t sum_lo = vaddq_f32(sum0, sum1);
+    float32x4_t sum_hi = vaddq_f32(sum2, sum3);
+    float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot;
+
+    // Metadata loads - use load_unaligned because odd dim leaves trailers unaligned.
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float min_val =
+        load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta =
+        load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual>
+float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v,
+                                          size_t dimension) {
+    return 1.0f -
+           SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual>
+float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // Cosine = 1 - IP (vectors are pre-normalised); reuses the IP wrapper.
+    return SQ8_FP16_InnerProductSIMD16_NEON_HP<residual>(pVect1v, pVect2v, dimension);
+}
+```
+
+- [ ] **Step 2: Header-only smoke (no build yet)**
+
+Run:
+```bash
+grep -n 'load_unaligned\|FP16_to_FP32' src/VecSim/spaces/space_includes.h \
+    src/VecSim/spaces/IP/IP.cpp src/VecSim/types/float16.h 2>/dev/null
+```
+Expected: confirm the global `load_unaligned<T>` is reachable through `space_includes.h` (matches the include path used by `IP_NEON_SQ8_FP32.h`) and `FP16_to_FP32` is reachable through `VecSim/types/float16.h`. If either include is missing, add it.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
+git commit -m "Add NEON_HP SQ8↔FP16 IP kernel header [MOD-14972]"
+```
+
+---
+
+## Task 3: NEON L2 kernel header
+
+**Files:**
+- Create: `src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h`
+
+- [ ] **Step 1: Author the kernel file**
+
+```cpp
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity:
+ *
+ *   ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2)
+ *               = x_sum_squares - 2 * IP(x, y) + y_sum_squares
+ *
+ * IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32.
+ */
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip =
+        SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float x_sum_sq =
+        load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+
+    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
+        static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
+git commit -m "Add NEON_HP SQ8↔FP16 L2 kernel header [MOD-14972]"
+```
+
+---
+
+## Task 4: NEON_HP dispatcher TU additions
+
+**Files:**
+- Modify: `src/VecSim/spaces/functions/NEON_HP.h` — add 3 declarations
+- Modify: `src/VecSim/spaces/functions/NEON_HP.cpp` — add 3 chooser definitions
+
+- [ ] **Step 1: Add chooser declarations to NEON_HP.h**
+
+In `src/VecSim/spaces/functions/NEON_HP.h`, inside `namespace spaces { ... }`, append these three declarations alongside the existing `Choose_FP16_*_implementation_NEON_HP`:
+
+```cpp
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim);
+```
+
+- [ ] **Step 2: Add chooser definitions to NEON_HP.cpp**
+
+In `src/VecSim/spaces/functions/NEON_HP.cpp`, add the kernel `#include`s alongside the existing FP16 includes:
+
+```cpp
+#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_NEON_SQ8_FP16.h"
+```
+
+Then inside `namespace spaces { ... }` (between `#include "implementation_chooser.h"` and `#include "implementation_chooser_cleanup.h"`), append:
+
+```cpp
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+```
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add src/VecSim/spaces/functions/NEON_HP.h src/VecSim/spaces/functions/NEON_HP.cpp
+git commit -m "Wire NEON_HP SQ8↔FP16 choosers [MOD-14972]"
+```
+
+---
+
+## Task 5: NEON_HP dispatcher wiring in IP_space.cpp + L2_space.cpp
+
+**Files:**
+- Modify: `src/VecSim/spaces/IP_space.cpp` — `IP_SQ8_FP16_GetDistFunc` + `Cosine_SQ8_FP16_GetDistFunc`
+- Modify: `src/VecSim/spaces/L2_space.cpp` — `L2_SQ8_FP16_GetDistFunc`
+
+Each of those three `_GetDistFunc` functions currently has an `#ifdef CPU_FEATURES_ARCH_X86_64` block with an early `if (dim < 16) return ret_dist_func;` guard followed by per-tier dispatch. We append an `#ifdef CPU_FEATURES_ARCH_AARCH64` block with the matching shape. Only NEON_HP is wired in this task; SVE/SVE2 land in a later task.
+
+- [ ] **Step 1: Confirm the #include for NEON_HP.h is present**
+
+Run:
+```bash
+grep -n 'functions/NEON_HP.h' src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp
+```
+Expected: both files already `#include "VecSim/spaces/functions/NEON_HP.h"`. If a file is missing it, add the include.
+
+- [ ] **Step 2: Wire IP_SQ8_FP16_GetDistFunc**
+
+In `src/VecSim/spaces/IP_space.cpp`, locate `IP_SQ8_FP16_GetDistFunc`. After the closing `#endif // x86_64`, insert a parallel AArch64 block immediately before the trailing `return ret_dist_func;`:
+
+```cpp
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_NEON_HP
+    if (features.asimdhp) {
+        // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
+        // leave *alignment untouched on ARM tiers. The corresponding tests assert
+        // 0xFF passthrough on the scalar path and do not assert any non-zero value here.
+        return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+```
+
+- [ ] **Step 3: Wire Cosine_SQ8_FP16_GetDistFunc**
+
+In the same file, locate `Cosine_SQ8_FP16_GetDistFunc`. Insert the same block, swapping `Choose_SQ8_FP16_IP_implementation_NEON_HP` for `Choose_SQ8_FP16_Cosine_implementation_NEON_HP`.
+
+- [ ] **Step 4: Wire L2_SQ8_FP16_GetDistFunc**
+
+In `src/VecSim/spaces/L2_space.cpp`, locate `L2_SQ8_FP16_GetDistFunc`. Insert the same block, swapping the call for `Choose_SQ8_FP16_L2_implementation_NEON_HP`.
+
+- [ ] **Step 5: User builds**
+
+Ask the user to run `make build` — first time the new NEON_HP TU additions compile. If they have ARM hardware or a cross-compile target, that build path; otherwise the x86 build must at least confirm the new headers don't accidentally break non-ARM compilation (the new headers are only `#include`d from `NEON_HP.cpp`, which is excluded on non-ARM hosts, so x86 builds should be clean).
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp
+git commit -m "Dispatch SQ8↔FP16 to NEON_HP tier on AArch64 [MOD-14972]"
+```
+
+---
+
+## Task 6: Extend `SQ8_FP16_SpacesOptimizationTest` with NEON_HP tier-walk
+
+**Files:**
+- Modify: `tests/unit/test_spaces.cpp` — three test bodies (`SQ8_FP16_L2SqrTest`, `SQ8_FP16_InnerProductTest`, `SQ8_FP16_CosineTest`)
+
+After the existing `#ifdef OPT_SSE4` block in each test, append:
+
+- [ ] **Step 1: Add NEON_HP tier to L2 test**
+
+In `SQ8_FP16_L2SqrTest`, immediately after the closing `#endif` that follows the SSE4 block and before `// Scalar fallback`:
+
+```cpp
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
+```
+
+- [ ] **Step 2: Add NEON_HP tier to IP test**
+
+In `SQ8_FP16_InnerProductTest`, append the same block but swap `L2_SQ8_FP16_GetDistFunc` → `IP_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_L2_implementation_NEON_HP` → `Choose_SQ8_FP16_IP_implementation_NEON_HP`.
+
+- [ ] **Step 3: Add NEON_HP tier to Cosine test**
+
+In `SQ8_FP16_CosineTest`, append the same block with `Cosine_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_Cosine_implementation_NEON_HP`.
+
+- [ ] **Step 4: Confirm the include path for the NEON_HP chooser declarations**
+
+Run:
+```bash
+grep -n 'functions/NEON_HP.h' tests/unit/test_spaces.cpp
+```
+Expected: include present. If not, add `#include "VecSim/spaces/functions/NEON_HP.h"` near the other space-function includes at the top of the file.
+
+- [ ] **Step 5: User builds (ARM target)**
+
+Ask the user to run `make build` for an ARM target (hardware or cross-compile). On x86 the new test code is gated by `#ifdef OPT_NEON_HP` and stays inert.
+
+- [ ] **Step 6: Run NEON_HP tests**
+
+Once the ARM build is reported clean, run:
+```bash
+./bin/<arm-triple>/unit_tests --gtest_filter='SQ8_FP16_*Test*'
+```
+Expected: all parametrized cases PASS, including the dims-16..32 and high-dim suites.
+
+- [ ] **Step 7: Commit**
+
+```bash
+git add tests/unit/test_spaces.cpp
+git commit -m "Extend SQ8↔FP16 tier-walk tests with NEON_HP [MOD-14972]"
+```
+
+---
+
+## Task 7: SVE IP kernel header
+
+**Files:**
+- Create: `src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h`
+
+- [ ] **Step 1: Author the kernel file**
+
+Modeled on `IP_SVE_SQ8_FP32.h`. The shape: an `InnerProductStep` helper that consumes `chunk = svcntw()` FP32 lanes per call (FP16 query loaded under a `b16` predicate, SQ8 storage under a `b32` predicate that drives uint8→uint32 widening), then a templated `_IMP` over `<bool partial_chunk, unsigned char additional_steps>`.
+
+```cpp
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include <arm_sve.h>
+#include <cassert>
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity:
+ *
+ *   IP(x, y) ~= min * y_sum + delta * sum(q_i * y_i)
+ *
+ * Hot loop accumulates sum(q_i * y_i) only; FP16 query lanes are widened to FP32
+ * inside each step via svcvt_f32_f16_x. Metadata loads use load_unaligned<float>.
+ */
+
+// Helper: one SVE-vector-width-of-FP32 step.
+//   chunk = svcntw() - number of FP32 lanes per step.
+//   pg    = svptrue_b32() - predicate for FP32 lanes.
+static inline void
+SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset,
+                              svfloat32_t &sum, svbool_t pg, size_t chunk) {
+    // SQ8 -> uint32 (widen on load), then to FP32.
+    svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
+    svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
+
+    // FP16 query -> FP32. svld1_f16 uses a b16 predicate sized to `chunk` half lanes.
+    svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk));
+    svfloat16_t q_h =
+        svld1_f16(pg16, reinterpret_cast<const float16_t *>(pVect2) + offset);
+    svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h);
+
+    sum = svmla_f32_x(pg, sum, v1_f, v2_f);
+    offset += chunk;
+}
+
+// pVect1v = SQ8 storage, pVect2v = FP16 query
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v,
+                                        size_t dimension) {
+    assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16");
+
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+    size_t offset = 0;
+    svbool_t pg = svptrue_b32();
+    const size_t chunk = svcntw();
+
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero -
+    // the final reduction below walks all lanes via svptrue_b32().
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % chunk;
+        if (remaining > 0) {
+            svbool_t pg_partial =
+                svwhilelt_b32(uint32_t(0), uint32_t(remaining));
+            svbool_t pg16_partial =
+                svwhilelt_b16(uint32_t(0), uint32_t(remaining));
+            svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset);
+            svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
+            svfloat16_t q_h = svld1_f16(
+                pg16_partial, reinterpret_cast<const float16_t *>(pVect2) + offset);
+            svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h);
+            sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
+            offset += remaining;
+        }
+    }
+
+    // Main loop: 4 chunks per iteration via 4 accumulators.
+    const size_t chunk_size = 4 * chunk;
+    const size_t number_of_chunks =
+        (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk);
+    }
+
+    // Additional steps 0..3.
+    if constexpr (additional_steps > 0)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
+    if constexpr (additional_steps > 1)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
+    if constexpr (additional_steps > 2)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
+
+    svfloat32_t sum = svadd_f32_x(pg, sum0, sum1);
+    sum = svadd_f32_x(pg, sum, sum2);
+    sum = svadd_f32_x(pg, sum, sum3);
+    float quantized_dot = svaddv_f32(pg, sum);
+
+    // Metadata loads - unaligned because odd dim leaves trailers unaligned.
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float min_val =
+        load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta =
+        load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
+        static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v,
+                                    size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
+                      pVect1v, pVect2v, dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD_SVE<partial_chunk, additional_steps>(
+        pVect1v, pVect2v, dimension);
+}
+```
+
+**Note for the implementer:** `svcvt_f32_f16_x(pg, q_h)` widens *the lower half of `q_h`'s lanes* to FP32 (one widening, b32-predicated). If the ACLE on the target toolchain rejects this pairing (e.g. ARM RVCT vs LLVM disagreement), verify the FP16->FP32 widening sequence against the actual ARM build output and adjust as needed (potential alternatives: explicit `svunpklo_*` unpack-then-widen, or operating on the lower half lanes by reinterpretation). Commit only after the build is clean. Do not blindly copy `IP_SVE_FP16.h`'s pattern - that file accumulates in FP16 and is not a direct widening reference.
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
+git commit -m "Add SVE SQ8↔FP16 IP kernel header [MOD-14972]"
+```
+
+---
+
+## Task 8: SVE L2 kernel header
+
+**Files:**
+- Create: `src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h`
+
+- [ ] **Step 1: Author the kernel file**
+
+```cpp
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * SVE SQ8<->FP16 L2 squared distance:
+ *   ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares
+ * IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32.
+ */
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
+        pVect1v, pVect2v, dimension);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float x_sum_sq =
+        load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
+        static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}
+```
+
+- [ ] **Step 2: Commit**
+
+```bash
+git add src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
+git commit -m "Add SVE SQ8↔FP16 L2 kernel header [MOD-14972]"
+```
+
+---
+
+## Task 9: SVE + SVE2 dispatcher TU additions
+
+**Files:**
+- Modify: `src/VecSim/spaces/functions/SVE.h` — +3 declarations
+- Modify: `src/VecSim/spaces/functions/SVE.cpp` — +#includes; +3 chooser definitions
+- Modify: `src/VecSim/spaces/functions/SVE2.h` — +3 declarations
+- Modify: `src/VecSim/spaces/functions/SVE2.cpp` — +#includes; +3 chooser definitions (own symbols, template instantiated under SVE2 flags)
+
+- [ ] **Step 1: Declarations in SVE.h**
+
+Inside `namespace spaces { ... }`, alongside the existing `Choose_SQ8_FP32_*_SVE` declarations:
+
+```cpp
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE(size_t dim);
+```
+
+- [ ] **Step 2: Definitions in SVE.cpp**
+
+Add includes alongside the existing SQ8_FP32 includes:
+
+```cpp
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h"
+```
+
+Inside `namespace spaces { ... }` (between `implementation_chooser.h` and `implementation_chooser_cleanup.h`), append:
+
+```cpp
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+```
+
+- [ ] **Step 3: Declarations in SVE2.h**
+
+```cpp
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim);
+```
+
+- [ ] **Step 4: Definitions in SVE2.cpp**
+
+Add includes alongside the existing SQ8_FP32 includes — note the SVE header is included from SVE2 (SVE2 instantiates the template under SVE2 compile flags):
+
+```cpp
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE
+```
+
+Inside `namespace spaces { ... }`, append:
+
+```cpp
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+```
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add src/VecSim/spaces/functions/SVE.h src/VecSim/spaces/functions/SVE.cpp \
+        src/VecSim/spaces/functions/SVE2.h src/VecSim/spaces/functions/SVE2.cpp
+git commit -m "Wire SVE/SVE2 SQ8↔FP16 choosers [MOD-14972]"
+```
+
+---
+
+## Task 10: SVE + SVE2 dispatcher wiring in IP_space.cpp + L2_space.cpp
+
+The NEON_HP block added in Task 5 lives inside `#ifdef CPU_FEATURES_ARCH_AARCH64`. Extend the same block in all three `_GetDistFunc` functions with SVE2 and SVE tiers — ordered SVE2 → SVE → NEON_HP, matching every other SQ8/FP32 dispatcher in the file.
+
+**Files:**
+- Modify: `src/VecSim/spaces/IP_space.cpp` (two functions)
+- Modify: `src/VecSim/spaces/L2_space.cpp` (one function)
+
+- [ ] **Step 1: Confirm the SVE/SVE2 dispatcher includes are present**
+
+Run:
+```bash
+grep -n 'functions/SVE\.h\|functions/SVE2\.h' src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp
+```
+Expected: both files already include both headers. If not, add them.
+
+- [ ] **Step 2: Extend IP_SQ8_FP16_GetDistFunc**
+
+Inside the AArch64 block of `IP_SQ8_FP16_GetDistFunc`, after the `if (dim < 16) return ret_dist_func;` guard and **before** the existing `#ifdef OPT_NEON_HP`, prepend:
+
+```cpp
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_FP16_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_FP16_IP_implementation_SVE(dim);
+    }
+#endif
+```
+
+(SVE/SVE2 paths don't compute alignment hints — the SVE vector width is runtime-variable, so the SQ8_FP32 sister doesn't set `*alignment` here either. Mirror that.)
+
+- [ ] **Step 3: Extend Cosine_SQ8_FP16_GetDistFunc**
+
+Same as Step 2, with `Cosine` in the chooser names.
+
+- [ ] **Step 4: Extend L2_SQ8_FP16_GetDistFunc**
+
+Same as Step 2, with `L2` in the chooser names.
+
+- [ ] **Step 5: User builds (ARM target)**
+
+Ask user to run `make build` for an ARM target.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp
+git commit -m "Dispatch SQ8↔FP16 to SVE/SVE2 tiers on AArch64 [MOD-14972]"
+```
+
+---
+
+## Task 11: Extend `SQ8_FP16_SpacesOptimizationTest` with SVE2 + SVE tier-walks
+
+**Files:**
+- Modify: `tests/unit/test_spaces.cpp` — the same three test bodies extended in Task 6
+
+For each test (L2, IP, Cosine), inside the existing `#ifdef CPU_FEATURES_ARCH_AARCH64` region (which currently holds only NEON_HP from Task 6), **prepend** SVE2 and SVE blocks so the dispatch-precedence order is SVE2 → SVE → NEON_HP. If the existing NEON_HP block is not yet inside an AArch64 outer ifdef, wrap all three together.
+
+- [ ] **Step 1: Wrap and extend the L2 test**
+
+Replace the NEON_HP-only AArch64 block in `SQ8_FP16_L2SqrTest` with:
+
+```cpp
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+```
+
+- [ ] **Step 2: Same for IP test**
+
+Replicate the block in `SQ8_FP16_InnerProductTest` with `IP_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_IP_implementation_<TIER>`.
+
+- [ ] **Step 3: Same for Cosine test**
+
+Replicate with `Cosine_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_Cosine_implementation_<TIER>`.
+
+- [ ] **Step 4: User builds**
+
+ARM target build.
+
+- [ ] **Step 5: Run the optimization tests**
+
+```bash
+./bin/<arm-triple>/unit_tests --gtest_filter='SQ8_FP16_SpacesOptimizationTest.*'
+```
+Expected: all parametrized cases PASS — dims 16..32 + high-dim suite (64..1024) — exercising whichever ARM tiers the host advertises.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add tests/unit/test_spaces.cpp
+git commit -m "Extend SQ8↔FP16 tier-walk tests with SVE/SVE2 [MOD-14972]"
+```
+
+---
+
+## Task 12: Extend `SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` with ARM rows
+
+**Files:**
+- Modify: `tests/unit/test_spaces.cpp` — `TEST(SQ8_FP16_SIMD_TierCoverage, ReportTiersExercised)`
+
+The existing test body has an outer `#ifdef CPU_FEATURES_ARCH_X86_64` block that loops over each x86 tier and logs presence to stderr. Add a sibling `#ifdef CPU_FEATURES_ARCH_AARCH64` block with the same shape.
+
+- [ ] **Step 1: Append the AArch64 reporting block**
+
+Locate the trailing `#endif // CPU_FEATURES_ARCH_X86_64` and immediately after, insert:
+
+```cpp
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (opt.sve2) {
+        std::cerr << "[SQ8_FP16] SVE2 tier exercised\n";
+        any_simd = true;
+    } else {
+        std::cerr << "[SQ8_FP16] SVE2 tier NOT exercised on this host\n";
+    }
+#endif
+#ifdef OPT_SVE
+    if (opt.sve) {
+        std::cerr << "[SQ8_FP16] SVE tier exercised\n";
+        any_simd = true;
+    } else {
+        std::cerr << "[SQ8_FP16] SVE tier NOT exercised on this host\n";
+    }
+#endif
+#ifdef OPT_NEON_HP
+    if (opt.asimdhp) {
+        std::cerr << "[SQ8_FP16] NEON_HP tier exercised\n";
+        any_simd = true;
+    } else {
+        std::cerr << "[SQ8_FP16] NEON_HP tier NOT exercised on this host\n";
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+```
+
+(The trailing `if (!any_simd) { GTEST_SKIP() << ...; }` already at the bottom of the existing test handles the all-quiet case across both archs.)
+
+- [ ] **Step 2: Build + run on an ARM host**
+
+Ask the user to build for ARM, then run:
+```bash
+./bin/<arm-triple>/unit_tests --gtest_filter='SQ8_FP16_SIMD_TierCoverage.*'
+```
+Expected: stderr shows at least one ARM tier marked "exercised", test PASS.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add tests/unit/test_spaces.cpp
+git commit -m "Report ARM tiers in SQ8↔FP16 tier-coverage test [MOD-14972]"
+```
+
+---
+
+## Task 13: Microbench AArch64 block
+
+**Files:**
+- Modify: `tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp`
+
+The existing file already opens `#ifdef CPU_FEATURES_ARCH_X86_64` and pulls `cpu_features::X86Features opt = cpu_features::GetX86Info().features;`. Add the parallel AArch64 block at the end of that `#endif // CPU_FEATURES_ARCH_X86_64`.
+
+- [ ] **Step 1: Append the AArch64 bench block**
+
+After the closing `#endif // CPU_FEATURES_ARCH_X86_64` (or after the last x86 `INITIALIZE_BENCHMARKS_SET_*` macro if no such comment exists), insert:
+
+```cpp
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features arm_opt = cpu_features::GetAarch64Info().features;
+
+#ifdef OPT_SVE2
+bool sve2_supported = arm_opt.sve2;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
+#endif
+
+#ifdef OPT_SVE
+bool sve_supported = arm_opt.sve;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
+#endif
+
+#ifdef OPT_NEON_HP
+bool neon_hp_supported = arm_opt.asimdhp;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16,
+                                  neon_hp_supported);
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+```
+
+Verify the exact `cpu_features` helper name during build. If the toolchain uses `Aarch64Info` vs `Aarch64Features` vs `ArmFeatures`, adjust to match the sister x86 block.
+
+- [ ] **Step 2: Update the file-header comment**
+
+The current file-header comment (around the top) ends with `ARM kernels land via MOD-14972.` — change that line to `ARM kernels (NEON_HP / SVE / SVE2) are registered below.` so the doc stays accurate.
+
+- [ ] **Step 3: User builds (ARM target)**
+
+- [ ] **Step 4: Run the bench on ARM**
+
+```bash
+./bin/<arm-triple>/bm_spaces_sq8_fp16 --benchmark_filter='SQ8_FP16_.*(SVE2|SVE|NEON_HP)'
+```
+Expected: per-ISA throughput rows for L2, IP, Cosine. If no rows match, list all benchmarks first with `--benchmark_list_tests` to see the exact generated names, then adjust the regex.
+
+- [ ] **Step 5: Side-by-side compare against SQ8_FP32**
+
+```bash
+./bin/<arm-triple>/bm_spaces_sq8_fp32 --benchmark_filter='SQ8_FP32_.*(SVE2|SVE|NEON)'
+```
+Compare matched-ISA rows manually. Acceptance per Jira: per-ISA throughput data captured.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
+git commit -m "Register ARM SQ8↔FP16 microbenchmarks [MOD-14972]"
+```
+
+---
+
+## Task 14: ASan + final pre-PR verification
+
+- [ ] **Step 1: Full unit-test pass on ARM host (no filter)**
+
+```bash
+./bin/<arm-triple>/unit_tests
+```
+Expected: all tests PASS.
+
+- [ ] **Step 2: ASan build + run**
+
+Ask user to run `make build SAN=address` (or the repo's equivalent — verify against `Makefile`). After confirmed:
+
+```bash
+./bin/<arm-triple>-asan/unit_tests --gtest_filter='SQ8_FP16_*'
+```
+Expected: zero ASan reports; all SQ8_FP16 tests PASS.
+
+- [ ] **Step 3: x86 sanity build**
+
+User runs `make build` on x86 (no ARM target). Confirms the new test extensions and dispatcher AArch64 ifdefs stay inert on x86 and the build is clean.
+
+- [ ] **Step 4: Push branch (ASK USER FIRST)**
+
+Pushes are user-gated. Confirm with the user before running:
+
+```bash
+git push -u origin dor-forer-sq8-fp16-arm-kernels-mod-14972
+```
+
+- [ ] **Step 5: Open PR against PR #970 (ASK USER FIRST)**
+
+PR creation is user-gated. Confirm with the user before running:
+
+```bash
+gh pr create \
+  --base dor-forer-sq8-fp16-x86-kernels-mod-14954 \
+  --title 'Add SQ8↔FP16 ARM SIMD distance kernels [MOD-14972]' \
+  --body "$(cat <<'EOF'
+## Summary
+
+- Add asymmetric SQ8↔FP16 distance kernels (IP, L2, Cosine) for ARM NEON_HP, SVE, SVE2 tiers
+- Wire kernels into the existing dispatcher (`IP_space.cpp`, `L2_space.cpp`)
+- Extend `SQ8_FP16_SpacesOptimizationTest` and `SQ8_FP16_SIMD_TierCoverage` with ARM tiers
+- Register per-ISA microbenchmarks for cross-arch throughput comparison
+
+Stacked on PR #970 (MOD-14954 x86 kernels); retarget to `main` once #970 merges.
+
+Spec: `docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md`
+
+## Test plan
+
+- [ ] Unit tests on ARM host pass — `SQ8_FP16_SpacesOptimizationTest` (dims 16..32 + 64..1024), `SQ8_FP16_SIMD_TierCoverage`, `GetDistFuncSQ8FP16Asymmetric`
+- [ ] ASan build on ARM host clean across SQ8_FP16 tests
+- [ ] x86 build remains clean (new AArch64 dispatcher block + tests stay inert)
+- [ ] Microbench output captured for SVE2 / SVE / NEON_HP, compared against matched SQ8_FP32 ARM rows
+EOF
+)"
+```
+
+- [ ] **Step 6: Retarget once #970 merges (ASK USER FIRST)**
+
+When PR #970 lands on `main`, change this PR's base to `main`:
+
+```bash
+gh pr edit <PR-number> --base main
+```
+
+---
+
+## Self-review checklist
+
+- [x] **Spec coverage:** every requirement in `2026-05-28-arm-sq8-fp16-design.md` is covered:
+  - Kernel headers (4 new): Tasks 2, 3, 7, 8
+  - Wrapper symbols: Tasks 4 (NEON_HP), 9 (SVE/SVE2)
+  - Dispatcher wiring: Tasks 5 (NEON_HP), 10 (SVE/SVE2)
+  - Tier-walk tests: Tasks 6 (NEON_HP), 11 (SVE/SVE2)
+  - TierCoverage report: Task 12
+  - Scalar-fallback edge tests (dim=0, dim=15): Task 1
+  - Microbench: Task 13
+  - ASan + verification: Task 14
+- [x] **No CMake changes** — confirmed in file structure table.
+- [x] **Zero placeholders** — every code block is concrete; ambiguous spots (SVE FP16 widening ACLE) are called out with the fallback strategy spelled in-task.
+- [x] **Type/symbol consistency:**
+  - NEON kernel template names: `SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP` / `…NEON_HP` / `SQ8_FP16_L2SqrSIMD16_NEON_HP` / `SQ8_FP16_CosineSIMD16_NEON_HP` — match across kernel header, NEON_HP chooser, dispatcher call, and test.
+  - SVE kernel template names: `SQ8_FP16_InnerProductSIMD_SVE_IMP` / `…SVE` / `SQ8_FP16_L2SqrSIMD_SVE` / `SQ8_FP16_CosineSIMD_SVE` — match across kernel header, SVE chooser, SVE2 chooser, dispatcher call, and test.
+  - Chooser symbol names: `Choose_SQ8_FP16_{IP,L2,Cosine}_implementation_{NEON_HP,SVE,SVE2}` — match across `.h` declarations, `.cpp` definitions, dispatcher calls, tests, and bench.
+  - Test fixture: `SQ8_FP16_SpacesOptimizationTest` already exists on base (PR #970); we extend the three test methods inside it, no rename.
+
+---
+
+## Execution Handoff
+
+Plan complete and saved to `docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md`. Two execution options:
+
+**1. Subagent-Driven (recommended)** — I dispatch a fresh subagent per task, review between tasks, fast iteration.
+
+**2. Inline Execution** — Execute tasks in this session using executing-plans, batch execution with checkpoints.
+
+Which approach?

From 4f0534c0d3bb6bfea2c3f54357775a365a2fcb85 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 17:54:33 +0300
Subject: [PATCH 03/19] =?UTF-8?q?Add=20NEON=5FHP=20SQ8=E2=86=94FP16=20IP?=
 =?UTF-8?q?=20kernel=20header=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h | 135 ++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h

diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
new file mode 100644
index 000000000..b1d26fec5
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include <arm_neon.h>
+#include <cassert>
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity:
+ *
+ *   IP(x, y) = sum(x_i * y_i)
+ *            ~= sum((min + delta * q_i) * y_i)
+ *            = min * y_sum + delta * sum(q_i * y_i)
+ *
+ * The hot loop only accumulates sum(q_i * y_i) - no per-element dequantisation.
+ * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk.
+ */
+
+// Helper: 16 lanes per call, four FP32 accumulators (one per quarter).
+static inline void
+SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
+                                  float32x4_t &sum0, float32x4_t &sum1,
+                                  float32x4_t &sum2, float32x4_t &sum3) {
+    // SQ8 storage: 16 * uint8 -> 4 * float32x4_t
+    uint8x16_t v1_u8 = vld1q_u8(pVect1);
+    uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8));
+    uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8));
+    float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo)));
+    float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo)));
+    float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi)));
+    float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi)));
+
+    // FP16 query: 16 * f16 -> 4 * float32x4_t via vcvt_f32_f16
+    const float16_t *q = reinterpret_cast<const float16_t *>(pVect2);
+    float16x8_t q_lo = vld1q_f16(q);
+    float16x8_t q_hi = vld1q_f16(q + 8);
+    float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo));
+    float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo));
+    float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi));
+    float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi));
+
+    sum0 = vfmaq_f32(sum0, v1_0, v2_0);
+    sum1 = vfmaq_f32(sum1, v1_1, v2_1);
+    sum2 = vfmaq_f32(sum2, v1_2, v2_2);
+    sum3 = vfmaq_f32(sum3, v1_3, v2_3);
+
+    pVect1 += 16;
+    pVect2 += 16;
+}
+
+// pVect1v = SQ8 storage, pVect2v = FP16 query
+template <unsigned char residual> // 0..15
+float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v,
+                                              size_t dimension) {
+    assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16");
+
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v); // SQ8 storage
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v); // FP16 query
+
+    float32x4_t sum0 = vdupq_n_f32(0.0f);
+    float32x4_t sum1 = vdupq_n_f32(0.0f);
+    float32x4_t sum2 = vdupq_n_f32(0.0f);
+    float32x4_t sum3 = vdupq_n_f32(0.0f);
+
+    const size_t num_of_chunks = dimension / 16;
+    for (size_t i = 0; i < num_of_chunks; i++) {
+        SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3);
+    }
+
+    // Residual handling: dim % 16 lanes.
+    // residual >= 8: one safe 8-lane SQ8 + 8-lane FP16 load (FP16 trailer is wide enough).
+    // residual <  8: scalar-only - a 4-lane FP16 load would overread y_sum metadata.
+    constexpr unsigned char r = residual;
+    if constexpr (r >= 8) {
+        uint8x8_t v1_u8 = vld1_u8(pVect1);
+        uint16x8_t v1_u16 = vmovl_u8(v1_u8);
+        float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_u16)));
+        float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_u16)));
+        float16x8_t q_h = vld1q_f16(reinterpret_cast<const float16_t *>(pVect2));
+        float32x4_t v2_a = vcvt_f32_f16(vget_low_f16(q_h));
+        float32x4_t v2_b = vcvt_f32_f16(vget_high_f16(q_h));
+        sum0 = vfmaq_f32(sum0, v1_a, v2_a);
+        sum1 = vfmaq_f32(sum1, v1_b, v2_b);
+        pVect1 += 8;
+        pVect2 += 8;
+    }
+    // Lane-by-lane scalar for the final 0..7 (residual % 8) elements.
+    constexpr unsigned char tail = r & 0x7;
+    float scalar_dot = 0.0f;
+    for (unsigned char k = 0; k < tail; ++k) {
+        scalar_dot += static_cast<float>(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]);
+    }
+
+    // Reduce the four NEON accumulators.
+    float32x4_t sum_lo = vaddq_f32(sum0, sum1);
+    float32x4_t sum_hi = vaddq_f32(sum2, sum3);
+    float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot;
+
+    // Metadata loads - use load_unaligned because odd dim leaves trailers unaligned.
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float min_val =
+        load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta =
+        load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <unsigned char residual>
+float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v,
+                                          size_t dimension) {
+    return 1.0f -
+           SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
+}
+
+template <unsigned char residual>
+float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    // Cosine = 1 - IP (vectors are pre-normalised); reuses the IP wrapper.
+    return SQ8_FP16_InnerProductSIMD16_NEON_HP<residual>(pVect1v, pVect2v, dimension);
+}

From d3c6415578e202eb79eb22d50019c084a1e240ec Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:01:04 +0300
Subject: [PATCH 04/19] =?UTF-8?q?Add=20NEON=5FHP=20SQ8=E2=86=94FP16=20L2?=
 =?UTF-8?q?=20kernel=20header=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h | 42 +++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h

diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
new file mode 100644
index 000000000..7bf5db986
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity:
+ *
+ *   ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2)
+ *               = x_sum_squares - 2 * IP(x, y) + y_sum_squares
+ *
+ * IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32.
+ */
+
+template <unsigned char residual> // 0..15
+float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip =
+        SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float x_sum_sq =
+        load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+
+    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
+        static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}

From 69cee3d730bd808699db076d25ab3c7b14040ee8 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:05:16 +0300
Subject: [PATCH 05/19] =?UTF-8?q?Wire=20NEON=5FHP=20SQ8=E2=86=94FP16=20cho?=
 =?UTF-8?q?osers=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/VecSim/spaces/functions/NEON_HP.cpp | 20 ++++++++++++++++++++
 src/VecSim/spaces/functions/NEON_HP.h   |  4 ++++
 2 files changed, 24 insertions(+)

diff --git a/src/VecSim/spaces/functions/NEON_HP.cpp b/src/VecSim/spaces/functions/NEON_HP.cpp
index 2dea94934..20d93a517 100644
--- a/src/VecSim/spaces/functions/NEON_HP.cpp
+++ b/src/VecSim/spaces/functions/NEON_HP.cpp
@@ -10,6 +10,8 @@
 
 #include "VecSim/spaces/L2/L2_NEON_FP16.h"
 #include "VecSim/spaces/IP/IP_NEON_FP16.h"
+#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_NEON_SQ8_FP16.h"
 
 namespace spaces {
 
@@ -27,6 +29,24 @@ dist_func_t<float> Choose_FP16_IP_implementation_NEON_HP(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_NEON_HP);
+    return ret_dist_func;
+}
+
 #include "implementation_chooser_cleanup.h"
 
 } // namespace spaces
diff --git a/src/VecSim/spaces/functions/NEON_HP.h b/src/VecSim/spaces/functions/NEON_HP.h
index c65bd6948..889eb0919 100644
--- a/src/VecSim/spaces/functions/NEON_HP.h
+++ b/src/VecSim/spaces/functions/NEON_HP.h
@@ -16,4 +16,8 @@ dist_func_t<float> Choose_FP16_IP_implementation_NEON_HP(size_t dim);
 
 dist_func_t<float> Choose_FP16_L2_implementation_NEON_HP(size_t dim);
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim);
+
 } // namespace spaces

From 1b36b38e12fdbdbe148d80cea8245f2915e8df2d Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:08:16 +0300
Subject: [PATCH 06/19] =?UTF-8?q?Dispatch=20SQ8=E2=86=94FP16=20to=20NEON?=
 =?UTF-8?q?=5FHP=20tier=20on=20AArch64=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/VecSim/spaces/IP_space.cpp | 26 ++++++++++++++++++++++++++
 src/VecSim/spaces/L2_space.cpp | 13 +++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index b57971b60..92616f394 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -225,6 +225,19 @@ dist_func_t<float> IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
 #endif
 #endif // OPT_F16C
 #endif // x86_64
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_NEON_HP
+    if (features.asimdhp) {
+        // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
+        // leave *alignment untouched on ARM tiers. The corresponding tests assert
+        // 0xFF passthrough on the scalar path and do not assert any non-zero value here.
+        return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
     return ret_dist_func;
 }
 
@@ -274,6 +287,19 @@ dist_func_t<float> Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm
 #endif
 #endif // OPT_F16C
 #endif // x86_64
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_NEON_HP
+    if (features.asimdhp) {
+        // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
+        // leave *alignment untouched on ARM tiers. The corresponding tests assert
+        // 0xFF passthrough on the scalar path and do not assert any non-zero value here.
+        return Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
     return ret_dist_func;
 }
 
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 43020399f..995b4c4d6 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -156,6 +156,19 @@ dist_func_t<float> L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
 #endif
 #endif // OPT_F16C
 #endif // x86_64
+#ifdef CPU_FEATURES_ARCH_AARCH64
+    if (dim < 16) {
+        return ret_dist_func;
+    }
+#ifdef OPT_NEON_HP
+    if (features.asimdhp) {
+        // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
+        // leave *alignment untouched on ARM tiers. The corresponding tests assert
+        // 0xFF passthrough on the scalar path and do not assert any non-zero value here.
+        return Choose_SQ8_FP16_L2_implementation_NEON_HP(dim);
+    }
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
     return ret_dist_func;
 }
 

From 1af48125a6fd032806db68af2dfb8b4795c6bba7 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:11:42 +0300
Subject: [PATCH 07/19] =?UTF-8?q?Extend=20SQ8=E2=86=94FP16=20tier-walk=20t?=
 =?UTF-8?q?ests=20with=20NEON=5FHP=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tests/unit/test_spaces.cpp | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index 474ac5c75..d2c9386ac 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -3149,6 +3149,18 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
+
     unsigned char alignment = 0;
     arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, SQ8_FP16_L2Sqr)
@@ -3224,6 +3236,18 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
+
     unsigned char alignment = 0;
     arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, SQ8_FP16_InnerProduct)
@@ -3299,6 +3323,18 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef OPT_NEON_HP
+    if (optimization.asimdhp) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "NEON_HP with dim " << dim;
+        optimization.asimdhp = 0;
+    }
+#endif
+
     unsigned char alignment = 0;
     arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
     ASSERT_EQ(arch_opt_func, SQ8_FP16_Cosine)

From 0ce0bcebd31609601197f5d6aeeaaf3cea40f0c3 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:16:51 +0300
Subject: [PATCH 08/19] =?UTF-8?q?Add=20SVE=20SQ8=E2=86=94FP16=20IP=20kerne?=
 =?UTF-8?q?l=20header=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h | 133 +++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h

diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
new file mode 100644
index 000000000..36a7d18e6
--- /dev/null
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+#include <arm_sve.h>
+#include <cassert>
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity:
+ *
+ *   IP(x, y) ~= min * y_sum + delta * sum(q_i * y_i)
+ *
+ * Hot loop accumulates sum(q_i * y_i) only; FP16 query lanes are widened to FP32
+ * inside each step via svcvt_f32_f16_x. Metadata loads use load_unaligned<float>.
+ */
+
+// Helper: one SVE-vector-width-of-FP32 step.
+//   chunk = svcntw() - number of FP32 lanes per step.
+//   pg    = svptrue_b32() - predicate for FP32 lanes.
+static inline void
+SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset,
+                              svfloat32_t &sum, svbool_t pg, size_t chunk) {
+    // SQ8 -> uint32 (widen on load), then to FP32.
+    svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
+    svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
+
+    // FP16 query -> FP32. svld1_f16 uses a b16 predicate sized to `chunk` half lanes.
+    svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk));
+    svfloat16_t q_h =
+        svld1_f16(pg16, reinterpret_cast<const float16_t *>(pVect2) + offset);
+    svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h);
+
+    sum = svmla_f32_x(pg, sum, v1_f, v2_f);
+    offset += chunk;
+}
+
+// pVect1v = SQ8 storage, pVect2v = FP16 query
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v,
+                                        size_t dimension) {
+    assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16");
+
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
+    size_t offset = 0;
+    svbool_t pg = svptrue_b32();
+    const size_t chunk = svcntw();
+
+    svfloat32_t sum0 = svdup_f32(0.0f);
+    svfloat32_t sum1 = svdup_f32(0.0f);
+    svfloat32_t sum2 = svdup_f32(0.0f);
+    svfloat32_t sum3 = svdup_f32(0.0f);
+
+    // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero -
+    // the final reduction below walks all lanes via svptrue_b32().
+    if constexpr (partial_chunk) {
+        size_t remaining = dimension % chunk;
+        if (remaining > 0) {
+            svbool_t pg_partial =
+                svwhilelt_b32(uint32_t(0), uint32_t(remaining));
+            svbool_t pg16_partial =
+                svwhilelt_b16(uint32_t(0), uint32_t(remaining));
+            svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset);
+            svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
+            svfloat16_t q_h = svld1_f16(
+                pg16_partial, reinterpret_cast<const float16_t *>(pVect2) + offset);
+            svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h);
+            sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
+            offset += remaining;
+        }
+    }
+
+    // Main loop: 4 chunks per iteration via 4 accumulators.
+    const size_t chunk_size = 4 * chunk;
+    const size_t number_of_chunks =
+        (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
+    for (size_t i = 0; i < number_of_chunks; i++) {
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk);
+    }
+
+    // Additional steps 0..3.
+    if constexpr (additional_steps > 0)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
+    if constexpr (additional_steps > 1)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
+    if constexpr (additional_steps > 2)
+        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
+
+    svfloat32_t sum = svadd_f32_x(pg, sum0, sum1);
+    sum = svadd_f32_x(pg, sum, sum2);
+    sum = svadd_f32_x(pg, sum, sum3);
+    float quantized_dot = svaddv_f32(pg, sum);
+
+    // Metadata loads - unaligned because odd dim leaves trailers unaligned.
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float min_val =
+        load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta =
+        load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
+        static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+
+    return min_val * y_sum + delta * quantized_dot;
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v,
+                                    size_t dimension) {
+    return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
+                      pVect1v, pVect2v, dimension);
+}
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    return SQ8_FP16_InnerProductSIMD_SVE<partial_chunk, additional_steps>(
+        pVect1v, pVect2v, dimension);
+}

From eb4952a9de34756eae83d07d422d4b0e467cc238 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:20:41 +0300
Subject: [PATCH 09/19] =?UTF-8?q?Add=20SVE=20SQ8=E2=86=94FP16=20L2=20kerne?=
 =?UTF-8?q?l=20header=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h | 38 ++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h

diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
new file mode 100644
index 000000000..3c8e89ca6
--- /dev/null
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2006-Present, Redis Ltd.
+ * All rights reserved.
+ *
+ * Licensed under your choice of the Redis Source Available License 2.0
+ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
+ * GNU Affero General Public License v3 (AGPLv3).
+ */
+#pragma once
+#include "VecSim/spaces/space_includes.h"
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
+#include "VecSim/types/sq8.h"
+#include "VecSim/types/float16.h"
+
+using sq8 = vecsim_types::sq8;
+using float16 = vecsim_types::float16;
+
+/*
+ * SVE SQ8<->FP16 L2 squared distance:
+ *   ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares
+ * IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32.
+ */
+
+template <bool partial_chunk, unsigned char additional_steps>
+float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
+    const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
+        pVect1v, pVect2v, dimension);
+
+    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
+    const float x_sum_sq =
+        load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
+        static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum_sq =
+        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
+
+    return x_sum_sq + y_sum_sq - 2.0f * ip;
+}

From fcb01bbc6e0abbcc14b366b9e3525ad5b25d80bc Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:24:14 +0300
Subject: [PATCH 10/19] =?UTF-8?q?Wire=20SVE/SVE2=20SQ8=E2=86=94FP16=20choo?=
 =?UTF-8?q?sers=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/VecSim/spaces/functions/SVE.cpp  | 21 +++++++++++++++++++++
 src/VecSim/spaces/functions/SVE.h    |  4 ++++
 src/VecSim/spaces/functions/SVE2.cpp | 20 ++++++++++++++++++++
 src/VecSim/spaces/functions/SVE2.h   |  4 ++++
 4 files changed, 49 insertions(+)

diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp
index fde853db2..bd197c84c 100644
--- a/src/VecSim/spaces/functions/SVE.cpp
+++ b/src/VecSim/spaces/functions/SVE.cpp
@@ -25,6 +25,9 @@
 #include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h"
 #include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h"
 
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
+#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h"
+
 #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h"
 #include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h"
 
@@ -119,6 +122,24 @@ dist_func_t<float> Choose_SQ8_FP32_L2_implementation_SVE(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum)
 // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) {
diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h
index bd3bc97c3..43b3b22cd 100644
--- a/src/VecSim/spaces/functions/SVE.h
+++ b/src/VecSim/spaces/functions/SVE.h
@@ -33,6 +33,10 @@ dist_func_t<float> Choose_SQ8_FP32_IP_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_SQ8_FP32_Cosine_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_SQ8_FP32_L2_implementation_SVE(size_t dim);
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE(size_t dim);
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim);
 dist_func_t<float> Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim);
diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp
index 4215d79cf..4496c07e6 100644
--- a/src/VecSim/spaces/functions/SVE2.cpp
+++ b/src/VecSim/spaces/functions/SVE2.cpp
@@ -22,6 +22,8 @@
 #include "VecSim/spaces/IP/IP_SVE_UINT8.h"    // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/IP/IP_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/L2/L2_SVE_SQ8_FP32.h" // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE
+#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h"  // SVE2 implementation is identical to SVE
 #include "VecSim/spaces/L2/L2_SVE_SQ8_SQ8.h"  // SVE2 implementation is identical to SVE
 
@@ -116,6 +118,24 @@ dist_func_t<float> Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim) {
     return ret_dist_func;
 }
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim) {
+    dist_func_t<float> ret_dist_func;
+    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw);
+    return ret_dist_func;
+}
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized)
 // Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim) {
diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h
index 04078a91e..2c1bfbac3 100644
--- a/src/VecSim/spaces/functions/SVE2.h
+++ b/src/VecSim/spaces/functions/SVE2.h
@@ -33,6 +33,10 @@ dist_func_t<float> Choose_SQ8_FP32_IP_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_SQ8_FP32_Cosine_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_SQ8_FP32_L2_implementation_SVE2(size_t dim);
 
+dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim);
+dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim);
+
 // SQ8-to-SQ8 distance functions (both vectors are uint8 quantized)
 dist_func_t<float> Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim);
 dist_func_t<float> Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim);

From 15fca694f344f502e0594c5bc70b85d5156c34d5 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:28:13 +0300
Subject: [PATCH 11/19] =?UTF-8?q?Dispatch=20SQ8=E2=86=94FP16=20to=20SVE/SV?=
 =?UTF-8?q?E2=20tiers=20on=20AArch64=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/VecSim/spaces/IP_space.cpp | 20 ++++++++++++++++++++
 src/VecSim/spaces/L2_space.cpp | 10 ++++++++++
 2 files changed, 30 insertions(+)

diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 92616f394..1930e64a2 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -229,6 +229,16 @@ dist_func_t<float> IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
     if (dim < 16) {
         return ret_dist_func;
     }
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_FP16_IP_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_FP16_IP_implementation_SVE(dim);
+    }
+#endif
 #ifdef OPT_NEON_HP
     if (features.asimdhp) {
         // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
@@ -291,6 +301,16 @@ dist_func_t<float> Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm
     if (dim < 16) {
         return ret_dist_func;
     }
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_FP16_Cosine_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_FP16_Cosine_implementation_SVE(dim);
+    }
+#endif
 #ifdef OPT_NEON_HP
     if (features.asimdhp) {
         // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 995b4c4d6..2e18920b3 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -160,6 +160,16 @@ dist_func_t<float> L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
     if (dim < 16) {
         return ret_dist_func;
     }
+#ifdef OPT_SVE2
+    if (features.sve2) {
+        return Choose_SQ8_FP16_L2_implementation_SVE2(dim);
+    }
+#endif
+#ifdef OPT_SVE
+    if (features.sve) {
+        return Choose_SQ8_FP16_L2_implementation_SVE(dim);
+    }
+#endif
 #ifdef OPT_NEON_HP
     if (features.asimdhp) {
         // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers

From 0fcd7d0975222744e5dfe5f1dabc395c231efed5 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:33:10 +0300
Subject: [PATCH 12/19] =?UTF-8?q?Extend=20SQ8=E2=86=94FP16=20tier-walk=20t?=
 =?UTF-8?q?ests=20with=20SVE/SVE2=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tests/unit/test_spaces.cpp | 72 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index d2c9386ac..f7266dce4 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -3149,6 +3149,29 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
 #ifdef OPT_NEON_HP
     if (optimization.asimdhp) {
         unsigned char alignment = 0;
@@ -3160,6 +3183,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) {
         optimization.asimdhp = 0;
     }
 #endif
+#endif // CPU_FEATURES_ARCH_AARCH64
 
     unsigned char alignment = 0;
     arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
@@ -3236,6 +3260,29 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
 #ifdef OPT_NEON_HP
     if (optimization.asimdhp) {
         unsigned char alignment = 0;
@@ -3247,6 +3294,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) {
         optimization.asimdhp = 0;
     }
 #endif
+#endif // CPU_FEATURES_ARCH_AARCH64
 
     unsigned char alignment = 0;
     arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
@@ -3323,6 +3371,29 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) {
 #endif
 #endif // OPT_F16C
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SVE2(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE2 with dim " << dim;
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
+        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_Cosine_implementation_SVE(dim))
+            << "Unexpected distance function chosen for dim " << dim;
+        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
+            << "SVE with dim " << dim;
+        optimization.sve = 0;
+    }
+#endif
 #ifdef OPT_NEON_HP
     if (optimization.asimdhp) {
         unsigned char alignment = 0;
@@ -3334,6 +3405,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) {
         optimization.asimdhp = 0;
     }
 #endif
+#endif // CPU_FEATURES_ARCH_AARCH64
 
     unsigned char alignment = 0;
     arch_opt_func = Cosine_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);

From 6a783f8ccb641d02d6d80a03500d1a3b265ff001 Mon Sep 17 00:00:00 2001
From: Dor Forer <dor.forer@redis.com>
Date: Thu, 28 May 2026 18:41:59 +0300
Subject: [PATCH 13/19] =?UTF-8?q?Register=20ARM=20SQ8=E2=86=94FP16=20micro?=
 =?UTF-8?q?benchmarks=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../spaces_benchmarks/bm_spaces_sq8_fp16.cpp  | 26 +++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
index ba3030064..9ec022e39 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
@@ -16,8 +16,7 @@ using float16 = vecsim_types::float16;
 /**
  * SQ8-to-FP16 benchmarks: SQ8 quantized storage with FP16 query.
  * Registers the naive (scalar) baseline plus per-ISA SIMD variants (x86: AVX-512 / AVX2+FMA /
- * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels
- * land via MOD-14972.
+ * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels (NEON_HP / SVE / SVE2) are registered below.
  */
 class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture {
 protected:
@@ -85,6 +84,29 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SSE4, 16, s
 #endif // OPT_F16C
 #endif // x86_64
 
+#ifdef CPU_FEATURES_ARCH_AARCH64
+cpu_features::Aarch64Features arm_opt = cpu_features::GetAarch64Info().features;
+
+#ifdef OPT_SVE2
+bool sve2_supported = arm_opt.sve2;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
+#endif
+
+#ifdef OPT_SVE
+bool sve_supported = arm_opt.sve;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
+#endif
+
+#ifdef OPT_NEON_HP
+bool neon_hp_supported = arm_opt.asimdhp;
+INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
+INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16,
+                                  neon_hp_supported);
+#endif
+#endif // CPU_FEATURES_ARCH_AARCH64
+
 // Naive (scalar) baseline — always registered as the comparison anchor.
 
 INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, InnerProduct, 16);

From a2a1b24b1975e4c7a7e4f036eae756ad5739cf16 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-24-101.us-east-2.compute.internal>
Date: Sun, 31 May 2026 07:47:32 +0000
Subject: [PATCH 14/19] =?UTF-8?q?Add=20missing=20alignment=3D0=20assertion?=
 =?UTF-8?q?s=20to=20SQ8=E2=86=94FP16=20ARM=20tier-walk=20tests=20[MOD-1497?=
 =?UTF-8?q?2]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The 9 ARM tier blocks (L2/IP/Cosine × SVE2/SVE/NEON_HP) were missing
ASSERT_EQ(alignment, 0) after each ASSERT_NEAR, unlike the SQ8_FP32
sister blocks which assert it. Adds the assertions to lock the contract
that ARM tiers leave the caller's alignment value untouched.
---
 tests/unit/test_spaces.cpp | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp
index f7266dce4..ce8605565 100644
--- a/tests/unit/test_spaces.cpp
+++ b/tests/unit/test_spaces.cpp
@@ -3158,6 +3158,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim;
         optimization.sve2 = 0;
     }
 #endif
@@ -3169,6 +3170,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim;
         optimization.sve = 0;
     }
 #endif
@@ -3180,6 +3182,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_L2SqrTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim;
         optimization.asimdhp = 0;
     }
 #endif
@@ -3269,6 +3272,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim;
         optimization.sve2 = 0;
     }
 #endif
@@ -3280,6 +3284,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim;
         optimization.sve = 0;
     }
 #endif
@@ -3291,6 +3296,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_InnerProductTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim;
         optimization.asimdhp = 0;
     }
 #endif
@@ -3380,6 +3386,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "SVE2 with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE2 with dim " << dim;
         optimization.sve2 = 0;
     }
 #endif
@@ -3391,6 +3398,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "SVE with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment SVE with dim " << dim;
         optimization.sve = 0;
     }
 #endif
@@ -3402,6 +3410,7 @@ TEST_P(SQ8_FP16_SpacesOptimizationTest, SQ8_FP16_CosineTest) {
             << "Unexpected distance function chosen for dim " << dim;
         ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
             << "NEON_HP with dim " << dim;
+        ASSERT_EQ(alignment, 0) << "No alignment NEON_HP with dim " << dim;
         optimization.asimdhp = 0;
     }
 #endif

From 284ad69889030ee178c394cb4e9236a83e867924 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-24-101.us-east-2.compute.internal>
Date: Sun, 31 May 2026 08:40:11 +0000
Subject: [PATCH 15/19] =?UTF-8?q?Fix=20SVE=20SQ8=E2=86=94FP16=20kernel:=20?=
 =?UTF-8?q?use=20svzip1=20to=20correct=20FP16=E2=86=92FP32=20widening=20[M?=
 =?UTF-8?q?OD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

svcvt_f32_f16_x (FCVT) reads even-indexed FP16 elements: FP32[e] ← FP16[2e].
The step function loaded chunk consecutive FP16 values into positions 0..chunk-1,
then passed them directly to svcvt_f32_f16_x, which picked positions 0,2,4,...
and silently skipped positions 1,3,5,...  For chunk=4 (128-bit SVE), only 2 of
4 FP16 values per step were used, producing wrong dot products.

Fix: svzip1_f16(q_h, zeros) spreads values to even positions [v0,0,v1,0,...] so
FCVT correctly reads v[0],v[1],v[2],...  Applied to both the full step helper
and the partial-chunk path.

Discovered and fixed during ARM host verification (Task 14, MOD-14972).
---
 src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
index 36a7d18e6..d3213e3c3 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
@@ -35,11 +35,16 @@ SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size
     svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
     svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
 
-    // FP16 query -> FP32. svld1_f16 uses a b16 predicate sized to `chunk` half lanes.
+    // FP16 query -> FP32.
+    // svcvt_f32_f16_x (FCVT) reads even-indexed FP16 elements: FP32[e] <- FP16[2e].
+    // Our chunk FP16 values land at consecutive positions 0..chunk-1 after the load.
+    // svzip1 interleaves them with zeros → positions 0,2,4,... hold the values,
+    // so FCVT correctly reads v[0], v[1], v[2], ... into FP32[0..chunk-1].
     svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk));
     svfloat16_t q_h =
         svld1_f16(pg16, reinterpret_cast<const float16_t *>(pVect2) + offset);
-    svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h);
+    svfloat16_t q_h_spread = svzip1_f16(q_h, svdup_f16(0.0f));
+    svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h_spread);
 
     sum = svmla_f32_x(pg, sum, v1_f, v2_f);
     offset += chunk;
@@ -75,7 +80,9 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v
             svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
             svfloat16_t q_h = svld1_f16(
                 pg16_partial, reinterpret_cast<const float16_t *>(pVect2) + offset);
-            svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h);
+            // Same zip1 trick as the full step: spread values to even positions.
+            svfloat16_t q_h_spread = svzip1_f16(q_h, svdup_f16(0.0f));
+            svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h_spread);
             sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
             offset += remaining;
         }

From 3754f76e10dd340dcf71ca33711b714275b1bf3d Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-24-101.us-east-2.compute.internal>
Date: Sun, 31 May 2026 12:43:00 +0000
Subject: [PATCH 16/19] =?UTF-8?q?Optimize=20ARM=20SQ8=E2=86=94FP16=20kerne?=
 =?UTF-8?q?ls=20and=20align=20with=20codebase=20conventions=20[MOD-14972]?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SVE hot loop: replace svzip1_f16+svdup_f16+svwhilelt_b16 (4 ops) with
svld1uh_u32 (1 op) — zero-extends each FP16 halfword into a 32-bit lane
so svcvt_f32_f16_x reads the correct bits directly. Same fix applied to
the partial-chunk path, which also drops the now-redundant pg16_partial
predicate. Accumulator combine changed from svadd_f32_x to svadd_f32_z
to match the SQ8_FP32 SVE sister.

NEON residual: replace the single 8-lane block + up-to-7 software-scalar
iterations with three independent 4-lane sub-steps (r>=4, r>=8, r>=12),
leaving at most 3 elements for scalar — mirrors the SQ8_FP32 NEON sister
exactly. Eliminates expensive vecsim_types::FP16_to_FP32 calls for
residuals 4..15 (previously up to 7 software conversions per call).

Both IP headers: remove assert()+<cassert> (no sister kernel uses them).
Both L2 headers: drop redundant float16.h include and using declarations
(arrive transitively through the included IP header).
---
 src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h | 64 +++++++++++++------------
 src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h  | 60 ++++++++---------------
 src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h |  5 --
 src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h  |  5 --
 4 files changed, 52 insertions(+), 82 deletions(-)

diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
index b1d26fec5..8b8d37c0f 100644
--- a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
@@ -11,19 +11,14 @@
 #include "VecSim/types/sq8.h"
 #include "VecSim/types/float16.h"
 #include <arm_neon.h>
-#include <cassert>
 
 using sq8 = vecsim_types::sq8;
 using float16 = vecsim_types::float16;
 
 /*
- * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity:
+ * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
  *
- *   IP(x, y) = sum(x_i * y_i)
- *            ~= sum((min + delta * q_i) * y_i)
- *            = min * y_sum + delta * sum(q_i * y_i)
- *
- * The hot loop only accumulates sum(q_i * y_i) - no per-element dequantisation.
  * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk.
  */
 
@@ -32,7 +27,6 @@ static inline void
 SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
                                   float32x4_t &sum0, float32x4_t &sum1,
                                   float32x4_t &sum2, float32x4_t &sum3) {
-    // SQ8 storage: 16 * uint8 -> 4 * float32x4_t
     uint8x16_t v1_u8 = vld1q_u8(pVect1);
     uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8));
     uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8));
@@ -41,7 +35,6 @@ SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2
     float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi)));
     float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi)));
 
-    // FP16 query: 16 * f16 -> 4 * float32x4_t via vcvt_f32_f16
     const float16_t *q = reinterpret_cast<const float16_t *>(pVect2);
     float16x8_t q_lo = vld1q_f16(q);
     float16x8_t q_hi = vld1q_f16(q + 8);
@@ -59,14 +52,12 @@ SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2
     pVect2 += 16;
 }
 
-// pVect1v = SQ8 storage, pVect2v = FP16 query
+// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
 template <unsigned char residual> // 0..15
 float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v,
                                               size_t dimension) {
-    assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16");
-
-    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v); // SQ8 storage
-    const float16 *pVect2 = static_cast<const float16 *>(pVect2v); // FP16 query
+    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
 
     float32x4_t sum0 = vdupq_n_f32(0.0f);
     float32x4_t sum1 = vdupq_n_f32(0.0f);
@@ -78,36 +69,48 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p
         SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3);
     }
 
-    // Residual handling: dim % 16 lanes.
-    // residual >= 8: one safe 8-lane SQ8 + 8-lane FP16 load (FP16 trailer is wide enough).
-    // residual <  8: scalar-only - a 4-lane FP16 load would overread y_sum metadata.
+    // Residual: up to three independent 4-lane sub-steps, leaving at most 3 elements
+    // for scalar — mirrors the SQ8_FP32 NEON sister pattern.
+    // vld1_f16 (4 FP16 = 8 bytes) is safe for any residual: FP16 metadata follows
+    // the lane data so there is always enough headroom.
     constexpr unsigned char r = residual;
-    if constexpr (r >= 8) {
+    if constexpr (r >= 4) {
         uint8x8_t v1_u8 = vld1_u8(pVect1);
-        uint16x8_t v1_u16 = vmovl_u8(v1_u8);
-        float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_u16)));
-        float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_u16)));
-        float16x8_t q_h = vld1q_f16(reinterpret_cast<const float16_t *>(pVect2));
-        float32x4_t v2_a = vcvt_f32_f16(vget_low_f16(q_h));
-        float32x4_t v2_b = vcvt_f32_f16(vget_high_f16(q_h));
+        float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
+        float32x4_t v2_a =
+            vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
         sum0 = vfmaq_f32(sum0, v1_a, v2_a);
+        pVect1 += 4;
+        pVect2 += 4;
+    }
+    if constexpr (r >= 8) {
+        uint8x8_t v1_u8 = vld1_u8(pVect1);
+        float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
+        float32x4_t v2_b =
+            vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
         sum1 = vfmaq_f32(sum1, v1_b, v2_b);
-        pVect1 += 8;
-        pVect2 += 8;
+        pVect1 += 4;
+        pVect2 += 4;
+    }
+    if constexpr (r >= 12) {
+        uint8x8_t v1_u8 = vld1_u8(pVect1);
+        float32x4_t v1_c = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
+        float32x4_t v2_c =
+            vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
+        sum2 = vfmaq_f32(sum2, v1_c, v2_c);
+        pVect1 += 4;
+        pVect2 += 4;
     }
-    // Lane-by-lane scalar for the final 0..7 (residual % 8) elements.
-    constexpr unsigned char tail = r & 0x7;
+    constexpr unsigned char tail = r & 3;
     float scalar_dot = 0.0f;
     for (unsigned char k = 0; k < tail; ++k) {
         scalar_dot += static_cast<float>(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]);
     }
 
-    // Reduce the four NEON accumulators.
     float32x4_t sum_lo = vaddq_f32(sum0, sum1);
     float32x4_t sum_hi = vaddq_f32(sum2, sum3);
     float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot;
 
-    // Metadata loads - use load_unaligned because odd dim leaves trailers unaligned.
     const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
     const float min_val =
         load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
@@ -130,6 +133,5 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect
 
 template <unsigned char residual>
 float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    // Cosine = 1 - IP (vectors are pre-normalised); reuses the IP wrapper.
     return SQ8_FP16_InnerProductSIMD16_NEON_HP<residual>(pVect1v, pVect2v, dimension);
 }
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
index d3213e3c3..6c7a52529 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
@@ -11,51 +11,36 @@
 #include "VecSim/types/sq8.h"
 #include "VecSim/types/float16.h"
 #include <arm_sve.h>
-#include <cassert>
 
 using sq8 = vecsim_types::sq8;
 using float16 = vecsim_types::float16;
 
 /*
- * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity:
+ * Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
+ *   IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
  *
- *   IP(x, y) ~= min * y_sum + delta * sum(q_i * y_i)
- *
- * Hot loop accumulates sum(q_i * y_i) only; FP16 query lanes are widened to FP32
- * inside each step via svcvt_f32_f16_x. Metadata loads use load_unaligned<float>.
+ * FP16 query lanes are widened to FP32 per step via svld1uh_u32 + svcvt_f32_f16_x.
+ * svld1uh_u32 zero-extends each FP16 halfword into a 32-bit lane so that
+ * svcvt_f32_f16_x reads the correct bits directly without any interleaving.
  */
 
 // Helper: one SVE-vector-width-of-FP32 step.
-//   chunk = svcntw() - number of FP32 lanes per step.
-//   pg    = svptrue_b32() - predicate for FP32 lanes.
 static inline void
 SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset,
                               svfloat32_t &sum, svbool_t pg, size_t chunk) {
-    // SQ8 -> uint32 (widen on load), then to FP32.
     svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
     svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
-
-    // FP16 query -> FP32.
-    // svcvt_f32_f16_x (FCVT) reads even-indexed FP16 elements: FP32[e] <- FP16[2e].
-    // Our chunk FP16 values land at consecutive positions 0..chunk-1 after the load.
-    // svzip1 interleaves them with zeros → positions 0,2,4,... hold the values,
-    // so FCVT correctly reads v[0], v[1], v[2], ... into FP32[0..chunk-1].
-    svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk));
-    svfloat16_t q_h =
-        svld1_f16(pg16, reinterpret_cast<const float16_t *>(pVect2) + offset);
-    svfloat16_t q_h_spread = svzip1_f16(q_h, svdup_f16(0.0f));
-    svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h_spread);
-
+    svuint32_t q_u32 =
+        svld1uh_u32(pg, reinterpret_cast<const uint16_t *>(pVect2 + offset));
+    svfloat32_t v2_f = svcvt_f32_f16_x(pg, svreinterpret_f16_u32(q_u32));
     sum = svmla_f32_x(pg, sum, v1_f, v2_f);
     offset += chunk;
 }
 
-// pVect1v = SQ8 storage, pVect2v = FP16 query
+// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
 template <bool partial_chunk, unsigned char additional_steps>
 float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v,
                                         size_t dimension) {
-    assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16");
-
     const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
     const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
     size_t offset = 0;
@@ -67,28 +52,23 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v
     svfloat32_t sum2 = svdup_f32(0.0f);
     svfloat32_t sum3 = svdup_f32(0.0f);
 
-    // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero -
-    // the final reduction below walks all lanes via svptrue_b32().
+    // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero;
+    // the final reduction walks all lanes via svptrue_b32().
     if constexpr (partial_chunk) {
         size_t remaining = dimension % chunk;
         if (remaining > 0) {
-            svbool_t pg_partial =
-                svwhilelt_b32(uint32_t(0), uint32_t(remaining));
-            svbool_t pg16_partial =
-                svwhilelt_b16(uint32_t(0), uint32_t(remaining));
+            svbool_t pg_partial = svwhilelt_b32(uint32_t(0), uint32_t(remaining));
             svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset);
             svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
-            svfloat16_t q_h = svld1_f16(
-                pg16_partial, reinterpret_cast<const float16_t *>(pVect2) + offset);
-            // Same zip1 trick as the full step: spread values to even positions.
-            svfloat16_t q_h_spread = svzip1_f16(q_h, svdup_f16(0.0f));
-            svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h_spread);
+            svuint32_t q_u32 = svld1uh_u32(
+                pg_partial, reinterpret_cast<const uint16_t *>(pVect2 + offset));
+            svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, svreinterpret_f16_u32(q_u32));
             sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
             offset += remaining;
         }
     }
 
-    // Main loop: 4 chunks per iteration via 4 accumulators.
+    // Main loop: 4 chunks per iteration, one chunk per accumulator.
     const size_t chunk_size = 4 * chunk;
     const size_t number_of_chunks =
         (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
@@ -99,7 +79,6 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v
         SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk);
     }
 
-    // Additional steps 0..3.
     if constexpr (additional_steps > 0)
         SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
     if constexpr (additional_steps > 1)
@@ -107,12 +86,11 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v
     if constexpr (additional_steps > 2)
         SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
 
-    svfloat32_t sum = svadd_f32_x(pg, sum0, sum1);
-    sum = svadd_f32_x(pg, sum, sum2);
-    sum = svadd_f32_x(pg, sum, sum3);
+    svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
+    sum = svadd_f32_z(pg, sum, sum2);
+    sum = svadd_f32_z(pg, sum, sum3);
     float quantized_dot = svaddv_f32(pg, sum);
 
-    // Metadata loads - unaligned because odd dim leaves trailers unaligned.
     const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
     const float min_val =
         load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
index 7bf5db986..15cc40f6a 100644
--- a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
+++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
@@ -9,11 +9,6 @@
 #pragma once
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
-#include "VecSim/types/sq8.h"
-#include "VecSim/types/float16.h"
-
-using sq8 = vecsim_types::sq8;
-using float16 = vecsim_types::float16;
 
 /*
  * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity:
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
index 3c8e89ca6..e3592c24e 100644
--- a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
@@ -9,11 +9,6 @@
 #pragma once
 #include "VecSim/spaces/space_includes.h"
 #include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
-#include "VecSim/types/sq8.h"
-#include "VecSim/types/float16.h"
-
-using sq8 = vecsim_types::sq8;
-using float16 = vecsim_types::float16;
 
 /*
  * SVE SQ8<->FP16 L2 squared distance:

From 10c03aae9540c6ce769bee9b2a28f10eaac8e59a Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-24-101.us-east-2.compute.internal>
Date: Sun, 31 May 2026 13:03:53 +0000
Subject: [PATCH 17/19] Apply clang-format [MOD-14972]

---
 src/VecSim/batch_iterator.h                   |  2 +-
 src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h       | 28 ++++++----------
 src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h        | 33 ++++++++-----------
 src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h       | 10 +++---
 src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h        |  7 ++--
 tests/benchmark/bm_vecsim_svs.h               | 14 ++++----
 .../spaces_benchmarks/bm_spaces_sq8_fp16.cpp  |  5 +--
 tests/benchmark/types_ranges.h                | 12 ++++---
 tests/unit/test_allocator.cpp                 |  4 +--
 9 files changed, 53 insertions(+), 62 deletions(-)

diff --git a/src/VecSim/batch_iterator.h b/src/VecSim/batch_iterator.h
index 9e2791130..466072f86 100644
--- a/src/VecSim/batch_iterator.h
+++ b/src/VecSim/batch_iterator.h
@@ -27,7 +27,7 @@ struct VecSimBatchIterator : public VecsimBaseObject {
     explicit VecSimBatchIterator(void *query_vector, void *tctx,
                                  std::shared_ptr<VecSimAllocator> allocator)
         : VecsimBaseObject(allocator), query_vector(query_vector), returned_results_count(0),
-          timeoutCtx(tctx) {};
+          timeoutCtx(tctx){};
 
     virtual inline const void *getQueryBlob() const { return query_vector; }
 
diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
index 8b8d37c0f..a5c2465fc 100644
--- a/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
+++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
@@ -23,10 +23,9 @@ using float16 = vecsim_types::float16;
  */
 
 // Helper: 16 lanes per call, four FP32 accumulators (one per quarter).
-static inline void
-SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
-                                  float32x4_t &sum0, float32x4_t &sum1,
-                                  float32x4_t &sum2, float32x4_t &sum3) {
+static inline void SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
+                                                     float32x4_t &sum0, float32x4_t &sum1,
+                                                     float32x4_t &sum2, float32x4_t &sum3) {
     uint8x16_t v1_u8 = vld1q_u8(pVect1);
     uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8));
     uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8));
@@ -77,8 +76,7 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p
     if constexpr (r >= 4) {
         uint8x8_t v1_u8 = vld1_u8(pVect1);
         float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
-        float32x4_t v2_a =
-            vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
+        float32x4_t v2_a = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
         sum0 = vfmaq_f32(sum0, v1_a, v2_a);
         pVect1 += 4;
         pVect2 += 4;
@@ -86,8 +84,7 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p
     if constexpr (r >= 8) {
         uint8x8_t v1_u8 = vld1_u8(pVect1);
         float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
-        float32x4_t v2_b =
-            vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
+        float32x4_t v2_b = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
         sum1 = vfmaq_f32(sum1, v1_b, v2_b);
         pVect1 += 4;
         pVect2 += 4;
@@ -95,8 +92,7 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p
     if constexpr (r >= 12) {
         uint8x8_t v1_u8 = vld1_u8(pVect1);
         float32x4_t v1_c = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
-        float32x4_t v2_c =
-            vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
+        float32x4_t v2_c = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
         sum2 = vfmaq_f32(sum2, v1_c, v2_c);
         pVect1 += 4;
         pVect2 += 4;
@@ -112,14 +108,11 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p
     float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot;
 
     const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
-    const float min_val =
-        load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
-    const float delta =
-        load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
     const uint8_t *query_meta_bytes =
         reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
-    const float y_sum =
-        load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
 
     return min_val * y_sum + delta * quantized_dot;
 }
@@ -127,8 +120,7 @@ float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *p
 template <unsigned char residual>
 float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v,
                                           size_t dimension) {
-    return 1.0f -
-           SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
+    return 1.0f - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
 }
 
 template <unsigned char residual>
diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
index 6c7a52529..1408e0880 100644
--- a/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
+++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
@@ -25,13 +25,12 @@ using float16 = vecsim_types::float16;
  */
 
 // Helper: one SVE-vector-width-of-FP32 step.
-static inline void
-SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset,
-                              svfloat32_t &sum, svbool_t pg, size_t chunk) {
+static inline void SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2,
+                                                 size_t &offset, svfloat32_t &sum, svbool_t pg,
+                                                 size_t chunk) {
     svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
     svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
-    svuint32_t q_u32 =
-        svld1uh_u32(pg, reinterpret_cast<const uint16_t *>(pVect2 + offset));
+    svuint32_t q_u32 = svld1uh_u32(pg, reinterpret_cast<const uint16_t *>(pVect2 + offset));
     svfloat32_t v2_f = svcvt_f32_f16_x(pg, svreinterpret_f16_u32(q_u32));
     sum = svmla_f32_x(pg, sum, v1_f, v2_f);
     offset += chunk;
@@ -60,8 +59,8 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v
             svbool_t pg_partial = svwhilelt_b32(uint32_t(0), uint32_t(remaining));
             svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset);
             svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
-            svuint32_t q_u32 = svld1uh_u32(
-                pg_partial, reinterpret_cast<const uint16_t *>(pVect2 + offset));
+            svuint32_t q_u32 =
+                svld1uh_u32(pg_partial, reinterpret_cast<const uint16_t *>(pVect2 + offset));
             svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, svreinterpret_f16_u32(q_u32));
             sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
             offset += remaining;
@@ -92,27 +91,23 @@ float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v
     float quantized_dot = svaddv_f32(pg, sum);
 
     const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
-    const float min_val =
-        load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
-    const float delta =
-        load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
-    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
-        static_cast<const float16 *>(pVect2v) + dimension);
-    const float y_sum =
-        load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
+    const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
+    const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
+    const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
 
     return min_val * y_sum + delta * quantized_dot;
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
-float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v,
-                                    size_t dimension) {
+float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
     return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
                       pVect1v, pVect2v, dimension);
 }
 
 template <bool partial_chunk, unsigned char additional_steps>
 float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    return SQ8_FP16_InnerProductSIMD_SVE<partial_chunk, additional_steps>(
-        pVect1v, pVect2v, dimension);
+    return SQ8_FP16_InnerProductSIMD_SVE<partial_chunk, additional_steps>(pVect1v, pVect2v,
+                                                                          dimension);
 }
diff --git a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
index 15cc40f6a..70367d7fe 100644
--- a/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
+++ b/src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
@@ -21,15 +21,13 @@
 
 template <unsigned char residual> // 0..15
 float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    const float ip =
-        SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
+    const float ip = SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
 
     const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
-    const float x_sum_sq =
-        load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
 
-    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
-        static_cast<const float16 *>(pVect2v) + dimension);
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
     const float y_sum_sq =
         load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
 
diff --git a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
index e3592c24e..f70ef493d 100644
--- a/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
+++ b/src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
@@ -22,10 +22,9 @@ float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t di
         pVect1v, pVect2v, dimension);
 
     const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
-    const float x_sum_sq =
-        load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
-    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
-        static_cast<const float16 *>(pVect2v) + dimension);
+    const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
+    const uint8_t *query_meta_bytes =
+        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
     const float y_sum_sq =
         load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
 
diff --git a/tests/benchmark/bm_vecsim_svs.h b/tests/benchmark/bm_vecsim_svs.h
index 5acb882c0..b92cce5e0 100644
--- a/tests/benchmark/bm_vecsim_svs.h
+++ b/tests/benchmark/bm_vecsim_svs.h
@@ -466,17 +466,19 @@ void BM_VecSimSVS<index_type_t>::RunGC(benchmark::State &st) {
 #define UNIT_AND_ITERATIONS Unit(benchmark::kMillisecond)->Iterations(2)
 
 #if HAVE_SVS_LVQ
-#define QUANT_BITS_ARGS {VecSimSvsQuant_8, VecSimSvsQuant_4x8_LeanVec}
+#define QUANT_BITS_ARGS                                                                            \
+    { VecSimSvsQuant_8, VecSimSvsQuant_4x8_LeanVec }
 #define COMPRESSED_TRAINING_THRESHOLD_ARGS                                                         \
-    {static_cast<long int>(BM_VecSimGeneral::block_size), 5000, 10000}
+    { static_cast<long int>(BM_VecSimGeneral::block_size), 5000, 10000 }
 #define COMPRESSED_ASYNC_TRAINING_THRESHOLD_ARGS                                                   \
-    {static_cast<long int>(BM_VecSimGeneral::block_size), 5000, 10000, 50000}
+    { static_cast<long int>(BM_VecSimGeneral::block_size), 5000, 10000, 50000 }
 #else
-#define QUANT_BITS_ARGS {VecSimSvsQuant_8}
+#define QUANT_BITS_ARGS                                                                            \
+    { VecSimSvsQuant_8 }
 // Using smaller training TH to avoid long test times without LVQ
 #define COMPRESSED_TRAINING_THRESHOLD_ARGS                                                         \
-    {static_cast<long int>(BM_VecSimGeneral::block_size), 5000}
+    { static_cast<long int>(BM_VecSimGeneral::block_size), 5000 }
 #define COMPRESSED_ASYNC_TRAINING_THRESHOLD_ARGS                                                   \
-    {static_cast<long int>(BM_VecSimGeneral::block_size), 5000}
+    { static_cast<long int>(BM_VecSimGeneral::block_size), 5000 }
 
 #endif
diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
index 9ec022e39..cc5d040cb 100644
--- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
+++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
@@ -16,7 +16,8 @@ using float16 = vecsim_types::float16;
 /**
  * SQ8-to-FP16 benchmarks: SQ8 quantized storage with FP16 query.
  * Registers the naive (scalar) baseline plus per-ISA SIMD variants (x86: AVX-512 / AVX2+FMA /
- * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels (NEON_HP / SVE / SVE2) are registered below.
+ * AVX2 / SSE4 — gated on the matching OPT_* defines and runtime CPU features). ARM kernels (NEON_HP
+ * / SVE / SVE2) are registered below.
  */
 class BM_VecSimSpaces_SQ8_FP16 : public benchmark::Fixture {
 protected:
@@ -103,7 +104,7 @@ INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sv
 bool neon_hp_supported = arm_opt.asimdhp;
 INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
 INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16,
-                                  neon_hp_supported);
+                                 neon_hp_supported);
 #endif
 #endif // CPU_FEATURES_ARCH_AARCH64
 
diff --git a/tests/benchmark/types_ranges.h b/tests/benchmark/types_ranges.h
index 43abda8f0..deff4251c 100644
--- a/tests/benchmark/types_ranges.h
+++ b/tests/benchmark/types_ranges.h
@@ -11,9 +11,11 @@
 #include <array>
 #include "bm_definitions.h"
 
-#define DEFAULT_RANGE_RADII {20, 35, 50}
+#define DEFAULT_RANGE_RADII                                                                        \
+    { 20, 35, 50 }
 
-#define DEFAULT_RANGE_EPSILONS {1, 10, 11}
+#define DEFAULT_RANGE_EPSILONS                                                                     \
+    { 1, 10, 11 }
 
 // This template struct methods returns the default values for radii and epsilons
 // To specify different values for a certain type, use template specialization
@@ -25,7 +27,8 @@ struct benchmark_range {
 
 // Larger Range query values are required for int8 wikipedia dataset.
 // Default values give 0 results
-#define INT8_RANGE_RADII {50, 65, 80}
+#define INT8_RANGE_RADII                                                                           \
+    { 50, 65, 80 }
 
 template <>
 struct benchmark_range<int8_index_t> {
@@ -34,7 +37,8 @@ struct benchmark_range<int8_index_t> {
 };
 
 // UINT8 ranges
-#define UINT8_RANGE_RADII {4, 5, 7}
+#define UINT8_RANGE_RADII                                                                          \
+    { 4, 5, 7 }
 
 template <>
 struct benchmark_range<uint8_index_t> {
diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp
index 6aa4a0d0b..77db41684 100644
--- a/tests/unit/test_allocator.cpp
+++ b/tests/unit/test_allocator.cpp
@@ -33,7 +33,7 @@ struct ObjectWithSTL : public VecsimBaseObject {
 
 public:
     ObjectWithSTL(std::shared_ptr<VecSimAllocator> allocator)
-        : VecsimBaseObject(allocator), test_vec(allocator) {};
+        : VecsimBaseObject(allocator), test_vec(allocator){};
 };
 
 struct NestedObject : public VecsimBaseObject {
@@ -42,7 +42,7 @@ struct NestedObject : public VecsimBaseObject {
 
 public:
     NestedObject(std::shared_ptr<VecSimAllocator> allocator)
-        : VecsimBaseObject(allocator), stl_object(allocator), simpleObject(allocator) {};
+        : VecsimBaseObject(allocator), stl_object(allocator), simpleObject(allocator){};
 };
 
 TEST_F(AllocatorTest, test_simple_object) {

From 9741cfb62a92cf0b04b78d43c50a89789deed9e1 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-24-101.us-east-2.compute.internal>
Date: Sun, 31 May 2026 13:20:58 +0000
Subject: [PATCH 18/19] Trim PR churn: remove docs, dispatcher comments, and
 test verbosity [MOD-14972]
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Remove docs/superpowers/ design and plan files (~1550 lines); sister PR #970
  removed its equivalent doc before merge.
- Drop 5-line "No alignment write" prose comment from the three AArch64
  NEON_HP dispatcher blocks; the sister SQ8_FP32 ARM dispatchers carry no
  such comment — the absent alignment write already encodes the intent.
- Trim GetDistFuncSQ8FP16Asymmetric to a 7-line template-mapping check at
  dim=15, matching the shape of GetDistFuncSQ8Asymmetric (SQ8_FP32 sister).
  The scalar-fallback assertion it previously duplicated is already covered
  by the trailing block of SQ8_FP16_SpacesOptimizationTest.
---
 .../plans/2026-05-28-arm-sq8-fp16-kernels.md  | 1195 -----------------
 .../specs/2026-05-28-arm-sq8-fp16-design.md   |  354 -----
 src/VecSim/spaces/IP_space.cpp                |    6 -
 src/VecSim/spaces/L2_space.cpp                |    3 -
 4 files changed, 1558 deletions(-)
 delete mode 100644 docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md
 delete mode 100644 docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md

diff --git a/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md b/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md
deleted file mode 100644
index 2759ba046..000000000
--- a/docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md
+++ /dev/null
@@ -1,1195 +0,0 @@
-# SQ8↔FP16 ARM SIMD Distance Kernels — Implementation Plan
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Add SQ8↔FP16 asymmetric distance kernels (IP, L2, Cosine) for ARM ISA tiers — NEON_HP, SVE, SVE2 — plugged into the existing dispatcher. Mirrors the x86 work delivered in PR #970.
-
-**Architecture:** Header-only SIMD kernel templates (one per metric × ISA), instantiated via the existing `CHOOSE_IMPLEMENTATION` / `CHOOSE_SVE_IMPLEMENTATION` macros inside ISA-specific TUs (`NEON_HP.cpp`, `SVE.cpp`, `SVE2.cpp`). Wiring lives in `IP_space.cpp` and `L2_space.cpp` under a `#ifdef CPU_FEATURES_ARCH_AARCH64` block that parallels the existing x86 block. L2 reuses the IP `_IMP` template via the algebraic identity `L2² = x_sum_sq + y_sum_sq − 2·IP`. Scalar fallback already on `main` is unchanged and stays as the reference for every tier.
-
-**Tech Stack:** C++20, ARM NEON intrinsics (`arm_neon.h`), ARM SVE/SVE2 intrinsics (`arm_sve.h`), GoogleTest, Google Benchmark, cpu_features.
-
-**Branch:** `dor-forer-sq8-fp16-arm-kernels-mod-14972` (stacked on PR #970 / `dor-forer-sq8-fp16-x86-kernels-mod-14954`).
-
-**Build / test loop:** The user runs `make build` (per project memory). After each build cycle confirmed, the assistant runs `make unit_test` / ASan / benchmarks on the appropriate host (ARM hardware or cross-compile/qemu — coordinate with user). Each task ends in a commit; commits are pushed only when explicitly requested.
-
-**Spec:** [`docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md`](../specs/2026-05-28-arm-sq8-fp16-design.md)
-
----
-
-## File Structure
-
-### Files created
-
-| Path | Responsibility |
-|------|----------------|
-| `src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h` | NEON IP kernel template (`SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP` + thin wrappers) |
-| `src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h` | NEON L2 kernel template (calls NEON IP impl, applies L2 identity) |
-| `src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h` | SVE IP kernel template (`SQ8_FP16_InnerProductSIMD_SVE_IMP` + wrappers); also `#include`d from SVE2.cpp |
-| `src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h` | SVE L2 kernel template; also `#include`d from SVE2.cpp |
-
-### Files modified
-
-| Path | Change |
-|------|--------|
-| `src/VecSim/spaces/functions/NEON_HP.h` | +3 chooser declarations (IP, L2, Cosine) |
-| `src/VecSim/spaces/functions/NEON_HP.cpp` | +#include kernel headers; +3 chooser definitions |
-| `src/VecSim/spaces/functions/SVE.h` | +3 chooser declarations |
-| `src/VecSim/spaces/functions/SVE.cpp` | +#include kernel headers; +3 chooser definitions |
-| `src/VecSim/spaces/functions/SVE2.h` | +3 chooser declarations |
-| `src/VecSim/spaces/functions/SVE2.cpp` | +#include SVE kernel headers; +3 chooser definitions (own symbols, templates instantiated under SVE2 compile flags) |
-| `src/VecSim/spaces/IP_space.cpp` | +#ifdef AArch64 block in `IP_SQ8_FP16_GetDistFunc` and `Cosine_SQ8_FP16_GetDistFunc` (2 dispatcher blocks) |
-| `src/VecSim/spaces/L2_space.cpp` | +#ifdef AArch64 block in `L2_SQ8_FP16_GetDistFunc` (1 dispatcher block) |
-| `tests/unit/test_spaces.cpp` | retarget `GetDistFuncSQ8FP16Asymmetric` to dim=15; add dim=0 test; extend the three `SQ8_FP16_SpacesOptimizationTest` test bodies with ARM tier walks; extend `SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` with AArch64 tier reporting |
-| `tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` | +AArch64 `cpu_features` block; +ARM ISA benchmark registrations |
-
-### Files NOT modified
-
-`src/VecSim/spaces/CMakeLists.txt` — zero CMake changes. Existing TU flags (`-march=armv8.2-a+fp16fml` for NEON_HP, `-march=armv8-a+sve` for SVE, `-march=armv9-a+sve2` for SVE2) already carry everything the new kernels need.
-
----
-
-## Task 1: Retarget the scalar-fallback dispatcher test
-
-**Why first:** Builds and runs on x86 today, has nothing to do with the ARM kernels, and tightens the contract the rest of the plan relies on (the dispatcher returns scalar for `dim < 16`).
-
-**Files:**
-- Modify: `tests/unit/test_spaces.cpp` — locate test named `GetDistFuncSQ8FP16Asymmetric` (added by PR #970; currently asserts `dim=128` returns the scalar fallback)
-
-- [ ] **Step 1: Locate the existing test**
-
-Run:
-```bash
-grep -n 'GetDistFuncSQ8FP16Asymmetric' tests/unit/test_spaces.cpp
-```
-Expected: one or more line hits pointing at the `TEST(...)` block.
-
-- [ ] **Step 2: Modify the test to cover dim=0 and dim=15 instead of dim=128**
-
-Replace the body of the existing `TEST(..., GetDistFuncSQ8FP16Asymmetric)` so it walks two below-threshold dims and asserts the scalar fallback for each of L2 / IP / Cosine. Drop in this exact body (rename the test fixture symbol to match what is already there if it differs):
-
-```cpp
-TEST_F(SpacesTest, GetDistFuncSQ8FP16Asymmetric) {
-    // SQ8 storage with FP16 query (asymmetric) - should return SQ8_FP16 functions.
-    // Per-ISA dispatcher walk coverage lives in the SQ8_FP16 SpacesOptimizationTest below.
-    //
-    // Walk two below-threshold dims (0 and 15) so the assertions hold regardless of which
-    // SIMD tiers the host advertises: dim < 16 must always short-circuit to scalar fallback.
-    // The template-mapping form (spaces::GetDistFunc<sq8, float, float16>) and the direct
-    // *_SQ8_FP16_GetDistFunc form must agree for every dim, and both must match the scalar
-    // reference at sub-threshold dims.
-    for (size_t dim : {static_cast<size_t>(0), static_cast<size_t>(15)}) {
-        auto l2_func = spaces::GetDistFunc<sq8, float, float16>(VecSimMetric_L2, dim, nullptr);
-        auto ip_func = spaces::GetDistFunc<sq8, float, float16>(VecSimMetric_IP, dim, nullptr);
-        auto cosine_func =
-            spaces::GetDistFunc<sq8, float, float16>(VecSimMetric_Cosine, dim, nullptr);
-
-        ASSERT_EQ(l2_func, L2_SQ8_FP16_GetDistFunc(dim, nullptr))
-            << "Template mapping disagrees with direct dispatcher for L2 at dim=" << dim;
-        ASSERT_EQ(ip_func, IP_SQ8_FP16_GetDistFunc(dim, nullptr))
-            << "Template mapping disagrees with direct dispatcher for IP at dim=" << dim;
-        ASSERT_EQ(cosine_func, Cosine_SQ8_FP16_GetDistFunc(dim, nullptr))
-            << "Template mapping disagrees with direct dispatcher for Cosine at dim=" << dim;
-
-        ASSERT_EQ(l2_func, SQ8_FP16_L2Sqr)
-            << "dim=" << dim << " must short-circuit to scalar L2 fallback";
-        ASSERT_EQ(ip_func, SQ8_FP16_InnerProduct)
-            << "dim=" << dim << " must short-circuit to scalar IP fallback";
-        ASSERT_EQ(cosine_func, SQ8_FP16_Cosine)
-            << "dim=" << dim << " must short-circuit to scalar Cosine fallback";
-    }
-}
-```
-
-- [ ] **Step 3: User builds**
-
-Ask the user to run `make build` (their normal x86 build is sufficient — this test is host-agnostic).
-
-- [ ] **Step 4: Run the test**
-
-Run:
-```bash
-./bin/<host-triple>/unit_tests --gtest_filter='SpacesTest.GetDistFuncSQ8FP16Asymmetric'
-```
-(Use `find bin -name unit_tests -type f` if the host-triple subdir is unknown.)
-Expected: PASS.
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add tests/unit/test_spaces.cpp
-git commit -m "Retarget SQ8↔FP16 scalar-fallback dispatcher test to dim=0/15 [MOD-14972]"
-```
-
----
-
-## Task 2: NEON IP kernel header
-
-**Files:**
-- Create: `src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h`
-
-- [ ] **Step 1: Author the kernel file**
-
-Create exactly this file (modeled on `IP_NEON_SQ8_FP32.h` + the NEON FP16 widening pattern from `IP_NEON_FP16.h`):
-
-```cpp
-/*
- * Copyright (c) 2006-Present, Redis Ltd.
- * All rights reserved.
- *
- * Licensed under your choice of the Redis Source Available License 2.0
- * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
- * GNU Affero General Public License v3 (AGPLv3).
- */
-#pragma once
-#include "VecSim/spaces/space_includes.h"
-#include "VecSim/types/sq8.h"
-#include "VecSim/types/float16.h"
-#include <arm_neon.h>
-#include <cassert>
-
-using sq8 = vecsim_types::sq8;
-using float16 = vecsim_types::float16;
-
-/*
- * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity:
- *
- *   IP(x, y) = sum(x_i * y_i)
- *            ~= sum((min + delta * q_i) * y_i)
- *            = min * y_sum + delta * sum(q_i * y_i)
- *
- * The hot loop only accumulates sum(q_i * y_i) - no per-element dequantisation.
- * FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk.
- */
-
-// Helper: 16 lanes per call, four FP32 accumulators (one per quarter).
-static inline void
-SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
-                                  float32x4_t &sum0, float32x4_t &sum1,
-                                  float32x4_t &sum2, float32x4_t &sum3) {
-    // SQ8 storage: 16 * uint8 -> 4 * float32x4_t
-    uint8x16_t v1_u8 = vld1q_u8(pVect1);
-    uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8));
-    uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8));
-    float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo)));
-    float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo)));
-    float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi)));
-    float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi)));
-
-    // FP16 query: 16 * f16 -> 4 * float32x4_t via vcvt_f32_f16
-    const float16_t *q = reinterpret_cast<const float16_t *>(pVect2);
-    float16x8_t q_lo = vld1q_f16(q);
-    float16x8_t q_hi = vld1q_f16(q + 8);
-    float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo));
-    float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo));
-    float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi));
-    float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi));
-
-    sum0 = vfmaq_f32(sum0, v1_0, v2_0);
-    sum1 = vfmaq_f32(sum1, v1_1, v2_1);
-    sum2 = vfmaq_f32(sum2, v1_2, v2_2);
-    sum3 = vfmaq_f32(sum3, v1_3, v2_3);
-
-    pVect1 += 16;
-    pVect2 += 16;
-}
-
-// pVect1v = SQ8 storage, pVect2v = FP16 query
-template <unsigned char residual> // 0..15
-float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v,
-                                              size_t dimension) {
-    assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16");
-
-    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v); // SQ8 storage
-    const float16 *pVect2 = static_cast<const float16 *>(pVect2v); // FP16 query
-
-    float32x4_t sum0 = vdupq_n_f32(0.0f);
-    float32x4_t sum1 = vdupq_n_f32(0.0f);
-    float32x4_t sum2 = vdupq_n_f32(0.0f);
-    float32x4_t sum3 = vdupq_n_f32(0.0f);
-
-    const size_t num_of_chunks = dimension / 16;
-    for (size_t i = 0; i < num_of_chunks; i++) {
-        SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3);
-    }
-
-    // Residual handling: dim % 16 lanes.
-    // residual >= 8: one safe 8-lane SQ8 + 8-lane FP16 load (FP16 trailer is wide enough).
-    // residual <  8: scalar-only - a 4-lane FP16 load would overread y_sum metadata.
-    constexpr unsigned char r = residual;
-    if constexpr (r >= 8) {
-        uint8x8_t v1_u8 = vld1_u8(pVect1);
-        uint16x8_t v1_u16 = vmovl_u8(v1_u8);
-        float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_u16)));
-        float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_u16)));
-        float16x8_t q_h = vld1q_f16(reinterpret_cast<const float16_t *>(pVect2));
-        float32x4_t v2_a = vcvt_f32_f16(vget_low_f16(q_h));
-        float32x4_t v2_b = vcvt_f32_f16(vget_high_f16(q_h));
-        sum0 = vfmaq_f32(sum0, v1_a, v2_a);
-        sum1 = vfmaq_f32(sum1, v1_b, v2_b);
-        pVect1 += 8;
-        pVect2 += 8;
-    }
-    // Lane-by-lane scalar for the final 0..7 (residual % 8) elements.
-    constexpr unsigned char tail = r & 0x7;
-    float scalar_dot = 0.0f;
-    for (unsigned char k = 0; k < tail; ++k) {
-        scalar_dot += static_cast<float>(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]);
-    }
-
-    // Reduce the four NEON accumulators.
-    float32x4_t sum_lo = vaddq_f32(sum0, sum1);
-    float32x4_t sum_hi = vaddq_f32(sum2, sum3);
-    float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot;
-
-    // Metadata loads - use load_unaligned because odd dim leaves trailers unaligned.
-    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
-    const float min_val =
-        load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
-    const float delta =
-        load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
-    const uint8_t *query_meta_bytes =
-        reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
-    const float y_sum =
-        load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
-
-    return min_val * y_sum + delta * quantized_dot;
-}
-
-template <unsigned char residual>
-float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v,
-                                          size_t dimension) {
-    return 1.0f -
-           SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
-}
-
-template <unsigned char residual>
-float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    // Cosine = 1 - IP (vectors are pre-normalised); reuses the IP wrapper.
-    return SQ8_FP16_InnerProductSIMD16_NEON_HP<residual>(pVect1v, pVect2v, dimension);
-}
-```
-
-- [ ] **Step 2: Header-only smoke (no build yet)**
-
-Run:
-```bash
-grep -n 'load_unaligned\|FP16_to_FP32' src/VecSim/spaces/space_includes.h \
-    src/VecSim/spaces/IP/IP.cpp src/VecSim/types/float16.h 2>/dev/null
-```
-Expected: confirm the global `load_unaligned<T>` is reachable through `space_includes.h` (matches the include path used by `IP_NEON_SQ8_FP32.h`) and `FP16_to_FP32` is reachable through `VecSim/types/float16.h`. If either include is missing, add it.
-
-- [ ] **Step 3: Commit**
-
-```bash
-git add src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
-git commit -m "Add NEON_HP SQ8↔FP16 IP kernel header [MOD-14972]"
-```
-
----
-
-## Task 3: NEON L2 kernel header
-
-**Files:**
-- Create: `src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h`
-
-- [ ] **Step 1: Author the kernel file**
-
-```cpp
-/*
- * Copyright (c) 2006-Present, Redis Ltd.
- * All rights reserved.
- *
- * Licensed under your choice of the Redis Source Available License 2.0
- * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
- * GNU Affero General Public License v3 (AGPLv3).
- */
-#pragma once
-#include "VecSim/spaces/space_includes.h"
-#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
-#include "VecSim/types/sq8.h"
-#include "VecSim/types/float16.h"
-
-using sq8 = vecsim_types::sq8;
-using float16 = vecsim_types::float16;
-
-/*
- * Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity:
- *
- *   ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2)
- *               = x_sum_squares - 2 * IP(x, y) + y_sum_squares
- *
- * IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32.
- */
-
-template <unsigned char residual> // 0..15
-float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    const float ip =
-        SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
-
-    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
-    const float x_sum_sq =
-        load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
-
-    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
-        static_cast<const float16 *>(pVect2v) + dimension);
-    const float y_sum_sq =
-        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
-
-    return x_sum_sq + y_sum_sq - 2.0f * ip;
-}
-```
-
-- [ ] **Step 2: Commit**
-
-```bash
-git add src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
-git commit -m "Add NEON_HP SQ8↔FP16 L2 kernel header [MOD-14972]"
-```
-
----
-
-## Task 4: NEON_HP dispatcher TU additions
-
-**Files:**
-- Modify: `src/VecSim/spaces/functions/NEON_HP.h` — add 3 declarations
-- Modify: `src/VecSim/spaces/functions/NEON_HP.cpp` — add 3 chooser definitions
-
-- [ ] **Step 1: Add chooser declarations to NEON_HP.h**
-
-In `src/VecSim/spaces/functions/NEON_HP.h`, inside `namespace spaces { ... }`, append these three declarations alongside the existing `Choose_FP16_*_implementation_NEON_HP`:
-
-```cpp
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim);
-dist_func_t<float> Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim);
-dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim);
-```
-
-- [ ] **Step 2: Add chooser definitions to NEON_HP.cpp**
-
-In `src/VecSim/spaces/functions/NEON_HP.cpp`, add the kernel `#include`s alongside the existing FP16 includes:
-
-```cpp
-#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"
-#include "VecSim/spaces/L2/L2_NEON_SQ8_FP16.h"
-```
-
-Then inside `namespace spaces { ... }` (between `#include "implementation_chooser.h"` and `#include "implementation_chooser_cleanup.h"`), append:
-
-```cpp
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP);
-    return ret_dist_func;
-}
-
-dist_func_t<float> Choose_SQ8_FP16_L2_implementation_NEON_HP(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_L2SqrSIMD16_NEON_HP);
-    return ret_dist_func;
-}
-
-dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_NEON_HP(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_FP16_CosineSIMD16_NEON_HP);
-    return ret_dist_func;
-}
-```
-
-- [ ] **Step 3: Commit**
-
-```bash
-git add src/VecSim/spaces/functions/NEON_HP.h src/VecSim/spaces/functions/NEON_HP.cpp
-git commit -m "Wire NEON_HP SQ8↔FP16 choosers [MOD-14972]"
-```
-
----
-
-## Task 5: NEON_HP dispatcher wiring in IP_space.cpp + L2_space.cpp
-
-**Files:**
-- Modify: `src/VecSim/spaces/IP_space.cpp` — `IP_SQ8_FP16_GetDistFunc` + `Cosine_SQ8_FP16_GetDistFunc`
-- Modify: `src/VecSim/spaces/L2_space.cpp` — `L2_SQ8_FP16_GetDistFunc`
-
-Each of those three `_GetDistFunc` functions currently has an `#ifdef CPU_FEATURES_ARCH_X86_64` block with an early `if (dim < 16) return ret_dist_func;` guard followed by per-tier dispatch. We append an `#ifdef CPU_FEATURES_ARCH_AARCH64` block with the matching shape. Only NEON_HP is wired in this task; SVE/SVE2 land in a later task.
-
-- [ ] **Step 1: Confirm the #include for NEON_HP.h is present**
-
-Run:
-```bash
-grep -n 'functions/NEON_HP.h' src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp
-```
-Expected: both files already `#include "VecSim/spaces/functions/NEON_HP.h"`. If a file is missing it, add the include.
-
-- [ ] **Step 2: Wire IP_SQ8_FP16_GetDistFunc**
-
-In `src/VecSim/spaces/IP_space.cpp`, locate `IP_SQ8_FP16_GetDistFunc`. After the closing `#endif // x86_64`, insert a parallel AArch64 block immediately before the trailing `return ret_dist_func;`:
-
-```cpp
-#ifdef CPU_FEATURES_ARCH_AARCH64
-    if (dim < 16) {
-        return ret_dist_func;
-    }
-#ifdef OPT_NEON_HP
-    if (features.asimdhp) {
-        // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
-        // leave *alignment untouched on ARM tiers. The corresponding tests assert
-        // 0xFF passthrough on the scalar path and do not assert any non-zero value here.
-        return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim);
-    }
-#endif
-#endif // CPU_FEATURES_ARCH_AARCH64
-```
-
-- [ ] **Step 3: Wire Cosine_SQ8_FP16_GetDistFunc**
-
-In the same file, locate `Cosine_SQ8_FP16_GetDistFunc`. Insert the same block, swapping `Choose_SQ8_FP16_IP_implementation_NEON_HP` for `Choose_SQ8_FP16_Cosine_implementation_NEON_HP`.
-
-- [ ] **Step 4: Wire L2_SQ8_FP16_GetDistFunc**
-
-In `src/VecSim/spaces/L2_space.cpp`, locate `L2_SQ8_FP16_GetDistFunc`. Insert the same block, swapping the call for `Choose_SQ8_FP16_L2_implementation_NEON_HP`.
-
-- [ ] **Step 5: User builds**
-
-Ask the user to run `make build` — first time the new NEON_HP TU additions compile. If they have ARM hardware or a cross-compile target, that build path; otherwise the x86 build must at least confirm the new headers don't accidentally break non-ARM compilation (the new headers are only `#include`d from `NEON_HP.cpp`, which is excluded on non-ARM hosts, so x86 builds should be clean).
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp
-git commit -m "Dispatch SQ8↔FP16 to NEON_HP tier on AArch64 [MOD-14972]"
-```
-
----
-
-## Task 6: Extend `SQ8_FP16_SpacesOptimizationTest` with NEON_HP tier-walk
-
-**Files:**
-- Modify: `tests/unit/test_spaces.cpp` — three test bodies (`SQ8_FP16_L2SqrTest`, `SQ8_FP16_InnerProductTest`, `SQ8_FP16_CosineTest`)
-
-After the existing `#ifdef OPT_SSE4` block in each test, append:
-
-- [ ] **Step 1: Add NEON_HP tier to L2 test**
-
-In `SQ8_FP16_L2SqrTest`, immediately after the closing `#endif` that follows the SSE4 block and before `// Scalar fallback`:
-
-```cpp
-#ifdef OPT_NEON_HP
-    if (optimization.asimdhp) {
-        unsigned char alignment = 0;
-        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim))
-            << "Unexpected distance function chosen for dim " << dim;
-        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
-            << "NEON_HP with dim " << dim;
-        optimization.asimdhp = 0;
-    }
-#endif
-```
-
-- [ ] **Step 2: Add NEON_HP tier to IP test**
-
-In `SQ8_FP16_InnerProductTest`, append the same block but swap `L2_SQ8_FP16_GetDistFunc` → `IP_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_L2_implementation_NEON_HP` → `Choose_SQ8_FP16_IP_implementation_NEON_HP`.
-
-- [ ] **Step 3: Add NEON_HP tier to Cosine test**
-
-In `SQ8_FP16_CosineTest`, append the same block with `Cosine_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_Cosine_implementation_NEON_HP`.
-
-- [ ] **Step 4: Confirm the include path for the NEON_HP chooser declarations**
-
-Run:
-```bash
-grep -n 'functions/NEON_HP.h' tests/unit/test_spaces.cpp
-```
-Expected: include present. If not, add `#include "VecSim/spaces/functions/NEON_HP.h"` near the other space-function includes at the top of the file.
-
-- [ ] **Step 5: User builds (ARM target)**
-
-Ask the user to run `make build` for an ARM target (hardware or cross-compile). On x86 the new test code is gated by `#ifdef OPT_NEON_HP` and stays inert.
-
-- [ ] **Step 6: Run NEON_HP tests**
-
-Once the ARM build is reported clean, run:
-```bash
-./bin/<arm-triple>/unit_tests --gtest_filter='SQ8_FP16_*Test*'
-```
-Expected: all parametrized cases PASS, including the dims-16..32 and high-dim suites.
-
-- [ ] **Step 7: Commit**
-
-```bash
-git add tests/unit/test_spaces.cpp
-git commit -m "Extend SQ8↔FP16 tier-walk tests with NEON_HP [MOD-14972]"
-```
-
----
-
-## Task 7: SVE IP kernel header
-
-**Files:**
-- Create: `src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h`
-
-- [ ] **Step 1: Author the kernel file**
-
-Modeled on `IP_SVE_SQ8_FP32.h`. The shape: an `InnerProductStep` helper that consumes `chunk = svcntw()` FP32 lanes per call (FP16 query loaded under a `b16` predicate, SQ8 storage under a `b32` predicate that drives uint8→uint32 widening), then a templated `_IMP` over `<bool partial_chunk, unsigned char additional_steps>`.
-
-```cpp
-/*
- * Copyright (c) 2006-Present, Redis Ltd.
- * All rights reserved.
- *
- * Licensed under your choice of the Redis Source Available License 2.0
- * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
- * GNU Affero General Public License v3 (AGPLv3).
- */
-#pragma once
-#include "VecSim/spaces/space_includes.h"
-#include "VecSim/types/sq8.h"
-#include "VecSim/types/float16.h"
-#include <arm_sve.h>
-#include <cassert>
-
-using sq8 = vecsim_types::sq8;
-using float16 = vecsim_types::float16;
-
-/*
- * Optimised asymmetric SQ8<->FP16 inner product using the algebraic identity:
- *
- *   IP(x, y) ~= min * y_sum + delta * sum(q_i * y_i)
- *
- * Hot loop accumulates sum(q_i * y_i) only; FP16 query lanes are widened to FP32
- * inside each step via svcvt_f32_f16_x. Metadata loads use load_unaligned<float>.
- */
-
-// Helper: one SVE-vector-width-of-FP32 step.
-//   chunk = svcntw() - number of FP32 lanes per step.
-//   pg    = svptrue_b32() - predicate for FP32 lanes.
-static inline void
-SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2, size_t &offset,
-                              svfloat32_t &sum, svbool_t pg, size_t chunk) {
-    // SQ8 -> uint32 (widen on load), then to FP32.
-    svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
-    svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
-
-    // FP16 query -> FP32. svld1_f16 uses a b16 predicate sized to `chunk` half lanes.
-    svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk));
-    svfloat16_t q_h =
-        svld1_f16(pg16, reinterpret_cast<const float16_t *>(pVect2) + offset);
-    svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h);
-
-    sum = svmla_f32_x(pg, sum, v1_f, v2_f);
-    offset += chunk;
-}
-
-// pVect1v = SQ8 storage, pVect2v = FP16 query
-template <bool partial_chunk, unsigned char additional_steps>
-float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v,
-                                        size_t dimension) {
-    assert(dimension >= 16 && "kernel precondition: dispatcher must guard dim >= 16");
-
-    const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
-    const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
-    size_t offset = 0;
-    svbool_t pg = svptrue_b32();
-    const size_t chunk = svcntw();
-
-    svfloat32_t sum0 = svdup_f32(0.0f);
-    svfloat32_t sum1 = svdup_f32(0.0f);
-    svfloat32_t sum2 = svdup_f32(0.0f);
-    svfloat32_t sum3 = svdup_f32(0.0f);
-
-    // Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero -
-    // the final reduction below walks all lanes via svptrue_b32().
-    if constexpr (partial_chunk) {
-        size_t remaining = dimension % chunk;
-        if (remaining > 0) {
-            svbool_t pg_partial =
-                svwhilelt_b32(uint32_t(0), uint32_t(remaining));
-            svbool_t pg16_partial =
-                svwhilelt_b16(uint32_t(0), uint32_t(remaining));
-            svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset);
-            svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
-            svfloat16_t q_h = svld1_f16(
-                pg16_partial, reinterpret_cast<const float16_t *>(pVect2) + offset);
-            svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, q_h);
-            sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
-            offset += remaining;
-        }
-    }
-
-    // Main loop: 4 chunks per iteration via 4 accumulators.
-    const size_t chunk_size = 4 * chunk;
-    const size_t number_of_chunks =
-        (dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
-    for (size_t i = 0; i < number_of_chunks; i++) {
-        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
-        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
-        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
-        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk);
-    }
-
-    // Additional steps 0..3.
-    if constexpr (additional_steps > 0)
-        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
-    if constexpr (additional_steps > 1)
-        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
-    if constexpr (additional_steps > 2)
-        SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
-
-    svfloat32_t sum = svadd_f32_x(pg, sum0, sum1);
-    sum = svadd_f32_x(pg, sum, sum2);
-    sum = svadd_f32_x(pg, sum, sum3);
-    float quantized_dot = svaddv_f32(pg, sum);
-
-    // Metadata loads - unaligned because odd dim leaves trailers unaligned.
-    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
-    const float min_val =
-        load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
-    const float delta =
-        load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
-    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
-        static_cast<const float16 *>(pVect2v) + dimension);
-    const float y_sum =
-        load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));
-
-    return min_val * y_sum + delta * quantized_dot;
-}
-
-template <bool partial_chunk, unsigned char additional_steps>
-float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v,
-                                    size_t dimension) {
-    return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
-                      pVect1v, pVect2v, dimension);
-}
-
-template <bool partial_chunk, unsigned char additional_steps>
-float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    return SQ8_FP16_InnerProductSIMD_SVE<partial_chunk, additional_steps>(
-        pVect1v, pVect2v, dimension);
-}
-```
-
-**Note for the implementer:** `svcvt_f32_f16_x(pg, q_h)` widens *the lower half of `q_h`'s lanes* to FP32 (one widening, b32-predicated). If the ACLE on the target toolchain rejects this pairing (e.g. ARM RVCT vs LLVM disagreement), verify the FP16->FP32 widening sequence against the actual ARM build output and adjust as needed (potential alternatives: explicit `svunpklo_*` unpack-then-widen, or operating on the lower half lanes by reinterpretation). Commit only after the build is clean. Do not blindly copy `IP_SVE_FP16.h`'s pattern - that file accumulates in FP16 and is not a direct widening reference.
-
-- [ ] **Step 2: Commit**
-
-```bash
-git add src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
-git commit -m "Add SVE SQ8↔FP16 IP kernel header [MOD-14972]"
-```
-
----
-
-## Task 8: SVE L2 kernel header
-
-**Files:**
-- Create: `src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h`
-
-- [ ] **Step 1: Author the kernel file**
-
-```cpp
-/*
- * Copyright (c) 2006-Present, Redis Ltd.
- * All rights reserved.
- *
- * Licensed under your choice of the Redis Source Available License 2.0
- * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
- * GNU Affero General Public License v3 (AGPLv3).
- */
-#pragma once
-#include "VecSim/spaces/space_includes.h"
-#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
-#include "VecSim/types/sq8.h"
-#include "VecSim/types/float16.h"
-
-using sq8 = vecsim_types::sq8;
-using float16 = vecsim_types::float16;
-
-/*
- * SVE SQ8<->FP16 L2 squared distance:
- *   ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares
- * IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32.
- */
-
-template <bool partial_chunk, unsigned char additional_steps>
-float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
-    const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
-        pVect1v, pVect2v, dimension);
-
-    const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
-    const float x_sum_sq =
-        load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
-    const uint8_t *query_meta_bytes = reinterpret_cast<const uint8_t *>(
-        static_cast<const float16 *>(pVect2v) + dimension);
-    const float y_sum_sq =
-        load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));
-
-    return x_sum_sq + y_sum_sq - 2.0f * ip;
-}
-```
-
-- [ ] **Step 2: Commit**
-
-```bash
-git add src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
-git commit -m "Add SVE SQ8↔FP16 L2 kernel header [MOD-14972]"
-```
-
----
-
-## Task 9: SVE + SVE2 dispatcher TU additions
-
-**Files:**
-- Modify: `src/VecSim/spaces/functions/SVE.h` — +3 declarations
-- Modify: `src/VecSim/spaces/functions/SVE.cpp` — +#includes; +3 chooser definitions
-- Modify: `src/VecSim/spaces/functions/SVE2.h` — +3 declarations
-- Modify: `src/VecSim/spaces/functions/SVE2.cpp` — +#includes; +3 chooser definitions (own symbols, template instantiated under SVE2 flags)
-
-- [ ] **Step 1: Declarations in SVE.h**
-
-Inside `namespace spaces { ... }`, alongside the existing `Choose_SQ8_FP32_*_SVE` declarations:
-
-```cpp
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim);
-dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim);
-dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE(size_t dim);
-```
-
-- [ ] **Step 2: Definitions in SVE.cpp**
-
-Add includes alongside the existing SQ8_FP32 includes:
-
-```cpp
-#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"
-#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h"
-```
-
-Inside `namespace spaces { ... }` (between `implementation_chooser.h` and `implementation_chooser_cleanup.h`), append:
-
-```cpp
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
-    return ret_dist_func;
-}
-
-dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw);
-    return ret_dist_func;
-}
-
-dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw);
-    return ret_dist_func;
-}
-```
-
-- [ ] **Step 3: Declarations in SVE2.h**
-
-```cpp
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim);
-dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim);
-dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim);
-```
-
-- [ ] **Step 4: Definitions in SVE2.cpp**
-
-Add includes alongside the existing SQ8_FP32 includes — note the SVE header is included from SVE2 (SVE2 instantiates the template under SVE2 compile flags):
-
-```cpp
-#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h" // SVE2 implementation is identical to SVE
-```
-
-Inside `namespace spaces { ... }`, append:
-
-```cpp
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
-    return ret_dist_func;
-}
-
-dist_func_t<float> Choose_SQ8_FP16_Cosine_implementation_SVE2(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_CosineSIMD_SVE, dim, svcntw);
-    return ret_dist_func;
-}
-
-dist_func_t<float> Choose_SQ8_FP16_L2_implementation_SVE2(size_t dim) {
-    dist_func_t<float> ret_dist_func;
-    CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_FP16_L2SqrSIMD_SVE, dim, svcntw);
-    return ret_dist_func;
-}
-```
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add src/VecSim/spaces/functions/SVE.h src/VecSim/spaces/functions/SVE.cpp \
-        src/VecSim/spaces/functions/SVE2.h src/VecSim/spaces/functions/SVE2.cpp
-git commit -m "Wire SVE/SVE2 SQ8↔FP16 choosers [MOD-14972]"
-```
-
----
-
-## Task 10: SVE + SVE2 dispatcher wiring in IP_space.cpp + L2_space.cpp
-
-The NEON_HP block added in Task 5 lives inside `#ifdef CPU_FEATURES_ARCH_AARCH64`. Extend the same block in all three `_GetDistFunc` functions with SVE2 and SVE tiers — ordered SVE2 → SVE → NEON_HP, matching every other SQ8/FP32 dispatcher in the file.
-
-**Files:**
-- Modify: `src/VecSim/spaces/IP_space.cpp` (two functions)
-- Modify: `src/VecSim/spaces/L2_space.cpp` (one function)
-
-- [ ] **Step 1: Confirm the SVE/SVE2 dispatcher includes are present**
-
-Run:
-```bash
-grep -n 'functions/SVE\.h\|functions/SVE2\.h' src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp
-```
-Expected: both files already include both headers. If not, add them.
-
-- [ ] **Step 2: Extend IP_SQ8_FP16_GetDistFunc**
-
-Inside the AArch64 block of `IP_SQ8_FP16_GetDistFunc`, after the `if (dim < 16) return ret_dist_func;` guard and **before** the existing `#ifdef OPT_NEON_HP`, prepend:
-
-```cpp
-#ifdef OPT_SVE2
-    if (features.sve2) {
-        return Choose_SQ8_FP16_IP_implementation_SVE2(dim);
-    }
-#endif
-#ifdef OPT_SVE
-    if (features.sve) {
-        return Choose_SQ8_FP16_IP_implementation_SVE(dim);
-    }
-#endif
-```
-
-(SVE/SVE2 paths don't compute alignment hints — the SVE vector width is runtime-variable, so the SQ8_FP32 sister doesn't set `*alignment` here either. Mirror that.)
-
-- [ ] **Step 3: Extend Cosine_SQ8_FP16_GetDistFunc**
-
-Same as Step 2, with `Cosine` in the chooser names.
-
-- [ ] **Step 4: Extend L2_SQ8_FP16_GetDistFunc**
-
-Same as Step 2, with `L2` in the chooser names.
-
-- [ ] **Step 5: User builds (ARM target)**
-
-Ask user to run `make build` for an ARM target.
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add src/VecSim/spaces/IP_space.cpp src/VecSim/spaces/L2_space.cpp
-git commit -m "Dispatch SQ8↔FP16 to SVE/SVE2 tiers on AArch64 [MOD-14972]"
-```
-
----
-
-## Task 11: Extend `SQ8_FP16_SpacesOptimizationTest` with SVE2 + SVE tier-walks
-
-**Files:**
-- Modify: `tests/unit/test_spaces.cpp` — the same three test bodies extended in Task 6
-
-For each test (L2, IP, Cosine), inside the existing `#ifdef CPU_FEATURES_ARCH_AARCH64` region (which currently holds only NEON_HP from Task 6), **prepend** SVE2 and SVE blocks so the dispatch-precedence order is SVE2 → SVE → NEON_HP. If the existing NEON_HP block is not yet inside an AArch64 outer ifdef, wrap all three together.
-
-- [ ] **Step 1: Wrap and extend the L2 test**
-
-Replace the NEON_HP-only AArch64 block in `SQ8_FP16_L2SqrTest` with:
-
-```cpp
-#ifdef CPU_FEATURES_ARCH_AARCH64
-#ifdef OPT_SVE2
-    if (optimization.sve2) {
-        unsigned char alignment = 0;
-        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE2(dim))
-            << "Unexpected distance function chosen for dim " << dim;
-        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
-            << "SVE2 with dim " << dim;
-        optimization.sve2 = 0;
-    }
-#endif
-#ifdef OPT_SVE
-    if (optimization.sve) {
-        unsigned char alignment = 0;
-        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_SVE(dim))
-            << "Unexpected distance function chosen for dim " << dim;
-        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
-            << "SVE with dim " << dim;
-        optimization.sve = 0;
-    }
-#endif
-#ifdef OPT_NEON_HP
-    if (optimization.asimdhp) {
-        unsigned char alignment = 0;
-        arch_opt_func = L2_SQ8_FP16_GetDistFunc(dim, &alignment, &optimization);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_L2_implementation_NEON_HP(dim))
-            << "Unexpected distance function chosen for dim " << dim;
-        ASSERT_NEAR(baseline, arch_opt_func(v2_compressed.data(), v1_query.data(), dim), 0.01)
-            << "NEON_HP with dim " << dim;
-        optimization.asimdhp = 0;
-    }
-#endif
-#endif // CPU_FEATURES_ARCH_AARCH64
-```
-
-- [ ] **Step 2: Same for IP test**
-
-Replicate the block in `SQ8_FP16_InnerProductTest` with `IP_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_IP_implementation_<TIER>`.
-
-- [ ] **Step 3: Same for Cosine test**
-
-Replicate with `Cosine_SQ8_FP16_GetDistFunc` and `Choose_SQ8_FP16_Cosine_implementation_<TIER>`.
-
-- [ ] **Step 4: User builds**
-
-ARM target build.
-
-- [ ] **Step 5: Run the optimization tests**
-
-```bash
-./bin/<arm-triple>/unit_tests --gtest_filter='SQ8_FP16_SpacesOptimizationTest.*'
-```
-Expected: all parametrized cases PASS — dims 16..32 + high-dim suite (64..1024) — exercising whichever ARM tiers the host advertises.
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add tests/unit/test_spaces.cpp
-git commit -m "Extend SQ8↔FP16 tier-walk tests with SVE/SVE2 [MOD-14972]"
-```
-
----
-
-## Task 12: Extend `SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` with ARM rows
-
-**Files:**
-- Modify: `tests/unit/test_spaces.cpp` — `TEST(SQ8_FP16_SIMD_TierCoverage, ReportTiersExercised)`
-
-The existing test body has an outer `#ifdef CPU_FEATURES_ARCH_X86_64` block that loops over each x86 tier and logs presence to stderr. Add a sibling `#ifdef CPU_FEATURES_ARCH_AARCH64` block with the same shape.
-
-- [ ] **Step 1: Append the AArch64 reporting block**
-
-Locate the trailing `#endif // CPU_FEATURES_ARCH_X86_64` and immediately after, insert:
-
-```cpp
-#ifdef CPU_FEATURES_ARCH_AARCH64
-#ifdef OPT_SVE2
-    if (opt.sve2) {
-        std::cerr << "[SQ8_FP16] SVE2 tier exercised\n";
-        any_simd = true;
-    } else {
-        std::cerr << "[SQ8_FP16] SVE2 tier NOT exercised on this host\n";
-    }
-#endif
-#ifdef OPT_SVE
-    if (opt.sve) {
-        std::cerr << "[SQ8_FP16] SVE tier exercised\n";
-        any_simd = true;
-    } else {
-        std::cerr << "[SQ8_FP16] SVE tier NOT exercised on this host\n";
-    }
-#endif
-#ifdef OPT_NEON_HP
-    if (opt.asimdhp) {
-        std::cerr << "[SQ8_FP16] NEON_HP tier exercised\n";
-        any_simd = true;
-    } else {
-        std::cerr << "[SQ8_FP16] NEON_HP tier NOT exercised on this host\n";
-    }
-#endif
-#endif // CPU_FEATURES_ARCH_AARCH64
-```
-
-(The trailing `if (!any_simd) { GTEST_SKIP() << ...; }` already at the bottom of the existing test handles the all-quiet case across both archs.)
-
-- [ ] **Step 2: Build + run on an ARM host**
-
-Ask the user to build for ARM, then run:
-```bash
-./bin/<arm-triple>/unit_tests --gtest_filter='SQ8_FP16_SIMD_TierCoverage.*'
-```
-Expected: stderr shows at least one ARM tier marked "exercised", test PASS.
-
-- [ ] **Step 3: Commit**
-
-```bash
-git add tests/unit/test_spaces.cpp
-git commit -m "Report ARM tiers in SQ8↔FP16 tier-coverage test [MOD-14972]"
-```
-
----
-
-## Task 13: Microbench AArch64 block
-
-**Files:**
-- Modify: `tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp`
-
-The existing file already opens `#ifdef CPU_FEATURES_ARCH_X86_64` and pulls `cpu_features::X86Features opt = cpu_features::GetX86Info().features;`. Add the parallel AArch64 block at the end of that `#endif // CPU_FEATURES_ARCH_X86_64`.
-
-- [ ] **Step 1: Append the AArch64 bench block**
-
-After the closing `#endif // CPU_FEATURES_ARCH_X86_64` (or after the last x86 `INITIALIZE_BENCHMARKS_SET_*` macro if no such comment exists), insert:
-
-```cpp
-#ifdef CPU_FEATURES_ARCH_AARCH64
-cpu_features::Aarch64Features arm_opt = cpu_features::GetAarch64Info().features;
-
-#ifdef OPT_SVE2
-bool sve2_supported = arm_opt.sve2;
-INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
-INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
-#endif
-
-#ifdef OPT_SVE
-bool sve_supported = arm_opt.sve;
-INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
-INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
-#endif
-
-#ifdef OPT_NEON_HP
-bool neon_hp_supported = arm_opt.asimdhp;
-INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
-INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16,
-                                  neon_hp_supported);
-#endif
-#endif // CPU_FEATURES_ARCH_AARCH64
-```
-
-Verify the exact `cpu_features` helper name during build. If the toolchain uses `Aarch64Info` vs `Aarch64Features` vs `ArmFeatures`, adjust to match the sister x86 block.
-
-- [ ] **Step 2: Update the file-header comment**
-
-The current file-header comment (around the top) ends with `ARM kernels land via MOD-14972.` — change that line to `ARM kernels (NEON_HP / SVE / SVE2) are registered below.` so the doc stays accurate.
-
-- [ ] **Step 3: User builds (ARM target)**
-
-- [ ] **Step 4: Run the bench on ARM**
-
-```bash
-./bin/<arm-triple>/bm_spaces_sq8_fp16 --benchmark_filter='SQ8_FP16_.*(SVE2|SVE|NEON_HP)'
-```
-Expected: per-ISA throughput rows for L2, IP, Cosine. If no rows match, list all benchmarks first with `--benchmark_list_tests` to see the exact generated names, then adjust the regex.
-
-- [ ] **Step 5: Side-by-side compare against SQ8_FP32**
-
-```bash
-./bin/<arm-triple>/bm_spaces_sq8_fp32 --benchmark_filter='SQ8_FP32_.*(SVE2|SVE|NEON)'
-```
-Compare matched-ISA rows manually. Acceptance per Jira: per-ISA throughput data captured.
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp
-git commit -m "Register ARM SQ8↔FP16 microbenchmarks [MOD-14972]"
-```
-
----
-
-## Task 14: ASan + final pre-PR verification
-
-- [ ] **Step 1: Full unit-test pass on ARM host (no filter)**
-
-```bash
-./bin/<arm-triple>/unit_tests
-```
-Expected: all tests PASS.
-
-- [ ] **Step 2: ASan build + run**
-
-Ask user to run `make build SAN=address` (or the repo's equivalent — verify against `Makefile`). After confirmed:
-
-```bash
-./bin/<arm-triple>-asan/unit_tests --gtest_filter='SQ8_FP16_*'
-```
-Expected: zero ASan reports; all SQ8_FP16 tests PASS.
-
-- [ ] **Step 3: x86 sanity build**
-
-User runs `make build` on x86 (no ARM target). Confirms the new test extensions and dispatcher AArch64 ifdefs stay inert on x86 and the build is clean.
-
-- [ ] **Step 4: Push branch (ASK USER FIRST)**
-
-Pushes are user-gated. Confirm with the user before running:
-
-```bash
-git push -u origin dor-forer-sq8-fp16-arm-kernels-mod-14972
-```
-
-- [ ] **Step 5: Open PR against PR #970 (ASK USER FIRST)**
-
-PR creation is user-gated. Confirm with the user before running:
-
-```bash
-gh pr create \
-  --base dor-forer-sq8-fp16-x86-kernels-mod-14954 \
-  --title 'Add SQ8↔FP16 ARM SIMD distance kernels [MOD-14972]' \
-  --body "$(cat <<'EOF'
-## Summary
-
-- Add asymmetric SQ8↔FP16 distance kernels (IP, L2, Cosine) for ARM NEON_HP, SVE, SVE2 tiers
-- Wire kernels into the existing dispatcher (`IP_space.cpp`, `L2_space.cpp`)
-- Extend `SQ8_FP16_SpacesOptimizationTest` and `SQ8_FP16_SIMD_TierCoverage` with ARM tiers
-- Register per-ISA microbenchmarks for cross-arch throughput comparison
-
-Stacked on PR #970 (MOD-14954 x86 kernels); retarget to `main` once #970 merges.
-
-Spec: `docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md`
-
-## Test plan
-
-- [ ] Unit tests on ARM host pass — `SQ8_FP16_SpacesOptimizationTest` (dims 16..32 + 64..1024), `SQ8_FP16_SIMD_TierCoverage`, `GetDistFuncSQ8FP16Asymmetric`
-- [ ] ASan build on ARM host clean across SQ8_FP16 tests
-- [ ] x86 build remains clean (new AArch64 dispatcher block + tests stay inert)
-- [ ] Microbench output captured for SVE2 / SVE / NEON_HP, compared against matched SQ8_FP32 ARM rows
-EOF
-)"
-```
-
-- [ ] **Step 6: Retarget once #970 merges (ASK USER FIRST)**
-
-When PR #970 lands on `main`, change this PR's base to `main`:
-
-```bash
-gh pr edit <PR-number> --base main
-```
-
----
-
-## Self-review checklist
-
-- [x] **Spec coverage:** every requirement in `2026-05-28-arm-sq8-fp16-design.md` is covered:
-  - Kernel headers (4 new): Tasks 2, 3, 7, 8
-  - Wrapper symbols: Tasks 4 (NEON_HP), 9 (SVE/SVE2)
-  - Dispatcher wiring: Tasks 5 (NEON_HP), 10 (SVE/SVE2)
-  - Tier-walk tests: Tasks 6 (NEON_HP), 11 (SVE/SVE2)
-  - TierCoverage report: Task 12
-  - Scalar-fallback edge tests (dim=0, dim=15): Task 1
-  - Microbench: Task 13
-  - ASan + verification: Task 14
-- [x] **No CMake changes** — confirmed in file structure table.
-- [x] **Zero placeholders** — every code block is concrete; ambiguous spots (SVE FP16 widening ACLE) are called out with the fallback strategy spelled in-task.
-- [x] **Type/symbol consistency:**
-  - NEON kernel template names: `SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP` / `…NEON_HP` / `SQ8_FP16_L2SqrSIMD16_NEON_HP` / `SQ8_FP16_CosineSIMD16_NEON_HP` — match across kernel header, NEON_HP chooser, dispatcher call, and test.
-  - SVE kernel template names: `SQ8_FP16_InnerProductSIMD_SVE_IMP` / `…SVE` / `SQ8_FP16_L2SqrSIMD_SVE` / `SQ8_FP16_CosineSIMD_SVE` — match across kernel header, SVE chooser, SVE2 chooser, dispatcher call, and test.
-  - Chooser symbol names: `Choose_SQ8_FP16_{IP,L2,Cosine}_implementation_{NEON_HP,SVE,SVE2}` — match across `.h` declarations, `.cpp` definitions, dispatcher calls, tests, and bench.
-  - Test fixture: `SQ8_FP16_SpacesOptimizationTest` already exists on base (PR #970); we extend the three test methods inside it, no rename.
-
----
-
-## Execution Handoff
-
-Plan complete and saved to `docs/superpowers/plans/2026-05-28-arm-sq8-fp16-kernels.md`. Two execution options:
-
-**1. Subagent-Driven (recommended)** — I dispatch a fresh subagent per task, review between tasks, fast iteration.
-
-**2. Inline Execution** — Execute tasks in this session using executing-plans, batch execution with checkpoints.
-
-Which approach?
diff --git a/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md b/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md
deleted file mode 100644
index f4188d38b..000000000
--- a/docs/superpowers/specs/2026-05-28-arm-sq8-fp16-design.md
+++ /dev/null
@@ -1,354 +0,0 @@
-# SQ8↔FP16 ARM SIMD Distance Kernels — Design Spec
-
-- **Ticket**: [MOD-14972](https://redislabs.atlassian.net/browse/MOD-14972)
-- **Branch**: `dor-forer-sq8-fp16-arm-kernels-mod-14972`
-- **Base**: `dor-forer-sq8-fp16-x86-kernels-mod-14954` (PR #970) — stacked
-- **Sibling**: MOD-14954 / PR #970 delivers x86 SIMD kernels (AVX-512, AVX2, SSE4) for the same operation
-
-## Goal
-
-Add SQ8↔FP16 SIMD distance kernels for IP and L2 on the ARM ISA tiers (NEON_HP, SVE, SVE2). FP16 is the query data type; SQ8 is the stored vector representation. Match the contract and structure of the x86 kernels delivered in PR #970 so dispatch tables, metadata layout, and acceptance criteria stay symmetric across architectures.
-
-The scalar fallback (`SQ8_FP16_InnerProduct`, `SQ8_FP16_L2Sqr`, `SQ8_FP16_Cosine` in `src/VecSim/spaces/IP/IP.cpp` and `src/VecSim/spaces/L2/L2.cpp`) already exists on `main`. This spec does not modify it; it serves as the reference implementation for all platforms.
-
-## Algebraic identity (shared with x86 PR + SQ8_FP32 sister)
-
-```
-IP(x, y) ≈ min · y_sum + delta · Σ(q_i · y_i)
-L2(x, y) = x_sum_sq + y_sum_sq - 2 · IP(x, y)
-```
-
-Hot loop accumulates `Σ(q_i · y_i)` only. No per-element dequantization. FP16 query lanes are widened to FP32 per SIMD chunk; everything in the hot loop is FP32.
-
-## Metadata layout
-
-```
-SQ8 storage (pVect1): [uint8 × dim] [min_val] [delta] [x_sum] [x_sum_squares]
-FP16 query   (pVect2): [float16 × dim] [y_sum] [y_sum_squares]
-```
-
-Both metadata trailers are FP32 scalars. Storage metadata is not 4-byte aligned whenever `dim % 4 != 0`; query metadata is not 4-byte aligned whenever `dim` is odd. The blanket rule: every FP32 metadata read uses the global `load_unaligned<float>` helper, matching scalar `_Impl` in `IP.cpp` / `L2.cpp`. `sq8` namespace constants: `MIN_VAL`, `DELTA`, `SUM_QUERY`, `SUM_SQUARES`, `SUM_SQUARES_QUERY`.
-
-## File layout
-
-```
-src/VecSim/spaces/IP/
-  IP_NEON_SQ8_FP16.h     (new)
-  IP_SVE_SQ8_FP16.h      (new) — also #included from SVE2.cpp
-src/VecSim/spaces/L2/
-  L2_NEON_SQ8_FP16.h     (new)
-  L2_SVE_SQ8_FP16.h      (new) — also #included from SVE2.cpp
-src/VecSim/spaces/functions/
-  NEON_HP.cpp            (+ Choose_SQ8_FP16_{IP,L2,Cosine}_implementation_NEON_HP)
-  NEON_HP.h              (+ 3 declarations)
-  SVE.cpp                (+ Choose_SQ8_FP16_*_implementation_SVE)
-  SVE.h                  (+ 3 declarations)
-  SVE2.cpp               (+ Choose_SQ8_FP16_*_implementation_SVE2; owns its own chooser symbols; instantiates SVE kernel templates under SVE2 compile flags)
-  SVE2.h                 (+ 3 declarations)
-src/VecSim/spaces/
-  IP_space.cpp           (2 dispatcher block edits: IP, Cosine)
-  L2_space.cpp           (1 dispatcher block edit)
-```
-
-**Zero CMake changes.** Existing TU flags carry exactly what we need:
-
-| TU | Flags |
-|----|-------|
-| `NEON_HP.cpp` | `-march=armv8.2-a+fp16fml` (covers fp16 cvt + fma) |
-| `SVE.cpp` | `-march=armv8-a+sve` (SVE includes f16↔f32 cvt) |
-| `SVE2.cpp` | `-march=armv9-a+sve2` |
-
-## Dispatcher tier order
-
-Same precedence as existing SQ8_FP32 ARM dispatch:
-
-```cpp
-#ifdef OPT_SVE2
-    if (features.sve2 && dim >= 16) {
-        return Choose_SQ8_FP16_IP_implementation_SVE2(dim);
-    }
-#endif
-#ifdef OPT_SVE
-    if (features.sve && dim >= 16) {
-        return Choose_SQ8_FP16_IP_implementation_SVE(dim);
-    }
-#endif
-#ifdef OPT_NEON_HP
-    if (features.asimdhp && dim >= 16) {
-        return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim);
-    }
-#endif
-// dim < 16 or no ARM SIMD → scalar fallback (existing return at function tail)
-```
-
-The `dim >= 16` guard in the dispatcher is what lets each SIMD kernel hold an internal `assert(dim >= 16)` as a real precondition. Edge cases for `dim < 16` are routed to scalar.
-
-## NEON kernel design
-
-### Header: `IP_NEON_SQ8_FP16.h`
-
-Template signature mirrors SQ8_FP32 NEON sister:
-
-```cpp
-template <unsigned char residual>   // 0..15
-float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v, size_t dimension);
-```
-
-Hot loop — 16 lanes per iteration, 4 FP32 accumulators:
-
-```cpp
-// SQ8 load: 16 × uint8 → 4 × float32x4_t
-uint8x16_t v1_u8 = vld1q_u8(pVect1);
-uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8));
-uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8));
-float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo)));
-float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo)));
-float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi)));
-float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi)));
-
-// FP16 query load: 16 × f16 → 4 × float32x4_t via vcvt_f32_f16
-float16x8_t q_lo = vld1q_f16(pVect2);
-float16x8_t q_hi = vld1q_f16(pVect2 + 8);
-float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo));
-float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo));
-float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi));
-float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi));
-
-// 4-accumulator FMA
-sum0 = vfmaq_f32(sum0, v1_0, v2_0);
-sum1 = vfmaq_f32(sum1, v1_1, v2_1);
-sum2 = vfmaq_f32(sum2, v1_2, v2_2);
-sum3 = vfmaq_f32(sum3, v1_3, v2_3);
-```
-
-Residual ladder (`dim % 16`, residual 0..15):
-
-- **`residual >= 8`**: one 8-lane safe load each side — `vld1_u8` (8 bytes) for SQ8 and `vld1q_f16` (8 × FP16 = 16 bytes, fits before query metadata) for FP16. Convert + FMA. Remaining `residual - 8` lanes handled scalar.
-- **`residual < 8`**: full scalar residual loop using `vecsim_types::FP16_to_FP32`.
-
-Rationale: a 16-byte SQ8 load (`vld1q_u8`) or a 16-byte FP16 load (`vld1q_f16` past the 8-lane boundary) on a residual < 8 would overread past valid query data into metadata — `y_sum` is only 4 bytes for IP and `y_sum_sq` adds 4 more for L2, not enough headroom for an 8-lane FP16 load.
-
-Final reduction: `vaddvq_f32(sum0 + sum1 + sum2 + sum3)`, then return `min_val * y_sum + delta * quantized_dot`.
-
-`assert(dim >= 16)` at the top.
-
-### Header: `L2_NEON_SQ8_FP16.h`
-
-Calls `SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(...)` to compute raw IP, then returns `x_sum_sq + y_sum_sq - 2.0f * ip`. Mirrors `L2_NEON_SQ8_FP32.h` exactly.
-
-### Wrapper symbols (NEON_HP.cpp)
-
-```cpp
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_NEON_HP(size_t dim) {
-    dist_func_t<float> ret;
-    CHOOSE_IMPLEMENTATION(ret, dim, 16, SQ8_FP16_InnerProductSIMD16_NEON_HP);
-    return ret;
-}
-// L2 + Cosine identical shape (Cosine reuses IP wrapper per repo convention)
-```
-
-## SVE kernel design
-
-### Header: `IP_SVE_SQ8_FP16.h`
-
-Template signature mirrors SVE SQ8_FP32 sister:
-
-```cpp
-template <bool partial_chunk, unsigned char additional_steps>
-float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v, size_t dimension);
-```
-
-Inner step (one SVE vector width `svcntw()` lanes of FP32):
-
-```cpp
-svbool_t pg = svptrue_b32();
-// SQ8: zero-extend uint8 → uint32 (predicated b32 load)
-svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
-svfloat32_t v1_f  = svcvt_f32_u32_x(pg, v1_u32);
-// FP16: load chunk fp16 lanes, widen to fp32
-svbool_t pg16 = svwhilelt_b16(uint32_t(0), uint32_t(chunk));
-svfloat16_t q_h = svld1_f16(pg16, pVect2 + offset);
-svfloat32_t v2_f = svcvt_f32_f16_x(pg, q_h);   // verify exact ACLE/packing during impl
-sum = svmla_f32_x(pg, sum, v1_f, v2_f);
-offset += chunk;
-```
-
-**ACLE caveat**: exact f16→f32 widening intrinsic and lane packing — confirm `svcvt_f32_f16_x(pg, q_h)` compiles cleanly against the loaded `svfloat16_t`. If lane packing needs an unpack/interleave step, verify against `IP_SVE_FP16.h`.
-
-4 accumulators `sum0..sum3`; main loop processes 4 chunks via 4 `InnerProductStep` calls. `partial_chunk` template branch handles `dim % chunk` via `svwhilelt_b32`.
-
-Inactive-lane discipline on the partial path: the predicated `svld1_f16` / `svld1ub_u32` cover lane *liveness*, but the final reduction with `svaddv_f32(svptrue_b32(), ...)` walks *all* lanes. To keep inactive lanes from contributing garbage, the partial step uses the zeroing form `svmla_f32_z(pg_partial, sum0, v1_f, v2_f)` (matches `IP_SVE_SQ8_FP32.h` partial-chunk pattern). Alternative: reduce only active lanes via `svaddv_f32(pg_partial, sum0)` for the partial-step accumulator, then sum into the main reduction. The `_z` form is the simpler choice and is what the SQ8_FP32 SVE sister already does.
-
-Predicate widths on the partial path: FP32 math (load/widen/mla) uses a `b32` predicate sized to `remaining` 32-bit lanes (`svwhilelt_b32(0, remaining)`); the FP16 query load needs its own `b16` predicate sized to the same `remaining` half lanes (`svwhilelt_b16(0, remaining)`) since `svld1_f16` is governed by a 16-bit predicate. SQ8 load via `svld1ub_u32` is governed by the `b32` predicate (it widens uint8 → uint32 lanewise).
-
-Final reduction: `svaddv_f32(svptrue_b32(), sum0 + sum1 + sum2 + sum3)`.
-
-### Header: `L2_SVE_SQ8_FP16.h`
-
-Calls `SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(...)` then returns `x_sum_sq + y_sum_sq - 2.0f * ip`. Mirrors `L2_SVE_SQ8_FP32.h`.
-
-### Wrapper symbols
-
-`SVE.cpp`:
-
-```cpp
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE(size_t dim) {
-    dist_func_t<float> ret;
-    CHOOSE_SVE_IMPLEMENTATION(ret, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
-    return ret;
-}
-// L2 + Cosine identical shape
-```
-
-`SVE2.cpp`:
-
-```cpp
-#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"  // SVE2 implementation is identical to SVE
-#include "VecSim/spaces/L2/L2_SVE_SQ8_FP16.h"
-
-dist_func_t<float> Choose_SQ8_FP16_IP_implementation_SVE2(size_t dim) {
-    dist_func_t<float> ret;
-    CHOOSE_SVE_IMPLEMENTATION(ret, SQ8_FP16_InnerProductSIMD_SVE, dim, svcntw);
-    return ret;
-}
-// L2 + Cosine identical shape
-```
-
-SVE2 owns its own chooser symbols (does **not** call the SVE chooser); template instantiated under SVE2 compile flags.
-
-## Tests
-
-### Class
-
-Branch base is PR #970. During implementation, verify whether the base branch already exposes `SQ8_FP16_SpacesOptimizationTest` (extend) or only `SQ8_FP16_NoOptimizationSpacesTest` (add the optimization class here mirroring `SQ8_FP32_SpacesOptimizationTest`).
-
-### Tier-walk pattern
-
-Per-tier `if (features.<flag>)` block; **unset higher flag** after each block so the next tier is exercised on hosts that support multiple ISAs. Do not use `GTEST_SKIP()` here — it would abort the entire walk.
-
-```cpp
-auto expected = SQ8_FP16_InnerProduct;  // scalar reference
-
-#ifdef OPT_SVE2
-    if (features.sve2) {
-        arch_opt_func = IP_SQ8_FP16_GetDistFunc(dim, &alignment, &features);
-        ASSERT_EQ(arch_opt_func, Choose_SQ8_FP16_IP_implementation_SVE2(dim))
-            << "SVE2 dispatch mismatch";
-        ASSERT_NEAR(arch_opt_func(v1, v2, dim), expected(v1, v2, dim), 0.01);
-        features.sve2 = 0;   // exercise next tier
-    }
-#endif
-#ifdef OPT_SVE
-    if (features.sve) { /* same shape */ features.sve = 0; }
-#endif
-#ifdef OPT_NEON_HP
-    if (features.asimdhp) { /* same shape */ features.asimdhp = 0; }
-#endif
-// final fallback assertion: IP_SQ8_FP16_GetDistFunc(...) == SQ8_FP16_InnerProduct (scalar)
-```
-
-Three dispatch entry points exercised per tier: `IP_SQ8_FP16_GetDistFunc`, `L2_SQ8_FP16_GetDistFunc`, `Cosine_SQ8_FP16_GetDistFunc`.
-
-### Scalar-fallback tests
-
-`GetDistFuncSQ8FP16Asymmetric` — currently asserts `dim=128` returns scalar; that assertion breaks once SIMD dispatch lands. Change to `dim=15` (below the `dim >= 16` SIMD threshold). Add a small `dim=0` (empty) scalar-fallback assertion to cover the Jira "empty" edge case.
-
-### Dim parameterization
-
-Base branch already has both parameterized suites against `SQ8_FP16_SpacesOptimizationTest`:
-- `SQ8_FP16_SIMD` — `testing::Range(16UL, 33UL)` (dims 16..32; residual + threshold boundaries)
-- `SQ8_FP16_SIMD_HighDim` — `64, 128, 256, 512, 1024` (multi-iteration main loop)
-
-Both suites pick up the ARM tier-walk additions automatically since the test class body is what's extended. No new instantiation needed.
-
-### Tier coverage report
-
-`SQ8_FP16_SIMD_TierCoverage.ReportTiersExercised` (test_spaces.cpp) currently reports only x86 tiers. Extend it with ARM tier entries (SVE2 / SVE / NEON_HP) so an ARM-only SIMD host reports its exercised tiers instead of going silent.
-
-## Microbench
-
-`tests/benchmark/spaces_benchmarks/bm_spaces_sq8_fp16.cpp` already registers x86 ISA benchmarks. Add ARM registrations under `#ifdef OPT_*` guards using the existing `bm_spaces.h` macros:
-
-```cpp
-#ifdef CPU_FEATURES_ARCH_AARCH64
-    cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features;
-    bool sve2_supported    = opt.sve2;
-    bool sve_supported     = opt.sve;
-    bool neon_hp_supported = opt.asimdhp;
-#ifdef OPT_SVE2
-    INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
-    INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE2, 16, sve2_supported);
-#endif
-#ifdef OPT_SVE
-    INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
-    INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, SVE, 16, sve_supported);
-#endif
-#ifdef OPT_NEON_HP
-    INITIALIZE_BENCHMARKS_SET_L2_IP(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
-    INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_FP16, SQ8_FP16, NEON_HP, 16, neon_hp_supported);
-#endif
-#endif // CPU_FEATURES_ARCH_AARCH64
-```
-
-Verify exact `cpu_features` helper names against the x86 sister block already in `bm_spaces_sq8_fp16.cpp` (e.g. `GetX86Info`).
-
-`bm_spaces_sq8_fp16` and `bm_spaces_sq8_fp32` are separate executables; the per-ISA throughput comparison requested by Jira is done by running both benches and comparing matched ISA rows.
-
-## Acceptance criteria (Jira MOD-14972 → spec mapping)
-
-| Jira requirement | Where this spec delivers it |
-|------------------|------------------------------|
-| Kernels: IP + L2 for NEON | NEON_HP TU hosts kernel headers + chooser symbols |
-| Kernels: IP + L2 for SVE | SVE TU hosts kernel headers + chooser symbols |
-| Kernels: IP + L2 for SVE2 | SVE2 TU includes SVE headers, instantiates templates under SVE2 flags |
-| Scalar fallback (reference for all platforms) | Already present in `IP.cpp` / `L2.cpp`; unchanged |
-| FP16 query → FP32 per SIMD chunk | `vcvt_f32_f16` (NEON), `svcvt_f32_f16_x` (SVE) |
-| FP32 metadata + correction terms | `load_unaligned<float>` for all FP32 trailer scalars |
-| Wire into dispatch table per ISA flag | `IP_space.cpp` (2 blocks), `L2_space.cpp` (1 block), `OPT_SVE2/SVE/NEON_HP` |
-| Unit tests vs. scalar reference per ISA | Tier-walk in `SQ8_FP16_SpacesOptimizationTest` |
-| Edge cases (empty, dim-alignment boundaries) | `dim=0` + `dim=15` scalar tests; `dim=16..32` SIMD boundary param suite |
-| Microbench per ISA throughput vs. SQ8↔FP32 | ARM registrations in `bm_spaces_sq8_fp16.cpp`; matched-ISA comparison vs. `bm_spaces_sq8_fp32` |
-
-## Diff size estimate
-
-| Area | Files | LoC (rough) |
-|------|-------|-------------|
-| Kernel headers | 4 new | ~600 |
-| Dispatcher TU additions | NEON_HP.cpp/h, SVE.cpp/h, SVE2.cpp/h | ~80 |
-| Dispatcher wiring | IP_space.cpp, L2_space.cpp | ~45 |
-| Tests | test_spaces.cpp | ~80 |
-| Bench | bm_spaces_sq8_fp16.cpp | ~25 |
-| CMakeLists.txt | none | 0 |
-| **Total** | **~10 files** | **~830** |
-
-## PR mechanics
-
-- **Branch**: `dor-forer-sq8-fp16-arm-kernels-mod-14972`
-- **Base branch**: `dor-forer-sq8-fp16-x86-kernels-mod-14954` (PR #970)
-- **PR target**: opens against PR #970 head; retarget to `main` once #970 merges
-- **Commit prefix**: `[MOD-14972]` (matches repo convention)
-- **PR title**: `Add SQ8↔FP16 ARM SIMD distance kernels [MOD-14972]`
-
-## Verification gates before opening PR
-
-1. **x86 host build clean** — verifies generic dispatch and tests remain clean; ARM kernels require ARM build or cross-compile, so the kernels themselves are not exercised here.
-2. **ARM host build + unit tests** — NEON_HP / SVE / SVE2 paths exercised. Requires coordination with the user for ARM hardware or a cross-compile setup.
-3. **ASan clean** on every host that runs unit tests.
-4. **Microbench compiles + runs on ARM host.**
-
-## Out of scope (deferred, separate PRs)
-
-- Dispatcher-routed edge-case tests (`ZeroQueryTest`, `ConstantStorageTest`, `MixedSignQueryTest`) — they currently bypass the dispatcher and call scalar directly; cross-arch debt, also PR #970 H1.
-- Multi-accumulator ILP tuning beyond the 4-accumulator baseline established here.
-- Unrelated x86 review-feedback fixes (M1–M4, H1–H2 on x86 files from PR #970 review). This ARM PR will modify some files that PR #970 also touches (dispatchers, test class, bench), but only with ARM-relevant additions — x86 review fixes land in #970.
-
-## Inheritance from PR #970 review findings
-
-The following lessons from the PR #970 review are baked into this design so they do not need to be re-flagged on ARM kernels:
-
-- `assert(dim >= 16)` at the top of every kernel template (paired with dispatcher `dim >= 16` guard).
-- 4-accumulator ILP in both NEON and SVE hot loops.
-- Algebraic-identity formula anchor comment at the top of each kernel header.
-- `load_unaligned<float>` for all FP32 metadata reads (matches scalar).
-- Dispatcher-routed tier-walk test pattern (no scalar-bypass).
-- Per-ISA microbench registration alongside SQ8↔FP32 sister for direct comparison.
diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp
index 1930e64a2..9366d3144 100644
--- a/src/VecSim/spaces/IP_space.cpp
+++ b/src/VecSim/spaces/IP_space.cpp
@@ -241,9 +241,6 @@ dist_func_t<float> IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
 #endif
 #ifdef OPT_NEON_HP
     if (features.asimdhp) {
-        // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
-        // leave *alignment untouched on ARM tiers. The corresponding tests assert
-        // 0xFF passthrough on the scalar path and do not assert any non-zero value here.
         return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim);
     }
 #endif
@@ -313,9 +310,6 @@ dist_func_t<float> Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm
 #endif
 #ifdef OPT_NEON_HP
     if (features.asimdhp) {
-        // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
-        // leave *alignment untouched on ARM tiers. The corresponding tests assert
-        // 0xFF passthrough on the scalar path and do not assert any non-zero value here.
         return Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim);
     }
 #endif
diff --git a/src/VecSim/spaces/L2_space.cpp b/src/VecSim/spaces/L2_space.cpp
index 2e18920b3..7d65814e0 100644
--- a/src/VecSim/spaces/L2_space.cpp
+++ b/src/VecSim/spaces/L2_space.cpp
@@ -172,9 +172,6 @@ dist_func_t<float> L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
 #endif
 #ifdef OPT_NEON_HP
     if (features.asimdhp) {
-        // No alignment write: the locked spec and the sister ARM SQ8_FP32 dispatchers
-        // leave *alignment untouched on ARM tiers. The corresponding tests assert
-        // 0xFF passthrough on the scalar path and do not assert any non-zero value here.
         return Choose_SQ8_FP16_L2_implementation_NEON_HP(dim);
     }
 #endif

From e1647dcc158b9cdea13b59cfb3688663d099169c Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-24-101.us-east-2.compute.internal>
Date: Sun, 31 May 2026 13:34:54 +0000
Subject: [PATCH 19/19] Apply clang-format 18.1.8 (matches CI) [MOD-14972]

---
 src/VecSim/batch_iterator.h     |  2 +-
 tests/benchmark/bm_vecsim_svs.h | 14 ++++++--------
 tests/benchmark/types_ranges.h  | 12 ++++--------
 tests/unit/test_allocator.cpp   |  4 ++--
 4 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/src/VecSim/batch_iterator.h b/src/VecSim/batch_iterator.h
index 466072f86..9e2791130 100644
--- a/src/VecSim/batch_iterator.h
+++ b/src/VecSim/batch_iterator.h
@@ -27,7 +27,7 @@ struct VecSimBatchIterator : public VecsimBaseObject {
     explicit VecSimBatchIterator(void *query_vector, void *tctx,
                                  std::shared_ptr<VecSimAllocator> allocator)
         : VecsimBaseObject(allocator), query_vector(query_vector), returned_results_count(0),
-          timeoutCtx(tctx){};
+          timeoutCtx(tctx) {};
 
     virtual inline const void *getQueryBlob() const { return query_vector; }
 
diff --git a/tests/benchmark/bm_vecsim_svs.h b/tests/benchmark/bm_vecsim_svs.h
index b92cce5e0..5acb882c0 100644
--- a/tests/benchmark/bm_vecsim_svs.h
+++ b/tests/benchmark/bm_vecsim_svs.h
@@ -466,19 +466,17 @@ void BM_VecSimSVS<index_type_t>::RunGC(benchmark::State &st) {
 #define UNIT_AND_ITERATIONS Unit(benchmark::kMillisecond)->Iterations(2)
 
 #if HAVE_SVS_LVQ
-#define QUANT_BITS_ARGS                                                                            \
-    { VecSimSvsQuant_8, VecSimSvsQuant_4x8_LeanVec }
+#define QUANT_BITS_ARGS {VecSimSvsQuant_8, VecSimSvsQuant_4x8_LeanVec}
 #define COMPRESSED_TRAINING_THRESHOLD_ARGS                                                         \
-    { static_cast<long int>(BM_VecSimGeneral::block_size), 5000, 10000 }
+    {static_cast<long int>(BM_VecSimGeneral::block_size), 5000, 10000}
 #define COMPRESSED_ASYNC_TRAINING_THRESHOLD_ARGS                                                   \
-    { static_cast<long int>(BM_VecSimGeneral::block_size), 5000, 10000, 50000 }
+    {static_cast<long int>(BM_VecSimGeneral::block_size), 5000, 10000, 50000}
 #else
-#define QUANT_BITS_ARGS                                                                            \
-    { VecSimSvsQuant_8 }
+#define QUANT_BITS_ARGS {VecSimSvsQuant_8}
 // Using smaller training TH to avoid long test times without LVQ
 #define COMPRESSED_TRAINING_THRESHOLD_ARGS                                                         \
-    { static_cast<long int>(BM_VecSimGeneral::block_size), 5000 }
+    {static_cast<long int>(BM_VecSimGeneral::block_size), 5000}
 #define COMPRESSED_ASYNC_TRAINING_THRESHOLD_ARGS                                                   \
-    { static_cast<long int>(BM_VecSimGeneral::block_size), 5000 }
+    {static_cast<long int>(BM_VecSimGeneral::block_size), 5000}
 
 #endif
diff --git a/tests/benchmark/types_ranges.h b/tests/benchmark/types_ranges.h
index deff4251c..43abda8f0 100644
--- a/tests/benchmark/types_ranges.h
+++ b/tests/benchmark/types_ranges.h
@@ -11,11 +11,9 @@
 #include <array>
 #include "bm_definitions.h"
 
-#define DEFAULT_RANGE_RADII                                                                        \
-    { 20, 35, 50 }
+#define DEFAULT_RANGE_RADII {20, 35, 50}
 
-#define DEFAULT_RANGE_EPSILONS                                                                     \
-    { 1, 10, 11 }
+#define DEFAULT_RANGE_EPSILONS {1, 10, 11}
 
 // This template struct methods returns the default values for radii and epsilons
 // To specify different values for a certain type, use template specialization
@@ -27,8 +25,7 @@ struct benchmark_range {
 
 // Larger Range query values are required for int8 wikipedia dataset.
 // Default values give 0 results
-#define INT8_RANGE_RADII                                                                           \
-    { 50, 65, 80 }
+#define INT8_RANGE_RADII {50, 65, 80}
 
 template <>
 struct benchmark_range<int8_index_t> {
@@ -37,8 +34,7 @@ struct benchmark_range<int8_index_t> {
 };
 
 // UINT8 ranges
-#define UINT8_RANGE_RADII                                                                          \
-    { 4, 5, 7 }
+#define UINT8_RANGE_RADII {4, 5, 7}
 
 template <>
 struct benchmark_range<uint8_index_t> {
diff --git a/tests/unit/test_allocator.cpp b/tests/unit/test_allocator.cpp
index 77db41684..6aa4a0d0b 100644
--- a/tests/unit/test_allocator.cpp
+++ b/tests/unit/test_allocator.cpp
@@ -33,7 +33,7 @@ struct ObjectWithSTL : public VecsimBaseObject {
 
 public:
     ObjectWithSTL(std::shared_ptr<VecSimAllocator> allocator)
-        : VecsimBaseObject(allocator), test_vec(allocator){};
+        : VecsimBaseObject(allocator), test_vec(allocator) {};
 };
 
 struct NestedObject : public VecsimBaseObject {
@@ -42,7 +42,7 @@ struct NestedObject : public VecsimBaseObject {
 
 public:
     NestedObject(std::shared_ptr<VecSimAllocator> allocator)
-        : VecsimBaseObject(allocator), stl_object(allocator), simpleObject(allocator){};
+        : VecsimBaseObject(allocator), stl_object(allocator), simpleObject(allocator) {};
 };
 
 TEST_F(AllocatorTest, test_simple_object) {