Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
10ea6df
Add design spec for SQ8↔FP16 ARM SIMD kernels [MOD-14972]
dor-forer May 28, 2026
c061da9
Add implementation plan for SQ8↔FP16 ARM SIMD kernels [MOD-14972]
dor-forer May 28, 2026
4f0534c
Add NEON_HP SQ8↔FP16 IP kernel header [MOD-14972]
dor-forer May 28, 2026
d3c6415
Add NEON_HP SQ8↔FP16 L2 kernel header [MOD-14972]
dor-forer May 28, 2026
69cee3d
Wire NEON_HP SQ8↔FP16 choosers [MOD-14972]
dor-forer May 28, 2026
1b36b38
Dispatch SQ8↔FP16 to NEON_HP tier on AArch64 [MOD-14972]
dor-forer May 28, 2026
1af4812
Extend SQ8↔FP16 tier-walk tests with NEON_HP [MOD-14972]
dor-forer May 28, 2026
0ce0bce
Add SVE SQ8↔FP16 IP kernel header [MOD-14972]
dor-forer May 28, 2026
eb4952a
Add SVE SQ8↔FP16 L2 kernel header [MOD-14972]
dor-forer May 28, 2026
fcb01bb
Wire SVE/SVE2 SQ8↔FP16 choosers [MOD-14972]
dor-forer May 28, 2026
15fca69
Dispatch SQ8↔FP16 to SVE/SVE2 tiers on AArch64 [MOD-14972]
dor-forer May 28, 2026
0fcd7d0
Extend SQ8↔FP16 tier-walk tests with SVE/SVE2 [MOD-14972]
dor-forer May 28, 2026
6a783f8
Register ARM SQ8↔FP16 microbenchmarks [MOD-14972]
dor-forer May 28, 2026
a2a1b24
Add missing alignment=0 assertions to SQ8↔FP16 ARM tier-walk tests [M…
May 31, 2026
284ad69
Fix SVE SQ8↔FP16 kernel: use svzip1 to correct FP16→FP32 widening [MO…
May 31, 2026
3754f76
Optimize ARM SQ8↔FP16 kernels and align with codebase conventions [MO…
May 31, 2026
10c03aa
Apply clang-format [MOD-14972]
May 31, 2026
9741cfb
Trim PR churn: remove docs, dispatcher comments, and test verbosity […
May 31, 2026
e1647dc
Apply clang-format 18.1.8 (matches CI) [MOD-14972]
May 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 129 additions & 0 deletions src/VecSim/spaces/IP/IP_NEON_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include <arm_neon.h>

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 via vcvt_f32_f16 per 16-lane chunk.
*/

// Helper: 16 lanes per call, four FP32 accumulators (one per quarter).
static inline void SQ8_FP16_InnerProductStep_NEON_HP(const uint8_t *&pVect1, const float16 *&pVect2,
float32x4_t &sum0, float32x4_t &sum1,
float32x4_t &sum2, float32x4_t &sum3) {
uint8x16_t v1_u8 = vld1q_u8(pVect1);
uint16x8_t v1_lo = vmovl_u8(vget_low_u8(v1_u8));
uint16x8_t v1_hi = vmovl_u8(vget_high_u8(v1_u8));
float32x4_t v1_0 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_lo)));
float32x4_t v1_1 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_lo)));
float32x4_t v1_2 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(v1_hi)));
float32x4_t v1_3 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(v1_hi)));

const float16_t *q = reinterpret_cast<const float16_t *>(pVect2);
float16x8_t q_lo = vld1q_f16(q);
float16x8_t q_hi = vld1q_f16(q + 8);
float32x4_t v2_0 = vcvt_f32_f16(vget_low_f16(q_lo));
float32x4_t v2_1 = vcvt_f32_f16(vget_high_f16(q_lo));
float32x4_t v2_2 = vcvt_f32_f16(vget_low_f16(q_hi));
float32x4_t v2_3 = vcvt_f32_f16(vget_high_f16(q_hi));

sum0 = vfmaq_f32(sum0, v1_0, v2_0);
sum1 = vfmaq_f32(sum1, v1_1, v2_1);
sum2 = vfmaq_f32(sum2, v1_2, v2_2);
sum3 = vfmaq_f32(sum3, v1_3, v2_3);

pVect1 += 16;
pVect2 += 16;
}

// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <unsigned char residual> // 0..15
float SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP(const void *pVect1v, const void *pVect2v,
size_t dimension) {
const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
const float16 *pVect2 = static_cast<const float16 *>(pVect2v);

float32x4_t sum0 = vdupq_n_f32(0.0f);
float32x4_t sum1 = vdupq_n_f32(0.0f);
float32x4_t sum2 = vdupq_n_f32(0.0f);
float32x4_t sum3 = vdupq_n_f32(0.0f);

const size_t num_of_chunks = dimension / 16;
for (size_t i = 0; i < num_of_chunks; i++) {
SQ8_FP16_InnerProductStep_NEON_HP(pVect1, pVect2, sum0, sum1, sum2, sum3);
}

// Residual: up to three independent 4-lane sub-steps, leaving at most 3 elements
// for scalar — mirrors the SQ8_FP32 NEON sister pattern.
// vld1_f16 (4 FP16 = 8 bytes) is safe for any residual: FP16 metadata follows
// the lane data so there is always enough headroom.
constexpr unsigned char r = residual;
if constexpr (r >= 4) {
uint8x8_t v1_u8 = vld1_u8(pVect1);
float32x4_t v1_a = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
float32x4_t v2_a = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
sum0 = vfmaq_f32(sum0, v1_a, v2_a);
pVect1 += 4;
pVect2 += 4;
}
if constexpr (r >= 8) {
uint8x8_t v1_u8 = vld1_u8(pVect1);
float32x4_t v1_b = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
float32x4_t v2_b = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
sum1 = vfmaq_f32(sum1, v1_b, v2_b);
pVect1 += 4;
pVect2 += 4;
}
if constexpr (r >= 12) {
uint8x8_t v1_u8 = vld1_u8(pVect1);
float32x4_t v1_c = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(v1_u8))));
float32x4_t v2_c = vcvt_f32_f16(vld1_f16(reinterpret_cast<const float16_t *>(pVect2)));
sum2 = vfmaq_f32(sum2, v1_c, v2_c);
pVect1 += 4;
pVect2 += 4;
}
constexpr unsigned char tail = r & 3;
float scalar_dot = 0.0f;
for (unsigned char k = 0; k < tail; ++k) {
scalar_dot += static_cast<float>(pVect1[k]) * vecsim_types::FP16_to_FP32(pVect2[k]);
}

float32x4_t sum_lo = vaddq_f32(sum0, sum1);
float32x4_t sum_hi = vaddq_f32(sum2, sum3);
float quantized_dot = vaddvq_f32(vaddq_f32(sum_lo, sum_hi)) + scalar_dot;

const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
const uint8_t *query_meta_bytes =
reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <unsigned char residual>
float SQ8_FP16_InnerProductSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v,
size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);
}

template <unsigned char residual>
float SQ8_FP16_CosineSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD16_NEON_HP<residual>(pVect1v, pVect2v, dimension);
}
113 changes: 113 additions & 0 deletions src/VecSim/spaces/IP/IP_SVE_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/types/sq8.h"
#include "VecSim/types/float16.h"
#include <arm_sve.h>

using sq8 = vecsim_types::sq8;
using float16 = vecsim_types::float16;

/*
* Asymmetric SQ8 (storage) <-> FP16 (query) inner product using algebraic identity:
* IP(x, y) ~= min * y_sum + delta * Σ(q_i * y_i)
*
* FP16 query lanes are widened to FP32 per step via svld1uh_u32 + svcvt_f32_f16_x.
* svld1uh_u32 zero-extends each FP16 halfword into a 32-bit lane so that
* svcvt_f32_f16_x reads the correct bits directly without any interleaving.
*/

// Helper: one SVE-vector-width-of-FP32 step.
static inline void SQ8_FP16_InnerProductStep_SVE(const uint8_t *pVect1, const float16 *pVect2,
size_t &offset, svfloat32_t &sum, svbool_t pg,
size_t chunk) {
svuint32_t v1_u32 = svld1ub_u32(pg, pVect1 + offset);
svfloat32_t v1_f = svcvt_f32_u32_x(pg, v1_u32);
svuint32_t q_u32 = svld1uh_u32(pg, reinterpret_cast<const uint16_t *>(pVect2 + offset));
svfloat32_t v2_f = svcvt_f32_f16_x(pg, svreinterpret_f16_u32(q_u32));
sum = svmla_f32_x(pg, sum, v1_f, v2_f);
offset += chunk;
}

// pVect1v = SQ8 storage, pVect2v = FP16 query. Precondition: dim >= 16 (enforced by dispatcher).
template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_InnerProductSIMD_SVE_IMP(const void *pVect1v, const void *pVect2v,
size_t dimension) {
const uint8_t *pVect1 = static_cast<const uint8_t *>(pVect1v);
const float16 *pVect2 = static_cast<const float16 *>(pVect2v);
size_t offset = 0;
svbool_t pg = svptrue_b32();
const size_t chunk = svcntw();

svfloat32_t sum0 = svdup_f32(0.0f);
svfloat32_t sum1 = svdup_f32(0.0f);
svfloat32_t sum2 = svdup_f32(0.0f);
svfloat32_t sum3 = svdup_f32(0.0f);

// Partial chunk for dim % chunk lanes. Use _z form so inactive lanes are zero;
// the final reduction walks all lanes via svptrue_b32().
if constexpr (partial_chunk) {
size_t remaining = dimension % chunk;
if (remaining > 0) {
svbool_t pg_partial = svwhilelt_b32(uint32_t(0), uint32_t(remaining));
svuint32_t v1_u32 = svld1ub_u32(pg_partial, pVect1 + offset);
svfloat32_t v1_f = svcvt_f32_u32_z(pg_partial, v1_u32);
svuint32_t q_u32 =
svld1uh_u32(pg_partial, reinterpret_cast<const uint16_t *>(pVect2 + offset));
svfloat32_t v2_f = svcvt_f32_f16_z(pg_partial, svreinterpret_f16_u32(q_u32));
sum0 = svmla_f32_z(pg_partial, sum0, v1_f, v2_f);
offset += remaining;
}
}

// Main loop: 4 chunks per iteration, one chunk per accumulator.
const size_t chunk_size = 4 * chunk;
const size_t number_of_chunks =
(dimension - (partial_chunk ? dimension % chunk : 0)) / chunk_size;
for (size_t i = 0; i < number_of_chunks; i++) {
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum3, pg, chunk);
}

if constexpr (additional_steps > 0)
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum0, pg, chunk);
if constexpr (additional_steps > 1)
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum1, pg, chunk);
if constexpr (additional_steps > 2)
SQ8_FP16_InnerProductStep_SVE(pVect1, pVect2, offset, sum2, pg, chunk);

svfloat32_t sum = svadd_f32_z(pg, sum0, sum1);
sum = svadd_f32_z(pg, sum, sum2);
sum = svadd_f32_z(pg, sum, sum3);
float quantized_dot = svaddv_f32(pg, sum);

const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
const float min_val = load_unaligned<float>(params_bytes + sq8::MIN_VAL * sizeof(float));
const float delta = load_unaligned<float>(params_bytes + sq8::DELTA * sizeof(float));
const uint8_t *query_meta_bytes =
reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
const float y_sum = load_unaligned<float>(query_meta_bytes + sq8::SUM_QUERY * sizeof(float));

return min_val * y_sum + delta * quantized_dot;
}

template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
return 1.0f - SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
pVect1v, pVect2v, dimension);
}

template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
return SQ8_FP16_InnerProductSIMD_SVE<partial_chunk, additional_steps>(pVect1v, pVect2v,
dimension);
}
40 changes: 40 additions & 0 deletions src/VecSim/spaces/IP_space.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,6 +225,26 @@ dist_func_t<float> IP_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
#endif
#endif // OPT_F16C
#endif // x86_64
#ifdef CPU_FEATURES_ARCH_AARCH64
if (dim < 16) {
return ret_dist_func;
}
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_SQ8_FP16_IP_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_SQ8_FP16_IP_implementation_SVE(dim);
}
#endif
#ifdef OPT_NEON_HP
if (features.asimdhp) {
return Choose_SQ8_FP16_IP_implementation_NEON_HP(dim);
}
#endif
#endif // CPU_FEATURES_ARCH_AARCH64
return ret_dist_func;
}

Expand Down Expand Up @@ -274,6 +294,26 @@ dist_func_t<float> Cosine_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignm
#endif
#endif // OPT_F16C
#endif // x86_64
#ifdef CPU_FEATURES_ARCH_AARCH64
if (dim < 16) {
return ret_dist_func;
}
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_SQ8_FP16_Cosine_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_SQ8_FP16_Cosine_implementation_SVE(dim);
}
#endif
#ifdef OPT_NEON_HP
if (features.asimdhp) {
return Choose_SQ8_FP16_Cosine_implementation_NEON_HP(dim);
}
#endif
#endif // CPU_FEATURES_ARCH_AARCH64
return ret_dist_func;
}

Expand Down
35 changes: 35 additions & 0 deletions src/VecSim/spaces/L2/L2_NEON_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/IP/IP_NEON_SQ8_FP16.h"

/*
* Optimised asymmetric SQ8<->FP16 L2 squared distance using the algebraic identity:
*
* ||x - y||^2 = sum(x_i^2) - 2 * IP(x, y) + sum(y_i^2)
* = x_sum_squares - 2 * IP(x, y) + y_sum_squares
*
* IP is computed by SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP; metadata is FP32.
*/

template <unsigned char residual> // 0..15
float SQ8_FP16_L2SqrSIMD16_NEON_HP(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float ip = SQ8_FP16_InnerProductSIMD16_NEON_HP_IMP<residual>(pVect1v, pVect2v, dimension);

const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));

const uint8_t *query_meta_bytes =
reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
const float y_sum_sq =
load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));

return x_sum_sq + y_sum_sq - 2.0f * ip;
}
32 changes: 32 additions & 0 deletions src/VecSim/spaces/L2/L2_SVE_SQ8_FP16.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Copyright (c) 2006-Present, Redis Ltd.
* All rights reserved.
*
* Licensed under your choice of the Redis Source Available License 2.0
* (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the
* GNU Affero General Public License v3 (AGPLv3).
*/
#pragma once
#include "VecSim/spaces/space_includes.h"
#include "VecSim/spaces/IP/IP_SVE_SQ8_FP16.h"

/*
* SVE SQ8<->FP16 L2 squared distance:
* ||x - y||^2 = x_sum_squares - 2 * IP(x, y) + y_sum_squares
* IP is computed by SQ8_FP16_InnerProductSIMD_SVE_IMP; metadata is FP32.
*/

template <bool partial_chunk, unsigned char additional_steps>
float SQ8_FP16_L2SqrSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t dimension) {
const float ip = SQ8_FP16_InnerProductSIMD_SVE_IMP<partial_chunk, additional_steps>(
pVect1v, pVect2v, dimension);

const uint8_t *params_bytes = static_cast<const uint8_t *>(pVect1v) + dimension;
const float x_sum_sq = load_unaligned<float>(params_bytes + sq8::SUM_SQUARES * sizeof(float));
const uint8_t *query_meta_bytes =
reinterpret_cast<const uint8_t *>(static_cast<const float16 *>(pVect2v) + dimension);
const float y_sum_sq =
load_unaligned<float>(query_meta_bytes + sq8::SUM_SQUARES_QUERY * sizeof(float));

return x_sum_sq + y_sum_sq - 2.0f * ip;
}
20 changes: 20 additions & 0 deletions src/VecSim/spaces/L2_space.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,26 @@ dist_func_t<float> L2_SQ8_FP16_GetDistFunc(size_t dim, unsigned char *alignment,
#endif
#endif // OPT_F16C
#endif // x86_64
#ifdef CPU_FEATURES_ARCH_AARCH64
if (dim < 16) {
return ret_dist_func;
}
#ifdef OPT_SVE2
if (features.sve2) {
return Choose_SQ8_FP16_L2_implementation_SVE2(dim);
}
#endif
#ifdef OPT_SVE
if (features.sve) {
return Choose_SQ8_FP16_L2_implementation_SVE(dim);
}
#endif
#ifdef OPT_NEON_HP
if (features.asimdhp) {
return Choose_SQ8_FP16_L2_implementation_NEON_HP(dim);
}
#endif
#endif // CPU_FEATURES_ARCH_AARCH64
return ret_dist_func;
}

Expand Down
Loading
Loading