// ----- src/VecSim/spaces/IP/IP.cpp — scalar reference kernels -----
// NOTE(review): the original diff was HTML-sanitized and every "<...>" span was
// stripped (template arguments of static_cast / reinterpret_cast, include
// targets). The casts below are reconstructed from the surrounding pointer
// arithmetic — verify against the repository before merging.

// Dequantize each uint8 element of pVect2v (q * delta + min_val) and
// accumulate the dot product against the fp32 vector pVect1v.
// The inv_norm parameter was removed: normalization now happens at ingestion.
float FLOAT_INTEGER_InnerProduct(const float *pVect1v, const uint8_t *pVect2v, size_t dimension,
                                 float min_val, float delta) {
    float res = 0;
    for (size_t i = 0; i < dimension; i++) {
        float dequantized_V2 = (pVect2v[i] * delta + min_val);
        res += pVect1v[i] * dequantized_V2;
    }
    return res;
}

// pVect1v: fp32 vector; pVect2v: SQ8 vector laid out as
// [uint8_t values (dim)][min_val (float)][delta (float)].
// Returns the inner-product distance 1 - IP.
float SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const auto *pVect1 = static_cast<const float *>(pVect1v);
    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
    // Compute inner product with dequantization.
    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
    return 1.0f - res;
}

// Cosine distance. Assumes both vectors were normalized when stored, so it
// reduces to the inner-product distance (the former inv_norm tail value is no
// longer read).
float SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const auto *pVect1 = static_cast<const float *>(pVect1v);
    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
    const float min_val = *reinterpret_cast<const float *>(pVect2 + dimension);
    const float delta = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
    const float res = FLOAT_INTEGER_InnerProduct(pVect1, pVect2, dimension, min_val, delta);
    return 1.0f - res;
}

// SQ8-to-SQ8: both vectors are uint8 quantized with a precomputed sum.
// Vector layout: [uint8_t values (dim)][min_val (float)][delta (float)][sum (float)]
// where sum = Σ v[i] over the ORIGINAL float values. Since v[i] = min + delta*q[i]:
//   IP = min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2
float SQ8_SQ8_InnerProduct(const void *pVect1v, const void *pVect2v, size_t dimension) {
    const auto *pVect1 = static_cast<const uint8_t *>(pVect1v);
    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);

    // Integer dot product of the quantized payloads: Σ(q1[i]*q2[i]).
    float product = 0;
    for (size_t i = 0; i < dimension; i++) {
        product += pVect1[i] * pVect2[i];
    }

    // Quantization parameters stored after the payload of each vector.
    const float min_val1 = *reinterpret_cast<const float *>(pVect1 + dimension);
    const float delta1 = *reinterpret_cast<const float *>(pVect1 + dimension + sizeof(float));
    const float sum1 = *reinterpret_cast<const float *>(pVect1 + dimension + 2 * sizeof(float));

    const float min_val2 = *reinterpret_cast<const float *>(pVect2 + dimension);
    const float delta2 = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
    const float sum2 = *reinterpret_cast<const float *>(pVect2 + dimension + 2 * sizeof(float));

    // Algebraic expansion using the precomputed sums (see derivation above).
    float res = min_val1 * sum2 + min_val2 * sum1 -
                static_cast<float>(dimension) * min_val1 * min_val2 + delta1 * delta2 * product;
    return 1.0f - res;
}

// SQ8-to-SQ8 cosine distance. Assumes both vectors were normalized before
// quantization, so cosine reduces to the inner-product distance.
float SQ8_SQ8_Cosine(const void *pVect1v, const void *pVect2v, size_t dimension) {
    return SQ8_SQ8_InnerProduct(pVect1v, pVect2v, dimension);
}
007ee333e..f8333e6e8 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_FMA_SQ8.h @@ -100,14 +100,7 @@ float SQ8_InnerProductSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, template // 0..15 float SQ8_CosineSIMD16_AVX2_FMA(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Get dequantization parameters from the end of quantized vector - const uint8_t *pVect2 = static_cast(pVect2v); - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - // Calculate inner product using common implementation with normalization float ip = SQ8_InnerProductImp_FMA(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - ip * inv_norm; + return 1.0f - ip; } diff --git a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h index 89b1c0b6b..203e32fad 100644 --- a/src/VecSim/spaces/IP/IP_AVX2_SQ8.h +++ b/src/VecSim/spaces/IP/IP_AVX2_SQ8.h @@ -33,7 +33,7 @@ static inline void InnerProductStepSQ8(const float *&pVect1, const uint8_t *&pVe } template // 0..15 -float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimension) { +float SQ8_InnerProductImp_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { const float *pVect1 = static_cast(pVect1v); // pVect2 is a quantized uint8_t vector const uint8_t *pVect2 = static_cast(pVect2v); @@ -89,19 +89,12 @@ float SQ8_InnerProductImp(const void *pVect1v, const void *pVect2v, size_t dimen template // 0..15 float SQ8_InnerProductSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - return 1.0f - SQ8_InnerProductImp(pVect1v, pVect2v, dimension); + return 1.0f - SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); } template // 0..15 float SQ8_CosineSIMD16_AVX2(const void *pVect1v, const void *pVect2v, size_t dimension) { - // Get dequantization parameters 
from the end of quantized vector - const uint8_t *pVect2 = static_cast(pVect2v); - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - ip * inv_norm; + float ip = SQ8_InnerProductImp_AVX2(pVect1v, pVect2v, dimension); + return 1.0f - ip; } diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h new file mode 100644 index 000000000..9b4f7e01a --- /dev/null +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h" +#include + +/** + * SQ8-to-SQ8 distance functions using AVX512 VNNI with precomputed sum. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * Uses precomputed sum stored in the vector data, + * eliminating the need to compute them during distance calculation. + * + * Uses algebraic optimization to leverage integer VNNI instructions: + * + * With sum = Σv[i] (sum of original float values), the formula is: + * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 + * + * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). 
+ * The dot product is computed using the efficient UINT8_InnerProductImp which uses + * VNNI instructions (_mm512_dpwssd_epi32) for native integer dot product computation. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] + */ + +// Common implementation for inner product between two SQ8 vectors with precomputed sum +// Uses UINT8_InnerProductImp for efficient dot product computation with VNNI +template // 0..63 +float SQ8_SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension) { + // Compute raw dot product using efficient UINT8 AVX512 VNNI implementation + // UINT8_InnerProductImp uses _mm512_dpwssd_epi32 for native integer dot product + int dot_product = UINT8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Get dequantization parameters and precomputed values from the end of vectors + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of original float elements + + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of original float elements + + // Apply the algebraic formula using precomputed sums: + // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * static_cast(dot_product) - + static_cast(dimension) * min1 * min2; +} + +// SQ8-to-SQ8 Inner Product distance function +// Returns 1 - inner_product (distance form) +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductImp(pVec1v, pVec2v, 
dimension); +} + +// SQ8-to-SQ8 Cosine distance function +// Returns 1 - (inner_product) +template // 0..63 +float SQ8_SQ8_CosineSIMD64_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Assume vectors are normalized. + return SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h index 96ff1e8f5..deed0f706 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_UINT8.h @@ -6,6 +6,7 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" static inline void InnerProductStep(uint8_t *&pVect1, uint8_t *&pVect2, __m512i &sum) { diff --git a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h index 3fd665111..35ea482fa 100644 --- a/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h +++ b/src/VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h @@ -36,8 +36,7 @@ static inline void SQ8_InnerProductStep(const float *&pVec1, const uint8_t *&pVe // Common implementation for both inner product and cosine similarity template // 0..15 -float SQ8_InnerProductImp(const void *pVec1v, const void *pVec2v, size_t dimension, - float inv_norm = 1.0f) { +float SQ8_InnerProductImp_AVX512(const void *pVec1v, const void *pVec2v, size_t dimension) { const float *pVec1 = static_cast(pVec1v); const uint8_t *pVec2 = static_cast(pVec2v); const float *pEnd1 = pVec1 + dimension; @@ -92,7 +91,7 @@ template // 0..15 float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { // Calculate inner product using common implementation - float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension); + float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); // 
The inner product similarity is 1 - ip return 1.0f - ip; @@ -101,12 +100,8 @@ float SQ8_InnerProductSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void * template // 0..15 float SQ8_CosineSIMD16_AVX512F_BW_VL_VNNI(const void *pVec1v, const void *pVec2v, size_t dimension) { - // Get the inverse norm factor stored after min_val and delta - const uint8_t *pVec2 = static_cast(pVec2v); - const float inv_norm = *reinterpret_cast(pVec2 + dimension + 2 * sizeof(float)); - // Calculate inner product using common implementation with normalization - float ip = SQ8_InnerProductImp(pVec1v, pVec2v, dimension, inv_norm); + float ip = SQ8_InnerProductImp_AVX512(pVec1v, pVec2v, dimension); // The cosine similarity is 1 - ip return 1.0f - ip; diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h new file mode 100644 index 000000000..7b2ed8829 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h" +#include + +/** + * SQ8-to-SQ8 distance functions using ARM NEON DOTPROD with precomputed sum. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * Uses precomputed sum stored in the vector data, + * eliminating the need to compute them during distance calculation. 
+ * + * Uses algebraic optimization with DOTPROD instruction: + * + * With sum = Σv[i] (sum of original float values), the formula is: + * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 + * + * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * The dot product is computed using the efficient UINT8_InnerProductImp which uses + * the DOTPROD instruction (vdotq_u32) for native uint8 dot product computation. + * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] + */ + +// Common implementation for inner product between two SQ8 vectors with precomputed sum +// Uses UINT8_InnerProductImp for efficient dot product computation with DOTPROD +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Compute raw dot product using efficient UINT8 DOTPROD implementation + // UINT8_InnerProductImp uses vdotq_u32 for native uint8 dot product + float dot_product = UINT8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Get dequantization parameters and precomputed values from the end of vectors + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of original float elements + + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of original float elements + + // Apply algebraic formula using precomputed sums: + // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * dot_product - + static_cast(dimension) * min1 * min2; +} + +// 
SQ8-to-SQ8 Inner Product distance function +// Returns 1 - inner_product (distance form) +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, + size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD_IMP(pVec1v, pVec2v, dimension); +} + +// SQ8-to-SQ8 Cosine distance function +// Returns 1 - inner_product (assumes vectors are pre-normalized) +template // 0..63 +float SQ8_SQ8_CosineSIMD64_NEON_DOTPROD(const void *pVec1v, const void *pVec2v, size_t dimension) { + return SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h index dde497c46..73682a21a 100644 --- a/src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h +++ b/src/VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h @@ -6,6 +6,7 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" #include diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8.h index 3e632dcdb..7c3f27e10 100644 --- a/src/VecSim/spaces/IP/IP_NEON_SQ8.h +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8.h @@ -114,15 +114,7 @@ float SQ8_InnerProductSIMD16_NEON(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_NEON(const void *pVect1v, const void *pVect2v, size_t dimension) { - const uint8_t *pVect2 = static_cast(pVect2v); - - // Get quantization parameters - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - // Compute inner product with dequantization using the common function const float res = SQ8_InnerProductSIMD16_NEON_IMP(pVect1v, pVect2v, dimension); - - // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - res * inv_norm; + return 1.0f - res; } 
diff --git a/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h new file mode 100644 index 000000000..8d6cbd650 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_NEON_UINT8.h" +#include + +/** + * SQ8-to-SQ8 distance functions using ARM NEON with precomputed sum. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * Uses precomputed sum stored in the vector data, + * eliminating the need to compute them during distance calculation. + * + * Uses algebraic optimization: + * + * With sum = Σv[i] (sum of original float values), the formula is: + * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 + * + * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * The dot product is computed using the efficient UINT8_InnerProductImp which uses + * native NEON uint8 multiply-accumulate instructions (vmull_u8, vpadalq_u16). 
+ * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] + */ + +// Common implementation for inner product between two SQ8 vectors with precomputed sum +// Uses UINT8_InnerProductImp for efficient dot product computation +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_NEON_IMP(const void *pVec1v, const void *pVec2v, + size_t dimension) { + // Compute raw dot product using efficient UINT8 implementation + // UINT8_InnerProductImp processes 16 elements at a time using native uint8 instructions + float dot_product = UINT8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Get dequantization parameters and precomputed values from the end of pVec1 + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of original float elements + + // Get dequantization parameters and precomputed values from the end of pVec2 + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of original float elements + + // Apply algebraic formula using precomputed sums: + // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * dot_product - + static_cast(dimension) * min1 * min2; +} + +// SQ8-to-SQ8 Inner Product distance function +// Returns 1 - inner_product (distance form) +template // 0..63 +float SQ8_SQ8_InnerProductSIMD64_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD64_NEON_IMP(pVec1v, pVec2v, dimension); +} + +// SQ8-to-SQ8 Cosine distance function +// Returns 1 - inner_product (assumes vectors are pre-normalized) 
+template // 0..63 +float SQ8_SQ8_CosineSIMD64_NEON(const void *pVec1v, const void *pVec2v, size_t dimension) { + return SQ8_SQ8_InnerProductSIMD64_NEON(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP/IP_NEON_UINT8.h b/src/VecSim/spaces/IP/IP_NEON_UINT8.h index 10bb18707..6263eeea4 100644 --- a/src/VecSim/spaces/IP/IP_NEON_UINT8.h +++ b/src/VecSim/spaces/IP/IP_NEON_UINT8.h @@ -6,6 +6,7 @@ * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the * GNU Affero General Public License v3 (AGPLv3). */ +#pragma once #include "VecSim/spaces/space_includes.h" #include diff --git a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h index 5e47af2b6..d5dfc4e80 100644 --- a/src/VecSim/spaces/IP/IP_SSE4_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SSE4_SQ8.h @@ -104,16 +104,10 @@ float SQ8_InnerProductSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size template // 0..15 float SQ8_CosineSIMD16_SSE4(const void *pVect1v, const void *pVect2v, size_t dimension) { - - const uint8_t *pVect2 = static_cast(pVect2v); - // Get quantization parameters - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - // Compute inner product with dequantization using the common function // We need to cast away const for the inner product function, but it doesn't modify the vectors const float res = SQ8_InnerProductSIMD16_SSE4_IMP(pVect1v, pVect2v, dimension); // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - res * inv_norm; + return 1.0f - res; } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8.h index 7b9bd86bc..825e9c501 100644 --- a/src/VecSim/spaces/IP/IP_SVE_SQ8.h +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8.h @@ -131,16 +131,10 @@ float SQ8_InnerProductSIMD_SVE(const void *pVect1v, const void *pVect2v, size_t template float SQ8_CosineSIMD_SVE(const void *pVect1v, const void *pVect2v, 
size_t dimension) { - const uint8_t *pVect2 = static_cast(pVect2v); - - // Get quantization parameters - const float inv_norm = *reinterpret_cast(pVect2 + dimension + 2 * sizeof(float)); - // Compute inner product with dequantization using the common function const float res = SQ8_InnerProductSIMD_SVE_IMP(pVect1v, pVect2v, dimension); // For cosine, we need to account for the vector norms - // The inv_norm parameter is stored after min_val and delta in the quantized vector - return 1.0f - res * inv_norm; + return 1.0f - res; } diff --git a/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h new file mode 100644 index 000000000..e0369f5b7 --- /dev/null +++ b/src/VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#pragma once +#include "VecSim/spaces/space_includes.h" +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" +#include + +/** + * SQ8-to-SQ8 distance functions using ARM SVE with precomputed sum. + * These functions compute distance between two SQ8 (scalar quantized 8-bit) vectors, + * where BOTH vectors are uint8 quantized. + * + * Uses precomputed sum stored in the vector data, + * eliminating the need to compute them during distance calculation. + * + * Uses algebraic optimization with SVE dot product instruction: + * + * With sum = Σv[i] (sum of original float values), the formula is: + * IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1[i]*q2[i]) - dim*min1*min2 + * + * Since sum is precomputed, we only need to compute the dot product Σ(q1[i]*q2[i]). + * The dot product is computed using the efficient UINT8_InnerProductImp which uses + * SVE dot product instruction (svdot_u32) for native uint8 dot product computation. 
+ * + * Vector layout: [uint8_t values (dim)] [min_val (float)] [delta (float)] [sum (float)] + */ + +// Common implementation for inner product between two SQ8 vectors with precomputed sum +// Uses UINT8_InnerProductImp for efficient dot product computation with SVE +template +float SQ8_SQ8_InnerProductSIMD_SVE_IMP(const void *pVec1v, const void *pVec2v, size_t dimension) { + // Compute raw dot product using efficient UINT8 SVE implementation + // UINT8_InnerProductImp uses svdot_u32 for native uint8 dot product + float dot_product = + UINT8_InnerProductImp(pVec1v, pVec2v, dimension); + + // Get dequantization parameters and precomputed values from the end of vectors + // Layout: [data (dim)] [min (float)] [delta (float)] [sum (float)] + const uint8_t *pVec1 = static_cast(pVec1v); + const uint8_t *pVec2 = static_cast(pVec2v); + + const float *params1 = reinterpret_cast(pVec1 + dimension); + const float min1 = params1[0]; + const float delta1 = params1[1]; + const float sum1 = params1[2]; // Precomputed sum of original float elements + + const float *params2 = reinterpret_cast(pVec2 + dimension); + const float min2 = params2[0]; + const float delta2 = params2[1]; + const float sum2 = params2[2]; // Precomputed sum of original float elements + + // Apply algebraic formula with float conversion only at the end: + // IP = min1*sum2 + min2*sum1 + δ1*δ2 * Σ(q1*q2) - dim*min1*min2 + return min1 * sum2 + min2 * sum1 + delta1 * delta2 * dot_product - + static_cast(dimension) * min1 * min2; +} + +// SQ8-to-SQ8 Inner Product distance function +// Returns 1 - inner_product (distance form) +template +float SQ8_SQ8_InnerProductSIMD_SVE(const void *pVec1v, const void *pVec2v, size_t dimension) { + return 1.0f - SQ8_SQ8_InnerProductSIMD_SVE_IMP(pVec1v, pVec2v, + dimension); +} + +// SQ8-to-SQ8 Cosine distance function +// Returns 1 - inner_product (assumes vectors are pre-normalized) +template +float SQ8_SQ8_CosineSIMD_SVE(const void *pVec1v, const void *pVec2v, size_t dimension) 
{ + // Assume vectors are normalized. + return SQ8_SQ8_InnerProductSIMD_SVE(pVec1v, pVec2v, dimension); +} diff --git a/src/VecSim/spaces/IP_space.cpp b/src/VecSim/spaces/IP_space.cpp index d24c1d142..c25f0d043 100644 --- a/src/VecSim/spaces/IP_space.cpp +++ b/src/VecSim/spaces/IP_space.cpp @@ -150,6 +150,95 @@ dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, return ret_dist_func; } +// SQ8-to-SQ8 Inner Product distance function (both vectors are uint8 quantized with precomputed +// sum) +dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_SQ8_InnerProduct; + [[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_SQ8_IP_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_SQ8_IP_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON_DOTPROD + if (features.asimddp && dim >= 16) { + return Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd && dim >= 16) { + return Choose_SQ8_SQ8_IP_implementation_NEON(dim); + } +#endif +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { + return Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + +// SQ8-to-SQ8 Cosine distance function (both vectors are uint8 quantized with precomputed sum) +dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment, + const void *arch_opt) { + unsigned char dummy_alignment; + if (alignment == nullptr) { + alignment = &dummy_alignment; + } + + dist_func_t ret_dist_func = SQ8_SQ8_Cosine; + 
[[maybe_unused]] auto features = getCpuOptimizationFeatures(arch_opt); + +#ifdef CPU_FEATURES_ARCH_AARCH64 +#ifdef OPT_SVE2 + if (features.sve2) { + return Choose_SQ8_SQ8_Cosine_implementation_SVE2(dim); + } +#endif +#ifdef OPT_SVE + if (features.sve) { + return Choose_SQ8_SQ8_Cosine_implementation_SVE(dim); + } +#endif +#ifdef OPT_NEON_DOTPROD + if (features.asimddp && dim >= 16) { + return Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(dim); + } +#endif +#ifdef OPT_NEON + if (features.asimd && dim >= 16) { + return Choose_SQ8_SQ8_Cosine_implementation_NEON(dim); + } +#endif +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (dim >= 64 && features.avx512f && features.avx512bw && features.avx512vnni) { + return Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim); + } +#endif +#endif // __x86_64__ + return ret_dist_func; +} + dist_func_t IP_FP32_GetDistFunc(size_t dim, unsigned char *alignment, const void *arch_opt) { unsigned char dummy_alignment; if (alignment == nullptr) { diff --git a/src/VecSim/spaces/IP_space.h b/src/VecSim/spaces/IP_space.h index db2d0b2d9..9a03c6a96 100644 --- a/src/VecSim/spaces/IP_space.h +++ b/src/VecSim/spaces/IP_space.h @@ -31,4 +31,9 @@ dist_func_t Cosine_UINT8_GetDistFunc(size_t dim, unsigned char *alignment const void *arch_opt = nullptr); dist_func_t Cosine_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, const void *arch_opt = nullptr); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +dist_func_t IP_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); +dist_func_t Cosine_SQ8_SQ8_GetDistFunc(size_t dim, unsigned char *alignment = nullptr, + const void *arch_opt = nullptr); } // namespace spaces diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp index fef34dd22..256d2eea2 100644 --- 
a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.cpp @@ -17,6 +17,8 @@ #include "VecSim/spaces/IP/IP_AVX512F_SQ8_BW_VL_VNNI.h" #include "VecSim/spaces/L2/L2_AVX512F_BW_VL_VNNI_SQ8.h" +#include "VecSim/spaces/IP/IP_AVX512F_BW_VL_VNNI_SQ8_SQ8.h" + namespace spaces { #include "implementation_chooser.h" @@ -72,6 +74,18 @@ dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim) { CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 16, SQ8_L2SqrSIMD16_AVX512F_BW_VL_VNNI); return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_CosineSIMD64_AVX512F_BW_VL_VNNI); + return ret_dist_func; +} #include "implementation_chooser_cleanup.h" diff --git a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h index 745a339fb..f3127d577 100644 --- a/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h +++ b/src/VecSim/spaces/functions/AVX512F_BW_VL_VNNI.h @@ -24,4 +24,8 @@ dist_func_t Choose_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); dist_func_t Choose_SQ8_L2_implementation_AVX512F_BW_VL_VNNI(size_t dim); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +dist_func_t Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.cpp 
b/src/VecSim/spaces/functions/NEON.cpp index d0b5c9160..df181ecad 100644 --- a/src/VecSim/spaces/functions/NEON.cpp +++ b/src/VecSim/spaces/functions/NEON.cpp @@ -17,6 +17,7 @@ #include "VecSim/spaces/IP/IP_NEON_FP64.h" #include "VecSim/spaces/L2/L2_NEON_SQ8.h" #include "VecSim/spaces/IP/IP_NEON_SQ8.h" +#include "VecSim/spaces/IP/IP_NEON_SQ8_SQ8.h" namespace spaces { @@ -99,6 +100,20 @@ dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +// Uses 64-element chunking to leverage efficient UINT8_InnerProductImp +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_NEON); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_CosineSIMD64_NEON); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON.h b/src/VecSim/spaces/functions/NEON.h index 1449c6ac5..1c3dba285 100644 --- a/src/VecSim/spaces/functions/NEON.h +++ b/src/VecSim/spaces/functions/NEON.h @@ -30,4 +30,8 @@ dist_func_t Choose_SQ8_L2_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_IP_implementation_NEON(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_NEON(size_t dim); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp index 02f098420..d9ec6da35 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.cpp +++ 
b/src/VecSim/spaces/functions/NEON_DOTPROD.cpp @@ -9,6 +9,7 @@ #include "NEON.h" #include "VecSim/spaces/IP/IP_NEON_DOTPROD_INT8.h" #include "VecSim/spaces/IP/IP_NEON_DOTPROD_UINT8.h" +#include "VecSim/spaces/IP/IP_NEON_DOTPROD_SQ8_SQ8.h" #include "VecSim/spaces/L2/L2_NEON_DOTPROD_INT8.h" #include "VecSim/spaces/L2/L2_NEON_DOTPROD_UINT8.h" @@ -52,6 +53,19 @@ dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_InnerProductSIMD64_NEON_DOTPROD); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_IMPLEMENTATION(ret_dist_func, dim, 64, SQ8_SQ8_CosineSIMD64_NEON_DOTPROD); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/NEON_DOTPROD.h b/src/VecSim/spaces/functions/NEON_DOTPROD.h index 199e57708..6e98358c5 100644 --- a/src/VecSim/spaces/functions/NEON_DOTPROD.h +++ b/src/VecSim/spaces/functions/NEON_DOTPROD.h @@ -21,4 +21,8 @@ dist_func_t Choose_UINT8_Cosine_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_INT8_L2_implementation_NEON_DOTPROD(size_t dim); dist_func_t Choose_UINT8_L2_implementation_NEON_DOTPROD(size_t dim); +// SQ8-to-SQ8 DOTPROD-optimized distance functions (with precomputed sum) +dist_func_t Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.cpp b/src/VecSim/spaces/functions/SVE.cpp index 208763779..d3f0a757d 100644 --- a/src/VecSim/spaces/functions/SVE.cpp +++ b/src/VecSim/spaces/functions/SVE.cpp @@ -25,6 +25,8 @@ #include 
"VecSim/spaces/IP/IP_SVE_SQ8.h" #include "VecSim/spaces/L2/L2_SVE_SQ8.h" +#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" + namespace spaces { #include "implementation_chooser.h" @@ -116,6 +118,20 @@ dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +// Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) +dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE.h b/src/VecSim/spaces/functions/SVE.h index 680e906e6..4cce8cfc8 100644 --- a/src/VecSim/spaces/functions/SVE.h +++ b/src/VecSim/spaces/functions/SVE.h @@ -33,4 +33,8 @@ dist_func_t Choose_SQ8_IP_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SVE(size_t dim); dist_func_t Choose_SQ8_L2_implementation_SVE(size_t dim); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized with precomputed sum) +dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE(size_t dim); + } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.cpp b/src/VecSim/spaces/functions/SVE2.cpp index 9df4b3b08..8d03fbe97 100644 --- a/src/VecSim/spaces/functions/SVE2.cpp +++ b/src/VecSim/spaces/functions/SVE2.cpp @@ -16,12 +16,13 @@ #include "VecSim/spaces/IP/IP_SVE_FP64.h" #include "VecSim/spaces/L2/L2_SVE_FP64.h" -#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE -#include 
"VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/IP/IP_SVE_SQ8.h" // SVE2 implementation is identical to SVE -#include "VecSim/spaces/L2/L2_SVE_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_INT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_UINT8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/L2/L2_SVE_SQ8.h" // SVE2 implementation is identical to SVE +#include "VecSim/spaces/IP/IP_SVE_SQ8_SQ8.h" // SVE2 implementation is identical to SVE namespace spaces { @@ -114,6 +115,20 @@ dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim) { return ret_dist_func; } +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +// Note: Use svcntb for uint8 elements (not svcntw which is for 32-bit elements) +dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_InnerProductSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim) { + dist_func_t ret_dist_func; + CHOOSE_SVE_IMPLEMENTATION(ret_dist_func, SQ8_SQ8_CosineSIMD_SVE, dim, svcntb); + return ret_dist_func; +} + #include "implementation_chooser_cleanup.h" } // namespace spaces diff --git a/src/VecSim/spaces/functions/SVE2.h b/src/VecSim/spaces/functions/SVE2.h index 059f38b1b..bf1a717c9 100644 --- a/src/VecSim/spaces/functions/SVE2.h +++ b/src/VecSim/spaces/functions/SVE2.h @@ -33,4 +33,8 
@@ dist_func_t Choose_SQ8_IP_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_Cosine_implementation_SVE2(size_t dim); dist_func_t Choose_SQ8_L2_implementation_SVE2(size_t dim); +// SQ8-to-SQ8 distance functions (both vectors are uint8 quantized) +dist_func_t Choose_SQ8_SQ8_IP_implementation_SVE2(size_t dim); +dist_func_t Choose_SQ8_SQ8_Cosine_implementation_SVE2(size_t dim); + } // namespace spaces diff --git a/tests/benchmark/CMakeLists.txt b/tests/benchmark/CMakeLists.txt index d898fa85c..052207214 100644 --- a/tests/benchmark/CMakeLists.txt +++ b/tests/benchmark/CMakeLists.txt @@ -39,7 +39,7 @@ endif() # Spaces benchmarks # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # -set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8) +set(DATA_TYPE fp32 fp64 bf16 fp16 int8 uint8 sq8 sq8_sq8) foreach(data_type IN LISTS DATA_TYPE) add_executable(bm_spaces_${data_type} spaces_benchmarks/bm_spaces_${data_type}.cpp) target_link_libraries(bm_spaces_${data_type} VectorSimilarity benchmark::benchmark) diff --git a/tests/benchmark/benchmarks.sh b/tests/benchmark/benchmarks.sh index bc8db7535..00eaf47a0 100755 --- a/tests/benchmark/benchmarks.sh +++ b/tests/benchmark/benchmarks.sh @@ -20,6 +20,7 @@ if [ -z "$BM_TYPE" ] || [ "$BM_TYPE" = "benchmarks-all" ]; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8 + echo spaces_sq8_sq8 elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo basics_single_fp32 @@ -31,6 +32,7 @@ elif [ "$BM_TYPE" = "benchmarks-default" ]; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8 + echo spaces_sq8_sq8 # Basic benchmarks @@ -97,6 +99,9 @@ elif [ "$BM_TYPE" = "bm-svs-train-fp16" ] ; then elif [ "$BM_TYPE" = "bm-basics-svs-fp32-single" ] ; then echo basics_svs_single_fp32 echo basics_svs_single_fp32_LVQ8 +elif [ "$BM_TYPE" = "bm-spaces-sq8-full" ] ; then + echo spaces_sq8 + echo spaces_sq8_sq8 # Spaces benchmarks elif [ "$BM_TYPE" = "bm-spaces" ] ; then @@ -107,6 +112,7 @@ elif [ "$BM_TYPE" = 
"bm-spaces" ] ; then echo spaces_int8 echo spaces_uint8 echo spaces_sq8 + echo spaces_sq8_sq8 elif [ "$BM_TYPE" = "bm-spaces-fp32" ] ; then echo spaces_fp32 @@ -122,4 +128,6 @@ elif [ "$BM_TYPE" = "bm-spaces-uint8" ] ; then echo spaces_uint8 elif [ "$BM_TYPE" = "bm-spaces-sq8" ] ; then echo spaces_sq8 +elif [ "$BM_TYPE" = "bm-spaces-sq8-sq8" ] ; then + echo spaces_sq8_sq8 fi diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp index 1349a3512..f1d9ebd90 100644 --- a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8.cpp @@ -25,8 +25,8 @@ class BM_VecSimSpaces_SQ8 : public benchmark::Fixture { v1 = new float[dim]; test_utils::populate_float_vec(v1, dim, 123); // Allocate vector with extra space for min, delta and cosine calculations - v2 = new uint8_t[dim + sizeof(float) * 3]; - test_utils::populate_float_vec_to_sq8(v2, dim, 1234); + v2 = new uint8_t[dim + sizeof(float) * 4]; + test_utils::populate_float_vec_to_sq8_with_metadata(v2, dim, 1234, true); } void TearDown(const ::benchmark::State &state) { delete v1; diff --git a/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp new file mode 100644 index 000000000..13c28ee4e --- /dev/null +++ b/tests/benchmark/spaces_benchmarks/bm_spaces_sq8_sq8.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2006-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ +#include "bm_spaces.h" +#include "utils/tests_utils.h" + +/** + * SQ8-to-SQ8 benchmarks: Both vectors are uint8 quantized with dequantization applied to both. 
+ */ +class BM_VecSimSpaces_SQ8_SQ8 : public benchmark::Fixture { +protected: + std::mt19937 rng; + size_t dim; + uint8_t *v1; + uint8_t *v2; + +public: + BM_VecSimSpaces_SQ8_SQ8() { rng.seed(47); } + ~BM_VecSimSpaces_SQ8_SQ8() = default; + + void SetUp(const ::benchmark::State &state) { + dim = state.range(0); + // Allocate both vectors with extra space for min, delta, sum, and sum_squares + // Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] + // [sum_squares (float)] + v1 = new uint8_t[dim + sizeof(float) * 4]; + v2 = new uint8_t[dim + sizeof(float) * 4]; + test_utils::populate_float_vec_to_sq8_with_metadata(v1, dim, true, 123); + test_utils::populate_float_vec_to_sq8_with_metadata(v2, dim, true, 1234); + } + void TearDown(const ::benchmark::State &state) { + delete[] v1; + delete[] v2; + } +}; + +#ifdef CPU_FEATURES_ARCH_AARCH64 +cpu_features::Aarch64Features opt = cpu_features::GetAarch64Info().features; + +// NEON SQ8-to-SQ8 functions +#ifdef OPT_NEON +bool neon_supported = opt.asimd; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 64, neon_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, NEON, 64, neon_supported); +#endif // NEON +// SVE SQ8-to-SQ8 functions +#ifdef OPT_SVE +bool sve_supported = opt.sve; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE, 16, sve_supported); +#endif // SVE +// SVE2 SQ8-to-SQ8 functions +#ifdef OPT_SVE2 +bool sve2_supported = opt.sve2; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, SVE2, 16, sve2_supported); +#endif // SVE2 +#endif // AARCH64 + +#ifdef CPU_FEATURES_ARCH_X86_64 +cpu_features::X86Features opt = cpu_features::GetX86Info().features; + +// AVX512_F_BW_VL_VNNI SQ8-to-SQ8 functions +#ifdef OPT_AVX512_F_BW_VL_VNNI 
+bool avx512_f_bw_vl_vnni_supported = opt.avx512f && opt.avx512bw && opt.avx512vl && opt.avx512vnni; +INITIALIZE_BENCHMARKS_SET_IP(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, + avx512_f_bw_vl_vnni_supported); +INITIALIZE_BENCHMARKS_SET_Cosine(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, AVX512F_BW_VL_VNNI, 64, + avx512_f_bw_vl_vnni_supported); + +#endif // AVX512_F_BW_VL_VNNI +#endif // x86_64 + +// Naive SQ8-to-SQ8 algorithms +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, InnerProduct, 16); +INITIALIZE_NAIVE_BM(BM_VecSimSpaces_SQ8_SQ8, SQ8_SQ8, Cosine, 16); + +BENCHMARK_MAIN(); diff --git a/tests/unit/test_spaces.cpp b/tests/unit/test_spaces.cpp index dabe9c794..0ae60ee86 100644 --- a/tests/unit/test_spaces.cpp +++ b/tests/unit/test_spaces.cpp @@ -43,6 +43,7 @@ using bfloat16 = vecsim_types::bfloat16; using float16 = vecsim_types::float16; +using namespace spaces; class SpacesTest : public ::testing::Test { @@ -319,7 +320,7 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { } // Create SQ8 compressed version of v2 - // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) + // Size: dim (uint8_t) + min_val (float) + delta (float) + sum (float) size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); if (should_normalize) { spaces::GetNormalizeFunc()(v1_orig, dim); @@ -329,12 +330,14 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { // Find min and max for quantization float min_val = v2_orig[0]; float max_val = v2_orig[0]; + float sum = v2_orig[0]; for (size_t i = 1; i < dim; i++) { min_val = std::min(min_val, v2_orig[i]); max_val = std::max(max_val, v2_orig[i]); + sum += v2_orig[i]; } - // Calculate delta and inverse norm + // Calculate delta float delta = (max_val - min_val) / 255.0f; if (delta == 0) delta = 1.0f; // Avoid division by zero @@ -348,12 +351,13 @@ void common_ip_sq8(bool should_normalize, float expected_dist) { // Store parameters params[0] = min_val;
params[1] = delta; + params[2] = sum; // Quantize each value for (size_t i = 0; i < dim; i++) { - float normalized = (v2_orig[i] - min_val) / delta; - normalized = std::max(0.0f, std::min(255.0f, normalized)); - quant_values[i] = static_cast(std::round(normalized)); + float quantized = (v2_orig[i] - min_val) / delta; + quantized = std::max(0.0f, std::min(255.0f, quantized)); + quant_values[i] = static_cast(std::round(quantized)); } float dist = SQ8_InnerProduct((const void *)v1_orig, (const void *)v2_compressed.data(), dim); @@ -385,12 +389,15 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { // Size: dim (uint8_t) + min_val (float) + delta (float) + inv_norm (float) size_t compressed_size = dim * sizeof(uint8_t) + 3 * sizeof(float); spaces::GetNormalizeFunc()(v1_orig, dim); + spaces::GetNormalizeFunc()(v2_orig, dim); // Find min and max for quantization float min_val = v2_orig[0]; float max_val = v2_orig[0]; + float sum = v2_orig[0]; for (size_t i = 1; i < dim; i++) { min_val = std::min(min_val, v2_orig[i]); max_val = std::max(max_val, v2_orig[i]); + sum += v2_orig[i]; } // Calculate delta and inverse norm float delta = (max_val - min_val) / 255.0f; @@ -408,20 +415,13 @@ TEST_F(SpacesTest, SQ8_Cosine_no_optimization_func_test) { normalized = std::max(0.0f, std::min(255.0f, normalized)); quant_values[i] = static_cast(std::round(normalized)); } - // Calculate inverse norm from decompressed values - float inv_norm = 0.0f; - for (size_t i = 0; i < dim; i++) { - float decompressed_value = min_val + quant_values[i] * delta; - inv_norm += decompressed_value * decompressed_value; - } - inv_norm = 1.0f / std::sqrt(inv_norm); // Store parameters params[0] = min_val; params[1] = delta; - params[2] = inv_norm; + params[2] = sum; float dist = SQ8_Cosine((const void *)v1_orig, (const void *)v2_compressed.data(), dim); - ASSERT_NEAR(dist, 0.0f, 0.000001f) << "SQ8_Cosine failed to match expected distance"; + ASSERT_NEAR(dist, 0.0f, 0.001f) << "SQ8_Cosine failed to 
match expected distance"; } TEST_F(SpacesTest, SQ8_l2sqr_no_optimization_func_test) { // create a vector with extra space for the norm @@ -509,8 +509,6 @@ TEST_F(SpacesTest, GetDistFuncInvalidMetricUINT8) { (spaces::GetDistFunc((VecSimMetric)(VecSimMetric_Cosine + 1), 10, nullptr)), std::invalid_argument); } - -using namespace spaces; #ifdef CPU_FEATURES_ARCH_X86_64 TEST_F(SpacesTest, smallDimChooser) { // Verify that small dimensions gets the no optimization function. @@ -2467,3 +2465,594 @@ TEST_P(SQ8SpacesOptimizationTest, SQ8CosineTest) { << "No optimization with dim " << dim; ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; } + +/* ======================== Tests SQ8_SQ8 ========================= */ + +TEST_F(SpacesTest, SQ8_SQ8_ip_no_optimization_func_test) { + size_t dim = 5; + + // Create SQ8 quantized versions of both vectors + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 5678); + + float baseline = test_utils::SQ8_SQ8_NotOptimized_InnerProduct(v1_quantized.data(), + v2_quantized.data(), dim); + + unsigned char alignment = 0; +#ifdef CPU_FEATURES_ARCH_AARCH64 + // Make sure we don't use any optimization (because there is no size optimization for arm) + auto optimization = getCpuOptimizationFeatures(); + optimization.sve = optimization.sve2 = 0; + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); +#else + auto arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); +#endif + ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + // Checks that the function with the optimized math equivalence returns similar result. 
+ // Use ASSERT_NEAR due to floating-point differences between naive and algebraic formulas. + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.001) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +TEST_F(SpacesTest, SQ8_SQ8_Cosine_no_optimization_func_test) { + size_t dim = 5; + + // Create SQ8 quantized versions of both vectors + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 5678); + + float baseline = + test_utils::SQ8_SQ8_NotOptimized_Cosine(v1_quantized.data(), v2_quantized.data(), dim); + + unsigned char alignment = 0; +#ifdef CPU_FEATURES_ARCH_AARCH64 + // Make sure we don't use any optimization (because there is no size optimization for arm) + auto optimization = getCpuOptimizationFeatures(); + optimization.sve = optimization.sve2 = 0; + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); +#else + auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr); +#endif + ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) + << "Unexpected distance function chosen for dim " << dim; + // Checks that the function with the optimized math equivalence returns the same result. 
+ // min1*sum2 + min2*sum1 + delta1*delta2*Σ(q1[i]*q2[i]) - dim*min1*min2 + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.001) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +class SQ8_SQ8_SpacesOptimizationTest : public testing::TestWithParam {}; + +TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_InnerProductTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create SQ8 quantized versions of both vectors + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 5678); + + dist_func_t arch_opt_func; + float baseline = SQ8_SQ8_InnerProduct(v1_quantized.data(), v2_quantized.data(), dim); + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp && dim >= 64) { + unsigned char alignment = 0; + 
arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_NEON_DOTPROD(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "NEON_DOTPROD with dim " << dim; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "NEON with dim " << dim; + optimization.asimd = 0; + } +#endif + +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_IP_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } +#endif + + // Test default implementation + unsigned char alignment = 0; + arch_opt_func = IP_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_SQ8_InnerProduct) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim)) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +TEST_P(SQ8_SQ8_SpacesOptimizationTest, SQ8_SQ8_CosineTest) { + auto optimization = getCpuOptimizationFeatures(); + size_t dim = GetParam(); + + // Create quantized vectors + // Size: dim (uint8_t) + 
min_val (float) + delta (float) + sum (float) + sum_squares (float) + size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float); + std::vector v1_quantized(quantized_size); + std::vector v2_quantized(quantized_size); + test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 1234); + test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 5678); + + dist_func_t arch_opt_func; + float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim); + +#ifdef OPT_SVE2 + if (optimization.sve2) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_SVE2(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "SVE2 with dim " << dim; + optimization.sve2 = 0; + } +#endif +#ifdef OPT_SVE + if (optimization.sve) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_SVE(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "SVE with dim " << dim; + optimization.sve = 0; + } +#endif +#ifdef OPT_NEON_DOTPROD + if (optimization.asimddp && dim >= 64) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_NEON_DOTPROD(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "NEON_DOTPROD with dim " << dim; + optimization.asimddp = 0; + } +#endif +#ifdef OPT_NEON + if (optimization.asimd) { + unsigned char alignment = 0; + arch_opt_func = 
Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_NEON(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "NEON with dim " << dim; + optimization.asimd = 0; + } +#endif + +#ifdef OPT_AVX512_F_BW_VL_VNNI + if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) { + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, Choose_SQ8_SQ8_Cosine_implementation_AVX512F_BW_VL_VNNI(dim)) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_NEAR(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim), 0.01) + << "AVX512 with dim " << dim; + optimization.avx512f = 0; + } +#endif + + // Test default implementation + unsigned char alignment = 0; + arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization); + ASSERT_EQ(arch_opt_func, SQ8_SQ8_Cosine) + << "Unexpected distance function chosen for dim " << dim; + ASSERT_EQ(baseline, arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim)) + << "No optimization with dim " << dim; + ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim; +} + +// Note: This suite intentionally uses a larger dimension range (64–128) than SQ8OptFuncs. +// It is designed to exercise SQ8–SQ8 cosine implementations, including SIMD paths +// that are only enabled or meaningfully stressed for dimensions >= 64. 
+INSTANTIATE_TEST_SUITE_P(SQ8_SQ8OptFuncs, SQ8_SQ8_SpacesOptimizationTest,
+                         testing::Range(64UL, 64 * 2UL + 1));
+
+// Test self-distance: distance to itself should be 0 for cosine (normalized vectors)
+TEST(SQ8_SQ8_EdgeCases, SelfDistanceCosine) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = 128;
+
+    // [uint8 values (dim)] + [min, delta, sum, sum_squares] metadata floats
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v_quantized(quantized_size);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v_quantized.data(), dim, true);
+
+    float baseline = SQ8_SQ8_Cosine(v_quantized.data(), v_quantized.data(), dim);
+
+    // Self-distance for cosine should be close to 0
+    ASSERT_NEAR(baseline, 0.0f, 0.001f) << "Self-distance should be ~0 for cosine";
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_quantized.data(), v_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized self-distance should match baseline";
+        optimization.avx512f = 0;
+    }
+#endif
+
+    unsigned char alignment = 0;
+    auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+    ASSERT_EQ(baseline, arch_opt_func(v_quantized.data(), v_quantized.data(), dim))
+        << "No optimization self-distance should match baseline";
+    ASSERT_EQ(alignment, 0) << "No optimization with dim " << dim;
+}
+
+// Test symmetry: dist(v1, v2) == dist(v2, v1)
+TEST(SQ8_SQ8_EdgeCases, CosineSymmetryTest) {
+    size_t dim = 128;
+    auto optimization = getCpuOptimizationFeatures();
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v1_quantized(quantized_size);
+    std::vector<uint8_t> v2_quantized(quantized_size);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v1_quantized.data(), dim, true, 456, -1.0f,
+                                                        1.0f);
+    test_utils::populate_float_vec_to_sq8_with_metadata(v2_quantized.data(), dim, true, 123, -1.0f,
+                                                        1.0f);
+
+    unsigned char alignment = 0;
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim);
+        ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim);
+        ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim);
+        ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim);
+        ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float cos_12 = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        float cos_21 = arch_opt_func(v2_quantized.data(), v1_quantized.data(), dim);
+        ASSERT_EQ(cos_12, cos_21) << "Optimized cosine should be symmetric";
+        optimization.avx512f = 0;
+    }
+#endif
+    auto cosine_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr);
+    float cos_12 = cosine_func(v1_quantized.data(), v2_quantized.data(), dim);
+    float cos_21 = cosine_func(v2_quantized.data(), v1_quantized.data(), dim);
+    ASSERT_EQ(cos_12, cos_21) << "Cosine should be symmetric";
+}
+
+// Test with zero vector
+TEST(SQ8_SQ8_EdgeCases, CosineZeroVectorTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = 128;
+    std::vector<float> v_zero(dim, 0.0f);
+
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v_zero_quantized(quantized_size);
+    std::vector<uint8_t> v_nonzero_quantized(quantized_size);
+    test_utils::quantize_float_vec_to_sq8_with_metadata(v_zero.data(), dim,
+                                                        v_zero_quantized.data());
+    test_utils::populate_float_vec_to_sq8_with_metadata(v_nonzero_quantized.data(), dim, true);
+
+    float baseline = SQ8_SQ8_Cosine(v_zero_quantized.data(), v_nonzero_quantized.data(), dim);
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector Cosine should match baseline";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector Cosine should match baseline";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector Cosine should match baseline";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector Cosine should match baseline";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f) << "Optimized zero vector Cosine should match baseline";
+        optimization.avx512f = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr);
+    float result = arch_opt_func(v_zero_quantized.data(), v_nonzero_quantized.data(), dim);
+
+    ASSERT_NEAR(result, baseline, 0.01f) << "Zero vector Cosine should match baseline";
+}
+
+// Test with constant vector (all same values)
+TEST(SQ8_SQ8_EdgeCases, CosineConstantVectorTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = 128;
+    std::vector<float> v_const(dim, 0.5f);
+
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v_const_quantized(quantized_size);
+    std::vector<uint8_t> v_random_quantized(quantized_size);
+    spaces::GetNormalizeFunc<float>()(v_const.data(), dim);
+    test_utils::quantize_float_vec_to_sq8_with_metadata(v_const.data(), dim,
+                                                        v_const_quantized.data());
+    test_utils::populate_float_vec_to_sq8_with_metadata(v_random_quantized.data(), dim, true);
+
+    float baseline = SQ8_SQ8_Cosine(v_const_quantized.data(), v_random_quantized.data(), dim);
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized constant vector Cosine should match baseline";
+        optimization.avx512f = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr);
+    float result = arch_opt_func(v_const_quantized.data(), v_random_quantized.data(), dim);
+
+    ASSERT_NEAR(result, baseline, 0.01f) << "Constant vector Cosine should match baseline";
+}
+
+// Test with extreme values (-1 and 1 only)
+TEST(SQ8_SQ8_EdgeCases, CosineExtremeValuesTest) {
+    auto optimization = getCpuOptimizationFeatures();
+    size_t dim = 128;
+    std::vector<float> v1(dim), v2(dim);
+
+    // Alternating extreme values
+    for (size_t i = 0; i < dim; i++) {
+        v1[i] = (i % 2 == 0) ? 1.0f : -1.0f;
+        v2[i] = (i % 3 == 0) ? 1.0f : -1.0f;
+    }
+
+    spaces::GetNormalizeFunc<float>()(v1.data(), dim);
+    spaces::GetNormalizeFunc<float>()(v2.data(), dim);
+
+    size_t quantized_size = dim * sizeof(uint8_t) + 4 * sizeof(float);
+    std::vector<uint8_t> v1_quantized(quantized_size);
+    std::vector<uint8_t> v2_quantized(quantized_size);
+    test_utils::quantize_float_vec_to_sq8_with_metadata(v1.data(), dim, v1_quantized.data());
+    test_utils::quantize_float_vec_to_sq8_with_metadata(v2.data(), dim, v2_quantized.data());
+
+    float baseline = SQ8_SQ8_Cosine(v1_quantized.data(), v2_quantized.data(), dim);
+
+#ifdef OPT_SVE2
+    if (optimization.sve2) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.sve2 = 0;
+    }
+#endif
+#ifdef OPT_SVE
+    if (optimization.sve) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.sve = 0;
+    }
+#endif
+#ifdef OPT_NEON_DOTPROD
+    if (optimization.asimddp) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.asimddp = 0;
+    }
+#endif
+#ifdef OPT_NEON
+    if (optimization.asimd) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.asimd = 0;
+    }
+#endif
+#ifdef OPT_AVX512_F_BW_VL_VNNI
+    if (optimization.avx512f && optimization.avx512bw && optimization.avx512vnni) {
+        unsigned char alignment = 0;
+        auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, &optimization);
+        float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+        ASSERT_NEAR(result, baseline, 0.01f)
+            << "Optimized extreme values Cosine should match baseline";
+        optimization.avx512f = 0;
+    }
+#endif
+    unsigned char alignment = 0;
+    auto arch_opt_func = Cosine_SQ8_SQ8_GetDistFunc(dim, &alignment, nullptr);
+    float result = arch_opt_func(v1_quantized.data(), v2_quantized.data(), dim);
+
+    ASSERT_NEAR(result, baseline, 0.01f) << "Extreme values Cosine should match baseline";
+}
diff --git a/tests/utils/tests_utils.h b/tests/utils/tests_utils.h
index 64211b7ef..0479f2101 100644
--- a/tests/utils/tests_utils.h
+++ b/tests/utils/tests_utils.h
@@ -11,6 +11,7 @@
 #include
 #include
 #include "VecSim/spaces/normalize/compute_norm.h"
+#include "VecSim/spaces/spaces.h"
 #include "VecSim/types/float16.h"
 namespace test_utils {
@@ -66,43 +67,94 @@ static void populate_float16_vec(vecsim_types::float16 *v, const size_t dim, int
     }
 }
-static void quantize_float_vec_to_uint8(float *v, size_t dim, uint8_t *qv, int seed = 1234) {
+/*
+ * SQ8_SQ8 distance function without the algebraic optimizations
+ * uses the regular dequantization formula:
+ * IP = Σ((min1 + delta1 * q1_i) * (min2 + delta2 * q2_i))
+ * Used for testing the correctness of the optimized functions.
+ *
+ */
+static float SQ8_SQ8_NotOptimized_InnerProduct(const void *pVect1v, const void *pVect2v,
+                                               size_t dimension) {
+
+    const auto *pVect1 = static_cast<const uint8_t *>(pVect1v);
+    const auto *pVect2 = static_cast<const uint8_t *>(pVect2v);
+
+    // Get quantization parameters from pVect1
+    const float min_val1 = *reinterpret_cast<const float *>(pVect1 + dimension);
+    const float delta1 = *reinterpret_cast<const float *>(pVect1 + dimension + sizeof(float));
+
+    // Get quantization parameters from pVect2
+    const float min_val2 = *reinterpret_cast<const float *>(pVect2 + dimension);
+    const float delta2 = *reinterpret_cast<const float *>(pVect2 + dimension + sizeof(float));
+
+    // Compute inner product with dequantization
+    float res = 0.0f;
+    for (size_t i = 0; i < dimension; i++) {
+        res += (pVect1[i] * delta1 + min_val1) * (pVect2[i] * delta2 + min_val2);
+    }
+    return 1.0f - res;
+}
+
+static float SQ8_SQ8_NotOptimized_Cosine(const void *pVect1v, const void *pVect2v,
+                                         size_t dimension) {
+    return SQ8_SQ8_NotOptimized_InnerProduct(pVect1v, pVect2v, dimension);
+}
+/**
+ * Quantize float vector to SQ8 with precomputed sum and sum_squares.
+ * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [sum_squares
+ * (float)] where sum = Σv[i] and sum_squares = Σv[i]² (computed over the original float elements)
+ */
+static void quantize_float_vec_to_sq8_with_metadata(const float *v, size_t dim, uint8_t *qv) {
     float min_val = v[0];
     float max_val = v[0];
     for (size_t i = 1; i < dim; i++) {
         min_val = std::min(min_val, v[i]);
         max_val = std::max(max_val, v[i]);
     }
+
+    float sum = 0.0f;
+    float square_sum = 0.0f;
+    for (size_t i = 0; i < dim; i++) {
+        sum += v[i];
+        square_sum += v[i] * v[i];
+    }
+
     // Calculate delta
     float delta = (max_val - min_val) / 255.0f;
     if (delta == 0)
         delta = 1.0f; // Avoid division by zero
-    float norm = 0.0f;
+
+    // Quantize each value
     for (size_t i = 0; i < dim; i++) {
         float normalized = (v[i] - min_val) / delta;
         normalized = std::max(0.0f, std::min(255.0f, normalized));
         qv[i] = static_cast<uint8_t>(std::round(normalized));
-        norm += (qv[i] * delta + min_val) * (qv[i] * delta + min_val);
     }
-    float inv_norm = 1.0f / std::sqrt(norm);
-    // Store parameters
+
+    // Store parameters: [min, delta, sum, square_sum]
     float *params = reinterpret_cast<float *>(qv + dim);
     params[0] = min_val;
     params[1] = delta;
-    params[2] = inv_norm;
+    params[2] = sum;
+    params[3] = square_sum;
 }
-static void populate_float_vec_to_sq8(uint8_t *v, size_t dim, int seed = 1234) {
-
-    std::mt19937 gen(seed); // Mersenne Twister engine initialized with the fixed seed
-    std::uniform_real_distribution<float> dis(-1.0f, 1.0f);
+/**
+ * Populate a float vector and quantize to SQ8 with precomputed sum and sum_squares.
+ * Vector layout: [uint8_t values (dim)] [min (float)] [delta (float)] [sum (float)] [sum_squares
+ * (float)]
+ */
+static void populate_float_vec_to_sq8_with_metadata(uint8_t *v, size_t dim,
+                                                    bool should_normalize = false, int seed = 1234,
+                                                    float min = -1.0f, float max = 1.0f) {
     std::vector<float> vec(dim);
-    for (size_t i = 0; i < dim; i++) {
-        vec[i] = dis(gen);
+    populate_float_vec(vec.data(), dim, seed, min, max);
+    if (should_normalize) {
+        spaces::GetNormalizeFunc<float>()(vec.data(), dim);
     }
-    quantize_float_vec_to_uint8(vec.data(), dim, v, seed);
+    quantize_float_vec_to_sq8_with_metadata(vec.data(), dim, v);
 }
 template