diff --git a/src/core/tq_turbo_kv.c b/src/core/tq_turbo_kv.c index d32fd59..3794a8b 100644 --- a/src/core/tq_turbo_kv.c +++ b/src/core/tq_turbo_kv.c @@ -339,10 +339,11 @@ void tq_turbo_kv_3b_attention_ref(const float* query, const void* kv_cache, * 3-bit codebook has 8 entries which fit in 8 bytes — store in lower half * of a 16-byte register. Indices in 0-7. */ const float* cb = tq_codebook_centroids(3); + /* Used by both NEON and scalar paths — keep outside the NEON guard. */ + static const float CB3_I8_RECIP = 2.1520f / 127.0f; #ifdef __ARM_NEON static int8_t s_cb3_i8[16] = {0}; static int s_cb3_i8_init = 0; - static const float CB3_I8_RECIP = 2.1520f / 127.0f; if (!s_cb3_i8_init) { for (int j = 0; j < 8; j++) { float v = cb[j] * (127.0f / 2.1520f); @@ -559,11 +560,12 @@ void tq_turbo_kv_4b_attention_ref(const float* query, const void* kv_cache, * scale gives 16-element processing per ~10 NEON instructions vs * the previous ~32 scalar instructions. */ + /* Used by both NEON and scalar paths — keep outside the NEON guard. */ + static const float CB_I8_RECIP = 2.7326f / 127.0f; /* fp32 = int8 * recip */ #ifdef __ARM_NEON /* Static int8 codebook (computed once at startup; safe across blocks) */ static int8_t s_cb_i8[16] = {0}; static int s_cb_i8_init = 0; - static const float CB_I8_RECIP = 2.7326f / 127.0f; /* fp32 = int8 * recip */ if (!s_cb_i8_init) { for (int j = 0; j < 16; j++) { float v = cb[j] * (127.0f / 2.7326f); @@ -1299,10 +1301,11 @@ void tq_turbo_kv_5b_attention_ref(const float* query, const void* kv_cache, * within regression test thresholds). */ const float* cb = tq_codebook_centroids(5); + /* Used by both NEON and scalar paths — keep outside the NEON guard. */ + static const float CB5_I8_RECIP = 1.9956f / 127.0f; /* 5-bit max centroid */ #ifdef __ARM_NEON static int8_t s_cb5_i8[32] = {0}; static int s_cb5_i8_init = 0; - static const float CB5_I8_RECIP = 1.9956f / 127.0f; /* 5-bit max centroid */ if (!s_cb5_i8_init) { for (int j = 0; j < 32; j++) { float v = cb[j] * (127.0f / 1.9956f);