fix(pr217): address review (fmt + codex/coderabbit) — v3-clean

claude · claude · commit e563fdcd3ed9 · 2026-06-14T09:00:37.000Z
CI fmt: ran `cargo fmt --all` (edge_residue_probe / golden_helix_probe were committed unformatted). Correctness (codex / coderabbit): * simd_int_ops::gemm_u8_i8 — VNNI dispatch was compile-time `#[cfg(target_feature)]`, so the default x86-64-v3 GitHub build stripped both VNNI arms → scalar on Ice Lake / SPR / Zen 4 silicon (codex P2 regression). Now RUNTIME `is_x86_feature_detected!` (avx512vnni → avxvnni → scalar); compiles + reaches VNNI under v3, and removes the pre-existing `needless_return` clippy warning. * simd_avx2.rs U16x16 `shr`/`shl` — returned ZERO for any shift ∉{1,2,4,8}; now `_mm256_srl_epi16`/`_mm256_sll_epi16` with a runtime lane count (all shifts). * amx_matmul::for_dpbusd — tile 1/2 shapes now match the operand contract (tmm1 = VNNI kb/4×64, tmm2 = plain 16×kb); identical at kb=64 (tests unaffected), correct for kb<64. * backend::native gemv_f32/f64 — early-return on m==0 (don't slice `x[..n]` when there are no rows; matches the scalar reference no-op). * test_tile_zero_and_release — minimal config rewritten on the corrected XTILECFG offsets (colsb=4 @16 / rows=1 @48), with an explanatory note. Probes / docs: * amx_probe matmul_f32 validator — true relative-L2 + max-abs (the old `|e|.max(1.0)` denominator was an absolute test for |e|<1). * amx_rb_probe rb_32 — assert K % 64 == 0 (was silently truncating the tail). * doc `# Examples` (ignore) on the new public APIs: TileConfig::for_dpbusd_8, tile_dpbusd_2x2, F32x8::mul_add, F32x8::cmp_gt_mask. Validated under x86-64-v3 (GitHub target): clippy clean, `cargo build --examples` Finished; native AMX probes still all CORRECT. https://claude.ai/code/session_01D2WSmezQBNC3bUdHuGfGmo
diff --git a/examples/amx_probe.rs b/examples/amx_probe.rs
@@ -101,13 +101,21 @@ fn test_matmul_f32(m: usize, n: usize, k: usize) {
         ArrayViewMut2::from_shape((m, n), &mut got[..]).unwrap(),
     )
     .unwrap();
-    let mut max_rel = 0.0f32;
+    // True relative metric: L2 relative error ‖got−exp‖ / ‖exp‖ (robust to
+    // small individual outputs — the previous `|e|.max(1.0)` denominator turned
+    // every |e|<1 cell into an absolute-error test) plus the max absolute error.
+    let mut sq_err = 0.0f64;
+    let mut sq_ref = 0.0f64;
+    let mut max_abs = 0.0f32;
     for (g, e) in got.iter().zip(&exp) {
-        let denom = e.abs().max(1.0);
-        max_rel = max_rel.max((g - e).abs() / denom);
+        let d = g - e;
+        sq_err += (d as f64) * (d as f64);
+        sq_ref += (*e as f64) * (*e as f64);
+        max_abs = max_abs.max(d.abs());
     }
-    let verdict = if max_rel < 0.05 { "CORRECT" } else { "WRONG  " };
-    println!("  matmul_f32  {m:>4}x{k:>4}x{n:>4}  {verdict}  max_rel_err = {max_rel:.4}");
+    let rel_l2 = (sq_err.sqrt() / sq_ref.sqrt().max(1e-12)) as f32;
+    let verdict = if rel_l2 < 0.02 { "CORRECT" } else { "WRONG  " };
+    println!("  matmul_f32  {m:>4}x{k:>4}x{n:>4}  {verdict}  rel-L2 {rel_l2:.4}  max-abs {max_abs:.4}");
 }
 
 fn main() {
diff --git a/examples/amx_rb_probe.rs b/examples/amx_rb_probe.rs
@@ -27,6 +27,7 @@ fn ref_32(a: &[u8], b: &[i8], k: usize) -> Vec<i32> {
 
 /// 32×32 = A(32×k u8) · B(k×32 i8) via the 2×2 register-blocked AMX kernel.
 fn rb_32(a: &[u8], b: &[i8], k: usize) -> Vec<i32> {
+    assert_eq!(k % 64, 0, "rb_32: K must be a multiple of 64 (TDPBUSD tile depth)");
     // Pack the two 16-wide B column bands into VNNI quads.
     let mut b0 = vec![0i8; k * 16];
     let mut b1 = vec![0i8; k * 16];
diff --git a/examples/edge_residue_probe.rs b/examples/edge_residue_probe.rs
@@ -37,11 +37,20 @@ fn splitmix(s: &mut u64) -> f32 {
 fn quantize_i8(x: &[f32]) -> (Vec<i8>, f32) {
     let amax = x.iter().fold(0.0f32, |a, &v| a.max(v.abs())).max(1e-12);
     let scale = 127.0 / amax;
-    (x.iter().map(|&v| (v * scale).round().clamp(-127.0, 127.0) as i8).collect(), scale)
+    (
+        x.iter()
+            .map(|&v| (v * scale).round().clamp(-127.0, 127.0) as i8)
+            .collect(),
+        scale,
+    )
 }
 
 fn l2(a: &[f32], b: &[f32]) -> f32 {
-    a.iter().zip(b).map(|(x, y)| (x - y) * (x - y)).sum::<f32>().sqrt()
+    a.iter()
+        .zip(b)
+        .map(|(x, y)| (x - y) * (x - y))
+        .sum::<f32>()
+        .sqrt()
 }
 
 fn run(n: usize, d: usize, k: usize, noise: f32) {
@@ -78,7 +87,9 @@ fn run(n: usize, d: usize, k: usize, noise: f32) {
     let amx_ns = t0.elapsed().as_nanos() as f64;
 
     // ||c_j||² in the i8 domain (same scale as v) for the argmin.
-    let cnorm: Vec<i32> = (0..k).map(|c| (0..d).map(|j| (cb_i8[c * d + j] as i32).pow(2)).sum()).collect();
+    let cnorm: Vec<i32> = (0..k)
+        .map(|c| (0..d).map(|j| (cb_i8[c * d + j] as i32).pow(2)).sum())
+        .collect();
     // idx[i] = argmax_j (2·G[i][j] − ||c_j||²)  ≡  argmin_j ||v_i − c_j||².
     let mut idx = vec![0u32; n];
     for i in 0..n {
diff --git a/examples/golden_helix_probe.rs b/examples/golden_helix_probe.rs
@@ -119,13 +119,17 @@ fn main() {
 
     println!("\n[2] Fisher-z percentile rank as a no-cosine normalised key:");
     // A deterministic spread of cosine similarities in (−1, 1).
-    let mut sims: Vec<f64> = (0..1000).map(|i| -0.999 + 1.998 * (i as f64 + 0.5) / 1000.0).collect();
+    let mut sims: Vec<f64> = (0..1000)
+        .map(|i| -0.999 + 1.998 * (i as f64 + 0.5) / 1000.0)
+        .collect();
     // Percentile rank of fisher_z(s). Both fisher_z and ranking are monotone in s,
     // so the rank order must equal the cosine order — verify (Spearman == 1).
     let mut idx: Vec<usize> = (0..sims.len()).collect();
     idx.sort_by(|&a, &b| fisher_z(sims[a]).partial_cmp(&fisher_z(sims[b])).unwrap());
     let inversions = idx.windows(2).filter(|w| sims[w[0]] > sims[w[1]]).count();
-    println!("    rank-order vs cosine-order inversions: {inversions} (0 ⇒ ordering fully preserved, no cosine needed)");
+    println!(
+        "    rank-order vs cosine-order inversions: {inversions} (0 ⇒ ordering fully preserved, no cosine needed)"
+    );
 
     // Rim-stretch: resolution (Δz per unit Δs) near the rim vs the centre.
     sims.sort_by(|a, b| a.partial_cmp(b).unwrap());
diff --git a/src/backend/native.rs b/src/backend/native.rs
@@ -287,6 +287,9 @@ pub fn gemm_f64(
 /// SIMD tiers compute each row via [`dot_f32`]; the scalar tier uses
 /// the byte-stable [`scalar::gemv_f32`] reference.
 pub fn gemv_f32(m: usize, n: usize, alpha: f32, a: &[f32], lda: usize, x: &[f32], beta: f32, y: &mut [f32]) {
+    if m == 0 {
+        return; // no rows ⇒ no-op; must not slice `x[..n]` (scalar ref returns too)
+    }
     match tier() {
         Tier::Scalar => scalar::gemv_f32(m, n, alpha, a, lda, x, beta, y),
         // Avx512 + Avx2: per-row SIMD dot product. `dot_f32` itself
@@ -307,6 +310,9 @@ pub fn gemv_f32(m: usize, n: usize, alpha: f32, a: &[f32], lda: usize, x: &[f32]
 /// SIMD tiers compute each row via [`dot_f64`]; the scalar tier uses
 /// the byte-stable [`scalar::gemv_f64`] reference.
 pub fn gemv_f64(m: usize, n: usize, alpha: f64, a: &[f64], lda: usize, x: &[f64], beta: f64, y: &mut [f64]) {
+    if m == 0 {
+        return; // no rows ⇒ no-op; must not slice `x[..n]` (scalar ref returns too)
+    }
     match tier() {
         Tier::Scalar => scalar::gemv_f64(m, n, alpha, a, lda, x, beta, y),
         _ => {
diff --git a/src/hpc/amx_matmul.rs b/src/hpc/amx_matmul.rs
@@ -63,13 +63,16 @@ impl TileConfig {
         cfg.data[16] = 64; // colsb[0] low (u16 @ 16); high byte @17 stays 0
         cfg.data[48] = 16; // rows[0]      (u8  @ 48)
 
-        // Tile 1 (A): 16 rows × kb colbytes (u8 left operand).
-        cfg.data[18] = kb as u8; // colsb[1] low (u16 @ 18); high byte @19 stays 0
-        cfg.data[49] = 16; // rows[1]            (u8  @ 49)
+        // Tile 1 (B, VNNI K×N → VEX.vvvv): kb/4 rows × 64 colbytes. The kernel
+        // loads the VNNI operand into tmm1, so tile 1 must carry the VNNI shape.
+        // (Was the plain 16×kb shape — equal to this only at kb=64; backwards
+        // for kb<64, which would mis-shape a tail kernel / external caller.)
+        cfg.data[18] = 64; // colsb[1] low (u16 @ 18); high byte @19 stays 0
+        cfg.data[49] = (kb / 4) as u8; // rows[1] (u8 @ 49)
 
-        // Tile 2 (B): kb/4 rows × 64 colbytes (VNNI-packed right operand).
-        cfg.data[20] = 64; // colsb[2] low (u16 @ 20); high byte @21 stays 0
-        cfg.data[50] = (kb / 4) as u8; // rows[2] (u8 @ 50)
+        // Tile 2 (A, plain M×K → ModRM.rm): 16 rows × kb colbytes.
+        cfg.data[20] = kb as u8; // colsb[2] low (u16 @ 20); high byte @21 stays 0
+        cfg.data[50] = 16; // rows[2] (u8 @ 50)
 
         cfg
     }
@@ -80,6 +83,17 @@ impl TileConfig {
     /// (vvvv/signed). Every tile is 16×64 so one config serves all roles. Same
     /// XTILECFG layout as [`Self::for_dpbusd`]: colsb[t] u16 @ 16+2t, rows[t]
     /// u8 @ 48+t.
+    ///
+    /// # Examples
+    /// ```ignore
+    /// use ndarray::hpc::amx_matmul::{tile_loadconfig, tile_release, TileConfig};
+    /// // SAFETY: requires AMX (gate on `amx_available()`); all 8 tiles are 16×64.
+    /// unsafe {
+    ///     tile_loadconfig(&TileConfig::for_dpbusd_8());
+    ///     // load A→tmm4/tmm5, B-VNNI→tmm6/tmm7, zero tmm0-3, then tile_dpbusd_2x2()
+    ///     tile_release();
+    /// }
+    /// ```
     pub fn for_dpbusd_8() -> Self {
         let mut cfg = TileConfig { data: [0u8; 64] };
         cfg.data[0] = 1; // palette 1
@@ -291,6 +305,21 @@ pub unsafe fn tile_dpbusd() {
 ///   C10 dst2 rm5 vvvv6 → C4 E2 49 5E D5   C11 dst3 rm5 vvvv7 → C4 E2 41 5E DD
 /// All eight operand tiles (0/1/2/3 dst, 4/5 A, 6/7 B) are distinct → no #UD.
 ///
+/// # Examples
+/// ```ignore
+/// use ndarray::hpc::amx_matmul::*;
+/// // SAFETY: requires AMX; full 32×32 register-blocked tile contract.
+/// unsafe {
+///     tile_loadconfig(&TileConfig::for_dpbusd_8());
+///     tile_zero(0); tile_zero(1); tile_zero(2); tile_zero(3); // C accumulators
+///     tile_load(4, a0_ptr, k); tile_load(5, a1_ptr, k);       // A rows (rm)
+///     tile_load(6, b0_vnni, 64); tile_load(7, b1_vnni, 64);   // B cols (vvvv)
+///     tile_dpbusd_2x2();                                       // 4 TDPBUSDs
+///     tile_store(0, c00, n * 4); /* … tmm1/2/3 → other quadrants … */
+///     tile_release();
+/// }
+/// ```
+///
 /// # Safety
 /// Tiles 0-7 configured (`TileConfig::for_dpbusd_8`) and 4/5/6/7 loaded.
 #[inline]
@@ -842,11 +871,14 @@ mod tests {
             return;
         }
         unsafe {
-            // Minimal config: just tile 0, 1 row × 4 bytes
+            // Minimal valid tile 0: 1 row × 4 colbytes, using the CORRECTED
+            // XTILECFG offsets (colsb[t] u16 @ 16+2t, rows[t] u8 @ 48+t). The
+            // old code wrote data[16]=1/data[48]=4 which under the fixed layout
+            // means colsb=1/rows=4 — still valid, but mislabeled; now explicit.
             let mut cfg = TileConfig { data: [0u8; 64] };
             cfg.data[0] = 1; // palette 1
-            cfg.data[16] = 1; // tile 0: 1 row
-            cfg.data[48] = 4; // tile 0: 4 colbytes
+            cfg.data[16] = 4; // colsb[0] = 4 bytes  (u16 @ 16)
+            cfg.data[48] = 1; // rows[0]  = 1 row    (u8  @ 48)
 
             tile_loadconfig(&cfg);
             // TILEZERO tmm0
diff --git a/src/simd_avx2.rs b/src/simd_avx2.rs
@@ -1603,29 +1603,18 @@ impl U16x16 {
     /// Logical right shift each 16-bit lane by `imm` (matches `U16x32::shr`).
     #[inline(always)]
     pub fn shr(self, imm: u32) -> Self {
-        Self(unsafe {
-            match imm {
-                1 => _mm256_srli_epi16(self.0, 1),
-                2 => _mm256_srli_epi16(self.0, 2),
-                4 => _mm256_srli_epi16(self.0, 4),
-                8 => _mm256_srli_epi16(self.0, 8),
-                _ => _mm256_setzero_si256(),
-            }
-        })
+        // SAFETY: AVX2 baseline; `_mm256_srl_epi16` takes a runtime lane count
+        // from the low 64 bits of an xmm, so every shift amount works (the
+        // earlier `match {1,2,4,8}` returned zero for all other amounts).
+        Self(unsafe { _mm256_srl_epi16(self.0, _mm_cvtsi32_si128(imm as i32)) })
     }
 
     /// Logical left shift each 16-bit lane by `imm` (matches `U16x32::shl`).
     #[inline(always)]
     pub fn shl(self, imm: u32) -> Self {
-        Self(unsafe {
-            match imm {
-                1 => _mm256_slli_epi16(self.0, 1),
-                2 => _mm256_slli_epi16(self.0, 2),
-                4 => _mm256_slli_epi16(self.0, 4),
-                8 => _mm256_slli_epi16(self.0, 8),
-                _ => _mm256_setzero_si256(),
-            }
-        })
+        // SAFETY: AVX2 baseline; `_mm256_sll_epi16` takes a runtime lane count
+        // (same fix as `shr` — the `match {1,2,4,8}` zeroed all other amounts).
+        Self(unsafe { _mm256_sll_epi16(self.0, _mm_cvtsi32_si128(imm as i32)) })
     }
 
     /// Multiply, keep low 16 bits (wrapping) — `_mm256_mullo_epi16`.
diff --git a/src/simd_avx512.rs b/src/simd_avx512.rs
@@ -1351,6 +1351,14 @@ impl PartialEq for U16x32 {
 // reduction needs an 8-wide FMA.
 impl F32x8 {
     /// Fused multiply-add: `self * a + b`, single rounding (`_mm256_fmadd_ps`).
+    ///
+    /// # Examples
+    /// ```ignore
+    /// let a = F32x8::splat(0.5);
+    /// let b = F32x8::splat(2.0);
+    /// let c = F32x8::splat(1.0);
+    /// assert_eq!(a.mul_add(b, c).to_array(), [2.0; 8]); // 0.5*2.0 + 1.0
+    /// ```
     #[inline(always)]
     pub fn mul_add(self, a: Self, b: Self) -> Self {
         // SAFETY: FMA3 intrinsic; reached only on FMA-capable targets via the
@@ -1363,6 +1371,14 @@ impl F32x8 {
     /// + `_mm256_movemask_ps`. The FastScan heap threshold-prune uses it to skip
     /// an 8-lane score chunk that holds no candidate above the current heap-min
     /// in a single instruction — the SIMD early-out the scalar `>hmin` scan loses.
+    ///
+    /// # Examples
+    /// ```ignore
+    /// let a = F32x8::from_array([3.0, 0.0, 5.0, 0.0, 3.0, 0.0, 5.0, 0.0]);
+    /// let b = F32x8::splat(1.0);
+    /// // lanes 0,2,4,6 are > 1.0 ⇒ bits 0,2,4,6 set = 0b0101_0101 = 0x55.
+    /// assert_eq!(a.cmp_gt_mask(b), 0x55);
+    /// ```
     #[inline(always)]
     pub fn cmp_gt_mask(self, other: Self) -> u32 {
         // SAFETY: AVX `vcmpps` + `vmovmskps`; available wherever this 256-bit
diff --git a/src/simd_int_ops.rs b/src/simd_int_ops.rs
@@ -277,32 +277,27 @@ pub fn gemm_u8_i8(a: &[u8], b: &[i8], c: &mut [i32], m: usize, n: usize, k: usiz
         }
     }
 
-    // Compile-time dispatch chain (tiers 1-3). Exactly one arm survives
-    // per build; the others are stripped by `#[cfg]` so the compiler
-    // emits a direct call to the chosen kernel with no runtime branch.
-
-    #[cfg(all(target_arch = "x86_64", target_feature = "avx512vnni"))]
-    {
-        // SAFETY: `target_feature = "avx512vnni"` at this site guarantees
-        // AVX-512F + VNNI + BW (the kernel's `#[target_feature(enable)]`
-        // set). The dispatcher is the safety invariant the kernel relies on.
-        unsafe { crate::hpc::vnni_gemm::int8_gemm_vnni_avx512(a, b, c, m, n, k) };
-        return;
-    }
-
-    #[cfg(all(
-        target_arch = "x86_64",
-        target_feature = "avxvnni",
-        not(target_feature = "avx512vnni"),
-    ))]
+    // RUNTIME VNNI dispatch (tiers 1-2, after the AMX check above). This MUST
+    // be runtime `is_x86_feature_detected!`, NOT compile-time
+    // `#[cfg(target_feature)]`: the default x86-64-v3 build has neither
+    // avx512vnni nor avxvnni as a *compile* feature, so a cfg chain would strip
+    // both arms and fall through to scalar even on Ice Lake / Sapphire Rapids /
+    // Zen 4 silicon that supports VNNI at runtime (the regression codex flagged
+    // on PR #217). Runtime detection keeps the VNNI kernels reachable on the
+    // baseline build, matching the pre-consolidation `simd_caps()` behaviour.
+    #[cfg(target_arch = "x86_64")]
     {
-        // SAFETY: `target_feature = "avxvnni"` at this site guarantees
-        // AVX + AVX2 + AVX-VNNI (the kernel's `#[target_feature(enable)]`
-        // set). Arm only fires when AVX-512 VNNI is *not* present —
-        // Alder Lake / Arrow Lake without AVX-512, or Zen 4 builds that
-        // pinned a ymm-only target. The dispatcher is the safety invariant.
-        unsafe { crate::hpc::vnni_gemm::int8_gemm_avxvnni_ymm(a, b, c, m, n, k) };
-        return;
+        if std::is_x86_feature_detected!("avx512vnni") {
+            // SAFETY: avx512vnni detected ⇒ AVX-512F + VNNI + BW present, the
+            // kernel's `#[target_feature(enable)]` set.
+            unsafe { crate::hpc::vnni_gemm::int8_gemm_vnni_avx512(a, b, c, m, n, k) };
+            return;
+        }
+        if std::is_x86_feature_detected!("avxvnni") {
+            // SAFETY: avxvnni detected ⇒ AVX + AVX2 + AVX-VNNI present.
+            unsafe { crate::hpc::vnni_gemm::int8_gemm_avxvnni_ymm(a, b, c, m, n, k) };
+            return;
+        }
     }
 
     // Fallback: scalar reference kernel. Always correct; same result the