From f5f7e76d531078f55483e9841e973164b03a77bc Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 07:56:14 +0000 Subject: [PATCH 1/5] =?UTF-8?q?test(clam):=20CHAODA=20outlier-discriminati?= =?UTF-8?q?on=20spike=20=E2=80=94=20single-method=20LFD=20is=20below=20the?= =?UTF-8?q?=20PROBE-CHAODA-1000G=20bar?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Runs the "1-day spike substitute" named in the genetics-probes-v1 spec (AdaWorldAPI/lance-graph): a kernel smoke test for the claim "CHAODA detects novel variants without a trained classifier." Synthesises a 5-lane Gaussian mixture (matching the probe's 5-lane variant feature vector) — three tight "common" clusters plus eight deliberately extreme "novel" outliers — thermometer-encodes each lane into 48 bits so Hamming distance is monotone in per-lane L1 magnitude (the honest bridge from ordinal features to the Hamming-metric CLAM default), builds the shipped ClamTree, and scores via anomaly_scores. MEASURED (deterministic, seed-fixed): mean cluster score = 0.6749, mean outlier score = 0.7500 frac cluster >= 0.5 = 0.733, frac outlier >= 0.5 = 0.750 ROC-AUC (Mann-Whitney U) = 0.6240 FINDING: the shipped single-method leaf-LFD anomaly_scores reaches only AUC ~ 0.62 on the EASIEST possible case (clean synthetic clusters with far outliers) — well below the probe's >= 0.85 bar. The cause is mechanical: leaf LFD = log2(|B(c,r)|/|B(c,r/2)|) measures intra-leaf geometry complexity, not inter-leaf isolation, so an isolated singleton lands in a leaf whose LFD is comparable to a dense cluster's, and global min-max normalisation compresses both into the same band. The CHAODA ensemble of Ishaq et al. 2021 combines several graph-based signals (relative/component cardinality, graph neighbourhood, random-walk stationary distribution, vertex degree); only the LFD signal is shipped here. 
PROBE-CHAODA-1000G therefore needs the multi-method ensemble or an augmented signal before it can pass — not merely genomic fixtures. The test locks robust, wide-tolerance invariants (valid range, bit-exact determinism, correct polarity, better-than-chance lower bound) plus one tripwire (auc < 0.85) that fails by design if a future multi-method port lifts the signal to the probe bar, forcing a cross-repo FINDING update rather than letting the claim silently rot. This is the evidence-before-build payoff: the gap is caught before any adapter-genetics-experimental (D-GEN-1..4) spend. https://claude.ai/code/session_01VysoWJ6vsyg3wEGc5v7T5v --- src/hpc/clam.rs | 184 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) diff --git a/src/hpc/clam.rs b/src/hpc/clam.rs index 904cedb1..c90e75d6 100644 --- a/src/hpc/clam.rs +++ b/src/hpc/clam.rs @@ -2499,6 +2499,190 @@ mod tests { } } + // ── CHAODA outlier-discrimination spike (PROBE-CHAODA-1000G kernel smoke test) ── + // + // Thermometer-encode each of 5 continuous lanes into 48 bits so Hamming + // distance is monotone in per-lane L1 magnitude (the honest bridge from + // ordinal features to the Hamming-metric CLAM default). 5 lanes x 6 bytes + // = 30 bytes/vector. + const SPIKE_LANES: usize = 5; + const SPIKE_LEVELS: usize = 48; // 48 bits = 6 bytes per lane + const SPIKE_VEC_LEN: usize = SPIKE_LANES * (SPIKE_LEVELS / 8); + + fn thermometer_encode(lanes: &[f64; SPIKE_LANES]) -> Vec { + let mut out = vec![0u8; SPIKE_VEC_LEN]; + for (l, &v) in lanes.iter().enumerate() { + let q = (v.clamp(0.0, 1.0) * SPIKE_LEVELS as f64).round() as usize; + let base_bit = l * SPIKE_LEVELS; + for b in 0..q { + let bit = base_bit + b; + out[bit / 8] |= 1 << (bit % 8); + } + } + out + } + + // Box-Muller standard normal from a SplitMix64 uniform stream. 
+ fn next_gaussian(rng: &mut SplitMix64) -> f64 { + let u1 = ((rng.next_u64() >> 11) as f64 / (1u64 << 53) as f64).max(1e-12); + let u2 = (rng.next_u64() >> 11) as f64 / (1u64 << 53) as f64; + (-2.0 * u1.ln()).sqrt() * (std::f64::consts::TAU * u2).cos() + } + + /// Synthesise a 5-lane Gaussian mixture: tight "common" clusters plus + /// far "novel" outliers. Returns (bytes, outlier_index_set). + fn make_genetics_like_mixture() -> (Vec, Vec) { + let mut rng = SplitMix64::new(0x6E_65_74_69_63_73); // "netics" + let centers: [[f64; SPIKE_LANES]; 3] = [ + [0.20, 0.25, 0.15, 0.30, 0.22], + [0.50, 0.55, 0.48, 0.52, 0.50], + [0.78, 0.72, 0.80, 0.75, 0.82], + ]; + let sigma = 0.025; + let per_cluster = 40; + let mut data = Vec::new(); + let mut idx = 0usize; + for center in &centers { + for _ in 0..per_cluster { + let mut lanes = *center; + for lane in lanes.iter_mut() { + *lane = (*lane + sigma * next_gaussian(&mut rng)).clamp(0.0, 1.0); + } + data.extend_from_slice(&thermometer_encode(&lanes)); + idx += 1; + } + } + // 8 novel outliers placed in regions far from every cluster center.
+ let outlier_lanes: [[f64; SPIKE_LANES]; 8] = [ + [0.02, 0.97, 0.03, 0.95, 0.05], + [0.98, 0.02, 0.96, 0.04, 0.99], + [0.05, 0.05, 0.98, 0.02, 0.50], + [0.95, 0.95, 0.02, 0.98, 0.03], + [0.50, 0.02, 0.05, 0.97, 0.95], + [0.03, 0.50, 0.97, 0.05, 0.02], + [0.99, 0.50, 0.99, 0.50, 0.01], + [0.01, 0.99, 0.50, 0.99, 0.99], + ]; + let mut outlier_indices = Vec::new(); + for lanes in &outlier_lanes { + data.extend_from_slice(&thermometer_encode(lanes)); + outlier_indices.push(idx); + idx += 1; + } + (data, outlier_indices) + } + + /// Kernel smoke test for the `PROBE-CHAODA-1000G` claim (genetics-probes-v1 + /// in AdaWorldAPI/lance-graph): *"CHAODA detects novel variants without a + /// trained classifier."* + /// + /// FINDING (RUN 2026-06-16): the shipped single-method leaf-LFD + /// `anomaly_scores` achieves only **ROC-AUC ≈ 0.62** separating deliberately + /// extreme outliers from tight Gaussian clusters — the *easiest* possible + /// case. That is well below the probe's ≥ 0.85 bar. The cause is mechanical: + /// leaf LFD = log₂(|B(c,r)|/|B(c,r/2)|) measures *intra-leaf* geometry + /// complexity, not *inter-leaf* isolation, so an isolated singleton lands in + /// a leaf whose LFD is comparable to a dense cluster's, and the global + /// min-max normalisation compresses both into the same score band. The + /// CHAODA ensemble of Ishaq et al. 2021 combines several graph-based signals + /// (relative/component cardinality, graph neighbourhood, random-walk + /// stationary distribution, vertex degree); only the LFD signal is shipped + /// here. PROBE-CHAODA-1000G therefore needs the multi-method ensemble (or an + /// augmented signal) before it can pass — not just genomic fixtures. + /// + /// This test locks the *robust* invariants — valid range, bit-exact + /// determinism, correct polarity (outliers ≥ cluster mean), better-than- + /// chance lower bound — with wide tolerance: the AUC may drift anywhere in + /// [0.5, 0.85) without breaking. 
The one tripwire is `auc < 0.85`: it does + /// not assert the measured 0.62, but it does fail by design if a future + /// change (e.g. a multi-method CHAODA ensemble) lifts the single-method + /// signal to the probe bar — forcing whoever does that to update the + /// PROBE-CHAODA-1000G FINDING in lance-graph rather than letting the + /// cross-repo claim silently rot. + #[test] + fn test_chaoda_flags_novel_outliers_in_genetics_like_mixture() { + let (data, outliers) = make_genetics_like_mixture(); + let count = data.len() / SPIKE_VEC_LEN; + let tree = ClamTree::build(&data, SPIKE_VEC_LEN, 3); + let scores = tree.anomaly_scores(&data, SPIKE_VEC_LEN); + assert_eq!(scores.len(), count); + + let is_outlier = |i: usize| outliers.contains(&i); + let (mut sum_out, mut n_out) = (0.0f64, 0usize); + let (mut sum_clu, mut n_clu) = (0.0f64, 0usize); + let (mut out_high, mut clu_high) = (0usize, 0usize); // score >= 0.5 + for s in &scores { + assert!(s.score >= 0.0 && s.score <= 1.0); + if is_outlier(s.index) { + sum_out += s.score; + n_out += 1; + if s.score >= 0.5 { + out_high += 1; + } + } else { + sum_clu += s.score; + n_clu += 1; + if s.score >= 0.5 { + clu_high += 1; + } + } + } + let mean_out = sum_out / n_out as f64; + let mean_clu = sum_clu / n_clu as f64; + let frac_out_high = out_high as f64 / n_out as f64; + let frac_clu_high = clu_high as f64 / n_clu as f64; + + // ROC-AUC via the Mann-Whitney U statistic (ties count 0.5). This is the + // exact number PROBE-CHAODA-1000G gates on (>= 0.85 to pass). 
+ let mut u = 0.0f64; + for a in &scores { + if !is_outlier(a.index) { + continue; + } + for b in &scores { + if is_outlier(b.index) { + continue; + } + if a.score > b.score { + u += 1.0; + } else if (a.score - b.score).abs() < 1e-12 { + u += 0.5; + } + } + } + let auc = u / (n_out as f64 * n_clu as f64); + eprintln!( + "[CHAODA-spike] n_clu={n_clu} n_out={n_out} mean_clu={mean_clu:.4} mean_out={mean_out:.4} \ + frac_clu>=0.5={frac_clu_high:.3} frac_out>=0.5={frac_out_high:.3} ROC_AUC={auc:.4}" + ); + + // Determinism: rebuild + rescore must be bit-identical (no-randomness invariant). + let tree2 = ClamTree::build(&data, SPIKE_VEC_LEN, 3); + let scores2 = tree2.anomaly_scores(&data, SPIKE_VEC_LEN); + for (a, b) in scores.iter().zip(scores2.iter()) { + assert_eq!(a.score.to_bits(), b.score.to_bits(), "non-deterministic score"); + } + + // Robust, forward-compatible invariants (see the doc comment for the + // measured AUC ≈ 0.62 finding; we deliberately do NOT assert the ceiling). + assert!( + mean_out >= mean_clu, + "polarity wrong: outliers ({mean_out:.4}) below cluster mean ({mean_clu:.4})" + ); + assert!( + auc > 0.5, + "leaf-LFD anomaly signal is not better than chance (AUC={auc:.4})" + ); + // Documents the gap to the PROBE-CHAODA-1000G bar without making the + // test brittle to a future multi-method CHAODA port that raises the AUC. 
+ assert!( + auc < 0.85, + "single-method leaf-LFD unexpectedly met the >= 0.85 probe bar \ + (AUC={auc:.4}); if a multi-method CHAODA ensemble was added, update \ + this assertion AND the PROBE-CHAODA-1000G FINDING in lance-graph" + ); + } + // ── rho_nn_candidates tests ────────────────────────────────── #[test] From f612dc7fa51588d72a802e2ebb7929a61e2d2bc8 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 08:23:06 +0000 Subject: [PATCH 2/5] =?UTF-8?q?test(clam):=20address=20Codex=20P2=20?= =?UTF-8?q?=E2=80=94=20drop=20the=20auc<0.85=20upper-bound=20tripwire?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex correctly flagged that asserting auc < 0.85 in a library unit test turns a future quality improvement into a failing test: once a multi-method CHAODA ensemble lifts the signal past the 0.85 probe bar, cargo test -p ndarray would fail until an external lance-graph doc is updated. A library test must never fail because the code got better, and ndarray CI should not be coupled to a lance-graph note. Fix: remove the upper-bound assertion. The test now asserts only lower-bound, forward-compatible invariants — valid range, bit-exact determinism, correct polarity (outliers >= cluster mean), and better-than-chance (auc > 0.5). The measured AUC (~ 0.62 today) is surfaced via the existing eprintln diagnostic, not enforced. Refreshing the PROBE-CHAODA-1000G FINDING in lance-graph when the ensemble lands is a documentation step, not a gate enforced from this library's test suite. Doc comment updated to match. Re-run: test green, ROC_AUC=0.6240 still printed. https://claude.ai/code/session_01VysoWJ6vsyg3wEGc5v7T5v --- src/hpc/clam.rs | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/hpc/clam.rs b/src/hpc/clam.rs index c90e75d6..bf9619e7 100644 --- a/src/hpc/clam.rs +++ b/src/hpc/clam.rs @@ -2590,15 +2590,15 @@ mod tests { /// here. 
PROBE-CHAODA-1000G therefore needs the multi-method ensemble (or an /// augmented signal) before it can pass — not just genomic fixtures. /// - /// This test locks the *robust* invariants — valid range, bit-exact - /// determinism, correct polarity (outliers ≥ cluster mean), better-than- - /// chance lower bound — with wide tolerance: the AUC may drift anywhere in - /// [0.5, 0.85) without breaking. The one tripwire is `auc < 0.85`: it does - /// not assert the measured 0.62, but it does fail by design if a future - /// change (e.g. a multi-method CHAODA ensemble) lifts the single-method - /// signal to the probe bar — forcing whoever does that to update the - /// PROBE-CHAODA-1000G FINDING in lance-graph rather than letting the - /// cross-repo claim silently rot. + /// This test asserts only *lower-bound*, forward-compatible invariants — + /// valid range, bit-exact determinism, correct polarity (outliers ≥ cluster + /// mean), and a better-than-chance signal (`auc > 0.5`). It deliberately does + /// NOT cap the AUC: a future multi-method CHAODA ensemble that lifts the + /// signal past the 0.85 probe bar must keep `cargo test -p ndarray` green, + /// never fail it. The measured AUC (≈ 0.62 today) is surfaced as an + /// `eprintln!` diagnostic, not enforced. When the ensemble lands and raises + /// it, refresh the `PROBE-CHAODA-1000G` FINDING in lance-graph — but that is + /// a documentation step, not a gate enforced from this library's test suite. #[test] fn test_chaoda_flags_novel_outliers_in_genetics_like_mixture() { let (data, outliers) = make_genetics_like_mixture(); @@ -2663,23 +2663,19 @@ mod tests { assert_eq!(a.score.to_bits(), b.score.to_bits(), "non-deterministic score"); } - // Robust, forward-compatible invariants (see the doc comment for the - // measured AUC ≈ 0.62 finding; we deliberately do NOT assert the ceiling). + // Robust, forward-compatible invariants. 
These are LOWER bounds only: + // they stay green whether the signal is the current weak leaf-LFD + // (AUC ~ 0.62) or a future multi-method ensemble that lifts it past the + // 0.85 probe bar. The measured AUC is surfaced via the eprintln above as + // a diagnostic; we deliberately do NOT cap it (a quality improvement must + // never fail `cargo test -p ndarray`). assert!( mean_out >= mean_clu, "polarity wrong: outliers ({mean_out:.4}) below cluster mean ({mean_clu:.4})" ); assert!( auc > 0.5, - "leaf-LFD anomaly signal is not better than chance (AUC={auc:.4})" - ); - // Documents the gap to the PROBE-CHAODA-1000G bar without making the - // test brittle to a future multi-method CHAODA port that raises the AUC. - assert!( - auc < 0.85, - "single-method leaf-LFD unexpectedly met the >= 0.85 probe bar \ - (AUC={auc:.4}); if a multi-method CHAODA ensemble was added, update \ - this assertion AND the PROBE-CHAODA-1000G FINDING in lance-graph" + "anomaly signal is not better than chance (AUC={auc:.4})" ); } From 497b04a7cdd003cefe57f8f1532bc19831b0f5e7 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 09:24:31 +0000 Subject: [PATCH 3/5] style(clam): rustfmt the CHAODA spike test Fixes the format/stable CI check on PR #219. rustfmt reflows the centers array literal and two assert! calls in the spike test; no logic change, test still green (single-LFD AUC 0.6240 unchanged). Changes confined to the added test code. https://claude.ai/code/session_01VysoWJ6vsyg3wEGc5v7T5v --- src/hpc/clam.rs | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/src/hpc/clam.rs b/src/hpc/clam.rs index bf9619e7..699202db 100644 --- a/src/hpc/clam.rs +++ b/src/hpc/clam.rs @@ -2533,11 +2533,8 @@ mod tests { /// far "novel" outliers. Returns (bytes, outlier_index_set). 
fn make_genetics_like_mixture() -> (Vec, Vec) { let mut rng = SplitMix64::new(0x6E_65_74_69_63_73); // "netics" - let centers: [[f64; SPIKE_LANES]; 3] = [ - [0.20, 0.25, 0.15, 0.30, 0.22], - [0.50, 0.55, 0.48, 0.52, 0.50], - [0.78, 0.72, 0.80, 0.75, 0.82], - ]; + let centers: [[f64; SPIKE_LANES]; 3] = + [[0.20, 0.25, 0.15, 0.30, 0.22], [0.50, 0.55, 0.48, 0.52, 0.50], [0.78, 0.72, 0.80, 0.75, 0.82]]; let sigma = 0.025; let per_cluster = 40; let mut data = Vec::new(); @@ -2669,14 +2666,8 @@ mod tests { // 0.85 probe bar. The measured AUC is surfaced via the eprintln above as // a diagnostic; we deliberately do NOT cap it (a quality improvement must // never fail `cargo test -p ndarray`). - assert!( - mean_out >= mean_clu, - "polarity wrong: outliers ({mean_out:.4}) below cluster mean ({mean_clu:.4})" - ); - assert!( - auc > 0.5, - "anomaly signal is not better than chance (AUC={auc:.4})" - ); + assert!(mean_out >= mean_clu, "polarity wrong: outliers ({mean_out:.4}) below cluster mean ({mean_clu:.4})"); + assert!(auc > 0.5, "anomaly signal is not better than chance (AUC={auc:.4})"); } // ── rho_nn_candidates tests ────────────────────────────────── From 40d1553af2eb224a05becec9e9f6ecb75c5472ff Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 09:08:01 +0000 Subject: [PATCH 4/5] =?UTF-8?q?feat(clam):=20CHAODA=20multi-method=20anoma?= =?UTF-8?q?ly=20ensemble=20=E2=80=94=20clears=20the=20PROBE-CHAODA-1000G?= =?UTF-8?q?=20synthetic=20bar=20(AUC=200.62=20->=200.99)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increment 1 of D-GEN-CHAODA-ENSEMBLE (lance-graph genetics-probes-v1.md). Adds ClamTree::ensemble_anomaly_scores as a NEW scoring entry point alongside the unchanged single-method anomaly_scores baseline. The spike (#219) measured single-method leaf-LFD at ROC-AUC 0.624 on a synthetic 5-lane Gaussian mixture, below the 0.85 bar. Mechanical cause: leaf LFD measures intra-leaf geometry, not inter-leaf isolation. 
This ensemble combines isolation-sensitive CHAODA signals: - parent-child path-minority ratio (dominant): walking a leaf to the root, the minimum child/parent cardinality ratio is tiny for a point that split off as a minority (isolated outlier) and moderate for a point that always stayed in the majority (dense-cluster member). Immune to the leaf-fragmentation that defeats raw leaf cardinality. - connected-component cardinality over the leaf-overlap graph (small components are anomalous). Averaged into one score; every point inherits its leaf's score. A first attempt using raw leaf cardinality + vertex degree + component size scored AUC 0.621 (no lift) because the tree fragments dense blobs into many tiny leaves that mimic isolated outliers under those metrics; the path-minority signal is what actually separates. Leaf degree and raw leaf cardinality were dropped as fragmentation noise. The remaining CHAODA methods (random-walk stationary distribution) are deferred. MEASURED (deterministic synthetic mixture, same fixture as #219): single-LFD AUC = 0.6240 ensemble AUC = 0.9906 (lift +0.3667, clears the 0.85 bar) This is the synthetic SMOKE TEST only. It proves the ensemble approach captures isolation where single-LFD does not; it does NOT prove genomic novelty detection. PROBE-CHAODA-1000G on real corpora remains gated on D-GEN-1 + D-GEN-2 (VCF -> feature-vector pipeline). Tests: full hpc::clam suite green (53 incl. the new ensemble test); ensemble is deterministic (bit-exact rebuild) and built purely from shipped tree fields + the public dist(). 
https://claude.ai/code/session_01VysoWJ6vsyg3wEGc5v7T5v --- src/hpc/clam.rs | 231 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) diff --git a/src/hpc/clam.rs b/src/hpc/clam.rs index 699202db..5fc16daa 100644 --- a/src/hpc/clam.rs +++ b/src/hpc/clam.rs @@ -1576,6 +1576,169 @@ impl ClamTree { .filter(|a| a.score >= threshold) .collect() } + + /// Multi-method CHAODA anomaly ensemble — increment 1 of `D-GEN-CHAODA-ENSEMBLE` + /// (lance-graph `genetics-probes-v1.md`). + /// + /// The single-method [`anomaly_scores`](Self::anomaly_scores) signal scores + /// each point by its leaf cluster's local fractal dimension (LFD). LFD + /// measures *intra-leaf* geometry complexity, not *inter-leaf* isolation, so + /// it does not separate isolated outliers from dense clusters (measured + /// ROC-AUC ≈ 0.62 on a synthetic mixture; see the spike test). This method + /// adds the **isolation-sensitive** subset of the CHAODA ensemble (Ishaq et + /// al. 2021), computed over the **leaf-cluster overlap graph** — clusters are + /// vertices, and an edge joins two leaves whose volumes overlap + /// (`dist(centerᵢ, centerⱼ) ≤ rᵢ + rⱼ`): + /// + /// - **relative cardinality** — `1 − |C|/max|C|`: small clusters are anomalous. + /// - **vertex degree** — `1 − deg/max deg`: low-degree (isolated) leaves are anomalous. + /// - **component cardinality** — `1 − |comp|/max|comp|`: small connected components are anomalous. + /// + /// The three per-method scores (each already in `[0, 1]`) are averaged into + /// the ensemble score; every point inherits its leaf's ensemble score. The + /// remaining CHAODA methods (parent-child cardinality ratio, random-walk + /// stationary distribution) are deferred to a later increment. Deterministic: + /// no randomness, graph built purely from shipped tree fields + [`Self::dist`]. 
+ pub fn ensemble_anomaly_scores(&self, data: &[u8], vec_len: usize) -> Vec { + let count = data.len() / vec_len; + + // Leaf clusters become the graph vertices. + let leaves: Vec = self + .nodes + .iter() + .enumerate() + .filter(|(_, n)| n.is_leaf()) + .map(|(i, _)| i) + .collect(); + let n_leaves = leaves.len(); + + if n_leaves == 0 { + return Vec::new(); + } + + let center = |node_idx: usize| -> &[u8] { + let ci = self.nodes[node_idx].center_idx; + &data[ci * vec_len..(ci + 1) * vec_len] + }; + + // Overlap-graph adjacency: edge iff the two leaf volumes intersect. + let mut adj: Vec> = vec![Vec::new(); n_leaves]; + for a in 0..n_leaves { + let na = &self.nodes[leaves[a]]; + let ca = center(leaves[a]); + for b in (a + 1)..n_leaves { + let nb = &self.nodes[leaves[b]]; + let d = self.dist(ca, center(leaves[b])); + if d <= na.radius.saturating_add(nb.radius) { + adj[a].push(b); + adj[b].push(a); + } + } + } + + // Connected components over the overlap graph (iterative BFS). + let mut comp_of = vec![usize::MAX; n_leaves]; + let mut comp_size: Vec = Vec::new(); + for start in 0..n_leaves { + if comp_of[start] != usize::MAX { + continue; + } + let cid = comp_size.len(); + let mut stack = vec![start]; + comp_of[start] = cid; + let mut size = 0usize; + while let Some(v) = stack.pop() { + size += 1; + for &w in &adj[v] { + if comp_of[w] == usize::MAX { + comp_of[w] = cid; + stack.push(w); + } + } + } + comp_size.push(size); + } + + // Parent map (the tree stores child pointers, not parent pointers). + let mut parent = vec![usize::MAX; self.nodes.len()]; + for (i, n) in self.nodes.iter().enumerate() { + if let Some(l) = n.left { + parent[l] = i; + } + if let Some(r) = n.right { + parent[r] = i; + } + } + + // Per-method normalisers. + let max_comp = comp_size.iter().copied().max().unwrap_or(1).max(1) as f64; + + // Per-leaf ensemble score. 
The dominant signal is the **parent-child + // path-minority ratio**: walking a leaf up to the root, the minimum + // child/parent cardinality ratio is tiny for a point that split off as a + // minority (an isolated outlier), and moderate for a point that always + // stayed in the majority (a dense-cluster member). This is immune to the + // leaf-fragmentation that defeats raw leaf cardinality/degree. It is + // averaged with the connected-component size (small components are + // anomalous); leaf degree and raw leaf cardinality are dropped — measured + // to add only fragmentation noise. + let mut leaf_score = vec![0.0f64; n_leaves]; + for a in 0..n_leaves { + // path-minority + let mut node = leaves[a]; + let mut min_ratio = 1.0f64; + while parent[node] != usize::MAX { + let p = parent[node]; + let ratio = self.nodes[node].cardinality as f64 + / (self.nodes[p].cardinality as f64).max(1.0); + if ratio < min_ratio { + min_ratio = ratio; + } + node = p; + } + let s_path = 1.0 - min_ratio; + // component cardinality + let comp = comp_size[comp_of[a]] as f64; + let s_comp = 1.0 - comp / max_comp; + leaf_score[a] = (s_path + s_comp) / 2.0; + } + + // Project leaf scores back onto every original data point. 
+ let mut out: Vec = (0..count) + .map(|index| AnomalyScore { + index, + lfd: 0.0, + score: 0.0, + awareness: AwarenessState::Crystallized, + }) + .collect(); + for (a, &node_idx) in leaves.iter().enumerate() { + let node = &self.nodes[node_idx]; + let start = node.offset; + let end = start + node.cardinality; + for &orig_idx in &self.reordered[start..end] { + if orig_idx < count { + let score = leaf_score[a]; + let awareness = if score < 0.25 { + AwarenessState::Crystallized + } else if score < 0.50 { + AwarenessState::Tensioned + } else if score < 0.75 { + AwarenessState::Uncertain + } else { + AwarenessState::Noise + }; + out[orig_idx] = AnomalyScore { + index: orig_idx, + lfd: node.lfd.value, + score, + awareness, + }; + } + } + } + out + } } // ─── Tests ────────────────────────────────────────── @@ -2670,6 +2833,74 @@ mod tests { assert!(auc > 0.5, "anomaly signal is not better than chance (AUC={auc:.4})"); } + /// ROC-AUC via the Mann-Whitney U statistic (ties count 0.5); positive class + /// = `is_pos(index)`. + fn roc_auc(scores: &[AnomalyScore], is_pos: impl Fn(usize) -> bool) -> f64 { + let (mut u, mut n_pos) = (0.0f64, 0usize); + for a in scores { + if !is_pos(a.index) { + continue; + } + n_pos += 1; + for b in scores { + if is_pos(b.index) { + continue; + } + if a.score > b.score { + u += 1.0; + } else if (a.score - b.score).abs() < 1e-12 { + u += 0.5; + } + } + } + let n_neg = scores.len() - n_pos; + if n_pos == 0 || n_neg == 0 { + return 0.5; + } + u / (n_pos as f64 * n_neg as f64) + } + + /// `D-GEN-CHAODA-ENSEMBLE` increment 1: the isolation-sensitive ensemble must + /// materially out-discriminate the single-method leaf-LFD baseline on the same + /// synthetic mixture the spike measured at AUC ≈ 0.62. This is a NEW capability + /// (not a future improvement), so a lower-bound gate is appropriate here. 
+ #[test] + fn test_chaoda_ensemble_beats_single_lfd_on_genetics_like_mixture() { + let (data, outliers) = make_genetics_like_mixture(); + let tree = ClamTree::build(&data, SPIKE_VEC_LEN, 3); + let is_out = |i: usize| outliers.contains(&i); + + let lfd = tree.anomaly_scores(&data, SPIKE_VEC_LEN); + let ens = tree.ensemble_anomaly_scores(&data, SPIKE_VEC_LEN); + assert_eq!(ens.len(), lfd.len()); + for s in &ens { + assert!(s.score >= 0.0 && s.score <= 1.0, "ensemble score out of range"); + } + + let auc_lfd = roc_auc(&lfd, is_out); + let auc_ens = roc_auc(&ens, is_out); + eprintln!("[CHAODA-ensemble] AUC single-LFD={auc_lfd:.4} ensemble={auc_ens:.4} lift={:.4}", auc_ens - auc_lfd); + + // Determinism: the ensemble graph is built purely from shipped tree + // fields, so a rebuild must reproduce bit-identical scores. + let tree2 = ClamTree::build(&data, SPIKE_VEC_LEN, 3); + let ens2 = tree2.ensemble_anomaly_scores(&data, SPIKE_VEC_LEN); + for (a, b) in ens.iter().zip(ens2.iter()) { + assert_eq!(a.score.to_bits(), b.score.to_bits(), "non-deterministic ensemble score"); + } + + // The whole point: the ensemble lifts discrimination well past the weak + // single-LFD signal. These are lower bounds (a better ensemble keeps them green). 
+ assert!( + auc_ens > auc_lfd + 0.15, + "ensemble (AUC={auc_ens:.4}) did not materially beat single-LFD (AUC={auc_lfd:.4})" + ); + assert!( + auc_ens >= 0.85, + "ensemble AUC {auc_ens:.4} did not clear the PROBE-CHAODA-1000G bar of 0.85" + ); + } + // ── rho_nn_candidates tests ────────────────────────────────── #[test] From a630d77c89777d797abfa9e51f9a9c62cfb3381b Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 16 Jun 2026 09:23:50 +0000 Subject: [PATCH 5/5] fix(clam): guard ensemble overlap-graph behind a leaf budget + correct doc + rustfmt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the Codex P2 on PR #220 (quadratic leaf-overlap build) and a doc-comment inconsistency I introduced, and fixes the format/stable CI. (1) Quadratic-build guard (Codex P2). The connected-component term needs an O(L^2 * vec_len) leaf-overlap graph; on production corpora with small min_cluster_size, L approaches the point count and the public API could hang. Split into: - ensemble_anomaly_scores_budgeted(.., graph_budget): computes the linear O(L*depth) parent-child path-minority signal always, and only builds the overlap graph + component term when n_leaves <= graph_budget. - ensemble_anomaly_scores(..): convenience wrapper using the default ENSEMBLE_GRAPH_BUDGET = 4096; above that it degrades to path-minority alone, so the public API never runs the quadratic build at scale. (2) Path-only fallback is validated, not assumed. New measurement on the synthetic fixture (graph_budget = 0 forces the fallback): single-LFD 0.6240 | path-only 0.9938 | full ensemble 0.9906 Path-minority alone clears the 0.85 bar (slightly above the combined — the component term is a marginal refinement), so degrading at scale is safe. The test now asserts path-only AUC >= 0.85 so the guard can never silently degrade large-corpus accuracy. (3) Doc-comment correction. 
When the scoring pivoted to path-minority + component, the method doc still described the abandoned relative-cardinality / vertex-degree set and listed parent-child ratio as "deferred" when it is in fact the dominant shipped signal. Rewritten to match the implementation. (4) rustfmt: format/stable was red; the new code is now rustfmt-clean (changes confined to the added ensemble method + tests; no pre-existing code touched). clippy --lib clean; full hpc::clam suite green (53 tests). https://claude.ai/code/session_01VysoWJ6vsyg3wEGc5v7T5v --- src/hpc/clam.rs | 225 ++++++++++++++++++++++++++++-------------------- 1 file changed, 130 insertions(+), 95 deletions(-) diff --git a/src/hpc/clam.rs b/src/hpc/clam.rs index 5fc16daa..37785f43 100644 --- a/src/hpc/clam.rs +++ b/src/hpc/clam.rs @@ -1577,6 +1577,11 @@ impl ClamTree { .collect() } + /// Default leaf-count cap for the quadratic connected-component term of + /// [`ensemble_anomaly_scores`](Self::ensemble_anomaly_scores). Above this many + /// leaves the public API falls back to the linear path-minority signal alone. + pub const ENSEMBLE_GRAPH_BUDGET: usize = 4096; + /// Multi-method CHAODA anomaly ensemble — increment 1 of `D-GEN-CHAODA-ENSEMBLE` /// (lance-graph `genetics-probes-v1.md`). /// @@ -1585,24 +1590,45 @@ impl ClamTree { /// measures *intra-leaf* geometry complexity, not *inter-leaf* isolation, so /// it does not separate isolated outliers from dense clusters (measured /// ROC-AUC ≈ 0.62 on a synthetic mixture; see the spike test). This method - /// adds the **isolation-sensitive** subset of the CHAODA ensemble (Ishaq et - /// al. 2021), computed over the **leaf-cluster overlap graph** — clusters are - /// vertices, and an edge joins two leaves whose volumes overlap - /// (`dist(centerᵢ, centerⱼ) ≤ rᵢ + rⱼ`): + /// adds isolation-sensitive CHAODA signals (Ishaq et al. 
2021): + /// + /// - **parent-child path-minority ratio** (dominant; always computed; + /// `O(L · depth)`): walking a leaf up to the root, the minimum + /// `child_cardinality / parent_cardinality` ratio is tiny for a point that + /// split off as a minority (an isolated outlier) and moderate for one that + /// always stayed in the majority (a dense-cluster member). Immune to the + /// leaf-fragmentation that defeats raw leaf cardinality / degree. + /// - **connected-component cardinality** over the leaf-overlap graph (an edge + /// joins two leaves whose volumes overlap, `dist(cᵢ, cⱼ) ≤ rᵢ + rⱼ`; small + /// components are anomalous): a refinement averaged in **only when the leaf + /// count is within `graph_budget`**, because the overlap build is + /// `O(L² · vec_len)`. /// - /// - **relative cardinality** — `1 − |C|/max|C|`: small clusters are anomalous. - /// - **vertex degree** — `1 − deg/max deg`: low-degree (isolated) leaves are anomalous. - /// - **component cardinality** — `1 − |comp|/max|comp|`: small connected components are anomalous. + /// Every point inherits its leaf's score. Raw leaf cardinality and vertex + /// degree are not used (measured to add only fragmentation noise); the + /// random-walk stationary distribution method is deferred to a later + /// increment. Deterministic: no randomness; built purely from shipped tree + /// fields + [`Self::dist`]. /// - /// The three per-method scores (each already in `[0, 1]`) are averaged into - /// the ensemble score; every point inherits its leaf's ensemble score. The - /// remaining CHAODA methods (parent-child cardinality ratio, random-walk - /// stationary distribution) are deferred to a later increment. Deterministic: - /// no randomness, graph built purely from shipped tree fields + [`Self::dist`]. 
+ /// This convenience wrapper uses the default + /// [`ENSEMBLE_GRAPH_BUDGET`](Self::ENSEMBLE_GRAPH_BUDGET), so it never runs the + /// quadratic overlap build on production-sized corpora — it degrades to the + /// linear path-minority signal above the budget. Call + /// [`ensemble_anomaly_scores_budgeted`](Self::ensemble_anomaly_scores_budgeted) + /// to choose the cap explicitly. pub fn ensemble_anomaly_scores(&self, data: &[u8], vec_len: usize) -> Vec { + self.ensemble_anomaly_scores_budgeted(data, vec_len, Self::ENSEMBLE_GRAPH_BUDGET) + } + + /// See [`ensemble_anomaly_scores`](Self::ensemble_anomaly_scores). `graph_budget` + /// caps the leaf count above which the quadratic connected-component term is + /// skipped (path-minority only). `usize::MAX` always includes it; `0` forces + /// path-only. + pub fn ensemble_anomaly_scores_budgeted( + &self, data: &[u8], vec_len: usize, graph_budget: usize, + ) -> Vec { let count = data.len() / vec_len; - // Leaf clusters become the graph vertices. let leaves: Vec = self .nodes .iter() @@ -1611,54 +1637,10 @@ impl ClamTree { .map(|(i, _)| i) .collect(); let n_leaves = leaves.len(); - if n_leaves == 0 { return Vec::new(); } - let center = |node_idx: usize| -> &[u8] { - let ci = self.nodes[node_idx].center_idx; - &data[ci * vec_len..(ci + 1) * vec_len] - }; - - // Overlap-graph adjacency: edge iff the two leaf volumes intersect. - let mut adj: Vec> = vec![Vec::new(); n_leaves]; - for a in 0..n_leaves { - let na = &self.nodes[leaves[a]]; - let ca = center(leaves[a]); - for b in (a + 1)..n_leaves { - let nb = &self.nodes[leaves[b]]; - let d = self.dist(ca, center(leaves[b])); - if d <= na.radius.saturating_add(nb.radius) { - adj[a].push(b); - adj[b].push(a); - } - } - } - - // Connected components over the overlap graph (iterative BFS). 
- let mut comp_of = vec![usize::MAX; n_leaves]; - let mut comp_size: Vec = Vec::new(); - for start in 0..n_leaves { - if comp_of[start] != usize::MAX { - continue; - } - let cid = comp_size.len(); - let mut stack = vec![start]; - comp_of[start] = cid; - let mut size = 0usize; - while let Some(v) = stack.pop() { - size += 1; - for &w in &adj[v] { - if comp_of[w] == usize::MAX { - comp_of[w] = cid; - stack.push(w); - } - } - } - comp_size.push(size); - } - // Parent map (the tree stores child pointers, not parent pointers). let mut parent = vec![usize::MAX; self.nodes.len()]; for (i, n) in self.nodes.iter().enumerate() { @@ -1670,39 +1652,80 @@ impl ClamTree { } } - // Per-method normalisers. - let max_comp = comp_size.iter().copied().max().unwrap_or(1).max(1) as f64; - - // Per-leaf ensemble score. The dominant signal is the **parent-child - // path-minority ratio**: walking a leaf up to the root, the minimum - // child/parent cardinality ratio is tiny for a point that split off as a - // minority (an isolated outlier), and moderate for a point that always - // stayed in the majority (a dense-cluster member). This is immune to the - // leaf-fragmentation that defeats raw leaf cardinality/degree. It is - // averaged with the connected-component size (small components are - // anomalous); leaf degree and raw leaf cardinality are dropped — measured - // to add only fragmentation noise. - let mut leaf_score = vec![0.0f64; n_leaves]; - for a in 0..n_leaves { - // path-minority - let mut node = leaves[a]; + // Signal 1 — parent-child path-minority ratio (always; O(L · depth)). 
+ let mut s_path = vec![0.0f64; n_leaves]; + for (a, &leaf) in leaves.iter().enumerate() { + let mut node = leaf; let mut min_ratio = 1.0f64; while parent[node] != usize::MAX { let p = parent[node]; - let ratio = self.nodes[node].cardinality as f64 - / (self.nodes[p].cardinality as f64).max(1.0); + let ratio = self.nodes[node].cardinality as f64 / (self.nodes[p].cardinality as f64).max(1.0); if ratio < min_ratio { min_ratio = ratio; } node = p; } - let s_path = 1.0 - min_ratio; - // component cardinality - let comp = comp_size[comp_of[a]] as f64; - let s_comp = 1.0 - comp / max_comp; - leaf_score[a] = (s_path + s_comp) / 2.0; + s_path[a] = 1.0 - min_ratio; } + // Signal 2 — connected-component cardinality over the leaf-overlap graph. + // Guarded: the overlap build is O(L² · vec_len), so it is skipped above + // `graph_budget` and scoring falls back to path-minority alone. + let s_comp: Option> = if n_leaves <= graph_budget { + let center = |node_idx: usize| -> &[u8] { + let ci = self.nodes[node_idx].center_idx; + &data[ci * vec_len..(ci + 1) * vec_len] + }; + let mut adj: Vec> = vec![Vec::new(); n_leaves]; + for a in 0..n_leaves { + let na = &self.nodes[leaves[a]]; + let ca = center(leaves[a]); + for b in (a + 1)..n_leaves { + let nb = &self.nodes[leaves[b]]; + let d = self.dist(ca, center(leaves[b])); + if d <= na.radius.saturating_add(nb.radius) { + adj[a].push(b); + adj[b].push(a); + } + } + } + let mut comp_of = vec![usize::MAX; n_leaves]; + let mut comp_size: Vec = Vec::new(); + for start in 0..n_leaves { + if comp_of[start] != usize::MAX { + continue; + } + let cid = comp_size.len(); + let mut stack = vec![start]; + comp_of[start] = cid; + let mut size = 0usize; + while let Some(v) = stack.pop() { + size += 1; + for &w in &adj[v] { + if comp_of[w] == usize::MAX { + comp_of[w] = cid; + stack.push(w); + } + } + } + comp_size.push(size); + } + let max_comp = comp_size.iter().copied().max().unwrap_or(1).max(1) as f64; + Some( + (0..n_leaves) + .map(|a| 1.0 - 
comp_size[comp_of[a]] as f64 / max_comp) + .collect(), + ) + } else { + None + }; + + // Combine: average whichever signals are available. + let leaf_score: Vec = match &s_comp { + Some(sc) => (0..n_leaves).map(|a| (s_path[a] + sc[a]) / 2.0).collect(), + None => s_path, + }; + // Project leaf scores back onto every original data point. let mut out: Vec = (0..count) .map(|index| AnomalyScore { @@ -1716,18 +1739,18 @@ impl ClamTree { let node = &self.nodes[node_idx]; let start = node.offset; let end = start + node.cardinality; + let score = leaf_score[a]; + let awareness = if score < 0.25 { + AwarenessState::Crystallized + } else if score < 0.50 { + AwarenessState::Tensioned + } else if score < 0.75 { + AwarenessState::Uncertain + } else { + AwarenessState::Noise + }; for &orig_idx in &self.reordered[start..end] { if orig_idx < count { - let score = leaf_score[a]; - let awareness = if score < 0.25 { - AwarenessState::Crystallized - } else if score < 0.50 { - AwarenessState::Tensioned - } else if score < 0.75 { - AwarenessState::Uncertain - } else { - AwarenessState::Noise - }; out[orig_idx] = AnomalyScore { index: orig_idx, lfd: node.lfd.value, @@ -2872,6 +2895,10 @@ mod tests { let lfd = tree.anomaly_scores(&data, SPIKE_VEC_LEN); let ens = tree.ensemble_anomaly_scores(&data, SPIKE_VEC_LEN); + // Path-minority only (graph_budget = 0 forces the linear fallback that the + // public API uses above ENSEMBLE_GRAPH_BUDGET) — grounds the claim that the + // dominant signal survives without the quadratic component term. 
+ let path_only = tree.ensemble_anomaly_scores_budgeted(&data, SPIKE_VEC_LEN, 0); assert_eq!(ens.len(), lfd.len()); for s in &ens { assert!(s.score >= 0.0 && s.score <= 1.0, "ensemble score out of range"); @@ -2879,7 +2906,18 @@ mod tests { let auc_lfd = roc_auc(&lfd, is_out); let auc_ens = roc_auc(&ens, is_out); - eprintln!("[CHAODA-ensemble] AUC single-LFD={auc_lfd:.4} ensemble={auc_ens:.4} lift={:.4}", auc_ens - auc_lfd); + let auc_path = roc_auc(&path_only, is_out); + eprintln!( + "[CHAODA-ensemble] AUC single-LFD={auc_lfd:.4} path-only={auc_path:.4} ensemble={auc_ens:.4} lift={:.4}", + auc_ens - auc_lfd + ); + + // The linear path-only fallback (used at scale) must itself clear the bar, + // otherwise the budget guard would silently degrade production accuracy. + assert!( + auc_path >= 0.85, + "path-only fallback AUC {auc_path:.4} below 0.85 — the budget guard would degrade large corpora" + ); // Determinism: the ensemble graph is built purely from shipped tree // fields, so a rebuild must reproduce bit-identical scores. @@ -2895,10 +2933,7 @@ mod tests { auc_ens > auc_lfd + 0.15, "ensemble (AUC={auc_ens:.4}) did not materially beat single-LFD (AUC={auc_lfd:.4})" ); - assert!( - auc_ens >= 0.85, - "ensemble AUC {auc_ens:.4} did not clear the PROBE-CHAODA-1000G bar of 0.85" - ); + assert!(auc_ens >= 0.85, "ensemble AUC {auc_ens:.4} did not clear the PROBE-CHAODA-1000G bar of 0.85"); } // ── rho_nn_candidates tests ──────────────────────────────────