Skip to content

Commit 6185433

Browse files
committed
probe(morton): 2×2 Morton quadtree cascade over gridlake SoA (validated)
First probe toward the Morton-tile cascade substrate (codec-agnostic; the f32 cell value stands in for the eventual palette256 / helix Fisher-2z per-cell codec — wiring that is the next step). Substrate proven: * 4×4 Morton leaf tile = 16 cells = one F32x16 loaded from `simd_soa::MultiLaneColumn` (the "2bit×2bit" tile; gridlake SoA carrier). * Quadtree over T×T tiles (2×2 per level) ⇒ total grid (4T)² for T=2^k gives the ladder 64, 256, 1024, 4096, 16384, 64k, 256k. * Morton order ⇒ every quadtree node is a contiguous index range, so the aggregate min/max pyramid (the Belichtungsmesser "bands") is a flat bottom-up reduction; the cascade prunes a subtree whose [min,max] can't intersect the query band [q−r, q+r] (the 3-stroke band-miss early-exit, generalized). examples/morton_cascade_probe.rs, validated: cascade count == brute-force count for every (size, query) 64..262144 cells; selective queries prune ~63-67% of cells, broad queries correctly prune 0% (no over-prune). `required-features = ["std"]` (uses `ndarray::simd`). Next: wire the per-cell codec (palette256 / helix), the SIMD leaf mask via F32x16, and a fully non-materialized aggregate variant. https://claude.ai/code/session_01D2WSmezQBNC3bUdHuGfGmo
1 parent cdb012c commit 6185433

2 files changed

Lines changed: 207 additions & 0 deletions

File tree

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,11 @@ required-features = ["std"]
5656
name = "amx_rb_probe"
5757
required-features = ["std"]
5858

59+
# Morton cascade probe imports `ndarray::simd` (std-gated).
60+
[[example]]
61+
name = "morton_cascade_probe"
62+
required-features = ["std"]
63+
5964
[dependencies]
6065
num-integer = { workspace = true }
6166
num-traits = { workspace = true }

examples/morton_cascade_probe.rs

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
1+
//! Morton 2×2 cascade probe — non-materialized Z-order quadtree over the
2+
//! gridlake SoA carrier, with Belichtungsmesser-style min/max early-exit.
3+
//!
4+
//! ## What this validates (probe-first, codec-agnostic)
5+
//!
6+
//! - **4×4 Morton leaf tile = one `F32x16`** ("2bit×2bit": 2 bits X, 2 bits Y =
7+
//! 16 cells = 64 bytes = one AVX-512 register loaded from `MultiLaneColumn`).
8+
//! - **Quadtree over T×T tiles** (2×2 per level): total grid `(4T)²` for
9+
//! `T = 2^k` gives the ladder 64, 256, 1024, 4096, 16384, 64k, 256k.
10+
//! - **Morton order ⇒ every quadtree node is a contiguous index range**, so the
11+
//! aggregate (min/max) pyramid is a flat bottom-up reduction (the
12+
//! Belichtungsmesser "calibrated bands" = per-node value range).
13+
//! - **Cascade early-exit**: descend the pyramid; if a node's [min,max] can't
14+
//! intersect the query band [q−r, q+r], prune the whole subtree (the 3-stroke
15+
//! band-miss generalized). Leaf tile → `F32x16` load + test 16 cells.
16+
//!
17+
//! The cell value is a plain `f32` stand-in for the eventual per-cell codec
18+
//! (palette256 / helix Fisher-2z / Belichtungsmesser band) — wiring that is the
19+
//! next step; this probe proves the *substrate* (addressing + cascade) is
20+
//! correct and that the prune actually skips work.
21+
//!
22+
//! RUSTFLAGS="-C target-cpu=native" cargo run --release --example morton_cascade_probe
23+
//!
24+
//! PASS: cascade count == brute-force count for every (size, query); the
25+
//! reported prune-rate is the "boost".
26+
27+
use std::sync::Arc;
28+
29+
use ndarray::simd::{F32x16, MultiLaneColumn};
30+
31+
/// Interleave the low `bits` of `x` and `y` into a Z-order (Morton) index.
32+
/// x occupies even output bits, y the odd bits.
33+
fn morton2d(x: u32, y: u32, bits: u32) -> u32 {
34+
let mut m = 0u32;
35+
for b in 0..bits {
36+
m |= ((x >> b) & 1) << (2 * b);
37+
m |= ((y >> b) & 1) << (2 * b + 1);
38+
}
39+
m
40+
}
41+
42+
/// Deterministic SplitMix64 → f32 in [0, 1).
43+
fn splitmix(state: &mut u64) -> f32 {
44+
*state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
45+
let mut z = *state;
46+
z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
47+
z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
48+
z ^= z >> 31;
49+
((z >> 40) as f32) / (1u32 << 24) as f32
50+
}
51+
52+
/// Build the field in Morton-tile order:
53+
/// cell(x,y) → idx = morton(tx,ty)·16 + morton_in_tile(ix,iy)
54+
/// where tile (tx,ty) = (x>>2, y>>2), in-tile (ix,iy) = (x&3, y&3).
55+
/// Each 4×4 tile is therefore a contiguous 16-f32 (64-byte) `F32x16` chunk.
56+
fn build_field(t: u32, seed: u64) -> Vec<f32> {
57+
let side = 4 * t; // grid side
58+
let n = (side * side) as usize;
59+
let mut field = vec![0.0f32; n];
60+
let k = t.trailing_zeros(); // T = 2^k tiles per side
61+
let mut st = seed;
62+
for y in 0..side {
63+
for x in 0..side {
64+
let (tx, ty) = (x >> 2, y >> 2);
65+
let (ix, iy) = (x & 3, y & 3);
66+
let idx = (morton2d(tx, ty, k) as usize) * 16 + morton2d(ix, iy, 2) as usize;
67+
// A smooth-ish field + noise so neighbouring tiles share value ranges
68+
// (otherwise every node spans the full range and nothing prunes).
69+
let base = ((x as f32) / side as f32 + (y as f32) / side as f32) * 0.5;
70+
field[idx] = 0.85 * base + 0.15 * splitmix(&mut st);
71+
}
72+
}
73+
field
74+
}
75+
76+
/// Aggregate (min,max) pyramid over the T² tiles in Morton order. Level 0 =
77+
/// per-tile range (over its 16 cells); level l = range of 4 level-(l−1) nodes.
78+
/// A node at level l covers `4^l` contiguous tiles starting at `base`.
79+
struct Pyramid {
80+
levels: Vec<Vec<(f32, f32)>>, // levels[0] = per-tile, levels[K] = root
81+
k: u32, // number of quadtree levels (T = 2^k)
82+
}
83+
84+
impl Pyramid {
85+
fn build(field: &[f32], t: u32) -> Self {
86+
let k = t.trailing_zeros();
87+
let n_tiles = (t * t) as usize;
88+
// Level 0: min/max over each tile's 16 cells.
89+
let mut lvl0 = Vec::with_capacity(n_tiles);
90+
for tile in 0..n_tiles {
91+
let s = &field[tile * 16..tile * 16 + 16];
92+
let mut mn = f32::INFINITY;
93+
let mut mx = f32::NEG_INFINITY;
94+
for &v in s {
95+
mn = mn.min(v);
96+
mx = mx.max(v);
97+
}
98+
lvl0.push((mn, mx));
99+
}
100+
let mut levels = vec![lvl0];
101+
for l in 1..=k as usize {
102+
let prev = &levels[l - 1];
103+
let mut cur = Vec::with_capacity(prev.len() / 4);
104+
for node in prev.chunks_exact(4) {
105+
let mn = node.iter().map(|p| p.0).fold(f32::INFINITY, f32::min);
106+
let mx = node.iter().map(|p| p.1).fold(f32::NEG_INFINITY, f32::max);
107+
cur.push((mn, mx));
108+
}
109+
levels.push(cur);
110+
}
111+
Pyramid { levels, k }
112+
}
113+
}
114+
115+
/// Cascade query: count cells with |value − q| ≤ r, descending the quadtree and
116+
/// pruning any node whose [min,max] can't intersect [q−r, q+r]. Returns
117+
/// (count, cells_visited) — cells_visited only counts leaf cells actually tested.
118+
fn cascade_count(field: &[f32], col: &MultiLaneColumn, pyr: &Pyramid, q: f32, r: f32) -> (usize, usize) {
119+
let (lo, hi) = (q - r, q + r);
120+
let mut count = 0usize;
121+
let mut visited = 0usize;
122+
// Stack of (level, node_index_within_level).
123+
let mut stack = vec![(pyr.k as usize, 0usize)];
124+
let bytes = col.as_bytes();
125+
while let Some((level, node)) = stack.pop() {
126+
let (mn, mx) = pyr.levels[level][node];
127+
if mx < lo || mn > hi {
128+
continue; // band miss → prune whole subtree (early-exit)
129+
}
130+
if level == 0 {
131+
// Leaf tile = 16 cells = one F32x16 chunk in the SoA column.
132+
let off = node * 64; // 16 f32 × 4 bytes
133+
let chunk: [u8; 64] = bytes[off..off + 64].try_into().unwrap();
134+
let arr = f32x16_from_bytes(&chunk).to_array();
135+
for &v in arr.iter() {
136+
if (v - q).abs() <= r {
137+
count += 1;
138+
}
139+
}
140+
visited += 16;
141+
} else {
142+
let base = node * 4;
143+
for c in 0..4 {
144+
stack.push((level - 1, base + c));
145+
}
146+
}
147+
}
148+
let _ = field; // field kept for the brute-force reference; cascade reads the SoA column
149+
(count, visited)
150+
}
151+
152+
/// Build an `F32x16` from 64 little-endian bytes (one 4×4 Morton tile).
153+
fn f32x16_from_bytes(chunk: &[u8; 64]) -> F32x16 {
154+
let arr: [f32; 16] = core::array::from_fn(|i| {
155+
let o = i * 4;
156+
f32::from_le_bytes([chunk[o], chunk[o + 1], chunk[o + 2], chunk[o + 3]])
157+
});
158+
F32x16::from_array(arr)
159+
}
160+
161+
fn brute_count(field: &[f32], q: f32, r: f32) -> usize {
162+
field.iter().filter(|&&v| (v - q).abs() <= r).count()
163+
}
164+
165+
fn run(t: u32) {
166+
let side = 4 * t;
167+
let n = (side * side) as usize;
168+
let field = build_field(t, 0xC0FFEE ^ t as u64);
169+
// Wrap the Morton-ordered field bytes in the gridlake SoA carrier.
170+
let raw: Vec<u8> = field.iter().flat_map(|v| v.to_le_bytes()).collect();
171+
let col = MultiLaneColumn::new(Arc::from(raw.into_boxed_slice())).unwrap();
172+
let pyr = Pyramid::build(&field, t);
173+
174+
// Three query bands: tight (high prune), medium, broad (low prune).
175+
let queries = [(0.5f32, 0.02f32), (0.25, 0.10), (0.5, 0.5)];
176+
let mut all_ok = true;
177+
let mut report = String::new();
178+
for (q, r) in queries {
179+
let exp = brute_count(&field, q, r);
180+
let (got, visited) = cascade_count(&field, &col, &pyr, q, r);
181+
let ok = got == exp;
182+
all_ok &= ok;
183+
let prune = 100.0 * (1.0 - visited as f64 / n as f64);
184+
report.push_str(&format!(
185+
" q={q:.2} r={r:.2}: count {got:>7} (exp {exp:>7}) {} visited {visited:>8}/{n} → prune {prune:5.1}%\n",
186+
if ok { "OK " } else { "MISMATCH" }
187+
));
188+
}
189+
println!(
190+
" T={t:>3} tiles/side grid {side}×{side} = {n:>7} cells ({})\n{}",
191+
if all_ok { "CORRECT" } else { "WRONG" },
192+
report
193+
);
194+
}
195+
196+
fn main() {
197+
println!("== Morton 2×2 quadtree cascade probe (4×4 tile = F32x16, gridlake SoA) ==\n");
198+
// T = 2^k → grid (4T)² = the ladder 64, 256, 1024, 4096, 16384, 64k, 256k.
199+
for t in [2u32, 4, 8, 16, 32, 64, 128] {
200+
run(t);
201+
}
202+
}

0 commit comments

Comments
 (0)