Skip to content

Commit f4af48d

Browse files
authored
perf: Pack successor instructions into cache-line gaps (#360)
1 parent a3319f9 commit f4af48d

14 files changed

Lines changed: 682 additions & 298 deletions

crates/plotnik-compiler/src/emit/layout.rs

Lines changed: 281 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
//! Cache-aligned instruction layout.
22
//!
33
//! Extracts linear chains from the control flow graph and places them
4-
//! contiguously. Pads instructions to prevent cache line straddling.
4+
//! contiguously. Packs successor instructions into free space of predecessor
5+
//! blocks for improved d-cache locality.
56
67
use std::collections::{BTreeMap, HashSet};
78

@@ -10,6 +11,170 @@ use crate::bytecode::{InstructionIR, Label, LayoutResult};
1011
const CACHE_LINE: usize = 64;
1112
const STEP_SIZE: usize = 8;
1213

14+
/// Intermediate representation for layout optimization.
15+
struct LayoutIR {
16+
blocks: Vec<Block>,
17+
label_to_block: BTreeMap<Label, usize>,
18+
label_to_offset: BTreeMap<Label, u8>,
19+
}
20+
21+
/// A 64-byte cache-line block.
22+
struct Block {
23+
placements: Vec<Placement>,
24+
used: u8,
25+
}
26+
27+
/// An instruction placed within a block.
28+
struct Placement {
29+
label: Label,
30+
offset: u8,
31+
size: u8,
32+
}
33+
34+
impl Block {
35+
fn new() -> Self {
36+
Self {
37+
placements: Vec::new(),
38+
used: 0,
39+
}
40+
}
41+
42+
fn free(&self) -> u8 {
43+
CACHE_LINE as u8 - self.used
44+
}
45+
46+
fn can_fit(&self, size: u8) -> bool {
47+
self.free() >= size
48+
}
49+
50+
fn place(&mut self, label: Label, size: u8) -> u8 {
51+
let offset = self.used;
52+
self.placements.push(Placement {
53+
label,
54+
offset,
55+
size,
56+
});
57+
self.used += size;
58+
offset
59+
}
60+
}
61+
62+
impl LayoutIR {
63+
fn new() -> Self {
64+
Self {
65+
blocks: Vec::new(),
66+
label_to_block: BTreeMap::new(),
67+
label_to_offset: BTreeMap::new(),
68+
}
69+
}
70+
71+
fn place(&mut self, label: Label, block_idx: usize, size: u8) {
72+
let offset = self.blocks[block_idx].place(label, size);
73+
self.label_to_block.insert(label, block_idx);
74+
self.label_to_offset.insert(label, offset);
75+
}
76+
77+
/// Move an instruction from its current block to a new block.
78+
fn move_to(&mut self, label: Label, new_block_idx: usize, size: u8) {
79+
// Remove from old block
80+
if let Some(&old_block_idx) = self.label_to_block.get(&label)
81+
&& let block = &mut self.blocks[old_block_idx]
82+
&& let Some(pos) = block.placements.iter().position(|p| p.label == label)
83+
{
84+
let old_placement = block.placements.remove(pos);
85+
block.used -= old_placement.size;
86+
87+
// Compact remaining placements
88+
let mut offset = 0u8;
89+
for p in &mut block.placements {
90+
p.offset = offset;
91+
offset += p.size;
92+
}
93+
}
94+
95+
// Add to new block
96+
let offset = self.blocks[new_block_idx].place(label, size);
97+
self.label_to_block.insert(label, new_block_idx);
98+
self.label_to_offset.insert(label, offset);
99+
}
100+
101+
fn finalize(self) -> LayoutResult {
102+
let mut mapping = BTreeMap::new();
103+
let mut max_step_end = 0u16;
104+
105+
for (block_idx, block) in self.blocks.iter().enumerate() {
106+
let block_base_step = (block_idx * CACHE_LINE / STEP_SIZE) as u16;
107+
for placement in &block.placements {
108+
let step = block_base_step + (placement.offset / STEP_SIZE as u8) as u16;
109+
mapping.insert(placement.label, step);
110+
let step_end = step + (placement.size / STEP_SIZE as u8) as u16;
111+
max_step_end = max_step_end.max(step_end);
112+
}
113+
}
114+
115+
LayoutResult::new(mapping, max_step_end)
116+
}
117+
}
118+
119+
/// Cross-block reference bookkeeping used when scoring packing candidates.
struct BlockRefs {
    /// Number of references recorded for each `(from_block, to_block)` pair.
    direct: BTreeMap<(usize, usize), usize>,
    /// For each block, the distinct blocks that reference it, in the order
    /// they were first recorded.
    predecessors: BTreeMap<usize, Vec<usize>>,
}
126+
127+
impl BlockRefs {
128+
fn new() -> Self {
129+
Self {
130+
direct: BTreeMap::new(),
131+
predecessors: BTreeMap::new(),
132+
}
133+
}
134+
135+
fn add_ref(&mut self, from_block: usize, to_block: usize) {
136+
*self.direct.entry((from_block, to_block)).or_default() += 1;
137+
let preds = self.predecessors.entry(to_block).or_default();
138+
if !preds.contains(&from_block) {
139+
preds.push(from_block);
140+
}
141+
}
142+
143+
fn count(&self, from_block: usize, to_block: usize) -> usize {
144+
self.direct.get(&(from_block, to_block)).copied().unwrap_or(0)
145+
}
146+
147+
fn predecessors(&self, block: usize) -> &[usize] {
148+
self.predecessors
149+
.get(&block)
150+
.map(|v| v.as_slice())
151+
.unwrap_or(&[])
152+
}
153+
}
154+
155+
/// Score a candidate block for packing based on reference distance.
156+
/// Direct refs count 1.0, 1-hop = 0.5, 2-hop = 0.25, capped at 3 hops.
157+
fn block_score(target_block: usize, candidate_block: usize, refs: &BlockRefs) -> f32 {
158+
let mut score = 0.0f32;
159+
let mut frontier = vec![(candidate_block, 0u8)];
160+
let mut visited = HashSet::new();
161+
162+
while let Some((block, dist)) = frontier.pop() {
163+
if !visited.insert(block) || dist > 3 {
164+
continue;
165+
}
166+
167+
let direct_refs = refs.count(block, target_block);
168+
score += direct_refs as f32 / (1u32 << dist) as f32;
169+
170+
for &pred in refs.predecessors(block) {
171+
frontier.push((pred, dist + 1));
172+
}
173+
}
174+
175+
score
176+
}
177+
13178
/// Successor graph for layout analysis.
14179
struct Graph {
15180
/// label -> list of successor labels
@@ -70,7 +235,121 @@ impl CacheAligned {
70235
let chains = extract_chains(&graph, instructions, entries);
71236
let ordered = order_chains(chains, entries);
72237

73-
assign_step_ids(ordered, &label_to_instr)
238+
let mut ir = build_layout_ir(&ordered, &label_to_instr);
239+
let refs = build_block_refs(&ir, &label_to_instr);
240+
pack_successors(&mut ir, &refs, &label_to_instr);
241+
242+
ir.finalize()
243+
}
244+
}
245+
246+
/// Build initial LayoutIR from ordered chains.
247+
fn build_layout_ir(
248+
chains: &[Vec<Label>],
249+
label_to_instr: &BTreeMap<Label, &InstructionIR>,
250+
) -> LayoutIR {
251+
let mut ir = LayoutIR::new();
252+
253+
for chain in chains {
254+
for &label in chain {
255+
let Some(instr) = label_to_instr.get(&label) else {
256+
continue;
257+
};
258+
let size = instr.size() as u8;
259+
260+
// Ensure current block can fit, or create new one
261+
if ir.blocks.is_empty() || !ir.blocks.last().unwrap().can_fit(size) {
262+
ir.blocks.push(Block::new());
263+
}
264+
let block_idx = ir.blocks.len() - 1;
265+
266+
ir.place(label, block_idx, size);
267+
}
268+
}
269+
270+
ir
271+
}
272+
273+
/// Build block reference counts from current layout.
274+
fn build_block_refs(
275+
ir: &LayoutIR,
276+
label_to_instr: &BTreeMap<Label, &InstructionIR>,
277+
) -> BlockRefs {
278+
let mut refs = BlockRefs::new();
279+
280+
for (&label, &block_idx) in &ir.label_to_block {
281+
let Some(instr) = label_to_instr.get(&label) else {
282+
continue;
283+
};
284+
for succ in instr.successors() {
285+
if let Some(&succ_block) = ir.label_to_block.get(&succ)
286+
&& succ_block != block_idx
287+
{
288+
refs.add_ref(block_idx, succ_block);
289+
}
290+
}
291+
}
292+
293+
refs
294+
}
295+
296+
/// Pack successor instructions into free space of predecessor blocks.
297+
///
298+
/// When X → Y and X is in block B, try to move Y to an earlier block
299+
/// that has free space and high reference score to B.
300+
fn pack_successors(
301+
ir: &mut LayoutIR,
302+
refs: &BlockRefs,
303+
label_to_instr: &BTreeMap<Label, &InstructionIR>,
304+
) {
305+
// Collect candidates: (successor_label, successor_block, predecessor_block)
306+
// We want to move successors to earlier blocks with free space
307+
let mut candidates: Vec<(Label, usize, usize)> = Vec::new();
308+
309+
for (&label, &block_idx) in &ir.label_to_block {
310+
let Some(instr) = label_to_instr.get(&label) else {
311+
continue;
312+
};
313+
314+
// For each successor of this instruction
315+
for succ in instr.successors() {
316+
if let Some(&succ_block) = ir.label_to_block.get(&succ) {
317+
// Only consider moving if successor is in a later block
318+
if succ_block > block_idx {
319+
candidates.push((succ, succ_block, block_idx));
320+
}
321+
}
322+
}
323+
}
324+
325+
// Sort by successor block descending (process later blocks first)
326+
candidates.sort_by_key(|(_, succ_block, _)| std::cmp::Reverse(*succ_block));
327+
328+
// Try to move each successor to an earlier block
329+
for (succ_label, _succ_block, pred_block) in candidates {
330+
// Re-check current block (might have changed)
331+
let Some(&current_block) = ir.label_to_block.get(&succ_label) else {
332+
continue;
333+
};
334+
335+
let Some(instr) = label_to_instr.get(&succ_label) else {
336+
continue;
337+
};
338+
let size = instr.size() as u8;
339+
340+
// Find the best earlier block with free space
341+
// Prefer blocks that reference the predecessor block (cache locality)
342+
let best = (0..current_block)
343+
.filter(|&c| ir.blocks[c].can_fit(size))
344+
.max_by(|&a, &b| {
345+
let score_a = block_score(pred_block, a, refs);
346+
let score_b = block_score(pred_block, b, refs);
347+
score_a.partial_cmp(&score_b).unwrap_or(std::cmp::Ordering::Equal)
348+
});
349+
350+
if let Some(candidate) = best {
351+
ir.move_to(succ_label, candidate, size);
352+
}
74353
}
75354
}
76355

@@ -144,46 +423,3 @@ fn order_chains(mut chains: Vec<Vec<Label>>, entries: &[Label]) -> Vec<Vec<Label
144423
entry_chains
145424
}
146425

147-
/// Assign step IDs with cache line awareness.
148-
fn assign_step_ids(
149-
chains: Vec<Vec<Label>>,
150-
label_to_instr: &BTreeMap<Label, &InstructionIR>,
151-
) -> LayoutResult {
152-
let mut mapping = BTreeMap::new();
153-
154-
let mut current_step = 0u16;
155-
let mut current_offset = 0usize; // Byte offset for cache alignment
156-
157-
for chain in chains {
158-
for label in chain {
159-
let Some(instr) = label_to_instr.get(&label) else {
160-
continue;
161-
};
162-
let size = instr.size();
163-
164-
// Pad if instruction would straddle cache line boundary
165-
let line_offset = current_offset % CACHE_LINE;
166-
if line_offset + size > CACHE_LINE {
167-
let padding_bytes = CACHE_LINE - line_offset;
168-
let padding_steps = (padding_bytes / STEP_SIZE) as u16;
169-
current_step += padding_steps;
170-
current_offset += padding_bytes;
171-
}
172-
173-
// Invariant: instruction must not straddle cache line
174-
assert!(
175-
current_offset % CACHE_LINE + size <= CACHE_LINE,
176-
"instruction at offset {} with size {} straddles 64-byte cache line",
177-
current_offset,
178-
size
179-
);
180-
181-
mapping.insert(label, current_step);
182-
let step_count = (size / STEP_SIZE) as u16;
183-
current_step += step_count;
184-
current_offset += size;
185-
}
186-
}
187-
188-
LayoutResult::new(mapping, current_step)
189-
}

0 commit comments

Comments
 (0)