From ceec0ce1cda5c91a906fce6f244acdfaf9ab4a9b Mon Sep 17 00:00:00 2001 From: Cemil ILIK Date: Fri, 29 May 2026 20:48:24 +0300 Subject: [PATCH 1/7] =?UTF-8?q?docs(adr):=20propose=20ADR-0033=20=E2=80=94?= =?UTF-8?q?=20kernel=20high-half=20migration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Opens Milestone B6's gating prerequisite: making the kernel reachable from every task's active translation so a real EL0 task's SVC vector fetch + the EL1 handler translate (today the loader's userspace AS has only image+stack, no kernel mappings — the EL0 vector fetch would translation-fault unrecoverably). Decision: high-half migration (kernel -> TTBR1_EL1, EPD1 1->0; per-task userspace -> TTBR0_EL1), the direction ADR-0027 already signposted and pre-paid for (the single EPD1 flip; byte-stable high-half TCR fields) — extends, does NOT supersede, ADR-0027. Two refinements make it safe + methodical: (1) BOOT-TIME (inside mmu_bootstrap, before StaticCell init + GIC), where DAIF is masked and no live low-VA pointer survives — removing the live-kernel bricking hazards; (2) STAGED as a dedicated task (T-022) landed before B6's EL0 work, per CLAUDE.md #6. The §Simulation (the boot-time low->high transition) was hardened against TWO adversarial verification passes — the first caught an architecturally-impossible trampoline ("mapped in both regimes", impossible with disjoint TTBR0/TTBR1 ranges at T0SZ=T1SZ=16); the corrected mechanism has the PC physically cross low->high at `br` with both TTBRs live. 
The passes also pinned: the br target must land in the PXN=0 image window (not the physmap alias); an EPD1-cleared TCR constant with byte-stable TTBR0 fields; ISB between the TTBR1 write and the EPD1 clear; VBAR-high-before-br; the migration must complete before any DAIF unmask; and the link-high/load-low + low-linked-PIC .idmap discipline + KERNEL_VA_OFFSET PA<->VA helper that the early boot requires (the addr_of!-as-PA conflation, broken project-wide). §Consequences is honest about the toolchain risk; Option 2 (map-kernel-into-every-TTBR0) is the documented fallback if the link-split proves intractable. Opens T-022 (Draft) in the same commit per ADR-0025 §Rule 1; T-022's review-history row will record the §Simulation row-to-verification mapping. Status: Proposed — awaiting careful re-read (write-adr §10) + maintainer Accept (a separate commit). Security-relevant (changes the kernel's own translation regime + the kernel/user isolation boundary) — flagged for explicit security review. Refs: ADR-0033, ADR-0027, ADR-0025 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../phase-b/T-022-high-half-kernel-mapping.md | 49 ++++++ .../0033-kernel-high-half-migration.md | 166 ++++++++++++++++++ docs/decisions/README.md | 3 +- docs/roadmap/phases/phase-b.md | 2 +- 4 files changed, 218 insertions(+), 2 deletions(-) create mode 100644 docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md create mode 100644 docs/decisions/0033-kernel-high-half-migration.md diff --git a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md new file mode 100644 index 0000000..592ab6a --- /dev/null +++ b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md @@ -0,0 +1,49 @@ +# T-022 — High-half kernel mapping: boot-time migration to `TTBR1_EL1` + per-task `TTBR0` swap + +- **Phase:** B +- **Milestone:** B6 — First userspace "hello" (this is B6's **gating prerequisite**: making the kernel reachable from every 
task's active translation so a real EL0 task's `SVC` vector fetch + the EL1 handler translate — the [ADR-0033](../../../decisions/0033-kernel-high-half-migration.md) high-half migration; per [phase-b §B6 opening sequence](../../../roadmap/phases/phase-b.md#b6-opening-sequence--prerequisites)) +- **Status:** Draft +- **Created:** 2026-05-29 +- **Author:** @cemililik (+ Claude Opus 4.8 agent) +- **Dependencies:** [ADR-0033](../../../decisions/0033-kernel-high-half-migration.md) — must be `Accepted` before code lands (settles the boot-time high-half §Simulation + the link-high/load-low + `KERNEL_VA_OFFSET` discipline); [ADR-0027](../../../decisions/0027-kernel-virtual-memory-layout.md) (the reserved `TTBR1`/`EPD1` + the byte-stable high-half `TCR` fields this consumes); [T-016](T-016-mmu-activation.md) (the `mmu_bootstrap` + `QemuVirtMmu` + `vmsav8` encoders this extends); [T-018](T-018-address-space-kernel-object.md) (the `activate` differ-path that goes live). +- **Informs:** Closes [ADR-0033 §Dependency chain steps 1–6](../../../decisions/0033-kernel-high-half-migration.md#dependency-chain) and discharges every [ADR-0033 §Simulation](../../../decisions/0033-kernel-high-half-migration.md#simulation) row (the row-to-verification mapping is recorded in this task's review-history row on completion). Unblocks B6's subsequent tasks — the EL0-ready `Task` context + enter-EL0/`ERET` path (T-021 carry-forward gate #2), `task_create_from_image`, the per-task `console_write` window + gate #1/#3, `tyrne-user` + `userland/hello`. Lifts the still-pending `Pending QEMU smoke verification` riders on [UNSAFE-2026-0023 / 0024](../../../audits/unsafe-log.md) (T-022's per-task `TTBR0` swap is the first post-bootstrap address-space-switching caller). +- **ADRs required:** [ADR-0033](../../../decisions/0033-kernel-high-half-migration.md), [ADR-0027](../../../decisions/0027-kernel-virtual-memory-layout.md). 
Introduces **new** `UNSAFE-YYYY-NNNN` audit entries (the absolute-jump migration trampoline asm; the per-task `TTBR0_EL1` swap; the `KERNEL_VA_OFFSET` PA↔VA helper deref) + **Amendments** to UNSAFE-2026-0022 / 0023 / 0024 per [unsafe-policy](../../../standards/unsafe-policy.md). + +--- + +## User story + +As the kernel, I want to run in the high half (`TTBR1_EL1`) — present in every address space's high VA range — so that `TTBR0_EL1` is free for per-task userspace mappings and a real EL0 task's `SVC` vector fetch + the EL1 handler + copy-user all translate, **without** the kernel ever being present in (or leakable from) the user-active translation regime. + +## Context + +[ADR-0033](../../../decisions/0033-kernel-high-half-migration.md) settles the decision and the boot-time transition shape; this task implements it. It is **B6's gating prerequisite** ([phase-b §B6](../../../roadmap/phases/phase-b.md#milestone-b6--first-userspace-hello)) and is deliberately landed **alone and staged**, before the EL0-entry / `task_create_from_image` / `userland` tasks build on the settled high-half regime ([CLAUDE.md #6](../../../../CLAUDE.md)). + +The migration switches the running kernel's own PC/SP/`VBAR` translation regime from identity/low (`TTBR0`) to high (`TTBR1`) **at boot** (inside / right after [`mmu_bootstrap`](../../../../bsp-qemu-virt/src/mmu_bootstrap.rs), before any `StaticCell` is written and before the GIC/timer is live), where `DAIF` is masked and no low-VA pointer survives — the framing that removes the live-kernel bricking hazards (ADR-0033 §Decision outcome). It is the highest-stakes code in the project so far; the ADR's §Simulation was hardened against two adversarial verification passes. 
+ +## Acceptance criteria + +- [ ] **Link-high/load-low.** The kernel is linked at `KBASE = 0xFFFF_FFFF_8008_0000` (LMA low via linker `AT`); a low-linked, position-independent `.idmap`-style early section holds `boot.s`, the high-half table builder, and the migration trampoline so they resolve `VA == PA` while the MMU is off / identity-only. (Closes the early-`adrp`-computes-high brick ADR-0033 §Consequences names.) +- [ ] **`KERNEL_VA_OFFSET` PA↔VA helper** replaces every `addr_of!`-as-PA site (`mmu_bootstrap` `TTBR` programming, the `__boot_pt_l0` re-read in `kernel_entry`, [`crate::mm::phys_frame_kernel_ptr`](../../../../kernel/src/mm/mod.rs)'s identity body). PA-computation host-tested. +- [ ] **High-half `TTBR1` tables** built per [ADR-0033 §"High-half layout"](../../../decisions/0033-kernel-high-half-migration.md): kernel image (`PXN = 0`/`UXN = 1`), kernel physmap/direct-map (`PXN = 1`), device MMIO — with the vector table + all handler/branch targets inside the `PXN = 0` image window. `vmsav8` high-half encoders host-tested. +- [ ] **EPD1-cleared `TCR_EL1` constant** in [`tyrne_hal::mmu::vmsav8`](../../../../hal/src/mmu/vmsav8.rs): bit 23 = 0, every `TTBR0`-governing field byte-identical to `TCR_EL1_VALUE`; host-tested. +- [ ] **The boot-time migration** runs the ADR-0033 §Simulation rows 0–3: build `TTBR1` (`ISB` after the `TTBR1` write, `DSB ISH` for the table memory) → `EPD1` `1→0` + `ISB` → trampoline (`VBAR`-high + `ISB`, `SP`-high, `LDR`/`BR` to the `PXN = 0` high continuation, `DAIF` masked) → `TTBR0`-null + `EPD0 = 1` + `ISB` + `TLBI VMALLE1` + `DSB ISH` + `ISB`. A new `tyrne: high-half active` boot marker prints after the jump. +- [ ] **Per-task `TTBR0_EL1` swap goes live:** [`QemuVirtMmu::activate`](../../../../bsp-qemu-virt/src/mmu.rs) drives the real per-task swap with per-task ASID values (`A1 = 0`, ASID in `TTBR0_EL1.ASID`); the [T-018](T-018-address-space-kernel-object.md) `activate` differ-path that short-circuits in v1 now fires. 
Host test pins the differ path with distinct ASes. +- [ ] **Audit:** new entries (trampoline asm; per-task `TTBR0` swap; `KERNEL_VA_OFFSET` deref) + Amendments to UNSAFE-2026-0022 / 0023 / 0024; the 0023/0024 `Pending QEMU smoke verification` riders lifted. +- [ ] **All gates green** incl. `cargo +nightly miri test --workspace --exclude tyrne-bsp-qemu-virt`. **QEMU smoke:** full demo to `tyrne: all tasks complete` with the new `tyrne: high-half active` line; `-d int,unimp,guest_errors` shows **zero new Translation/Permission fault classes** (the migration is fault-clean) — the row-4 abort gate. + +## Out of scope + +- The EL0-ready `Task` context register file (`ELR_EL1`/`SPSR_EL1`/`SP_EL0` + per-task `SP_EL1`) + the enter-EL0/`ERET` path — the next B6 task (T-021 carry-forward gate #2). +- `task_create_from_image` (`LoadedImage` → runnable `CapHandle{CapObject::Task(...)}`) — Phase B6. +- The per-task `console_write` window + per-page user-VA→kernel-VA translation (T-021 gate #1) and the `SYSCALL_STUB_TABLE` → current-task-table swap (gate #3) — Phase B6. +- `tyrne-user` + `userland/hello` + the build pipeline — Phase B6. +- Per-section kernel-image permissions (`.text` RX / `.rodata` R / `.data` RW) — [ADR-0034](../../../decisions/0027-kernel-virtual-memory-layout.md) placeholder; v1 high-half image is RWX-equivalent like the identity map it replaces. + +## Approach + +_(Settled at the ADR level — see [ADR-0033 §Simulation](../../../decisions/0033-kernel-high-half-migration.md#simulation) + §Dependency chain; the detailed approach + the §Simulation row-to-verification mapping are filled when the task moves to `In Progress`.)_ The migration trampoline is hand-asm (the compiler cannot be guaranteed to emit position-independent, no-`adrp`-to-high code for arbitrary Rust); the low-linked early section keeps `boot.s` + the table builder resolving low; the high-half tables reuse the host-tested `vmsav8` encoders. 
**Fallback:** if the link-split / position-independence discipline proves intractable on the LLVM/lld toolchain, ADR-0033 §Consequences documents the Option 2 interim (map the kernel into every `TTBR0`) — escalate to the maintainer before switching. + +## Definition of done + +All acceptance criteria checked; gates green (incl. Miri); audit-log entries + Amendments added; `current.md` updated; **security-relevant — flagged for explicit security review** per [CLAUDE.md #1](../../../../CLAUDE.md) (this changes the kernel's own translation regime and the kernel/user isolation boundary — the highest-stakes change in the project so far). diff --git a/docs/decisions/0033-kernel-high-half-migration.md b/docs/decisions/0033-kernel-high-half-migration.md new file mode 100644 index 0000000..e32f7be --- /dev/null +++ b/docs/decisions/0033-kernel-high-half-migration.md @@ -0,0 +1,166 @@ +# 0033 — Kernel high-half migration + +- **Status:** Proposed +- **Date:** 2026-05-29 +- **Deciders:** @cemililik + +## Context + +Milestone B6 ("first userspace hello") must run a real EL0 task: a separate binary in its own address space that makes a `console_write` syscall through the lower-EL `VBAR_EL1 + 0x400` vector and exits via `task_exit`. There is a hard architectural prerequisite that gates the entire milestone: **the kernel must stay reachable from every task's active translation regime.** + +Today the loader's userspace address space ([`task_loader.rs`](../../kernel/src/obj/task_loader.rs)) holds **only** the image + stack mappings — no kernel mappings. The MMU runs identity-only in `TTBR0_EL1` ([ADR-0027](0027-kernel-virtual-memory-layout.md)): `TTBR1_EL1 = 0`, `TCR_EL1.EPD1 = 1`. The moment a real EL0 task is dispatched with its own `TTBR0_EL1`, an `SVC` (or any exception) vectors the CPU to `VBAR_EL1` and **fetches the trampoline instruction** — which lives at a kernel physical address that is **not mapped in that task's `TTBR0_EL1`**. 
The result is a translation fault on the vector fetch, recursively, with no recovery. B5's syscall boundary smoke worked only because the EL1 kernel-stub ran in the bootstrap address space, where the kernel is identity-mapped; a real EL0 task has no such luxury. [phase-b §B6](../roadmap/phases/phase-b.md#milestone-b6--first-userspace-hello) states it plainly: *"Nothing in B6 runs until this is solved."* + +[ADR-0027 §Decision outcome (a)](0027-kernel-virtual-memory-layout.md) anticipated exactly this moment and **signposted the answer**: a high-half kernel. It reserved `TTBR1_EL1` (with `EPD1 = 1`), pre-committed the high-half-friendly `TCR_EL1` fields (`TG1 = 0b10`, `T1SZ = 16`, `IRGN1`/`ORGN1`/`SH1` already cacheable-inner-shareable), and named ADR-0033 as the home of "the `TTBR0_EL1`-swap discipline that arrives with userspace." This ADR settles that migration: **how the running kernel moves from its identity/low mapping to a high-half (`TTBR1_EL1`) mapping, so the kernel is present in every address space's high half while `TTBR0_EL1` is freed for per-task userspace.** + +The stakes are the highest in the project so far. The migration switches the running kernel's own instruction-fetch, stack, and exception-vector translation regime *mid-flight*; a single wrong step (an unmapped fetch, a stale TLB entry, a surviving low-VA pointer, a relocation that resolves to the wrong half) bricks the kernel silently with no recovery path. This ADR was drafted with two independent adversarial verification passes against its §Simulation (recorded in T-022's review history); the §Simulation below is the hardened result, and the §Dependency chain is explicit that the migration requires **new infrastructure** (a link-high/load-low linker discipline, a position-independent low-linked early-boot section, a `KERNEL_VA_OFFSET` PA↔VA helper) that does not exist in the tree today. 
+ +## Decision drivers + +- **Security-first kernel/user isolation ([CLAUDE.md #1](../../CLAUDE.md), [architectural-principles](../standards/architectural-principles.md)).** A high-assurance capability kernel wants the kernel to be *structurally absent* from the user-active translation regime — not present-but-AP-protected. Absence means no `AP`/`UXN`/`PXN` descriptor bit can leak the kernel, and the Meltdown/transient-execution substrate is reduced, rather than relying on a per-descriptor invariant that must never be wrong on any of N user address spaces. +- **Honour the Accepted direction without a supersede.** [ADR-0027](0027-kernel-virtual-memory-layout.md) chose high-half as the future shape and pre-paid for it (the single `EPD1 = 1 → 0` flip; the byte-stable `TG1`/`T1SZ`/`IRGN1`/`ORGN1`/`SH1` fields). Adopting any non-high-half end-state would silently override that and force a `supersede-adr` move. +- **Bounded, one-time migration risk over a standing invariant.** The high-half transition carries a one-time bricking-hazard window; the alternative (kernel mapped into every `TTBR0`) carries a *standing* per-descriptor AP invariant on every address space forever. A high-assurance project prefers a bounded, verified, one-time risk to a permanent must-never-get-wrong surface. +- **Minimum surface per milestone ([CLAUDE.md #6](../../CLAUDE.md)).** B6 already lands many firsts (EL0 entry, `ERET`-to-EL0, the per-task `TTBR0` swap, the three T-021 carry-forward gates, `userland/hello`, `tyrne-user`). The migration must not be *bundled* with those firsts — it lands as its own staged, independently-reviewed task ([T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md)) **before** the EL0 work builds on the clean high-half regime. +- **Relocation feasibility.** The kernel image is currently linked at a fixed low VA (`ORIGIN = 0x4008_0000`); `boot.s` uses `adrp`/`:lo12:` (PC-relative) and `addr_of!` (treated as `VA == PA`). 
A high-half kernel must be *linked* high but *loaded* low, which means the early-boot path that runs before the high regime is live must resolve **low** — a non-trivial link-split + position-independence discipline the convention must make explicit, not assume. +- **Single-core simplicity (v1).** No TLB shootdown / cross-core inner-shareable migration; the existing `DSB ISH` discipline forward-extends to SMP (Phase C) without a barrier-scope rewrite. + +## Considered options + +1. **High-half migration** — relink the kernel at a high base (ARM convention `0xFFFF_FFFF_8000_0000+`), keep it loaded at PA `0x4008_0000` (link-high/load-low), and migrate the running kernel from identity/low (`TTBR0_EL1`) to high (`TTBR1_EL1`) **at boot time** (inside the bootstrap, before any state or interrupt source exists), then free `TTBR0_EL1` for per-task userspace. +2. **Map the kernel into every per-task `TTBR0_EL1`** — keep `EPD1 = 1`/`TTBR1 = 0`; point each per-task root's kernel-range slots at shared kernel intermediate tables (global, privileged-only entries) so the kernel is reachable from every task's own `TTBR0`. +3. **Defer the migration past B6** — leave identity-only; do not make the kernel reachable from per-task address spaces; keep exercising the syscall path only through the B5 EL1-stub proxy. + +## Decision outcome + +Chosen option: **Option 1 — high-half kernel migration, performed at boot time, landed as a dedicated staged task ([T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md)) before B6's EL0 work.** + +High-half is the structural kernel/user separation the project's high-assurance positioning requires (driver 1) and the direction [ADR-0027](0027-kernel-virtual-memory-layout.md) already signposted and pre-paid for (driver 2) — adopting Option 2 as the *end-state* would silently override that Accepted decision and force a supersede. 
Option 3 blocks the entire B6 milestone ([phase-b §B6](../roadmap/phases/phase-b.md#milestone-b6--first-userspace-hello)) and is recorded only to reject it honestly. + +Two refinements make the choice safe and methodical rather than reckless: + +- **Boot-time, not mid-kernel.** The migration runs inside the bootstrap window (within / immediately after [`mmu_bootstrap`](../../bsp-qemu-virt/src/mmu_bootstrap.rs), before the kernel's `StaticCell`s are written and before the GIC/timer is initialised). This is decisive for risk: an adversarial review of a *mid-kernel* migration found the dominant bricking hazards came from migrating a **live** kernel — `DAIF` unmasked, surviving low-VA `StaticCell` pointers, a live timer IRQ. At boot all three evaporate by construction: `DAIF` is masked from `_start` (`boot.s` `msr daifset, #0xf`, `SPSR_EL2 = 0x3c5`), no `StaticCell` holds a low VA yet (`kernel_entry` writes them *after* the migration returns, so they store high VAs), and no interrupt source is live. What remains is the irreducible core of any high-half jump — the relocation discipline and the `br` that crosses regimes — handled in the controlled boot window. +- **Staged, not bundled.** [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) lands the migration **alone** (relink, the low-linked early-boot section, the high-half table builder, the trampoline, the `KERNEL_VA_OFFSET` PA↔VA helper, the per-task `TTBR0` swap going live) and is reviewed on its own. B6's EL0-entry / `task_create_from_image` / carry-forward-gate / `userland` tasks then build on the settled high-half regime, satisfying [CLAUDE.md #6](../../CLAUDE.md). + +`TCR_EL1.A1` stays `0` and the ASID stays in `TTBR0_EL1.ASID`: the kernel moves to `TTBR1` and the **user**-half stays on `TTBR0`, so the `A1 = 0 → 1` flip [ADR-0027 §"ASID"](0027-kernel-virtual-memory-layout.md) conditionally named (only for a *TTBR1-swap user-half*) does **not** apply here. 
Per-task ASID *value* assignment lands here as the per-task `TTBR0` swap (the T-018 `activate` differ-path that short-circuits in v1) goes live. + +Option 2 is recorded as a **credible, non-strawman alternative** — it deletes the entire bricking-hazard family and is the lighter continuation of the shipped architecture. It is rejected as the *end-state* (the standing per-descriptor AP invariant inside a user-reachable regime plus the transient-execution exposure outweigh the one-time migration risk for a kernel that markets itself high-assurance), but it **remains the documented fallback** if T-022's link-split / position-independence discipline proves intractable on the toolchain (see §Consequences → Negative). + +### High-half layout (the `TTBR1_EL1` tables T-022 builds) + +Mirroring [ADR-0027 §Decision outcome (a)](0027-kernel-virtual-memory-layout.md)'s enumeration discipline, the high-half root populates exactly three regions (4 KiB granule, 48-bit VA, `T1SZ = 16` ⇒ `TTBR1` serves `VA[55] = 1`): + +| Region | High VA → PA | Attrs | +|--------|--------------|-------| +| Kernel image | `[KBASE .. KBASE+image_size)` → `[0x4008_0000 ..)` (`KBASE = 0xFFFF_FFFF_8008_0000`) | normal-cached, `AF = 1`, `nG = 0` (global), **`PXN = 0` / `UXN = 1`** (EL1-executable, EL0 no-exec). **The vector table and every handler/branch target must fall inside this `PXN = 0` window**, not the physmap alias below. | +| Kernel physmap (direct map) | a high window → all RAM PA `[0x4000_0000 .. 0x4800_0000)` | normal-cached, `AF = 1`, global, **`PXN = 1` / `UXN = 1`** (data only — PMM frames, page tables, copy-user buffers accessed by PA). | +| Device MMIO | a high window → `[0x0800_0000 .. 0x0920_0000)` | device-nGnRnE, `AF = 1`, `PXN = 1` / `UXN = 1` (UART + GIC after the jump). | + +The kernel image is therefore reachable at two high VAs — the executable image alias (`PXN = 0`) and the physmap alias (`PXN = 1`). 
**The migration's branch target and `VBAR_EL1` must resolve into the `PXN = 0` image window**; a target landing in the `PXN = 1` physmap alias is an execute-never permission fault on the first high fetch (a correctness pin T-022 must hold, surfaced by the §Simulation review). + +### Simulation + +The worst-case boot-time transition — the running kernel switching its own PC/SP/`VBAR` translation regime from identity/low to high-half while executing. The **early-boot + migration code is *low-linked* and position-independent** (a `.idmap`-style section that resolves `VA == PA` while the MMU is off / identity-only); the **main kernel is *high-linked***; the `br` in row 2 is the boundary. `DAIF` is masked throughout (boot window). + +| Step | State pre | Action | State post | Observable / verification | +|------|-----------|--------|------------|---------------------------| +| 0 | Low-linked early boot running at PA `0x4008_NNNN` (MMU on, identity `TTBR0`); `TTBR1 = 0`; `EPD1 = 1`; `DAIF` masked. Image relinked at `KBASE` but loaded at PA `0x4008_0000`. | Build the high-half `TTBR1` tables (the three regions above) in reserved frames, writing descriptors via the host-tested `vmsav8` encoders + `write_volatile` on `*mut u64` whose target PAs are computed by `KERNEL_VA_OFFSET` (**not** `addr_of!`-as-PA — see §Dependency chain). `MSR TTBR1_EL1, <high-half root table PA>`; **`ISB`** (context-synchronises the `TTBR1` register write); `DSB ISH` (orders the table-memory writes for the walker). | `TTBR1` populated + synchronised; `EPD1` still `1`, so no high VA translates yet; PC/SP/`VBAR` still low (identity unchanged). | T-022 `vmsav8` high-half encoder host tests + UNSAFE-2026-0022 Amendment (descriptor writes into high-half frames). | +| 1 | `TTBR1` populated, `EPD1 = 1`, kernel executing low. 
| `MSR TCR_EL1, <EPD1-cleared TCR constant>` — only `EPD1` `1 → 0`; **every `TTBR0`-governing field (`T0SZ`/`EPD0`/`TG0`/`IRGN0`/`ORGN0`/`SH0`) byte-identical to the live `TCR_EL1_VALUE`** (a new pinned constant; perturbing any `TTBR0` field faults the *next* low fetch). `ISB`. (No pre-flip `TLBI` of the high range: with `EPD1 = 1` a `TTBR1` walk faults and the architecture caches no result, so there is nothing stale to drop — the §Simulation review corrected the earlier "pre-flip TLBI" rationale.) | Both regimes live simultaneously: low identity (`TTBR0`) **and** high (`TTBR1`). The ranges are disjoint (`VA[55] = 0` low / `= 1` high), so coexistence is sound. PC/SP/`VBAR` still low. | UNSAFE-2026-0023 Amendment (`EPD1`-clear `MSR`, same EL1-sysreg class as the bootstrap block). | +| 2 | Dual-live; `EPD1 = 0`; PC/SP/`VBAR` low; `DAIF` masked. | **The crossing.** Executing from the low-linked `.idmap` (PC-relative-safe, `VA == PA` under `TTBR0`): (1) `MSR VBAR_EL1, <high vector-table VA>` + `ISB` — high vectors live **before** the branch, so any synchronous fault on the first high fetch vectors to the `TTBR1`-mapped handler; (2) rebase `SP` to the high-VA boot stack (mapped RW in `TTBR1`; `PXN` is irrelevant for a data-only stack mapping); (3) `LDR xN, =<high continuation VA>` (literal in the `.idmap` section so it resolves correctly while low) → `BR xN` — PC physically crosses from a low `.idmap` VA to a **`PXN = 0` image-window** high VA. The low idmap stays live (`TTBR0` not yet nulled) as a safety net; the window takes no exception (`DAIF` masked; the few instructions cannot fault if the high image window is correctly populated). | PC/SP/`VBAR` resolve high via `TTBR1`. | **NEW** UNSAFE-YYYY-NNNN (the absolute-jump trampoline asm; invariants: `.idmap` low-linked + PIC, literal pool in `.idmap`, target in the `PXN = 0` window, `VBAR`-high-before-`br`, `SP`-high-mapped-before-`br`, `DAIF` masked). | +| 3 | PC/SP/`VBAR` high; low idmap still live. 
**No live low-VA pointer exists** — `StaticCell`s are unwritten (`kernel_entry` writes them after the migration returns, storing high VAs); the only low references were in `.idmap`, which the PC has left. | `MSR TTBR0_EL1, xzr`; set `TCR_EL1.EPD0 = 1`; `ISB`; `TLBI VMALLE1`; `DSB ISH`; `ISB` (registers only — no table-memory mutation, so no `DSB` *before* the `TLBI` is required). | Final high-half steady state: kernel on `TTBR1` (`EPD1 = 0`); `TTBR0` free/null for per-task userspace (`EPD0 = 1` until a task AS activates); stale low translations flushed. Control returns to the high-linked `kernel_entry`; `StaticCell` init + GIC + PMM + loader + demo all run high. A real EL0 task's `TTBR0` carries only its own user mappings; its `SVC` vector fetch goes to the high `VBAR` mapped in `TTBR1` (present for every task), so `+0x400` + the EL1 handler translate. | UNSAFE-2026-0023 + 0024 Amendments (`TTBR0`-null/`EPD0`-set + post-flip `TLBI`) + **NEW** entry for the per-task `TTBR0` swap going live; T-018 `activate`-differ host test. | +| 4 | Any step's precondition violated. | **Abort discipline.** A boot-time regime switch has **no runtime rollback** — once row 2's `br` executes, row 3 destroys the low regime. Safety is therefore *design-time* (per-region table verified, `.idmap` link-split, the ordering + `PXN`-window pins above) **plus** the QEMU smoke gate: a wrong step fail-stops (hangs) before the new `tyrne: high-half active` marker and before `tyrne: all tasks complete`, so the [business master-plan closure-smoke gate](../analysis/reviews/business-reviews/master-plan.md) blocks the merge. | No silent-wrong-kernel ships: a broken migration is a visible boot hang, not a passing build. Milestone fallback: Option 2. | The smoke marker + `-d int,unimp,guest_errors` (zero new Translation/Permission faults) is T-022's runtime gate. 
| + +#### Simulation row-to-verification mapping + +Per the [`write-adr` skill §Procedure step 5 sub-bullet](../../.agents/skills/write-adr/SKILL.md), each row maps to a verification artefact in [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md), recorded in its review-history row on completion: + +- **Row 0** → `vmsav8` high-half encoder host tests (the three-region descriptor encodings, `PXN`/`UXN`/`AF`/`nG` per region) + the `KERNEL_VA_OFFSET` PA-computation host test + UNSAFE-2026-0022 Amendment. +- **Row 1** → a host test pinning that the `EPD1`-cleared `TCR_EL1` constant is byte-identical to `TCR_EL1_VALUE` except for bit 23 + UNSAFE-2026-0023 Amendment. +- **Row 2** → the new absolute-jump-trampoline UNSAFE entry + the QEMU smoke showing the `tyrne: high-half active` marker after the jump (the runtime proof the crossing reached the `PXN = 0` window). +- **Row 3** → UNSAFE-2026-0023 / 0024 Amendments + the per-task-`TTBR0`-swap UNSAFE entry + the T-018 `activate`-differ host test (now exercised with distinct ASes). +- **Row 4** → the QEMU smoke gate (full trace to `tyrne: all tasks complete`; `-d int,unimp,guest_errors` zero new fault classes). + +### Dependency chain + +For this decision to be **fully** in effect: + +```text +1. Link-high/load-low linker discipline: relink the kernel at KBASE + (0xFFFF_FFFF_8008_0000), keep LMA low via `AT`, and a low-linked + position-independent `.idmap`-style early section for boot.s + the + table builder + the trampoline. — T-022 (opens with this ADR) +2. KERNEL_VA_OFFSET PA<->VA helper replacing every `addr_of!`-as-PA site + (mmu_bootstrap TTBR programming, the __boot_pt_l0 re-read in + kernel_entry, crate::mm::phys_frame_kernel_ptr's identity body). — T-022 +3. High-half table builder (the three-region TTBR1 root: image PXN=0, + physmap PXN=1, device) — extends the fixed 4-frame 2 MiB-block + bootstrap with the physmap/L3 capability it lacks today. — T-022 +4. 
EPD1-cleared TCR_EL1 constant (bit 23 = 0, all TTBR0 fields byte- + stable) in tyrne_hal::mmu::vmsav8. — T-022 +5. The migration trampoline (hand-asm: VBAR-high + SP-high + LDR/BR to + the PXN=0 high continuation) + the TTBR0-null/EPD0-set teardown. — T-022 +6. Per-task TTBR0_EL1 swap going live: QemuVirtMmu::activate drives the + real swap with per-task ASID values (A1=0, ASID in TTBR0_EL1.ASID); the + T-018 activate differ-path that short-circuits in v1 now fires. — T-022 +7. EL0-ready Task context register file + enter-EL0/ERET path + per-task + SP_EL1, so a real EL0 task can take the +0x400 trap. — Phase B6 (separate task, builds on T-022) +8. task_create_from_image wrapper + userland/hello + tyrne-user. — Phase B6 (deferred) +``` + +Steps 1–6 are [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md), opened at `Draft` in the same commit as this ADR per [ADR-0025 §Rule 1](0025-adr-governance-amendments.md). Steps 7–8 are the subsequent B6 tasks that build on the settled high-half regime (the staging that satisfies [CLAUDE.md #6](../../CLAUDE.md)). This ADR **extends, not relitigates**, [ADR-0027](0027-kernel-virtual-memory-layout.md): it consumes the reserved `TTBR1`, the single `EPD1` flip, and the byte-stable high-half `TCR` fields that ADR-0027 pre-committed. + +## Consequences + +### Positive + +- **Structural kernel/user isolation.** The kernel is simply *absent* from the user (`TTBR0`) regime — no descriptor bit can leak it, the Meltdown/transient-execution substrate is reduced, and future `EPD0`/KPTI-style hardening becomes expressible. This is the high-assurance end-state [CLAUDE.md #1](../../CLAUDE.md) favours. +- **Clean VA split, no carve-out.** User owns all of low (`TTBR0`, `T0SZ = 16`), kernel owns all of high (`TTBR1`, `T1SZ = 16`). No need to reject user mappings overlapping a kernel sub-range, no per-task root-divergence hazard. 
+- **No supersede; honours ADR-0027.** `EPD1 = 1 → 0` is the single pre-committed flip; the high-half `TCR` fields stay byte-stable; the §Simulation walks the skeleton [ADR-0027:156](0027-kernel-virtual-memory-layout.md) named. +- **`TTBR0` freed for per-task userspace.** The per-task swap is a single `MSR TTBR0_EL1`; the loader no longer injects kernel mappings into each task AS (they live in `TTBR1`, present for every task) — a structural simplification of the B6 loader path. +- **Boot-time framing removes the live-kernel hazards.** `DAIF` masked, no `StaticCell` low-VA pointers, no live IRQ during the window — verified to hold against the current `boot.s`/`kernel_entry` ordering. + +### Negative + +- **Substantial new infrastructure with real toolchain risk.** The migration needs a link-high/load-low discipline + a low-linked position-independent `.idmap` early section that **does not exist** today (the linker script is single-base `ORIGIN = 0x4008_0000`, no `AT`, no `.idmap`). The hard part, surfaced by the adversarial §Simulation review: under a high link, the early-boot `adrp`/`addr_of!` sites in `boot.s`/`mmu_bootstrap` would compute **high** VAs while running **low** with the MMU off — bricking before `kernel_entry`. *Mitigation:* the entire low-running portion (BSS-zero, SP setup, table build, trampoline) is kept in the low-linked `.idmap` section so it resolves low; the migration trampoline is hand-asm (the compiler cannot be guaranteed to emit position-independent, no-`adrp`-to-high code for arbitrary Rust). **We accept this cost** because it is the irreducible price of the high-assurance end-state, it is bounded and one-time, and it is verified row-by-row by T-022 + the QEMU smoke gate. **If the link-split proves intractable on the LLVM/lld toolchain, the documented fallback is Option 2** (map the kernel into every `TTBR0`) as an explicit interim, deferring the structural boundary — recorded here so the fallback needs no new ADR. 
+- **The `addr_of!`-as-PA conflation must be broken project-wide.** Every site that today treats a linker symbol as a PA (TTBR programming, the `__boot_pt_l0` re-read, `phys_frame_kernel_ptr`) must compute PAs via `KERNEL_VA_OFFSET`. *Mitigation:* this is the single-helper-body change [memory-management.md](../architecture/memory-management.md) and the UNSAFE-2026-0025/0026/0027/0030 entries already forecast; T-022 lands it once. +- **~2× early-boot asm and ≥3 new/amended audit entries** vs identity ([ADR-0027:79](0027-kernel-virtual-memory-layout.md)). *Mitigation:* the migration is one staged task; the audit surface is enumerated in the §Simulation mapping. +- **No runtime rollback.** A half-completed migration cannot recover. *Mitigation:* safety is design-time (verified per-region tables + ordering pins) + the QEMU smoke gate fail-stops a broken migration visibly (row 4). + +### Neutral + +- **`A1` stays 0 / ASID in `TTBR0_EL1.ASID`.** The kernel is on `TTBR1`, the user-half on `TTBR0`; the `A1 = 0 → 1` flip ADR-0027 conditionally named applies only to a TTBR1-swap user-half, which this design does not adopt. +- **Single-core only.** No TLB shootdown; the `DSB ISH` discipline forward-extends to SMP (Phase C) unchanged. +- **The physmap window is new but standard.** A kernel direct-map of RAM is the conventional way (Linux, seL4) for the kernel to reach physical frames by VA once it no longer runs identity; it replaces the v1 `VA == PA` assumption. + +## Pros and cons of the options + +### Option 1 — High-half migration (chosen) + +- **Pro:** Structural kernel/user isolation; reduced transient-execution substrate; the high-assurance end-state. +- **Pro:** Honours ADR-0027's signposted direction with no supersede; consumes the pre-paid `EPD1`/`TCR` reservations. +- **Pro:** Frees `TTBR0` for per-task userspace; simplifies the B6 loader (no per-AS kernel injection). 
+- **Pro (boot-time):** Removes the live-kernel bricking hazards (`DAIF`, `StaticCell` pointers, live IRQ). +- **Con:** Requires new link-high/load-low + `.idmap` PIC infrastructure with real toolchain risk; the irreducible jump + relocation discipline is the hardest code in the project so far. +- **Con:** Breaks the `addr_of!`-as-PA conflation project-wide; ~2× early-boot asm; ≥3 audit entries. + +### Option 2 — Map the kernel into every `TTBR0` (rejected as end-state; documented fallback) + +- **Pro:** Deletes the entire bricking-hazard family — no relink, no `.idmap`, no PIC early boot, no `KERNEL_VA_OFFSET`, no jump. The lightest path that meets B6's exact need; the direct continuation of the shipped ADR-0027 architecture. +- **Pro:** Single-core means no shootdown to keep the shared kernel sub-tree coherent across roots. +- **Con:** The kernel/user boundary becomes a per-descriptor `AP`/`UXN`/`PXN` invariant **inside a user-reachable regime** — a single kernel page mapped AP-unprivileged is a direct EL0→kernel read/write, a *standing* must-never-get-wrong invariant on every address space. In tension with [CLAUDE.md #1](../../CLAUDE.md). +- **Con:** Meltdown-class transient-execution substrate (kernel data present in a user-active regime); the vulnerable shape, not removed by single-core. +- **Con:** Adopting it as the *end-state* contradicts [ADR-0027:166](0027-kernel-virtual-memory-layout.md)'s signposted high-half → requires a `supersede-adr`, not a plain ADR. Acceptable only as an explicit interim. + +### Option 3 — Defer past B6 (rejected) + +- **Pro:** Zero new code/unsafe/risk this milestone; the B5 proxy keeps passing. +- **Con:** Blocks B6's defining goal — without kernel reachability from the task's translation, a real EL0 task's `SVC` vector fetch translation-faults unrecoverably ([phase-b §B6](../roadmap/phases/phase-b.md#milestone-b6--first-userspace-hello)). "Nothing in B6 runs until this is solved." A "no decision" recorded only to reject it.
+ +## References + +- [ADR-0027 — Kernel virtual memory layout](0027-kernel-virtual-memory-layout.md) — the identity-only B2 layout that reserved `TTBR1`/`EPD1`, pre-committed the high-half `TCR` fields, and named this ADR as the high-half home. +- [ADR-0028 — Address-space data structure](0028-address-space-data-structure.md) / [ADR-0021](0021-raw-pointer-scheduler-ipc-bridge.md) — the `activate` differ-path the per-task `TTBR0` swap rides. +- [ADR-0030](0030-syscall-abi.md) / [ADR-0031](0031-initial-syscall-set.md) — the B5 syscall boundary whose real EL0 round-trip this migration unblocks. +- [phase-b §B6 — First userspace "hello"](../roadmap/phases/phase-b.md#milestone-b6--first-userspace-hello) — the milestone this ADR opens and its T-021 carry-forward gates. +- [`docs/architecture/memory-management.md`](../architecture/memory-management.md) — the v1 layout + the `EPD1 1→0` / `phys_to_virt` forecast this ADR resolves. +- Linux aarch64 boot: `arch/arm64/kernel/head.S` `__primary_switch` + the `idmap` / `.idmap.text` section — the link-high/load-low + identity-trampoline prior art. +- [seL4 on AArch64](https://sel4.systems/) — high-half kernel mapping in a capability microkernel. +- [ARM ARM §D8 "The AArch64 Virtual Memory System Architecture"](https://developer.arm.com/documentation/ddi0487/latest) — `TCR_EL1.EPD0/EPD1`, `TTBR0/TTBR1` input-range selection by `VA[55]`, `TLBI`/`DSB`/`ISB` ordering for translation-regime changes.
diff --git a/docs/decisions/README.md b/docs/decisions/README.md index 319f57f..fc2b3e2 100644 --- a/docs/decisions/README.md +++ b/docs/decisions/README.md @@ -61,10 +61,11 @@ Each ADR contains: | 0030 | [Syscall ABI and userspace error taxonomy (B5)](0030-syscall-abi.md) | Accepted | 2026-05-29 | | 0031 | [Initial syscall set (B5 — `send`/`recv`/`console_write`/`task_yield`/`task_exit`)](0031-initial-syscall-set.md) | Accepted | 2026-05-29 | | 0032 | [Endpoint state rollback on `ipc_recv_and_yield` Deadlock + `ipc_cancel_recv` primitive](0032-endpoint-rollback-and-cancel-recv.md) | Accepted | 2026-05-07 | +| 0033 | [Kernel high-half migration (B6 — kernel → `TTBR1_EL1`, boot-time)](0033-kernel-high-half-migration.md) | Proposed | 2026-05-29 | | 0035 | [Physical Memory Manager (B3 prerequisite — bitmap allocator)](0035-physical-memory-manager.md) | Accepted | 2026-05-09 | | 0036 | [QEMU virt is GICv2 / no-IOMMU in v1; corrects GICv3/SMMUv3 in ADR-0004/0006/0012](0036-qemu-virt-gicv2-no-iommu-v1.md) | Accepted | 2026-05-22 | -> **Numbering gaps.** Slots **0033** and **0034** are intentionally reserved, not missing: 0033 (high-half migration) and 0034 (kernel-image section permissions) are named-but-unallocated placeholders forward-flagged in ADR-0027/0028/0029. No files exist for these yet; they open when the corresponding work surfaces. (Slots **0030** (syscall ABI) and **0031** (initial syscall set) were filed and `Accepted` on 2026-05-29 for B5 and are no longer gaps.) ADR numbers are stable history and are never renumbered. +> **Numbering gaps.** Slot **0034** is intentionally reserved, not missing: 0034 (kernel-image section permissions) is a named-but-unallocated placeholder forward-flagged in ADR-0027. No file exists for it yet; it opens when the corresponding work surfaces (the first attacker-observable EL0 execution — likely B6). 
(Slot **0033** (high-half migration) was filed `Proposed` on 2026-05-29 to open B6 and is no longer a gap; slots **0030**/**0031** were filed and `Accepted` on 2026-05-29 for B5.) ADR numbers are stable history and are never renumbered. ## Creating a new ADR diff --git a/docs/roadmap/phases/phase-b.md b/docs/roadmap/phases/phase-b.md index 460fc50..85a99bf 100644 --- a/docs/roadmap/phases/phase-b.md +++ b/docs/roadmap/phases/phase-b.md @@ -312,7 +312,7 @@ When B6 is Done, run a business review. Phase C becomes active after that review | ADR-0030 | Syscall ABI (includes `IpcError` taxonomy per K2-5) | B5 (**Accepted 2026-05-29**) | was ADR-0028. Settles the register convention (`x8`=number, `x0`–`x5` args, `SVC #0`, `x0`=status) + the dedicated-status-register encoding + `SyscallError` composition + the K2-5 `IpcError` split; drives [T-020](../../analysis/tasks/phase-b/T-020-syscall-error-taxonomy.md) + [T-021](../../analysis/tasks/phase-b/T-021-syscall-dispatch.md) (merged PR #34, `f98e1af`). | | ADR-0031 | Initial syscall set | B5 (**Accepted 2026-05-29**) | was ADR-0029. Fixes the five-syscall v1 set (`send` / `recv` / `task_yield` / `task_exit` / `console_write`; `0` reserved-invalid); numbers `1`–`5` are a fixed ABI decision regression-verified by T-021's host tests, not chosen by the dispatcher. | | ADR-0032 | Endpoint state rollback on `ipc_recv_and_yield` Deadlock + `ipc_cancel_recv` primitive | B2 prep (**Accepted 2026-05-07**) | drove [T-015 (Done 2026-05-07)](../../analysis/tasks/phase-b/T-015-endpoint-rollback-cancel-recv.md) via PR #17. Surfaced as Track A non-blocker in the [2026-05-06 comprehensive review](../../analysis/reviews/code-reviews/2026-05-06-full-tree-comprehensive.md) and a forward-flagged item in the [2026-05-07 B1 closure security review](../../analysis/reviews/security-reviews/2026-05-07-B1-closure.md). Closed before B-phase task lands the first userspace-driven endpoint destroy. 
ADR-0017 §Revision notes rider records the additive recovery primitive (user-observable surface unchanged). | -| ADR-0033 | Kernel high-half migration (kernel reachable from every task AS) | **B6 (placeholder; opens with B6)** | named in [ADR-0027](../../decisions/0027-kernel-virtual-memory-layout.md) §Decision outcome (Option D) as the future home of the `TTBR0_EL1`-swap discipline that arrives with userspace. No file today; **B5 closed via the syscall boundary without surfacing the per-task swap** (B5's `SVC` proxy ran in the bootstrap AS), so the trigger is now **B6** — the first milestone whose userspace AS must keep the kernel reachable so an EL0 task's `SVC` vector fetch translates (see [§B6 opening sequence](#b6-opening-sequence--prerequisites)). The gating B6 prerequisite. Mirrors the slot-naming pattern of ADR-0028 / 0029 / 0030 / 0031. | +| [ADR-0033](../../decisions/0033-kernel-high-half-migration.md) | Kernel high-half migration (kernel → `TTBR1_EL1`, boot-time; reachable from every task AS) | **B6 (Proposed 2026-05-29)** | **filed** to open B6 — the gating prerequisite (an EL0 task's `SVC` vector fetch must translate, impossible while the kernel is identity-only in `TTBR0`). Extends [ADR-0027](../../decisions/0027-kernel-virtual-memory-layout.md) §Decision outcome (Option D) — consumes the reserved `TTBR1`/`EPD1` + byte-stable high-half `TCR` fields; **no supersede**. Boot-time migration (DAIF-masked window, no live low-VA pointers), staged. Drives [T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) (Draft; opens with the Propose commit per [ADR-0025 §Rule 1](../../decisions/0025-adr-governance-amendments.md)). §Simulation hardened against two adversarial verification passes; Option 2 (map-kernel-into-every-TTBR0) is the documented fallback. Awaiting careful-re-read + maintainer Accept.
| | ADR-0034 | Kernel-image section permissions (.text RX / .rodata R / .bss/.data RW) | B-late (placeholder; named-but-unallocated) | named in [ADR-0027 §Decision outcome (a)](../../decisions/0027-kernel-virtual-memory-layout.md) as the future home of finer-grained kernel-image permissions. v1 maps the entire 128 MiB RAM range as kernel R/W/X via 2 MiB blocks; T-016 §Out of scope and [`memory-management.md` §"v1 layout"](../../architecture/memory-management.md) defer the re-map. Opens with the first B-phase task whose threat model includes a kernel R/W of `.text` as a meaningful surface — likely **B6** — the first attacker-observable EL0 execution context (the v1 `hello` is code-only mapped `USER\|EXECUTE`, so ADR-0034 is hardening, not a B6 functional blocker; decide in B6 whether to harden now or defer). | | ADR-0035 | Physical Memory Manager (B3 prerequisite — bitmap allocator) | B3 (**Accepted 2026-05-09**) | new — drove the realisation that B3's "Address space abstraction" milestone has a foundational prerequisite (a real `FrameProvider` impl over physical RAM) which deserves its own ADR rather than being absorbed into ADR-0028 (address-space data structure). Drives [T-017 (Draft 2026-05-09; moves to In Progress with this Accept)](../../analysis/tasks/phase-b/T-017-physical-memory-manager.md). Bitmap allocator with hint pointer; 4 KiB metadata for QEMU virt's 32 K frames; reservation-list at init + cached for `free_frame` defensive validation per the §Simulation §Step 2 Critical row; forward-portable to high-half kernel without algorithm rewrite. Includes the §Simulation table walking init / alloc / free / exhaustion / recovery state transitions per [`write-adr` skill §Simulation](../../../.agents/skills/write-adr/SKILL.md). 
Accept landed as a separate commit per `write-adr` §10 after a careful re-read pass that surfaced and corrected three substantive drafting issues (broken anchor, safe-Rust-vs-`unsafe` zeroing contradiction, muddled "undefined-vs-error" wording in §Simulation row 2; the row-2 fix tightened the Pmm struct contract to add a cached reserved-range list for defensive `free_frame` validation, propagated to T-017). | | ADR-0036 | QEMU virt is GICv2 / no-IOMMU in v1 (corrects ADR-0004 / 0006 / 0012) | post-B1 (**Accepted 2026-05-22**) | new — surfaced by the [2026-05-22 full-tree master review](../../analysis/reviews/master-review/2026-05-22-152729/consolidated.md): the foundational ADRs carried GICv3 / SMMUv3 statements that do not match the GICv2, no-IOMMU reality of QEMU `virt` that B1's GIC work (above) actually assumed. **Corrects** (append-only redirect rider; does **not** supersede) [ADR-0004](../../decisions/0004-target-platforms.md) / [ADR-0006](../../decisions/0006-workspace-layout.md) / [ADR-0012](../../decisions/0012-boot-flow-qemu-virt.md). Ratifies the GICv2 fact stated in the B1 milestone. | From 15a6f23fc486def18db9cf1646f669210906b10d Mon Sep 17 00:00:00 2001 From: Cemil ILIK Date: Sat, 30 May 2026 01:27:50 +0300 Subject: [PATCH 2/7] =?UTF-8?q?docs(adr):=20ADR-0033=20review-round=20?= =?UTF-8?q?=E2=80=94=20ASID=20policy,=20dual=20PA=E2=86=94VA=20offsets,=20?= =?UTF-8?q?early-symbol=20contract,=20AP=20pins?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintainer careful-re-read (pre-Accept) on ADR-0033 + T-022; findings folded into the Proposed draft. Each verified against the tree before fixing. High: - ASID policy pinned (H1): v1 keeps ASID=0 global + activate's existing TLBI-on-swap for correctness; the per-task ASID allocator (AddressSpace::asid + reuse/generation/exhaustion) is a TLB-flush-avoidance optimisation DEFERRED to a future task — not a B6 deliverable. 
(Aligns with the existing mmu.rs activate + the ADR-0028-deferred asid field; removes the "per-task ASID value assignment lands here" overclaim.) - Dual PA↔VA offsets separated (H2): the kernel-image LINK offset (KBASE − KERNEL_IMAGE_PHYS_BASE; linker-symbol→PA) and the PHYSMAP offset (KERNEL_PHYSMAP_BASE; PA-frame deref, the existing phys_frame_kernel_ptr forward-flag) are distinct mappings — conflating them is a bug. §Dependency chain step 2 + T-022 criterion now define both + which-used-where. - Link-high early-symbol contract closed at acceptance level (H3): T-022 now requires pre-jump symbols (__stack_top, __bss_*, .idmap + literal pool) to resolve LOW, kernel_entry as the HIGH br-target, and a relocation/linker-map "no pre-jump high-VMA = build failure" gate. Medium: - High-half layout table gained explicit AP=0b00 (EL1 RW, EL0 no-access) + SH columns + a boot-stack note — making "kernel not leakable to EL0" concrete (UXN=1 blocks execute only; AP[1]=0 is the read/write isolation). - Row-0 barrier order corrected to DSB ISH (publish descriptors) BEFORE MSR TTBR1_EL1 → ISB, so no walk reads a stale descriptor. - §Dependency chain pruned of downstream-consumer steps 7–8 (ADR-0025 §Rule 1: no ungrounded "Phase B6 (separate task)" forward-refs); they are now prose, pointing at phase-b §B6 opening sequence where they are grounded. - M2: T-022 gained a §Review history recording the two adversarial verification passes + this re-read; the ADR §Context wording fixed to match (no longer claims a section that did not exist). Low: - T-022's ADR-0034 reference unlinked (was hyperlinking to the 0027 file; 0034 has no file — matches ADR-0027's placeholder pattern). - ADR §Context notes the B5→B6 milestone shift (ADR-0027 anticipated "when B5 surfaces"; B5 closed as the syscall ABI, the real EL0 task moved to B6). - Added a VA-layout Mermaid (low-TTBR0 user / high-TTBR1 kernel + the two image aliases) per CLAUDE.md #4 / documentation-style. 
- Softened a few charged phrases (measured-tone). (L3 — the #### "Simulation row-to-verification mapping" heading — left as-is: matches the established ADR-0027/0030/0031 pattern, a genuine sub-sub-section.) Status unchanged: Proposed — awaiting careful re-read + maintainer Accept. Refs: ADR-0033, ADR-0027, ADR-0025, ADR-0028 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../phase-b/T-022-high-half-kernel-mapping.md | 21 ++++-- .../0033-kernel-high-half-migration.md | 73 +++++++++++++------ 2 files changed, 66 insertions(+), 28 deletions(-) diff --git a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md index 592ab6a..357fcd4 100644 --- a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md +++ b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md @@ -23,13 +23,15 @@ The migration switches the running kernel's own PC/SP/`VBAR` translation regime ## Acceptance criteria -- [ ] **Link-high/load-low.** The kernel is linked at `KBASE = 0xFFFF_FFFF_8008_0000` (LMA low via linker `AT`); a low-linked, position-independent `.idmap`-style early section holds `boot.s`, the high-half table builder, and the migration trampoline so they resolve `VA == PA` while the MMU is off / identity-only. (Closes the early-`adrp`-computes-high brick ADR-0033 §Consequences names.) -- [ ] **`KERNEL_VA_OFFSET` PA↔VA helper** replaces every `addr_of!`-as-PA site (`mmu_bootstrap` `TTBR` programming, the `__boot_pt_l0` re-read in `kernel_entry`, [`crate::mm::phys_frame_kernel_ptr`](../../../../kernel/src/mm/mod.rs)'s identity body). PA-computation host-tested. +- [ ] **Link-high/load-low + the early-symbol contract.** The kernel is linked at `KBASE = 0xFFFF_FFFF_8008_0000` (LMA low via linker `AT`); a low-linked, position-independent `.idmap`-style early section holds `boot.s`, the high-half table builder, and the migration trampoline so they resolve `VA == PA` while the MMU is off / identity-only. 
Specifically: + - [ ] **Pre-jump symbols resolve LOW.** `boot.s`'s pre-jump references — `__stack_top` (SP setup), `__bss_start`/`__bss_end` (BSS-zero), the `.idmap` code + its literal pool — must resolve to **low** addresses; the early `adrp`/`:lo12:` sites at [`boot.s`](../../../../bsp-qemu-virt/src/boot.s) currently assume `VA == PA`, and under a high link they would compute high VAs with the MMU off → brick. `kernel_entry` is the **high** `BR` target, referenced only at/after the row-2 crossing (when the high regime is live). + - [ ] **Relocation / linker-map gate.** A build-time check (linker-map inspection or a relocation assertion) verifies **no pre-jump instruction references a high VMA** — the "pre-jump high-VMA relocation = build failure" gate. (Closes the early-`adrp`-computes-high brick ADR-0033 §Consequences names.) +- [ ] **Two distinct PA↔VA offsets** ([ADR-0033 §Dependency chain step 2](../../../decisions/0033-kernel-high-half-migration.md#dependency-chain)) — *not* one helper: the **image-link offset** (`KBASE − KERNEL_IMAGE_PHYS_BASE`) for linker-symbol→PA sites (`mmu_bootstrap` `TTBR`/page-table programming, the `__boot_pt_l0` re-read in `kernel_entry`), and the **physmap offset** (`KERNEL_PHYSMAP_BASE`) for PA-frame deref sites ([`crate::mm::phys_frame_kernel_ptr`](../../../../kernel/src/mm/mod.rs)'s body, PMM zero-fill, copy-user). Each offset host-tested; using the wrong offset at a site is a correctness bug the tests must catch. - [ ] **High-half `TTBR1` tables** built per [ADR-0033 §"High-half layout"](../../../decisions/0033-kernel-high-half-migration.md): kernel image (`PXN = 0`/`UXN = 1`), kernel physmap/direct-map (`PXN = 1`), device MMIO — with the vector table + all handler/branch targets inside the `PXN = 0` image window. `vmsav8` high-half encoders host-tested. 
- [ ] **EPD1-cleared `TCR_EL1` constant** in [`tyrne_hal::mmu::vmsav8`](../../../../hal/src/mmu/vmsav8.rs): bit 23 = 0, every `TTBR0`-governing field byte-identical to `TCR_EL1_VALUE`; host-tested. - [ ] **The boot-time migration** runs the ADR-0033 §Simulation rows 0–3: build `TTBR1` (`ISB` after the `TTBR1` write, `DSB ISH` for the table memory) → `EPD1` `1→0` + `ISB` → trampoline (`VBAR`-high + `ISB`, `SP`-high, `LDR`/`BR` to the `PXN = 0` high continuation, `DAIF` masked) → `TTBR0`-null + `EPD0 = 1` + `ISB` + `TLBI VMALLE1` + `DSB ISH` + `ISB`. A new `tyrne: high-half active` boot marker prints after the jump. -- [ ] **Per-task `TTBR0_EL1` swap goes live:** [`QemuVirtMmu::activate`](../../../../bsp-qemu-virt/src/mmu.rs) drives the real per-task swap with per-task ASID values (`A1 = 0`, ASID in `TTBR0_EL1.ASID`); the [T-018](T-018-address-space-kernel-object.md) `activate` differ-path that short-circuits in v1 now fires. Host test pins the differ path with distinct ASes. -- [ ] **Audit:** new entries (trampoline asm; per-task `TTBR0` swap; `KERNEL_VA_OFFSET` deref) + Amendments to UNSAFE-2026-0022 / 0023 / 0024; the 0023/0024 `Pending QEMU smoke verification` riders lifted. +- [ ] **Per-task `TTBR0_EL1` swap goes live:** [`QemuVirtMmu::activate`](../../../../bsp-qemu-virt/src/mmu.rs) drives the real per-task swap; the [T-018](T-018-address-space-kernel-object.md) `activate` differ-path that short-circuits in v1 now fires. **v1 keeps `ASID = 0` global + `activate`'s existing `TLBI`-on-swap for correctness — NO per-task ASID allocator** (the `AddressSpace::asid` field + reuse/generation/exhaustion policy are a TLB-flush-avoidance optimisation deferred to a future task per [ADR-0033 §"ASID policy"](../../../decisions/0033-kernel-high-half-migration.md)). Host test pins the differ path with distinct roots. 
+- [ ] **Audit:** new entries (trampoline asm; per-task `TTBR0` swap; the physmap-offset frame deref) + Amendments to UNSAFE-2026-0022 / 0023 / 0024; the 0023/0024 `Pending QEMU smoke verification` riders lifted. - [ ] **All gates green** incl. `cargo +nightly miri test --workspace --exclude tyrne-bsp-qemu-virt`. **QEMU smoke:** full demo to `tyrne: all tasks complete` with the new `tyrne: high-half active` line; `-d int,unimp,guest_errors` shows **zero new Translation/Permission fault classes** (the migration is fault-clean) — the row-4 abort gate. ## Out of scope @@ -38,7 +40,7 @@ The migration switches the running kernel's own PC/SP/`VBAR` translation regime - `task_create_from_image` (`LoadedImage` → runnable `CapHandle{CapObject::Task(...)}`) — Phase B6. - The per-task `console_write` window + per-page user-VA→kernel-VA translation (T-021 gate #1) and the `SYSCALL_STUB_TABLE` → current-task-table swap (gate #3) — Phase B6. - `tyrne-user` + `userland/hello` + the build pipeline — Phase B6. -- Per-section kernel-image permissions (`.text` RX / `.rodata` R / `.data` RW) — [ADR-0034](../../../decisions/0027-kernel-virtual-memory-layout.md) placeholder; v1 high-half image is RWX-equivalent like the identity map it replaces. +- Per-section kernel-image permissions (`.text` RX / `.rodata` R / `.data` RW) — ADR-0034 (placeholder, no file yet — reserved in [ADR-0027 §Dependency chain](../../../decisions/0027-kernel-virtual-memory-layout.md)); v1 high-half image is RWX-equivalent like the identity map it replaces. ## Approach @@ -46,4 +48,11 @@ _(Settled at the ADR level — see [ADR-0033 §Simulation](../../../decisions/00 ## Definition of done -All acceptance criteria checked; gates green (incl. 
Miri); audit-log entries + Amendments added; `current.md` updated; **security-relevant — flagged for explicit security review** per [CLAUDE.md #1](../../../../CLAUDE.md) (this changes the kernel's own translation regime and the kernel/user isolation boundary — the highest-stakes change in the project so far). +All acceptance criteria checked; gates green (incl. Miri); audit-log entries + Amendments added; `current.md` updated; **security-relevant — flagged for explicit security review** per [CLAUDE.md #1](../../../../CLAUDE.md) (this changes the kernel's own translation regime and the kernel/user isolation boundary — among the highest-stakes changes in the project so far). + +## Review history + +- **2026-05-29 — Draft opened alongside [ADR-0033](../../../decisions/0033-kernel-high-half-migration.md) (Proposed)**, in the same commit per [ADR-0025 §Rule 1](../../../decisions/0025-adr-governance-amendments.md). Before authoring, ADR-0033's boot-time high-half §Simulation was hardened against **two independent adversarial verification passes** (multi-agent; each lens attacking the transition for a kernel-bricking flaw): + - **Pass 1** (against a *mid-kernel* migration framing) caught an **architecturally-impossible** trampoline — "a page mapped identically in both regimes (`VA == PA`) reachable while translation flips," impossible because with `T0SZ = T1SZ = 16` the `TTBR0` (low, `VA[55]=0`) and `TTBR1` (high, `VA[55]=1`) input ranges are disjoint, so no VA is served by both. Corrected mechanism: the PC physically crosses low→high at the `br`, both `TTBR`s live, the low `.idmap` as the source regime. The pass also surfaced the live-kernel hazards (`DAIF` unmasked, surviving low-VA `StaticCell` pointers, live IRQ) that motivated the **boot-time** framing.
+ - **Pass 2** (against the corrected *boot-time* §Simulation) confirmed the crossing is sound *in principle* (disjoint regimes; the `vmsav8` encoders set `AF = 1` + `PXN`-per-flags) and pinned the preconditions now in the ADR: the `.idmap` link-split + the two PA↔VA offsets (which do not exist yet — hence this task), the `PXN = 0`-image-window `br` target, the `DSB`-before-`MSR` / `ISB` ordering, the `EPD1`-cleared `TCR` constant, and "the migration must complete before any `DAIF` unmask." +- **2026-05-30 — maintainer careful-re-read (pre-Accept) on ADR-0033 + T-022.** Findings folded into the Proposed draft before Accept: v1 **ASID policy** pinned (`ASID = 0` + flush-on-swap; per-task allocator deferred — H1); the **two distinct PA↔VA offsets** (image-link vs physmap) separated (H2); this task's **link-high early-symbol contract** + relocation/linker-map gate added (H3); the ADR's high-half layout table gained explicit **`AP = 0b00` / `SH`** columns (kernel-not-leakable-to-EL0 made concrete — M); the row-0 **`DSB`-before-`MSR`** barrier order corrected (M); the ADR §Dependency chain pruned of downstream-consumer steps per ADR-0025 §Rule 1 (M); a VA-layout Mermaid + the B5→B6 milestone-shift note + the ADR-0034 link/tone fixes (L). The §Simulation row-to-verification mapping is filled when this task moves to `In Progress`. diff --git a/docs/decisions/0033-kernel-high-half-migration.md b/docs/decisions/0033-kernel-high-half-migration.md index e32f7be..4db7e7c 100644 --- a/docs/decisions/0033-kernel-high-half-migration.md +++ b/docs/decisions/0033-kernel-high-half-migration.md @@ -10,9 +10,9 @@ Milestone B6 ("first userspace hello") must run a real EL0 task: a separate bina Today the loader's userspace address space ([`task_loader.rs`](../../kernel/src/obj/task_loader.rs)) holds **only** the image + stack mappings — no kernel mappings. The MMU runs identity-only in `TTBR0_EL1` ([ADR-0027](0027-kernel-virtual-memory-layout.md)): `TTBR1_EL1 = 0`, `TCR_EL1.EPD1 = 1`. 
The moment a real EL0 task is dispatched with its own `TTBR0_EL1`, an `SVC` (or any exception) vectors the CPU to `VBAR_EL1` and **fetches the trampoline instruction** — which lives at a kernel physical address that is **not mapped in that task's `TTBR0_EL1`**. The result is a translation fault on the vector fetch, recursively, with no recovery. B5's syscall boundary smoke worked only because the EL1 kernel-stub ran in the bootstrap address space, where the kernel is identity-mapped; a real EL0 task has no such luxury. [phase-b §B6](../roadmap/phases/phase-b.md#milestone-b6--first-userspace-hello) states it plainly: *"Nothing in B6 runs until this is solved."* -[ADR-0027 §Decision outcome (a)](0027-kernel-virtual-memory-layout.md) anticipated exactly this moment and **signposted the answer**: a high-half kernel. It reserved `TTBR1_EL1` (with `EPD1 = 1`), pre-committed the high-half-friendly `TCR_EL1` fields (`TG1 = 0b10`, `T1SZ = 16`, `IRGN1`/`ORGN1`/`SH1` already cacheable-inner-shareable), and named ADR-0033 as the home of "the `TTBR0_EL1`-swap discipline that arrives with userspace." This ADR settles that migration: **how the running kernel moves from its identity/low mapping to a high-half (`TTBR1_EL1`) mapping, so the kernel is present in every address space's high half while `TTBR0_EL1` is freed for per-task userspace.** +[ADR-0027 §Decision outcome (a)](0027-kernel-virtual-memory-layout.md) anticipated exactly this moment and **signposted the answer**: a high-half kernel. It reserved `TTBR1_EL1` (with `EPD1 = 1`), pre-committed the high-half-friendly `TCR_EL1` fields (`TG1 = 0b10`, `T1SZ = 16`, `IRGN1`/`ORGN1`/`SH1` already cacheable-inner-shareable), and named ADR-0033 as the home of "the `TTBR0_EL1`-swap discipline that arrives with userspace." 
This ADR settles that migration: **how the running kernel moves from its identity/low mapping to a high-half (`TTBR1_EL1`) mapping, so the kernel is present in every address space's high half while `TTBR0_EL1` is freed for per-task userspace.** (ADR-0027 framed this as opening "when B5 userspace work surfaces the per-task `TTBR0_EL1` swap"; in practice B5 closed as the syscall ABI + the EL1-stub `+0x200` proxy, and the real EL0 task — hence the per-task swap — moved to B6, so this migration opens at **B6**, not B5.) -The stakes are the highest in the project so far. The migration switches the running kernel's own instruction-fetch, stack, and exception-vector translation regime *mid-flight*; a single wrong step (an unmapped fetch, a stale TLB entry, a surviving low-VA pointer, a relocation that resolves to the wrong half) bricks the kernel silently with no recovery path. This ADR was drafted with two independent adversarial verification passes against its §Simulation (recorded in T-022's review history); the §Simulation below is the hardened result, and the §Dependency chain is explicit that the migration requires **new infrastructure** (a link-high/load-low linker discipline, a position-independent low-linked early-boot section, a `KERNEL_VA_OFFSET` PA↔VA helper) that does not exist in the tree today. +The stakes are high: the migration switches the running kernel's own instruction-fetch, stack, and exception-vector translation regime *mid-flight*; a single wrong step (an unmapped fetch, a stale TLB entry, a surviving low-VA pointer, a relocation that resolves to the wrong half) halts the kernel unrecoverably. 
The §Simulation below was hardened against two independent adversarial verification passes during drafting — the first caught and corrected an architecturally impossible "trampoline mapped in both regimes" step (impossible with disjoint `TTBR0`/`TTBR1` input ranges); both passes are recorded in [T-022 §Review history](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md). The §Dependency chain is explicit that the migration requires **new infrastructure** that does not exist in the tree today — a link-high/load-low linker discipline, a position-independent low-linked early-boot section, and the **two distinct PA↔VA offsets** (the kernel-image *link* offset and the *physmap*/direct-map offset; see §"High-half layout" + §Dependency chain step 2 — conflating them is a bug). ## Decision drivers @@ -35,12 +35,14 @@ Chosen option: **Option 1 — high-half kernel migration, performed at boot time High-half is the structural kernel/user separation the project's high-assurance positioning requires (driver 1) and the direction [ADR-0027](0027-kernel-virtual-memory-layout.md) already signposted and pre-paid for (driver 2) — adopting Option 2 as the *end-state* would silently override that Accepted decision and force a supersede. Option 3 blocks the entire B6 milestone ([phase-b §B6](../roadmap/phases/phase-b.md#milestone-b6--first-userspace-hello)) and is recorded only to reject it honestly. -Two refinements make the choice safe and methodical rather than reckless: +Two refinements make the choice safe and methodical: - **Boot-time, not mid-kernel.** The migration runs inside the bootstrap window (within / immediately after [`mmu_bootstrap`](../../bsp-qemu-virt/src/mmu_bootstrap.rs), before the kernel's `StaticCell`s are written and before the GIC/timer is initialised). 
This is decisive for risk: an adversarial review of a *mid-kernel* migration found the dominant bricking hazards came from migrating a **live** kernel — `DAIF` unmasked, surviving low-VA `StaticCell` pointers, a live timer IRQ. At boot all three evaporate by construction: `DAIF` is masked from `_start` (`boot.s` `msr daifset, #0xf`, `SPSR_EL2 = 0x3c5`), no `StaticCell` holds a low VA yet (`kernel_entry` writes them *after* the migration returns, so they store high VAs), and no interrupt source is live. What remains is the irreducible core of any high-half jump — the relocation discipline and the `br` that crosses regimes — handled in the controlled boot window. - **Staged, not bundled.** [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) lands the migration **alone** (relink, the low-linked early-boot section, the high-half table builder, the trampoline, the `KERNEL_VA_OFFSET` PA↔VA helper, the per-task `TTBR0` swap going live) and is reviewed on its own. B6's EL0-entry / `task_create_from_image` / carry-forward-gate / `userland` tasks then build on the settled high-half regime, satisfying [CLAUDE.md #6](../../CLAUDE.md). -`TCR_EL1.A1` stays `0` and the ASID stays in `TTBR0_EL1.ASID`: the kernel moves to `TTBR1` and the **user**-half stays on `TTBR0`, so the `A1 = 0 → 1` flip [ADR-0027 §"ASID"](0027-kernel-virtual-memory-layout.md) conditionally named (only for a *TTBR1-swap user-half*) does **not** apply here. Per-task ASID *value* assignment lands here as the per-task `TTBR0` swap (the T-018 `activate` differ-path that short-circuits in v1) goes live. +`TCR_EL1.A1` stays `0` and the ASID stays in `TTBR0_EL1.ASID`: the kernel moves to `TTBR1` and the **user**-half stays on `TTBR0`, so the `A1 = 0 → 1` flip [ADR-0027 §"ASID"](0027-kernel-virtual-memory-layout.md) conditionally named (only for a *TTBR1-swap user-half*) does **not** apply here. 
+ +**ASID policy (v1) — `ASID = 0` global + flush-on-swap; the allocator is deferred.** v1 keeps **`ASID = 0` globally** (per [ADR-0027 §"ASID"](0027-kernel-virtual-memory-layout.md)). Correctness across the per-task `TTBR0` swap comes from a **TLB flush on every swap**, not from ASID-tagging: [`QemuVirtMmu::activate`](../../bsp-qemu-virt/src/mmu.rs) already issues `TLBI` + `DSB ISH` after writing `TTBR0_EL1`, so the swap going live needs no new ASID machinery — only the [T-018](../analysis/tasks/phase-b/T-018-address-space-kernel-object.md) differ-path firing for distinct roots. A **real per-task ASID allocator** — the `AddressSpace::asid` field [ADR-0028 forward-flagged](0028-address-space-data-structure.md), plus a reuse / generation / exhaustion policy and the resulting `TLBI`-avoidance — is a **TLB-flush-avoidance optimisation, not a B6 correctness requirement** (v1's single userspace task gains nothing from it) and is **deferred** to a future task/ADR when multi-task TLB pressure surfaces. T-022 therefore does **not** add the `asid` field; it keeps `ASID = 0` + flush-on-swap, and this ADR does **not** make per-task ASID assignment a B6 deliverable. Option 2 is recorded as a **credible, non-strawman alternative** — it deletes the entire bricking-hazard family and is the lighter continuation of the shipped architecture. It is rejected as the *end-state* (the standing per-descriptor AP invariant inside a user-reachable regime plus the transient-execution exposure outweigh the one-time migration risk for a kernel that markets itself high-assurance), but it **remains the documented fallback** if T-022's link-split / position-independence discipline proves intractable on the toolchain (see §Consequences → Negative). 
@@ -48,11 +50,28 @@ Option 2 is recorded as a **credible, non-strawman alternative** — it deletes Mirroring [ADR-0027 §Decision outcome (a)](0027-kernel-virtual-memory-layout.md)'s enumeration discipline, the high-half root populates exactly three regions (4 KiB granule, 48-bit VA, `T1SZ = 16` ⇒ `TTBR1` serves `VA[55] = 1`): -| Region | High VA → PA | Attrs | -|--------|--------------|-------| -| Kernel image | `[KBASE .. KBASE+image_size)` → `[0x4008_0000 ..)` (`KBASE = 0xFFFF_FFFF_8008_0000`) | normal-cached, `AF = 1`, `nG = 0` (global), **`PXN = 0` / `UXN = 1`** (EL1-executable, EL0 no-exec). **The vector table and every handler/branch target must fall inside this `PXN = 0` window**, not the physmap alias below. | -| Kernel physmap (direct map) | a high window → all RAM PA `[0x4000_0000 .. 0x4800_0000)` | normal-cached, `AF = 1`, global, **`PXN = 1` / `UXN = 1`** (data only — PMM frames, page tables, copy-user buffers accessed by PA). | -| Device MMIO | a high window → `[0x0800_0000 .. 0x0920_0000)` | device-nGnRnE, `AF = 1`, `PXN = 1` / `UXN = 1` (UART + GIC after the jump). | +```mermaid +graph LR + subgraph T0["TTBR0_EL1 — low half (VA[55]=0)"] + U["per-task userspace
image + stack
AP allows EL0; per-task root"] + end + subgraph T1["TTBR1_EL1 — high half (VA[55]=1) — AP=0b00, EL0 no-access, present for every task"] + KI["kernel image alias
PXN=0 — executable
(vectors + handlers + br target HERE)"] + PM["physmap / direct-map
PXN=1 — data only
(PMM frames, page tables, copy-user)"] + MM["device MMIO
device-nGnRnE, PXN=1"] + end +``` + +| Region | High VA → PA | Access (`AP`) | Exec (`PXN`/`UXN`) | Mem type / `SH` | +|--------|--------------|---------------|--------------------|-----------------| +| Kernel image (`.text`/`.rodata`/`.bss` + boot stack) | `[KBASE .. KBASE+image_size)` → `[0x4008_0000 ..)` (`KBASE = 0xFFFF_FFFF_8008_0000`) | **`AP = 0b00`** — EL1 RW, **EL0 no-access** | **`PXN = 0` / `UXN = 1`** (EL1-exec, EL0 no-exec) | normal-cached, `SH = 0b11` (inner-shareable), `AF = 1`, `nG = 0` | +| Kernel physmap (direct map) | a high window → all RAM PA `[0x4000_0000 .. 0x4800_0000)` | **`AP = 0b00`** — EL1 RW, EL0 no-access | **`PXN = 1` / `UXN = 1`** (data — PMM frames, page tables, copy-user buffers by PA) | normal-cached, `SH = 0b11` (inner-shareable), `AF = 1`, `nG = 0` | +| Device MMIO | a high window → `[0x0800_0000 .. 0x0920_0000)` | **`AP = 0b00`** — EL1 RW, EL0 no-access | `PXN = 1` / `UXN = 1` | device-nGnRnE, `SH = 0b00` (non-shareable), `AF = 1`, `nG = 0` | + +Two pins this table makes load-bearing: + +- **`AP = 0b00` on every kernel region is what makes "the kernel is not leakable to EL0" concrete** — not merely the kernel living in `TTBR1`. `UXN = 1` blocks EL0 *execute* only; EL0 *read/write* isolation is the `AP[1] = 0` (EL0-no-access) encoding. While an EL0 task runs, `TTBR1` is the active high-half regime, so an EL0 access to any high VA must fault on `AP` — the structural-absence claim rests on this bit. (`AP[2] = 0` keeps EL1 read-write; per-section read-only hardening is the job of the reserved ADR-0034, which has no file yet and is forward-flagged in [ADR-0027](0027-kernel-virtual-memory-layout.md).) +- **The boot stack lives in the kernel-image region** (`.bss`-resident `__stack_top`), so it is RW and — like the whole v1 image — `PXN = 0`. Per-section discipline (`.text` RX, `.rodata` R, `.bss`/stack RW-`NX`) is the deferred job of the reserved ADR-0034 (no file yet; forward-flagged in [ADR-0027](0027-kernel-virtual-memory-layout.md)); v1 maps the whole image uniformly, exactly as the identity map it replaces did. The §Simulation row-2 `SP`-rebase targets this region. 
The kernel image is therefore reachable at two high VAs — the executable image alias (`PXN = 0`) and the physmap alias (`PXN = 1`). **The migration's branch target and `VBAR_EL1` must resolve into the `PXN = 0` image window**; a target landing in the `PXN = 1` physmap alias is an execute-never permission fault on the first high fetch (a correctness pin T-022 must hold, surfaced by the §Simulation review). @@ -62,9 +81,9 @@ The worst-case boot-time transition — the running kernel switching its own PC/ | Step | State pre | Action | State post | Observable / verification | |------|-----------|--------|------------|---------------------------| -| 0 | Low-linked early boot running at PA `0x4008_NNNN` (MMU on, identity `TTBR0`); `TTBR1 = 0`; `EPD1 = 1`; `DAIF` masked. Image relinked at `KBASE` but loaded at PA `0x4008_0000`. | Build the high-half `TTBR1` tables (the three regions above) in reserved frames, writing descriptors via the host-tested `vmsav8` encoders + `write_volatile` on `*mut u64` whose target PAs are computed by `KERNEL_VA_OFFSET` (**not** `addr_of!`-as-PA — see §Dependency chain). `MSR TTBR1_EL1, `; **`ISB`** (context-synchronises the `TTBR1` register write); `DSB ISH` (orders the table-memory writes for the walker). | `TTBR1` populated + synchronised; `EPD1` still `1`, so no high VA translates yet; PC/SP/`VBAR` still low (identity unchanged). | T-022 `vmsav8` high-half encoder host tests + UNSAFE-2026-0022 Amendment (descriptor writes into high-half frames). | +| 0 | Low-linked early boot running at PA `0x4008_NNNN` (MMU on, identity `TTBR0`); `TTBR1 = 0`; `EPD1 = 1`; `DAIF` masked. Image relinked at `KBASE` but loaded at PA `0x4008_0000`. 
| Build the high-half `TTBR1` tables (the three regions above) in reserved frames, writing descriptors via the host-tested `vmsav8` encoders + `write_volatile` on `*mut u64` whose target PAs are computed from the kernel-image **link offset** (`symbol_VA − KERNEL_IMAGE_LINK_OFFSET`), **not** `addr_of!`-as-PA — see §Dependency chain step 2. Barrier order: **`DSB ISH`** (publish the descriptor writes to the table walker — *before* the walker can be enabled) → `MSR TTBR1_EL1, xN` (`xN` = the high-half root-table PA) → **`ISB`** (context-synchronise the register write). The `DSB` precedes the `MSR` (and necessarily the row-1 `EPD1` clear), so no walk can read a stale/zero descriptor. | `TTBR1` populated + synchronised; `EPD1` still `1`, so no high VA translates yet; PC/SP/`VBAR` still low (identity unchanged). | T-022 `vmsav8` high-half encoder host tests + UNSAFE-2026-0022 Amendment (descriptor writes into high-half frames). | | 1 | `TTBR1` populated, `EPD1 = 1`, kernel executing low. | `MSR TCR_EL1, xN` (`xN` = the `EPD1`-cleared `TCR_EL1` constant) — only `EPD1` `1 → 0`; **every `TTBR0`-governing field (`T0SZ`/`EPD0`/`TG0`/`IRGN0`/`ORGN0`/`SH0`) byte-identical to the live `TCR_EL1_VALUE`** (a new pinned constant; perturbing any `TTBR0` field faults the *next* low fetch). `ISB`. (No pre-flip `TLBI` of the high range: with `EPD1 = 1` a `TTBR1` walk faults and the architecture caches no result, so there is nothing stale to drop — the §Simulation review corrected the earlier "pre-flip TLBI" rationale.) | Both regimes live simultaneously: low identity (`TTBR0`) **and** high (`TTBR1`). The ranges are disjoint (`VA[55] = 0` low / `= 1` high), so coexistence is sound. PC/SP/`VBAR` still low. | UNSAFE-2026-0023 Amendment (`EPD1`-clear `MSR`, same EL1-sysreg class as the bootstrap block). | -| 2 | Dual-live; `EPD1 = 0`; PC/SP/`VBAR` low; `DAIF` masked. 
| **The crossing.** Executing from the low-linked `.idmap` (PC-relative-safe, `VA == PA` under `TTBR0`): (1) `MSR VBAR_EL1, ` + `ISB` — high vectors live **before** the branch, so any synchronous fault on the first high fetch vectors to the `TTBR1`-mapped handler; (2) rebase `SP` to the high-VA boot stack (mapped `PXN`-irrelevant RW in `TTBR1`); (3) `LDR xN, =` (literal in the `.idmap` section so it resolves correctly while low) → `BR xN` — PC physically crosses from a low `.idmap` VA to a **`PXN = 0` image-window** high VA. The low idmap stays live (TTBR0 not yet nulled) as a safety net; the window takes no exception (`DAIF` masked; the few instructions cannot fault if the high image window is correctly populated). | PC/SP/`VBAR` resolve high via `TTBR1`. | **NEW** UNSAFE-YYYY-NNNN (the absolute-jump trampoline asm; invariants: `.idmap` low-linked + PIC, literal pool in `.idmap`, target in the `PXN = 0` window, `VBAR`-high-before-`br`, `SP`-high-mapped-before-`br`, `DAIF` masked). | +| 2 | Dual-live; `EPD1 = 0`; PC/SP/`VBAR` low; `DAIF` masked. | **The crossing.** Executing from the low-linked `.idmap` (PC-relative-safe, `VA == PA` under `TTBR0`): (1) `MSR VBAR_EL1, xN` (`xN` = the high-VA vector-table base) + `ISB` — high vectors live **before** the branch, so any synchronous fault on the first high fetch vectors to the `TTBR1`-mapped handler; (2) rebase `SP` to the high-VA boot stack (`__stack_top` in the **kernel-image region**, `AP = 0b00` RW in `TTBR1`); (3) `LDR xN, =high_continuation` (`high_continuation` is an illustrative name for the high-linked continuation label; the literal lives in the `.idmap` section so it resolves correctly while low) → `BR xN` — PC physically crosses from a low `.idmap` VA to a **`PXN = 0` image-window** high VA. The low idmap stays live (TTBR0 not yet nulled) as a safety net; the window takes no exception (`DAIF` masked; the few instructions cannot fault if the high image window is correctly populated). | PC/SP/`VBAR` resolve high via `TTBR1`. 
| **NEW** UNSAFE-YYYY-NNNN (the absolute-jump trampoline asm; invariants: `.idmap` low-linked + PIC, literal pool in `.idmap`, target in the `PXN = 0` window, `VBAR`-high-before-`br`, `SP`-high-mapped-before-`br`, `DAIF` masked). | | 3 | PC/SP/`VBAR` high; low idmap still live. **No live low-VA pointer exists** — `StaticCell`s are unwritten (`kernel_entry` writes them after the migration returns, storing high VAs); the only low references were in `.idmap`, which the PC has left. | `MSR TTBR0_EL1, xzr`; set `TCR_EL1.EPD0 = 1`; `ISB`; `TLBI VMALLE1`; `DSB ISH`; `ISB` (registers only — no table-memory mutation, so no `DSB` *before* the `TLBI` is required). | Final high-half steady state: kernel on `TTBR1` (`EPD1 = 0`); `TTBR0` free/null for per-task userspace (`EPD0 = 1` until a task AS activates); stale low translations flushed. Control returns to the high-linked `kernel_entry`; `StaticCell` init + GIC + PMM + loader + demo all run high. A real EL0 task's `TTBR0` carries only its own user mappings; its `SVC` vector fetch goes to the high `VBAR` mapped in `TTBR1` (present for every task), so `+0x400` + the EL1 handler translate. | UNSAFE-2026-0023 + 0024 Amendments (`TTBR0`-null/`EPD0`-set + post-flip `TLBI`) + **NEW** entry for the per-task `TTBR0` swap going live; T-018 `activate`-differ host test. | | 4 | Any step's precondition violated. | **Abort discipline.** A boot-time regime switch has **no runtime rollback** — once row 2's `br` executes, row 3 destroys the low regime. Safety is therefore *design-time* (per-region table verified, `.idmap` link-split, the ordering + `PXN`-window pins above) **plus** the QEMU smoke gate: a wrong step fail-stops (hangs) before the new `tyrne: high-half active` marker and before `tyrne: all tasks complete`, so the [business master-plan closure-smoke gate](../analysis/reviews/business-reviews/master-plan.md) blocks the merge. | No silent-wrong-kernel ships: a broken migration is a visible boot hang, not a passing build. 
Milestone fallback: Option 2. | The smoke marker + `-d int,unimp,guest_errors` (zero new Translation/Permission faults) is T-022's runtime gate. | @@ -72,7 +91,7 @@ The worst-case boot-time transition — the running kernel switching its own PC/ Per the [`write-adr` skill §Procedure step 5 sub-bullet](../../.agents/skills/write-adr/SKILL.md), each row maps to a verification artefact in [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md), recorded in its review-history row on completion: -- **Row 0** → `vmsav8` high-half encoder host tests (the three-region descriptor encodings, `PXN`/`UXN`/`AF`/`nG` per region) + the `KERNEL_VA_OFFSET` PA-computation host test + UNSAFE-2026-0022 Amendment. +- **Row 0** → `vmsav8` high-half encoder host tests (the three-region descriptor encodings, `AP`/`PXN`/`UXN`/`SH`/`AF`/`nG` per region) + host tests for **both** PA↔VA offsets (the image-link offset and the physmap offset) + UNSAFE-2026-0022 Amendment. - **Row 1** → a host test pinning the `EPD1`-cleared `TCR_EL1` constant is byte-identical to `TCR_EL1_VALUE` except bit 23 + UNSAFE-2026-0023 Amendment. - **Row 2** → the new absolute-jump-trampoline UNSAFE entry + the QEMU smoke showing the `tyrne: high-half active` marker after the jump (the runtime proof the crossing reached the `PXN = 0` window). - **Row 3** → UNSAFE-2026-0023 / 0024 Amendments + the per-task-`TTBR0`-swap UNSAFE entry + the T-018 `activate`-differ host test (now exercised with distinct ASes). @@ -87,9 +106,21 @@ For this decision to be **fully** in effect: (0xFFFF_FFFF_8008_0000), keep LMA low via `AT`, and a low-linked position-independent `.idmap`-style early section for boot.s + the table builder + the trampoline. — T-022 (opens with this ADR) -2. KERNEL_VA_OFFSET PA<->VA helper replacing every `addr_of!`-as-PA site - (mmu_bootstrap TTBR programming, the __boot_pt_l0 re-read in - kernel_entry, crate::mm::phys_frame_kernel_ptr's identity body). — T-022 +2. 
TWO distinct PA<->VA offsets (NOT one "KERNEL_VA_OFFSET" — they are + different mappings, and conflating them is a bug): + - KERNEL_IMAGE_LINK_OFFSET = KBASE - KERNEL_IMAGE_PHYS_BASE + (0xFFFF_FFFF_8008_0000 - 0x4008_0000). A kernel-image symbol's + PA = symbol_VA - KERNEL_IMAGE_LINK_OFFSET. Used to program TTBR / + page-table PAs from linker symbols (replaces mmu_bootstrap's + `l0 as u64` and the __boot_pt_l0 re-read in kernel_entry). + - KERNEL_PHYSMAP_BASE (the direct-map base; the KERNEL_PHYS_BASE that + crate::mm::phys_frame_kernel_ptr already forward-flags). A frame's + kernel VA = KERNEL_PHYSMAP_BASE + (pa - RAM_PHYS_BASE). Used to deref a + PMM frame / page table / copy-user buffer by PA (the + phys_frame_kernel_ptr body). KERNEL_MMIO_BASE is the analogous + device window. + The linker-symbol->PA path uses the IMAGE-link offset; the frame-deref + path uses the PHYSMAP offset. — T-022 3. High-half table builder (the three-region TTBR1 root: image PXN=0, physmap PXN=1, device) — extends the fixed 4-frame 2 MiB-block bootstrap with the physmap/L3 capability it lacks today. — T-022 @@ -98,14 +129,12 @@ For this decision to be **fully** in effect: 5. The migration trampoline (hand-asm: VBAR-high + SP-high + LDR/BR to the PXN=0 high continuation) + the TTBR0-null/EPD0-set teardown. — T-022 6. Per-task TTBR0_EL1 swap going live: QemuVirtMmu::activate drives the - real swap with per-task ASID values (A1=0, ASID in TTBR0_EL1.ASID); the + real swap (ASID = 0 global + its existing TLBI-on-swap — NO per-task + ASID allocator in v1, see §Decision outcome "ASID policy"); the T-018 activate differ-path that short-circuits in v1 now fires. — T-022 -7. EL0-ready Task context register file + enter-EL0/ERET path + per-task - SP_EL1, so a real EL0 task can take the +0x400 trap. — Phase B6 (separate task, builds on T-022) -8. task_create_from_image wrapper + userland/hello + tyrne-user. 
— Phase B6 (deferred) ``` -Steps 1–6 are [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md), opened at `Draft` in the same commit as this ADR per [ADR-0025 §Rule 1](0025-adr-governance-amendments.md). Steps 7–8 are the subsequent B6 tasks that build on the settled high-half regime (the staging that satisfies [CLAUDE.md #6](../../CLAUDE.md)). This ADR **extends, not relitigates**, [ADR-0027](0027-kernel-virtual-memory-layout.md): it consumes the reserved `TTBR1`, the single `EPD1` flip, and the byte-stable high-half `TCR` fields that ADR-0027 pre-committed. +All six steps are [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md), opened at `Draft` in the same commit as this ADR per [ADR-0025 §Rule 1](0025-adr-governance-amendments.md); T-022's review-history row records the §Simulation row-to-verification mapping. **Downstream consumers are *not* prerequisites of this ADR** and so are deliberately absent from the numbered chain above: the EL0-ready `Task` context + enter-EL0/`ERET` path, and `task_create_from_image` + `userland/hello` + `tyrne-user`, are separate B6 tasks opened *after* T-022 (building on the settled high-half regime — the staging that satisfies [CLAUDE.md #6](../../CLAUDE.md)); they are enumerated in [phase-b §B6 opening sequence](../roadmap/phases/phase-b.md#b6-opening-sequence--prerequisites). This ADR **extends, not relitigates**, [ADR-0027](0027-kernel-virtual-memory-layout.md): it consumes the reserved `TTBR1`, the single `EPD1` flip, and the byte-stable high-half `TCR` fields that ADR-0027 pre-committed. ## Consequences @@ -120,7 +149,7 @@ Steps 1–6 are [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping ### Negative - **Substantial new infrastructure with real toolchain risk.** The migration needs a link-high/load-low discipline + a low-linked position-independent `.idmap` early section that **does not exist** today (the linker script is single-base `ORIGIN = 0x4008_0000`, no `AT`, no `.idmap`). 
The hard part, surfaced by the adversarial §Simulation review: under a high link, the early-boot `adrp`/`addr_of!` sites in `boot.s`/`mmu_bootstrap` would compute **high** VAs while running **low** with the MMU off — bricking before `kernel_entry`. *Mitigation:* the entire low-running portion (BSS-zero, SP setup, table build, trampoline) is kept in the low-linked `.idmap` section so it resolves low; the migration trampoline is hand-asm (the compiler cannot be guaranteed to emit position-independent, no-`adrp`-to-high code for arbitrary Rust). **We accept this cost** because it is the irreducible price of the high-assurance end-state, it is bounded and one-time, and it is verified row-by-row by T-022 + the QEMU smoke gate. **If the link-split proves intractable on the LLVM/lld toolchain, the documented fallback is Option 2** (map the kernel into every `TTBR0`) as an explicit interim, deferring the structural boundary — recorded here so the fallback needs no new ADR. -- **The `addr_of!`-as-PA conflation must be broken project-wide.** Every site that today treats a linker symbol as a PA (TTBR programming, the `__boot_pt_l0` re-read, `phys_frame_kernel_ptr`) must compute PAs via `KERNEL_VA_OFFSET`. *Mitigation:* this is the single-helper-body change [memory-management.md](../architecture/memory-management.md) and the UNSAFE-2026-0025/0026/0027/0030 entries already forecast; T-022 lands it once. +- **The `addr_of!`-as-PA conflation must be broken project-wide, with the *right* offset at each site.** Every site that today treats a linker symbol as a PA (TTBR programming, the `__boot_pt_l0` re-read) must use the **image-link** offset; every site that derefs a PA frame (`phys_frame_kernel_ptr`, PMM zero-fill, copy-user) must use the **physmap** offset (§Dependency chain step 2). 
*Mitigation:* the physmap side is the single-helper-body change [memory-management.md](../architecture/memory-management.md) and the UNSAFE-2026-0025/0026/0027/0030 entries already forecast; the image-link side is confined to the early-boot table programming. Using the wrong offset at a site is a correctness bug T-022's host tests pin. - **~2× early-boot asm and ≥3 new/amended audit entries** vs identity ([ADR-0027:79](0027-kernel-virtual-memory-layout.md)). *Mitigation:* the migration is one staged task; the audit surface is enumerated in the §Simulation mapping. - **No runtime rollback.** A half-completed migration cannot recover. *Mitigation:* safety is design-time (verified per-region tables + ordering pins) + the QEMU smoke gate fail-stops a broken migration visibly (row 4). @@ -138,7 +167,7 @@ Steps 1–6 are [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping - **Pro:** Honours ADR-0027's signposted direction with no supersede; consumes the pre-paid `EPD1`/`TCR` reservations. - **Pro:** Frees `TTBR0` for per-task userspace; simplifies the B6 loader (no per-AS kernel injection). - **Pro (boot-time):** Removes the live-kernel bricking hazards (`DAIF`, `StaticCell` pointers, live IRQ). -- **Con:** Requires new link-high/load-low + `.idmap` PIC infrastructure with real toolchain risk; the irreducible jump + relocation discipline is the hardest code in the project so far. +- **Con:** Requires new link-high/load-low + `.idmap` PIC infrastructure with real toolchain risk; the irreducible jump + relocation discipline is among the most delicate code in the project so far. - **Con:** Breaks the `addr_of!`-as-PA conflation project-wide; ~2× early-boot asm; ≥3 audit entries. 
### Option 2 — Map the kernel into every `TTBR0` (rejected as end-state; documented fallback) From db892c1137f6b3f36cc71cae1a759fd2f1344a76 Mon Sep 17 00:00:00 2001 From: Cemil ILIK Date: Sat, 30 May 2026 03:58:35 +0300 Subject: [PATCH 3/7] =?UTF-8?q?docs(adr):=20Accept=20ADR-0033=20=E2=80=94?= =?UTF-8?q?=20kernel=20high-half=20migration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Maintainer careful-re-read complete (review-round 15a6f23 folded the findings into the Proposed draft); per write-adr §10 the Accept lands as its own commit after the re-read. - ADR-0033 Status: Proposed -> Accepted (Date 2026-05-30). - docs/decisions/README.md index row + numbering-gaps note updated. - phase-b ADR ledger row: B6 (Accepted 2026-05-30); drives T-022. - T-022 Status: Draft -> In Progress (ADR-0033 Accepted satisfies its "ADR must be Accepted before code lands" dependency). ADR-0033 extends ADR-0027 (consumes the reserved TTBR1/EPD1 + byte-stable high-half TCR fields; no supersede). Implementation begins in T-022 (the boot-time high-half migration, landed alone + reviewed before B6's EL0 work per the ADR's staging discipline + CLAUDE.md #6). 
Refs: ADR-0033, ADR-0027, ADR-0025 Co-Authored-By: Claude Opus 4.8 (1M context) --- docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md | 2 +- docs/decisions/0033-kernel-high-half-migration.md | 4 ++-- docs/decisions/README.md | 4 ++-- docs/roadmap/phases/phase-b.md | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md index 357fcd4..ff1e4e9 100644 --- a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md +++ b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md @@ -2,7 +2,7 @@ - **Phase:** B - **Milestone:** B6 — First userspace "hello" (this is B6's **gating prerequisite**: making the kernel reachable from every task's active translation so a real EL0 task's `SVC` vector fetch + the EL1 handler translate — the [ADR-0033](../../../decisions/0033-kernel-high-half-migration.md) high-half migration; per [phase-b §B6 opening sequence](../../../roadmap/phases/phase-b.md#b6-opening-sequence--prerequisites)) -- **Status:** Draft +- **Status:** In Progress - **Created:** 2026-05-29 - **Author:** @cemililik (+ Claude Opus 4.8 agent) - **Dependencies:** [ADR-0033](../../../decisions/0033-kernel-high-half-migration.md) — must be `Accepted` before code lands (settles the boot-time high-half §Simulation + the link-high/load-low + `KERNEL_VA_OFFSET` discipline); [ADR-0027](../../../decisions/0027-kernel-virtual-memory-layout.md) (the reserved `TTBR1`/`EPD1` + the byte-stable high-half `TCR` fields this consumes); [T-016](T-016-mmu-activation.md) (the `mmu_bootstrap` + `QemuVirtMmu` + `vmsav8` encoders this extends); [T-018](T-018-address-space-kernel-object.md) (the `activate` differ-path that goes live). 
diff --git a/docs/decisions/0033-kernel-high-half-migration.md b/docs/decisions/0033-kernel-high-half-migration.md index 4db7e7c..38f1f19 100644 --- a/docs/decisions/0033-kernel-high-half-migration.md +++ b/docs/decisions/0033-kernel-high-half-migration.md @@ -1,7 +1,7 @@ # 0033 — Kernel high-half migration -- **Status:** Proposed -- **Date:** 2026-05-29 +- **Status:** Accepted +- **Date:** 2026-05-30 - **Deciders:** @cemililik ## Context diff --git a/docs/decisions/README.md b/docs/decisions/README.md index fc2b3e2..ab0293d 100644 --- a/docs/decisions/README.md +++ b/docs/decisions/README.md @@ -61,11 +61,11 @@ Each ADR contains: | 0030 | [Syscall ABI and userspace error taxonomy (B5)](0030-syscall-abi.md) | Accepted | 2026-05-29 | | 0031 | [Initial syscall set (B5 — `send`/`recv`/`console_write`/`task_yield`/`task_exit`)](0031-initial-syscall-set.md) | Accepted | 2026-05-29 | | 0032 | [Endpoint state rollback on `ipc_recv_and_yield` Deadlock + `ipc_cancel_recv` primitive](0032-endpoint-rollback-and-cancel-recv.md) | Accepted | 2026-05-07 | -| 0033 | [Kernel high-half migration (B6 — kernel → `TTBR1_EL1`, boot-time)](0033-kernel-high-half-migration.md) | Proposed | 2026-05-29 | +| 0033 | [Kernel high-half migration (B6 — kernel → `TTBR1_EL1`, boot-time)](0033-kernel-high-half-migration.md) | Accepted | 2026-05-30 | | 0035 | [Physical Memory Manager (B3 prerequisite — bitmap allocator)](0035-physical-memory-manager.md) | Accepted | 2026-05-09 | | 0036 | [QEMU virt is GICv2 / no-IOMMU in v1; corrects GICv3/SMMUv3 in ADR-0004/0006/0012](0036-qemu-virt-gicv2-no-iommu-v1.md) | Accepted | 2026-05-22 | -> **Numbering gaps.** Slot **0034** is intentionally reserved, not missing: 0034 (kernel-image section permissions) is a named-but-unallocated placeholder forward-flagged in ADR-0027. No file exists for it yet; it opens when the corresponding work surfaces (the first attacker-observable EL0 execution — likely B6). 
(Slot **0033** (high-half migration) was filed `Proposed` on 2026-05-29 to open B6 and is no longer a gap; slots **0030**/**0031** were filed and `Accepted` on 2026-05-29 for B5.) ADR numbers are stable history and are never renumbered. +> **Numbering gaps.** Slot **0034** is intentionally reserved, not missing: 0034 (kernel-image section permissions) is a named-but-unallocated placeholder forward-flagged in ADR-0027. No file exists for it yet; it opens when the corresponding work surfaces (the first attacker-observable EL0 execution — likely B6). (Slot **0033** (high-half migration) was filed `Proposed` on 2026-05-29 to open B6 and `Accepted` on 2026-05-30, and is no longer a gap; slots **0030**/**0031** were filed and `Accepted` on 2026-05-29 for B5.) ADR numbers are stable history and are never renumbered. ## Creating a new ADR diff --git a/docs/roadmap/phases/phase-b.md b/docs/roadmap/phases/phase-b.md index 85a99bf..222fbac 100644 --- a/docs/roadmap/phases/phase-b.md +++ b/docs/roadmap/phases/phase-b.md @@ -312,7 +312,7 @@ When B6 is Done, run a business review. Phase C becomes active after that review | ADR-0030 | Syscall ABI (includes `IpcError` taxonomy per K2-5) | B5 (**Accepted 2026-05-29**) | was ADR-0028. Settles the register convention (`x8`=number, `x0`–`x5` args, `SVC #0`, `x0`=status) + the dedicated-status-register encoding + `SyscallError` composition + the K2-5 `IpcError` split; drives [T-020](../../analysis/tasks/phase-b/T-020-syscall-error-taxonomy.md) + [T-021](../../analysis/tasks/phase-b/T-021-syscall-dispatch.md) (merged PR #34, `f98e1af`). | | ADR-0031 | Initial syscall set | B5 (**Accepted 2026-05-29**) | was ADR-0029. Fixes the five-syscall v1 set (`send` / `recv` / `task_yield` / `task_exit` / `console_write`; `0` reserved-invalid); numbers `1`–`5` are a fixed ABI decision regression-verified by T-021's host tests, not chosen by the dispatcher. 
| | ADR-0032 | Endpoint state rollback on `ipc_recv_and_yield` Deadlock + `ipc_cancel_recv` primitive | B2 prep (**Accepted 2026-05-07**) | drove [T-015 (Done 2026-05-07)](../../analysis/tasks/phase-b/T-015-endpoint-rollback-cancel-recv.md) via PR #17. Surfaced as Track A non-blocker in the [2026-05-06 comprehensive review](../../analysis/reviews/code-reviews/2026-05-06-full-tree-comprehensive.md) and a forward-flagged item in the [2026-05-07 B1 closure security review](../../analysis/reviews/security-reviews/2026-05-07-B1-closure.md). Closed before B-phase task lands the first userspace-driven endpoint destroy. ADR-0017 §Revision notes rider records the additive recovery primitive (user-observable surface unchanged). | -| [ADR-0033](../../decisions/0033-kernel-high-half-migration.md) | Kernel high-half migration (kernel → `TTBR1_EL1`, boot-time; reachable from every task AS) | **B6 (Proposed 2026-05-29)** | **filed** to open B6 — the gating prerequisite (an EL0 task's `SVC` vector fetch must translate, impossible while the kernel is identity-only in `TTBR0`). Extends [ADR-0027](../../decisions/0027-kernel-virtual-memory-layout.md) §Decision outcome (Option D) — consumes the reserved `TTBR1`/`EPD1` + byte-stable high-half `TCR` fields; **no supersede**. Boot-time migration (DAIF-masked window, no live low-VA pointers), staged. Drives [T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) (Draft; opens with the Propose commit per [ADR-0025 §Rule 1](../../decisions/0025-adr-governance-amendments.md)). §Simulation hardened against two adversarial verification passes; Option B (map-kernel-into-every-TTBR0) is the documented fallback. Awaiting careful-re-read + maintainer Accept. 
| +| [ADR-0033](../../decisions/0033-kernel-high-half-migration.md) | Kernel high-half migration (kernel → `TTBR1_EL1`, boot-time; reachable from every task AS) | **B6 (Accepted 2026-05-30)** | **filed** to open B6 — the gating prerequisite (an EL0 task's `SVC` vector fetch must translate, impossible while the kernel is identity-only in `TTBR0`). Extends [ADR-0027](../../decisions/0027-kernel-virtual-memory-layout.md) §Decision outcome (Option D) — consumes the reserved `TTBR1`/`EPD1` + byte-stable high-half `TCR` fields; **no supersede**. Boot-time migration (DAIF-masked window, no live low-VA pointers), staged. Opened [T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) as a Draft with the Propose commit per [ADR-0025 §Rule 1](../../decisions/0025-adr-governance-amendments.md). §Simulation hardened against two adversarial verification passes; Option B (map-kernel-into-every-TTBR0) is the documented fallback. **Accepted 2026-05-30** after the maintainer careful-re-read (review-round folded into the Proposed draft); drives [T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) (In Progress). | | ADR-0034 | Kernel-image section permissions (.text RX / .rodata R / .bss/.data RW) | B-late (placeholder; named-but-unallocated) | named in [ADR-0027 §Decision outcome (a)](../../decisions/0027-kernel-virtual-memory-layout.md) as the future home of finer-grained kernel-image permissions. v1 maps the entire 128 MiB RAM range as kernel R/W/X via 2 MiB blocks; T-016 §Out of scope and [`memory-management.md` §"v1 layout"](../../architecture/memory-management.md) defer the re-map. Opens with the first B-phase task whose threat model includes a kernel R/W of `.text` as a meaningful surface — likely **B6** — the first attacker-observable EL0 execution context (the v1 `hello` is code-only mapped `USER\|EXECUTE`, so ADR-0034 is hardening, not a B6 functional blocker; decide in B6 whether to harden now or defer).
| | ADR-0035 | Physical Memory Manager (B3 prerequisite — bitmap allocator) | B3 (**Accepted 2026-05-09**) | new — drove the realisation that B3's "Address space abstraction" milestone has a foundational prerequisite (a real `FrameProvider` impl over physical RAM) which deserves its own ADR rather than being absorbed into ADR-0028 (address-space data structure). Drives [T-017 (Draft 2026-05-09; moves to In Progress with this Accept)](../../analysis/tasks/phase-b/T-017-physical-memory-manager.md). Bitmap allocator with hint pointer; 4 KiB metadata for QEMU virt's 32 K frames; reservation-list at init + cached for `free_frame` defensive validation per the §Simulation §Step 2 Critical row; forward-portable to high-half kernel without algorithm rewrite. Includes the §Simulation table walking init / alloc / free / exhaustion / recovery state transitions per [`write-adr` skill §Simulation](../../../.agents/skills/write-adr/SKILL.md). Accept landed as a separate commit per `write-adr` §10 after a careful re-read pass that surfaced and corrected three substantive drafting issues (broken anchor, safe-Rust-vs-`unsafe` zeroing contradiction, muddled "undefined-vs-error" wording in §Simulation row 2; the row-2 fix tightened the Pmm struct contract to add a cached reserved-range list for defensive `free_frame` validation, propagated to T-017). | | ADR-0036 | QEMU virt is GICv2 / no-IOMMU in v1 (corrects ADR-0004 / 0006 / 0012) | post-B1 (**Accepted 2026-05-22**) | new — surfaced by the [2026-05-22 full-tree master review](../../analysis/reviews/master-review/2026-05-22-152729/consolidated.md): the foundational ADRs carried GICv3 / SMMUv3 statements that do not match the GICv2, no-IOMMU reality of QEMU `virt` that B1's GIC work (above) actually assumed. 
**Corrects** (append-only redirect rider; does **not** supersede) [ADR-0004](../../decisions/0004-target-platforms.md) / [ADR-0006](../../decisions/0006-workspace-layout.md) / [ADR-0012](../../decisions/0012-boot-flow-qemu-virt.md). Ratifies the GICv2 fact stated in the B1 milestone. | From 6c7502bfdc97fb3c2a26c2e00076ad9edfd9d2e0 Mon Sep 17 00:00:00 2001 From: Cemil ILIK Date: Sat, 30 May 2026 05:22:26 +0300 Subject: [PATCH 4/7] =?UTF-8?q?feat(mmu):=20T-022=20=E2=80=94=20boot-time?= =?UTF-8?q?=20high-half=20kernel=20migration=20(ADR-0033)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Migrate the running kernel from the low identity (TTBR0_EL1) to the high half (TTBR1_EL1) at boot, freeing TTBR0_EL1 for per-task userspace — B6's gating prerequisite (a real EL0 task's SVC vector fetch + EL1 handler must translate; impossible while the kernel is identity-only in TTBR0). Approach (two sound refinements vs ADR-0033's plan, both verified by a multi-lens adversarial pass — the ADR's Option-2 fallback was NOT needed): - Whole-image-high-linked + forced-low ELF entry, instead of a separate low-VMA .idmap section. The image links at KBASE=0xFFFF_FFFF_4008_0000 with LMA low (AT(ADDR-OFFSET)); ENTRY is _start's low PA (_start_phys). Uniform high-linking makes early/low code resolve to LOW load addresses via PC-relative adrp/adr, so no identity-VMA section is required. - One linear high-half offset KERNEL_HIGH_HALF_OFFSET=0xFFFF_FFFF_0000_0000 (VA=OFFSET+PA), instead of two distinct PA<->VA offsets — they coincide by construction, so using the wrong offset at a site is impossible. Host builds define it as 0 (identity), so host tests are unaffected. Boot flow: kernel_entry (LOW) enables the low-identity MMU (mmu_bootstrap), builds the high-half TTBR1 tables + clears EPD1 (high_half_activate), then a migration trampoline installs the high VBAR, rebases SP to the high stack alias, and `br`s into kernel_main_high (HIGH). 
kernel_main_high frees TTBR0 (null + EPD0=1 + TLBI), prints the new `tyrne: high-half active` marker, and runs the rest of bring-up (console/GIC at high MMIO; PMM/AS/loader/IPC/ scheduler) at high-half addresses. v1 maps the whole RAM window PXN=0 (RWX-equivalent, like the identity map it replaces; AP=0b00 keeps EL0 with no access); the distinct PXN=1 physmap is per-section W^X hardening deferred to ADR-0034. HAL: TCR_EL1_VALUE_HIGH_HALF (EPD1 cleared; host test pins the 1-bit delta); KERNEL_HIGH_HALF_OFFSET + phys_to_kernel_va/kernel_va_to_phys (cfg aarch64 / host-identity). Every addr_of!-as-PA and PA-deref site rebased project-wide (mmu.rs walk, pmm.rs zero-fill, phys_frame_kernel_ptr, task_loader overlap preflight, syscall window, kernel_main_high __stack_top/__boot_pt_l0). Verification (all green): 340 host tests; host+kernel clippy -D warnings; cargo fmt --check; release build; cargo +nightly miri test (Stacked Borrows); QEMU smoke boots to `tyrne: all tasks complete` with `tyrne: high-half active`; `-d int,unimp` shows exactly the 2 expected SVC exceptions and ZERO new Translation/Permission/Abort fault classes (fault-clean — ADR-0033 §Simulation row-4 abort gate). The pre-existing release-only console_write status 0x1 quirk reproduces on parent bd39679 — not a T-022 regression. Audit: UNSAFE-2026-0031 (migration trampoline + TTBR0-free) + Amendments to 0022/0023/0024 (high-half table writes / MSR sequence / post-migration TLBI) + 0025/0026/0027/0030 (physmap-rebase derefs). Docs: boot.md §High-half migration; memory-management.md; phase-b §B6 step 1 + ledger; current.md; T-022 acceptance + §Review-history (incl. the 3rd adversarial pass + the 4 findings handled). Adds tools/smoke.sh (non- interactive QEMU smoke for CI/agent use). T-022 -> In Review (security- relevant; awaiting explicit security review). 
Refs: T-022, ADR-0033, ADR-0027, UNSAFE-2026-0031 Co-Authored-By: Claude Opus 4.8 (1M context) --- bsp-qemu-virt/linker.ld | 112 +++++--- bsp-qemu-virt/src/main.rs | 249 ++++++++++++++---- bsp-qemu-virt/src/mmu.rs | 14 +- bsp-qemu-virt/src/mmu_bootstrap.rs | 114 +++++++- bsp-qemu-virt/src/syscall.rs | 24 +- .../phase-b/T-022-high-half-kernel-mapping.md | 31 ++- docs/architecture/boot.md | 14 +- docs/architecture/memory-management.md | 4 +- docs/audits/unsafe-log.md | 35 +++ docs/roadmap/current.md | 2 +- docs/roadmap/phases/phase-b.md | 4 +- hal/src/lib.rs | 4 +- hal/src/mmu/mod.rs | 79 ++++++ hal/src/mmu/vmsav8.rs | 46 +++- kernel/src/mm/mod.rs | 81 ++---- kernel/src/mm/pmm.rs | 17 +- kernel/src/obj/task_loader.rs | 25 +- tools/smoke.sh | 63 +++++ 18 files changed, 709 insertions(+), 209 deletions(-) create mode 100755 tools/smoke.sh diff --git a/bsp-qemu-virt/linker.ld b/bsp-qemu-virt/linker.ld index ff87a1f..fd8aeb8 100644 --- a/bsp-qemu-virt/linker.ld +++ b/bsp-qemu-virt/linker.ld @@ -1,65 +1,88 @@ /* - * Linker script for tyrne-bsp-qemu-virt. + * Linker script for tyrne-bsp-qemu-virt — link-high / load-low (ADR-0033). * - * See docs/decisions/0012-boot-flow-qemu-virt.md for the memory layout - * rationale. In short: QEMU virt loads us at 0x40080000 via -kernel; - * _start must be first in .text; BSS is 8-byte aligned so the BSS-zero - * loop can use 8-byte stores; 64 KiB stack is reserved after .bss and - * named __stack_top at its high end. + * See docs/decisions/0012-boot-flow-qemu-virt.md for the original low layout + * and docs/decisions/0033-kernel-high-half-migration.md + docs/architecture/boot.md + * for the high-half migration this script enables. * - * T-016 (ADR-0027) adds the .boot_pt reservation: 4 × 4 KiB page-aligned - * frames bracketed by __boot_pt_start / __boot_pt_end and individually - * named __boot_pt_l0 / __boot_pt_l1 / __boot_pt_l2_low / __boot_pt_l2_high - * for the bootstrap routine to populate. 
Placed inside the .bss range so - * the existing BSS-zero loop in boot.s pre-zeros all four frames before - * mmu_bootstrap runs. + * The kernel is LINKED at the high-half base KBASE (TTBR1_EL1, VA[55]=1) but + * LOADED at the physical base 0x40080000 (where QEMU `-kernel` places it). The + * load address (LMA) of every section is its virtual address (VMA) minus the + * constant KERNEL_HH_OFFSET — i.e. the whole image is one uniform high-half + * alias of the physical image. This uniformity is load-bearing: + * + * - `_start` and all early-boot code run at the LOW physical address with + * the MMU off. Because the image is high-linked *uniformly*, every + * PC-relative `adrp`/`:lo12:` reference resolves to the LOW (load) address + * at runtime (the high-link offset cancels between any two in-image + * symbols), so early code naturally computes physical addresses without + * per-site offset arithmetic. + * - The ELF entry point is forced LOW via `_start_phys` (= `_start` minus + * KERNEL_HH_OFFSET) so QEMU sets the reset PC to the physical address of + * `_start`, not its high VMA (the MMU is off at reset — a high PC would + * translation-fault immediately). + * - After the boot-time migration (`kernel_entry` → `kernel_main_high`) + * the kernel runs at its high VMAs; PC-relative references then resolve + * HIGH, and stored function pointers (task entries, etc.) — taken AFTER + * the migration in `kernel_main_high` — are high too, so they remain + * reachable once `TTBR0_EL1` is freed for userspace. + * + * T-016 (ADR-0027) reserves the four low-identity bootstrap page-table frames; + * T-022 (ADR-0033) adds the two high-half root frames (__boot_pt_l0_hh / + * __boot_pt_l1_hh). All six live in `.bss` so the `_start` BSS-zero loop + * pre-zeros them before `mmu_bootstrap` / `build_high_half_tables` run. */ -ENTRY(_start) +/* tyrne_hal::KERNEL_HIGH_HALF_OFFSET — kept in sync by a compile-time assert + * in bsp-qemu-virt/src/main.rs.
*/ +KERNEL_HH_OFFSET = 0xFFFFFFFF00000000; +KERNEL_IMAGE_PHYS_BASE = 0x40080000; +KBASE = KERNEL_HH_OFFSET + KERNEL_IMAGE_PHYS_BASE; -MEMORY { - RAM (rwx) : ORIGIN = 0x40080000, LENGTH = 128M -} +/* ELF entry: the LOW physical address of `_start` (MMU is off at reset). */ +ENTRY(_start_phys) SECTIONS { - . = ORIGIN(RAM); + /* Virtual addresses are high-half; load addresses (AT) are VMA minus the + * high-half offset, i.e. the physical image at 0x40080000+. */ + . = KBASE; - .text : { + .text : AT(ADDR(.text) - KERNEL_HH_OFFSET) { KEEP(*(.text.boot)) - /* aarch64 EL1 exception vector table — VBAR_EL1 requires - * 2 KiB (2048-byte) alignment per ARM ARM §D11.2. The - * `.text.vectors` section holds the 16-entry table assembled - * in `src/vectors.s`; KEEP prevents linker GC. */ + /* aarch64 EL1 exception vector table — VBAR_EL1 requires 2 KiB + * (2048-byte) alignment per ARM ARM §D11.2. */ . = ALIGN(2048); KEEP(*(.text.vectors)) *(.text .text.*) - } > RAM + } - .rodata : ALIGN(8) { + .rodata : AT(ADDR(.rodata) - KERNEL_HH_OFFSET) ALIGN(8) { *(.rodata .rodata.*) - } > RAM + } - .data : ALIGN(8) { + .data : AT(ADDR(.data) - KERNEL_HH_OFFSET) ALIGN(8) { *(.data .data.*) - } > RAM + } - .bss : ALIGN(8) { + .bss : AT(ADDR(.bss) - KERNEL_HH_OFFSET) ALIGN(8) { __bss_start = .; *(.bss .bss.*) *(COMMON) - /* Bootstrap page-table frames (T-016 / ADR-0027). + /* Bootstrap page-table frames. * - * Four 4 KiB-aligned frames live inside the .bss range so the - * existing BSS-zero loop in boot.s pre-zeros them before - * mmu_bootstrap populates the descriptors. 
Order matches the - * VMSAv8 hierarchy walked by the bootstrap routine: - * __boot_pt_l0 — L0 root; entry [0] points at L1 - * __boot_pt_l1 — L1; entry [0] → L2_low (MMIO range), - * entry [1] → L2_high (RAM range) - * __boot_pt_l2_low — L2 covering 0x0800_0000..0x0920_0000 - * (GIC + UART; 9 device blocks) - * __boot_pt_l2_high — L2 covering 0x4000_0000..0x4800_0000 - * (kernel + RAM; 64 normal blocks) + * Low-identity regime (T-016 / ADR-0027) — TTBR0_EL1 root chain: + * __boot_pt_l0 — L0 root; entry [0] → L1 + * __boot_pt_l1 — L1; entry [0] → L2_low (MMIO), [1] → L2_high (RAM) + * __boot_pt_l2_low — L2 covering 0x0800_0000..0x0920_0000 (GIC + UART) + * __boot_pt_l2_high — L2 covering 0x4000_0000..0x4800_0000 (kernel + RAM) + * + * High-half regime (T-022 / ADR-0033) — TTBR1_EL1 root chain. The high + * VAs (KERNEL_HH_OFFSET + pa) land at L0[511] and L1[508]/[509]; the two + * L2 tables above are SHARED (a block descriptor's output address is the + * PA, identical whether reached via the low or high VA), so only the L0 + * and L1 roots are new: + * __boot_pt_l0_hh — high L0 root; entry [511] → L1_hh + * __boot_pt_l1_hh — high L1; entry [508] → L2_low, [509] → L2_high */ . = ALIGN(4096); __boot_pt_start = .; @@ -71,15 +94,22 @@ SECTIONS { . = . + 4096; __boot_pt_l2_high = .; . = . + 4096; + __boot_pt_l0_hh = .; + . = . + 4096; + __boot_pt_l1_hh = .; + . = . + 4096; __boot_pt_end = .; . = ALIGN(8); __bss_end = .; - } > RAM + } . = ALIGN(16); . = . + 64K; __stack_top = .; + /* Low physical address of `_start`, used as the forced-low ELF entry. 
*/ + _start_phys = _start - KERNEL_HH_OFFSET; + /DISCARD/ : { *(.comment) *(.note.*) diff --git a/bsp-qemu-virt/src/main.rs b/bsp-qemu-virt/src/main.rs index 4345000..1cad85a 100644 --- a/bsp-qemu-virt/src/main.rs +++ b/bsp-qemu-virt/src/main.rs @@ -32,7 +32,7 @@ use core::mem::MaybeUninit; use core::panic::PanicInfo; use tyrne_hal::{Console, Cpu, FmtWriter, Timer}; -use tyrne_hal::{PhysAddr, VirtAddr, PAGE_SIZE}; +use tyrne_hal::{PhysAddr, VirtAddr, KERNEL_HIGH_HALF_OFFSET, PAGE_SIZE}; use tyrne_kernel::cap::{CapHandle, CapObject, CapRights, Capability, CapabilityTable}; use tyrne_kernel::ipc::{IpcQueues, Message, RecvOutcome}; use tyrne_kernel::mm::{PhysFrameRange, Pmm}; @@ -111,6 +111,13 @@ type BspPmm = Pmm; /// [adr-0012]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0012-boot-flow-qemu-virt.md const PL011_UART_BASE: usize = 0x0900_0000; +// Pin the HAL high-half offset to the literal the linker script (`KBASE = +// KERNEL_HH_OFFSET + KERNEL_IMAGE_PHYS_BASE`) and the migration path assume. +// A drift between the linker's hardcoded value and `tyrne_hal` would silently +// corrupt every high-half VA↔PA computation (ADR-0033 / T-022); fail the build +// instead. +const _: () = assert!(KERNEL_HIGH_HALF_OFFSET == 0xFFFF_FFFF_0000_0000); + // ─── StaticCell ─────────────────────────────────────────────────────────────── // // Task entry functions are `fn() -> !` — they cannot capture environment. @@ -801,46 +808,39 @@ extern "C" { static __boot_pt_l0: [u64; 512]; } -/// First Rust entry after the assembly stub. +/// Low-half boot entry — the `_start` (`boot.s`) branch target. /// -/// Sets up the console, CPU, kernel objects, capability tables, IPC -/// infrastructure, and cooperative scheduler. Registers Task B before Task A -/// so that B runs first and registers as IPC receiver before A sends. -/// Transfers control to the scheduler. This function never returns. 
+/// Runs at the LOW physical alias of the kernel image with the MMU off (the +/// linker forces the ELF entry to the physical address of `_start`; see +/// [`linker.ld`] + [ADR-0033]). It enables the low-identity MMU, builds the +/// high-half (`TTBR1_EL1`) tables, then performs the boot-time high-half +/// migration: install the high vectors, rebase `SP` to the high stack alias, +/// and branch the PC into [`kernel_main_high`] (the high-half image alias). +/// It never returns. /// -/// # Panics +/// Only PC-relative-safe, identity-mapped work happens here (early +/// diagnostics via a throwaway low-MMIO console, the MMU bring-up, the +/// migration asm). Everything that takes a `&'static`/function-pointer +/// address (the `StaticCell` publishes, `create_task`, the scheduler) lives +/// in [`kernel_main_high`] so those absolute addresses resolve HIGH and stay +/// reachable once `TTBR0_EL1` is freed for userspace. /// -/// Panics if any kernel-object allocation or capability-table operation fails. -/// All capacities are statically bounded and the demo uses far fewer objects -/// than the limits, so in practice none of these branches are reachable. +/// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md +/// [`linker.ld`]: https://github.com/HodeTech/Tyrne/blob/main/bsp-qemu-virt/linker.ld #[unsafe(no_mangle)] -#[allow( - clippy::too_many_lines, - reason = "BSP boot sequence is intentionally linear top-to-bottom for auditability — splitting into helpers obscures the order each phase depends on (per docs/standards/bsp-boot-checklist.md)" -)] pub extern "C" fn kernel_entry() -> ! { - // ── Hardware setup ──────────────────────────────────────────────────────── - - // SAFETY: 0x0900_0000 is the well-known QEMU virt PL011 UART MMIO - // base, exclusively owned by this kernel in v1. Audit: UNSAFE-2026-0001. 
- let console = unsafe { Pl011Uart::new(PL011_UART_BASE) }; - // SAFETY: constructed exactly once in kernel_entry; single-core v1. - // See QemuVirtCpu::new # Safety. Audit: UNSAFE-2026-0006. - let cpu = unsafe { QemuVirtCpu::new() }; - - // SAFETY: single-core; no concurrent writer exists before `start()`. + // ── Early diagnostics (low identity) ────────────────────────────────────── + // + // A throwaway console at the LOW PL011 MMIO base (identity-mapped while + // the migration has not yet run). The persistent `CONSOLE` StaticCell is + // constructed with the HIGH device-MMIO alias only after the migration, + // in `kernel_main_high`. + // + // SAFETY: 0x0900_0000 is the well-known QEMU virt PL011 UART MMIO base, + // exclusively owned by this kernel in v1, identity-mapped pre-migration. // Audit: UNSAFE-2026-0001. - unsafe { - (*CONSOLE.0.get()).write(console); - (*CPU.0.get()).write(cpu); - } - - // SAFETY: CONSOLE was written in the block above. Audit: UNSAFE-2026-0001. - let console = unsafe { (*CONSOLE.0.get()).assume_init_ref() }; - // SAFETY: CPU was written in the block above. Audit: UNSAFE-2026-0001. - let cpu = unsafe { (*CPU.0.get()).assume_init_ref() }; - - console.write_bytes(b"tyrne: hello from kernel_main\n"); + let early_console = unsafe { Pl011Uart::new(PL011_UART_BASE) }; + early_console.write_bytes(b"tyrne: hello from kernel_main\n"); // ── Exception vector install — T-012 (must run before mmu_bootstrap) ────── // @@ -882,21 +882,6 @@ pub extern "C" fn kernel_entry() -> ! { ); } - // ── boot_ns snapshot before mmu_bootstrap (T-016 / ADR-0027) ────────────── - // - // `cpu.now_ns()` reads `CNTVCT_EL0` (system register, MMU-independent). - // Sampling it here — before `mmu_bootstrap` — captures the boot-to-end - // baseline that *includes* MMU activation cost (~< 100 µs per - // ADR-0027 §Consequences). This keeps the post-T-016 boot-to-end - // measurement comparable to the pre-T-016 baseline modulo the - // bootstrap-routine addition. 
- let boot_ns = cpu.now_ns(); - // SAFETY: single-core; no concurrent writer exists before `start()`. - // Audit: UNSAFE-2026-0001. - unsafe { - (*BOOT_NS.0.get()).write(boot_ns); - } - // ── MMU activation — T-016 / ADR-0027 ───────────────────────────────────── // // Activates the MMU with the v1 identity-mapped layout per @@ -917,7 +902,149 @@ pub extern "C" fn kernel_entry() -> ! { unsafe { mmu_bootstrap::mmu_bootstrap(); } - console.write_bytes(b"tyrne: mmu activated\n"); + early_console.write_bytes(b"tyrne: mmu activated\n"); + + // ── High-half table build — T-022 / ADR-0033 §Simulation rows 0-1 ───────── + // + // Build the TTBR1_EL1 tables and enable TTBR1 walks (EPD1 1→0). Both + // translation regimes are live on return; the kernel still executes low. + // + // SAFETY: called once, at EL1, after `mmu_bootstrap` (the shared L2 tables + // + the low-identity MMU are live) and before the migration trampoline. + // Audit: UNSAFE-2026-0022 / 0023 (Amendments). + unsafe { + mmu_bootstrap::high_half_activate(); + } + + // ── Boot-time high-half migration — T-022 / ADR-0033 §Simulation row 2 ──── + // + // Install the high VBAR (so a fault on the first high fetch vectors to a + // mapped handler), rebase SP to the high stack alias, and branch the PC + // into the high-half image (`kernel_main_high`). The low identity stays + // live (TTBR0 is freed inside `kernel_main_high`), DAIF is masked (since + // `_start`), and no `StaticCell` holds a low VA yet, so the few pre-`br` + // instructions cannot brick. The high targets are derived by masking the + // (PC-relative-resolved) address to its physical part and OR-ing the + // high-half offset, so the computation is correct regardless of how the + // compiler materialises the symbol addresses. 
+ let high_vbar = + KERNEL_HIGH_HALF_OFFSET | ((core::ptr::addr_of!(tyrne_vectors) as usize) & 0xFFFF_FFFF); + let high_entry = + KERNEL_HIGH_HALF_OFFSET | ((kernel_main_high as *const () as usize) & 0xFFFF_FFFF); + // SAFETY: the absolute-jump migration trampoline (ADR-0033 §Simulation + // row 2). `MSR VBAR_EL1` to the high vector base (mapped PXN=0 in TTBR1) + + // `ISB` so high vectors are live before the branch; `add sp, sp, off` + // rebases SP to the high alias of the same boot stack; `br` crosses the PC + // from the low identity to the high-half image alias. Both regimes are + // live across the branch and DAIF is masked, so the crossing cannot fault. + // `options(noreturn)`: control never returns (kernel_main_high is `-> !`), + // so changing SP here is sound. Audit: UNSAFE-2026-0031. + unsafe { + core::arch::asm!( + "msr vbar_el1, {vbar}", + "isb", + "add sp, sp, {off}", + "br {entry}", + vbar = in(reg) high_vbar, + off = in(reg) KERNEL_HIGH_HALF_OFFSET, + entry = in(reg) high_entry, + options(noreturn), + ); + } +} + +/// High-half kernel main — the migration trampoline's branch target. +/// +/// Entered via `br` from [`kernel_entry`] with the PC, `SP`, and `VBAR_EL1` +/// all resolving through the high half (`TTBR1_EL1`). Frees `TTBR0_EL1` (the +/// low identity) for per-task userspace, then runs the full boot sequence +/// (console / CPU / PMM / address space / loader / IPC / syscall smoke / +/// scheduler) at high-half addresses. Never returns. +/// +/// `#[inline(never)]` + `#[unsafe(no_mangle)]` keep it a stable, addressable +/// symbol — `kernel_entry` takes its address to compute the migration branch +/// target. +/// +/// # Panics +/// +/// Panics if any kernel-object allocation or capability-table operation +/// fails; all capacities are statically bounded and the demo uses far fewer +/// objects than the limits, so in practice none of these branches are +/// reachable. 
+#[unsafe(no_mangle)] +#[inline(never)] +#[allow( + clippy::too_many_lines, + reason = "BSP boot sequence is intentionally linear top-to-bottom for auditability — splitting into helpers obscures the order each phase depends on (per docs/standards/bsp-boot-checklist.md)" +)] +extern "C" fn kernel_main_high() -> ! { + // ── Free TTBR0 — the low identity — T-022 / ADR-0033 §Simulation row 3 ──── + // + // SP was rebased to the high alias by the migration trampoline, so this + // function's frame is already high. Null `TTBR0_EL1`, set `EPD0 = 1` + // (disable TTBR0 walks until a per-task AS activates), and flush stale low + // translations. After this the kernel is structurally absent from the low + // half — `TTBR0_EL1` is free for userspace. + // + // SAFETY: register-only writes (no table-memory mutation, so no `DSB` + // before the `TLBI` is required). `MSR TTBR0_EL1, xzr` + `ISB`; set `EPD0` + // via a read-modify-write of `TCR_EL1` + `ISB`; `TLBI VMALLE1` + `DSB ISH` + // + `ISB` to drop and complete the stale low translations. + // Audit: UNSAFE-2026-0023 / 0024 (Amendments) + UNSAFE-2026-0031. + unsafe { + core::arch::asm!( + "msr ttbr0_el1, xzr", + "isb", + "mrs {t}, tcr_el1", + "orr {t}, {t}, {epd0}", + "msr tcr_el1, {t}", + "isb", + "tlbi vmalle1", + "dsb ish", + "isb", + epd0 = in(reg) (1u64 << 7), + t = out(reg) _, + options(nostack, nomem), + ); + } + + // ── Hardware setup (high-half device MMIO) ──────────────────────────────── + // + // The console + GIC now reach the PL011 / GIC registers through the HIGH + // device-MMIO alias (`phys_to_kernel_va`) — the low identity is gone. + // + // SAFETY: the PL011 UART reached via its high-half alias is the same + // device, exclusively owned in v1. Audit: UNSAFE-2026-0001. + let console = unsafe { Pl011Uart::new(tyrne_hal::phys_to_kernel_va(PL011_UART_BASE)) }; + // SAFETY: constructed exactly once; single-core v1; we are at EL1 (the EL + // drop completed in boot.s). See QemuVirtCpu::new # Safety. 
Audit: UNSAFE-2026-0006. + let cpu = unsafe { QemuVirtCpu::new() }; + + // SAFETY: single-core; no concurrent writer exists before `start()`. + // Audit: UNSAFE-2026-0001. + unsafe { + (*CONSOLE.0.get()).write(console); + (*CPU.0.get()).write(cpu); + } + // SAFETY: CONSOLE / CPU written just above. Audit: UNSAFE-2026-0001. + let console = unsafe { (*CONSOLE.0.get()).assume_init_ref() }; + // SAFETY: as above. Audit: UNSAFE-2026-0001. + let cpu = unsafe { (*CPU.0.get()).assume_init_ref() }; + + console.write_bytes(b"tyrne: high-half active\n"); + + // ── boot_ns snapshot (T-016 / ADR-0027; post-migration per ADR-0033) ────── + // + // `cpu.now_ns()` reads `CNTVCT_EL0` (system register, MMU-independent). + // Sampled just after the high-half migration so the boot-to-end baseline + // measures the high-half steady state; the one-time migration cost (a few + // µs) is excluded — immaterial against the ~ms boot-to-end total. + let boot_ns = cpu.now_ns(); + // SAFETY: single-core; no concurrent writer exists before `start()`. + // Audit: UNSAFE-2026-0001. + unsafe { + (*BOOT_NS.0.get()).write(boot_ns); + } // ── PMM init — T-017 / ADR-0035 ────────────────────────────────────────── // @@ -942,7 +1069,10 @@ pub extern "C" fn kernel_entry() -> ! { // single-core boot-time, no concurrent observer. Same discipline // as the pre-existing `addr_of!(tyrne_vectors)` site. // Audit: UNSAFE-2026-0001 (StaticCell pattern for `PMM`). - let stack_top_addr = core::ptr::addr_of!(__stack_top) as usize; + // `addr_of!` resolves HIGH here (kernel_main_high runs in the high half), + // so convert the symbol's high-half VA back to its PA for the PMM's + // physical-frame reservation (ADR-0033 §Negative — addr_of!-as-PA fix). 
+ let stack_top_addr = tyrne_hal::kernel_va_to_phys(core::ptr::addr_of!(__stack_top) as usize); let stack_top_aligned_up = stack_top_addr.saturating_add(PAGE_SIZE - 1) & !(PAGE_SIZE - 1); let pmm_extent = PhysFrameRange::new(PhysAddr(PMM_EXTENT_START), PhysAddr(PMM_EXTENT_END)); let pmm_reserved = [ @@ -1007,7 +1137,9 @@ pub extern "C" fn kernel_entry() -> ! { // SAFETY: `addr_of!` of an `extern "C"` static is itself safe — no // load happens here, only the symbol's address is taken. let l0_root = { - let pa = core::ptr::addr_of!(__boot_pt_l0) as usize; + // `addr_of!` resolves HIGH (high-half execution); the L0 root is named + // by its PA, so convert back (ADR-0033 §Negative — addr_of!-as-PA fix). + let pa = tyrne_hal::kernel_va_to_phys(core::ptr::addr_of!(__boot_pt_l0) as usize); tyrne_hal::PhysFrame::from_aligned(PhysAddr(pa)) .expect("L0 root must be 4 KiB-aligned per linker.ld `.boot_pt` reservation") }; @@ -1213,8 +1345,8 @@ pub extern "C" fn kernel_entry() -> ! { // Audit: UNSAFE-2026-0019. let gic = unsafe { QemuVirtGic::new( - QEMU_VIRT_GIC_DISTRIBUTOR_BASE, - QEMU_VIRT_GIC_CPU_INTERFACE_BASE, + tyrne_hal::phys_to_kernel_va(QEMU_VIRT_GIC_DISTRIBUTOR_BASE), + tyrne_hal::phys_to_kernel_va(QEMU_VIRT_GIC_CPU_INTERFACE_BASE), ) }; // SAFETY: single-core; no concurrent writer to GIC static yet. @@ -1423,8 +1555,11 @@ fn panic(info: &PanicInfo) -> ! { // SAFETY: constructing a fresh Pl011Uart in the panic path is // best-effort diagnostic output. Writes may interleave if the original // instance is still reachable — acceptable per the Console contract - // (ADR-0007). Audit: UNSAFE-2026-0002. - let console = unsafe { Pl011Uart::new(PL011_UART_BASE) }; + // (ADR-0007). The HIGH device-MMIO alias is used because the kernel runs + // in the high half post-migration (ADR-0033); a panic in the brief + // pre-migration low window would not print, but that window is only the + // verified `mmu_bootstrap` / `high_half_activate` path. Audit: UNSAFE-2026-0002. 
+ let console = unsafe { Pl011Uart::new(tyrne_hal::phys_to_kernel_va(PL011_UART_BASE)) }; console.write_bytes(b"\n!! tyrne panic !!\n"); let mut w = FmtWriter(&console); diff --git a/bsp-qemu-virt/src/mmu.rs b/bsp-qemu-virt/src/mmu.rs index 8011a05..0eface1 100644 --- a/bsp-qemu-virt/src/mmu.rs +++ b/bsp-qemu-virt/src/mmu.rs @@ -42,8 +42,8 @@ use tyrne_hal::mmu::vmsav8::{ flags_to_descriptor_bits, page_descriptor, table_descriptor, PAGE_OA_MASK_L3, TABLE_NLA_MASK, }; use tyrne_hal::{ - FrameProvider, MapperFlush, MappingFlags, Mmu, MmuError, PhysAddr, PhysFrame, VirtAddr, - PAGE_SIZE, + phys_to_kernel_va, FrameProvider, MapperFlush, MappingFlags, Mmu, MmuError, PhysAddr, + PhysFrame, VirtAddr, PAGE_SIZE, }; /// Translation-table layout constants for the `VMSAv8` 4 KiB-granule, @@ -427,8 +427,10 @@ unsafe fn walk_and_install_leaf( // Audit: UNSAFE-2026-0025. let l3_table = unsafe { walk_or_alloc_table(l2_table, l2_idx, frames, unmap)? }; - // L3 leaf write or clear. - let l3_ptr = l3_table.as_usize() as *mut u64; + // L3 leaf write or clear. The page-table frame is reached through the + // high-half direct map (ADR-0033 / T-022) — `phys_to_kernel_va(pa)` — not + // identity, since the kernel runs high post-migration. + let l3_ptr = phys_to_kernel_va(l3_table.as_usize()) as *mut u64; // SAFETY: `l3_table` is a 4 KiB frame; `l3_idx < 512`; the offset // stays within the frame. Volatile access prevents the compiler @@ -491,7 +493,9 @@ unsafe fn walk_or_alloc_table( ) -> Result { debug_assert!(idx < ENTRIES_PER_TABLE); - let parent_ptr = parent_table.as_usize() as *mut u64; + // Reached through the high-half direct map (ADR-0033 / T-022), not + // identity — the kernel runs high post-migration. + let parent_ptr = phys_to_kernel_va(parent_table.as_usize()) as *mut u64; // SAFETY: `parent_table` is a 4 KiB frame; `idx < 512`. Audit: // UNSAFE-2026-0025. 
let slot_ptr = unsafe { parent_ptr.add(idx) }; diff --git a/bsp-qemu-virt/src/mmu_bootstrap.rs b/bsp-qemu-virt/src/mmu_bootstrap.rs index 9fa59b0..81fa9a9 100644 --- a/bsp-qemu-virt/src/mmu_bootstrap.rs +++ b/bsp-qemu-virt/src/mmu_bootstrap.rs @@ -33,9 +33,9 @@ use core::arch::asm; use tyrne_hal::mmu::vmsav8::{ block_descriptor, flags_to_descriptor_bits, table_descriptor, MAIR_EL1_VALUE, - SCTLR_EL1_MMU_ENABLE_MASK, TCR_EL1_VALUE, + SCTLR_EL1_MMU_ENABLE_MASK, TCR_EL1_VALUE, TCR_EL1_VALUE_HIGH_HALF, }; -use tyrne_hal::MappingFlags; +use tyrne_hal::{MappingFlags, KERNEL_HIGH_HALF_OFFSET}; // Linker symbols for the four bootstrap page-table frames. The Rust // type `[u64; 512]` mirrors the actual storage shape (one 4 KiB frame @@ -49,6 +49,13 @@ extern "C" { static __boot_pt_l1: [u64; 512]; static __boot_pt_l2_low: [u64; 512]; static __boot_pt_l2_high: [u64; 512]; + /// High-half (`TTBR1_EL1`) root frames (T-022 / ADR-0033). The two L2 + /// tables above are SHARED between the low-identity and high-half + /// regimes — a block descriptor's output address is the PA, identical + /// whether reached via the low or high VA — so only the L0/L1 roots are + /// new. See [`high_half_activate`]. + static __boot_pt_l0_hh: [u64; 512]; + static __boot_pt_l1_hh: [u64; 512]; } /// Entries per 4 KiB translation table. @@ -254,3 +261,106 @@ pub unsafe fn mmu_bootstrap() { ); } } + +/// Build the high-half (`TTBR1_EL1`) translation tables and bring the +/// high-half regime live — while still executing in the low-identity +/// regime. Implements [ADR-0033 §Simulation rows 0–1][adr-0033]. +/// +/// Must run AFTER [`mmu_bootstrap`] (which builds the four low-identity +/// frames + the two L2 tables this routine SHARES, and enables the MMU) +/// and BEFORE the [`crate::kernel_entry`] migration trampoline crosses the +/// PC into the high half. 
On return both regimes are live: the low identity +/// (`TTBR0_EL1`) the kernel is still executing through, and the high-half +/// (`TTBR1_EL1`, `EPD1 = 0`) the trampoline is about to branch into. +/// +/// High-half layout (per [ADR-0033 §"High-half layout"][adr-0033]): every +/// kernel VA is `KERNEL_HIGH_HALF_OFFSET + pa`, so at `T1SZ = 16` they land +/// at `L0[511]` and `L1[508]` (device) / `L1[509]` (RAM). The L2 tables are +/// SHARED with the low identity (a block descriptor's output address is the +/// PA, identical via either VA), so only the L0/L1 roots are new. +/// +/// **v1 simplification (RWX-equivalent; PXN-split deferred to ADR-0034).** +/// Sharing `L2_high` makes the high-half RAM window `PXN = 0` (kernel- +/// executable) for its whole span — so the kernel image *and* the +/// physmap/direct-map deref path (page tables, PMM frames, copy-user) live +/// in one `PXN = 0`, `AP = 0b00` window. This is RWX-equivalent to the +/// identity map it replaces (T-016 mapped all RAM `WRITE | EXECUTE`), which +/// [T-022 §Out of scope](../../docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) +/// explicitly accepts; the [ADR-0033 §High-half-layout][adr-0033] PXN=1 +/// physmap region distinct from the PXN=0 image region is the per-section +/// W^X hardening end-state, deferred to ADR-0034. `EL0` is never granted +/// access (`AP = 0b00`), so this is a privileged-side W^X gap only, not an +/// EL0-reachable one. +/// +/// # Safety +/// +/// - Must be called exactly once per boot, at EL1, after [`mmu_bootstrap`] +/// (the shared L2 tables and the low-identity MMU must already be live) +/// and before the high-half migration trampoline. +/// - The `__boot_pt_l0_hh` / `__boot_pt_l1_hh` frames must be page-aligned +/// and pre-zeroed (the linker places them in `.bss`; `_start`'s BSS-zero +/// loop zeros them) so the unwritten slots read as invalid descriptors. 
+/// +/// Audit: UNSAFE-2026-0022 (high-half page-table frame writes — Amendment) +/// + UNSAFE-2026-0023 (`MSR TTBR1_EL1` / `MSR TCR_EL1` — Amendment). +/// +/// [adr-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md +pub unsafe fn high_half_activate() { + // High-half root frames (new); the L2 tables are shared with the low + // identity. `addr_of!` resolves these `.bss` symbols to their LOW + // physical addresses here (this routine executes low, so the PC-relative + // materialisation yields the load address = PA) — exactly the PAs the + // table/TTBR descriptors must carry. + let l0_hh = core::ptr::addr_of!(__boot_pt_l0_hh) + .cast::<u64>() + .cast_mut(); + let l1_hh = core::ptr::addr_of!(__boot_pt_l1_hh) + .cast::<u64>() + .cast_mut(); + let l2_low_pa = core::ptr::addr_of!(__boot_pt_l2_low) as u64; + let l2_high_pa = core::ptr::addr_of!(__boot_pt_l2_high) as u64; + + // High VAs → page-table indices (computed, not hardcoded, so the + // KERNEL_HIGH_HALF_OFFSET choice is the single source of truth). + let off = KERNEL_HIGH_HALF_OFFSET as u64; + let va_dev = off + 0x0800_0000; // GIC + UART device window base + let va_ram = off + 0x4000_0000; // RAM (kernel image) window base + let l0_idx = ((va_ram >> 39) & 0x1FF) as usize; // 511 + let l1_idx_dev = ((va_dev >> 30) & 0x1FF) as usize; // 508 + let l1_idx_ram = ((va_ram >> 30) & 0x1FF) as usize; // 509 + + // SAFETY: the three frames are page-aligned, exclusively owned for the + // duration of this single-core boot call, and pre-zeroed by the BSS + // loop. Indices are < 512 by the `& 0x1FF` construction. The table + // descriptors point at the shared L2 tables (device / RAM) and the new + // L1_hh, all by PA. Audit: UNSAFE-2026-0022. 
+ unsafe { + // L0_hh[511] → L1_hh + core::ptr::write_volatile(l0_hh.add(l0_idx), table_descriptor(l1_hh as u64)); + // L1_hh[508] → L2_low (device, shared); L1_hh[509] → L2_high (RAM, shared) + core::ptr::write_volatile(l1_hh.add(l1_idx_dev), table_descriptor(l2_low_pa)); + core::ptr::write_volatile(l1_hh.add(l1_idx_ram), table_descriptor(l2_high_pa)); + } + + // SAFETY: publish the descriptor writes to the table walker (`DSB ISH`) + // BEFORE installing TTBR1 and clearing EPD1 — so no walk, once enabled, + // can read a stale/zero descriptor (ADR-0033 §Simulation row 0). Then + // `MSR TTBR1_EL1` (the high root PA) + `ISB`, then `MSR TCR_EL1` with the + // EPD1-cleared constant (every TTBR0-governing field byte-stable, so the + // live low regime is undisturbed — row 1) + `ISB`. After this the high + // half translates, but the PC/SP/VBAR are still low (the migration + // trampoline performs the crossing). `nomem` omitted so the descriptor + // writes above are not reordered past this block. Audit: UNSAFE-2026-0023. + unsafe { + asm!( + "dsb ish", + "msr ttbr1_el1, {ttbr1}", + "isb", + "msr tcr_el1, {tcr}", + "isb", + ttbr1 = in(reg) (l0_hh as u64), + tcr = in(reg) TCR_EL1_VALUE_HIGH_HALF, + options(nostack), + ); + } +} diff --git a/bsp-qemu-virt/src/syscall.rs b/bsp-qemu-virt/src/syscall.rs index aa2ec24..f28349e 100644 --- a/bsp-qemu-virt/src/syscall.rs +++ b/bsp-qemu-virt/src/syscall.rs @@ -85,14 +85,19 @@ pub struct SyscallTrapFrame { // the build before that can ship. (Mirrors the `TrapFrame` 192-byte guard.) const _: () = assert!(core::mem::size_of::<SyscallTrapFrame>() == 272); -/// Length of the syscall copy-from/to-user window in B5: the whole -/// identity-mapped RAM extent the bootstrap address space covers. +/// Length of the syscall copy-from/to-user window in B5: the whole RAM extent, +/// reached through the kernel's high-half direct map (post-T-022 / ADR-0033). 
/// -/// The B5 EL1 kernel-stub runs on the bootstrap AS, which identity-maps the -/// managed extent (per [ADR-0027 §Decision outcome (a)]), so the stub's buffer -/// — a `.rodata`-resident `&[u8]` in the kernel image — is in range. B6's real -/// EL0 task derives a tighter window from its own mapped region (see -/// [`UserAccessWindow`]'s module docs). The subtraction is a `const`, so it +/// The B5 EL1 kernel-stub executes in the high half; its buffer — a +/// `.rodata`-resident `&[u8]` in the kernel image — is reachable at its +/// high-half VA, so the window base is `phys_to_kernel_va(PMM_EXTENT_START)` +/// (see [`syscall_entry`]) and the stub buffer is in range. Because the +/// stub's "user" pointer **is** a valid kernel VA, the dispatcher's direct +/// deref works for the stub; B6's real EL0 task instead lives at a *user* VA +/// in its own `TTBR0_EL1`, so B6 derives a tighter per-task window AND +/// replaces the direct deref with a per-page user-VA→kernel-VA translation +/// (T-021 carry-forward gate #1 — see [`UserAccessWindow`]'s module docs). +/// The subtraction is a `const`, so it /// cannot wrap at runtime: const-eval rejects an underflow at **build time** /// (an inverted extent is a hard compile error, never a release wrap). 
The /// explicit assertion below makes that invariant — and its failure message — @@ -165,7 +170,10 @@ pub unsafe extern "C" fn syscall_entry(frame: *mut SyscallTrapFrame) { queues: (*crate::IPC_QUEUES.0.get()).assume_init_mut(), caller_table: (*crate::SYSCALL_STUB_TABLE.0.get()).assume_init_mut(), console: (*crate::CONSOLE.0.get()).assume_init_ref(), - user_window: UserAccessWindow::new(crate::PMM_EXTENT_START, SYSCALL_USER_WINDOW_LEN), + user_window: UserAccessWindow::new( + tyrne_hal::phys_to_kernel_va(crate::PMM_EXTENT_START), + SYSCALL_USER_WINDOW_LEN, + ), }; dispatch(&mut ctx, args) }; diff --git a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md index ff1e4e9..9fcc1f8 100644 --- a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md +++ b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md @@ -2,7 +2,7 @@ - **Phase:** B - **Milestone:** B6 — First userspace "hello" (this is B6's **gating prerequisite**: making the kernel reachable from every task's active translation so a real EL0 task's `SVC` vector fetch + the EL1 handler translate — the [ADR-0033](../../../decisions/0033-kernel-high-half-migration.md) high-half migration; per [phase-b §B6 opening sequence](../../../roadmap/phases/phase-b.md#b6-opening-sequence--prerequisites)) -- **Status:** In Progress +- **Status:** In Review (implemented + all gates green; security-relevant — awaiting the explicit security review per Definition of done) - **Created:** 2026-05-29 - **Author:** @cemililik (+ Claude Opus 4.8 agent) - **Dependencies:** [ADR-0033](../../../decisions/0033-kernel-high-half-migration.md) — must be `Accepted` before code lands (settles the boot-time high-half §Simulation + the link-high/load-low + `KERNEL_VA_OFFSET` discipline); [ADR-0027](../../../decisions/0027-kernel-virtual-memory-layout.md) (the reserved `TTBR1`/`EPD1` + the byte-stable high-half `TCR` fields this consumes); 
[T-016](T-016-mmu-activation.md) (the `mmu_bootstrap` + `QemuVirtMmu` + `vmsav8` encoders this extends); [T-018](T-018-address-space-kernel-object.md) (the `activate` differ-path that goes live). @@ -23,16 +23,18 @@ The migration switches the running kernel's own PC/SP/`VBAR` translation regime ## Acceptance criteria -- [ ] **Link-high/load-low + the early-symbol contract.** The kernel is linked at `KBASE = 0xFFFF_FFFF_8008_0000` (LMA low via linker `AT`); a low-linked, position-independent `.idmap`-style early section holds `boot.s`, the high-half table builder, and the migration trampoline so they resolve `VA == PA` while the MMU is off / identity-only. Specifically: - - [ ] **Pre-jump symbols resolve LOW.** `boot.s`'s pre-jump references — `__stack_top` (SP setup), `__bss_start`/`__bss_end` (BSS-zero), the `.idmap` code + its literal pool — must resolve to **low** addresses; the early `adrp`/`:lo12:` sites at [`boot.s`](../../../../bsp-qemu-virt/src/boot.s) currently assume `VA == PA`, and under a high link they would compute high VAs with the MMU off → brick. `kernel_entry` is the **high** `BR` target, referenced only at/after the row-2 crossing (when the high regime is live). - - [ ] **Relocation / linker-map gate.** A build-time check (linker-map inspection or a relocation assertion) verifies **no pre-jump instruction references a high VMA** — the "pre-jump high-VMA relocation = build failure" gate. (Closes the early-`adrp`-computes-high brick ADR-0033 §Consequences names.) 
-- [ ] **Two distinct PA↔VA offsets** ([ADR-0033 §Dependency chain step 2](../../../decisions/0033-kernel-high-half-migration.md#dependency-chain)) — *not* one helper: the **image-link offset** (`KBASE − KERNEL_IMAGE_PHYS_BASE`) for linker-symbol→PA sites (`mmu_bootstrap` `TTBR`/page-table programming, the `__boot_pt_l0` re-read in `kernel_entry`), and the **physmap offset** (`KERNEL_PHYSMAP_BASE`) for PA-frame deref sites ([`crate::mm::phys_frame_kernel_ptr`](../../../../kernel/src/mm/mod.rs)'s body, PMM zero-fill, copy-user). Each offset host-tested; using the wrong offset at a site is a correctness bug the tests must catch. -- [ ] **High-half `TTBR1` tables** built per [ADR-0033 §"High-half layout"](../../../decisions/0033-kernel-high-half-migration.md): kernel image (`PXN = 0`/`UXN = 1`), kernel physmap/direct-map (`PXN = 1`), device MMIO — with the vector table + all handler/branch targets inside the `PXN = 0` image window. `vmsav8` high-half encoders host-tested. -- [ ] **EPD1-cleared `TCR_EL1` constant** in [`tyrne_hal::mmu::vmsav8`](../../../../hal/src/mmu/vmsav8.rs): bit 23 = 0, every `TTBR0`-governing field byte-identical to `TCR_EL1_VALUE`; host-tested. -- [ ] **The boot-time migration** runs the ADR-0033 §Simulation rows 0–3: build `TTBR1` (`ISB` after the `TTBR1` write, `DSB ISH` for the table memory) → `EPD1` `1→0` + `ISB` → trampoline (`VBAR`-high + `ISB`, `SP`-high, `LDR`/`BR` to the `PXN = 0` high continuation, `DAIF` masked) → `TTBR0`-null + `EPD0 = 1` + `ISB` + `TLBI VMALLE1` + `DSB ISH` + `ISB`. A new `tyrne: high-half active` boot marker prints after the jump. -- [ ] **Per-task `TTBR0_EL1` swap goes live:** [`QemuVirtMmu::activate`](../../../../bsp-qemu-virt/src/mmu.rs) drives the real per-task swap; the [T-018](T-018-address-space-kernel-object.md) `activate` differ-path that short-circuits in v1 now fires. 
**v1 keeps `ASID = 0` global + `activate`'s existing `TLBI`-on-swap for correctness — NO per-task ASID allocator** (the `AddressSpace::asid` field + reuse/generation/exhaustion policy are a TLB-flush-avoidance optimisation deferred to a future task per [ADR-0033 §"ASID policy"](../../../decisions/0033-kernel-high-half-migration.md)). Host test pins the differ path with distinct roots. -- [ ] **Audit:** new entries (trampoline asm; per-task `TTBR0` swap; the physmap-offset frame deref) + Amendments to UNSAFE-2026-0022 / 0023 / 0024; the 0023/0024 `Pending QEMU smoke verification` riders lifted. -- [ ] **All gates green** incl. `cargo +nightly miri test --workspace --exclude tyrne-bsp-qemu-virt`. **QEMU smoke:** full demo to `tyrne: all tasks complete` with the new `tyrne: high-half active` line; `-d int,unimp,guest_errors` shows **zero new Translation/Permission fault classes** (the migration is fault-clean) — the row-4 abort gate. +> **Delivered 2026-05-30** (branch `t-022-high-half-kernel-mapping`). Two sound **approach refinements** vs the original plan, both verified by the multi-lens adversarial pass (see §Review history) and recorded honestly here: +> - **Whole-image-high-linked + forced-low ELF entry, instead of a separate low-VMA `.idmap` section.** The whole image is linked at `KBASE` with LMA low (`AT(ADDR(.sec) − KERNEL_HH_OFFSET)`); the ELF entry is forced to `_start`'s LOW physical address via `_start_phys = _start − KERNEL_HH_OFFSET`. Because the image is high-linked *uniformly*, early/low code resolves to LOW (load) addresses through PC-relative `adrp`/`adr` (the offset cancels between any two in-image symbols) — so no separate identity-VMA section is needed, and the early-`adrp`-computes-high brick the ADR feared cannot occur (verified: `rust-readobj` shows entry `0x40080000`, VMA `0xFFFFFFFF4008…`, LMA `0x4008…`, zero ABS relocations in `.text`). 
This is **simpler and lower-risk** than threading Rust through a `.idmap` section (no inlining/GOT fragility); the ADR's Option-2 fallback was not needed. +> - **One linear high-half offset (`KERNEL_HIGH_HALF_OFFSET = 0xFFFF_FFFF_0000_0000`), instead of two distinct PA↔VA offsets.** With `KBASE = OFFSET + KERNEL_IMAGE_PHYS_BASE`, the image-link offset and the physmap offset **coincide numerically**, so "using the wrong offset at a site" is impossible by construction. `phys_to_kernel_va` / `kernel_va_to_phys` are the single helpers; on host test builds the offset is `0` (identity) so host tests are unaffected. + +- [x] **Link-high/load-low + forced-low entry.** Linker at `KBASE = 0xFFFF_FFFF_4008_0000`, LMA low, `ENTRY(_start_phys)`; the early/low path resolves PC-relative to load addresses. A compile-time `assert!(KERNEL_HIGH_HALF_OFFSET == 0xFFFF_FFFF_0000_0000)` in `main.rs` pins the linker literal to the HAL constant. *(The relocation gate is satisfied empirically — zero ABS64 relocations in `.text`; the absolute refs that exist, vtables/fn-pointers, live in `.data`/`.rodata` and resolve HIGH, correct once running high.)* +- [x] **Single high-half offset + helpers, host-tested.** `tyrne_hal::{KERNEL_HIGH_HALF_OFFSET, phys_to_kernel_va, kernel_va_to_phys}`; `cfg(target_arch = "aarch64")` real / else identity-0. Every `addr_of!`-as-PA and PA-deref site rebased (`mmu_bootstrap`, `kernel_main_high` `__stack_top`/`__boot_pt_l0`, `phys_frame_kernel_ptr`, `mmu.rs` walk, `pmm.rs` zero-fill, `task_loader` overlap preflight, syscall window). +- [x] **High-half `TTBR1` tables** built by `mmu_bootstrap::high_half_activate` (`L0_hh[511]` → `L1_hh[508→device, 509→RAM]`; the two L2 tables SHARED with the low identity). Vectors + all branch targets land in the `PXN = 0` RAM/image window. 
**v1 maps the whole RAM window `PXN = 0` (RWX-equivalent, like the identity map it replaces); the ADR's distinct `PXN = 1` physmap region is per-section W^X hardening deferred to ADR-0034** (this task §Out of scope; `AP = 0b00` keeps EL0 with no access, so the gap is privileged-side only). +- [x] **EPD1-cleared `TCR_EL1` constant** `TCR_EL1_VALUE_HIGH_HALF` in `vmsav8`; host test `tcr_high_half_clears_only_epd1` pins the single-bit (bit 23) delta. +- [x] **The boot-time migration** runs ADR-0033 §Simulation rows 0–3: `high_half_activate` (`DSB ISH` → `MSR TTBR1` → `ISB` → `MSR TCR(EPD1=0)` → `ISB`) → trampoline (`MSR VBAR`-high + `ISB` → `add sp,sp,off` → `br`, `options(noreturn)`) → `kernel_main_high` (`MSR TTBR0,xzr` + `ISB` + `EPD0`-set + `ISB` + `TLBI VMALLE1` + `DSB ISH` + `ISB`). The new `tyrne: high-half active` marker prints after the `br`. +- [x] **Per-task `TTBR0_EL1` swap correct + freed.** `kernel_main_high` frees `TTBR0` (null + `EPD0 = 1`); `<QemuVirtMmu as Mmu>::activate` writes a per-task root PA into the now-free `TTBR0`. v1 keeps `ASID = 0` global + `activate`'s existing `TLBI`-on-swap (no per-task ASID allocator — deferred per ADR-0033 §"ASID policy"). The scheduler differ-path host test `yield_now_activates_when_tasks_differ_in_address_space` pins distinct-root firing; the *runtime* distinct-AS swap is B6 (v1's demo keeps every task on the bootstrap AS). +- [x] **Audit:** new **UNSAFE-2026-0031** (migration trampoline + `TTBR0`-free) + Amendments to UNSAFE-2026-0022 / 0023 / 0024 (high-half table writes / MSR sequence / post-migration `TLBI`) + 0025 / 0026 / 0027 / 0030 (physmap-rebase derefs). +- [x] **All gates green:** 340 host tests; host + kernel clippy `-D warnings`; `cargo fmt --check`; release build (entry `0x40080000`); `cargo +nightly miri test --workspace --exclude tyrne-bsp-qemu-virt` (Stacked Borrows). 
**QEMU smoke:** full demo to `tyrne: all tasks complete` with the new `tyrne: high-half active` line; `-d int,unimp` shows exactly the 2 expected `SVC` exceptions and **zero** Translation/Permission/Abort fault classes (fault-clean — row-4 abort gate). *(The pre-existing release-only `console_write` status `0x1` quirk reproduces on the parent commit `bd39679` and is **not** a T-022 regression — flagged for a separate B5 follow-up.)* ## Out of scope @@ -56,3 +58,10 @@ All acceptance criteria checked; gates green (incl. Miri); audit-log entries + A - **Pass 1** (against a *mid-kernel* migration framing) caught an **architecturally-impossible** trampoline — "a page mapped identically in both regimes (`VA == PA`) reachable while translation flips," impossible because with `T0SZ = T1SZ = 16` the `TTBR0` (low, `VA[55]=0`) and `TTBR1` (high, `VA[55]=1`) input ranges are disjoint, so no VA is served by both. Corrected mechanism: the PC physically crosses low→high at the `br`, both `TTBR`s live, the low `.idmap` as the source regime. The pass also surfaced the live-kernel hazards (`DAIF` unmasked, surviving low-VA `StaticCell` pointers, live IRQ) that motivated the **boot-time** framing. - **Pass 2** (against the corrected *boot-time* §Simulation) confirmed the crossing is sound *in principle* (disjoint regimes; the `vmsav8` encoders set `AF = 1` + `PXN`-per-flags) and pinned the preconditions now in the ADR: the `.idmap` link-split + the two PA↔VA offsets (which do not exist yet — hence this task), the `PXN = 0`-image-window `br` target, the `DSB`-before-`MSR` / `ISB` ordering, the `EPD1`-cleared `TCR` constant, and "the migration must complete before any `DAIF` unmask." 
- **2026-05-30 — maintainer careful-re-read (pre-Accept) on ADR-0033 + T-022.** Findings folded into the Proposed draft before Accept: v1 **ASID policy** pinned (`ASID = 0` + flush-on-swap; per-task allocator deferred — H1); the **two distinct PA↔VA offsets** (image-link vs physmap) separated (H2); this task's **link-high early-symbol contract** + relocation/linker-map gate added (H3); the ADR's high-half layout table gained explicit **`AP = 0b00` / `SH`** columns (kernel-not-leakable-to-EL0 made concrete — M); the row-0 **`DSB`-before-`MSR`** barrier order corrected (M); the ADR §Dependency chain pruned of downstream-consumer steps per ADR-0025 §Rule 1 (M); a VA-layout Mermaid + the B5→B6 milestone-shift note + the ADR-0034 link/tone fixes (L). The §Simulation row-to-verification mapping is filled when this task moves to `In Progress`. +- **2026-05-30 — implemented + verified (→ In Review).** Landed on branch `t-022-high-half-kernel-mapping` after [ADR-0033 was Accepted](../../../decisions/0033-kernel-high-half-migration.md) (separate commit `db892c1`). Built up in QEMU-verified stages (link → migration → high-half Rust → physmap rebase), never one untested leap. Two sound approach refinements vs the plan (see §Acceptance criteria): whole-image-high-linked + forced-low entry (no separate `.idmap` section — the ADR's Option-2 toolchain fallback was **not** needed) and one linear high-half offset (the image-link/physmap offsets coincide). **§Simulation row-to-verification mapping:** + - **Row 0** (build `TTBR1`) → `vmsav8` host tests + `high_half_activate`'s computed `L0[511]`/`L1[508/509]` indices + shared-L2 reuse; UNSAFE-2026-0022 Amendment. **Verified:** the `br` reached a mapped PXN=0 window (else `tyrne: high-half active` would not print). + - **Row 1** (`EPD1` 1→0) → host test `tcr_high_half_clears_only_epd1` (single-bit delta) + UNSAFE-2026-0023 Amendment. **Verified:** the kernel runs entirely from `TTBR1` post-`br`. 
+ - **Row 2** (the crossing) → **UNSAFE-2026-0031** (trampoline asm) + the `tyrne: high-half active` marker. **Verified:** marker prints; `-d int,unimp` shows zero faults at the crossing. + - **Row 3** (`TTBR0`-null/`EPD0`/`TLBI`) → UNSAFE-2026-0023/0024 Amendments + UNSAFE-2026-0031 + the scheduler differ-path host test. **Verified:** demo runs to completion with `TTBR0` freed. + - **Row 4** (abort gate) → the QEMU smoke + `-d int,unimp` showing exactly **2 [SVC]** and **zero** Translation/Permission/Abort fault classes. **Verified.** +- **2026-05-30 — third adversarial verification pass** (multi-lens workflow: crossing/barriers, link/relocation, tables/offsets, physmap/cfg/window). Verdict: barrier sequences, the trampoline (`SP`-rebase under `options(noreturn)`, PXN=0 target, both-regimes-live), relocation discipline, index/offset arithmetic, and the host/target cfg split all **sound — no brick**. Four findings, all addressed before commit: (1) **MEDIUM** physmap shares the PXN=0 image window (W^X gap) — *documented* as the accepted v1 RWX-equivalent simplification (per §Out of scope), PXN-split deferred to ADR-0034, `AP=0b00` keeps it EL0-unreachable; (2) **MEDIUM** the `load_image` image/frame overlap preflight was defanged by the PA/VA conflation (`image.as_ptr()` is now a high VA) — **fixed**: the site rebases via `kernel_va_to_phys` before the PMM PA-extent comparison, re-arming the defensive guard (host-identity on the test harness); (3) **LOW** UNSAFE-2026-0031 missing from the audit log — **fixed** (the agents read the tree before the entry was written); (4) **NIT** `boot.md` not updated — **fixed** in the doc sweep. Separately, the pre-existing release-only `console_write` status `0x1` quirk was confirmed to reproduce on the parent commit `bd39679` (same-host control) — **not a T-022 regression**, flagged for a B5 follow-up. 
diff --git a/docs/architecture/boot.md b/docs/architecture/boot.md index e43f618..72be803 100644 --- a/docs/architecture/boot.md +++ b/docs/architecture/boot.md @@ -14,9 +14,19 @@ The four boot stages, each with a tightly bounded responsibility: 1. **Firmware / loader.** QEMU's `-kernel` flag loads the ELF image at its linked-in load address (`0x40080000` per [ADR-0012](../decisions/0012-boot-flow-qemu-virt.md)), sets the PC to the ELF's entry point (`_start`), and enters at EL1 (default QEMU `virt`) or EL2 (`-machine virtualization=on`, or most real-hardware boot stacks delivering at EL2). The device-tree blob address is placed in `x0`; v1 ignores it. 2. **Assembly stub (`_start`).** Three phases: first, K3-12 (interrupts masked via `MSR DAIFSet, #0xf`) executes at the very head of the reset vector so a spurious interrupt cannot escape into an uninstalled vector table. Second, the EL drop (per [ADR-0024](../decisions/0024-el-drop-policy.md)) reads `CurrentEL`; on EL2 it configures `HCR_EL2` / `SPSR_EL2` / `ELR_EL2` and `eret`s to a post-drop label, on EL1 it falls through, on EL3 (or any unexpected EL) it halts in a named-label `wfe`-loop (`halt_unsupported_el: wfe ; b halt_unsupported_el`) — there is no Rust panic infrastructure pre-`kernel_entry`. Third, the conventional setup: load `__stack_top` into `SP`, enable FP/SIMD via `CPACR_EL1`, zero the BSS range (`__bss_start` .. `__bss_end`) using 8-byte stores, and branch to `kernel_entry`. If `kernel_entry` ever returns (it shouldn't), the stub falls into a defensive `wfe ; b 2b` halt loop. After phase two, every later instruction runs at EL1 — the precondition T-009's `UNSAFE-2026-0016` runtime check now relies on as a load-bearing invariant rather than a defensive guard. -3. **`kernel_entry` (Rust, in the BSP).** The first Rust code to run. 
Constructs the BSP's concrete HAL instances (for Phase 4c: the `Pl011Uart` console), installs the EL1 vector table (T-012), captures the boot-to-end timestamp, **activates the MMU** via `mmu_bootstrap` (T-016 / ADR-0027 — this lands the v1 identity layout in `TTBR0_EL1` and flips `SCTLR_EL1.{M,I,C} = 1`; every subsequent MMIO access goes through device-nGnRnE attributes), **initialises the Physical Memory Manager** (T-017 / ADR-0035 — bitmap allocator over the 128 MiB RAM extent with two reserved ranges covering the QEMU firmware region and the kernel image / `.bss` / `.boot_pt` / boot stack), **initialises the address-space arena** (T-018 / ADR-0028 — wraps the already-active L0 root frame as `AddressSpaceArena` slot 0 + mints the bootstrap AS authority cap; no `Mmu::create_address_space` call on the live root per ADR-0028 §Simulation row 0), **loads the embedded userspace placeholder image** via [`task_loader::load_image`](task-loader.md) (T-019 / ADR-0029 — produces a `LoadedImage` describing a freshly populated AS for the embedded `mov w0, #42; ret` blob; **does NOT execute** — runnability gates on B5/B6 per phase-b §B4 §Revision-notes; first runtime exerciser of [UNSAFE-2026-0025](../audits/unsafe-log.md) post-bootstrap `Mmu::map`, [UNSAFE-2026-0026](../audits/unsafe-log.md) `Pmm::alloc_frame` zero-fill, and [UNSAFE-2026-0027](../audits/unsafe-log.md) loader byte-copy), initialises the GIC, unmasks `DAIF.I`, prints the timer banner, then sets up the kernel-object arenas + capability tables + IPC + scheduler before transferring control. Marked `#[no_mangle] extern "C"` so the assembly stub can find it. +3. **`kernel_entry` (Rust, in the BSP).** The first Rust code to run. 
Constructs the BSP's concrete HAL instances (for Phase 4c: the `Pl011Uart` console), installs the EL1 vector table (T-012), captures the boot-to-end timestamp, **activates the MMU** via `mmu_bootstrap` (T-016 / ADR-0027 — this lands the v1 identity layout in `TTBR0_EL1` and flips `SCTLR_EL1.{M,I,C} = 1`; every subsequent MMIO access goes through device-nGnRnE attributes), **initialises the Physical Memory Manager** (T-017 / ADR-0035 — bitmap allocator over the 128 MiB RAM extent with two reserved ranges covering the QEMU firmware region and the kernel image / `.bss` / `.boot_pt` / boot stack), **initialises the address-space arena** (T-018 / ADR-0028 — wraps the already-active L0 root frame as `AddressSpaceArena` slot 0 + mints the bootstrap AS authority cap; no `Mmu::create_address_space` call on the live root per ADR-0028 §Simulation row 0), **loads the embedded userspace placeholder image** via [`task_loader::load_image`](task-loader.md) (T-019 / ADR-0029 — produces a `LoadedImage` describing a freshly populated AS for the embedded `mov w0, #42; ret` blob; **does NOT execute** — runnability gates on B5/B6 per phase-b §B4 §Revision-notes; first runtime exerciser of [UNSAFE-2026-0025](../audits/unsafe-log.md) post-bootstrap `Mmu::map`, [UNSAFE-2026-0026](../audits/unsafe-log.md) `Pmm::alloc_frame` zero-fill, and [UNSAFE-2026-0027](../audits/unsafe-log.md) loader byte-copy), initialises the GIC, unmasks `DAIF.I`, prints the timer banner, then sets up the kernel-object arenas + capability tables + IPC + scheduler before transferring control. Marked `#[no_mangle] extern "C"` so the assembly stub can find it. **(T-022 / ADR-0033 high-half migration — see §"High-half migration" below: `kernel_entry` now runs at the LOW physical alias with the MMU off and, after `mmu_bootstrap` + `high_half_activate` build the high-half `TTBR1_EL1` tables, branches the running kernel into the high half via the migration trampoline. 
The bring-up steps listed here — PMM, address-space arena, loader, GIC, scheduler — run in `kernel_main_high` at high-half addresses, with `TTBR0_EL1` freed for per-task userspace.)** 4. **Scheduler start (`start`).** The final call in `kernel_entry` is `start(SCHED.as_mut_ptr(), cpu, activate_address_space)`, which hands control to the cooperative FIFO scheduler and never returns; the scheduler runs the first ready task and drives the cooperative IPC demo until the system halts (see [scheduler.md](scheduler.md)). An early design intended a portable `tyrne_kernel::run` that a BSP would delegate to; the B-phase brought subsystem bring-up into `kernel_entry` instead, and `start` (defined in `kernel/src/sched/mod.rs`) is the actual handoff point. Consolidating the bring-up back into a portable kernel entry is a possible future refactor. +### High-half migration (T-022 / ADR-0033) + +Since [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) the kernel runs in the high half (`TTBR1_EL1`) so `TTBR0_EL1` is free for per-task userspace — the [ADR-0033](../decisions/0033-kernel-high-half-migration.md) prerequisite that unblocks a real EL0 task's `SVC` vector fetch (B6). The kernel image is **linked high** (`KBASE = 0xFFFF_FFFF_4008_0000`) but **loaded low** (`0x4008_0000`); the ELF entry is forced to `_start`'s low physical address (`_start_phys`, [`linker.ld`](../../bsp-qemu-virt/linker.ld)) because the MMU is off at reset. A single linear high-half offset `KERNEL_HIGH_HALF_OFFSET = 0xFFFF_FFFF_0000_0000` maps physical memory: `kernel_VA = OFFSET + PA` ([`tyrne_hal::phys_to_kernel_va`](../../hal/src/mmu/mod.rs)). The boot-time transition (ADR-0033 §Simulation): + +1. **`kernel_entry` (LOW).** Runs at the low physical alias with the MMU off. 
Because the whole image is high-linked *uniformly*, PC-relative `adrp`/`adr` references resolve to LOW (load) addresses at runtime (the offset cancels between in-image symbols), so no separate identity-VMA section is needed. It enables the low-identity MMU (`mmu_bootstrap`), then builds the high-half `TTBR1` tables and clears `EPD1` (`mmu_bootstrap::high_half_activate`: `DSB ISH` → `MSR TTBR1_EL1` → `ISB` → `MSR TCR_EL1` with `EPD1 = 0` → `ISB`). Both regimes are now live; the kernel still executes low. +2. **Migration trampoline (the crossing).** A small inline-asm block: `MSR VBAR_EL1` (loaded with the high-half vector base) + `ISB` (high vectors live before the branch) → `add sp, sp, OFFSET` (rebase `SP` to the high stack alias) → `br` to `high_entry` (the high-half image alias of `kernel_main_high`), `options(noreturn)`. The PC physically crosses low→high at the `br`; `DAIF` is masked and no `StaticCell` holds a low VA, so the crossing cannot brick. +3. **`kernel_main_high` (HIGH).** Frees `TTBR0_EL1` (`MSR TTBR0_EL1, xzr` + set `EPD0 = 1` + `TLBI VMALLE1` + `DSB ISH` + `ISB`), prints the new **`tyrne: high-half active`** boot marker, then constructs the console + GIC at their high device-MMIO aliases and runs the rest of the bring-up (§Stage 3) at high-half addresses. Function pointers / vtables (absolute, HIGH) are all taken here, so they stay reachable once `TTBR0` is freed. + +v1 maps the whole high-half RAM window `PXN = 0` (RWX-equivalent, like the identity map it replaces; `AP = 0b00` keeps EL0 with no access); the ADR-0033 layout's distinct `PXN = 1` physmap region is per-section W^X hardening deferred to ADR-0034. The migration is **fault-clean** (`-d int,unimp`: exactly the 2 syscall-smoke `SVC` exceptions, zero new Translation/Permission faults). Audit: [UNSAFE-2026-0031](../audits/unsafe-log.md) + Amendments to 0022/0023/0024. + ### Boot-time sequence ```mermaid @@ -190,7 +200,7 @@ Properties the boot flow maintains.
These are the claims a reader can rely on an - **EL3 → EL2 → EL1 chain.** v1 hardware targets do not boot at EL3; if a future BSP requires it, a follow-up task adds the EL3→EL2 transition on top of the existing EL2→EL1 logic per ADR-0024 §Open questions. - **DTB parsing and `BootInfo`.** The kernel's typed boot-info contract, probably introduced with Pi 4 support. - **Multi-core start.** PSCI `CPU_ON` for secondary cores. -- **High-half kernel migration.** v1 maps the kernel identity-only via `TTBR0_EL1`; the future ADR-0033 placeholder (per [ADR-0027 §Decision outcome (a)](../decisions/0027-kernel-virtual-memory-layout.md)) introduces the high-half mapping when B6 surfaces the per-task `TTBR0_EL1` swap (B5 closed without it). +- ~~**High-half kernel migration.**~~ **Resolved (T-022 / ADR-0033, 2026-05-30)** — the kernel now runs in `TTBR1_EL1` and `TTBR0_EL1` is freed for per-task userspace (see §"High-half migration" above). v1 keeps the whole high-half RAM window `PXN = 0` (RWX-equivalent); per-section W^X hardening (a distinct `PXN = 1` physmap) is deferred to **ADR-0034**. - **Guard-page stacks.** With the MMU now active (T-016), guard-page stacks become reachable — pending a follow-on B-phase task that remaps a stack region's bottom page as invalid. - **Measured boot / attestation.** Hardware-dependent; deferred per [ADR-0012](../decisions/0012-boot-flow-qemu-virt.md). 
diff --git a/docs/architecture/memory-management.md b/docs/architecture/memory-management.md index 4068326..6918911 100644 --- a/docs/architecture/memory-management.md +++ b/docs/architecture/memory-management.md @@ -79,7 +79,7 @@ v1's `TCR_EL1` value commits to the layout shape: | `IPS` | bits 34:32 | 0b010 | 40-bit Intermediate Physical Address — matches QEMU virt + Cortex-A72 | | `AS` | bit 36 | 0 | 8-bit ASID field; v1 uses ASID=0 globally | -The ADR-0033 placeholder (the future high-half ADR — slot reserved in [ADR-0027 §Decision outcome (a)](../decisions/0027-kernel-virtual-memory-layout.md), not yet a real ADR file) flips `EPD1=1 → 0` and populates `TTBR1_EL1` when B6 needs per-task `TTBR0_EL1` swap (B5 closed without it); the rest of `TCR_EL1` stays byte-stable across that transition because the v1 settings already commit to the high-half-friendly shape. +**[ADR-0033](../decisions/0033-kernel-high-half-migration.md) (Accepted 2026-05-30; implemented by [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md))** flips `EPD1 = 1 → 0` and populates `TTBR1_EL1` at boot — the kernel **migrates to the high half** so `TTBR0_EL1` is freed for per-task userspace (B6's gating prerequisite). The rest of `TCR_EL1` stays byte-stable across the transition because the v1 settings already commit to the high-half-friendly shape (`EPD1` is the single bit that changes — the host-tested `TCR_EL1_VALUE_HIGH_HALF`). **The identity layout described in this section is now the boot-time *bootstrap* phase** `mmu_bootstrap` establishes before `high_half_activate` + the migration trampoline move the running kernel to `TTBR1_EL1`; see [`boot.md` §"High-half migration"](boot.md#high-half-migration-t-022--adr-0033) for the transition and the single linear `KERNEL_HIGH_HALF_OFFSET = 0xFFFF_FFFF_0000_0000` direct map. ### Page-table entry encoding (block descriptor at L2) @@ -261,7 +261,7 @@ Until then, kernel-mode faults are a "kernel programming error" (panic-class). 
T - [ADR-0012 — Boot flow and memory layout for `bsp-qemu-virt`](../decisions/0012-boot-flow-qemu-virt.md) — the static image layout this doc inherits. - [ADR-0024 — EL drop to EL1 policy](../decisions/0024-el-drop-policy.md) — kernel runs at EL1 when the MMU activates. - [ADR-0027 — Kernel virtual memory layout (B2 — identity-mapped MMU activation)](../decisions/0027-kernel-virtual-memory-layout.md) — the load-bearing decision document for this chapter. -- ADR-0033 (named-but-unallocated placeholder slot) — Kernel high-half migration; opens when B5 surfaces the per-task `TTBR0_EL1` swap requirement. Slot is reserved in [ADR-0027 §Decision outcome (a)](../decisions/0027-kernel-virtual-memory-layout.md) and the [phase-b ADR ledger](../roadmap/phases/phase-b.md); no ADR file exists today. +- [ADR-0033](../decisions/0033-kernel-high-half-migration.md) (**Accepted 2026-05-30**) — Kernel high-half migration; implemented by [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md). The kernel now runs in `TTBR1_EL1` (boot-time migration) and `TTBR0_EL1` is freed for the per-task swap (B6). Consumes the `TTBR1`/`EPD1` reservation + byte-stable high-half `TCR` fields ADR-0027 pre-committed; **no supersede**. - [`bsp-qemu-virt/src/mmu.rs`](../../bsp-qemu-virt/src/mmu.rs) — `QemuVirtMmu` impl (lands with T-016). - [`bsp-qemu-virt/src/mmu_bootstrap.rs`](../../bsp-qemu-virt/src/mmu_bootstrap.rs) — boot-time activation routine (lands with T-016). - [`bsp-qemu-virt/linker.ld`](../../bsp-qemu-virt/linker.ld) — `.boot_pt` reservation + `__boot_pt_*` linker symbols (extended by T-016). 
diff --git a/docs/audits/unsafe-log.md b/docs/audits/unsafe-log.md index f5f70b5..42abfe0 100644 --- a/docs/audits/unsafe-log.md +++ b/docs/audits/unsafe-log.md @@ -460,6 +460,8 @@ Both forms are time-stamped so a reader can reconstruct the entry's state at any **Amendment (2026-05-09, T-016 Stage 6 smoke verification): full pass for the bootstrap-site path.** Smoke trace at HEAD `6494ed2` + uncommitted T-016 working tree emits `tyrne: hello from kernel_main` → `tyrne: mmu activated` → `tyrne: timer ready (62500000 Hz, resolution 16 ns)` → `tyrne: starting cooperative scheduler` → IPC demo lines → `tyrne: all tasks complete` → `tyrne: boot-to-end elapsed = X ns`. The `tyrne: mmu activated` line printing proves Step 1's descriptor writes landed in the right slots with the right encoding (otherwise the next instruction-fetch after `SCTLR_EL1.M = 1` would have translation-faulted into the EL1 synchronous-exception vector instead of returning to `kernel_entry`); the kernel continuing past `mmu activated` through the IPC demo proves the kernel-image PA `0x4008_0000+` is correctly identity-covered by `L2_high[0..64]` and that subsequent UART writes go through the device-nGnRnE mapping at `L2_low[64..72]`. `-d int,unimp,guest_errors` shows only the pre-existing "PL011 data written to disabled UART" warning (358 instances on `main` at the same HEAD, 379 on the T-016 working tree — the +21 delta is one-byte-per-warning for the new `tyrne: mmu activated\n` line). No new fault classes; no `Translation Fault` / `Permission Fault` / `Unallocated Instruction` events. Status remains **Active**; the v1 evidence base for the bootstrap-site path is now smoke-verified rather than reasoning-only. 
+ **Amendment (2026-05-30, T-022 / ADR-0033): scope extension to `high_half_activate`'s high-half table writes.** [`mmu_bootstrap::high_half_activate`](../../bsp-qemu-virt/src/mmu_bootstrap.rs) writes three table descriptors into the two new high-half root frames — `__boot_pt_l0_hh[511]` → `__boot_pt_l1_hh`; `__boot_pt_l1_hh[508]` → `__boot_pt_l2_low`; `__boot_pt_l1_hh[509]` → `__boot_pt_l2_high` — via the same `write_volatile` on `addr_of!`-derived `*mut u64`, with the identical safety argument as Step 1 (pre-zeroed `.bss` frames, 4 KiB alignment, in-range computed indices `(va >> shift) & 0x1FF`, single-caller/single-core with the MMU's `TTBR1` walks still disabled until the routine's own `EPD1` clear, host-tested `table_descriptor` encoder). The two L2 tables are **shared** with the low identity (a block descriptor's output address is the PA, identical via the low or high VA), so only the L0/L1 roots are newly written. The routine runs LOW (called from `kernel_entry` before the migration), so `addr_of!` yields the frames' PAs directly. Smoke-verified by the 2026-05-30 `tyrne: high-half active` trace (the `br` into the high half could not have reached a mapped PXN=0 window otherwise). Covered jointly with UNSAFE-2026-0031. + ### UNSAFE-2026-0023 — `MSR TTBR0_EL1` write in `QemuVirtMmu::activate` - **Introduced:** 2026-05-08, [T-016 — MMU activation](../analysis/tasks/phase-b/T-016-mmu-activation.md) Stage 3 (`bsp-qemu-virt/src/mmu.rs` skeleton). Lights up the address-space-activation half of the [`Mmu`](../../hal/src/mmu/mod.rs) trait per [ADR-0027 §Decision outcome (a)](../decisions/0027-kernel-virtual-memory-layout.md). 
@@ -488,6 +490,8 @@ Both forms are time-stamped so a reader can reconstruct the entry's state at any **Amendment (2026-05-09, T-016 Stage 6 smoke verification): bootstrap-site full pass; `activate` post-bootstrap path remains unverified.** The smoke trace (see UNSAFE-2026-0022's 2026-05-09 Amendment for the full evidence) reaches `tyrne: mmu activated` and continues through `tyrne: all tasks complete`, which proves the bootstrap-site MAIR / TCR / TTBR0 / TTBR1 / SCTLR write sequence is correctly ordered and produces a valid translation regime (otherwise the next instruction-fetch after `SCTLR_EL1.M = 1` would have faulted). The `QemuVirtMmu::activate` per-call site (this entry's *original* scope, distinct from the bootstrap-Amendment scope) is **still unverified** at runtime because v1 has no `Mmu::activate` caller post-bootstrap; the first caller is a B3+ address-space-switching task whose smoke will lift this gap separately. No new fault classes; only the pre-existing PL011 warning (`+21` per the new mmu-activated UART line) appears in `-d guest_errors`. + **Amendment (2026-05-30, T-022 / ADR-0033): scope extension to the high-half migration's system-register writes.** Three migration sites share this entry's EL1-sysreg-write argument (privileged at EL1, single-core, single-caller per boot, architected `ISB` ordering, `options(nostack[, nomem])` correct): (1) `high_half_activate`'s `DSB ISH` → `MSR TTBR1_EL1` (loaded with the high-half L0 root) → `ISB` → `MSR TCR_EL1, TCR_EL1_VALUE_HIGH_HALF` (EPD1 `1→0`) → `ISB` — the `DSB` precedes the `MSR` so no walk reads a stale descriptor, and the EPD1-cleared TCR is byte-identical to the live `TCR_EL1_VALUE` except bit 23 (host test `tcr_high_half_clears_only_epd1`); (2) the trampoline's `MSR VBAR_EL1` (loaded with the high vector base; also cross-referenced by UNSAFE-2026-0031); (3) `kernel_main_high`'s `MSR TTBR0_EL1, xzr` + the `EPD0`-set read-modify-write of `TCR_EL1`. Smoke-verified — the kernel runs entirely from `TTBR1` through `tyrne: all tasks complete`.
The original-scope `Mmu::activate` `MSR TTBR0_EL1` write now has a *freed* `TTBR0` to swap a per-task root into; the distinct-AS runtime swap remains B6's (host-tested via the scheduler differ-path). + ### UNSAFE-2026-0024 — `TLBI VAE1` / `TLBI VMALLE1` asm + barriers in `QemuVirtMmu::invalidate_tlb_*` - **Introduced:** 2026-05-08, [T-016 — MMU activation](../analysis/tasks/phase-b/T-016-mmu-activation.md) Stage 3 (`bsp-qemu-virt/src/mmu.rs` skeleton). @@ -513,6 +517,8 @@ Both forms are time-stamped so a reader can reconstruct the entry's state at any **Amendment (2026-05-09, T-016 Stage 6 smoke verification): bootstrap-site `TLBI VMALLE1` + `IC IALLU` pass; per-VA `TLBI VAE1` path unverified.** The smoke trace (per UNSAFE-2026-0022's 2026-05-09 Amendment) confirms the bootstrap-Amendment-extended sweep-invalidate sequence (`TLBI VMALLE1; DSB ISH; IC IALLU; DSB ISH; ISB`) is correctly ordered: any incorrect placement would have left stale TLB entries or stale I-cache lines that would surface as a fault at `SCTLR_EL1.M = 1`; instead the kernel reaches `tyrne: mmu activated` and `tyrne: all tasks complete` cleanly. The original-scope per-VA `QemuVirtMmu::invalidate_tlb_address` site is **still unverified** at runtime because v1's demo does not call `Mmu::map` post-bootstrap (and therefore never produces a `MapperFlush` that fires `flush()`); the first runtime exercise of `TLBI VAE1, Xt` will land with a B3+ post-bootstrap caller. No new fault classes in `-d guest_errors`. + **Amendment (2026-05-30, T-022 / ADR-0033): scope extension to `kernel_main_high`'s post-migration `TLBI VMALLE1`.** After the regime crossing, `kernel_main_high` issues `TLBI VMALLE1` + `DSB ISH` + `ISB` to drop the now-stale low-identity translations of the `TTBR0` it just nulled. Same `invalidate_tlb_all`-class argument as the original entry (sweep-invalidate of every stage-1 EL1 entry, inner-shareable `DSB`, pipeline-drain `ISB`); register-only, so no `DSB` *before* the `TLBI` is needed.
Smoke-verified: `-d int,unimp` shows **zero** Translation/Permission faults after the migration, so no stale low entry survived to mistranslate a high fetch. Covered jointly with UNSAFE-2026-0031. + ### UNSAFE-2026-0025 — `QemuVirtMmu::map` / `unmap` page-table descriptor writes - **Introduced:** 2026-05-08, [T-016 — MMU activation](../analysis/tasks/phase-b/T-016-mmu-activation.md) Stage 4 (page-table-walk body of `Mmu::map` / `Mmu::unmap`). @@ -539,6 +545,8 @@ Both forms are time-stamped so a reader can reconstruct the entry's state at any - **Reviewed by:** @cemililik (+ Claude Opus 4.7 agent). - **Status:** Active. Pending QEMU smoke verification: v1's demo does not call `Mmu::map` post-bootstrap, so the in-tree path through this audit's site is unexercised at runtime. The host-test coverage of the encoders + the smoke trace's success at the bootstrap site (T-016 Stage 6) are the v1 evidence; the first runtime exercise comes with the future B3+ task that issues a post-bootstrap `Mmu::map` call. + **Amendment (2026-05-30, T-022 / ADR-0033): page-table-frame derefs rebased to the high-half direct map.** `walk_and_install_leaf` (`l3_ptr`) and `walk_or_alloc_table` (`parent_ptr`) now compute the descriptor-frame pointer as `phys_to_kernel_va(frame.as_usize()) as *mut u64` (`KERNEL_HIGH_HALF_OFFSET + pa`) rather than the identity `pa as *mut u64`, because the kernel runs in the high half post-migration. The safety argument is unchanged — the result is still a valid, 4 KiB-aligned, exclusively-owned, mapped kernel VA; only the VA computation moved from identity to the direct map (on host test builds the offset is 0, so the identity-backed test frames are unaffected). This path is now runtime-exercised by `task_loader::load_image`'s `cap_map` (the `tyrne: image loaded` smoke line) and is fault-clean (`-d int,unimp` shows zero Translation/Permission faults). 
+ **Amendment (2026-05-14, T-019 commit 3 BSP wiring): post-bootstrap `Mmu::map` smoke-verified.** T-019's [task loader BSP wiring](../../bsp-qemu-virt/src/main.rs)'s `load_image` invocation issues the first **post-bootstrap** `Mmu::map` calls on every boot — one per image page (1 for the v1 placeholder blob) plus one per stack page (1 for the v1 default `USERSPACE_STACK_PAGES = 1`), routed through [`cap_map`](../../kernel/src/mm/address_space.rs) under `USER | EXECUTE` / `USER | WRITE` flags respectively. Each call walks `L0 → L1 → L2 → L3` through the freshly-allocated root frame, allocating intermediate page-table frames via [`Pmm`](../../kernel/src/mm/pmm.rs) and writing leaf page descriptors via this audit's `core::ptr::write_volatile` site. The 2026-05-14 QEMU smoke trace shows `tyrne: image loaded (entry = 0x800000; sp = 0x802000; image bytes 8; stack bytes 4096; AS cap = idx 1)` followed by full demo completion through `tyrne: all tasks complete`; `-d int,unimp,guest_errors` reports only the pre-existing 629 PL011-disabled-UART warnings (no new fault classes, no Translation/Permission faults). The map-path invariants this entry names (root-frame validity inherited via `Mmu::create_address_space`'s `unsafe fn` contract → induction via table-descriptor chain; index bounds `[0, 511]` per `(va >> shift) & 0x1FF`; volatile access discipline; write ordering at leaf; caller's `MapperFlush` discharge via `cap_map`'s internal `token.flush(mmu)`; encoder correctness from host-tested `vmsav8` helpers) all hold under the smoke trace's runtime evidence. The unmap path stays unexercised at runtime in v1 (v1 has no userspace caller of `cap_unmap`); host-test coverage of `cap_unmap` in `kernel/src/mm/address_space.rs::tests::cap_unmap_returns_unmapped_frame` + the rollback-test paths in `kernel/src/obj/task_loader.rs::tests` are the v1 evidence base for unmap. 
### UNSAFE-2026-0026 — PMM frame-zeroing via `core::ptr::write_bytes` in `Pmm::alloc_frame` @@ -561,6 +569,8 @@ Both forms are time-stamped so a reader can reconstruct the entry's state at any - **Reviewed by:** @cemililik (+ Claude Opus 4.7 agent). - **Status:** Active. Smoke verification context (BSP commit 4 of T-017 — `bsp-qemu-virt::main.rs` PMM publication + `tyrne: pmm initialized (...)` boot-banner): the PMM is **constructed** on every boot (i.e., `Pmm::new` is reached and produces the banner with frame counts), but the `unsafe` zero-fill **operation inside `alloc_frame`** stays runtime-unexercised in v1 because the cooperative IPC demo never calls `alloc_frame`. (The earlier wording confusingly said "the site is reached on every boot" — that was about Pmm::new construction, not the `unsafe` zero-fill block this entry actually audits; clarified per PR #26 round-1 review.) The host-test coverage in `Pmm::alloc_frame_returns_first_free_and_zeroes_payload` (which pre-poisons backing memory with `0xA5` and asserts the post-alloc frame is all-zeroes) is the v1 evidence base under Miri's Stacked Borrows discipline. First runtime exercise of the zero-fill itself comes with B3+ work that issues a real `alloc_frame` — likely the AddressSpace bring-up (T-018) when it allocates a root translation-table frame. + **Amendment (2026-05-30, T-022 / ADR-0033): frame-zero deref rebased to the high-half direct map.** `alloc_frame`'s zero-fill pointer is now `phys_to_kernel_va(pa_usize) as *mut u8` (`KERNEL_HIGH_HALF_OFFSET + pa`) rather than the identity `pa_usize as *mut u8`, because the kernel runs in the high half post-migration; invariant (3)'s "identity-mapped … PA == VA" wording is superseded by "reachable at `phys_to_kernel_va(pa)`, the high-half direct map." On host test builds the offset is 0, so the real host-backed test frames are zero-filled in place unchanged. 
This path is now runtime-exercised by `task_loader::load_image`'s frame allocations (the `tyrne: image loaded` smoke line), fault-clean. + **Amendment (2026-05-14, T-019 commit 3 BSP wiring): `alloc_frame` zero-fill smoke-verified at runtime.** T-019's [task loader BSP wiring](../../bsp-qemu-virt/src/main.rs)'s `load_image` invocation is the first runtime caller of `Pmm::alloc_frame`. Per call it consumes: (1) one frame for the new AS's L0 root via [`cap_create_address_space`](../../kernel/src/mm/address_space.rs); (2) one frame per image page (1 for v1's 8-byte placeholder blob); (3) one frame per stack page (1 for `USERSPACE_STACK_PAGES = 1`); (4) up to 6 intermediate page-table frames via `Mmu::map`'s `walk_or_alloc_table`. The 2026-05-14 QEMU smoke trace shows `tyrne: pmm initialized (32600 frames available; 168 reserved)` followed by `tyrne: image loaded (...)` followed by full demo completion through `tyrne: all tasks complete`; no faults, no Translation/Permission errors. The four invariants this entry names (page-alignment of the target via `Pmm::new`'s validation (i); exclusive ownership via the just-set bitmap bit; identity mapping post-MMU per ADR-0027; bitmap-math overflow-freedom via `saturating_*` / `wrapping_div`; `write_bytes` ordering under single-core cooperative semantics) all hold under the runtime evidence. T-018's bootstrap AS path uses `wrap_bootstrap` (not `Mmu::create_address_space`) and therefore does NOT exercise `alloc_frame` for the L0 root — the *first* runtime exerciser was T-019, not T-018, despite the original status note's prediction (T-018 wraps the already-live `.boot_pt` L0 frame; the first kernel-side `alloc_frame` for a translation table came when T-019 minted the *second* AS). 
### UNSAFE-2026-0027 — task-loader frame byte-copy via `core::ptr::copy_nonoverlapping` in `task_loader::load_image` @@ -587,6 +597,8 @@ Both forms are time-stamped so a reader can reconstruct the entry's state at any - **Reviewed by:** @cemililik (+ Claude Opus 4.7 agent). - **Status:** Active. Smoke verification context: T-019 commit 3 (BSP wiring) is the first runtime exerciser; the 2026-05-14 QEMU smoke trace prints `tyrne: image loaded (entry = 0x800000; sp = 0x802000; image bytes 8; stack bytes 4096; AS cap = idx 1)` immediately after the address-space-arena banner and runs the demo to completion (`tyrne: all tasks complete`); `-d int,unimp,guest_errors` reports only the pre-existing 629 PL011-disabled-UART warnings (no new fault classes). The single `copy_nonoverlapping` call per boot writes 8 bytes from `.rodata`'s `USERSPACE_IMAGE` into a freshly PMM-allocated frame; bytes 8..4096 stay zero from [UNSAFE-2026-0026]'s contract (the tail-zeroing the §Simulation row 5 description promises). Host-test coverage in `task_loader::tests::tail_zeroing_on_partial_last_page` (which loads a 100-byte image, asserts payload bytes 0..100 match the source pattern, and asserts bytes 100..4096 are zero) pins the contract under Miri's Stacked Borrows discipline. + **Amendment (2026-05-30, T-022 / ADR-0033): destination frame deref rebased via `phys_frame_kernel_ptr`.** The loader's `copy_nonoverlapping` destination is obtained from [`crate::mm::phys_frame_kernel_ptr`](../../kernel/src/mm/mod.rs), whose body now returns `phys_to_kernel_va(frame.as_usize())` (the high-half direct map) rather than the identity `frame.as_usize()`; the migration broke the `addr_of!`/PA-as-VA conflation project-wide in this one helper, leaving the loader call site source-unchanged (the §Forward-compat note this entry referenced is now realised). On host builds the offset is 0 (identity). 
Smoke-verified: the 2026-05-30 trace prints `tyrne: image loaded …` (8 image bytes copied into the high-half-reached frame) and runs to `tyrne: all tasks complete`, fault-clean. + **Amendment (2026-05-15, T-019 commit 4 — review-round 1 follow-up): non-overlap invariant now runtime-enforced via `Pmm::could_yield_pa_overlapping` preflight.** Pre-amendment, this entry's invariant *"source and destination do not overlap"* was upheld at the BSP-wiring layer (`.rodata`-resident `USERSPACE_IMAGE` ⊆ PMM-reserved kernel-image range per ADR-0027 + ADR-0035; runtime allocations from `pmm.alloc_frame()` come from a disjoint allocatable subset of the extent). The 2026-05-14 PR #31 review-round 1 P1 finding (review #2) observed that this invariant was not mechanically enforced at the safe `load_image` API boundary: a safe in-crate caller could construct a `Pmm` over an extent that overlaps the image slice's PA range (the `Pmm::new` constructor accepts an arbitrary `PhysFrameRange` with no proof of disjointness from caller-held pointers), at which point a future `alloc_frame()` could return a frame aliasing the image source and `copy_nonoverlapping` would invoke undefined behaviour. The fix lands as a new public PMM query — [`Pmm::could_yield_pa_overlapping(pa_range)`](../../kernel/src/mm/pmm.rs) — plus a new preflight in `load_image` (§Simulation row 4) that converts the image slice's VA to a PA range under v1's identity-mapped kernel AS (ADR-0027 §Decision outcome (a)) and rejects with the new `LoadError::ImageOverlapsAllocatableMemory` variant if the query returns true. The preflight is pre-state-change, so no rollback is required. Two new host tests pin the contract: `task_loader::tests::rejects_when_image_overlaps_allocatable_memory` (PMM extent constructed over the image slice → reject) and `task_loader::tests::accepts_image_disjoint_from_pmm_extent` (heap-allocated image outside extent → accept). 
Invariant (4) of this entry's `Invariants relied on` section is therefore restated as: *"source and destination do not overlap — enforced at runtime by the `Pmm::could_yield_pa_overlapping` preflight in `load_image` (§Simulation row 4) prior to any `alloc_frame` call. The BSP-layout discipline (`.rodata` ⊆ PMM-reserved kernel-image range) remains the production reality but is no longer the load-bearing soundness argument."* The runtime preflight inherits the v1 identity-mapping dependence shared by every kernel-resident raw-pointer site (UNSAFE-2026-0025 / 0026 / 0027); a future ADR-0033 high-half migration introduces a `virt_to_phys` helper at the loader's call site as part of the project-wide sweep, not a T-019-local change. **Amendment (2026-05-15, T-019 commit 5 — review-round 2 follow-up): destination pointer materialised via [`crate::mm::phys_frame_kernel_ptr`] helper; VA-range preflight added to row 3.** Two follow-ups to PR #31 review-round 2 (gemini-code-assist line-level comments): @@ -668,3 +680,26 @@ Neither change touches the `copy_nonoverlapping` site itself; both correct contr - **Status:** Active. Host-tested (the `user_access` + `dispatch` `console_write` suites pin in-range / out-of-range / overrun / zero-length / wrap and the cap-gated emit) and Miri-clean under permissive provenance. Smoke-verified at runtime: the 2026-05-29 QEMU `console_write` `SVC` validates a 63-byte `.rodata` buffer against the identity-mapped RAM window and emits it (`bytes=63`, status `0x0`). `copy_to_user`'s `unsafe` block is host-tested + Miri-clean but has no runtime exerciser in v1 (no pointer-returning syscall yet); its first runtime exercise comes with B6+'s first user-pointer-returning syscall. 
**Amendment (2026-05-29, commit `2c713c0`, T-021 review-round follow-up): operation changed from `copy_nonoverlapping` to `core::ptr::copy` (memmove); the original §Operation `copy_nonoverlapping` wording, invariant (3), and the §Rejected-alternatives bullet that named `copy_nonoverlapping` as the chosen primitive are historical / superseded.** A review-round finding observed that the original invariant (3) — *"source and destination do not overlap (kernel `src`/`dst` vs. user buffer)"* — is **not proven** by `UserAccessWindow::validate`, which checks *bounds*, not *disjointness*. Both `unsafe` blocks now use `core::ptr::copy`. **Correction, verified empirically with Miri:** switching to `core::ptr::copy` does **not** make these functions "overlap-tolerant" (an earlier draft of this Amendment claimed so — that claim is itself corrected here). An *overlapping* `(user_ptr, kernel-slice)` pair is **UB regardless of the copy primitive**, because `copy_from_user`'s `dst: &mut [u8]` (resp. `copy_to_user`'s `src: &[u8]`) parameter is exclusive / shared, so an aliasing access through the exposed `user_ptr` violates that borrow — confirmed under Miri's Stacked Borrows (*"not granting access to tag <wildcard> because that would remove [Unique …] which is strongly protected"*). The **actual soundness basis is the user/kernel disjointness invariant**: `user_ptr` names *userspace* memory while the kernel slice is a *distinct allocation* (v1 — `console_write`'s fresh 256-byte stack buffer) / a *separate address space* (B6, per [ADR-0027 §Decision outcome (a)](../decisions/0027-kernel-virtual-memory-layout.md)), so `[user_ptr, user_ptr + len)` and the kernel slice never overlap. Under that invariant **both** `core::ptr::copy` and `copy_nonoverlapping` are sound; `core::ptr::copy` is retained as the conservative primitive (so the primitive itself imposes no overlap precondition on top of the disjointness invariant). 
**Invariant (3) is restated** as: *"Disjointness — the kernel slice (`dst`/`src`) and the userspace range `[user_ptr, user_ptr + len)` are disjoint by the user/kernel memory model; `validate` proves bounds, the address-space split proves disjointness; an overlapping pair would be UB via the parameter's borrow exclusivity (Miri-confirmed) and never occurs for a real caller."* The other invariants (range validity, the v1 identity map, no single-core interleaving) and the `from_raw_parts` / HAL-method rejected alternatives are unchanged. All current callers satisfy disjointness, so the host / Miri / QEMU evidence above still holds. The call-site SAFETY comments were updated to match. + + **Amendment (2026-05-30, T-022 / ADR-0033 high-half migration): the validated user pointer is dereferenced through the high-half regime.** The B5 EL1 kernel-stub now runs in the high half; its `.rodata` buffer is reached at its high-half VA, and the syscall window base is `phys_to_kernel_va(PMM_EXTENT_START)` (see [`bsp-qemu-virt/src/syscall.rs`](../../bsp-qemu-virt/src/syscall.rs)). Because the stub's "user" pointer **is** a valid kernel high-half VA, the validated direct deref remains sound for the stub (invariant 2's "v1 identity map" wording is superseded: the kernel now runs high-half, not identity). For B6's real EL0 task the pointer will be a *user* VA in a separate `TTBR0_EL1` AS, so B6 replaces the direct deref with a per-page user-VA → kernel-VA translation (T-021 carry-forward gate #1) — the window-containment + wrap + zero-length validation is unchanged. Smoke-verified 2026-05-30 (`console_write` emits `bytes=63`, status `0x0` in the debug trace). + +### UNSAFE-2026-0031 — boot-time high-half migration trampoline + `TTBR0_EL1` free + +- **Introduced:** 2026-05-30, [T-022 — high-half kernel mapping](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) / [ADR-0033](../decisions/0033-kernel-high-half-migration.md). 
The boot-time migration that moves the running kernel from the low identity (`TTBR0_EL1`) to the high half (`TTBR1_EL1`), freeing `TTBR0_EL1` for per-task userspace. +- **Location:** [`bsp-qemu-virt/src/main.rs`](../../bsp-qemu-virt/src/main.rs) — the migration-trampoline `asm!` block at the tail of `kernel_entry` (`MSR VBAR_EL1` + `ISB` + `add sp, sp, off` + `br`, `options(noreturn)`) and the `TTBR0`-free `asm!` block at the head of `kernel_main_high` (`MSR TTBR0_EL1, xzr` + `ISB` + `EPD0`-set RMW of `TCR_EL1` + `ISB` + `TLBI VMALLE1` + `DSB ISH` + `ISB`). +- **Operation:** ADR-0033 §Simulation rows 2–3. Executing from the low identity with both regimes live (`high_half_activate` already populated `TTBR1` + cleared `EPD1`), the trampoline (a) writes the **high** vector base to `VBAR_EL1` so any synchronous fault on the first high fetch vectors to a `TTBR1`-mapped handler, (b) rebases `SP` from the low identity stack to its high-half alias by adding `KERNEL_HIGH_HALF_OFFSET` (the same physical stack reached through `TTBR1`), and (c) `br`s the PC to the high-half image alias of `kernel_main_high` — the PC physically crosses low→high at the branch. `kernel_main_high` then nulls `TTBR0_EL1`, sets `TCR_EL1.EPD0 = 1`, and flushes stale low translations, leaving the kernel structurally absent from the low half. +- **Invariants relied on:** + - **Both regimes live across the branch.** `high_half_activate` (UNSAFE-2026-0022 / 0023 Amendments) installed `TTBR1` + cleared `EPD1` with the `DSB ISH`-before-`MSR` ordering, so the high VAs translate before the `br`. The low identity (`TTBR0`) is still live and is only nulled *after* the crossing (in `kernel_main_high`), so the few pre-`br` instructions execute through a valid mapping and the `br` target is reachable. 
+ - **`br` target lands in the PXN=0 image window.** `high_entry = KERNEL_HIGH_HALF_OFFSET | ((kernel_main_high as *const () as usize) & 0xFFFF_FFFF)` — the high-half image alias of `kernel_main_high`, which the high-half RAM window maps `PXN = 0` (executable). The mask is correct whether the compiler materialises the symbol PC-relative (low) or absolute (high), because the image PA is in the low 4 GiB. + - **`VBAR`-high before the `br`, with `ISB`.** The `ISB` after `MSR VBAR_EL1` context-synchronises the vector base before the branch, so a synchronous fault on the first high fetch cannot vector to a stale (low) vector that the about-to-be-nulled `TTBR0` maps. + - **`SP` rebase soundness under `options(noreturn)`.** The trampoline changes `SP` and never returns (it `br`s away); `options(noreturn)` tells the compiler nothing after the block runs, so there is no epilogue to mismatch the changed `SP`. `kernel_main_high` begins a fresh frame on the high stack alias (the same physical bytes), so its frame is self-consistent. The high stack is mapped in `TTBR1` (it lives in the kernel-image RAM region). + - **`DAIF` masked throughout.** Masked from `_start` (`boot.s` `msr daifset, #0xf`); no interrupt source is enabled until the GIC unmask far later in `kernel_main_high`, so no asynchronous exception can occur during the regime switch. + - **No live low-VA pointer survives.** No `StaticCell` is written before the migration (`kernel_main_high` writes them all, storing high VAs); the only low references were in `kernel_entry`'s own (abandoned) frame, which the `br`-away discards. + - **`TTBR0`-free ordering.** `MSR TTBR0_EL1, xzr` + `ISB`, then `EPD0`-set + `ISB`, then `TLBI VMALLE1` + `DSB ISH` + `ISB` — register-only writes (no table-memory mutation, so no `DSB` *before* the `TLBI` is required); the `TLBI` drops stale low translations and the `DSB ISH` completes it inner-shareable. 
`options(nostack, nomem)` is correct (no Rust-visible memory access; `SP` is untouched here — the trampoline already rebased it). +- **Rejected alternatives:** + - **Mid-kernel migration (after `start()`).** Rejected per ADR-0033 §Decision outcome: a live kernel has `DAIF` unmasked, surviving low-VA `StaticCell` pointers, and a live timer IRQ — three bricking hazards that all evaporate at boot. The boot-time framing reduces the irreducible risk to the relocation discipline + the `br` crossing. + - **A trampoline page mapped identically in both regimes (`VA == PA`).** Architecturally impossible at `T0SZ = T1SZ = 16` (disjoint `TTBR0` low / `TTBR1` high input ranges) — the first adversarial pass caught this. The corrected mechanism crosses the PC at the `br` with both `TTBR`s live, the low-running `kernel_entry` as the source regime. + - **Rebase `SP` inside `kernel_main_high` instead of the trampoline.** Rejected: the compiler emits `kernel_main_high`'s prologue (which may touch the stack) before any inline asm runs; changing `SP` mid-function would mismatch that frame. Rebasing in the trampoline (under `options(noreturn)`, no epilogue) means `kernel_main_high` starts with a consistent high `SP`. + - **Pre-flip `TLBI` of the high range before clearing `EPD1`.** Unnecessary: with `EPD1 = 1` a `TTBR1` walk faults and the architecture caches no result, so there is nothing stale to drop (the §Simulation review corrected an earlier "pre-flip TLBI" rationale). +- **Reviewed by:** @cemililik (+ Claude Opus 4.8 agent). Security-sensitive (changes the kernel's own translation regime + the kernel/user isolation boundary) → second-reviewer required per [unsafe-policy §Review.4](../standards/unsafe-policy.md); the §Simulation was hardened against two adversarial verification passes during ADR-0033 drafting + a third multi-lens adversarial pass at T-022 implementation. +- **Status:** Active. 
**Smoke-verified 2026-05-30:** the debug QEMU trace prints the new `tyrne: high-half active` marker (the runtime proof the `br` reached the PXN=0 high image window) and continues through `tyrne: all tasks complete`; `-d int,unimp` shows exactly the 2 expected `SVC` exceptions and **zero** Translation / Permission / Abort fault classes (the migration is fault-clean — ADR-0033 §Simulation row 4 abort gate). The release build boots identically (the release `console_write=0x1` quirk is pre-existing on the parent commit `bd39679`, not introduced here). The per-task `TTBR0_EL1` swap for a *distinct* address space is exercised at runtime in B6 (v1's demo keeps every task on the bootstrap AS); the swap mechanism (`Mmu::activate` writing the per-task root PA into the now-freed `TTBR0`) is host-tested via the scheduler differ-path (`yield_now_activates_when_tasks_differ_in_address_space`). diff --git a/docs/roadmap/current.md b/docs/roadmap/current.md index d337a14..03b710c 100644 --- a/docs/roadmap/current.md +++ b/docs/roadmap/current.md @@ -61,7 +61,7 @@ A short pointer file updated as work progresses. For the full plan see [`phases/ - **Active phase:** B — opened 2026-04-21. **B0 closed 2026-04-27**; **B1 closed 2026-05-07**; **B2 closed 2026-05-09**; **B3 closed 2026-05-14** via PR #29's closure trio (merge `b425dc1`); **B4 closed 2026-05-28** via its closure trio (T-019, PR #31 `7f876af`); **B5 closed 2026-05-29** via its closure trio (T-020 + T-021, PR #34 `f98e1af`). All six closures lifted `Done` after a verbatim QEMU smoke trace + clean `-d guest_errors` count per the [business master-plan §Acceptance criteria](../analysis/reviews/business-reviews/master-plan.md#acceptance-criteria) rule. **The 2026-04-28 implementation-complete claim for B1 was rolled back on 2026-05-06 by the smoke regression and re-issued 2026-05-07 as a smoke-verified Done** — that remains the only re-open arc to date; B2 and B3 both closed cleanly on first attempt.
- **Active milestone:** **B6 — First userspace "hello".** B5 (Syscall boundary) was formally **Closed 2026-05-29** via its closure trio (see the top banner + the [B5 business retrospective](../analysis/reviews/business-reviews/2026-05-29-B5-closure.md)). B6 per [phase-b.md §B6](phases/phase-b.md#milestone-b6--first-userspace-hello): a real EL0 task — loaded by the deferred [`task_create_from_image`](phases/phase-b.md#milestone-b4--task-loader) bridge (B4 §3, the `LoadedImage` → runnable `CapHandle{CapObject::Task(...)}` wrapper) — runs in its own AS, makes a `console_write` syscall through the lower-EL `VBAR_EL1+0x400` vector (the real EL0↔EL1 round-trip B5's `+0x200` proxy could not prove), and exits via `task_exit`. B6 **must** close the three [T-021 carry-forward gates](phases/phase-b.md#milestone-b6--first-userspace-hello) (per-task `console_write` window + per-page user-VA translation; `SP_EL1` init for `+0x400`; `SYSCALL_STUB_TABLE` → current-task table) before a real EL0 task runs, and pairs with the **ADR-0033 high-half** placeholder opening. B6 is the **Phase-B-closing** milestone (its review doubles as the Phase B retrospective). -- **Active task:** none — B5 closed via the closure trio. **Next to open:** the deferred [`task_create_from_image`](phases/phase-b.md#milestone-b4--task-loader) wrapper (the `LoadedImage` → runnable `TaskCap` bridge), paired with the **ADR-0033 high-half** ADR opening; then `userland/hello` + the `tyrne-user` safe-wrapper crate. 
**Last tasks Done: T-020 + T-021 — 2026-05-29** (PR #34, merge `f98e1af`): T-020 split `IpcError::InvalidCapability` → `StaleHandle`/`WrongObjectKind`/`MissingRight` + redacted `Capability`/`CapObject` `Debug` (zero new `unsafe`); T-021 landed the architecture-agnostic kernel `syscall` module (`SyscallError` / ABI / `UserAccessWindow` / dispatcher) + the BSP `tyrne_sync_trampoline` (current-EL `+0x200` + lower-EL `+0x400`) + `CapObject::DebugConsole` + `CapRights::CONSOLE_WRITE` + `CapHandle::from_raw`, exercised at B5 by an EL1-kernel-stub `SVC` (current-EL path; the real EL0 `+0x400` round-trip deferred to B6) — UNSAFE-2026-0029 / 0030. +- **Active task:** [T-022 — high-half kernel migration](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) — **In Review 2026-05-30** (B6's gating prerequisite). [ADR-0033 Accepted](../decisions/0033-kernel-high-half-migration.md) 2026-05-30 (separate commit `db892c1`); T-022 implements the boot-time migration — the kernel now runs in `TTBR1_EL1` (`tyrne: high-half active` boot marker) and `TTBR0_EL1` is freed for per-task userspace. Whole-image-high-linked + forced-low ELF entry + a single linear high-half offset (`0xFFFF_FFFF_0000_0000`); the ADR's Option-2 fallback was not needed. All gates green (340 host tests, host+kernel clippy, fmt, release build, Miri Stacked-Borrows, QEMU smoke fault-clean — exactly 2 SVC exceptions, zero new Translation/Permission faults); hardened against a third multi-lens adversarial pass (4 findings, all addressed). **Security-relevant → awaiting explicit security review.** **Next to open (after the review confirms T-022):** the EL0-ready `Task` context + enter-EL0/`ERET` path (T-021 carry-forward gate #2), then [`task_create_from_image`](phases/phase-b.md#milestone-b4--task-loader) (the `LoadedImage` → runnable `TaskCap` bridge), the remaining T-021 gates #1/#3, then `userland/hello` + the `tyrne-user` safe-wrapper crate. 
**Prior tasks Done: T-020 + T-021 — 2026-05-29** (PR #34, merge `f98e1af`): T-020 split `IpcError::InvalidCapability` → `StaleHandle`/`WrongObjectKind`/`MissingRight` + redacted `Capability`/`CapObject` `Debug` (zero new `unsafe`); T-021 landed the architecture-agnostic kernel `syscall` module (`SyscallError` / ABI / `UserAccessWindow` / dispatcher) + the BSP `tyrne_sync_trampoline` (current-EL `+0x200` + lower-EL `+0x400`) + `CapObject::DebugConsole` + `CapRights::CONSOLE_WRITE` + `CapHandle::from_raw`, exercised at B5 by an EL1-kernel-stub `SVC` (current-EL path; the real EL0 `+0x400` round-trip deferred to B6) — UNSAFE-2026-0029 / 0030. - **In review:** none — T-020 + T-021 merged via PR #34 (`f98e1af`); the B5 closure trio (security + business + performance) is complete. - **In progress:** none. - **Working branch:** `sec-review-b5-syscall-boundary` (off `main` at `f98e1af`) carries the B5 security review (`c424dcb`) + the `security-model.md` SMMUv3 reconcile (`afeed10`) + this closure trio (business + performance). **[PR #34](https://github.com/HodeTech/Tyrne/pull/34)** (T-020 + T-021, 9 commits) **merged to `main` 2026-05-29** (`f98e1af`); branches `t-020-syscall-error-taxonomy` / `t-021-syscall-dispatch` retired. diff --git a/docs/roadmap/phases/phase-b.md b/docs/roadmap/phases/phase-b.md index 222fbac..14994db 100644 --- a/docs/roadmap/phases/phase-b.md +++ b/docs/roadmap/phases/phase-b.md @@ -247,7 +247,7 @@ A real userspace task, loaded by B4, running in EL0 in its own address space, ma **Dependency-ordered task sequence** (each rides on the prior): -1. **ADR-0033 + the kernel-in-every-AS implementation**, plus the per-task `TTBR0_EL1` swap on context switch going live (the T-018 activation differ-path that short-circuits in v1). +1. 
✅ **ADR-0033 + the high-half migration** ([T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md), **In Review 2026-05-30**): the kernel now runs in `TTBR1_EL1` (boot-time migration; `tyrne: high-half active` marker), `TTBR0_EL1` is freed for per-task userspace, and `Mmu::activate` writes a per-task root PA into it (the runtime distinct-AS swap fires in B6, step 3). Single linear high-half offset; whole-image-high-linked + forced-low entry; fault-clean QEMU smoke. **This unblocks the rest of B6** (steps 2–7 build on the settled high-half regime). 2. **EL0 task context register file + the enter-EL0 path + per-task `SP_EL1`** (closes [T-021 carry-forward gate #2](#milestone-b6--first-userspace-hello)). 3. **`task_create_from_image`** — `LoadedImage` → runnable `CapHandle{CapObject::Task(...)}` (composes steps 1 + 2; the deferred [§B4 §3](#milestone-b4--task-loader) bridge). 4. **Close the remaining T-021 carry-forward gates:** the per-task `console_write` window + per-page user-VA → kernel-VA translation returning `FaultAddress` (**gate #1 — security-critical**; without it an EL0 debug-console-cap holder reads arbitrary kernel memory), and `SYSCALL_STUB_TABLE` → the scheduler's current-task table (**gate #3**). @@ -312,7 +312,7 @@ When B6 is Done, run a business review. Phase C becomes active after that review | ADR-0030 | Syscall ABI (includes `IpcError` taxonomy per K2-5) | B5 (**Accepted 2026-05-29**) | was ADR-0028. Settles the register convention (`x8`=number, `x0`–`x5` args, `SVC #0`, `x0`=status) + the dedicated-status-register encoding + `SyscallError` composition + the K2-5 `IpcError` split; drives [T-020](../../analysis/tasks/phase-b/T-020-syscall-error-taxonomy.md) + [T-021](../../analysis/tasks/phase-b/T-021-syscall-dispatch.md) (merged PR #34, `f98e1af`). | | ADR-0031 | Initial syscall set | B5 (**Accepted 2026-05-29**) | was ADR-0029.
Fixes the five-syscall v1 set (`send` / `recv` / `task_yield` / `task_exit` / `console_write`; `0` reserved-invalid); numbers `1`–`5` are a fixed ABI decision regression-verified by T-021's host tests, not chosen by the dispatcher. | | ADR-0032 | Endpoint state rollback on `ipc_recv_and_yield` Deadlock + `ipc_cancel_recv` primitive | B2 prep (**Accepted 2026-05-07**) | drove [T-015 (Done 2026-05-07)](../../analysis/tasks/phase-b/T-015-endpoint-rollback-cancel-recv.md) via PR #17. Surfaced as Track A non-blocker in the [2026-05-06 comprehensive review](../../analysis/reviews/code-reviews/2026-05-06-full-tree-comprehensive.md) and a forward-flagged item in the [2026-05-07 B1 closure security review](../../analysis/reviews/security-reviews/2026-05-07-B1-closure.md). Closed before B-phase task lands the first userspace-driven endpoint destroy. ADR-0017 §Revision notes rider records the additive recovery primitive (user-observable surface unchanged). | -| [ADR-0033](../../decisions/0033-kernel-high-half-migration.md) | Kernel high-half migration (kernel → `TTBR1_EL1`, boot-time; reachable from every task AS) | **B6 (Accepted 2026-05-30)** | **filed** to open B6 — the gating prerequisite (an EL0 task's `SVC` vector fetch must translate, impossible while the kernel is identity-only in `TTBR0`). Extends [ADR-0027](../../decisions/0027-kernel-virtual-memory-layout.md) §Decision outcome (Option D) — consumes the reserved `TTBR1`/`EPD1` + byte-stable high-half `TCR` fields; **no supersede**. Boot-time migration (DAIF-masked window, no live low-VA pointers), staged. Drives [T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) (Draft; opens with the Propose commit per [ADR-0025 §Rule 1](../../decisions/0025-adr-governance-amendments.md)). §Simulation hardened against two adversarial verification passes; Option B (map-kernel-into-every-TTBR0) is the documented fallback. 
**Accepted 2026-05-30** after the maintainer careful-re-read (review-round folded into the Proposed draft); drives [T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) (In Progress). | +| [ADR-0033](../../decisions/0033-kernel-high-half-migration.md) | Kernel high-half migration (kernel → `TTBR1_EL1`, boot-time; reachable from every task AS) | **B6 (Accepted 2026-05-30)** | **filed** to open B6 — the gating prerequisite (an EL0 task's `SVC` vector fetch must translate, impossible while the kernel is identity-only in `TTBR0`). Extends [ADR-0027](../../decisions/0027-kernel-virtual-memory-layout.md) §Decision outcome (Option D) — consumes the reserved `TTBR1`/`EPD1` + byte-stable high-half `TCR` fields; **no supersede**. Boot-time migration (DAIF-masked window, no live low-VA pointers), staged. Drives [T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) (Draft; opens with the Propose commit per [ADR-0025 §Rule 1](../../decisions/0025-adr-governance-amendments.md)). §Simulation hardened against two adversarial verification passes; Option B (map-kernel-into-every-TTBR0) is the documented fallback. **Accepted 2026-05-30** after the maintainer careful-re-read (review-round folded into the Proposed draft); drives [T-022](../../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) (**In Review 2026-05-30** — migration implemented + all gates green incl. Miri + QEMU smoke fault-clean; whole-image-high-linked + single linear high-half offset; the ADR's Option-2 fallback was not needed). | | ADR-0034 | Kernel-image section permissions (.text RX / .rodata R / .bss/.data RW) | B-late (placeholder; named-but-unallocated) | named in [ADR-0027 §Decision outcome (a)](../../decisions/0027-kernel-virtual-memory-layout.md) as the future home of finer-grained kernel-image permissions. 
v1 maps the entire 128 MiB RAM range as kernel R/W/X via 2 MiB blocks; T-016 §Out of scope and [`memory-management.md` §"v1 layout"](../../architecture/memory-management.md) defer the re-map. Opens with the first B-phase task whose threat model includes a kernel R/W of `.text` as a meaningful surface — likely **B6** — the first attacker-observable EL0 execution context (the v1 `hello` is code-only mapped `USER\|EXECUTE`, so ADR-0034 is hardening, not a B6 functional blocker; decide in B6 whether to harden now or defer). | | ADR-0035 | Physical Memory Manager (B3 prerequisite — bitmap allocator) | B3 (**Accepted 2026-05-09**) | new — drove the realisation that B3's "Address space abstraction" milestone has a foundational prerequisite (a real `FrameProvider` impl over physical RAM) which deserves its own ADR rather than being absorbed into ADR-0028 (address-space data structure). Drives [T-017 (Draft 2026-05-09; moves to In Progress with this Accept)](../../analysis/tasks/phase-b/T-017-physical-memory-manager.md). Bitmap allocator with hint pointer; 4 KiB metadata for QEMU virt's 32 K frames; reservation-list at init + cached for `free_frame` defensive validation per the §Simulation §Step 2 Critical row; forward-portable to high-half kernel without algorithm rewrite. Includes the §Simulation table walking init / alloc / free / exhaustion / recovery state transitions per [`write-adr` skill §Simulation](../../../.agents/skills/write-adr/SKILL.md). Accept landed as a separate commit per `write-adr` §10 after a careful re-read pass that surfaced and corrected three substantive drafting issues (broken anchor, safe-Rust-vs-`unsafe` zeroing contradiction, muddled "undefined-vs-error" wording in §Simulation row 2; the row-2 fix tightened the Pmm struct contract to add a cached reserved-range list for defensive `free_frame` validation, propagated to T-017). 
| | ADR-0036 | QEMU virt is GICv2 / no-IOMMU in v1 (corrects ADR-0004 / 0006 / 0012) | post-B1 (**Accepted 2026-05-22**) | new — surfaced by the [2026-05-22 full-tree master review](../../analysis/reviews/master-review/2026-05-22-152729/consolidated.md): the foundational ADRs carried GICv3 / SMMUv3 statements that do not match the GICv2, no-IOMMU reality of QEMU `virt` that B1's GIC work (above) actually assumed. **Corrects** (append-only redirect rider; does **not** supersede) [ADR-0004](../../decisions/0004-target-platforms.md) / [ADR-0006](../../decisions/0006-workspace-layout.md) / [ADR-0012](../../decisions/0012-boot-flow-qemu-virt.md). Ratifies the GICv2 fact stated in the B1 milestone. | diff --git a/hal/src/lib.rs b/hal/src/lib.rs index 3c02b95..63dc6b5 100644 --- a/hal/src/lib.rs +++ b/hal/src/lib.rs @@ -44,8 +44,8 @@ pub use context_switch::ContextSwitch; pub use cpu::{CoreId, Cpu, IrqGuard, IrqState}; pub use irq_controller::{IrqController, IrqNumber}; pub use mmu::{ - FrameProvider, MapperFlush, MappingFlags, Mmu, MmuError, PhysAddr, PhysFrame, VirtAddr, - PAGE_SIZE, + kernel_va_to_phys, phys_to_kernel_va, FrameProvider, MapperFlush, MappingFlags, Mmu, MmuError, + PhysAddr, PhysFrame, VirtAddr, KERNEL_HIGH_HALF_OFFSET, PAGE_SIZE, }; pub use timer::Timer; diff --git a/hal/src/mmu/mod.rs b/hal/src/mmu/mod.rs index cfe3c4c..e0d960b 100644 --- a/hal/src/mmu/mod.rs +++ b/hal/src/mmu/mod.rs @@ -20,6 +20,85 @@ use core::ops::{BitAnd, BitOr, BitOrAssign}; /// Fixed at 4 KiB in v1. Huge-page support is deferred to a later ADR. pub const PAGE_SIZE: usize = 4096; +/// Virtual-address offset of the kernel's high-half direct map of physical +/// memory ([ADR-0033]). +/// +/// After the boot-time high-half migration the kernel runs entirely in the +/// `TTBR1_EL1` high half (`VA[55] = 1`, `T1SZ = 16`). 
Physical memory is +/// reachable through a single linear **direct map** at this offset: +/// +/// ```text +/// kernel_VA(pa) = KERNEL_HIGH_HALF_OFFSET + pa +/// ``` +/// +/// This one offset serves **both** roles [ADR-0033 §Dependency chain step +/// 2][adr-0033-dep] names — the kernel-image *link* offset (the image is +/// linked at `KERNEL_HIGH_HALF_OFFSET + KERNEL_IMAGE_PHYS_BASE`) and the +/// *physmap*/direct-map offset (any frame's kernel VA). They coincide for +/// every in-image address because the image PA range is a subset of the +/// direct-mapped PA range, so "using the wrong offset at a site" is +/// impossible by construction (the value is identical). Device-MMIO PAs +/// (below the RAM base) are covered by the same linear map. +/// +/// `0xFFFF_FFFF_0000_0000` places PA `0` at the base of the top 4 GiB of +/// the 64-bit VA space; the QEMU virt PA range (`0x0000_0000 .. +/// 0x4800_0000`, well under 4 GiB) maps without overflow and every result +/// has `VA[55] = 1`, so the `TTBR1_EL1` walker serves it. The boot-time +/// migration ([`bsp-qemu-virt/linker.ld`] + `kernel_entry`) uses the same +/// value; the BSP carries a compile-time `assert!` pinning the two in sync. +/// +/// **Host builds (`cfg(not(target_arch = "aarch64"))`) define the offset as +/// `0`** — there is no MMU or high-half on the test harness, so +/// [`phys_to_kernel_va`] / [`kernel_va_to_phys`] are the identity there and +/// the kernel-crate host tests (PMM frame zero-fill, `phys_frame_kernel_ptr`) +/// deref their real host-backed "frames" unchanged. Only the aarch64 kernel +/// build carries the real high-half offset. 
+/// +/// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md +/// [adr-0033-dep]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md#dependency-chain +#[cfg(target_arch = "aarch64")] +pub const KERNEL_HIGH_HALF_OFFSET: usize = 0xFFFF_FFFF_0000_0000; + +/// Host-build identity offset — see the aarch64 [`KERNEL_HIGH_HALF_OFFSET`] +/// for the rationale (no MMU/high-half on the test harness). +#[cfg(not(target_arch = "aarch64"))] +pub const KERNEL_HIGH_HALF_OFFSET: usize = 0; + +/// Translate a physical address to its kernel high-half direct-map virtual +/// address (`KERNEL_HIGH_HALF_OFFSET + pa`). +/// +/// The single helper every kernel/BSP site routes a physical-frame, page- +/// table, or MMIO-register dereference through once the kernel runs high +/// ([ADR-0033]). `wrapping_add` matches the kernel's +/// `arithmetic_side_effects` discipline; the QEMU virt PA range cannot +/// overflow the offset (see [`KERNEL_HIGH_HALF_OFFSET`]). +/// +/// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md +#[must_use] +#[inline] +pub const fn phys_to_kernel_va(pa: usize) -> usize { + KERNEL_HIGH_HALF_OFFSET.wrapping_add(pa) +} + +/// Translate a kernel high-half direct-map virtual address back to its +/// physical address (`va - KERNEL_HIGH_HALF_OFFSET`) — the inverse of +/// [`phys_to_kernel_va`]. +/// +/// Used by the small number of post-migration sites that take a linker +/// symbol's address (`addr_of!(...)`, resolved HIGH while the kernel runs +/// high) but need the physical address — e.g. programming a `TTBR`, naming a +/// page-table root frame, or computing the kernel-image reserved range for +/// the PMM ([ADR-0033] §Negative — "the `addr_of!`-as-PA conflation must be +/// broken project-wide"). 
Only valid for direct-mapped high-half addresses; +/// `wrapping_sub` matches the kernel's `arithmetic_side_effects` discipline. +/// +/// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md +#[must_use] +#[inline] +pub const fn kernel_va_to_phys(va: usize) -> usize { + va.wrapping_sub(KERNEL_HIGH_HALF_OFFSET) +} + /// A virtual address. /// /// The underlying integer is exposed as a `pub` field so call sites can diff --git a/hal/src/mmu/vmsav8.rs b/hal/src/mmu/vmsav8.rs index 8d4c5a4..35d783d 100644 --- a/hal/src/mmu/vmsav8.rs +++ b/hal/src/mmu/vmsav8.rs @@ -172,6 +172,24 @@ pub const TCR_EL1_VALUE: u64 = { | as_field }; +/// `TCR_EL1` value for the **high-half regime** (post-[ADR-0033] migration): +/// byte-identical to [`TCR_EL1_VALUE`] except `EPD1` (bit 23) is cleared, +/// enabling `TTBR1_EL1` translation-table walks for the kernel's high-half +/// mapping. Every `TTBR0`-governing field (`T0SZ` / `EPD0` / `IRGN0` / +/// `ORGN0` / `SH0` / `TG0`) stays byte-stable so perturbing the live +/// `TTBR0` regime is structurally impossible (the +/// [`tcr_high_half_clears_only_epd1`] test pins the single-bit delta). +/// +/// Written to `TCR_EL1` by the boot-time high-half migration ([T-022] / +/// [ADR-0033 §Simulation row 1][adr-0033]); the **`DSB ISH`** that publishes +/// the `TTBR1` table-memory writes precedes this `MSR`, so no walk can read a +/// stale descriptor once `EPD1` clears. 
+/// +/// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md +/// [adr-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md#simulation +/// [T-022]: https://github.com/HodeTech/Tyrne/blob/main/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md +pub const TCR_EL1_VALUE_HIGH_HALF: u64 = TCR_EL1_VALUE & !(1 << 23); + /// `SCTLR_EL1` bits we **set** when activating the MMU: `M` (bit 0, /// MMU on), `C` (bit 2, D-cache enable), `I` (bit 12, I-cache enable). /// @@ -407,7 +425,7 @@ mod tests { block_descriptor, flags_to_descriptor_bits, page_descriptor, table_descriptor, AP_KERNEL_RO, AP_KERNEL_RW, AP_USER_RO, AP_USER_RW, ATTR_IDX_DEVICE, ATTR_IDX_NORMAL, MAIR_EL1_VALUE, SCTLR_EL1_MMU_ENABLE_MASK, SH_INNER_SHAREABLE, SH_NON_SHAREABLE, - TCR_EL1_VALUE, + TCR_EL1_VALUE, TCR_EL1_VALUE_HIGH_HALF, }; use crate::MappingFlags; @@ -443,6 +461,32 @@ mod tests { assert_eq!((TCR_EL1_VALUE >> 30) & 0x3, 0b10); } + #[test] + fn tcr_high_half_clears_only_epd1() { + // The high-half TCR (ADR-0033 §Simulation row 1) must clear EPD1 + // (bit 23) to enable TTBR1 walks and leave EVERY other bit — + // crucially every TTBR0-governing field — byte-identical to the + // live v1 TCR. A perturbation of any TTBR0 field would fault the + // next low fetch during the migration; this single-bit-delta + // assertion is the row-1 verification artefact. + assert_eq!( + (TCR_EL1_VALUE >> 23) & 0x1, + 1, + "precondition: v1 TCR has EPD1 set (TTBR1 disabled)" + ); + assert_eq!( + (TCR_EL1_VALUE_HIGH_HALF >> 23) & 0x1, + 0, + "high-half TCR clears EPD1 (TTBR1 walks enabled)" + ); + // The ONLY difference is bit 23: XOR must equal exactly 1<<23. 
+ assert_eq!( + TCR_EL1_VALUE ^ TCR_EL1_VALUE_HIGH_HALF, + 1 << 23, + "high-half TCR differs from v1 TCR in exactly bit 23 (EPD1)" + ); + } + #[test] fn sctlr_mmu_enable_mask_sets_m_c_i_only() { assert_eq!(SCTLR_EL1_MMU_ENABLE_MASK & (1 << 0), 1 << 0); // M diff --git a/kernel/src/mm/mod.rs b/kernel/src/mm/mod.rs index 82e0016..2b03fd0 100644 --- a/kernel/src/mm/mod.rs +++ b/kernel/src/mm/mod.rs @@ -108,74 +108,39 @@ pub use pmm::{Pmm, PmmError, PmmStats}; /// Return a kernel-writable raw pointer for `frame`'s base PA. /// -/// In v1 the kernel address space is identity-mapped over the entire -/// PMM-managed physical extent per -/// [ADR-0027 §Decision outcome (a)][adr-0027], so any -/// [`tyrne_hal::PhysFrame`] returned by [`Pmm::alloc_frame`] is -/// reachable at VA = PA from kernel code. This helper *centralises* -/// that assumption: every kernel-side caller that needs to read or -/// write a PMM-allocated frame's payload (e.g. -/// [`crate::obj::task_loader::load_image`]'s `copy_nonoverlapping` -/// byte-copy site under [UNSAFE-2026-0027]) routes through this -/// function so the future high-half migration -/// ([ADR-0033 placeholder][adr-0027]) can replace the body with a -/// real PA → kernel-VA translation in **one** place, leaving every -/// call site source-compatible. +/// Since the high-half migration ([ADR-0033], T-022) the kernel runs in the +/// `TTBR1_EL1` high half and reaches physical memory through the high-half +/// direct map, so a frame's kernel VA is +/// [`tyrne_hal::phys_to_kernel_va(pa)`][phys_to_kernel_va] = +/// `KERNEL_HIGH_HALF_OFFSET + pa`. This helper *centralises* that translation: +/// every kernel-side caller that needs to read or write a PMM-allocated +/// frame's payload (e.g. [`crate::obj::task_loader::load_image`]'s +/// `copy_nonoverlapping` byte-copy site under [UNSAFE-2026-0027]) routes +/// through this one function. 
(Before T-022 the kernel was identity-mapped +/// and the body was the bare `pa as *mut u8`; ADR-0033 §Negative replaced it +/// with the direct-map rebase in this single place, leaving every call site +/// source-compatible.) /// /// The function itself is safe (the `as *mut u8` cast is infallible /// Rust); only the *dereference* at the call site is `unsafe` and /// requires the audit-log entry that names the call site's specific /// ownership / aliasing discipline. /// -/// ## Forward-compat note -/// -/// When [ADR-0033 placeholder][adr-0027] opens and the kernel moves -/// to a high-half virtual layout, this function's body grows to a -/// `KERNEL_PHYS_BASE`-rebased translation; every call site keeps -/// working without source changes. The kernel crate denies -/// `clippy::expect_used` / `clippy::unwrap_used` / `clippy::panic`, -/// so the future migration cannot adopt an `.expect(...)`-style -/// snippet (a copy-pasted example would lint-fail). The intended -/// shape is a `checked_add` with a `debug_assert!` + fallback — -/// either of: -/// -/// ```ignore -/// // Pattern A — branch on the overflow path (no panic in release). -/// KERNEL_PHYS_BASE -/// .checked_add(frame.as_usize()) -/// .unwrap_or_else(|| { -/// debug_assert!( -/// false, -/// "ADR-0033: KERNEL_PHYS_BASE + frame PA overflows usize" -/// ); -/// // Fall back to the unchecked value — the debug_assert -/// // catches the overflow in development; release builds -/// // produce a deterministic value rather than a panic. -/// KERNEL_PHYS_BASE.wrapping_add(frame.as_usize()) -/// }) as *mut u8 -/// -/// // Pattern B — saturating arithmetic, matches the rest of the -/// // kernel's clippy::arithmetic_side_effects discipline. -/// KERNEL_PHYS_BASE.saturating_add(frame.as_usize()) as *mut u8 -/// ``` -/// -/// Both are lint-clean against the workspace's pedantic + kernel- -/// extra denies. 
The audit-log entries that cite "identity mapping -/// post-MMU per ADR-0027" ([UNSAFE-2026-0026], [UNSAFE-2026-0027]) -/// gain a "lifted via ADR-0033 migration on date X" Amendment at -/// the same commit. The PMM's existing `core::ptr::write_bytes` -/// site ([`kernel/src/mm/pmm.rs`](pmm.rs)) is the second adopter — -/// its safety comment already names the future-migration plan; the -/// physical PMM site will route through this helper at the same -/// commit ADR-0033 lands (kept inline today to avoid churning the -/// audit-log entries that landed with T-017). +/// The PMM's zero-fill site ([`kernel/src/mm/pmm.rs`](pmm.rs)) and the BSP's +/// page-table walk ([`bsp-qemu-virt/src/mmu.rs`]) perform the same +/// direct-map rebase at their own `unsafe` deref sites; their audit-log +/// entries ([UNSAFE-2026-0026], [UNSAFE-2026-0027]) gained ADR-0033 +/// Amendments at the T-022 commit. /// +/// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md +/// [phys_to_kernel_va]: tyrne_hal::phys_to_kernel_va /// [UNSAFE-2026-0026]: https://github.com/HodeTech/Tyrne/blob/main/docs/audits/unsafe-log.md -/// -/// [adr-0027]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0027-kernel-virtual-memory-layout.md /// [UNSAFE-2026-0027]: https://github.com/HodeTech/Tyrne/blob/main/docs/audits/unsafe-log.md #[must_use] #[inline] pub(crate) fn phys_frame_kernel_ptr(frame: tyrne_hal::PhysFrame) -> *mut u8 { - frame.as_usize() as *mut u8 + // Direct-map rebase: kernel VA = KERNEL_HIGH_HALF_OFFSET + pa (ADR-0033). + // The helper is infallible (`wrapping_add` + cast); only the *dereference* + // at the call site is `unsafe` and carries the audit-log entry. 
+ tyrne_hal::phys_to_kernel_va(frame.as_usize()) as *mut u8 } diff --git a/kernel/src/mm/pmm.rs b/kernel/src/mm/pmm.rs index f37ea42..3a0540f 100644 --- a/kernel/src/mm/pmm.rs +++ b/kernel/src/mm/pmm.rs @@ -402,7 +402,11 @@ impl Pmm { let pa_off = idx.saturating_mul(PAGE_SIZE); let pa_usize = self.extent.start.0.saturating_add(pa_off); let frame = PhysFrame::from_aligned(PhysAddr(pa_usize))?; - let pa_ptr = pa_usize as *mut u8; + // Reach the frame through the high-half direct map (ADR-0033 / T-022): + // kernel VA = KERNEL_HIGH_HALF_OFFSET + pa. On the host test harness + // the offset is 0 (identity), so the test's real host-backed frames + // are written in place. + let pa_ptr = tyrne_hal::phys_to_kernel_va(pa_usize) as *mut u8; // Mark allocated. Reached only after the fallible `from_aligned` // above succeeded, so the counters/bitmap and the handed-out @@ -434,11 +438,12 @@ impl Pmm { // bitmap bit is the proof; no other kernel subsystem can // hold a PhysFrame for this index until alloc_frame // returns ownership to the caller. - // (3) the region is identity-mapped to a kernel-readable VA - // per ADR-0027's identity-only v1 layout (post-MMU - // activation in mmu_bootstrap, kernel sees PA == VA); - // the high-half migration (ADR-0033 placeholder) will - // introduce a `phys_to_virt` helper at this site. + // (3) the region is reachable at the kernel VA + // `phys_to_kernel_va(pa)` — the high-half direct map the + // kernel runs through post-migration (ADR-0033 / T-022). + // Pre-T-022 this was identity (PA == VA per ADR-0027); + // the rebase is the single `phys_to_kernel_va` helper now, + // identity on host test builds (offset 0). // (4) PAGE_SIZE = 4096 is well within isize::MAX on aarch64; // `write_bytes` cannot overflow any intermediate // arithmetic. 
diff --git a/kernel/src/obj/task_loader.rs b/kernel/src/obj/task_loader.rs index 2a9d7b6..a8807b4 100644 --- a/kernel/src/obj/task_loader.rs +++ b/kernel/src/obj/task_loader.rs @@ -587,17 +587,20 @@ pub fn load_image( // §Simulation row 4: image-PA-overlap preflight. Discharges // UNSAFE-2026-0027 invariant "source and destination do not - // overlap" at runtime — `image.as_ptr() as usize` is treated as a - // PA under v1's identity-mapped post-bootstrap kernel AS (ADR-0027 - // §Decision outcome (a)). If any byte of the image's PA range - // could be returned by `pmm.alloc_frame()`, `copy_nonoverlapping` - // in the image-page loop below would alias source and destination - // — undefined behaviour per Rust's `core::ptr::copy_nonoverlapping` - // safety contract. The check is practically unreachable under - // correct BSP wiring (`.rodata`-resident images live in PMM- - // reserved memory by ADR-0035) but defensive against BSP - // misconfiguration. Pre-state-change; no rollback needed. - let image_pa_start = image.as_ptr() as usize; + // overlap" at runtime. Since the high-half migration (ADR-0033 / + // T-022) the kernel runs in the high half, so `image.as_ptr()` + // resolves to the image's high-half VA — `kernel_va_to_phys` + // converts it back to the PA the PMM extent is expressed in (on host + // test builds the offset is 0, so this is the identity). If any byte + // of the image's PA range could be returned by `pmm.alloc_frame()`, + // `copy_nonoverlapping` in the image-page loop below would alias + // source and destination — undefined behaviour per Rust's + // `core::ptr::copy_nonoverlapping` safety contract. The check is + // practically unreachable under correct BSP wiring (`.rodata`- + // resident images live in PMM-reserved memory by ADR-0035) but + // defensive against BSP misconfiguration. Pre-state-change; no + // rollback needed. 
+ let image_pa_start = tyrne_hal::kernel_va_to_phys(image.as_ptr() as usize); let image_pa_end = image_pa_start.saturating_add(image.len()); if pmm.could_yield_pa_overlapping(image_pa_start..image_pa_end) { return Err(LoadError::ImageOverlapsAllocatableMemory); diff --git a/tools/smoke.sh b/tools/smoke.sh new file mode 100755 index 0000000..b86fe41 --- /dev/null +++ b/tools/smoke.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# Non-interactive QEMU smoke runner for automated use (CI / agent loops). +# +# Unlike tools/run-qemu.sh (interactive, mon:stdio), this runs QEMU with a +# pure serial console (no monitor — so an stdin EOF cannot quit QEMU early), +# bounds the run with a wall-clock timeout (the kernel idles in WFI after +# "tyrne: all tasks complete" and never exits on its own), captures the full +# trace to a log file, and reports the boot markers. +# +# Usage: +# tools/smoke.sh — debug build, 20s budget +# tools/smoke.sh --release — release build +# tools/smoke.sh --int — add -d int,unimp,guest_errors +# tools/smoke.sh --timeout 30 — override the wall-clock budget (s) +# tools/smoke.sh — explicit ELF +# +# The full trace is written to ${TMPDIR:-/tmp}/tyrne-smoke..log (printed). +set -euo pipefail + +PROFILE="debug" +TO=20 +INT_FLAGS=() +KERNEL="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --release) PROFILE="release"; shift ;; + --int) INT_FLAGS=(-d int,unimp,guest_errors); shift ;; + --timeout) TO="$2"; shift 2 ;; + -h|--help) sed -n '2,/^set -/p' "$0" | sed 's/^# \{0,1\}//;/^set -/d' >&2; exit 0 ;; + --*) echo "error: unknown flag: $1" >&2; exit 2 ;; + *) KERNEL="$1"; shift ;; + esac +done + +[[ -z "$KERNEL" ]] && KERNEL="target/aarch64-unknown-none/${PROFILE}/tyrne-bsp-qemu-virt" +if [[ ! 
-f "$KERNEL" ]]; then + echo "error: kernel image not found at $KERNEL (run 'cargo kernel-build' first)" >&2 + exit 1 +fi + +LOG="${TMPDIR:-/tmp}/tyrne-smoke.$$.log" +echo "smoke: $KERNEL (budget ${TO}s) log -> $LOG" >&2 + +# perl alarm wrapper: fork QEMU, SIGTERM it after $TO seconds. QEMU inherits +# the child's stdout/stderr (redirected to $LOG by the caller below). +TO="$TO" perl -e ' + my $pid = fork(); + if ($pid == 0) { open(STDIN, "<", "/dev/null"); exec(@ARGV) or die "exec: $!"; } + $SIG{ALRM} = sub { kill("TERM", $pid); }; + alarm($ENV{TO}); + waitpid($pid, 0); +' qemu-system-aarch64 -M virt -cpu cortex-a72 -m 128M -smp 1 \ + -display none -serial stdio -monitor none \ + "${INT_FLAGS[@]+"${INT_FLAGS[@]}"}" \ + -kernel "$KERNEL" > "$LOG" 2>&1 || true + +echo "===== trace =====" >&2 +cat "$LOG" +echo "===== markers =====" >&2 +grep -nE "tyrne:|panic|all tasks complete|high-half" "$LOG" || echo "(no tyrne markers found)" +echo "===== fault classes (int log, if --int) =====" >&2 +grep -nE "Taking exception|Translation fault|Permission fault|Data Abort|Prefetch Abort" "$LOG" | head -40 || true From 3ecef5efc6cd10564df35c6eb9d47af447e12bc3 Mon Sep 17 00:00:00 2001 From: Cemil ILIK Date: Sat, 30 May 2026 06:08:18 +0300 Subject: [PATCH 5/7] =?UTF-8?q?fix(mmu):=20T-022=20review-round=20?= =?UTF-8?q?=E2=80=94=20activate=20EPD0=20clear,=20cfg=20idiom,=20contract/?= =?UTF-8?q?doc=20fixes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the careful review of 6c7502b (high-half migration). The verified migration mechanism is unchanged; this completes the per-task-swap correctness and fixes a latent host-portability cfg, contract-comment drift, tooling, docs. - HIGH (completes T-022's "swap goes live"): QemuVirtMmu::activate now clears TCR_EL1.EPD0 (re-enables TTBR0 walks) in addition to writing TTBR0_EL1. 
The migration set EPD0=1 when freeing TTBR0, so without this the first real EL0 task's lower-half access would translation-fault. (No v1 caller; fails closed today — the swap criterion is now actually functional for B6.) - MEDIUM (latent host bug): KERNEL_HIGH_HALF_OFFSET's cfg changed from `target_arch="aarch64"` to `all(target_arch="aarch64", target_os="none")` (matching hal/src/cpu.rs). The bare form also matches an aarch64 HOST (Apple Silicon), where cargo test / Miri would deref phys_to_kernel_va of a real host pointer into wild memory. The kernel build (target_os=none) is unchanged. - MEDIUM (contract drift): the bootstrap-root wrap runs AFTER kernel_main_high frees TTBR0, so the "currently-live in TTBR0" safety wording was false post- migration. from_existing_root + the wrap-site comments now say "valid + populated" (liveness-in-TTBR0 was never the soundness basis); UNSAFE-2026-0028 gains an Amendment (body unchanged per unsafe-policy §3). - LOW (tooling): tools/smoke.sh now exits non-zero on a missing completion marker or any panic/fault class (usable as a CI gate); --int drops guest_errors (uses -d int,unimp) so PL011 noise no longer interleaves. - Docs: corrected the release console_write=0x1 note — it is the INTENDED debug-gate (abi.rs `5 if cfg!(debug_assertions)`, ADR-0031), not a bug or a T-022 effect (my earlier "B5 follow-up" framing was wrong; the review's abi.rs root-cause is correct). Sharpened the T-021 gate #1 copy-user precondition to a hard ordering requirement. boot.md sequence diagram updated to the split low->migration->high flow; boot_ns comments fixed to kernel_main_high + the metric-shift note. Pi-4 / 4 GiB forward-limit note added (HAL offset doc, linker.ld, boot.md). memory-management.md frame counts + .boot_pt frame count refreshed (T-022 +2 high-half root frames). Gates re-run green: 340 host tests, host+kernel clippy -D warnings, fmt, kernel build, QEMU smoke (gated, PASS) + -d int,unimp (2 SVC, zero faults). 
Refs: T-022, ADR-0033, UNSAFE-2026-0028, UNSAFE-2026-0031 Co-Authored-By: Claude Opus 4.8 (1M context) --- bsp-qemu-virt/linker.ld | 4 +- bsp-qemu-virt/src/main.rs | 43 ++++++++------ bsp-qemu-virt/src/mmu.rs | 59 +++++++++++++------ .../phase-b/T-022-high-half-kernel-mapping.md | 4 +- docs/architecture/boot.md | 24 +++++--- docs/architecture/memory-management.md | 6 +- docs/audits/unsafe-log.md | 4 +- docs/roadmap/phases/phase-b.md | 2 +- hal/src/mmu/mod.rs | 23 ++++++-- tools/smoke.sh | 23 +++++++- 10 files changed, 133 insertions(+), 59 deletions(-) diff --git a/bsp-qemu-virt/linker.ld b/bsp-qemu-virt/linker.ld index fd8aeb8..143f070 100644 --- a/bsp-qemu-virt/linker.ld +++ b/bsp-qemu-virt/linker.ld @@ -34,7 +34,9 @@ */ /* tyrne_hal::KERNEL_HIGH_HALF_OFFSET — kept in sync by a compile-time assert - * in bsp-qemu-virt/src/main.rs. */ + * in bsp-qemu-virt/src/main.rs. Forward limit: this offset (and the migration + * mask in main.rs) bound the direct map to the low 4 GiB of PA; a BSP with + * > 4 GiB RAM / high peripherals (e.g. Pi 4) needs a different offset. */ KERNEL_HH_OFFSET = 0xFFFFFFFF00000000; KERNEL_IMAGE_PHYS_BASE = 0x40080000; KBASE = KERNEL_HH_OFFSET + KERNEL_IMAGE_PHYS_BASE; diff --git a/bsp-qemu-virt/src/main.rs b/bsp-qemu-virt/src/main.rs index 1cad85a..70a66a2 100644 --- a/bsp-qemu-virt/src/main.rs +++ b/bsp-qemu-virt/src/main.rs @@ -250,7 +250,7 @@ static GIC: StaticCell = StaticCell::new(); /// The PL011 console — used by task functions for diagnostic output. static CONSOLE: StaticCell = StaticCell::new(); -/// Boot-time `now_ns()` snapshot, written once by `kernel_entry` after the +/// Boot-time `now_ns()` snapshot, written once by `kernel_main_high` after the /// CPU is constructed and read by `task_a` to compute the boot-to-end /// elapsed time. T-009 measurement scaffold; replaced by a richer /// instrumentation surface when the first hypothesis-driven performance @@ -650,7 +650,7 @@ fn task_a() -> ! 
{ console.write_bytes(b"tyrne: all tasks complete\n"); // T-009 measurement: print boot-to-end elapsed time. Uses `now_ns` on - // the live Timer impl and the BOOT_NS snapshot taken in `kernel_entry`. + // the live Timer impl and the BOOT_NS snapshot taken in `kernel_main_high`. // `saturating_sub` is defensive — the hardware counter is monotonic so // `now >= boot_ns` always holds, but the saturating form makes the // subtraction's correctness obvious to a reader scanning for overflow @@ -1037,8 +1037,12 @@ extern "C" fn kernel_main_high() -> ! { // // `cpu.now_ns()` reads `CNTVCT_EL0` (system register, MMU-independent). // Sampled just after the high-half migration so the boot-to-end baseline - // measures the high-half steady state; the one-time migration cost (a few - // µs) is excluded — immaterial against the ~ms boot-to-end total. + // measures the high-half steady state. NOTE: this excludes BOTH + // `mmu_bootstrap` (MMU activation, ~< 100 µs — which the pre-T-022 boot_ns + // deliberately *included*) and the migration (~ a few µs), so the metric's + // meaning shifted vs the pre-T-022 baseline (now "high-half-steady-state to + // end", not "MMU-activation to end"). Both excluded costs are immaterial + // against the ~ms boot-to-end total; the perf review records the shift. let boot_ns = cpu.now_ns(); // SAFETY: single-core; no concurrent writer exists before `start()`. // Audit: UNSAFE-2026-0001. @@ -1144,11 +1148,10 @@ extern "C" fn kernel_main_high() -> ! { .expect("L0 root must be 4 KiB-aligned per linker.ld `.boot_pt` reservation") }; - // Wrap the already-live root + publish in arena slot 0. - // Wrap the already-live root + publish in arena slot 0. The - // `bootstrap_root_pa` for the banner is read directly from - // `l0_root` — the wrapped `AddressSpace` stores - // exactly this `PhysFrame` and the round-trip is pinned by + // Wrap the bootstrap root (the low-identity L0 `mmu_bootstrap` built) + // and publish it as arena slot 0. 
The `bootstrap_root_pa` for the banner + // is read directly from `l0_root` — the wrapped `AddressSpace` + // stores exactly this `PhysFrame` and the round-trip is pinned by // `wrap_bootstrap_returns_address_space_with_root` in // `kernel/src/mm/address_space.rs::tests`. let bootstrap_root_pa = l0_root.as_usize(); @@ -1160,15 +1163,19 @@ extern "C" fn kernel_main_high() -> ! { // These two entries cover ONLY the StaticCell/arena publish // mechanics, not the `from_existing_root` wrap below. // - `QemuVirtAddressSpace::from_existing_root(l0_root)` requires - // `l0_root` to be a currently-live VMSAv8 L0 translation table - // (see its `# Safety` doc). `mmu_bootstrap` populated this exact - // frame and wrote its PA into `TTBR0_EL1` before this block runs - // (we are post-`mmu_bootstrap` at this point); the kernel-half - // mappings are installed; the descriptors are correctly encoded - // per the host-tested `tyrne_hal::mmu::vmsav8` encoders. The wrap - // does NOT zero-fill the live root (which would unmap the running - // kernel) — that is why it cannot route through the zero-fill - // `create_address_space`. Audit: UNSAFE-2026-0028. + // `l0_root` to be a valid, **populated** VMSAv8 L0 translation table + // (see its `# Safety` doc). `mmu_bootstrap` populated this exact frame + // as the low-identity root and installed it in `TTBR0_EL1`; **post-T-022 + // `kernel_main_high` has already freed `TTBR0_EL1` (null + `EPD0 = 1`) + // before this block runs**, so the frame is no longer the *live* TTBR0 — + // it is a populated-but-uninstalled table retained as arena slot 0 + // (kernel-init's AS authority + the cap-derivation parent for the + // loader). Its descriptors are correctly encoded per the host-tested + // `tyrne_hal::mmu::vmsav8` encoders. The wrap does NOT zero-fill (which + // would corrupt the populated descriptor topology a future `activate` / + // `map` walk relies on) — that is why it cannot route through the + // zero-fill `create_address_space`. 
Audit: UNSAFE-2026-0028 (+ its + // 2026-05-30 T-022 Amendment refining "live" → "populated"). let bootstrap_as_handle = unsafe { let arena = (*AS_ARENA.0.get()).assume_init_mut(); let inner = mmu::QemuVirtAddressSpace::from_existing_root(l0_root); diff --git a/bsp-qemu-virt/src/mmu.rs b/bsp-qemu-virt/src/mmu.rs index 0eface1..b0d3be5 100644 --- a/bsp-qemu-virt/src/mmu.rs +++ b/bsp-qemu-virt/src/mmu.rs @@ -99,11 +99,15 @@ impl QemuVirtAddressSpace { /// /// # Safety /// - /// The caller must guarantee that `root` is a valid, **currently- - /// live** `VMSAv8` L0 translation table — i.e., a 4 KiB frame whose + /// The caller must guarantee that `root` is a valid, **populated** + /// `VMSAv8` L0 translation table — i.e., a 4 KiB frame whose /// 512 × 8-byte entries are correctly-encoded `VMSAv8` table / /// block / page descriptors, with at least the kernel-half - /// mappings populated. Subsequent operations on the resulting + /// mappings populated. (The root need **not** be the *currently-installed* + /// `TTBR0`/`TTBR1` value — the wrap only stores the `PhysFrame` for later + /// `map`/`unmap`/`activate` to walk; post-T-022 the bootstrap root is a + /// populated-but-uninstalled table, see the caller note below.) + /// Subsequent operations on the resulting /// `QemuVirtAddressSpace` (e.g., [`Mmu::map`] / [`Mmu::unmap`]) /// perform `volatile` reads + writes through this root's descriptor /// chain; passing an arbitrary `PhysFrame` would dereference @@ -117,11 +121,14 @@ impl QemuVirtAddressSpace { /// be *already populated and live*. Both are caller-side /// preconditions the type system cannot enforce. /// - /// v1's only caller is `bsp-qemu-virt/src/main.rs::kernel_entry`, - /// which derives `root` from the `__boot_pt_l0` linker symbol — - /// the L0 frame `mmu_bootstrap` populated and wrote into - /// `TTBR0_EL1`. The bootstrap path is the only well-known - /// already-live root in v1. 
+ /// v1's only caller is `bsp-qemu-virt/src/main.rs::kernel_main_high`, + /// which derives `root` from the `__boot_pt_l0` linker symbol — the L0 + /// frame `mmu_bootstrap` populated as the low-identity root. **Post-T-022 + /// (ADR-0033) the wrap runs *after* `kernel_main_high` frees `TTBR0_EL1` + /// (null + `EPD0 = 1`)**, so the root is a populated-but-uninstalled + /// table retained as arena slot 0 — not a live TTBR value. The wrap is + /// sound because soundness rests on the table being *valid + populated*, + /// not *installed*. /// /// Audit: UNSAFE-2026-0028. /// @@ -194,27 +201,41 @@ impl Mmu for QemuVirtMmu { // PhysFrame populated with a valid VMSAv8 layout before this // call) and by the caller. // - // Sequence: `MSR TTBR0_EL1` + `ISB` (translation regime now - // staged but stale TLB entries may exist) + `DSB ISHST` + // Sequence: `MSR TTBR0_EL1` (install the per-task root) + + // **clear `TCR_EL1.EPD0`** (re-enable `TTBR0_EL1` translation-table + // walks) + `ISB` (the TTBR0/TCR writes now staged) + `DSB ISHST` // (ensure any prior page-table descriptor stores are globally // observable inner-shareable before the TLBI broadcast — see // 2026-05-09 review-round Finding 4 / ADR-0027 §"Why DSB ISH" - // forward-compat) + `TLBI VMALLE1` + `DSB ISH` (drain - // invalidate completion) + `ISB` (drain pipeline so the next - // instruction-fetch goes through the freshly-installed - // regime). `options(nostack)` only — `nomem` omitted so the - // compiler treats this asm as a memory clobber and cannot - // reorder prior page-table writes past it. - // Audit: UNSAFE-2026-0023. + // forward-compat) + `TLBI VMALLE1` + `DSB ISH` (drain invalidate + // completion) + `ISB` (drain pipeline so the next instruction-fetch + // goes through the freshly-installed regime). `options(nostack)` + // only — `nomem` omitted so the compiler treats this asm as a memory + // clobber and cannot reorder prior page-table writes past it. 
+ // + // **`EPD0` clear (T-022 / ADR-0033).** The high-half migration freed + // `TTBR0_EL1` and set `TCR_EL1.EPD0 = 1` (low-half walks disabled) so + // the kernel is structurally absent from the low half. A per-task + // swap re-enables low-half walks so the new AS's userspace mappings + // translate; without this clear the first real EL0 task's lower-half + // fetch/data access would translation-fault. Clearing an already- + // clear `EPD0` (a second swap) is idempotent. The kernel stays in + // `TTBR1_EL1` (high) throughout — only the low half is (re)enabled. + // Audit: UNSAFE-2026-0023 (+ its T-022 Amendment). unsafe { asm!( - "msr ttbr0_el1, {0}", + "msr ttbr0_el1, {ttbr0}", + "mrs {tmp}, tcr_el1", + "and {tmp}, {tmp}, {epd0_clear}", + "msr tcr_el1, {tmp}", "isb", "dsb ishst", "tlbi vmalle1", "dsb ish", "isb", - in(reg) ttbr0, + ttbr0 = in(reg) ttbr0, + epd0_clear = in(reg) !(1u64 << 7), + tmp = out(reg) _, options(nostack), ); } diff --git a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md index 9fcc1f8..8e5f4f0 100644 --- a/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md +++ b/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md @@ -34,7 +34,7 @@ The migration switches the running kernel's own PC/SP/`VBAR` translation regime - [x] **The boot-time migration** runs ADR-0033 §Simulation rows 0–3: `high_half_activate` (`DSB ISH` → `MSR TTBR1` → `ISB` → `MSR TCR(EPD1=0)` → `ISB`) → trampoline (`MSR VBAR`-high + `ISB` → `add sp,sp,off` → `br`, `options(noreturn)`) → `kernel_main_high` (`MSR TTBR0,xzr` + `ISB` + `EPD0`-set + `ISB` + `TLBI VMALLE1` + `DSB ISH` + `ISB`). The new `tyrne: high-half active` marker prints after the `br`. - [x] **Per-task `TTBR0_EL1` swap correct + freed.** `kernel_main_high` frees `TTBR0` (null + `EPD0 = 1`); `::activate` writes a per-task root PA into the now-free `TTBR0`. 
v1 keeps `ASID = 0` global + `activate`'s existing `TLBI`-on-swap (no per-task ASID allocator — deferred per ADR-0033 §"ASID policy"). The scheduler differ-path host test `yield_now_activates_when_tasks_differ_in_address_space` pins distinct-root firing; the *runtime* distinct-AS swap is B6 (v1's demo keeps every task on the bootstrap AS). - [x] **Audit:** new **UNSAFE-2026-0031** (migration trampoline + `TTBR0`-free) + Amendments to UNSAFE-2026-0022 / 0023 / 0024 (high-half table writes / MSR sequence / post-migration `TLBI`) + 0025 / 0026 / 0027 / 0030 (physmap-rebase derefs). -- [x] **All gates green:** 340 host tests; host + kernel clippy `-D warnings`; `cargo fmt --check`; release build (entry `0x40080000`); `cargo +nightly miri test --workspace --exclude tyrne-bsp-qemu-virt` (Stacked Borrows). **QEMU smoke:** full demo to `tyrne: all tasks complete` with the new `tyrne: high-half active` line; `-d int,unimp` shows exactly the 2 expected `SVC` exceptions and **zero** Translation/Permission/Abort fault classes (fault-clean — row-4 abort gate). *(The pre-existing release-only `console_write` status `0x1` quirk reproduces on the parent commit `bd39679` and is **not** a T-022 regression — flagged for a separate B5 follow-up.)* +- [x] **All gates green:** 340 host tests; host + kernel clippy `-D warnings`; `cargo fmt --check`; release build (entry `0x40080000`); `cargo +nightly miri test --workspace --exclude tyrne-bsp-qemu-virt` (Stacked Borrows). **QEMU smoke:** full demo to `tyrne: all tasks complete` with the new `tyrne: high-half active` line; `-d int,unimp` shows exactly the 2 expected `SVC` exceptions and **zero** Translation/Permission/Abort fault classes (fault-clean — row-4 abort gate). 
*(The release-build `console_write` status `0x1` is **intended behaviour, not a bug**: `console_write` is debug-gated per [ADR-0031](../../../decisions/0031-initial-syscall-set.md) — `SyscallNumber::decode` recognises number `5` only under `cfg!(debug_assertions)` ([abi.rs](../../../../kernel/src/syscall/abi.rs)), so a release build correctly returns `BadSyscallNumber` and the debug console is absent from the production syscall surface. T-022 preserves this exactly; no follow-up needed.)* ## Out of scope @@ -64,4 +64,4 @@ All acceptance criteria checked; gates green (incl. Miri); audit-log entries + A - **Row 2** (the crossing) → **UNSAFE-2026-0031** (trampoline asm) + the `tyrne: high-half active` marker. **Verified:** marker prints; `-d int,unimp` shows zero faults at the crossing. - **Row 3** (`TTBR0`-null/`EPD0`/`TLBI`) → UNSAFE-2026-0023/0024 Amendments + UNSAFE-2026-0031 + the scheduler differ-path host test. **Verified:** demo runs to completion with `TTBR0` freed. - **Row 4** (abort gate) → the QEMU smoke + `-d int,unimp` showing exactly **2 [SVC]** and **zero** Translation/Permission/Abort fault classes. **Verified.** -- **2026-05-30 — third adversarial verification pass** (multi-lens workflow: crossing/barriers, link/relocation, tables/offsets, physmap/cfg/window). Verdict: barrier sequences, the trampoline (`SP`-rebase under `options(noreturn)`, PXN=0 target, both-regimes-live), relocation discipline, index/offset arithmetic, and the host/target cfg split all **sound — no brick**. 
Four findings, all addressed before commit: (1) **MEDIUM** physmap shares the PXN=0 image window (W^X gap) — *documented* as the accepted v1 RWX-equivalent simplification (per §Out of scope), PXN-split deferred to ADR-0034, `AP=0b00` keeps it EL0-unreachable; (2) **MEDIUM** the `load_image` image/frame overlap preflight was defanged by the PA/VA conflation (`image.as_ptr()` is now a high VA) — **fixed**: the site rebases via `kernel_va_to_phys` before the PMM PA-extent comparison, re-arming the defensive guard (host-identity on the test harness); (3) **LOW** UNSAFE-2026-0031 missing from the audit log — **fixed** (the agents read the tree before the entry was written); (4) **NIT** `boot.md` not updated — **fixed** in the doc sweep. Separately, the pre-existing release-only `console_write` status `0x1` quirk was confirmed to reproduce on the parent commit `bd39679` (same-host control) — **not a T-022 regression**, flagged for a B5 follow-up. +- **2026-05-30 — third adversarial verification pass** (multi-lens workflow: crossing/barriers, link/relocation, tables/offsets, physmap/cfg/window). Verdict: barrier sequences, the trampoline (`SP`-rebase under `options(noreturn)`, PXN=0 target, both-regimes-live), relocation discipline, index/offset arithmetic, and the host/target cfg split all **sound — no brick**. 
Four findings, all addressed before commit: (1) **MEDIUM** physmap shares the PXN=0 image window (W^X gap) — *documented* as the accepted v1 RWX-equivalent simplification (per §Out of scope), PXN-split deferred to ADR-0034, `AP=0b00` keeps it EL0-unreachable; (2) **MEDIUM** the `load_image` image/frame overlap preflight was defanged by the PA/VA conflation (`image.as_ptr()` is now a high VA) — **fixed**: the site rebases via `kernel_va_to_phys` before the PMM PA-extent comparison, re-arming the defensive guard (host-identity on the test harness); (3) **LOW** UNSAFE-2026-0031 missing from the audit log — **fixed** (the agents read the tree before the entry was written); (4) **NIT** `boot.md` not updated — **fixed** in the doc sweep. Separately, the release-build `console_write` status `0x1` was first mis-described (by me) as a "pre-existing quirk needing a B5 follow-up" on a wrong `in("x8")` hypothesis. The review's root-cause is correct and **it is not a bug at all**: `console_write` is **debug-gated by design** — `SyscallNumber::decode` matches number `5` only under `cfg!(debug_assertions)` ([abi.rs](../../../../kernel/src/syscall/abi.rs), per [ADR-0031](../../../decisions/0031-initial-syscall-set.md)), so a release build correctly returns `BadSyscallNumber` (the debug console is absent from the production syscall surface). T-022 left the `SVC` stub byte-identical (the diff only relocates it into `kernel_main_high`); no follow-up needed. diff --git a/docs/architecture/boot.md b/docs/architecture/boot.md index 72be803..87c4b29 100644 --- a/docs/architecture/boot.md +++ b/docs/architecture/boot.md @@ -27,6 +27,8 @@ Since [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md) the k v1 maps the whole high-half RAM window `PXN = 0` (RWX-equivalent, like the identity map it replaces; `AP = 0b00` keeps EL0 with no access); the ADR-0033 layout's distinct `PXN = 1` physmap region is per-section W^X hardening deferred to ADR-0034. 
The migration is **fault-clean** (`-d int,unimp`: exactly the 2 syscall-smoke `SVC` exceptions, zero new Translation/Permission faults). Audit: [UNSAFE-2026-0031](../audits/unsafe-log.md) + Amendments to 0022/0023/0024. +> **Forward limit (Pi 4 / large images).** `KERNEL_HIGH_HALF_OFFSET = 0xFFFF_FFFF_0000_0000` bounds the direct map to the **low 4 GiB** of PA, and the migration mask (`OFFSET | (addr & 0xFFFF_FFFF)`) assumes the kernel image PA is below 4 GiB. A BSP whose RAM or peripherals reach physical addresses at or above 4 GiB (e.g. the Raspberry Pi 4, Phase D) needs a different offset **and** a revisited mask before carrying this pattern over. + ### Boot-time sequence ```mermaid @@ -52,24 +54,30 @@ sequenceDiagram Note over Asm: Phase 3 — conventional setup
SP ← __stack_top
CPACR_EL1.FPEN ← 0b11; isb
BSS zeroed (__bss_start..__bss_end) Asm->>KE: bl kernel_entry (EL = 1, guaranteed) Note over KE: T-009 / UNSAFE-2026-0016 asserts CurrentEL == 1
as a load-bearing post-condition of Phase 2 - KE->>KE: construct QemuVirtCpu (incl. CurrentEL self-check) - KE->>KE: construct Pl011Uart at 0x0900_0000 + Note over KE: ── kernel_entry (LOW physical alias; MMU off at entry) ── + KE->>KE: early Pl011Uart at LOW 0x0900_0000 (identity) KE->>U: write_bytes(b"tyrne: hello from kernel_main\n") - KE->>KE: install VBAR_EL1 (T-012) - KE->>KE: boot_ns = cpu.now_ns() snapshot - KE->>KE: mmu_bootstrap() — activates MMU
(T-016 / ADR-0027) + KE->>KE: install VBAR_EL1 (low vectors; T-012) + KE->>KE: mmu_bootstrap() — low-identity MMU on
(T-016 / ADR-0027) KE->>U: write_bytes(b"tyrne: mmu activated\n") + KE->>KE: high_half_activate() — build TTBR1 tables, EPD1 1→0
(T-022 / ADR-0033; both regimes now live) + KE->>KE: migration trampoline — MSR VBAR-high; ISB;
add sp,sp,OFFSET; br kernel_main_high (PC crosses low→high) + Note over KE: ── kernel_main_high (HIGH half, TTBR1_EL1) ── + KE->>KE: free TTBR0_EL1 (xzr + EPD0=1 + TLBI VMALLE1) + KE->>KE: Pl011Uart + QemuVirtCpu at HIGH device-MMIO alias + KE->>U: write_bytes(b"tyrne: high-half active\n") + KE->>KE: boot_ns = cpu.now_ns() snapshot (post-migration) KE->>KE: Pmm::new — Physical Memory Manager init
(T-017 / ADR-0035) KE->>U: write_bytes(b"tyrne: pmm initialized (...)\n") - KE->>KE: AddressSpace arena init — wrap bootstrap L0
(T-018 / ADR-0028; no Mmu::create_address_space call
per Simulation row 0 — would re-zero the live root) + KE->>KE: AddressSpace arena init — wrap bootstrap L0
(T-018 / ADR-0028; populated-but-uninstalled root post-T-022) KE->>U: write_bytes(b"tyrne: address-space-arena ready (...)\n") KE->>KE: task_loader::load_image — embedded raw-flat blob
into a fresh AS (T-019 / ADR-0029; NOT executed) KE->>U: write_bytes(b"tyrne: image loaded (...)\n") - KE->>KE: GIC init + DAIF.I unmask (T-012) + KE->>KE: GIC init + DAIF.I unmask (T-012; high device-MMIO) KE->>U: write_bytes(b"tyrne: timer ready (...)") KE->>KE: kernel-object setup, IPC, scheduler KE->>KE: start() — never returns - Note over KE: steady state — cooperative IPC demo + Note over KE: steady state — cooperative IPC demo (high half) ``` ### Memory map at boot diff --git a/docs/architecture/memory-management.md b/docs/architecture/memory-management.md index 6918911..1166eac 100644 --- a/docs/architecture/memory-management.md +++ b/docs/architecture/memory-management.md @@ -35,7 +35,7 @@ flowchart TB end ``` -The four bootstrap page-table frames live in a dedicated `.boot_pt` section in [`bsp-qemu-virt/linker.ld`](../../bsp-qemu-virt/linker.ld) (added by T-016). Each is `PAGE_SIZE`-aligned and pre-zeroed by the existing BSS-zero loop in [`boot.s`](../../bsp-qemu-virt/src/boot.s) because `.boot_pt` is bracketed by `__bss_start` / `__bss_end`. The total budget is **16 KiB of static reservation** (4 frames × 4 KiB). No kernel allocator dependency at the bootstrap moment; all of `.boot_pt` is filled in before `SCTLR_EL1.M = 1`. +The four low-identity bootstrap page-table frames live in a dedicated `.boot_pt` section in [`bsp-qemu-virt/linker.ld`](../../bsp-qemu-virt/linker.ld) (added by T-016). Each is `PAGE_SIZE`-aligned and pre-zeroed by the existing BSS-zero loop in [`boot.s`](../../bsp-qemu-virt/src/boot.s) because `.boot_pt` is bracketed by `__bss_start` / `__bss_end`. The low-identity budget is **16 KiB** (4 frames × 4 KiB); no kernel allocator dependency at the bootstrap moment; all of `.boot_pt` is filled in before `SCTLR_EL1.M = 1`. 
**(T-022 / ADR-0033 adds two more frames to `.boot_pt` — the high-half `TTBR1_EL1` roots `__boot_pt_l0_hh` / `__boot_pt_l1_hh`, built by `high_half_activate` — for six frames / 24 KiB total; the two low-identity L2 tables are shared into the high-half regime, so no L2 frames are duplicated.)** ### Identity ranges @@ -196,10 +196,10 @@ Plus `Pmm::extent()` / `Pmm::stats()` accessors and `impl FrameProvider for Pmm< **Smoke trace.** Boot output gains exactly one new line immediately after `tyrne: mmu activated`: ```text -tyrne: pmm initialized (32604 frames available; 164 reserved) +tyrne: pmm initialized (32596 frames available; 172 reserved) ``` -The 32 604 + 164 = 32 768 frames sanity-check is built into the test fixture (`stats_parity_with_bitmap_bit_count`); the 164 reserved frames decompose as 128 (firmware region, 512 KiB) + 36 (kernel image + `.bss` + `.boot_pt` 16 KiB + 64 KiB stack + alignment slack). +The `available + reserved = 32 768` sanity-check is built into the test fixture (`stats_parity_with_bitmap_bit_count`). The exact reserved-frame count is **build-dependent** (it tracks the kernel-image + `.bss` + stack size, which differs debug vs release): the post-T-022 debug build reserves 172 (32 596 available); release reserves 168. The reserved set decomposes as 128 (firmware region, 512 KiB) + the kernel-image / `.bss` / `.boot_pt` / 64 KiB-stack range — which grew by the two high-half `TTBR1` root frames T-022 added to `.boot_pt` (`__boot_pt_l0_hh` / `__boot_pt_l1_hh`, +8 KiB). **Audit-log surface.** [UNSAFE-2026-0026](../audits/unsafe-log.md) covers the single `core::ptr::write_bytes` site in `Pmm::alloc_frame`. 
The entry's safety argument names five invariants: page-alignment of the target (propagates from `Pmm::new`'s validation (i)), exclusive ownership at write time (the just-set bitmap bit), identity mapping post-MMU (per ADR-0027 §Decision outcome (a)), bitmap-math overflow-freedom (all `saturating_*` / `wrapping_div`), and `write_bytes` ordering (single-core; no peer reader). A new entry rather than an Amendment of UNSAFE-2026-0001 per [ADR-0035 §Dependency chain step 5][adr-0035-dep5]'s adjudication-deferred caveat — PL011 MMIO base blessing and PMM RAM zero-fill share surface shape but differ on what they touch and what proves ownership. diff --git a/docs/audits/unsafe-log.md b/docs/audits/unsafe-log.md index 42abfe0..90be5bc 100644 --- a/docs/audits/unsafe-log.md +++ b/docs/audits/unsafe-log.md @@ -638,6 +638,8 @@ Neither change touches the `copy_nonoverlapping` site itself; both correct contr - **Reviewed by:** @cemililik (+ Claude Opus 4.7 agent). Security-sensitive (boot + MMU root install) → second-reviewer required per [unsafe-policy §Review.4](../standards/unsafe-policy.md). - **Status:** Active. The contract is sound and the sole caller honours it (C7-P5 / X3-001 confirm `mmu_bootstrap` populates the exact frame and that `kernel_entry` runs post-bootstrap). Smoke-verified at runtime: the 2026-05-14 QEMU trace wraps the bootstrap root on boot and runs the demo to completion (`tyrne: all tasks complete`) with no Translation/Permission faults; this is an audit-trail-completeness fix, not a behaviour change. + **Amendment (2026-05-30, T-022 / ADR-0033): contract refined "currently-live in `TTBR0`" → "valid + populated".** Before T-022 the wrap ran while the bootstrap L0 was the *live* `TTBR0_EL1` value, so the original §Operation / §Safety "currently-live" wording was accurate. 
Post-T-022 the wrap runs in `kernel_main_high` **after** the high-half migration frees `TTBR0_EL1` (null + `EPD0 = 1`), so the bootstrap L0 is a **populated-but-uninstalled** table (retained as arena slot 0 — kernel-init's AS authority + the loader's cap-derivation parent), no longer a live TTBR value. The wrap's soundness was never about *liveness*: it stores the `PhysFrame` for later `map`/`unmap`/`activate` to walk, and that requires the table be *valid + populated*, not *installed*. The code §Safety doc-comments (`from_existing_root` in `mmu.rs`, the wrap site in `main.rs`) were updated to state "valid + populated" + the post-migration reality; this entry's original body stays on record per [unsafe-policy §3](../standards/unsafe-policy.md). No behaviour change; the wrapped frame is identical. + ### UNSAFE-2026-0029 — `SVC` sync trap trampoline + `syscall_entry` register-frame access - **Introduced:** 2026-05-29, [T-021 — EL0→EL1 SVC dispatch](../analysis/tasks/phase-b/T-021-syscall-dispatch.md). New entry: the syscall trap path is a distinct mechanism from the IRQ trap path (UNSAFE-2026-0020) — a different vector slot pair, a larger full-register-file frame, a synchronous `SVC` cause rather than an asynchronous IRQ, and a frame the Rust handler *writes back* (the syscall result) rather than only reads. Per the [`justify-unsafe`](../../.agents/skills/justify-unsafe/SKILL.md) audit-tag scoping discipline, the honest record is a fresh entry, not an Amendment of 0020. @@ -702,4 +704,4 @@ Neither change touches the `copy_nonoverlapping` site itself; both correct contr - **Rebase `SP` inside `kernel_main_high` instead of the trampoline.** Rejected: the compiler emits `kernel_main_high`'s prologue (which may touch the stack) before any inline asm runs; changing `SP` mid-function would mismatch that frame. Rebasing in the trampoline (under `options(noreturn)`, no epilogue) means `kernel_main_high` starts with a consistent high `SP`. 
- **Pre-flip `TLBI` of the high range before clearing `EPD1`.** Unnecessary: with `EPD1 = 1` a `TTBR1` walk faults and the architecture caches no result, so there is nothing stale to drop (the §Simulation review corrected an earlier "pre-flip TLBI" rationale). - **Reviewed by:** @cemililik (+ Claude Opus 4.8 agent). Security-sensitive (changes the kernel's own translation regime + the kernel/user isolation boundary) → second-reviewer required per [unsafe-policy §Review.4](../standards/unsafe-policy.md); the §Simulation was hardened against two adversarial verification passes during ADR-0033 drafting + a third multi-lens adversarial pass at T-022 implementation. -- **Status:** Active. **Smoke-verified 2026-05-30:** the debug QEMU trace prints the new `tyrne: high-half active` marker (the runtime proof the `br` reached the PXN=0 high image window) and continues through `tyrne: all tasks complete`; `-d int,unimp` shows exactly the 2 expected `SVC` exceptions and **zero** Translation / Permission / Abort fault classes (the migration is fault-clean — ADR-0033 §Simulation row 4 abort gate). The release build boots identically (the release `console_write=0x1` quirk is pre-existing on the parent commit `bd39679`, not introduced here). The per-task `TTBR0_EL1` swap for a *distinct* address space is exercised at runtime in B6 (v1's demo keeps every task on the bootstrap AS); the swap mechanism (`QemuVirtMmu::activate` writing the per-task root PA into the now-freed `TTBR0`) is host-tested via the scheduler differ-path (`yield_now_activates_when_tasks_differ_in_address_space`). +- **Status:** Active.
**Smoke-verified 2026-05-30:** the debug QEMU trace prints the new `tyrne: high-half active` marker (the runtime proof the `br` reached the PXN=0 high image window) and continues through `tyrne: all tasks complete`; `-d int,unimp` shows exactly the 2 expected `SVC` exceptions and **zero** Translation / Permission / Abort fault classes (the migration is fault-clean — ADR-0033 §Simulation row 4 abort gate). The release build boots identically (the release `console_write` status `0x1` is the **intended debug-gate** — number `5` decodes only under `cfg!(debug_assertions)` per ADR-0031, not a T-022 effect). The per-task `TTBR0_EL1` swap for a *distinct* address space is exercised at runtime in B6 (v1's demo keeps every task on the bootstrap AS); the swap mechanism (`QemuVirtMmu::activate` writing the per-task root PA into the now-freed `TTBR0`) is host-tested via the scheduler differ-path (`yield_now_activates_when_tasks_differ_in_address_space`). diff --git a/docs/roadmap/phases/phase-b.md b/docs/roadmap/phases/phase-b.md index 14994db..cf38412 100644 --- a/docs/roadmap/phases/phase-b.md +++ b/docs/roadmap/phases/phase-b.md @@ -278,7 +278,7 @@ A real userspace task, loaded by B4, running in EL0 in its own address space, ma The [T-021](../../analysis/tasks/phase-b/T-021-syscall-dispatch.md) review-round (2026-05-29) confirmed **no live B5 defect** but identified three forward-gates that the B5 EL1-kernel-stub proxy did not need and that B6 **must** close when it wires the first real EL0 task. They are intentionally B6 work; tracked here so they are not missed: -1. 🚩 **`console_write` user-window + deref (the single most important gate).** In B5 the window is the whole identity-mapped RAM extent and the copy is a direct int-to-pointer deref ([`bsp-qemu-virt/src/syscall.rs`](../../../bsp-qemu-virt/src/syscall.rs) `SYSCALL_USER_WINDOW_LEN`) — harmless because only the *trusted* EL1 stub calls it, on the identity map.
If B6 wires `syscall_entry` to a real EL0 task **unchanged**, an EL0 holder of a debug-console capability could read arbitrary kernel memory via `console_write(ptr)`. B6 must (a) derive a **per-task** window from the EL0 task's actually-mapped region (not the RAM extent) and (b) replace the int-to-pointer deref with a per-page user-VA → kernel-VA translation (the forward path documented in [`user_access.rs`](../../../kernel/src/syscall/user_access.rs) module docs + [`crate::mm::phys_frame_kernel_ptr`](../../../kernel/src/mm/mod.rs)). **The window/translation failure must return `SyscallError::FaultAddress`, never panic** (the panic-free contract holds across the migration). +1. 🚩 **`console_write` user-window + deref (the single most important gate).** The copy is a direct int-to-pointer deref bounded by a `UserAccessWindow` ([`bsp-qemu-virt/src/syscall.rs`](../../../bsp-qemu-virt/src/syscall.rs) `SYSCALL_USER_WINDOW_LEN`); post-T-022 that window is the whole **high-half RAM direct-map alias** (`phys_to_kernel_va(PMM_EXTENT_START)`). It is harmless today because the *only* caller is the trusted EL1 stub (its buffer is a legitimate kernel high VA, and `EPD0 = 1` means no real EL0 task can even reach `syscall_entry`). If B6 wires `syscall_entry` to a real EL0 task **unchanged**, an EL0 holder of a debug-console capability could pass an in-window **high (kernel) VA** and have EL1 copy privileged memory to the console. B6 must (a) derive a **per-task** window from the EL0 task's actually-mapped region (not the RAM extent) and (b) replace the int-to-pointer deref with a per-page user-VA → kernel-VA translation **walking the task's own `TTBR0`** (the forward path documented in [`user_access.rs`](../../../kernel/src/syscall/user_access.rs) module docs + [`crate::mm::phys_frame_kernel_ptr`](../../../kernel/src/mm/mod.rs)). 
**The window/translation failure must return `SyscallError::FaultAddress`, never panic.** **Hard ordering precondition (sharpened by the T-022 review):** this gate MUST be closed *before* `syscall_entry` becomes reachable by any real EL0 task — the bounds-check + direct deref is **necessary-but-insufficient** as a privilege boundary (it bounds the range; it does not prove the pointer names the *task's* memory rather than a kernel high-half VA that merely falls in-window). Fails closed today only because `EPD0 = 1` keeps EL0 out of `syscall_entry` until the EL0-entry task lands. 2. 🚩 **`SP_EL1` initialisation for the `+0x400` entry.** The sync trampoline's first `sub sp, sp, #272` runs on `SP_EL1`, which the CPU does **not** auto-initialise on an EL0→EL1 trap. B6's per-task EL0 context-init must set `SP_EL1` to a valid kernel stack before any EL0 task is schedulable (and should assert it). Subsumed by the "EL0-ready context register file" work (ADR-0033 placeholder) but named here explicitly. 3. 🚩 **`SYSCALL_STUB_TABLE` → scheduler current-task table.** `syscall_entry` resolves capabilities in the dedicated kernel-stub table in B5; B6 must swap it for the *running EL0 task's* capability table (looked up from the scheduler's current task). Fail-closed if forgotten (handles resolve to `InvalidHandle`, never over-grant), but functionally required for a real task to name its own caps. diff --git a/hal/src/mmu/mod.rs b/hal/src/mmu/mod.rs index e0d960b..109070c 100644 --- a/hal/src/mmu/mod.rs +++ b/hal/src/mmu/mod.rs @@ -47,6 +47,13 @@ pub const PAGE_SIZE: usize = 4096; /// migration ([`bsp-qemu-virt/linker.ld`] + `kernel_entry`) uses the same /// value; the BSP carries a compile-time `assert!` pinning the two in sync. 
/// +/// **Forward limit (Pi 4 / large images).** This offset bounds the direct map +/// to the **low 4 GiB** of PA — `phys_to_kernel_va(pa)` for `pa ≥ 4 GiB` wraps +/// — and the `kernel_entry` migration mask (`OFFSET | (addr & 0xFFFF_FFFF)`) +/// assumes the kernel image PA is below 4 GiB. A future BSP with > 4 GiB RAM +/// or peripherals above 4 GiB (e.g. the Raspberry Pi 4) needs a different +/// offset **and** a revisited mask before this pattern is carried over. +/// /// **Host builds (`cfg(not(target_arch = "aarch64"))`) define the offset as /// `0`** — there is no MMU or high-half on the test harness, so /// [`phys_to_kernel_va`] / [`kernel_va_to_phys`] are the identity there and @@ -54,14 +61,22 @@ pub const PAGE_SIZE: usize = 4096; /// deref their real host-backed "frames" unchanged. Only the aarch64 kernel /// build carries the real high-half offset. /// +/// The discriminator is `all(target_arch = "aarch64", target_os = "none")` — +/// the **kernel build**, matching the established idiom at +/// [`crate::cpu`]'s `cntfrq` reader — **not** bare `target_arch = "aarch64"`, +/// which would also be true on an aarch64 *host* (e.g. Apple Silicon) and +/// would make `cargo test` / Miri deref `phys_to_kernel_va(real_host_ptr)` +/// into wild memory in the PMM / `phys_frame_kernel_ptr` host tests. +/// /// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md /// [adr-0033-dep]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md#dependency-chain -#[cfg(target_arch = "aarch64")] +#[cfg(all(target_arch = "aarch64", target_os = "none"))] pub const KERNEL_HIGH_HALF_OFFSET: usize = 0xFFFF_FFFF_0000_0000; -/// Host-build identity offset — see the aarch64 [`KERNEL_HIGH_HALF_OFFSET`] -/// for the rationale (no MMU/high-half on the test harness). 
-#[cfg(not(target_arch = "aarch64"))] +/// Host-build identity offset — see the kernel-build [`KERNEL_HIGH_HALF_OFFSET`] +/// for the rationale (no MMU/high-half on the test harness; identity keeps the +/// host tests' real-backed frames reachable, including on an aarch64 host). +#[cfg(not(all(target_arch = "aarch64", target_os = "none")))] pub const KERNEL_HIGH_HALF_OFFSET: usize = 0; /// Translate a physical address to its kernel high-half direct-map virtual diff --git a/tools/smoke.sh b/tools/smoke.sh index b86fe41..97b2a48 100755 --- a/tools/smoke.sh +++ b/tools/smoke.sh @@ -10,7 +10,7 @@ # Usage: # tools/smoke.sh — debug build, 20s budget # tools/smoke.sh --release — release build -# tools/smoke.sh --int — add -d int,unimp,guest_errors +# tools/smoke.sh --int — add -d int,unimp (fault-class check) # tools/smoke.sh --timeout 30 — override the wall-clock budget (s) # tools/smoke.sh — explicit ELF # @@ -25,7 +25,7 @@ KERNEL="" while [[ $# -gt 0 ]]; do case "$1" in --release) PROFILE="release"; shift ;; - --int) INT_FLAGS=(-d int,unimp,guest_errors); shift ;; + --int) INT_FLAGS=(-d int,unimp); shift ;; # no guest_errors: PL011 noise interleaves the trace --timeout) TO="$2"; shift 2 ;; -h|--help) sed -n '2,/^set -/p' "$0" | sed 's/^# \{0,1\}//;/^set -/d' >&2; exit 0 ;; --*) echo "error: unknown flag: $1" >&2; exit 2 ;; @@ -61,3 +61,22 @@ echo "===== markers =====" >&2 grep -nE "tyrne:|panic|all tasks complete|high-half" "$LOG" || echo "(no tyrne markers found)" echo "===== fault classes (int log, if --int) =====" >&2 grep -nE "Taking exception|Translation fault|Permission fault|Data Abort|Prefetch Abort" "$LOG" | head -40 || true + +# ── Gate (usable as a CI / regression check) ────────────────────────────────── +# The kernel idles in WFI after completion and is SIGTERM'd, so a non-zero QEMU +# exit is expected and ignored above (|| true). 
Pass/fail is decided by the +# trace contents, not QEMU's exit code: the completion marker must appear, and +# there must be no panic or CPU fault. `--int` uses `-d int,unimp` (no +# `guest_errors`), so the pre-existing PL011 "data written to disabled UART" +# noise does not interleave with the serial markers or the fault grep. +rc=0 +if ! grep -q "all tasks complete" "$LOG"; then + echo "FAIL: 'tyrne: all tasks complete' marker missing (boot did not finish)" >&2 + rc=1 +fi +if grep -qE "tyrne panic|Translation fault|Permission fault|Data Abort|Prefetch Abort|Unallocated Instruction" "$LOG"; then + echo "FAIL: a panic / CPU-fault class appeared in the trace" >&2 + rc=1 +fi +[[ $rc -eq 0 ]] && echo "PASS: boot reached 'all tasks complete' with no panic/fault" >&2 +exit $rc From dfa601fb36ac2d916ff83a3d7d158432145b1a41 Mon Sep 17 00:00:00 2001 From: Cemil ILIK Date: Sat, 30 May 2026 12:13:03 +0300 Subject: [PATCH 6/7] =?UTF-8?q?fix(mmu):=20T-022=20PR=20review-round=202?= =?UTF-8?q?=20=E2=80=94=20named=20TCR/mask=20consts,=20range=20asserts,=20?= =?UTF-8?q?panic-handler=20regime=20fallback?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the PR #36 code review. The verified migration mechanism is unchanged; this adds named-constant/robustness cleanups and fixes a real early-boot panic-handler regression + a hosted-AArch64 build assertion. - Named TCR field bits in vmsav8 (TCR_EL1_EPD0_BIT / TCR_EL1_EPD1_BIT), reused in TCR_EL1_VALUE_HIGH_HALF, QemuVirtMmu::activate (EPD0 clear) and kernel_main_high (EPD0 set) — localizes the TCR layout to one place. - Migration trampoline: named KERNEL_IMAGE_PA_MASK (0xFFFF_FFFF) + a high_half_alias() helper with a debug_assert! that the symbol address is a low PA or its exact high-half alias (fails fast if a future image escapes the low-4 GiB window the mask assumes), replacing the inline masks. - phys_to_kernel_va / kernel_va_to_phys: debug_assert! 
the PA/VA is inside the low-4 GiB direct-map window (kernel build only — short-circuits when the offset is 0 on host; uses wrapping_sub to avoid absurd_extreme_comparisons). - Compile-time KERNEL_HIGH_HALF_OFFSET assert gated on all(target_arch="aarch64", target_os="none") — it spuriously failed under rust-analyzer / a hosted aarch64 cargo check (fallback offset = 0). - Panic handler: select the UART MMIO alias by regime (SP >= offset -> high, else physical base) so an early-boot (pre-migration) panic prints instead of silently translation-faulting on the high alias. Real debuggability fix. - tools/smoke.sh: prefer timeout(1), fall back to the Perl alarm wrapper, error if neither (portability); validate --timeout is a non-negative integer. - Docs: boot.md Stage 3 split into kernel_entry(low) / kernel_main_high(high) + Stage 4 handoff reference corrected. Skipped (with reason): the task_loader test-helper Miri strict-provenance warnings are pre-existing (not touched by T-022; Miri passes — warnings, not failures), out of scope for this PR; noted for a separate test-hygiene cleanup. Gates green: 340 host tests, host+kernel clippy -D warnings, fmt, kernel build, QEMU smoke (gated PASS), Miri (Stacked Borrows). 
Refs: T-022, ADR-0033, PR #36 Co-Authored-By: Claude Opus 4.8 (1M context) --- bsp-qemu-virt/src/main.rs | 77 ++++++++++++++++++++++++++++++++------- bsp-qemu-virt/src/mmu.rs | 3 +- docs/architecture/boot.md | 6 ++- hal/src/mmu/mod.rs | 21 ++++++++++- hal/src/mmu/vmsav8.rs | 13 ++++++- tools/smoke.sh | 45 +++++++++++++++++------ 6 files changed, 134 insertions(+), 31 deletions(-) diff --git a/bsp-qemu-virt/src/main.rs b/bsp-qemu-virt/src/main.rs index 70a66a2..51bacc6 100644 --- a/bsp-qemu-virt/src/main.rs +++ b/bsp-qemu-virt/src/main.rs @@ -31,6 +31,7 @@ use core::fmt::Write; use core::mem::MaybeUninit; use core::panic::PanicInfo; +use tyrne_hal::mmu::vmsav8::TCR_EL1_EPD0_BIT; use tyrne_hal::{Console, Cpu, FmtWriter, Timer}; use tyrne_hal::{PhysAddr, VirtAddr, KERNEL_HIGH_HALF_OFFSET, PAGE_SIZE}; use tyrne_kernel::cap::{CapHandle, CapObject, CapRights, Capability, CapabilityTable}; @@ -116,6 +117,12 @@ const PL011_UART_BASE: usize = 0x0900_0000; // A drift between the linker's hardcoded value and `tyrne_hal` would silently // corrupt every high-half VA↔PA computation (ADR-0033 / T-022); fail the build // instead. +// +// Gated on the kernel build: `KERNEL_HIGH_HALF_OFFSET` is `0` on any non +// `target_os = "none"` build (host/IDE analysis, incl. an aarch64 host), where +// this assert is irrelevant — without the guard it would fire a spurious +// failure under rust-analyzer / a hosted `cargo check` on Apple Silicon. +#[cfg(all(target_arch = "aarch64", target_os = "none"))] const _: () = assert!(KERNEL_HIGH_HALF_OFFSET == 0xFFFF_FFFF_0000_0000); // ─── StaticCell ─────────────────────────────────────────────────────────────── @@ -808,6 +815,35 @@ extern "C" { static __boot_pt_l0: [u64; 512]; } +/// Mask of the low 4 GiB of physical address space — the bound on the QEMU +/// virt kernel image PA. 
The migration `br`/`VBAR` targets are derived by +/// masking a symbol address to this and OR-ing [`KERNEL_HIGH_HALF_OFFSET`]; a +/// future BSP with an image PA ≥ 4 GiB (e.g. Pi 4) must revisit this + the +/// offset (see [`high_half_alias`] + the linker.ld / `KERNEL_HIGH_HALF_OFFSET` +/// forward-notes). +const KERNEL_IMAGE_PA_MASK: usize = 0xFFFF_FFFF; + +/// Compute the high-half image alias of a kernel-symbol address, for the +/// boot-time migration's `MSR VBAR_EL1` / `br` targets. +/// +/// Masking the low 32 bits recovers the symbol's **PA** whether the compiler +/// materialised it PC-relative (low, while `kernel_entry` runs at the low +/// physical alias) or absolute (high); OR-ing [`KERNEL_HIGH_HALF_OFFSET`] then +/// yields its high-half VA. Correct only while the image PA is below 4 GiB — +/// the `debug_assert!` fails fast (in debug builds) if `addr` is neither a low +/// PA nor its exact high-half alias, which would mean the image escaped the +/// low-4 GiB window the mask assumes. +#[inline] +fn high_half_alias(addr: usize) -> usize { + let pa = addr & KERNEL_IMAGE_PA_MASK; + debug_assert!( + addr == pa || addr == (KERNEL_HIGH_HALF_OFFSET | pa), + "migration: symbol address is neither a low PA nor its high-half alias \ + — the kernel image PA must be < 4 GiB (KERNEL_IMAGE_PA_MASK)", + ); + KERNEL_HIGH_HALF_OFFSET | pa +} + /// Low-half boot entry — the `_start` (`boot.s`) branch target. /// /// Runs at the LOW physical alias of the kernel image with the MMU off (the @@ -927,10 +963,8 @@ pub extern "C" fn kernel_entry() -> ! { // (PC-relative-resolved) address to its physical part and OR-ing the // high-half offset, so the computation is correct regardless of how the // compiler materialises the symbol addresses. 
- let high_vbar = - KERNEL_HIGH_HALF_OFFSET | ((core::ptr::addr_of!(tyrne_vectors) as usize) & 0xFFFF_FFFF); - let high_entry = - KERNEL_HIGH_HALF_OFFSET | ((kernel_main_high as *const () as usize) & 0xFFFF_FFFF); + let high_vbar = high_half_alias(core::ptr::addr_of!(tyrne_vectors) as usize); + let high_entry = high_half_alias(kernel_main_high as *const () as usize); // SAFETY: the absolute-jump migration trampoline (ADR-0033 §Simulation // row 2). `MSR VBAR_EL1` to the high vector base (mapped PXN=0 in TTBR1) + // `ISB` so high vectors are live before the branch; `add sp, sp, off` @@ -1002,7 +1036,7 @@ extern "C" fn kernel_main_high() -> ! { "tlbi vmalle1", "dsb ish", "isb", - epd0 = in(reg) (1u64 << 7), + epd0 = in(reg) TCR_EL1_EPD0_BIT, t = out(reg) _, options(nostack, nomem), ); @@ -1559,14 +1593,31 @@ extern "C" fn kernel_main_high() -> ! { #[panic_handler] fn panic(info: &PanicInfo) -> ! { - // SAFETY: constructing a fresh Pl011Uart in the panic path is - // best-effort diagnostic output. Writes may interleave if the original - // instance is still reachable — acceptable per the Console contract - // (ADR-0007). The HIGH device-MMIO alias is used because the kernel runs - // in the high half post-migration (ADR-0033); a panic in the brief - // pre-migration low window would not print, but that window is only the - // verified `mmu_bootstrap` / `high_half_activate` path. Audit: UNSAFE-2026-0002. - let console = unsafe { Pl011Uart::new(tyrne_hal::phys_to_kernel_va(PL011_UART_BASE)) }; + // Pick the UART MMIO alias for the regime we panicked in, so the panic + // prints in BOTH the early-boot (pre-migration, low identity) and the + // steady-state (post-migration, high half) windows — rather than silently + // translation-faulting if the high alias is used before it is live. 
The + // migration trampoline rebases `SP` into the high half, so + // `sp >= KERNEL_HIGH_HALF_OFFSET` iff the kernel is running high (where the + // low identity is gone and the high device alias is the only mapped UART); + // below it we are still on the low identity stack and the physical base is + // mapped. + let sp: usize; + // SAFETY: reading `SP` into a GPR is a side-effect-free register move; no + // memory/stack/flags touched. Audit: UNSAFE-2026-0002. + unsafe { + core::arch::asm!("mov {}, sp", out(reg) sp, options(nostack, nomem, preserves_flags)); + } + let uart_base = if sp >= KERNEL_HIGH_HALF_OFFSET { + tyrne_hal::phys_to_kernel_va(PL011_UART_BASE) + } else { + PL011_UART_BASE + }; + // SAFETY: constructing a fresh Pl011Uart in the panic path is best-effort + // diagnostic output. Writes may interleave if the original instance is + // still reachable — acceptable per the Console contract (ADR-0007). The + // base is the regime-correct alias selected above. Audit: UNSAFE-2026-0002. + let console = unsafe { Pl011Uart::new(uart_base) }; console.write_bytes(b"\n!! 
tyrne panic !!\n"); let mut w = FmtWriter(&console); diff --git a/bsp-qemu-virt/src/mmu.rs b/bsp-qemu-virt/src/mmu.rs index b0d3be5..b2b53be 100644 --- a/bsp-qemu-virt/src/mmu.rs +++ b/bsp-qemu-virt/src/mmu.rs @@ -40,6 +40,7 @@ use core::arch::asm; use tyrne_hal::mmu::vmsav8::{ flags_to_descriptor_bits, page_descriptor, table_descriptor, PAGE_OA_MASK_L3, TABLE_NLA_MASK, + TCR_EL1_EPD0_BIT, }; use tyrne_hal::{ phys_to_kernel_va, FrameProvider, MapperFlush, MappingFlags, Mmu, MmuError, PhysAddr, @@ -234,7 +235,7 @@ impl Mmu for QemuVirtMmu { "dsb ish", "isb", ttbr0 = in(reg) ttbr0, - epd0_clear = in(reg) !(1u64 << 7), + epd0_clear = in(reg) !TCR_EL1_EPD0_BIT, tmp = out(reg) _, options(nostack), ); diff --git a/docs/architecture/boot.md b/docs/architecture/boot.md index 87c4b29..0cf9776 100644 --- a/docs/architecture/boot.md +++ b/docs/architecture/boot.md @@ -14,8 +14,10 @@ The four boot stages, each with a tightly bounded responsibility: 1. **Firmware / loader.** QEMU's `-kernel` flag loads the ELF image at its linked-in load address (`0x40080000` per [ADR-0012](../decisions/0012-boot-flow-qemu-virt.md)), sets the PC to the ELF's entry point (`_start`), and enters at EL1 (default QEMU `virt`) or EL2 (`-machine virtualization=on`, or most real-hardware boot stacks delivering at EL2). The device-tree blob address is placed in `x0`; v1 ignores it. 2. **Assembly stub (`_start`).** Three phases: first, K3-12 (interrupts masked via `MSR DAIFSet, #0xf`) executes at the very head of the reset vector so a spurious interrupt cannot escape into an uninstalled vector table. Second, the EL drop (per [ADR-0024](../decisions/0024-el-drop-policy.md)) reads `CurrentEL`; on EL2 it configures `HCR_EL2` / `SPSR_EL2` / `ELR_EL2` and `eret`s to a post-drop label, on EL1 it falls through, on EL3 (or any unexpected EL) it halts in a named-label `wfe`-loop (`halt_unsupported_el: wfe ; b halt_unsupported_el`) — there is no Rust panic infrastructure pre-`kernel_entry`. 
Third, the conventional setup: load `__stack_top` into `SP`, enable FP/SIMD via `CPACR_EL1`, zero the BSS range (`__bss_start` .. `__bss_end`) using 8-byte stores, and branch to `kernel_entry`. If `kernel_entry` ever returns (it shouldn't), the stub falls into a defensive `wfe ; b 2b` halt loop. After phase two, every later instruction runs at EL1 — the precondition T-009's `UNSAFE-2026-0016` runtime check now relies on as a load-bearing invariant rather than a defensive guard. -3. **`kernel_entry` (Rust, in the BSP).** The first Rust code to run. Constructs the BSP's concrete HAL instances (for Phase 4c: the `Pl011Uart` console), installs the EL1 vector table (T-012), captures the boot-to-end timestamp, **activates the MMU** via `mmu_bootstrap` (T-016 / ADR-0027 — this lands the v1 identity layout in `TTBR0_EL1` and flips `SCTLR_EL1.{M,I,C} = 1`; every subsequent MMIO access goes through device-nGnRnE attributes), **initialises the Physical Memory Manager** (T-017 / ADR-0035 — bitmap allocator over the 128 MiB RAM extent with two reserved ranges covering the QEMU firmware region and the kernel image / `.bss` / `.boot_pt` / boot stack), **initialises the address-space arena** (T-018 / ADR-0028 — wraps the already-active L0 root frame as `AddressSpaceArena` slot 0 + mints the bootstrap AS authority cap; no `Mmu::create_address_space` call on the live root per ADR-0028 §Simulation row 0), **loads the embedded userspace placeholder image** via [`task_loader::load_image`](task-loader.md) (T-019 / ADR-0029 — produces a `LoadedImage` describing a freshly populated AS for the embedded `mov w0, #42; ret` blob; **does NOT execute** — runnability gates on B5/B6 per phase-b §B4 §Revision-notes; first runtime exerciser of [UNSAFE-2026-0025](../audits/unsafe-log.md) post-bootstrap `Mmu::map`, [UNSAFE-2026-0026](../audits/unsafe-log.md) `Pmm::alloc_frame` zero-fill, and [UNSAFE-2026-0027](../audits/unsafe-log.md) loader byte-copy), initialises the GIC, unmasks `DAIF.I`, prints 
the timer banner, then sets up the kernel-object arenas + capability tables + IPC + scheduler before transferring control. Marked `#[no_mangle] extern "C"` so the assembly stub can find it. **(T-022 / ADR-0033 high-half migration — see §"High-half migration" below: `kernel_entry` now runs at the LOW physical alias with the MMU off and, after `mmu_bootstrap` + `high_half_activate` build the high-half `TTBR1_EL1` tables, branches the running kernel into the high half via the migration trampoline. The bring-up steps listed here — PMM, address-space arena, loader, GIC, scheduler — run in `kernel_main_high` at high-half addresses, with `TTBR0_EL1` freed for per-task userspace.)** -4. **Scheduler start (`start`).** The final call in `kernel_entry` is `start(SCHED.as_mut_ptr(), cpu, activate_address_space)`, which hands control to the cooperative FIFO scheduler and never returns; the scheduler runs the first ready task and drives the cooperative IPC demo until the system halts (see [scheduler.md](scheduler.md)). An early design intended a portable `tyrne_kernel::run` that a BSP would delegate to; the B-phase brought subsystem bring-up into `kernel_entry` instead, and `start` (defined in `kernel/src/sched/mod.rs`) is the actual handoff point. Consolidating the bring-up back into a portable kernel entry is a possible future refactor. +3. 
**`kernel_entry` → `kernel_main_high` (Rust, in the BSP).** The first Rust code to run, split across the high-half migration (T-022 / ADR-0033; see §"High-half migration" below for the mechanism): + - **`kernel_entry` (LOW physical alias, MMU off → low identity).** Constructs a throwaway low-MMIO `Pl011Uart` for early diagnostics, installs the EL1 vector table (T-012, low vectors), **activates the low-identity MMU** via `mmu_bootstrap` (T-016 / ADR-0027 — lands the v1 identity layout in `TTBR0_EL1`, flips `SCTLR_EL1.{M,I,C} = 1`; MMIO goes through device-nGnRnE attributes), then **builds the high-half `TTBR1_EL1` tables** via `high_half_activate` (T-022 / ADR-0033 — `EPD1 1→0`, both regimes now live) and **branches the running kernel into the high half** through the migration trampoline (`MSR VBAR`-high; rebase `SP`; `br kernel_main_high`). It never returns. Marked `#[no_mangle] extern "C"` so the assembly stub can find it. + - **`kernel_main_high` (HIGH half, `TTBR1_EL1`).** Frees `TTBR0_EL1` (null + `EPD0 = 1` + `TLBI VMALLE1`), prints `tyrne: high-half active`, then runs the rest of bring-up at high-half addresses: constructs the persistent `Pl011Uart` + `QemuVirtCpu` at the HIGH device-MMIO alias, captures the boot-to-end timestamp, **initialises the Physical Memory Manager** (T-017 / ADR-0035 — bitmap allocator over the 128 MiB RAM extent, two reserved ranges covering the QEMU firmware region and the kernel image / `.bss` / `.boot_pt` / boot stack), **initialises the address-space arena** (T-018 / ADR-0028 — wraps the bootstrap L0 root as `AddressSpaceArena` slot 0 + mints the bootstrap AS authority cap; no `Mmu::create_address_space` on the populated root per ADR-0028 §Simulation row 0), **loads the embedded userspace placeholder image** via [`task_loader::load_image`](task-loader.md) (T-019 / ADR-0029 — produces a `LoadedImage` for the embedded `mov w0, #42; ret` blob; **does NOT execute** — runnability gates on B6 per phase-b §B4 §Revision-notes; first 
runtime exerciser of [UNSAFE-2026-0025](../audits/unsafe-log.md) post-bootstrap `Mmu::map`, [UNSAFE-2026-0026](../audits/unsafe-log.md) `Pmm::alloc_frame` zero-fill, and [UNSAFE-2026-0027](../audits/unsafe-log.md) loader byte-copy), initialises the GIC, unmasks `DAIF.I`, prints the timer banner, then sets up the kernel-object arenas + capability tables + IPC + scheduler before transferring control to `start()`. +4. **Scheduler start (`start`).** The final call in `kernel_main_high` is `start(SCHED.as_mut_ptr(), cpu, activate_address_space)`, which hands control to the cooperative FIFO scheduler and never returns; the scheduler runs the first ready task and drives the cooperative IPC demo until the system halts (see [scheduler.md](scheduler.md)). An early design intended a portable `tyrne_kernel::run` that a BSP would delegate to; the B-phase brought subsystem bring-up into the kernel-entry path instead, and `start` (defined in `kernel/src/sched/mod.rs`) is the actual handoff point. Consolidating the bring-up back into a portable kernel entry is a possible future refactor. ### High-half migration (T-022 / ADR-0033) diff --git a/hal/src/mmu/mod.rs b/hal/src/mmu/mod.rs index 109070c..737b574 100644 --- a/hal/src/mmu/mod.rs +++ b/hal/src/mmu/mod.rs @@ -54,8 +54,8 @@ pub const PAGE_SIZE: usize = 4096; /// or peripherals above 4 GiB (e.g. the Raspberry Pi 4) needs a different /// offset **and** a revisited mask before this pattern is carried over. /// -/// **Host builds (`cfg(not(target_arch = "aarch64"))`) define the offset as -/// `0`** — there is no MMU or high-half on the test harness, so +/// **Host builds (`cfg(not(all(target_arch = "aarch64", target_os = "none")))`) +/// define the offset as `0`** — there is no MMU or high-half on the test harness, so /// [`phys_to_kernel_va`] / [`kernel_va_to_phys`] are the identity there and /// the kernel-crate host tests (PMM frame zero-fill, `phys_frame_kernel_ptr`) /// deref their real host-backed "frames" unchanged. 
Only the aarch64 kernel @@ -92,6 +92,13 @@ pub const KERNEL_HIGH_HALF_OFFSET: usize = 0; #[must_use] #[inline] pub const fn phys_to_kernel_va(pa: usize) -> usize { + // On the kernel build (OFFSET != 0) the direct map covers only the low + // 4 GiB of PA; catch an out-of-window PA early. On host builds (OFFSET == 0, + // identity) the check short-circuits — any host address is valid. + debug_assert!( + KERNEL_HIGH_HALF_OFFSET == 0 || pa < 0x1_0000_0000, + "phys_to_kernel_va: PA outside the low-4 GiB high-half direct map", + ); KERNEL_HIGH_HALF_OFFSET.wrapping_add(pa) } @@ -111,6 +118,16 @@ pub const fn phys_to_kernel_va(pa: usize) -> usize { #[must_use] #[inline] pub const fn kernel_va_to_phys(va: usize) -> usize { + // On the kernel build (OFFSET != 0) `va` must be a high-half direct-map VA + // whose recovered PA lands in the low 4 GiB; the `wrapping_sub` catches both + // a below-window VA (wraps high) and an above-window VA. On host builds + // (OFFSET == 0) the check short-circuits. (`wrapping_sub` rather than a + // `va >= OFFSET` compare avoids `clippy::absurd_extreme_comparisons` when + // OFFSET is 0 on host.) + debug_assert!( + KERNEL_HIGH_HALF_OFFSET == 0 || va.wrapping_sub(KERNEL_HIGH_HALF_OFFSET) < 0x1_0000_0000, + "kernel_va_to_phys: VA outside the high-half direct-map window", + ); va.wrapping_sub(KERNEL_HIGH_HALF_OFFSET) } diff --git a/hal/src/mmu/vmsav8.rs b/hal/src/mmu/vmsav8.rs index 35d783d..791c69c 100644 --- a/hal/src/mmu/vmsav8.rs +++ b/hal/src/mmu/vmsav8.rs @@ -172,6 +172,17 @@ pub const TCR_EL1_VALUE: u64 = { | as_field }; +/// `TCR_EL1.EPD0` (bit 7) — when set, **disable** `TTBR0_EL1` translation-table +/// walks. The boot-time high-half migration sets it (freeing `TTBR0` for +/// userspace); `QemuVirtMmu::activate` clears it on a per-task swap. Named here +/// so every site reuses one definition if the `TCR_EL1` layout ever changes. 
+pub const TCR_EL1_EPD0_BIT: u64 = 1 << 7; + +/// `TCR_EL1.EPD1` (bit 23) — when set, **disable** `TTBR1_EL1` translation-table +/// walks (the v1 default). [`TCR_EL1_VALUE_HIGH_HALF`] clears it to bring the +/// high half live. Named for reuse (see [`TCR_EL1_EPD0_BIT`]). +pub const TCR_EL1_EPD1_BIT: u64 = 1 << 23; + /// `TCR_EL1` value for the **high-half regime** (post-[ADR-0033] migration): /// byte-identical to [`TCR_EL1_VALUE`] except `EPD1` (bit 23) is cleared, /// enabling `TTBR1_EL1` translation-table walks for the kernel's high-half @@ -188,7 +199,7 @@ pub const TCR_EL1_VALUE: u64 = { /// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md /// [adr-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md#simulation /// [T-022]: https://github.com/HodeTech/Tyrne/blob/main/docs/analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md -pub const TCR_EL1_VALUE_HIGH_HALF: u64 = TCR_EL1_VALUE & !(1 << 23); +pub const TCR_EL1_VALUE_HIGH_HALF: u64 = TCR_EL1_VALUE & !TCR_EL1_EPD1_BIT; /// `SCTLR_EL1` bits we **set** when activating the MMU: `M` (bit 0, /// MMU on), `C` (bit 2, D-cache enable), `I` (bit 12, I-cache enable). diff --git a/tools/smoke.sh b/tools/smoke.sh index 97b2a48..537c21e 100755 --- a/tools/smoke.sh +++ b/tools/smoke.sh @@ -33,6 +33,13 @@ while [[ $# -gt 0 ]]; do esac done +# Validate the budget: it is passed to `timeout`/`alarm()` as integer seconds, +# both of which fail (or misbehave) on a non-numeric value. +if ! [[ "$TO" =~ ^[0-9]+$ ]]; then + echo "error: --timeout must be a non-negative integer (seconds); got '$TO'" >&2 + exit 2 +fi + [[ -z "$KERNEL" ]] && KERNEL="target/aarch64-unknown-none/${PROFILE}/tyrne-bsp-qemu-virt" if [[ ! 
-f "$KERNEL" ]]; then echo "error: kernel image not found at $KERNEL (run 'cargo kernel-build' first)" >&2 @@ -42,18 +49,32 @@ fi LOG="${TMPDIR:-/tmp}/tyrne-smoke.$$.log" echo "smoke: $KERNEL (budget ${TO}s) log -> $LOG" >&2 -# perl alarm wrapper: fork QEMU, SIGTERM it after $TO seconds. QEMU inherits -# the child's stdout/stderr (redirected to $LOG by the caller below). -TO="$TO" perl -e ' - my $pid = fork(); - if ($pid == 0) { open(STDIN, "<", "/dev/null"); exec(@ARGV) or die "exec: $!"; } - $SIG{ALRM} = sub { kill("TERM", $pid); }; - alarm($ENV{TO}); - waitpid($pid, 0); -' qemu-system-aarch64 -M virt -cpu cortex-a72 -m 128M -smp 1 \ - -display none -serial stdio -monitor none \ - "${INT_FLAGS[@]+"${INT_FLAGS[@]}"}" \ - -kernel "$KERNEL" > "$LOG" 2>&1 || true +# The kernel idles in WFI after completion and never exits on its own, so the +# run must be bounded by a wall-clock timeout. Prefer coreutils `timeout(1)` +# (present on most Linux CI images); fall back to a Perl `alarm()` wrapper +# (macOS ships Perl but not `timeout`); error out if neither is available. +QEMU_ARGS=( + -M virt -cpu cortex-a72 -m 128M -smp 1 + -display none -serial stdio -monitor none + "${INT_FLAGS[@]+"${INT_FLAGS[@]}"}" + -kernel "$KERNEL" +) +if command -v timeout >/dev/null 2>&1; then + timeout "${TO}s" qemu-system-aarch64 "${QEMU_ARGS[@]}" </dev/null > "$LOG" 2>&1 || true +elif command -v perl >/dev/null 2>&1; then + # Perl alarm wrapper: fork QEMU, SIGTERM it after $TO seconds. QEMU inherits + # the child's stdout/stderr (redirected to $LOG by the caller below).
+ TO="$TO" perl -e ' + my $pid = fork(); + if ($pid == 0) { open(STDIN, "<", "/dev/null"); exec(@ARGV) or die "exec: $!"; } + $SIG{ALRM} = sub { kill("TERM", $pid); }; + alarm($ENV{TO}); + waitpid($pid, 0); + ' qemu-system-aarch64 "${QEMU_ARGS[@]}" > "$LOG" 2>&1 || true +else + echo "error: neither 'timeout(1)' nor 'perl' is available to bound the ${TO}s run" >&2 + exit 1 +fi echo "===== trace =====" >&2 cat "$LOG" From dfba2b6288f54ac5fb7a3e7e7f110fb6414aae6d Mon Sep 17 00:00:00 2001 From: Cemil ILIK Date: Sat, 30 May 2026 14:03:30 +0300 Subject: [PATCH 7/7] =?UTF-8?q?fix(mmu):=20T-022=20PR=20review-round=203?= =?UTF-8?q?=20=E2=80=94=20ADR-0033=20single-offset=20rider,=20boot.md=20EN?= =?UTF-8?q?TRY/linker=20reconcile,=20hard=20direct-map=20asserts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses the PR #36 round-3 code review. Each finding was verified against current code; only still-valid issues changed, the rest skipped with reason. The migration mechanism is unchanged. - ADR-0033: append-only Revision-notes rider reconciling the SHIPPED single linear offset (KERNEL_HIGH_HALF_OFFSET = 0xFFFF_FFFF_0000_0000, canonical KBASE = 0xFFFF_FFFF_4008_0000) against the body's two-offset / 0xFFFF_FFFF_8008_0000 model. Original body left intact per ADR-0025 §Rule 2; the rider explains why the two PA<->VA offsets collapse to one and names ADR-0027 Option C's prospective base as superseded. - boot.md: ENTRY(_start_phys) reconciled (body previously contradicted itself with ENTRY(_start)); Stage-1 entry-point wording fixed; "Linker script responsibilities" rewritten to the real link-high/load-low form (KBASE + AT(), no MEMORY{} block, .text.vectors 2 KiB-aligned). - hal/mmu: phys_to_kernel_va / kernel_va_to_phys now use assert! (was debug_assert!) so an out-of-window PA/VA fail-stops in release (CLAUDE.md #1) rather than wrapping to a wild pointer; added # Panics sections so clippy::missing_panics_doc stays green (assert! 
is not exempt; debug_assert! was). The check cannot fire in v1 (QEMU PA < 4 GiB). - mmu_bootstrap: high_half_activate's two SAFETY comments gained explicit "Safer alternatives rejected" clauses, matching the sibling mmu_bootstrap() blocks. Audit entries UNSAFE-2026-0022 / 0023 already cover these writes via their T-022 Amendments (no audit-log change needed). - main.rs: timer-banner comment corrected — BOOT_NS is captured POST high-half migration (excludes MMU-activation + migration cost), not pre-MMU. - tools/smoke.sh: --timeout now requires a strictly positive integer; rejects "0", which disables timeout(1)/alarm() and would let the WFI-idling kernel hang the run forever. Skipped (with reason): the migration trampoline / TTBR0-free / panic-handler unsafe blocks in main.rs already carry conforming SAFETY comments + audit IDs (incl. UNSAFE-2026-0031), and the referenced audit entries already exist with T-022 Amendments — no change needed. Gates green: cargo fmt, host + kernel clippy -D warnings, 340 host tests, kernel build, QEMU smoke (gated PASS; -d int,unimp fault-clean, high-half active -> all tasks complete). Refs: T-022, ADR-0033, ADR-0025, PR #36 Co-Authored-By: Claude Opus 4.8 (1M context) --- bsp-qemu-virt/src/main.rs | 8 +++-- bsp-qemu-virt/src/mmu_bootstrap.rs | 13 ++++++-- docs/architecture/boot.md | 8 ++--- .../0033-kernel-high-half-migration.md | 8 +++++ hal/src/mmu/mod.rs | 32 ++++++++++++++++--- tools/smoke.sh | 11 ++++--- 6 files changed, 63 insertions(+), 17 deletions(-) diff --git a/bsp-qemu-virt/src/main.rs b/bsp-qemu-virt/src/main.rs index 51bacc6..ed220a1 100644 --- a/bsp-qemu-virt/src/main.rs +++ b/bsp-qemu-virt/src/main.rs @@ -1421,9 +1421,11 @@ extern "C" fn kernel_main_high() -> ! { // (it sampled CNTFRQ_EL0 and cached the resolution). Print the timer // parameters so QEMU output makes the measurement visible. The UART // write goes through the device-nGnRnE mapping installed by - // `mmu_bootstrap`. 
The boot-to-end timestamp was already captured - // pre-MMU (above) so the recorded baseline includes the MMU - // activation cost. + // `mmu_bootstrap`. The boot-to-end baseline (`BOOT_NS`) was captured just + // above — *post* high-half migration — so it measures the high-half steady + // state and therefore *excludes* the MMU-activation + migration cost (see + // the `boot_ns` snapshot comment above for the metric-meaning shift vs the + // pre-T-022 baseline, which included MMU activation). { let mut w = FmtWriter(console); let _ = writeln!( diff --git a/bsp-qemu-virt/src/mmu_bootstrap.rs b/bsp-qemu-virt/src/mmu_bootstrap.rs index 81fa9a9..9fdbe1a 100644 --- a/bsp-qemu-virt/src/mmu_bootstrap.rs +++ b/bsp-qemu-virt/src/mmu_bootstrap.rs @@ -333,7 +333,11 @@ pub unsafe fn high_half_activate() { // duration of this single-core boot call, and pre-zeroed by the BSS // loop. Indices are < 512 by the `& 0x1FF` construction. The table // descriptors point at the shared L2 tables (device / RAM) and the new - // L1_hh, all by PA. Audit: UNSAFE-2026-0022. + // L1_hh, all by PA. Safer alternatives rejected: identical to + // `mmu_bootstrap` Step 1 — VMSAv8 descriptors are raw `u64` words at fixed + // offsets in frames the BSP owns by address (not as Rust objects), so a + // safe wrapper would only relocate, not remove, the audited + // `write_volatile`. Audit: UNSAFE-2026-0022. unsafe { // L0_hh[511] → L1_hh core::ptr::write_volatile(l0_hh.add(l0_idx), table_descriptor(l1_hh as u64)); @@ -350,7 +354,12 @@ pub unsafe fn high_half_activate() { // live low regime is undisturbed — row 1) + `ISB`. After this the high // half translates, but the PC/SP/VBAR are still low (the migration // trampoline performs the crossing). `nomem` omitted so the descriptor - // writes above are not reordered past this block. Audit: UNSAFE-2026-0023. + // writes above are not reordered past this block. 
Safer alternatives + // rejected: identical to `mmu_bootstrap` Step 2 — EL1 system registers + // (`TTBR1_EL1`, `TCR_EL1`) and the `DSB`/`ISB` ordering have no safe-Rust + // expression; inline `asm!` is the minimal architected surface (no + // `cortex-a`-class crate in the dependency graph, per ADR-0014). + // Audit: UNSAFE-2026-0023. unsafe { asm!( "dsb ish", diff --git a/docs/architecture/boot.md b/docs/architecture/boot.md index 0cf9776..2ca4cd0 100644 --- a/docs/architecture/boot.md +++ b/docs/architecture/boot.md @@ -12,7 +12,7 @@ The overall three-layer architecture is described in [`overview.md`](overview.md The four boot stages, each with a tightly bounded responsibility: -1. **Firmware / loader.** QEMU's `-kernel` flag loads the ELF image at its linked-in load address (`0x40080000` per [ADR-0012](../decisions/0012-boot-flow-qemu-virt.md)), sets the PC to the ELF's entry point (`_start`), and enters at EL1 (default QEMU `virt`) or EL2 (`-machine virtualization=on`, or most real-hardware boot stacks delivering at EL2). The device-tree blob address is placed in `x0`; v1 ignores it. +1. **Firmware / loader.** QEMU's `-kernel` flag loads the ELF image at its load address (`0x40080000` per [ADR-0012](../decisions/0012-boot-flow-qemu-virt.md); the image is *linked high* but *loaded low* — see §"High-half migration"), sets the PC to the ELF's entry point (`_start_phys`, the LOW physical address of `_start` — the MMU is off at reset), and enters at EL1 (default QEMU `virt`) or EL2 (`-machine virtualization=on`, or most real-hardware boot stacks delivering at EL2). The device-tree blob address is placed in `x0`; v1 ignores it. 2. **Assembly stub (`_start`).** Three phases: first, K3-12 (interrupts masked via `MSR DAIFSet, #0xf`) executes at the very head of the reset vector so a spurious interrupt cannot escape into an uninstalled vector table. 
Second, the EL drop (per [ADR-0024](../decisions/0024-el-drop-policy.md)) reads `CurrentEL`; on EL2 it configures `HCR_EL2` / `SPSR_EL2` / `ELR_EL2` and `eret`s to a post-drop label, on EL1 it falls through, on EL3 (or any unexpected EL) it halts in a named-label `wfe`-loop (`halt_unsupported_el: wfe ; b halt_unsupported_el`) — there is no Rust panic infrastructure pre-`kernel_entry`. Third, the conventional setup: load `__stack_top` into `SP`, enable FP/SIMD via `CPACR_EL1`, zero the BSS range (`__bss_start` .. `__bss_end`) using 8-byte stores, and branch to `kernel_entry`. If `kernel_entry` ever returns (it shouldn't), the stub falls into a defensive `wfe ; b 2b` halt loop. After phase two, every later instruction runs at EL1 — the precondition T-009's `UNSAFE-2026-0016` runtime check now relies on as a load-bearing invariant rather than a defensive guard. 3. **`kernel_entry` → `kernel_main_high` (Rust, in the BSP).** The first Rust code to run, split across the high-half migration (T-022 / ADR-0033; see §"High-half migration" below for the mechanism): - **`kernel_entry` (LOW physical alias, MMU off → low identity).** Constructs a throwaway low-MMIO `Pl011Uart` for early diagnostics, installs the EL1 vector table (T-012, low vectors), **activates the low-identity MMU** via `mmu_bootstrap` (T-016 / ADR-0027 — lands the v1 identity layout in `TTBR0_EL1`, flips `SCTLR_EL1.{M,I,C} = 1`; MMIO goes through device-nGnRnE attributes), then **builds the high-half `TTBR1_EL1` tables** via `high_half_activate` (T-022 / ADR-0033 — `EPD1 1→0`, both regimes now live) and **branches the running kernel into the high half** through the migration trampoline (`MSR VBAR`-high; rebase `SP`; `br kernel_main_high`). It never returns. Marked `#[no_mangle] extern "C"` so the assembly stub can find it. @@ -165,9 +165,9 @@ post_eret: [`bsp-qemu-virt/linker.ld`](../../bsp-qemu-virt/linker.ld) pins the above memory map: -- `ENTRY(_start)` — the ELF's `e_entry` is set to `_start`'s address. 
-- `MEMORY` — a single `RAM` region: `ORIGIN = 0x40080000, LENGTH = 128M`. -- `.text` starts with `KEEP(*(.text.boot))`, guaranteeing `_start` is at `0x40080000`. +- `ENTRY(_start_phys)` — the ELF's `e_entry` is set to `_start_phys` (`= _start - KERNEL_HH_OFFSET`), the LOW physical address of `_start`, so QEMU's reset PC is physical (the MMU is off at reset; the high VMA would translation-fault immediately). This matches the link-high/load-low migration described in §"High-half migration" and [ADR-0033](../decisions/0033-kernel-high-half-migration.md). +- **Link-high / load-low (ADR-0033).** Three constants pin the split — `KERNEL_HH_OFFSET = 0xFFFF_FFFF_0000_0000`, `KERNEL_IMAGE_PHYS_BASE = 0x40080000`, and `KBASE = KERNEL_HH_OFFSET + KERNEL_IMAGE_PHYS_BASE` (`= 0xFFFF_FFFF_4008_0000`). Virtual addresses start at `. = KBASE`; each section sets its load address low via `AT(ADDR(.section) - KERNEL_HH_OFFSET)`, so the whole image is one uniform high-half alias of the physical image loaded at `0x40080000`. (There is no `MEMORY {}` block — the single 128 MiB region is expressed directly with `KBASE` + `AT()`.) +- `.text` starts with `KEEP(*(.text.boot))` so `_start` is first (VMA `KBASE`, LMA `0x40080000` — where QEMU loads it and where it runs with the MMU off), followed by the 2 KiB-aligned `KEEP(*(.text.vectors))` exception-vector table (`VBAR_EL1` requires 2 KiB alignment). - `.bss` is 8-byte aligned at both ends so the BSS-zero loop can step by 8. - A 64 KiB stack region is reserved after `.bss`; `__stack_top` names its high end. - `/DISCARD/` drops `.comment`, `.note.*`, `.eh_frame*`, and `.gcc_except_table*` — unwinding tables are dead weight under [`panic=abort`](../standards/error-handling.md). 
diff --git a/docs/decisions/0033-kernel-high-half-migration.md b/docs/decisions/0033-kernel-high-half-migration.md index 38f1f19..583aa87 100644 --- a/docs/decisions/0033-kernel-high-half-migration.md +++ b/docs/decisions/0033-kernel-high-half-migration.md @@ -183,6 +183,14 @@ All six steps are [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mappi - **Pro:** Zero new code/unsafe/risk this milestone; the B5 proxy keeps passing. - **Con:** Blocks B6's defining goal — without kernel reachability from the task's translation, a real EL0 task's `SVC` vector fetch translation-faults unrecoverably ([phase-b §B6](../roadmap/phases/phase-b.md#milestone-b6--first-userspace-hello)). "Nothing in B6 runs until this is solved." A "no decision" recorded only to reject it. +## Revision notes + +- **2026-05-30 — Post-Accept correction: single PA↔VA offset; canonical `KBASE = 0xFFFF_FFFF_4008_0000`.** Three places in the body above describe a **two-offset** model that [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md)'s implementation did **not** ship: §Considered options Option 1's high base (`0xFFFF_FFFF_8000_0000+`), the §High-half layout table's kernel-image row (`KBASE = 0xFFFF_FFFF_8008_0000`), and §Dependency chain steps 1–2 (`KBASE = 0xFFFF_FFFF_8008_0000`; *"TWO distinct PA↔VA offsets … conflating them is a bug"* — `KERNEL_IMAGE_LINK_OFFSET` vs `KERNEL_PHYSMAP_BASE`). The shipped contract uses a **single** linear offset `KERNEL_HIGH_HALF_OFFSET = 0xFFFF_FFFF_0000_0000` ([`hal/src/mmu/mod.rs`](../../hal/src/mmu/mod.rs)), so `kernel_VA(pa) = KERNEL_HIGH_HALF_OFFSET + pa` serves **both** roles, and the kernel image — linked at `KERNEL_HIGH_HALF_OFFSET + KERNEL_IMAGE_PHYS_BASE` — lands at the canonical **`KBASE = 0xFFFF_FFFF_4008_0000`** ([`bsp-qemu-virt/linker.ld`](../../bsp-qemu-virt/linker.ld), `KERNEL_HH_OFFSET` / `KBASE`), **not** `0xFFFF_FFFF_8008_0000`. 
+ + **Why the model collapsed to one offset.** Choosing the offset as `0xFFFF_FFFF_0000_0000` — which places PA `0` at the base of the top 4 GiB, so every QEMU `virt` PA maps with `VA[55] = 1` and no overflow — makes the **image-link** offset and the **physmap/direct-map** offset *the same value*: the kernel-image PA range is a subset of the direct-mapped PA range, so the two coincide for every in-image address and "using the wrong offset at a site" is impossible by construction (the value is identical). The original two-offset framing assumed the ARM-convention `0xFFFF_FFFF_8000_0000` base, under which the image-link and physmap bases genuinely differed; the implemented `0xFFFF_FFFF_0000_0000` base removes that distinction. The §Simulation row-0 *"both PA↔VA offsets"* verification (and §Dependency chain step 2's split) therefore reduces to the single-offset host tests on `phys_to_kernel_va` / `kernel_va_to_phys`. + + **Unchanged.** The **decision itself is intact** — Option 1 (boot-time high-half migration), the §Simulation barrier / ordering / `PXN`-window pins, the single `EPD1 1→0` flip, the boot-time framing, the ASID policy, and every consequence stand exactly as accepted; only the offset *count* and the `KBASE` *constant* are corrected. Per [ADR-0025 §Rule 2](0025-adr-governance-amendments.md), the original body above is left intact and this rider is the canonical correction. This canonical `KBASE` also supersedes the *prospective* high-half base sketched in [ADR-0027](0027-kernel-virtual-memory-layout.md) §Considered options Option C (`0xFFFF_FFFF_8008_0000+`) — that ADR's append-only body stays as written, with ADR-0033 (this rider) the canonical home of the value. Refs: [T-022](../analysis/tasks/phase-b/T-022-high-half-kernel-mapping.md), `bsp-qemu-virt/linker.ld`, `hal/src/mmu/mod.rs`. 
+ ## References - [ADR-0027 — Kernel virtual memory layout](0027-kernel-virtual-memory-layout.md) — the identity-only B2 layout that reserved `TTBR1`/`EPD1`, pre-committed the high-half `TCR` fields, and named this ADR as the high-half home. diff --git a/hal/src/mmu/mod.rs b/hal/src/mmu/mod.rs index 737b574..28d3d95 100644 --- a/hal/src/mmu/mod.rs +++ b/hal/src/mmu/mod.rs @@ -88,14 +88,28 @@ pub const KERNEL_HIGH_HALF_OFFSET: usize = 0; /// `arithmetic_side_effects` discipline; the QEMU virt PA range cannot /// overflow the offset (see [`KERNEL_HIGH_HALF_OFFSET`]). /// +/// # Panics +/// +/// On the kernel build (`KERNEL_HIGH_HALF_OFFSET != 0`), panics if `pa` falls +/// outside the low-4 GiB direct-map window — a wrapped result would be a wild +/// pointer, so the helper fail-stops in release too (CLAUDE.md #1). The +/// condition cannot occur in v1 (the QEMU virt PA range is well under 4 GiB); +/// the `assert!` guards a future BSP that wires a larger PA without first +/// revisiting the offset. On host builds (offset `0`, identity) it +/// short-circuits — any host address is valid. +/// /// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md #[must_use] #[inline] pub const fn phys_to_kernel_va(pa: usize) -> usize { // On the kernel build (OFFSET != 0) the direct map covers only the low // 4 GiB of PA; catch an out-of-window PA early. On host builds (OFFSET == 0, - // identity) the check short-circuits — any host address is valid. - debug_assert!( + // identity) the check short-circuits — any host address is valid. Hard + // `assert!` (not `debug_assert!`): a wrapped out-of-window PA would be a + // wild pointer, so we fail-stop in release too (CLAUDE.md #1 — conservative; + // the check is a single predictable branch on a value that cannot occur in + // v1, and also rejects an out-of-window arg at const-eval time). 
+ assert!( KERNEL_HIGH_HALF_OFFSET == 0 || pa < 0x1_0000_0000, "phys_to_kernel_va: PA outside the low-4 GiB high-half direct map", ); @@ -114,6 +128,14 @@ pub const fn phys_to_kernel_va(pa: usize) -> usize { /// broken project-wide"). Only valid for direct-mapped high-half addresses; /// `wrapping_sub` matches the kernel's `arithmetic_side_effects` discipline. /// +/// # Panics +/// +/// On the kernel build, panics if `va` is not a high-half direct-map VA whose +/// recovered PA lands in the low 4 GiB — a wrapped result would be a wild +/// pointer, so the helper fail-stops in release too (CLAUDE.md #1; mirrors +/// [`phys_to_kernel_va`]). Cannot occur in v1. On host builds (offset `0`) it +/// short-circuits. +/// /// [ADR-0033]: https://github.com/HodeTech/Tyrne/blob/main/docs/decisions/0033-kernel-high-half-migration.md #[must_use] #[inline] @@ -123,8 +145,10 @@ pub const fn kernel_va_to_phys(va: usize) -> usize { // a below-window VA (wraps high) and an above-window VA. On host builds // (OFFSET == 0) the check short-circuits. (`wrapping_sub` rather than a // `va >= OFFSET` compare avoids `clippy::absurd_extreme_comparisons` when - // OFFSET is 0 on host.) - debug_assert!( + // OFFSET is 0 on host.) Hard `assert!` (fires in release too) so a wrapped + // out-of-window VA fail-stops rather than becoming a wild pointer (CLAUDE.md + // #1; mirrors `phys_to_kernel_va`). + assert!( KERNEL_HIGH_HALF_OFFSET == 0 || va.wrapping_sub(KERNEL_HIGH_HALF_OFFSET) < 0x1_0000_0000, "kernel_va_to_phys: VA outside the high-half direct-map window", ); diff --git a/tools/smoke.sh b/tools/smoke.sh index 537c21e..cdf9c5a 100755 --- a/tools/smoke.sh +++ b/tools/smoke.sh @@ -33,10 +33,13 @@ while [[ $# -gt 0 ]]; do esac done -# Validate the budget: it is passed to `timeout`/`alarm()` as integer seconds, -# both of which fail (or misbehave) on a non-numeric value. -if ! 
[[ "$TO" =~ ^[0-9]+$ ]]; then - echo "error: --timeout must be a non-negative integer (seconds); got '$TO'" >&2 +# Validate the budget: it is passed to `timeout`/`alarm()` as integer seconds. +# Both fail (or misbehave) on a non-numeric value, and zero is worse than +# invalid — `timeout 0s` *disables* the timeout and `alarm(0)` cancels it, so a +# zero budget would let the WFI-idling kernel hang the run forever. Require a +# strictly positive integer (the regex rejects "0", non-numerics, and ""). +if ! [[ "$TO" =~ ^[1-9][0-9]*$ ]]; then + echo "error: --timeout must be a positive integer (seconds); got '$TO'" >&2 exit 2 fi