From 5e8c46817104275a0316d737065a95de0bf686c0 Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Thu, 16 Apr 2026 05:03:01 +0200 Subject: [PATCH 1/2] docs(security): write canister upgrade safety guide Covers pre/post-upgrade hooks, stable memory patterns (Motoko persistent actor and Rust stable structures), Candid interface compatibility rules, snapshot-based rollback workflow, schema evolution, local testing procedures, and controller safety. Written as a security checklist with actionable patterns for production upgrades. --- docs/guides/security/canister-upgrades.md | 356 +++++++++++++++++++++- 1 file changed, 344 insertions(+), 12 deletions(-) diff --git a/docs/guides/security/canister-upgrades.md b/docs/guides/security/canister-upgrades.md index 727adc49..7412074f 100644 --- a/docs/guides/security/canister-upgrades.md +++ b/docs/guides/security/canister-upgrades.md @@ -1,21 +1,353 @@ --- title: "Secure Upgrades" -description: "Upgrade canisters safely with pre/post hooks, stable memory, and snapshot rollbacks" +description: "Upgrade canisters safely: pre/post hooks, stable memory, Candid compatibility, snapshot rollbacks, and testing" sidebar: order: 2 --- -TODO: Write content for this page. +Canister upgrades are one of the highest-risk operations in production. A bad upgrade can corrupt state, make the canister permanently non-upgradeable, or break clients. This guide covers the patterns and checks you need to upgrade safely. - -Upgrade canisters safely without losing data or breaking functionality. Cover pre/post-upgrade hooks (what to serialize and when), stable memory migration strategies, Candid interface compatibility (subtyping), rollback strategies using snapshots, and testing upgrades before deploying to mainnet. Written as a security checklist. +## Checklist - -- Portal: building-apps/canister-management/upgrade.mdx -- icskills: canister-security +Use this before every production upgrade: - -- guides/canister-management/lifecycle -- upgrade mechanics -- guides/canister-management/snapshots -- snapshot-based rollback -- guides/backends/data-persistence -- data across upgrades -- guides/testing/strategies -- testing upgrades +- [ ] Take a snapshot immediately before upgrading +- [ ] Run the upgrade locally first with `icp deploy` +- [ ] Verify data survives: write → upgrade → read +- [ ] Check Candid interface compatibility — no removed methods, no breaking type changes +- [ ] Avoid `pre_upgrade` hooks that serialize large state (use stable structures instead) +- [ ] In Motoko, use `persistent actor` — avoid manual `pre_upgrade`/`post_upgrade` +- [ ] Confirm you have a backup controller (cannot recover from a trapped `post_upgrade` without one) +- [ ] Add a rollback plan: snapshot ID recorded, restore procedure tested + +## How upgrades work + +When you run `icp deploy` on an existing canister, the IC executes the following sequence: + +1. **Stop** the canister (waits for in-flight messages to complete) +2. Run `pre_upgrade` on the old code (if defined) +3. Replace the Wasm module with the new code +4. Run `post_upgrade` on the new code (if defined) +5. **Restart** the canister + +Stable memory is preserved through steps 2–4. Heap memory is cleared when the new Wasm loads. If `pre_upgrade` or `post_upgrade` traps, the upgrade fails with different consequences: + +| Hook | Trap result | +|------|-------------| +| `pre_upgrade` | Upgrade cancelled. Old code still running. State intact but may need attention. | +| `post_upgrade` | New Wasm installed but initialization failed. Canister may be in an inconsistent state. | + +Both scenarios leave the canister in a difficult state. Prevention is far better than recovery. + +## Stable memory patterns + +### Motoko: use `persistent actor` + +The `persistent actor` declaration automatically stores all `let` and `var` fields in stable memory. No serialization, no upgrade hooks, no instruction-limit traps. + +```motoko +persistent actor Counter { + var count : Nat = 0; + + public func increment() : async Nat { + count += 1; + count; + }; + + public query func get() : async Nat { count }; + + // transient: resets to [] on each upgrade — correct for caches + transient var recentCallers : [Principal] = []; +}; +``` + +**Key rules:** + +- All `let`/`var` fields persist automatically — no `stable` keyword needed +- `transient var` for caches or counters that should reset on upgrade +- Do not write manual `pre_upgrade`/`post_upgrade` hooks — the runtime handles everything +- If a persistent field's type changes incompatibly, the upgrade traps. See [Schema evolution](#schema-evolution). + +### Rust: use stable structures + +In Rust, use [`ic-stable-structures`](https://docs.rs/ic-stable-structures/latest/ic_stable_structures/) to store data directly in stable memory. Data lives there from the start — no serialization step on upgrade. + +```rust +use ic_stable_structures::{ + memory_manager::{MemoryId, MemoryManager, VirtualMemory}, + DefaultMemoryImpl, StableBTreeMap, StableCell, +}; +use std::cell::RefCell; + +type Memory = VirtualMemory; + +// Each structure must have its own unique MemoryId — never reuse IDs +const USERS_MEM_ID: MemoryId = MemoryId::new(0); +const COUNTER_MEM_ID: MemoryId = MemoryId::new(1); + +thread_local! { + static MEMORY_MANAGER: RefCell> = + RefCell::new(MemoryManager::init(DefaultMemoryImpl::default())); + + static USERS: RefCell, Memory>> = + RefCell::new(StableBTreeMap::init( + MEMORY_MANAGER.with(|m| m.borrow().get(USERS_MEM_ID)) + )); + + static COUNTER: RefCell> = + RefCell::new(StableCell::init( + MEMORY_MANAGER.with(|m| m.borrow().get(COUNTER_MEM_ID)), + 0u64, + ).expect("Failed to init counter")); +} + +#[ic_cdk::post_upgrade] +fn post_upgrade() { + // Stable structures auto-restore — no deserialization needed. + // Re-initialize timers or transient state here if required. +} +``` + +> **Warning:** Each `MemoryId` must map to exactly one data structure for the lifetime of the canister. Reusing a `MemoryId` for a different structure after an upgrade corrupts both. Keep a written record of your `MemoryId` allocations and never reorder them. + +### Avoid `pre_upgrade` serialization + +The serialization-based upgrade pattern is common in older Rust code but is fundamentally fragile: + +```rust +// DO NOT DO THIS in production +#[ic_cdk::pre_upgrade] +fn pre_upgrade() { + // If STATE is large, this hits the instruction limit and traps. + // A trapped pre_upgrade prevents the upgrade — canister stays on old code. + ic_cdk::storage::stable_save((STATE.with(|s| s.borrow().clone()),)).unwrap(); +} +``` + +When `pre_upgrade` traps due to instruction exhaustion, the canister cannot be upgraded. The `skip_pre_upgrade` flag (an emergency escape hatch via the management canister's `install_code` API) bypasses the hook — but anything the hook would have saved is lost. Use stable structures so the upgrade path cannot brick itself under load. + +## Candid interface compatibility + +The IC checks your new Wasm module's Candid interface against the old one before completing the upgrade. If the new interface is not backward-compatible, the upgrade is rejected. + +**Safe changes:** + +| Change | Why it is safe | +|--------|---------------| +| Add a new method | Existing clients don't call it | +| Add optional parameters to an existing method | Old clients send no value; IC substitutes `null` | +| Remove trailing parameters from an existing method | Old clients send extra values; IC ignores them | +| Return additional values from a method | Old clients ignore extra return values | +| Change a parameter type to a supertype | Old values remain valid inputs | +| Change a return type to a subtype | New values remain valid for old clients | + +**Breaking changes (upgrade rejected or clients break):** + +| Change | Why it breaks | +|--------|--------------| +| Remove a method | Clients calling it get errors | +| Add a required (non-optional) parameter | Old clients don't send it | +| Change a parameter type to an incompatible type | Old clients send invalid values | + +**Example — safe evolution:** + +```candid +// Before +service counter : { + add : (nat) -> (); + get : () -> (int) query; +} + +// After — safe: optional param added, new return value, new method +service counter : { + add : (nat, label : opt text) -> (new_val : nat); + get : () -> (nat, last_change : nat) query; + reset : () -> (); +} +``` + +icp-cli checks Candid compatibility during deploy and prompts for confirmation if it detects a potentially breaking change. Use `--yes` in automated pipelines after manually verifying compatibility: + +```bash +icp deploy my-canister -e ic --yes +``` + +## Snapshot-based rollback + +Always take a snapshot immediately before a risky upgrade. If the upgrade causes unexpected behavior, you can restore the previous state within minutes. + +```bash +# 1. Stop the canister and create a snapshot +icp canister stop my-canister -e ic +icp canister snapshot create my-canister -e ic +# Note the snapshot ID printed in the output + +# 2. Deploy the upgrade +icp deploy my-canister -e ic + +# 3. Verify correctness +icp canister call my-canister health_check -e ic + +# 4a. If everything works, clean up when no longer needed +icp canister snapshot delete my-canister -e ic + +# 4b. If something is wrong, stop and restore +icp canister stop my-canister -e ic +icp canister snapshot restore my-canister -e ic +icp canister start my-canister -e ic +``` + +Snapshots capture the full canister state: Wasm module, Wasm heap memory, stable memory, and chunk store. Restoring from a snapshot brings back all of this state atomically. + +See [Canister snapshots](../canister-management/snapshots.md) for listing, downloading, and the state transfer workflow. + +## Schema evolution + +Upgrading canister code sometimes requires changing the shape of stored data. The rules differ by language. + +### Motoko + +When upgrading a `persistent actor`, the runtime checks that every persistent field's current type is compatible with the value stored in stable memory. Incompatible changes cause the upgrade to trap. + +**Safe changes:** + +- Add new `let` or `var` fields with initial values — the runtime initializes them on upgrade +- Add optional record fields (e.g., change `{ name : Text }` to `{ name : Text; email : ?Text }`) +- Widen a field's type (e.g., `Nat` → `Int`) + +**Unsafe changes (upgrade traps):** + +- Remove or rename a persistent field +- Narrow a field's type (e.g., `Int` → `Nat`) +- Change a non-optional field to an incompatible type + +If you need to make an unsafe change, migrate the data in two upgrades: add the new field alongside the old one, upgrade once (both fields present), then upgrade again to remove the old field. Test this two-step process locally before deploying to mainnet. + +### Rust + +Rust stable structures use serialized bytes on disk. Schema evolution safety depends on the serialization format and versioning strategy. + +**Adding fields safely with `serde(default)`:** + +```rust +use ic_stable_structures::storable::{Bound, Storable}; +use serde::{Deserialize, Serialize}; +use std::borrow::Cow; + +#[derive(Serialize, Deserialize, Clone)] +struct UserV2 { + id: u64, + name: String, + created: u64, + // New optional field — safe to add with serde default + // Old records without this field deserialize to None + #[serde(default)] + email: Option, +} + +impl Storable for UserV2 { + // Unbounded avoids write failures when struct grows. + // Bounded requires a fixed max_size; if encoded size exceeds it after + // adding fields, writes trap. + const BOUND: Bound = Bound::Unbounded; + + fn to_bytes(&self) -> Cow<'_, [u8]> { + let mut buf = vec![]; + ciborium::into_writer(self, &mut buf).expect("encode failed"); + Cow::Owned(buf) + } + + fn from_bytes(bytes: Cow<'_, [u8]>) -> Self { + ciborium::from_reader(bytes.as_ref()).expect("decode failed") + } +} +``` + +**Rules:** + +- Use `#[serde(default)]` on new optional fields so old records deserialize correctly +- Use `Bound::Unbounded` unless you have a strict size requirement +- Never reorder `MemoryId` allocations across upgrades — same effect as changing a field type +- For breaking schema changes, use a versioned enum and migrate records lazily on read + +## Testing upgrades locally + +Never upgrade on mainnet without first verifying locally that data written before the upgrade is still readable after. + +**Motoko:** + +```bash +# Start local network +icp network start -d + +# Deploy initial version +icp deploy backend + +# Write data +icp canister call backend increment '()' +icp canister call backend increment '()' +icp canister call backend get '()' +# Returns: (2 : nat) + +# Modify source code, then redeploy +icp deploy backend + +# Verify data survived +icp canister call backend get '()' +# Must still return: (2 : nat) +``` + +**Rust:** + +```bash +# Start local network +icp network start -d + +# Deploy initial version +icp deploy backend + +# Write data +icp canister call backend add_user '("Alice")' +icp canister call backend get_user_count '()' +# Returns: (1 : nat64) + +# Modify source code, then upgrade +icp deploy backend + +# Verify data survived +icp canister call backend get_user_count '()' +# Must still return: (1 : nat64) +``` + +If the count drops to zero after upgrade, your data is not in stable memory — review your storage declarations before touching mainnet. + +For advanced scenarios (upgrade rollbacks, schema migrations, concurrent call safety), use [PocketIC](../testing/pocket-ic.md) to script multi-step upgrade scenarios in a controlled environment. + +## Controller safety + +You cannot upgrade a canister without a valid controller. Losing all controller keys leaves the canister permanently frozen at its current code — there is no recovery path on the IC. + +```bash +# Check current controllers +icp canister settings show my-canister -e ic + +# Add a backup controller before any risky upgrade +icp canister settings update my-canister --add-controller -e ic +``` + +For production canisters: + +- Maintain at least two controllers (primary identity + hardware wallet or multisig) +- For fully onchain governance, add an SNS or DAO canister as controller and remove personal principals + +See [Access management](access-management.md) for detailed controller management patterns. + +## Next steps + +- [Data persistence](../backends/data-persistence.md) — stable structures and upgrade patterns in depth +- [Canister lifecycle](../canister-management/lifecycle.md) — the full upgrade sequence and install modes +- [Canister snapshots](../canister-management/snapshots.md) — create and restore snapshots +- [Testing strategies](../testing/strategies.md) — test upgrade scenarios before deploying to mainnet +- [Access management](access-management.md) — manage controllers and prevent lock-out + + From 3fa932c9d957794a30299c492b8d47abaae561ec Mon Sep 17 00:00:00 2001 From: Marco Walz Date: Thu, 16 Apr 2026 13:48:17 +0200 Subject: [PATCH 2/2] docs(canister-upgrades): address PR #57 feedback - fix snapshot workflow and schema evolution encoding --- docs/guides/security/canister-upgrades.md | 29 ++++++++++------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/docs/guides/security/canister-upgrades.md b/docs/guides/security/canister-upgrades.md index 7412074f..84f583b5 100644 --- a/docs/guides/security/canister-upgrades.md +++ b/docs/guides/security/canister-upgrades.md @@ -1,6 +1,6 @@ --- title: "Secure Upgrades" -description: "Upgrade canisters safely: pre/post hooks, stable memory, Candid compatibility, snapshot rollbacks, and testing" +description: "Upgrade canisters safely: pre/post hooks, stable memory, Candid compatibility, snapshot rollbacks, schema evolution, and testing" sidebar: order: 2 --- @@ -16,7 +16,7 @@ Use this before every production upgrade: - [ ] Verify data survives: write → upgrade → read - [ ] Check Candid interface compatibility — no removed methods, no breaking type changes - [ ] Avoid `pre_upgrade` hooks that serialize large state (use stable structures instead) -- [ ] In Motoko, use `persistent actor` — avoid manual `pre_upgrade`/`post_upgrade` +- [ ] In Motoko, use `persistent actor` (which eliminates the need for pre_upgrade hooks) — avoid manual `pre_upgrade`/`post_upgrade` - [ ] Confirm you have a backup controller (cannot recover from a trapped `post_upgrade` without one) - [ ] Add a rollback plan: snapshot ID recorded, restore procedure tested @@ -56,7 +56,7 @@ persistent actor Counter { public query func get() : async Nat { count }; - // transient: resets to [] on each upgrade — correct for caches + // transient: resets to [] on each upgrade — correct for caches, transient logs, and reset-on-upgrade counters transient var recentCallers : [Principal] = []; }; ``` @@ -124,7 +124,7 @@ fn pre_upgrade() { } ``` -When `pre_upgrade` traps due to instruction exhaustion, the canister cannot be upgraded. The `skip_pre_upgrade` flag (an emergency escape hatch via the management canister's `install_code` API) bypasses the hook — but anything the hook would have saved is lost. Use stable structures so the upgrade path cannot brick itself under load. +When `pre_upgrade` traps due to instruction exhaustion, the canister cannot be upgraded. The `skip_pre_upgrade` flag (an emergency escape hatch via the management canister's `install_code` API — see [Management canister reference](../../reference/management-canister.md)) bypasses the hook — but anything the hook would have saved is lost. Use stable structures so the upgrade path cannot brick itself under load. ## Candid interface compatibility @@ -181,6 +181,7 @@ Always take a snapshot immediately before a risky upgrade. If the upgrade causes icp canister stop my-canister -e ic icp canister snapshot create my-canister -e ic # Note the snapshot ID printed in the output +icp canister start my-canister -e ic # 2. Deploy the upgrade icp deploy my-canister -e ic @@ -227,21 +228,19 @@ If you need to make an unsafe change, migrate the data in two upgrades: add the Rust stable structures use serialized bytes on disk. Schema evolution safety depends on the serialization format and versioning strategy. -**Adding fields safely with `serde(default)`:** +**Adding fields safely with Candid encoding:** ```rust +use candid::{CandidType, Decode, Deserialize, Encode}; use ic_stable_structures::storable::{Bound, Storable}; -use serde::{Deserialize, Serialize}; use std::borrow::Cow; -#[derive(Serialize, Deserialize, Clone)] +#[derive(CandidType, Deserialize, Clone)] struct UserV2 { id: u64, name: String, created: u64, - // New optional field — safe to add with serde default - // Old records without this field deserialize to None - #[serde(default)] + // New optional field — safe to add: old records deserialize with None email: Option, } @@ -252,20 +251,18 @@ impl Storable for UserV2 { const BOUND: Bound = Bound::Unbounded; fn to_bytes(&self) -> Cow<'_, [u8]> { - let mut buf = vec![]; - ciborium::into_writer(self, &mut buf).expect("encode failed"); - Cow::Owned(buf) + Cow::Owned(Encode!(self).expect("failed to encode UserV2")) } fn from_bytes(bytes: Cow<'_, [u8]>) -> Self { - ciborium::from_reader(bytes.as_ref()).expect("decode failed") + Decode!(&bytes, Self).expect("failed to decode UserV2") } } ``` **Rules:** -- Use `#[serde(default)]` on new optional fields so old records deserialize correctly +- Use `Option` for new fields — Candid deserializes absent fields as `None`, so old records remain readable after the upgrade - Use `Bound::Unbounded` unless you have a strict size requirement - Never reorder `MemoryId` allocations across upgrades — same effect as changing a field type - For breaking schema changes, use a versioned enum and migrate records lazily on read @@ -350,4 +347,4 @@ See [Access management](access-management.md) for detailed controller management - [Testing strategies](../testing/strategies.md) — test upgrade scenarios before deploying to mainnet - [Access management](access-management.md) — manage controllers and prevent lock-out - +