AtomicIP · fejilaup-cloud · May 30, 2026 · May 30, 2026
diff --git a/.github/workflows/ops-tests.yml b/.github/workflows/ops-tests.yml
@@ -0,0 +1,22 @@
+name: Ops Tests
+
+# Hermetic tests for operational tooling: disaster recovery (#558),
+# backup integrity (#559) and canary deployment (#560). They mock all
+# external services, so no secrets or live infrastructure are required.
+
+on:
+  push:
+    branches: [main]  # no path filter: every push to main runs the suite
+  pull_request:
+    paths:  # pull requests trigger the suite only when one of these paths changes
+      - "scripts/**"
+      - ".github/workflows/ops-tests.yml"
+
+jobs:
+  ops-tests:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Run operational test suites
+        run: bash scripts/run-ops-tests.sh  # invoked via bash, so no executable bit is needed
diff --git a/README.md b/README.md
@@ -118,6 +118,14 @@ Latest testnet deployment addresses are published in GitHub Actions deployment s
 - [API Request Deduplication](docs/api-request-deduplication.md) — Idempotency support (#531)
 - [API Batch Requests](docs/api-batch-requests.md) — Reduce round trips (#532)
 
+### Operations & Reliability
+- [Contract Upgrade Testing](docs/contract-upgrade-testing.md) — Upgrade compatibility & authorization tests (#557)
+- [Disaster Recovery Testing](docs/disaster-recovery-testing.md) — Automated backup→restore drill (#558)
+- [Backup Integrity Verification](docs/backup-integrity-verification.md) — Checksum & structural verification (#559)
+- [Canary Deployment Testing](docs/canary-deployment.md) — Health-gated rollout with auto-rollback (#560)
+- [Disaster Recovery Plan](docs/disaster-recovery-plan.md)
+- [Backup & Recovery Guide](docs/backup-recovery-guide.md)
+
 ### Additional Resources
 - [API Reference](docs/api-reference.md)
 - [Changelog Format](docs/changelog-format.md)

diff --git a/contracts/ip_registry/src/lib.rs b/contracts/ip_registry/src/lib.rs
@@ -28,6 +28,9 @@ mod differential_tests;
 #[cfg(test)]
 mod invariant_tests;
 
+#[cfg(test)]
+mod upgrade_tests;
+
 // ── Error Codes ────────────────────────────────────────────────────────────
 
 #[contracterror]

diff --git a/contracts/ip_registry/src/upgrade_tests.rs b/contracts/ip_registry/src/upgrade_tests.rs
@@ -0,0 +1,139 @@
+//! Contract upgrade compatibility tests (#557).
+//!
+//! These tests cover the upgrade-safety surface of the IP Registry contract:
+//!
+//! * `validate_upgrade` — the compatibility gate that must accept a well-formed
+//!   candidate WASM hash and reject an obviously invalid (zero) one. A zero hash
+//!   stands in for "no/garbage WASM" and must never be accepted.
+//! * State preservation — running the compatibility check must be a pure,
+//!   read-only operation: committed IP records and ID allocation are unchanged
+//!   by it. This is the property an operator relies on when validating a
+//!   candidate upgrade against live state.
+//! * Authorization — `upgrade` must refuse to run when no admin has been
+//!   established, so an un-initialized contract can never be upgraded by an
+//!   unauthorized caller.
+//!
+//! The successful `upgrade` path (`update_current_contract_wasm`) is exercised
+//! on-chain rather than here: it requires a genuinely installed WASM hash, which
+//! the unit-test host cannot provide. The compatibility and authorization logic
+//! that guards it is what these tests pin down.
+
+use crate::IpRecord;
+use soroban_sdk::contractclient;
+// Anonymous import: brings `Address::generate` into scope without a second name.
+use soroban_sdk::testutils::Address as _;
+use soroban_sdk::{Address, BytesN, Env};
+
+/// The slice of the contract interface these tests call. `#[contractclient]`
+/// generates `UpgradeTestClient` from it; the trait is otherwise unused here.
+#[contractclient(name = "UpgradeTestClient")]
+#[allow(dead_code)]
+pub trait UpgradeIface {
+    fn commit_ip(
+        env: Env,
+        owner: Address,
+        commitment_hash: BytesN<32>,
+        pow_difficulty: u32,
+    ) -> u64;
+    fn get_ip(env: Env, ip_id: u64) -> IpRecord;
+    fn validate_upgrade(env: Env, new_wasm_hash: BytesN<32>);
+    fn upgrade(env: Env, new_wasm_hash: BytesN<32>);
+}
+
+/// Registers a fresh `IpRegistry` in a default `Env` and returns a client for it.
+fn setup() -> (Env, UpgradeTestClient<'static>) {
+    let env = Env::default();
+    let contract_id = env.register(crate::IpRegistry, ());
+    let client = UpgradeTestClient::new(&env, &contract_id);
+    (env, client)
+}
+
+// ── validate_upgrade: acceptance ──────────────────────────────────────────
+
+/// A uniformly non-zero hash passes; returning without a panic is the assertion.
+#[test]
+fn validate_upgrade_accepts_typical_hash() {
+    let (env, client) = setup();
+    client.validate_upgrade(&BytesN::from_array(&env, &[1u8; 32]));
+}
+
+/// The all-`0xff` hash is accepted as well.
+#[test]
+fn validate_upgrade_accepts_all_ones_hash() {
+    let (env, client) = setup();
+    client.validate_upgrade(&BytesN::from_array(&env, &[0xffu8; 32]));
+}
+
+/// A single non-zero byte is enough for the hash to be accepted.
+#[test]
+fn validate_upgrade_accepts_single_nonzero_byte() {
+    let (env, client) = setup();
+    let mut bytes = [0u8; 32];
+    bytes[31] = 1; // smallest non-zero hash
+    client.validate_upgrade(&BytesN::from_array(&env, &bytes));
+}
+
+// ── validate_upgrade: rejection ───────────────────────────────────────────
+
+/// The all-zero hash must be refused with contract error #5.
+#[test]
+#[should_panic(expected = "Error(Contract, #5)")]
+fn validate_upgrade_rejects_zero_hash() {
+    let (env, client) = setup();
+    client.validate_upgrade(&BytesN::from_array(&env, &[0u8; 32]));
+}
+
+// ── validate_upgrade is repeatable / side-effect free ─────────────────────
+
+/// Calling the compatibility check repeatedly with one hash never panics.
+#[test]
+fn validate_upgrade_is_idempotent() {
+    let (env, client) = setup();
+    let hash = BytesN::from_array(&env, &[7u8; 32]);
+    for _ in 0..5 {
+        client.validate_upgrade(&hash);
+    }
+}
+
+// ── State preservation across the compatibility check ─────────────────────
+
+/// Committed records and ID allocation are unchanged by `validate_upgrade`.
+#[test]
+fn validate_upgrade_preserves_committed_state() {
+    let (env, client) = setup();
+    env.mock_all_auths();
+
+    let owner = Address::generate(&env);
+    let h1 = BytesN::from_array(&env, &[11u8; 32]);
+    let h2 = BytesN::from_array(&env, &[22u8; 32]);
+
+    let id1 = client.commit_ip(&owner, &h1, &0u32);
+    let id2 = client.commit_ip(&owner, &h2, &0u32);
+
+    // Run the upgrade compatibility gate against live state.
+    client.validate_upgrade(&BytesN::from_array(&env, &[9u8; 32]));
+
+    // Records and ID allocation must be untouched by the validation.
+    let r1 = client.get_ip(&id1);
+    let r2 = client.get_ip(&id2);
+    assert_eq!(r1.commitment_hash, h1);
+    assert_eq!(r2.commitment_hash, h2);
+    assert_eq!(r1.owner, owner);
+    assert_eq!(r2.owner, owner);
+
+    // The next allocated ID continues the sequence — no IDs were consumed.
+    let id3 = client.commit_ip(&owner, &BytesN::from_array(&env, &[33u8; 32]), &0u32);
+    assert_eq!(id3, id2 + 1);
+}
+
+// ── Authorization guard on upgrade ────────────────────────────────────────
+
+/// A fresh contract has never had `commit_ip` called, so no admin exists;
+/// `upgrade` must refuse rather than allow an unauthorized upgrade.
+// NOTE(review): same code (#5) as the zero-hash rejection — confirm that is intended.
+#[test]
+#[should_panic(expected = "Error(Contract, #5)")]
+fn upgrade_rejected_when_no_admin_initialized() {
+    let (env, client) = setup();
+    client.upgrade(&BytesN::from_array(&env, &[1u8; 32]));
+}
diff --git a/docs/backup-integrity-verification.md b/docs/backup-integrity-verification.md
@@ -0,0 +1,70 @@
+# Backup Integrity Verification (#559)
+
+## Overview
+
+Backups are only useful if they can actually be restored. `scripts/verify-backup-integrity.sh`
+checks that a backup archive is structurally sound and complete **before** it is
+relied upon for recovery, and `scripts/test-backup-integrity.sh` proves the verifier
+accepts good backups and rejects each failure class listed in the test matrix below.
+
+```bash
+# Verify a single backup
+./scripts/verify-backup-integrity.sh /var/backups/atomicip/backup_20260101_000000.tar.gz
+
+# Run the verifier's own test suite
+bash scripts/test-backup-integrity.sh
+```
+
+## What the verifier checks
+
+1. **SHA-256 checksum** — if a `<backup>.sha256` sidecar is present (written
+   automatically by `backup-contract-state.sh`), the archive's checksum must
+   match it. A mismatch means the file was altered or truncated.
+2. **Archive integrity** — `tar -tzf` must succeed (not corrupt/truncated).
+3. **Required files present** — `metadata.json`, `ip_registry_state.json` and
+   `atomic_swap_state.json` must all exist.
+4. **Valid JSON** — every `*.json` in the backup must parse.
+
+The script exits `0` only if every check passes; otherwise it exits `1` with a
+description of the failure.
+
+## Checksums
+
+`backup-contract-state.sh` now writes a checksum sidecar next to each archive:
+
+```
+backup_20260101_000000.tar.gz
+backup_20260101_000000.tar.gz.sha256
+```
+
+Both are uploaded to S3 when `BACKUP_S3_BUCKET` is configured. If no sidecar is
+present (e.g. a legacy backup), the checksum step is skipped with a note and the
+remaining structural checks still run — so the change is backward compatible.
+
+## Test matrix
+
+The suite in `scripts/test-backup-integrity.sh` covers:
+
+| # | Input | Expected |
+|---|-------|----------|
+| 1 | Well-formed backup | Pass |
+| 2 | Backup + matching checksum sidecar | Pass (checksum verified) |
+| 3 | Archive changed after checksum written | Fail (checksum mismatch) |
+| 4 | Corrupt/garbage archive | Fail (corruption) |
+| 5 | Missing required state file | Fail (missing file) |
+| 6 | Malformed JSON in a state file | Fail (invalid JSON) |
+| 7 | No file argument | Fail (usage) |
+| 8 | Nonexistent file | Fail (not found) |
+
+## Regular verification
+
+Integrity should be checked routinely, not only at restore time:
+
+- **On creation** — `backup-contract-state.sh` writes the checksum sidecar so
+  every backup is self-verifying.
+- **On a schedule** — run `verify-backup-integrity.sh` against the most recent
+  archive(s) from a cron job or scheduled CI workflow and alert on any non-zero
+  exit.
+- **Before restore** — `restore-contract-state.sh` and `activate-dr-site.sh`
+  should be preceded by a verification pass (the DR drill in
+  `docs/disaster-recovery-testing.md` enforces this ordering).
diff --git a/docs/canary-deployment.md b/docs/canary-deployment.md
@@ -0,0 +1,84 @@
+# Canary Deployment Testing (#560)
+
+## Overview
+
+A canary deployment rolls a new version out to a small slice of
+traffic/infrastructure first, health-checks it, and only promotes it to the full
+fleet if it stays healthy. If the canary fails, it is rolled back automatically.
+`scripts/canary-deploy.sh` implements this flow and `scripts/test-canary-deployment.sh`
+verifies the decision tree (promote vs. rollback) for every branch.
+
+```bash
+# Run a canary deployment (uses real hooks / defaults)
+./scripts/canary-deploy.sh
+
+# Run the canary logic test suite
+bash scripts/test-canary-deployment.sh
+```
+
+## Flow
+
+```
+deploy canary ──▶ health gate (N consecutive checks) ──┬─ healthy ─▶ promote ─▶ SUCCESS
+                                                        └─ unhealthy ─▶ rollback ─▶ FAIL (exit 1)
+```
+
+1. **Deploy** the new version to the canary slot. If deploy fails → rollback, exit 1.
+2. **Health gate**: require `HEALTH_RETRIES` consecutive successful checks,
+   `HEALTH_INTERVAL` seconds apart. Any failed check → rollback, exit 1.
+3. **Promote** the canary to the full fleet. If promotion fails → rollback, exit 1.
+4. On full success, exit 0.
+
+Exit code `0` means promoted; `1` means failed-and-rolled-back, so CI/CD can
+gate the rollout on it.
+
+## Pluggable hooks
+
+The four side-effecting steps are command hooks, overridable via the
+environment. This lets the same script run against real infrastructure in
+production and against mocks in tests:
+
+| Variable | Purpose | Default |
+|----------|---------|---------|
+| `CANARY_DEPLOY_CMD` | Deploy the new version to the canary slot | `stellar contract deploy --network $NETWORK` |
+| `CANARY_HEALTH_CMD` | Return 0 if the canary is healthy | `curl -sf $CANARY_HEALTH_URL` |
+| `CANARY_PROMOTE_CMD` | Promote canary to the full fleet | `echo Promoting canary to full fleet` |
+| `CANARY_ROLLBACK_CMD` | Tear down the canary / restore previous version | `echo Rolling back canary` |
+
+Tuning knobs:
+
+| Variable | Purpose | Default |
+|----------|---------|---------|
+| `HEALTH_RETRIES` | Consecutive successful checks required | `3` |
+| `HEALTH_INTERVAL` | Seconds between checks (set `0` in tests) | `5` |
+| `CANARY_HEALTH_URL` | Default health endpoint | `https://canary.atomicip.io/health` |
+| `NETWORK` | Target network label | `testnet` |
+
+## Example
+
+```bash
+export NETWORK=testnet
+export CANARY_DEPLOY_CMD="./scripts/deploy.sh --network testnet --canary"
+export CANARY_HEALTH_CMD="./scripts/smoke-test.sh"
+export CANARY_PROMOTE_CMD="./scripts/promote-canary.sh"
+export CANARY_ROLLBACK_CMD="./scripts/rollback.sh"
+./scripts/canary-deploy.sh
+```
+
+## Test matrix
+
+The suite in `scripts/test-canary-deployment.sh` drives the script through every
+branch by injecting hook commands:
+
+| # | Scenario | Expected |
+|---|----------|----------|
+| 1 | Healthy canary | Promoted, exit 0, rollback never runs |
+| 2 | Unhealthy canary | Rolled back, exit 1, promotion never runs |
+| 3 | Deploy step fails | Rolled back, exit 1, health checks skipped |
+| 4 | Canary fails on the 2nd probe | Rolled back on first failed check, exit 1 |
+| 5 | Promotion step fails | Rolled back, exit 1 |
+
+## CI integration
+
+`.github/workflows/ops-tests.yml` runs this suite (via `scripts/run-ops-tests.sh`)
+on every push to `main` and on pull requests that touch `scripts/` or the workflow file.