diff --git a/.github/workflows/ops-tests.yml b/.github/workflows/ops-tests.yml new file mode 100644 index 0000000..0cfbe17 --- /dev/null +++ b/.github/workflows/ops-tests.yml @@ -0,0 +1,22 @@ +name: Ops Tests + +# Hermetic tests for operational tooling: disaster recovery (#558), +# backup integrity (#559) and canary deployment (#560). They mock all +# external services, so no secrets or live infrastructure are required. + +on: + push: + branches: [main] + pull_request: + paths: + - "scripts/**" + - ".github/workflows/ops-tests.yml" + +jobs: + ops-tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Run operational test suites + run: bash scripts/run-ops-tests.sh diff --git a/README.md b/README.md index 68e36bd..a6a07d3 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,14 @@ Latest testnet deployment addresses are published in GitHub Actions deployment s - [API Request Deduplication](docs/api-request-deduplication.md) — Idempotency support (#531) - [API Batch Requests](docs/api-batch-requests.md) — Reduce round trips (#532) +### Operations & Reliability +- [Contract Upgrade Testing](docs/contract-upgrade-testing.md) — Upgrade compatibility & authorization tests (#557) +- [Disaster Recovery Testing](docs/disaster-recovery-testing.md) — Automated backup→restore drill (#558) +- [Backup Integrity Verification](docs/backup-integrity-verification.md) — Checksum & structural verification (#559) +- [Canary Deployment Testing](docs/canary-deployment.md) — Health-gated rollout with auto-rollback (#560) +- [Disaster Recovery Plan](docs/disaster-recovery-plan.md) +- [Backup & Recovery Guide](docs/backup-recovery-guide.md) + ### Additional Resources - [API Reference](docs/api-reference.md) - [Changelog Format](docs/changelog-format.md) diff --git a/contracts/ip_registry/src/lib.rs b/contracts/ip_registry/src/lib.rs index 9aaca48..3c07500 100644 --- a/contracts/ip_registry/src/lib.rs +++ b/contracts/ip_registry/src/lib.rs @@ -28,6 +28,9 @@ mod 
differential_tests; #[cfg(test)] mod invariant_tests; +#[cfg(test)] +mod upgrade_tests; + // ── Error Codes ──────────────────────────────────────────────────────────── #[contracterror] diff --git a/contracts/ip_registry/src/upgrade_tests.rs b/contracts/ip_registry/src/upgrade_tests.rs new file mode 100644 index 0000000..09a8b83 --- /dev/null +++ b/contracts/ip_registry/src/upgrade_tests.rs @@ -0,0 +1,139 @@ +//! Contract upgrade compatibility tests (#557). +//! +//! These tests cover the upgrade-safety surface of the IP Registry contract: +//! +//! * `validate_upgrade` — the compatibility gate that must accept a well-formed +//! candidate WASM hash and reject an obviously invalid (zero) one. A zero hash +//! stands in for "no/garbage WASM" and must never be accepted. +//! * State preservation — running the compatibility check must be a pure, +//! read-only operation: committed IP records and ID allocation are unchanged +//! by it. This is the property an operator relies on when validating a +//! candidate upgrade against live state. +//! * Authorization — `upgrade` must refuse to run when no admin has been +//! established, so an un-initialized contract can never be upgraded by an +//! unauthorized caller. +//! +//! The successful `upgrade` path (`update_current_contract_wasm`) is exercised +//! on-chain rather than here: it requires a genuinely installed WASM hash, which +//! the unit-test host cannot provide. The compatibility and authorization logic +//! that guards it is what these tests pin down. 
+ +#[cfg(test)] +mod upgrade_tests { + use crate::IpRecord; + use soroban_sdk::contractclient; + use soroban_sdk::testutils::Address as TestAddress; + use soroban_sdk::{Address, BytesN, Env}; + + #[contractclient(name = "UpgradeTestClient")] + #[allow(dead_code)] + pub trait UpgradeIface { + fn commit_ip( + env: Env, + owner: Address, + commitment_hash: BytesN<32>, + pow_difficulty: u32, + ) -> u64; + fn get_ip(env: Env, ip_id: u64) -> IpRecord; + fn validate_upgrade(env: Env, new_wasm_hash: BytesN<32>); + fn upgrade(env: Env, new_wasm_hash: BytesN<32>); + } + + fn setup() -> (Env, UpgradeTestClient<'static>) { + let env = Env::default(); + let contract_id = env.register(crate::IpRegistry, ()); + let client = UpgradeTestClient::new(&env, &contract_id); + (env, client) + } + + // ── validate_upgrade: acceptance ────────────────────────────────────────── + + #[test] + fn validate_upgrade_accepts_typical_hash() { + let (env, client) = setup(); + let hash = BytesN::from_array(&env, &[1u8; 32]); + // Must not panic. 
+ client.validate_upgrade(&hash); + } + + #[test] + fn validate_upgrade_accepts_all_ones_hash() { + let (env, client) = setup(); + let hash = BytesN::from_array(&env, &[0xffu8; 32]); + client.validate_upgrade(&hash); + } + + #[test] + fn validate_upgrade_accepts_single_nonzero_byte() { + let (env, client) = setup(); + let mut bytes = [0u8; 32]; + bytes[31] = 1; // smallest non-zero hash + let hash = BytesN::from_array(&env, &bytes); + client.validate_upgrade(&hash); + } + + // ── validate_upgrade: rejection ─────────────────────────────────────────── + + #[test] + #[should_panic(expected = "Error(Contract, #5)")] + fn validate_upgrade_rejects_zero_hash() { + let (env, client) = setup(); + let zero = BytesN::from_array(&env, &[0u8; 32]); + client.validate_upgrade(&zero); + } + + // ── validate_upgrade is repeatable / side-effect free ───────────────────── + + #[test] + fn validate_upgrade_is_idempotent() { + let (env, client) = setup(); + let hash = BytesN::from_array(&env, &[7u8; 32]); + // Calling the compatibility check repeatedly is always safe. + for _ in 0..5 { + client.validate_upgrade(&hash); + } + } + + // ── State preservation across the compatibility check ───────────────────── + + #[test] + fn validate_upgrade_preserves_committed_state() { + let (env, client) = setup(); + env.mock_all_auths(); + + let owner =
::generate(&env); + let h1 = BytesN::from_array(&env, &[11u8; 32]); + let h2 = BytesN::from_array(&env, &[22u8; 32]); + + let id1 = client.commit_ip(&owner, &h1, &0u32); + let id2 = client.commit_ip(&owner, &h2, &0u32); + + // Run the upgrade compatibility gate against live state. + let candidate = BytesN::from_array(&env, &[9u8; 32]); + client.validate_upgrade(&candidate); + + // Records and ID allocation must be untouched by the validation. + let r1 = client.get_ip(&id1); + let r2 = client.get_ip(&id2); + assert_eq!(r1.commitment_hash, h1); + assert_eq!(r2.commitment_hash, h2); + assert_eq!(r1.owner, owner); + assert_eq!(r2.owner, owner); + + // The next allocated ID continues the sequence — no IDs were consumed. + let id3 = client.commit_ip(&owner, &BytesN::from_array(&env, &[33u8; 32]), &0u32); + assert_eq!(id3, id2 + 1); + } + + // ── Authorization guard on upgrade ──────────────────────────────────────── + + #[test] + #[should_panic(expected = "Error(Contract, #5)")] + fn upgrade_rejected_when_no_admin_initialized() { + // A fresh contract has never had `commit_ip` called, so no admin exists. + // `upgrade` must refuse rather than allow an unauthorized upgrade. + let (env, client) = setup(); + let hash = BytesN::from_array(&env, &[1u8; 32]); + client.upgrade(&hash); + } +} diff --git a/docs/backup-integrity-verification.md b/docs/backup-integrity-verification.md new file mode 100644 index 0000000..95275e4 --- /dev/null +++ b/docs/backup-integrity-verification.md @@ -0,0 +1,70 @@ +# Backup Integrity Verification (#559) + +## Overview + +Backups are only useful if they can actually be restored. `verify-backup-integrity.sh` +checks that a backup archive is structurally sound and complete **before** it is +relied upon for recovery, and `scripts/test-backup-integrity.sh` proves the +verifier accepts good backups and rejects every class of corruption. 
+ +```bash +# Verify a single backup +./scripts/verify-backup-integrity.sh /var/backups/atomicip/backup_20260101_000000.tar.gz + +# Run the verifier's own test suite +bash scripts/test-backup-integrity.sh +``` + +## What the verifier checks + +1. **SHA-256 checksum** — if a `.sha256` sidecar is present (written + automatically by `backup-contract-state.sh`), the archive's checksum must + match it. A mismatch means the file was altered or truncated. +2. **Archive integrity** — `tar -tzf` must succeed (not corrupt/truncated). +3. **Required files present** — `metadata.json`, `ip_registry_state.json` and + `atomic_swap_state.json` must all exist. +4. **Valid JSON** — every `*.json` in the backup must parse. + +The script exits `0` only if every check passes; otherwise it exits `1` with a +description of the failure. + +## Checksums + +`backup-contract-state.sh` now writes a checksum sidecar next to each archive: + +``` +backup_20260101_000000.tar.gz +backup_20260101_000000.tar.gz.sha256 +``` + +Both are uploaded to S3 when `BACKUP_S3_BUCKET` is configured. If no sidecar is +present (e.g. a legacy backup), the checksum step is skipped with a note and the +remaining structural checks still run — so the change is backward compatible. 
+ +## Test matrix + +The suite in `scripts/test-backup-integrity.sh` covers: + +| # | Input | Expected | +|---|-------|----------| +| 1 | Well-formed backup | Pass | +| 2 | Backup + matching checksum sidecar | Pass (checksum verified) | +| 3 | Archive changed after checksum written | Fail (checksum mismatch) | +| 4 | Corrupt/garbage archive | Fail (corruption) | +| 5 | Missing required state file | Fail (missing file) | +| 6 | Malformed JSON in a state file | Fail (invalid JSON) | +| 7 | No file argument | Fail (usage) | +| 8 | Nonexistent file | Fail (not found) | + +## Regular verification + +Integrity should be checked routinely, not only at restore time: + +- **On creation** — `backup-contract-state.sh` writes the checksum sidecar so + every backup is self-verifying. +- **On a schedule** — run `verify-backup-integrity.sh` against the most recent + archive(s) from a cron job or scheduled CI workflow and alert on any non-zero + exit. +- **Before restore** — `restore-contract-state.sh` and `activate-dr-site.sh` + should be preceded by a verification pass (the DR drill in + `docs/disaster-recovery-testing.md` enforces this ordering). diff --git a/docs/canary-deployment.md b/docs/canary-deployment.md new file mode 100644 index 0000000..431042b --- /dev/null +++ b/docs/canary-deployment.md @@ -0,0 +1,84 @@ +# Canary Deployment Testing (#560) + +## Overview + +A canary deployment rolls a new version out to a small slice of +traffic/infrastructure first, health-checks it, and only promotes it to the full +fleet if it stays healthy. If the canary fails, it is rolled back automatically. +`scripts/canary-deploy.sh` implements this flow and `scripts/test-canary-deployment.sh` +verifies the decision tree (promote vs. rollback) for every branch. 
+ +```bash +# Run a canary deployment (uses real hooks / defaults) +./scripts/canary-deploy.sh + +# Run the canary logic test suite +bash scripts/test-canary-deployment.sh +``` + +## Flow + +``` +deploy canary ──▶ health gate (N consecutive checks) ──┬─ healthy ─▶ promote ─▶ SUCCESS + └─ unhealthy ─▶ rollback ─▶ FAIL (exit 1) +``` + +1. **Deploy** the new version to the canary slot. If deploy fails → rollback, exit 1. +2. **Health gate**: require `HEALTH_RETRIES` consecutive successful checks, + `HEALTH_INTERVAL` seconds apart. Any failed check → rollback, exit 1. +3. **Promote** the canary to the full fleet. If promotion fails → rollback, exit 1. +4. On full success, exit 0. + +Exit code `0` means promoted; `1` means failed-and-rolled-back, so CI/CD can +gate the rollout on it. + +## Pluggable hooks + +The four side-effecting steps are command hooks, overridable via the +environment. This lets the same script run against real infrastructure in +production and against mocks in tests: + +| Variable | Purpose | Default | +|----------|---------|---------| +| `CANARY_DEPLOY_CMD` | Deploy the new version to the canary slot | `stellar contract deploy --network $NETWORK` | +| `CANARY_HEALTH_CMD` | Return 0 if the canary is healthy | `curl -sf $CANARY_HEALTH_URL` | +| `CANARY_PROMOTE_CMD` | Promote canary to the full fleet | `echo Promoting canary to full fleet` | +| `CANARY_ROLLBACK_CMD` | Tear down the canary / restore previous version | `echo Rolling back canary` | + +Tuning knobs: + +| Variable | Purpose | Default | +|----------|---------|---------| +| `HEALTH_RETRIES` | Consecutive successful checks required | `3` | +| `HEALTH_INTERVAL` | Seconds between checks (set `0` in tests) | `5` | +| `CANARY_HEALTH_URL` | Default health endpoint | `https://canary.atomicip.io/health` | +| `NETWORK` | Target network label | `testnet` | + +## Example + +```bash +export NETWORK=testnet +export CANARY_DEPLOY_CMD="./scripts/deploy.sh --network testnet --canary" +export 
CANARY_HEALTH_CMD="./scripts/smoke-test.sh" +export CANARY_PROMOTE_CMD="./scripts/promote-canary.sh" +export CANARY_ROLLBACK_CMD="./scripts/rollback.sh" +./scripts/canary-deploy.sh +``` + +## Test matrix + +The suite in `scripts/test-canary-deployment.sh` drives the script through every +branch by injecting hook commands: + +| # | Scenario | Expected | +|---|----------|----------| +| 1 | Healthy canary | Promoted, exit 0, rollback never runs | +| 2 | Unhealthy canary | Rolled back, exit 1, promotion never runs | +| 3 | Deploy step fails | Rolled back, exit 1, health checks skipped | +| 4 | Canary fails on the 2nd probe | Rolled back on first failed check | +| 5 | Promotion step fails | Rolled back, exit 1 | + +## CI integration + +`.github/workflows/ops-tests.yml` runs this suite (via +`scripts/run-ops-tests.sh`) on every change under `scripts/`. diff --git a/docs/contract-upgrade-testing.md b/docs/contract-upgrade-testing.md new file mode 100644 index 0000000..f54234c --- /dev/null +++ b/docs/contract-upgrade-testing.md @@ -0,0 +1,64 @@ +# Contract Upgrade Testing (#557) + +## Overview + +Soroban contracts are upgraded in place via +`env.deployer().update_current_contract_wasm(new_wasm_hash)`. Because the same +persistent storage survives the swap, a new WASM that drops a storage key, +removes a function, or renumbers an error code can silently corrupt live data. +These tests pin down the **upgrade-safety surface** of the IP Registry contract +so an incompatible or unauthorized upgrade is rejected before it can run. 
+ +```bash +cargo test -p ip_registry upgrade_tests +``` + +## What is tested + +The suite lives in `contracts/ip_registry/src/upgrade_tests.rs` and exercises +the two guards that protect an upgrade: + +| Test | Property | Expected outcome | +|------|----------|------------------| +| `validate_upgrade_accepts_typical_hash` | A well-formed candidate WASM hash is accepted | No panic | +| `validate_upgrade_accepts_all_ones_hash` | Boundary hash (`0xff…`) is accepted | No panic | +| `validate_upgrade_accepts_single_nonzero_byte` | Smallest non-zero hash is accepted | No panic | +| `validate_upgrade_rejects_zero_hash` | The zero hash (no/garbage WASM) is rejected | Panics `#5 UnauthorizedUpgrade` | +| `validate_upgrade_is_idempotent` | The check is repeatable | No panic across repeated calls | +| `validate_upgrade_preserves_committed_state` | Compatibility validation is read-only | Committed records and ID allocation unchanged | +| `upgrade_rejected_when_no_admin_initialized` | Upgrade requires an established admin | Panics `#5 UnauthorizedUpgrade` | + +## Upgrade-compatibility contract + +The following must **not** change across an upgrade, or committed IP records +become unreadable: + +| Category | Rule | +|----------|------| +| Storage keys | `DataKey` variants in use (e.g. `IpRecord`, `Admin`, `NextId`) must be preserved | +| Function names | Existing exported functions must remain callable | +| Error codes | `ContractError` discriminants must be stable (e.g. `UnauthorizedUpgrade = 5`) | +| Record layout | `IpRecord` fields must remain backward-compatible | + +`validate_upgrade` is the on-chain gate for this contract. It currently rejects +an obviously invalid (zero) WASM hash; richer manifest comparison (enumerating +functions, storage keys and error codes) is tracked as a TODO in +`contracts/ip_registry/src/lib.rs`. 
+ +## What is intentionally *not* unit-tested + +The successful `upgrade` path calls `update_current_contract_wasm`, which +requires a genuinely **installed** WASM hash. The unit-test host cannot install +a second contract WASM, so the success path is validated on testnet during a +real deploy (see `docs/deployment-guide.md`) rather than in unit tests. The +compatibility check and the admin-authorization guard that protect that call +are what these tests cover. + +## Adding new upgrade tests + +1. Add a `#[test]` to the `upgrade_tests` module. +2. For rejection paths, assert the specific error with + `#[should_panic(expected = "Error(Contract, #5)")]`. +3. For state-preservation properties, commit records, run the operation under + test, then assert records and `NextId` allocation are unchanged. +4. Document the new case in the table above. diff --git a/docs/disaster-recovery-testing.md b/docs/disaster-recovery-testing.md new file mode 100644 index 0000000..43bb6c5 --- /dev/null +++ b/docs/disaster-recovery-testing.md @@ -0,0 +1,75 @@ +# Disaster Recovery Testing (#558) + +## Overview + +A disaster recovery plan is only credible if it is exercised. `scripts/test-disaster-recovery.sh` +runs the full recovery chain — **back up → verify → restore → re-verify +services** — automatically, in a hermetic sandbox. External dependencies +(Stellar RPC, AWS S3, the live API, Postgres) are replaced by mocks, so the +drill runs anywhere without touching production infrastructure. + +It complements the human-facing runbook in `docs/disaster-recovery-plan.md`, +turning the documented procedure into something CI can gate on. 
+ +```bash +bash scripts/test-disaster-recovery.sh +``` + +## Recovery chain under test + +``` +backup-contract-state.sh ─▶ verify-backup-integrity.sh ─▶ restore-contract-state.sh ─▶ verify-all-services.sh +``` + +## Scenarios + +| # | Scenario | Expected outcome | +|---|----------|------------------| +| 1 | Back up contract state | Backup archive + checksum sidecar produced | +| 2 | Verify the fresh backup | Integrity check passes | +| 3 | Restore from backup (non-interactive) | Restore completes | +| 4 | Verify all services post-recovery | API, contracts and DB healthy | +| 5 | Corrupt/truncated backup | Rejected before restore | +| 6 | Backup missing required state | Rejected before restore | + +## How the sandbox works + +The suite sources `scripts/tests/lib.sh`, which: + +- creates a throwaway working directory and a `mock-bin` directory prepended to `PATH`; +- installs mock `stellar-cli`, `aws`, `curl` and `pg_isready` commands that + return controlled output; +- provides assertion helpers (`run_case`, `assert_contains`, …) and a + `build_valid_backup` fixture. + +Because the recovery scripts only ever call the mocked binaries, no real +Stellar transaction, S3 upload or HTTP request is made. + +## Recovery objectives + +These are defined in `docs/disaster-recovery-plan.md` and the drill validates +that the mechanical steps behind them work: + +| Objective | Target | +|-----------|--------| +| RTO (Recovery Time Objective) | 4 hours | +| RPO (Recovery Point Objective) | 1 hour | +| MTD (Maximum Tolerable Downtime) | 24 hours | + +## CI integration + +`.github/workflows/ops-tests.yml` runs this suite (via +`scripts/run-ops-tests.sh`) on every change under `scripts/`, so a regression +in any recovery script fails the build. + +## Running a real drill + +In a staging environment with real credentials, run the underlying scripts +directly instead of the test harness: + +```bash +export IP_REGISTRY_CONTRACT_ID=... ATOMIC_SWAP_CONTRACT_ID=... 
NETWORK=testnet +./scripts/backup-contract-state.sh +./scripts/verify-backup-integrity.sh /var/backups/atomicip/backup_*.tar.gz +./scripts/activate-dr-site.sh # full DR-site activation +``` diff --git a/scripts/backup-contract-state.sh b/scripts/backup-contract-state.sh index 4d72021..b208067 100755 --- a/scripts/backup-contract-state.sh +++ b/scripts/backup-contract-state.sh @@ -55,12 +55,19 @@ echo "Compressing backup..." tar -czf "$BACKUP_DIR/backup_$TIMESTAMP.tar.gz" -C "$BACKUP_DIR" "$TIMESTAMP" rm -rf "$BACKUP_DIR/$TIMESTAMP" -# Upload to remote storage if configured +# Write a SHA-256 checksum sidecar so integrity can be verified later (#559). +echo "Writing checksum sidecar..." +( cd "$BACKUP_DIR" && sha256sum "backup_$TIMESTAMP.tar.gz" > "backup_$TIMESTAMP.tar.gz.sha256" ) + +# Upload to remote storage if configured (archive + checksum sidecar) if [ -n "$BACKUP_S3_BUCKET" ]; then echo "Uploading to S3..." aws s3 cp "$BACKUP_DIR/backup_$TIMESTAMP.tar.gz" \ "s3://$BACKUP_S3_BUCKET/$NETWORK/" \ --storage-class STANDARD_IA + aws s3 cp "$BACKUP_DIR/backup_$TIMESTAMP.tar.gz.sha256" \ + "s3://$BACKUP_S3_BUCKET/$NETWORK/" \ + --storage-class STANDARD_IA fi echo "Backup completed: backup_$TIMESTAMP.tar.gz" diff --git a/scripts/canary-deploy.sh b/scripts/canary-deploy.sh new file mode 100755 index 0000000..ff93708 --- /dev/null +++ b/scripts/canary-deploy.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# scripts/canary-deploy.sh +# Canary deployment with automated health gating and rollback (#560). +# +# A canary release is rolled out to a small slice of traffic/infrastructure +# first, health-checked, and only promoted to the full fleet if it stays +# healthy. If the canary fails its health checks it is rolled back automatically +# and the script exits non-zero so CI/CD halts the rollout. 
+# +# The four side-effecting steps are pluggable command hooks so the script can be +# driven against real infrastructure in production and against mocks in tests: +# +# CANARY_DEPLOY_CMD deploy the new version to the canary slot +# CANARY_HEALTH_CMD return 0 if the canary is healthy, non-zero otherwise +# CANARY_PROMOTE_CMD promote the canary to the full fleet +# CANARY_ROLLBACK_CMD tear the canary down / restore the previous version +# +# Tuning knobs: +# HEALTH_RETRIES number of consecutive successful checks required (default 3) +# HEALTH_INTERVAL seconds to sleep between checks (default 5; set 0 in tests) +# NETWORK target network label, informational (default testnet) +# +# Exit codes: 0 = canary healthy and promoted; 1 = failed and rolled back. + +set -uo pipefail + +NETWORK="${NETWORK:-testnet}" +HEALTH_RETRIES="${HEALTH_RETRIES:-3}" +HEALTH_INTERVAL="${HEALTH_INTERVAL:-5}" +CANARY_HEALTH_URL="${CANARY_HEALTH_URL:-https://canary.atomicip.io/health}" + +# Default hooks. Each may be overridden via the environment. +CANARY_DEPLOY_CMD="${CANARY_DEPLOY_CMD:-stellar contract deploy --network $NETWORK}" +CANARY_HEALTH_CMD="${CANARY_HEALTH_CMD:-curl -sf $CANARY_HEALTH_URL}" +CANARY_PROMOTE_CMD="${CANARY_PROMOTE_CMD:-echo Promoting canary to full fleet}" +CANARY_ROLLBACK_CMD="${CANARY_ROLLBACK_CMD:-echo Rolling back canary}" + +echo "=== Canary Deployment ($NETWORK) ===" +echo "Health gate: $HEALTH_RETRIES consecutive checks, ${HEALTH_INTERVAL}s apart" +echo "" + +rollback() { + echo "" + echo "CANARY: rolling back" + if eval "$CANARY_ROLLBACK_CMD"; then + echo "✓ Rollback complete" + else + echo "✗ Rollback command failed — MANUAL INTERVENTION REQUIRED" + fi + echo "=== CANARY DEPLOYMENT FAILED — ROLLED BACK ===" +} + +# ── Step 1: deploy the canary ───────────────────────────────────────────────── +echo "Step 1: Deploying canary..." +if ! 
eval "$CANARY_DEPLOY_CMD"; then + echo "✗ Canary deployment failed" + rollback + exit 1 +fi +echo "✓ Canary deployed" + +# ── Step 2: health gate ─────────────────────────────────────────────────────── +echo "" +echo "Step 2: Health-checking canary..." +attempt=0 +while [ "$attempt" -lt "$HEALTH_RETRIES" ]; do + attempt=$((attempt + 1)) + if eval "$CANARY_HEALTH_CMD" > /dev/null 2>&1; then + echo "✓ Health check $attempt/$HEALTH_RETRIES passed" + else + echo "✗ Health check $attempt/$HEALTH_RETRIES failed" + rollback + exit 1 + fi + if [ "$attempt" -lt "$HEALTH_RETRIES" ] && [ "$HEALTH_INTERVAL" -gt 0 ]; then + sleep "$HEALTH_INTERVAL" + fi +done + +# ── Step 3: promote ─────────────────────────────────────────────────────────── +echo "" +echo "Step 3: Canary healthy — promoting to full fleet..." +if ! eval "$CANARY_PROMOTE_CMD"; then + echo "✗ Promotion failed" + rollback + exit 1 +fi +echo "✓ Promotion complete" +echo "" +echo "=== CANARY DEPLOYMENT SUCCESSFUL ===" diff --git a/scripts/run-ops-tests.sh b/scripts/run-ops-tests.sh new file mode 100755 index 0000000..cfd3292 --- /dev/null +++ b/scripts/run-ops-tests.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# scripts/run-ops-tests.sh +# Run all operational shell-script test suites (#558, #559, #560). +# +# These suites are hermetic — they mock Stellar, AWS, the API and Postgres — so +# they are safe to run locally and in CI without any infrastructure. + +set -uo pipefail +DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +SUITES=( + "test-disaster-recovery.sh" + "test-backup-integrity.sh" + "test-canary-deployment.sh" +) + +FAILED=() +for suite in "${SUITES[@]}"; do + echo "" + echo "###########################################################" + echo "# $suite" + echo "###########################################################" + if ! 
bash "$DIR/$suite"; then + FAILED+=("$suite") + fi +done + +echo "" +echo "===========================================================" +if [ "${#FAILED[@]}" -gt 0 ]; then + echo "OPS TESTS FAILED: ${FAILED[*]}" + exit 1 +fi +echo "ALL OPS TEST SUITES PASSED" diff --git a/scripts/test-backup-integrity.sh b/scripts/test-backup-integrity.sh new file mode 100755 index 0000000..5468d45 --- /dev/null +++ b/scripts/test-backup-integrity.sh @@ -0,0 +1,90 @@ +#!/usr/bin/env bash +# scripts/test-backup-integrity.sh +# Tests for backup integrity verification (#559). +# +# Exercises scripts/verify-backup-integrity.sh against a matrix of healthy and +# damaged backups to prove the verifier accepts good archives and rejects every +# class of corruption: bad archive, missing state, malformed JSON, and a +# checksum that does not match its sidecar. + +set -uo pipefail +source "$(dirname "${BASH_SOURCE[0]}")/tests/lib.sh" + +VERIFY="$SCRIPTS_DIR/verify-backup-integrity.sh" + +echo "=== Backup Integrity Verification (#559) ===" + +setup_sandbox +trap teardown_sandbox EXIT + +# ── Case 1: a well-formed backup passes ─────────────────────────────────────── +echo "" +echo "Case 1: Valid backup" +GOOD="$SANDBOX/good.tar.gz" +build_valid_backup "$GOOD" +run_case "valid backup passes" 0 bash "$VERIFY" "$GOOD" +assert_contains "reports success" "Backup verification passed" + +# ── Case 2: matching checksum sidecar passes ────────────────────────────────── +echo "" +echo "Case 2: Valid backup with matching checksum sidecar" +sha256sum "$GOOD" > "$GOOD.sha256" +run_case "matching checksum passes" 0 bash "$VERIFY" "$GOOD" +assert_contains "checksum verified" "Checksum matches" + +# ── Case 3: tampered archive with stale checksum fails ──────────────────────── +echo "" +echo "Case 3: Checksum mismatch (archive changed after checksum written)" +TAMPERED="$SANDBOX/tampered.tar.gz" +build_valid_backup "$TAMPERED" "20260202_000000" +sha256sum "$GOOD" > "$TAMPERED.sha256" # sidecar belongs to a 
different archive +run_case "checksum mismatch fails" 1 bash "$VERIFY" "$TAMPERED" +assert_contains "mismatch reported" "Checksum mismatch" + +# ── Case 4: corrupt archive fails ───────────────────────────────────────────── +echo "" +echo "Case 4: Corrupt archive" +CORRUPT="$SANDBOX/corrupt.tar.gz" +head -c 512 /dev/urandom > "$CORRUPT" +run_case "corrupt archive fails" 1 bash "$VERIFY" "$CORRUPT" +assert_contains "corruption reported" "corrupted" + +# ── Case 5: missing required state file fails ───────────────────────────────── +echo "" +echo "Case 5: Missing required state file" +MISSING_STAGE="$SANDBOX/missing/20260303_000000" +mkdir -p "$MISSING_STAGE" +echo '{"timestamp":"x"}' > "$MISSING_STAGE/metadata.json" +echo '[]' > "$MISSING_STAGE/ip_registry_state.json" +# atomic_swap_state.json intentionally omitted +MISSING="$SANDBOX/missing.tar.gz" +tar -czf "$MISSING" -C "$SANDBOX/missing" "20260303_000000" +run_case "missing state file fails" 1 bash "$VERIFY" "$MISSING" +assert_contains "missing file reported" "(missing)" + +# ── Case 6: malformed JSON fails ────────────────────────────────────────────── +echo "" +echo "Case 6: Malformed JSON in state file" +BADJSON_STAGE="$SANDBOX/badjson/20260404_000000" +mkdir -p "$BADJSON_STAGE" +echo '{"timestamp":"x"}' > "$BADJSON_STAGE/metadata.json" +echo 'not valid json {' > "$BADJSON_STAGE/ip_registry_state.json" +echo '[]' > "$BADJSON_STAGE/atomic_swap_state.json" +BADJSON="$SANDBOX/badjson.tar.gz" +tar -czf "$BADJSON" -C "$SANDBOX/badjson" "20260404_000000" +run_case "malformed JSON fails" 1 bash "$VERIFY" "$BADJSON" +assert_contains "invalid JSON reported" "invalid JSON" + +# ── Case 7: missing file argument fails with usage ──────────────────────────── +echo "" +echo "Case 7: No argument" +run_case "missing argument fails" 1 bash "$VERIFY" +assert_contains "prints usage" "Usage:" + +# ── Case 8: nonexistent file fails ──────────────────────────────────────────── +echo "" +echo "Case 8: Nonexistent file" +run_case 
"nonexistent file fails" 1 bash "$VERIFY" "$SANDBOX/does-not-exist.tar.gz" +assert_contains "not-found reported" "not found" + +finish_suite diff --git a/scripts/test-canary-deployment.sh b/scripts/test-canary-deployment.sh new file mode 100755 index 0000000..562580a --- /dev/null +++ b/scripts/test-canary-deployment.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# scripts/test-canary-deployment.sh +# Tests for canary deployment with health gating and rollback (#560). +# +# Drives scripts/canary-deploy.sh through its decision tree by injecting hook +# commands, and asserts the right terminal state (promote vs. rollback) for each: +# 1. Healthy canary → promoted, exits 0 +# 2. Unhealthy canary → rolled back, exits 1 +# 3. Deploy step fails → rolled back, exits 1, never health-checks +# 4. Flaky-then-failing canary→ rolled back on first failed check +# 5. Promotion step fails → rolled back, exits 1 + +set -uo pipefail +source "$(dirname "${BASH_SOURCE[0]}")/tests/lib.sh" + +CANARY="$SCRIPTS_DIR/canary-deploy.sh" + +echo "=== Canary Deployment Testing (#560) ===" + +setup_sandbox +trap teardown_sandbox EXIT + +# No real sleeps between checks. +export HEALTH_INTERVAL=0 +export HEALTH_RETRIES=3 + +# ── Case 1: healthy canary is promoted ──────────────────────────────────────── +echo "" +echo "Case 1: Healthy canary → promote" +run_case "healthy canary exits 0" 0 env \ + CANARY_DEPLOY_CMD="true" \ + CANARY_HEALTH_CMD="true" \ + CANARY_PROMOTE_CMD="echo PROMOTED" \ + CANARY_ROLLBACK_CMD="echo SHOULD-NOT-ROLLBACK" \ + bash "$CANARY" +assert_contains "deployment marked successful" "CANARY DEPLOYMENT SUCCESSFUL" +assert_contains "promotion ran" "PROMOTED" + +# A healthy run must never invoke rollback. 
+TESTS_RUN=$((TESTS_RUN + 1)) +if echo "$LAST_OUTPUT" | grep -qF "SHOULD-NOT-ROLLBACK"; then + echo " ✗ rollback must not run on healthy canary" + TESTS_FAILED=$((TESTS_FAILED + 1)); FAILED_NAMES+=("no-rollback-on-healthy") +else + echo " ✓ rollback not invoked on healthy canary" + TESTS_PASSED=$((TESTS_PASSED + 1)) +fi + +# ── Case 2: unhealthy canary is rolled back ─────────────────────────────────── +echo "" +echo "Case 2: Unhealthy canary → rollback" +run_case "unhealthy canary exits 1" 1 env \ + CANARY_DEPLOY_CMD="true" \ + CANARY_HEALTH_CMD="false" \ + CANARY_PROMOTE_CMD="echo SHOULD-NOT-PROMOTE" \ + CANARY_ROLLBACK_CMD="echo ROLLED-BACK" \ + bash "$CANARY" +assert_contains "deployment marked failed" "CANARY DEPLOYMENT FAILED" +assert_contains "rollback ran" "ROLLED-BACK" + +TESTS_RUN=$((TESTS_RUN + 1)) +if echo "$LAST_OUTPUT" | grep -qF "SHOULD-NOT-PROMOTE"; then + echo " ✗ promotion must not run on unhealthy canary" + TESTS_FAILED=$((TESTS_FAILED + 1)); FAILED_NAMES+=("no-promote-on-unhealthy") +else + echo " ✓ promotion not invoked on unhealthy canary" + TESTS_PASSED=$((TESTS_PASSED + 1)) +fi + +# ── Case 3: deploy failure rolls back before health checks ──────────────────── +echo "" +echo "Case 3: Deploy step fails → rollback, no health checks" +run_case "deploy failure exits 1" 1 env \ + CANARY_DEPLOY_CMD="false" \ + CANARY_HEALTH_CMD="echo SHOULD-NOT-HEALTHCHECK" \ + CANARY_ROLLBACK_CMD="echo ROLLED-BACK" \ + bash "$CANARY" +assert_contains "deploy failure reported" "Canary deployment failed" + +TESTS_RUN=$((TESTS_RUN + 1)) +if echo "$LAST_OUTPUT" | grep -qF "SHOULD-NOT-HEALTHCHECK"; then + echo " ✗ health check must not run when deploy fails" + TESTS_FAILED=$((TESTS_FAILED + 1)); FAILED_NAMES+=("no-healthcheck-on-deploy-fail") +else + echo " ✓ health check skipped when deploy fails" + TESTS_PASSED=$((TESTS_PASSED + 1)) +fi + +# ── Case 4: canary that fails partway through the gate is rolled back ────────── +echo "" +echo "Case 4: Health check fails on 
2nd probe → rollback" +COUNTER="$SANDBOX/health_counter" +echo 0 > "$COUNTER" +# Passes the first probe, fails the second. +HEALTH_SCRIPT="n=\$(cat $COUNTER); n=\$((n+1)); echo \$n > $COUNTER; [ \$n -le 1 ]" +run_case "flaky canary exits 1" 1 env \ + CANARY_DEPLOY_CMD="true" \ + CANARY_HEALTH_CMD="$HEALTH_SCRIPT" \ + CANARY_PROMOTE_CMD="echo SHOULD-NOT-PROMOTE" \ + CANARY_ROLLBACK_CMD="echo ROLLED-BACK" \ + bash "$CANARY" +assert_contains "rolled back after partial failure" "ROLLED-BACK" +assert_contains "first probe passed" "Health check 1/3 passed" +assert_contains "second probe failed" "Health check 2/3 failed" + +# ── Case 5: promotion failure rolls back ────────────────────────────────────── +echo "" +echo "Case 5: Promotion step fails → rollback" +run_case "promotion failure exits 1" 1 env \ + CANARY_DEPLOY_CMD="true" \ + CANARY_HEALTH_CMD="true" \ + CANARY_PROMOTE_CMD="false" \ + CANARY_ROLLBACK_CMD="echo ROLLED-BACK" \ + bash "$CANARY" +assert_contains "promotion failure reported" "Promotion failed" +assert_contains "rolled back after promotion failure" "ROLLED-BACK" + +finish_suite diff --git a/scripts/test-disaster-recovery.sh b/scripts/test-disaster-recovery.sh new file mode 100755 index 0000000..df7287a --- /dev/null +++ b/scripts/test-disaster-recovery.sh @@ -0,0 +1,103 @@ +#!/usr/bin/env bash +# scripts/test-disaster-recovery.sh +# Automated disaster-recovery drill (#558). +# +# Exercises the full backup → verify → restore recovery chain end-to-end in a +# hermetic sandbox. External dependencies (Stellar RPC, AWS S3, the live API, +# Postgres) are replaced by mocks, so the drill runs anywhere — locally or in +# CI — without touching production infrastructure. +# +# Scenarios covered: +# 1. A backup can be produced from contract state. +# 2. The produced backup passes integrity verification. +# 3. The backup can be restored (non-interactive confirmation). +# 4. All services report healthy after recovery. +# 5. 
A truncated/corrupt backup is rejected before any restore is attempted. +# 6. A backup missing required state is rejected. +# +# Exit code is non-zero if any scenario fails, so CI can gate on it. + +set -uo pipefail +source "$(dirname "${BASH_SOURCE[0]}")/tests/lib.sh" + +echo "=== Disaster Recovery Drill (#558) ===" + +setup_sandbox +trap teardown_sandbox EXIT + +# ── Mock external services ──────────────────────────────────────────────────── +# stellar-cli is used by backup-contract-state.sh (state export + ledger query) +# and verify-all-services.sh (contract reachability). +mock_command stellar-cli ' +case "$*" in + *"network status"*) echo "{\"ledger\": 12345}" ;; + *"contract invoke"*get_ip_count*) echo "0" ;; + *"contract invoke"*get_swap_count*) echo "0" ;; + *"contract invoke"*list_all_ips*) echo "[{\"ip_id\":1,\"owner\":\"GTEST\"}]" ;; + *"contract invoke"*list_all_swaps*) echo "[{\"swap_id\":1,\"ip_id\":1}]" ;; + *) echo "[]" ;; +esac +' +# verify-all-services.sh probes the API and DB; make them all healthy. 
# Health probes issued by verify-all-services.sh (HTTP, Postgres) and any AWS
# call are answered by mocks that always succeed.
mock_command curl 'exit 0'
mock_command pg_isready 'exit 0'
mock_command aws 'echo "mock-aws $*"; exit 0'

# Environment consumed by the scripts under test; backups land in the sandbox.
export NETWORK="testnet" \
       BACKUP_DIR="$SANDBOX/backups"
export IP_REGISTRY_CONTRACT_ID="CIPREGISTRY000000000000000000000000000000000000000000000" \
       ATOMIC_SWAP_CONTRACT_ID="CATOMICSWAP0000000000000000000000000000000000000000000000"

# ── Scenario 1: produce a backup ──────────────────────────────────────────────
printf '\n%s\n' "Scenario 1: Back up contract state"
run_case "backup-contract-state.sh succeeds" 0 \
  bash "$SCRIPTS_DIR/backup-contract-state.sh"
assert_contains "backup reports completion" "Backup completed"

# Pick up the archive the backup step just wrote; later scenarios reuse it.
BACKUP_FILE="$(find "$BACKUP_DIR" -name 'backup_*.tar.gz' 2>/dev/null | head -n 1)"
assert_file_exists "backup archive was created" "$BACKUP_FILE"

# ── Scenario 2: verify the backup ─────────────────────────────────────────────
printf '\n%s\n' "Scenario 2: Verify backup integrity"
run_case "verify-backup-integrity.sh passes on fresh backup" 0 \
  bash "$SCRIPTS_DIR/verify-backup-integrity.sh" "$BACKUP_FILE"
assert_contains "verification passes" "Backup verification passed"

# ── Scenario 3: restore the backup (non-interactive) ──────────────────────────
printf '\n%s\n' "Scenario 3: Restore from backup"
# Wrapped in a function so run_case can capture the whole pipeline; "yes" is
# piped in to answer the restore script's confirmation prompt.
restore_with_yes() {
  echo "yes" | bash "$SCRIPTS_DIR/restore-contract-state.sh" "$BACKUP_FILE"
}
run_case "restore-contract-state.sh completes" 0 restore_with_yes
assert_contains "restore reaches completion" "Restoration completed"

# ── Scenario 4: services healthy after recovery ───────────────────────────────
printf '\n%s\n' "Scenario 4: Post-recovery service verification"
run_case "verify-all-services.sh reports healthy" 0 \
  bash "$SCRIPTS_DIR/verify-all-services.sh"
assert_contains "all services verified" "All Services Verified"

# ── Scenario 5: corrupt backup is rejected ────────────────────────────────────
printf '\n%s\n' "Scenario 5: Corrupt backup must be rejected"
+CORRUPT="$SANDBOX/corrupt.tar.gz" +head -c 256 /dev/urandom > "$CORRUPT" +run_case "verify rejects corrupt archive" 1 \ + bash "$SCRIPTS_DIR/verify-backup-integrity.sh" "$CORRUPT" + +# ── Scenario 6: incomplete backup is rejected ───────────────────────────────── +echo "" +echo "Scenario 6: Incomplete backup must be rejected" +INCOMPLETE_STAGE="$SANDBOX/incomplete/20260101_000000" +mkdir -p "$INCOMPLETE_STAGE" +echo '{"timestamp":"x","network":"testnet"}' > "$INCOMPLETE_STAGE/metadata.json" +# ip_registry_state.json and atomic_swap_state.json deliberately omitted. +INCOMPLETE="$SANDBOX/incomplete.tar.gz" +tar -czf "$INCOMPLETE" -C "$SANDBOX/incomplete" "20260101_000000" +run_case "verify rejects incomplete backup" 1 \ + bash "$SCRIPTS_DIR/verify-backup-integrity.sh" "$INCOMPLETE" +assert_contains "missing-file failure reported" "verification failed" + +finish_suite diff --git a/scripts/tests/lib.sh b/scripts/tests/lib.sh new file mode 100755 index 0000000..014454c --- /dev/null +++ b/scripts/tests/lib.sh @@ -0,0 +1,150 @@ +#!/usr/bin/env bash +# scripts/tests/lib.sh +# Shared helpers for the operational shell-script test suites +# (#558 disaster recovery, #559 backup integrity, #560 canary deployment). +# +# These helpers let the ops scripts be exercised hermetically — no AWS, no +# Stellar RPC, no live API. External commands are replaced by mocks placed on +# a throwaway PATH so the control flow of each script can be asserted offline. + +# ── Counters ──────────────────────────────────────────────────────────────── +TESTS_RUN=0 +TESTS_PASSED=0 +TESTS_FAILED=0 +FAILED_NAMES=() + +# Root directory of the repository (two levels up from this file). +LIB_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$LIB_DIR/../.." 
&& pwd)" +SCRIPTS_DIR="$REPO_ROOT/scripts" + +# ── Sandbox management ──────────────────────────────────────────────────────── + +# Create an isolated working directory and a mock-binary directory, and prepend +# the mock dir to PATH. Returns the sandbox path via the SANDBOX global. +setup_sandbox() { + SANDBOX="$(mktemp -d "${TMPDIR:-/tmp}/atomicip_optest.XXXXXX")" + MOCK_BIN="$SANDBOX/mock-bin" + mkdir -p "$MOCK_BIN" + ORIGINAL_PATH="$PATH" + export PATH="$MOCK_BIN:$PATH" +} + +teardown_sandbox() { + export PATH="$ORIGINAL_PATH" + [ -n "$SANDBOX" ] && rm -rf "$SANDBOX" + SANDBOX="" +} + +# Write an executable mock command onto the sandbox PATH. +# mock_command +# The body is a bash snippet; "$@" holds the args the script passed. +mock_command() { + local name="$1" + local body="$2" + cat > "$MOCK_BIN/$name" < +# Runs the command, captures combined output into LAST_OUTPUT, and asserts the +# exit code matches the expectation. +run_case() { + local name="$1" + local expected="$2" + shift 2 + + TESTS_RUN=$((TESTS_RUN + 1)) + local actual=0 + LAST_OUTPUT="$("$@" 2>&1)" || actual=$? + + if [ "$actual" -eq "$expected" ]; then + echo " ✓ $name (exit $actual)" + TESTS_PASSED=$((TESTS_PASSED + 1)) + return 0 + else + echo " ✗ $name — expected exit $expected, got $actual" + echo " ── output ──" + echo "$LAST_OUTPUT" | sed 's/^/ /' + TESTS_FAILED=$((TESTS_FAILED + 1)) + FAILED_NAMES+=("$name") + return 1 + fi +} + +# assert_contains — checks LAST_OUTPUT from the previous run_case. 
# assert_contains NAME NEEDLE
# Records one check. It passes when NEEDLE occurs as a fixed string (grep -F)
# in LAST_OUTPUT, the combined output captured by the most recent run_case.
# Updates TESTS_RUN / TESTS_PASSED / TESTS_FAILED / FAILED_NAMES.
assert_contains() {
  local name="$1"
  local needle="$2"
  TESTS_RUN=$((TESTS_RUN + 1))
  # grep reads from a here-string rather than `echo ... | grep -q`:
  #   * suites sourcing this file enable `set -o pipefail`; with a pipe, grep -q
  #     exiting at the first match can SIGPIPE the writer on large output and
  #     turn a genuine match into a reported miss;
  #   * echo would swallow output that is exactly an option such as "-n".
  # ${LAST_OUTPUT-} lets the check fail cleanly (instead of aborting under
  # `set -u`) when no run_case has populated LAST_OUTPUT yet.
  if grep -qF -- "$needle" <<<"${LAST_OUTPUT-}"; then
    echo " ✓ $name (found \"$needle\")"
    TESTS_PASSED=$((TESTS_PASSED + 1))
  else
    echo " ✗ $name — expected output to contain \"$needle\""
    TESTS_FAILED=$((TESTS_FAILED + 1))
    FAILED_NAMES+=("$name")
  fi
}

# assert_file_exists NAME PATH
# Records one check. It passes when PATH is an existing regular file.
assert_file_exists() {
  local name="$1"
  local path="$2"
  TESTS_RUN=$((TESTS_RUN + 1))
  if [ -f "$path" ]; then
    echo " ✓ $name"
    TESTS_PASSED=$((TESTS_PASSED + 1))
  else
    echo " ✗ $name — expected file: $path"
    TESTS_FAILED=$((TESTS_FAILED + 1))
    FAILED_NAMES+=("$name")
  fi
}

# Print a summary and return non-zero if anything failed.
# Intended as the last statement of a suite so its status becomes the
# script's exit code.
finish_suite() {
  local rule="──────────────────────────────────────────"
  local status=0
  echo ""
  echo "$rule"
  echo " $TESTS_PASSED/$TESTS_RUN checks passed"
  if [ "$TESTS_FAILED" -gt 0 ]; then
    # Only expanded when at least one failure was recorded.
    echo " Failed: ${FAILED_NAMES[*]}"
    status=1
  fi
  echo "$rule"
  return "$status"
}

# ── Fixtures ──────────────────────────────────────────────────────────────────

# build_valid_backup DEST [STAMP]
# Produces a well-formed backup archive identical in shape to what
# backup-contract-state.sh emits: a timestamped directory containing
# metadata.json, ip_registry_state.json and atomic_swap_state.json.
+build_valid_backup() { + local dest="$1" + local stamp="${2:-20260101_000000}" + local stage="$SANDBOX/stage_$stamp" + mkdir -p "$stage/$stamp" + cat > "$stage/$stamp/metadata.json" < "$stage/$stamp/ip_registry_state.json" + echo '[{"swap_id":1,"ip_id":1,"price":1000}]' \ + > "$stage/$stamp/atomic_swap_state.json" + tar -czf "$dest" -C "$stage" "$stamp" + rm -rf "$stage" +} diff --git a/scripts/verify-backup-integrity.sh b/scripts/verify-backup-integrity.sh index 06480b9..fab7521 100755 --- a/scripts/verify-backup-integrity.sh +++ b/scripts/verify-backup-integrity.sh @@ -20,6 +20,28 @@ echo "Verifying backup: $BACKUP_FILE" echo "File size: $(du -h "$BACKUP_FILE" | cut -f1)" echo "" +# Verify SHA-256 checksum against a sidecar file if one is present (#559). +# backup-contract-state.sh writes ".sha256" alongside each archive. +# A mismatch means the archive was altered or truncated in transit/at rest. +CHECKSUM_FILE="$BACKUP_FILE.sha256" +if [ -f "$CHECKSUM_FILE" ]; then + echo "Verifying SHA-256 checksum..." + EXPECTED=$(awk '{print $1}' "$CHECKSUM_FILE") + ACTUAL=$(sha256sum "$BACKUP_FILE" | awk '{print $1}') + if [ "$EXPECTED" = "$ACTUAL" ]; then + echo "✓ Checksum matches ($ACTUAL)" + else + echo "✗ Checksum mismatch" + echo " expected: $EXPECTED" + echo " actual: $ACTUAL" + exit 1 + fi + echo "" +else + echo "Note: no checksum sidecar ($CHECKSUM_FILE); skipping checksum verification" + echo "" +fi + # Test archive integrity echo "Testing archive integrity..." if tar -tzf "$BACKUP_FILE" > /dev/null 2>&1; then