diff --git a/.github/workflows/terraform-apply-production.yml b/.github/workflows/terraform-apply-production.yml
new file mode 100644
index 0000000..415670e
--- /dev/null
+++ b/.github/workflows/terraform-apply-production.yml
@@ -0,0 +1,152 @@
+---
+# infra — gated Terraform apply for the PRODUCTION Cloudflare workspace.
+#
+# APPROVAL MODEL: workflow_dispatch + GitHub Environment "production"
+# with required reviewers. No push trigger. No "promote from staging"
+# trigger. Every production apply is a separate, deliberate decision
+# made by a human reviewer on a human-triggered run.
+#
+# Stricter than staging — besides the confirm phrase, the operator must
+# supply the RUN_ID of a successful terraform-apply-staging run, so prod
+# cannot be applied without a prior, successful staging apply.
+#
+# Security note: every GHA expression consumed in a run: block is
+# wrapped through env: to prevent script injection.
+
+name: terraform-apply-production
+
+# Manual trigger only. Both inputs are checked by the `guard` job before
+# the `apply` job is allowed to start.
+on:
+  workflow_dispatch:
+    inputs:
+      confirm:
+        description: 'Type APPLY-PRODUCTION to confirm'
+        required: true
+        type: string
+      staging_run_id:
+        description: 'GH Actions run_id of the matching staging apply (must be a numeric id)'
+        required: true
+        type: string
+
+# Default GITHUB_TOKEN scope for the jobs in this workflow.
+permissions:
+  contents: read
+
+# All production applies share one concurrency group; an in-flight
+# apply is never cancelled.
+concurrency:
+  group: terraform-apply-production
+  cancel-in-progress: false
+
+# Workflow-level environment shared by the jobs below.
+env:
+  TF_VERSION: '1.9.8'
+  TF_IN_AUTOMATION: 'true'
+  TF_ENV: 'production'
+  CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
+  # State-backend credentials, sourced from the TF_STATE_R2_* secrets.
+  AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }}
+  AWS_REGION: 'auto'
+  CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+
+jobs:
+ guard:
+   # First gate for a production apply: checks the confirm phrase and
+   # that `staging_run_id` names a successful terraform-apply-staging run.
+   name: confirm-input + staging-precedent guard
+   runs-on: ubuntu-latest
+   # Job-level permissions replace the workflow-level block for this job.
+   # `gh run view` reads the Actions API, which needs `actions: read`;
+   # the job does not check out code, so no other scope is requested.
+   permissions:
+     actions: read
+   env:
+     # Inputs reach the shell only through env vars, never by inline
+     # ${{ }} expansion inside a run: block.
+     CONFIRM_INPUT: ${{ inputs.confirm }}
+     STAGING_RUN_ID: ${{ inputs.staging_run_id }}
+   steps:
+     - name: Reject if confirm phrase wrong
+       run: |
+         if [ "${CONFIRM_INPUT}" != "APPLY-PRODUCTION" ]; then
+           echo "::error::confirm input must be exactly 'APPLY-PRODUCTION'"
+           exit 1
+         fi
+
+     - name: Reject if staging_run_id is not numeric
+       run: |
+         # ref-injection mitigation: validate strictly before any use.
+         # The pattern rejects the empty string and any non-digit.
+         case "${STAGING_RUN_ID}" in
+           ''|*[!0-9]*)
+             echo "::error::staging_run_id must be a numeric GH Actions run id (got '${STAGING_RUN_ID}')"
+             exit 1
+             ;;
+         esac
+
+     - name: Verify staging run exists + succeeded
+       env:
+         GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+       run: |
+         # STAGING_RUN_ID was validated as numeric by the previous step.
+         # One API round-trip; the fields are then picked out locally.
+         run_json=$(gh run view "${STAGING_RUN_ID}" \
+           --repo "${GITHUB_REPOSITORY}" \
+           --json workflowName,conclusion,headSha)
+         workflow=$(jq -r '.workflowName' <<<"${run_json}")
+         conclusion=$(jq -r '.conclusion' <<<"${run_json}")
+         head_sha=$(jq -r '.headSha' <<<"${run_json}")
+
+         # workflowName is the workflow's own `name:`, not the run title.
+         if [ "${workflow}" != "terraform-apply-staging" ]; then
+           echo "::error::staging_run_id ${STAGING_RUN_ID} is not a terraform-apply-staging run (got: ${workflow})"
+           exit 1
+         fi
+         if [ "${conclusion}" != "success" ]; then
+           echo "::error::staging_run_id ${STAGING_RUN_ID} did not succeed (conclusion: ${conclusion})"
+           exit 1
+         fi
+         # Non-blocking: surface when staging applied a different commit
+         # than the one this production run is about to apply.
+         if [ "${head_sha}" != "${GITHUB_SHA}" ]; then
+           echo "::warning::staging run ${STAGING_RUN_ID} ran at commit ${head_sha}, but this run is at ${GITHUB_SHA}"
+         fi
+         echo "staging precedent ✓ (run ${STAGING_RUN_ID} = success)"
+
+ apply:
+   name: apply production
+   needs: guard
+   runs-on: ubuntu-latest
+   # Second gate, on top of the guard job: the "production" GitHub
+   # Environment must have Required Reviewers configured by the operator
+   # (repo Settings → Environments → production → Deployment protection
+   # rules).
+   environment: production
+   defaults:
+     run:
+       working-directory: terraform/cloudflare
+   steps:
+     - uses: actions/checkout@v6
+
+     - uses: hashicorp/setup-terraform@v3
+       with:
+         terraform_version: ${{ env.TF_VERSION }}
+
+     - name: Verify operator secrets are set
+       run: |
+         missing=""
+         # Each entry is <env var checked>:<repo secret name reported>.
+         for pair in \
+           CLOUDFLARE_API_TOKEN:CLOUDFLARE_API_TOKEN \
+           AWS_ACCESS_KEY_ID:TF_STATE_R2_ACCESS_KEY_ID \
+           AWS_SECRET_ACCESS_KEY:TF_STATE_R2_SECRET_ACCESS_KEY \
+           CF_ACCOUNT_ID:CF_ACCOUNT_ID
+         do
+           var_name="${pair%%:*}"
+           if [ -z "${!var_name}" ]; then
+             missing="${missing} ${pair#*:}"
+           fi
+         done
+         if [ -n "${missing}" ]; then
+           echo "::error::Operator action required — these repo secrets are not set:${missing}"
+           echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time"
+           exit 1
+         fi
+
+     - name: terraform init
+       run: |
+         # Backend endpoint is derived from the account id at run time.
+         r2_endpoint="https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com"
+         terraform init \
+           -backend-config="endpoints={s3=\"${r2_endpoint}\"}" \
+           -backend-config="workspace_key_prefix=${TF_ENV}"
+
+     - name: terraform workspace select
+       run: terraform workspace select "${TF_ENV}"
+
+     - name: terraform plan
+       run: terraform plan -var-file="${TF_ENV}.auto.tfvars" -no-color -out=tfplan.bin
+
+     - name: terraform apply
+       run: terraform apply -no-color tfplan.bin
+
+     - name: Surface non-sensitive outputs (ids only, NO token values)
+       run: |
+         # A failing `terraform output` is tolerated (|| true) so it
+         # does not fail the step.
+         for output_name in account_id zone_id deploy_token_id admin_tunnel_token_id; do
+           terraform output -no-color "${output_name}" || true
+         done
+
+     - name: Reminder
+       run: |
+         printf '%s\n' \
+           "::notice::PRODUCTION APPLY COMPLETE." \
+           "::notice::If tokens were created or rotated, run on an operator workstation:" \
+           "::notice:: make install-secrets ENV=production" \
+           "::notice::Confirm the CF dashboard audit log shows the change before revoking the prior token."
diff --git a/.github/workflows/terraform-apply-staging.yml b/.github/workflows/terraform-apply-staging.yml
new file mode 100644
index 0000000..2e8ef76
--- /dev/null
+++ b/.github/workflows/terraform-apply-staging.yml
@@ -0,0 +1,116 @@
+---
+# infra — gated Terraform apply for the STAGING Cloudflare workspace.
+#
+# APPROVAL MODEL: workflow_dispatch ONLY. Never on push, never on merge,
+# never auto-promoted from a previous apply. Operator deliberately
+# triggers this from the Actions tab.
+#
+# Why split per env: staging and production must not share an apply
+# trigger. Splitting prevents a "promote-on-success" pipeline from
+# ever existing for production — every prod apply is a separate human
+# decision (see terraform-apply-production.yml).
+#
+# Security note: every GHA expression consumed in a run: block is
+# wrapped through env: to prevent script injection.
+
+name: terraform-apply-staging
+
+# Manual trigger only; the confirm phrase is checked by the `guard` job.
+on:
+  workflow_dispatch:
+    inputs:
+      confirm:
+        description: 'Type APPLY-STAGING to confirm'
+        required: true
+        type: string
+
+# Default GITHUB_TOKEN scope for the jobs in this workflow.
+permissions:
+  contents: read
+
+# All staging applies share one concurrency group; an in-flight apply
+# is never cancelled.
+concurrency:
+  group: terraform-apply-staging
+  cancel-in-progress: false
+
+# Workflow-level environment shared by the jobs below.
+env:
+  TF_VERSION: '1.9.8'
+  TF_IN_AUTOMATION: 'true'
+  TF_ENV: 'staging'
+  CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
+  # State-backend credentials, sourced from the TF_STATE_R2_* secrets.
+  AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }}
+  AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }}
+  AWS_REGION: 'auto'
+  CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+
+jobs:
+ guard:
+   name: confirm-input guard
+   runs-on: ubuntu-latest
+   env:
+     # The input reaches the shell through an env var, not inline ${{ }}.
+     CONFIRM_INPUT: ${{ inputs.confirm }}
+   steps:
+     - name: Reject if confirm phrase wrong
+       run: |
+         # Exact, case-sensitive match; anything else fails the job.
+         [ "${CONFIRM_INPUT}" = "APPLY-STAGING" ] || {
+           echo "::error::confirm input must be exactly 'APPLY-STAGING'"
+           exit 1
+         }
+
+ apply:
+   name: apply staging
+   needs: guard
+   runs-on: ubuntu-latest
+   environment: staging
+   defaults:
+     run:
+       working-directory: terraform/cloudflare
+   steps:
+     - uses: actions/checkout@v6
+
+     - uses: hashicorp/setup-terraform@v3
+       with:
+         terraform_version: ${{ env.TF_VERSION }}
+
+     - name: Verify operator secrets are set
+       run: |
+         missing=""
+         # Each entry is <env var checked>:<repo secret name reported>.
+         for pair in \
+           CLOUDFLARE_API_TOKEN:CLOUDFLARE_API_TOKEN \
+           AWS_ACCESS_KEY_ID:TF_STATE_R2_ACCESS_KEY_ID \
+           AWS_SECRET_ACCESS_KEY:TF_STATE_R2_SECRET_ACCESS_KEY \
+           CF_ACCOUNT_ID:CF_ACCOUNT_ID
+         do
+           var_name="${pair%%:*}"
+           if [ -z "${!var_name}" ]; then
+             missing="${missing} ${pair#*:}"
+           fi
+         done
+         if [ -n "${missing}" ]; then
+           echo "::error::Operator action required — these repo secrets are not set:${missing}"
+           echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time"
+           exit 1
+         fi
+
+     - name: terraform init
+       run: |
+         # Backend endpoint is derived from the account id at run time.
+         r2_endpoint="https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com"
+         terraform init \
+           -backend-config="endpoints={s3=\"${r2_endpoint}\"}" \
+           -backend-config="workspace_key_prefix=${TF_ENV}"
+
+     - name: terraform workspace select
+       run: terraform workspace select "${TF_ENV}"
+
+     - name: terraform plan
+       run: terraform plan -var-file="${TF_ENV}.auto.tfvars" -no-color -out=tfplan.bin
+
+     - name: terraform apply
+       run: terraform apply -no-color tfplan.bin
+
+     - name: Surface non-sensitive outputs (ids only, NO token values)
+       run: |
+         # A failing `terraform output` is tolerated (|| true) so it
+         # does not fail the step.
+         for output_name in account_id zone_id deploy_token_id admin_tunnel_token_id; do
+           terraform output -no-color "${output_name}" || true
+         done
+
+     - name: Reminder
+       run: |
+         printf '%s\n' \
+           "::notice::STAGING APPLY COMPLETE." \
+           "::notice::If tokens were created or rotated, run on an operator workstation:" \
+           "::notice:: make install-secrets ENV=staging" \
+           "::notice::Promoting to production is a SEPARATE manual decision via terraform-apply-production.yml."
diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml
new file mode 100644
index 0000000..fe9cc2c
--- /dev/null
+++ b/.github/workflows/terraform.yml
@@ -0,0 +1,170 @@
+---
+# infra — Terraform fmt + validate + plan for CF resources.
+#
+# Runs on every push to master and on PRs touching terraform/**.
+# Plan is read-only. Apply is split into per-env manual workflows
+# (terraform-apply-staging.yml, terraform-apply-production.yml).
+# This file NEVER applies — see those workflows for the apply path.
+#
+# Posts the plan diff as a PR comment so reviewers see what apply
+# would do without granting CI apply rights.
+#
+# Security note: all GHA expressions consumed in run: blocks are
+# referenced through env vars to prevent script injection.
+
+name: terraform
+
+# Runs for master pushes and PRs that touch Terraform sources or any
+# terraform*.yml workflow, plus on manual dispatch.
+on:
+  push:
+    branches:
+      - master
+    paths:
+      - 'terraform/**'
+      - '.github/workflows/terraform*.yml'
+  pull_request:
+    paths:
+      - 'terraform/**'
+      - '.github/workflows/terraform*.yml'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  # Needed to post the plan comment on the PR.
+  pull-requests: write
+
+# One plan run per ref; a newer run cancels the one in progress.
+concurrency:
+  group: terraform-plan-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  TF_VERSION: '1.9.8'
+  TF_IN_AUTOMATION: 'true'
+
+jobs:
+ fmt-validate:
+   # Static checks only: formatting plus `terraform validate`, with the
+   # backend disabled at init.
+   name: fmt + validate
+   runs-on: ubuntu-latest
+   defaults:
+     run:
+       working-directory: terraform/cloudflare
+   steps:
+     - uses: actions/checkout@v6
+
+     - uses: hashicorp/setup-terraform@v3
+       with:
+         terraform_version: ${{ env.TF_VERSION }}
+
+     - name: terraform fmt -check
+       run: |
+         terraform fmt -check -recursive
+
+     - name: terraform init (backend-bypassed)
+       run: |
+         terraform init -backend=false
+
+     - name: terraform validate
+       run: |
+         terraform validate -no-color
+
+ plan:
+   # Read-only plan per environment. On pull requests the plan output is
+   # posted as a PR comment; this job never applies.
+   name: plan (${{ matrix.env }})
+   needs: fmt-validate
+   runs-on: ubuntu-latest
+   strategy:
+     fail-fast: false
+     matrix:
+       env: [staging, production]
+   defaults:
+     run:
+       working-directory: terraform/cloudflare
+   # CF creds + state-backend creds passed in via env, not inlined in run:.
+   env:
+     CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
+     AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }}
+     AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }}
+     AWS_REGION: 'auto'
+     CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+     TF_ENV: ${{ matrix.env }}
+   steps:
+     - uses: actions/checkout@v6
+
+     - uses: hashicorp/setup-terraform@v3
+       with:
+         terraform_version: ${{ env.TF_VERSION }}
+
+     - name: Verify operator secrets are set
+       # Bootstrap chicken-and-egg: plan needs CF + R2-HMAC creds, but
+       # those are operator-only one-time setup (see README §Bootstrap).
+       # Without this guard the failure mode is a cryptic AWS-IAM stack
+       # trace from `terraform init`. With it, the error is one line
+       # pointing at the README and the exact missing variable names.
+       run: |
+         missing=""
+         [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN"
+         [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID"
+         [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY"
+         [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID"
+         if [ -n "${missing}" ]; then
+           echo "::error::Operator action required — these repo secrets are not set:${missing}"
+           echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time"
+           exit 1
+         fi
+         echo "all 4 operator secrets present"
+
+     - name: terraform init
+       run: |
+         terraform init \
+           -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" \
+           -backend-config="workspace_key_prefix=${TF_ENV}"
+
+     - name: terraform workspace select-or-create
+       run: terraform workspace select "${TF_ENV}" 2>/dev/null || terraform workspace new "${TF_ENV}"
+
+     - name: terraform plan
+       id: plan
+       run: |
+         # -e is switched off so the plan's own exit code can be captured
+         # from the pipeline instead of aborting the step.
+         set +e
+         terraform plan \
+           -var-file="${TF_ENV}.auto.tfvars" \
+           -no-color \
+           -out=tfplan.bin \
+           -detailed-exitcode 2>&1 | tee /tmp/plan.out
+         ec=${PIPESTATUS[0]}
+         echo "exitcode=${ec}" >> "$GITHUB_OUTPUT"
+         # 0 = no changes, 2 = changes; anything else is a failure.
+         case "${ec}" in
+           0|2) exit 0 ;;
+           *) exit 1 ;;
+         esac
+
+     - name: Comment plan on PR
+       # Runs after a failed plan too, so the failure verdict can be
+       # posted. Fork PRs are skipped: their token cannot write comments.
+       if: ${{ !cancelled() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository }}
+       uses: actions/github-script@v7
+       env:
+         PLAN_ENV: ${{ matrix.env }}
+         PLAN_CODE: ${{ steps.plan.outputs.exitcode }}
+         RUN_ID: ${{ github.run_id }}
+       with:
+         script: |
+           const fs = require('fs');
+           const planPath = '/tmp/plan.out';
+           // The file is absent when the job failed before the plan step.
+           let plan = fs.existsSync(planPath)
+             ? fs.readFileSync(planPath, 'utf8')
+             : '(no plan output: the job failed before terraform plan ran; see the job log)';
+           // Keep the comment body bounded; the job log has the full plan.
+           if (plan.length > 60000) {
+             plan = plan.slice(0, 60000) + '\n\n... (truncated; full plan in job log)';
+           }
+           const env = process.env.PLAN_ENV;
+           const code = process.env.PLAN_CODE;
+           // An empty or unexpected code falls through to "plan failed".
+           const verdict = code === '0' ? '✅ no changes'
+             : code === '2' ? '🟡 changes present — review before manual apply'
+             : '❌ plan failed';
+           const body = [
+             `### Terraform plan — \`${env}\``,
+             verdict,
+             '',
+             '<details><summary>plan output</summary>',
+             '',
+             '```hcl',
+             plan,
+             '```',
+             '',
+             '</details>',
+             '',
+             `_Posted by terraform.yml run ${process.env.RUN_ID}. Apply requires manual trigger of terraform-apply-${env}.yml._`,
+           ].join('\n');
+           await github.rest.issues.createComment({
+             issue_number: context.issue.number,
+             owner: context.repo.owner,
+             repo: context.repo.repo,
+             body,
+           });
diff --git a/.github/workflows/wrangler-build-staging-images.yml b/.github/workflows/wrangler-build-staging-images.yml
new file mode 100644
index 0000000..c53ce26
--- /dev/null
+++ b/.github/workflows/wrangler-build-staging-images.yml
@@ -0,0 +1,203 @@
+---
+# infra — Build custom Docker images for CF Containers (staging only).
+#
+# Builds images that don't ship a usable upstream:
+# - pg-platform: postgres + pgvector + all 63 platform migrations baked in
+#
+# api / worker / provisioner images are built by their own repos' deploy.yml
+# (which now also pushes :staging — see api/.github/workflows/deploy.yml).
+# This workflow handles only the "wrapped upstream image" cases.
+#
+# Triggers:
+# - workflow_dispatch (with service input)
+# - daily cron 09:00 UTC (to pick up migrations merged in api repo)
+# - push to master touching wrangler/{pg-platform,mongodb,redis-provision,nats}/** or this workflow file
+# - repository_dispatch event "migrations-changed" from the api repo
+#
+# Security: all GHA expressions consumed in run: blocks are wrapped
+# through env: to prevent script injection.
+
+name: wrangler-build-staging-images
+
+on:
+  # Manual run; `service` selects one image or all of them.
+  workflow_dispatch:
+    inputs:
+      service:
+        description: 'Which custom image to build (or "all")'
+        required: true
+        type: choice
+        default: 'all'
+        options:
+          - all
+          - pg-platform
+          - mongodb
+          - redis-provision
+          - nats
+  # Any change to an image directory or to this workflow file.
+  push:
+    branches:
+      - master
+    paths:
+      - 'wrangler/pg-platform/**'
+      - 'wrangler/mongodb/**'
+      - 'wrangler/redis-provision/**'
+      - 'wrangler/nats/**'
+      - '.github/workflows/wrangler-build-staging-images.yml'
+  # Daily at 09:00 UTC.
+  schedule:
+    - cron: '0 9 * * *'
+  repository_dispatch:
+    types:
+      - migrations-changed
+
+permissions:
+  contents: read
+  packages: write
+
+# Group is keyed by the dispatch input; every other trigger shares the
+# 'all' group. In-flight builds are not cancelled.
+concurrency:
+  group: wrangler-build-staging-${{ github.event.inputs.service || 'all' }}
+  cancel-in-progress: false
+
+env:
+  REGISTRY: ghcr.io
+  ORG: instanode-dev
+
+jobs:
+ pg-platform:
+   name: build pg-platform :staging
+   # Always runs for schedule / push / repository_dispatch; on manual
+   # dispatch only when the chosen service is "all" or "pg-platform".
+   if: |
+     github.event_name == 'schedule' ||
+     github.event_name == 'push' ||
+     github.event_name == 'repository_dispatch' ||
+     (
+       github.event_name == 'workflow_dispatch' &&
+       (github.event.inputs.service == 'all' || github.event.inputs.service == 'pg-platform')
+     )
+   runs-on: ubuntu-latest
+   env:
+     SERVICE: pg-platform
+   steps:
+     - name: Checkout infra repo
+       uses: actions/checkout@v6
+       with:
+         path: infra
+
+     - name: Checkout api repo (for the migrations)
+       uses: actions/checkout@v6
+       with:
+         repository: ${{ vars.API_REPO || format('{0}/api', github.repository_owner) }}
+         token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }}
+         path: api
+
+     - name: Verify migrations dir exists + count
+       env:
+         MIGRATIONS_DIR: api/internal/db/migrations
+       run: |
+         # Hard failure when the directory is missing.
+         [ -d "$MIGRATIONS_DIR" ] || {
+           echo "::error::expected migrations dir $MIGRATIONS_DIR not found"
+           exit 1
+         }
+         sql_count=$(find "$MIGRATIONS_DIR" -name '*.sql' | wc -l | tr -d ' ')
+         echo "migrations found: $sql_count"
+         # A low count only warns; the build still proceeds.
+         if [ "$sql_count" -lt 50 ]; then
+           echo "::warning::only $sql_count migration files — expected ≥50 (live count was 63 as of 2026-05-30)"
+         fi
+
+     - name: Set up Docker Buildx
+       uses: docker/setup-buildx-action@v4
+
+     - name: Log in to GHCR
+       uses: docker/login-action@v4
+       with:
+         registry: ${{ env.REGISTRY }}
+         username: ${{ github.actor }}
+         # GHCR_PUSH_TOKEN is a classic PAT with write:packages, same
+         # pattern as the api/worker/provisioner deploy.yml workflows.
+         password: ${{ secrets.GHCR_PUSH_TOKEN || secrets.GITHUB_TOKEN }}
+
+     - name: Build and push
+       env:
+         IMAGE: ${{ env.REGISTRY }}/${{ env.ORG }}/instant-pg-platform
+       run: |
+         # Two tags per build: the moving :staging tag and a UTC date tag.
+         date_tag="staging-$(date -u +%Y%m%d)"
+         docker buildx build \
+           --platform linux/amd64 \
+           -f infra/wrangler/pg-platform/Dockerfile \
+           -t "${IMAGE}:staging" \
+           -t "${IMAGE}:${date_tag}" \
+           --push \
+           .
+
+     - name: Reminder
+       run: |
+         printf '%s\n' \
+           "::notice::pg-platform :staging image rebuilt with current migrations." \
+           "::notice::Next CF Container cold start will re-apply them from the new image." \
+           "::notice::Trigger a rolling restart with: wrangler deployments tail --env staging"
+
+ # ---------------------------------------------------------------------------
+ # mongodb / redis-provision / nats — small wrapped images.
+ #
+ # These don't need cross-repo migration sync (the wrapping config is fully
+ # self-contained under infra/wrangler/<svc>/). Single-repo checkout +
+ # build + push to GHCR. Same SERVICE-input gating as pg-platform.
+ # ---------------------------------------------------------------------------
+
+ small-images:
+   # Builds and pushes one image per matrix entry. On manual dispatch the
+   # `gate` step turns entries that were not selected into no-ops.
+   name: build ${{ matrix.svc }} :staging
+   if: |
+     github.event_name == 'schedule' ||
+     github.event_name == 'push' ||
+     github.event_name == 'repository_dispatch' ||
+     (github.event_name == 'workflow_dispatch' && (github.event.inputs.service == 'all' || github.event.inputs.service == 'mongodb' || github.event.inputs.service == 'redis-provision' || github.event.inputs.service == 'nats'))
+   runs-on: ubuntu-latest
+   strategy:
+     fail-fast: false
+     matrix:
+       svc: [mongodb, redis-provision, nats]
+   env:
+     SVC: ${{ matrix.svc }}
+   steps:
+     - name: Checkout infra repo
+       uses: actions/checkout@v6
+       with:
+         path: infra
+
+     - name: Skip if matrix svc doesn't match workflow_dispatch input
+       # Avoids spurious matrix entries when operator selected a single
+       # svc via workflow_dispatch. push / cron / dispatch run all 3.
+       id: gate
+       env:
+         # Expressions are passed through env instead of being expanded
+         # inside the script, matching this file's security note.
+         EVENT_NAME: ${{ github.event_name }}
+         SERVICE_INPUT: ${{ github.event.inputs.service }}
+       run: |
+         if [ "${EVENT_NAME}" != "workflow_dispatch" ]; then
+           echo "skip=false" >> "$GITHUB_OUTPUT"
+           exit 0
+         fi
+         if [ "${SERVICE_INPUT}" = "all" ] || [ "${SERVICE_INPUT}" = "${SVC}" ]; then
+           echo "skip=false" >> "$GITHUB_OUTPUT"
+         else
+           echo "skip=true" >> "$GITHUB_OUTPUT"
+           echo "::notice::skipping ${SVC} (workflow_dispatch input was '${SERVICE_INPUT}')"
+         fi
+
+     - name: Set up Docker Buildx
+       if: steps.gate.outputs.skip == 'false'
+       uses: docker/setup-buildx-action@v4
+
+     - name: Log in to GHCR
+       if: steps.gate.outputs.skip == 'false'
+       uses: docker/login-action@v4
+       with:
+         registry: ${{ env.REGISTRY }}
+         username: ${{ github.actor }}
+         password: ${{ secrets.GHCR_PUSH_TOKEN || secrets.GITHUB_TOKEN }}
+
+     - name: Build and push
+       if: steps.gate.outputs.skip == 'false'
+       env:
+         IMAGE: ${{ env.REGISTRY }}/${{ env.ORG }}/instant-${{ matrix.svc }}
+       run: |
+         # Two tags per build: the moving :staging tag and a UTC date tag.
+         docker buildx build \
+           --platform linux/amd64 \
+           -f "infra/wrangler/${SVC}/Dockerfile" \
+           -t "${IMAGE}:staging" \
+           -t "${IMAGE}:staging-$(date -u +%Y%m%d)" \
+           --push \
+           .
+
+     - name: Reminder
+       if: steps.gate.outputs.skip == 'false'
+       run: |
+         echo "::notice::${SVC} :staging image rebuilt."
+         echo "::notice::Trigger a rolling restart with: wrangler containers deploy --env staging"
diff --git a/.github/workflows/wrangler-deploy-staging.yml b/.github/workflows/wrangler-deploy-staging.yml
new file mode 100644
index 0000000..69c63cb
--- /dev/null
+++ b/.github/workflows/wrangler-deploy-staging.yml
@@ -0,0 +1,110 @@
+---
+# infra — CF Containers deploy for staging via wrangler.
+#
+# APPROVAL MODEL: workflow_dispatch ONLY for the first ~10 runs (manual
+# verification). After staging stabilizes, this can be promoted to
+# auto-run on merge to master; that needs a push trigger to be added —
+# no `auto_deploy` input or push trigger is defined in this file yet.
+#
+# Production does NOT use this workflow — see the eventual
+# production-deploy.yml when the prod target is settled.
+#
+# Security: all GHA expressions consumed in run: blocks are wrapped
+# through env: to prevent script injection.
+
+name: wrangler-deploy-staging
+
+# Manual trigger only. `service` picks what to deploy and `confirm` is
+# checked by the `guard` job.
+on:
+  workflow_dispatch:
+    inputs:
+      service:
+        description: 'Which service to deploy (or "all")'
+        required: true
+        type: choice
+        options:
+          - all
+          - api
+          - worker
+          - provisioner
+          - pg-platform
+          - pg-customers
+          - mongodb
+          - redis-provision
+          - nats
+      confirm:
+        description: 'Type DEPLOY-STAGING to confirm'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+# One concurrency group per selected service value; in-flight deploys
+# are not cancelled.
+concurrency:
+  group: wrangler-deploy-staging-${{ inputs.service }}
+  cancel-in-progress: false
+
+jobs:
+ guard:
+   name: confirm-input guard
+   runs-on: ubuntu-latest
+   env:
+     # The input reaches the shell through an env var, not inline ${{ }}.
+     CONFIRM_INPUT: ${{ inputs.confirm }}
+   steps:
+     - name: Reject if confirm phrase wrong
+       run: |
+         # Exact, case-sensitive match; anything else fails the job.
+         [ "${CONFIRM_INPUT}" = "DEPLOY-STAGING" ] || {
+           echo "::error::confirm must be exactly 'DEPLOY-STAGING'"
+           exit 1
+         }
+
+ deploy:
+   # Runs `wrangler deploy --env staging` for the selected service, or
+   # for every service in turn when the input is "all".
+   name: deploy ${{ inputs.service }}
+   needs: guard
+   runs-on: ubuntu-latest
+   environment: staging
+   env:
+     CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
+     CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+     # CLOUDFLARE_ACCOUNT_ID is the name wrangler documents for the
+     # account id; CF_ACCOUNT_ID is kept alongside it unchanged.
+     CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+     SERVICE_INPUT: ${{ inputs.service }}
+   steps:
+     - uses: actions/checkout@v6
+       with:
+         # The Deploy step changes into infra/wrangler/<svc>, so the repo
+         # has to be checked out under ./infra (same layout the image
+         # build workflow uses).
+         path: infra
+
+     - uses: actions/setup-node@v4
+       with:
+         node-version: '20'
+
+     - name: Install wrangler
+       # NOTE(review): @latest is unpinned, so the wrangler version can
+       # change between runs — consider pinning a known-good version.
+       run: npm install -g wrangler@latest
+
+     - name: Validate service name
+       run: |
+         # Whitelist enforced — never embed user input into shell paths
+         # without validating it matches a known service.
+         case "${SERVICE_INPUT}" in
+           all|api|worker|provisioner|pg-platform|pg-customers|mongodb|redis-provision|nats) : ;;
+           *)
+             echo "::error::Unknown service: ${SERVICE_INPUT}"
+             exit 1
+             ;;
+         esac
+
+     - name: Deploy
+       run: |
+         set -euo pipefail
+         if [ "${SERVICE_INPUT}" = "all" ]; then
+           SERVICES="api worker provisioner pg-platform pg-customers mongodb redis-provision nats"
+         else
+           SERVICES="${SERVICE_INPUT}"
+         fi
+         # Unquoted on purpose: SERVICES is a space-separated list.
+         for svc in $SERVICES; do
+           echo "::group::deploying $svc"
+           # Subshell keeps the directory change local to one iteration.
+           (
+             cd "infra/wrangler/$svc"
+             wrangler deploy --env staging
+           )
+           echo "::endgroup::"
+         done
+
+     - name: Reminder
+       run: |
+         echo "::notice::STAGING DEPLOY COMPLETE."
+         echo "::notice::Verify with: curl https://api.staging.instanode.dev/healthz"
+         echo "::notice::Note: stateful containers (pg-*/mongodb/redis-*/nats) have ephemeral disk."
diff --git a/terraform/cloudflare/.gitignore b/terraform/cloudflare/.gitignore
new file mode 100644
index 0000000..343dfcf
--- /dev/null
+++ b/terraform/cloudflare/.gitignore
@@ -0,0 +1,27 @@
+# TF state — lives in R2 backend, never in repo.
+*.tfstate
+*.tfstate.*
+*.tfstate.backup
+.terraform/
+# NOTE(review): Terraform's documentation recommends committing the
+# dependency lock file so provider versions are reproducible — confirm
+# that ignoring it here is intentional.
+.terraform.lock.hcl
+
+# Per-environment variable files — committable ONLY if they contain
+# no secrets. As of bootstrap there are no secrets in any tfvars (auth
+# is via env vars), so we DO commit the .auto.tfvars files. Below
+# excludes only the local ad-hoc ones.
+*.local.tfvars
+*.local.auto.tfvars
+
+# Operator-local overrides
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Plan outputs (often contain post-apply secret values).
+# `tfplan.bin` is the exact file the Makefile and workflows write; the
+# two glob patterns below do not match that bare name, so it is listed
+# explicitly.
+tfplan.bin
+*.tfplan
+*.tfplan.bin
+
+# crash logs from the provider
+crash.log
+crash.*.log
diff --git a/terraform/cloudflare/Makefile b/terraform/cloudflare/Makefile
new file mode 100644
index 0000000..23593ff
--- /dev/null
+++ b/terraform/cloudflare/Makefile
@@ -0,0 +1,101 @@
+# Terraform helpers for the CF migration. Run from this dir.
+#
+# Required env vars (export before any target):
+# CLOUDFLARE_API_TOKEN — Token A (deploy) for plan/apply
+# AWS_ACCESS_KEY_ID — R2 HMAC for TF state bucket
+# AWS_SECRET_ACCESS_KEY — R2 HMAC secret for TF state bucket
+# CF_ACCOUNT_ID — for backend endpoint URL
+#
+# ENV defaults to staging; pass ENV=production for prod.
+
+# Target environment and terraform binary; both can be overridden on
+# the command line or from the calling environment.
+ENV ?= staging
+TF ?= terraform
+
+# ENV must be exactly one word and that word must be a known environment.
+# The word count rejects an empty value (e.g. `make ENV=`) and
+# multi-word values, which a filter-only comparison lets through.
+ifneq ($(words $(ENV)),1)
+$(error ENV must be 'staging' or 'production' (got '$(ENV)'))
+endif
+ifeq ($(filter staging production,$(ENV)),)
+$(error ENV must be 'staging' or 'production' (got '$(ENV)'))
+endif
+
+.PHONY: help init fmt validate plan apply destroy install-secrets rotate-tokens clean
+
+# Print the list of targets and the ENV currently in effect.
+help:
+	@printf '%s\n' \
+		"Targets:" \
+		" init — terraform init with R2 backend (one-time per workspace)" \
+		" fmt — terraform fmt -check (CI also enforces)" \
+		" validate — terraform validate (offline)" \
+		" plan — terraform plan (writes tfplan.bin)" \
+		" apply — terraform apply (reads tfplan.bin from plan target)" \
+		" install-secrets — pull token outputs and push to k8s + GH org secrets" \
+		" rotate-tokens — bump expiry, plan, apply, install" \
+		" destroy — DANGEROUS, only for tearing down ephemeral staging" \
+		"" \
+		"Env: ENV=$(ENV) (override with ENV=production)"
+
+# Initialise the R2-backed state backend, then select the $(ENV)
+# workspace, creating it when it does not exist yet.
+init:
+	@# `:?` fails on an unset OR empty value; an empty account id would
+	@# otherwise produce the endpoint https://.r2.cloudflarestorage.com.
+	@: $${CF_ACCOUNT_ID:?CF_ACCOUNT_ID must be set}
+	$(TF) init \
+		-backend-config="endpoints={s3=\"https://$$CF_ACCOUNT_ID.r2.cloudflarestorage.com\"}" \
+		-backend-config="workspace_key_prefix=$(ENV)"
+	$(TF) workspace select $(ENV) 2>/dev/null || $(TF) workspace new $(ENV)
+
+# Formatting check over this directory and its subdirectories; -check
+# reports unformatted files without rewriting them.
+fmt:
+	$(TF) fmt -check -recursive
+
+# Validate the configuration in this directory.
+validate:
+	$(TF) validate -no-color
+
+# Plan with the $(ENV) variable file and save the result to tfplan.bin.
+# The plan runs against whichever workspace is selected, so it refuses
+# to start when that workspace is not $(ENV).
+plan:
+	@ws="$$($(TF) workspace show)"; \
+	if [ "$$ws" != "$(ENV)" ]; then \
+		echo "ABORTING — selected workspace '$$ws' does not match ENV=$(ENV); run 'make init ENV=$(ENV)' first."; \
+		exit 1; \
+	fi
+	$(TF) plan -var-file=$(ENV).auto.tfvars -out=tfplan.bin
+
+# Apply the saved plan file written by the `plan` target, then print the
+# follow-up command for installing tokens.
+apply:
+	$(TF) apply tfplan.bin
+	@printf '%s\n' \
+		"" \
+		"==> Apply complete. If tokens were created/rotated, run:" \
+		" make install-secrets ENV=$(ENV)"
+
+# Pull the deploy token from Terraform outputs and install it as a k8s
+# secret and as the CLOUDFLARE_API_TOKEN secret in each consuming repo.
+#
+# The token is held only in a shell variable of this one recipe shell
+# and is handed to kubectl / gh over stdin, so it is neither written to
+# disk nor passed as a command-line argument.
+#
+# `terraform output` reads whichever workspace is selected, so the
+# target aborts when that workspace is not $(ENV).
+#
+# NOTE(review): the repos below are listed under `instanodedev/`, while
+# the workflows in this change reference the `InstaNode-dev` /
+# `instanode-dev` owner — confirm the owner name. A wrong owner shows up
+# only as "✗ ... (skipped ...)" lines, not as a failure.
+install-secrets:
+	@: $${GH_TOKEN:?GH_TOKEN must be set for 'gh secret set' calls}
+	@ws="$$($(TF) workspace show)"; \
+	if [ "$$ws" != "$(ENV)" ]; then \
+		echo "ABORTING — selected workspace '$$ws' does not match ENV=$(ENV); run 'make init ENV=$(ENV)' first."; \
+		exit 1; \
+	fi; \
+	DEPLOY_TOKEN="$$($(TF) output -raw deploy_token)"; \
+	if [ -z "$$DEPLOY_TOKEN" ]; then echo "no deploy_token in state — apply first"; exit 1; fi; \
+	echo "==> k8s: writing CLOUDFLARE_API_TOKEN to instant-secrets-cf in instant-$(ENV)"; \
+	printf '%s' "$$DEPLOY_TOKEN" | kubectl create secret generic instant-secrets-cf \
+		-n instant-$(ENV) \
+		--from-file=CLOUDFLARE_API_TOKEN=/dev/stdin \
+		--dry-run=client -o yaml | kubectl apply -f - \
+		|| echo " ✗ k8s secret was NOT written (kubectl failed)"; \
+	echo "==> GH org secrets: CLOUDFLARE_API_TOKEN across instanodedev/{api,worker,provisioner,instanode-web,dashboard,infra,cli,mcp}"; \
+	for repo in instanodedev/api instanodedev/worker instanodedev/provisioner \
+		instanodedev/instanode-web instanodedev/dashboard \
+		instanodedev/infra instanodedev/cli instanodedev/mcp; do \
+		printf '%s' "$$DEPLOY_TOKEN" | gh secret set CLOUDFLARE_API_TOKEN -R "$$repo" >/dev/null \
+			&& echo " ✓ $$repo" \
+			|| echo " ✗ $$repo (skipped — repo missing or not authorized)"; \
+	done; \
+	unset DEPLOY_TOKEN
+	@echo
+	@echo "==> Admin/tunnel token (Token B) is operator-only — NOT pushed to CI."
+	@echo " To install into your local 1Password vault:"
+	@echo " $(TF) output -raw admin_tunnel_token | op item create --category=ApiCredential --title='cf-admin-tunnel-$(ENV)' credential=-"
+
+# Token rotation (180d deploy / 90d admin) is operator-driven: this
+# target changes nothing and only prints the steps to run after the
+# expiry dates in the .auto.tfvars file have been edited.
+rotate-tokens:
+	@printf '%s\n' \
+		"==> Edit $(ENV).auto.tfvars to set new *_expires_on dates, then:" \
+		" make plan ENV=$(ENV)" \
+		" make apply ENV=$(ENV)" \
+		" make install-secrets ENV=$(ENV)" \
+		" Confirm the rotation in the CF dashboard audit log before" \
+		" revoking the previous token version."
+
+# Tearing down staging is OK (Phase 1 acceptance allows it). NEVER
+# run against production — D-3 cutover keeps state on DO throughout.
+#
+# Two guards run before anything is destroyed:
+#   1. ENV itself must not be production.
+#   2. The selected Terraform workspace must equal ENV. `terraform
+#      destroy` acts on the selected workspace, so checking ENV alone
+#      would not stop a run made while the production workspace is
+#      still selected.
+destroy:
+	@if [ "$(ENV)" = "production" ]; then \
+		echo "ABORTING — destroy against production is forbidden (D-1/D-3)."; \
+		exit 1; \
+	fi
+	@ws="$$($(TF) workspace show)"; \
+	if [ "$$ws" != "$(ENV)" ]; then \
+		echo "ABORTING — selected workspace '$$ws' does not match ENV=$(ENV); run 'make init ENV=$(ENV)' first."; \
+		exit 1; \
+	fi
+	$(TF) destroy -var-file=$(ENV).auto.tfvars
+
+# Remove the saved plan file; $(RM) is make's built-in `rm -f`.
+clean:
+	$(RM) tfplan.bin
diff --git a/terraform/cloudflare/README.md b/terraform/cloudflare/README.md
new file mode 100644
index 0000000..8f29db0
--- /dev/null
+++ b/terraform/cloudflare/README.md
@@ -0,0 +1,161 @@
+# Cloudflare resources — Terraform
+
+Source of truth for everything we declare in Cloudflare for the InstaNode
+migration: API tokens (deploy + admin/tunnel), DNS records, R2 buckets,
+Pages projects, and (later) Workers + Load Balancers + Page Rules.
+
+> **k8s is NOT in scope here.** k8s manifests stay under `../../k8s/`,
+> managed by `kubectl set image` + the existing per-service auto-deploy
+> per CLAUDE.md rule 15. This dir is for Cloudflare-managed resources only.
+
+## Decision references
+
+This module implements:
+- **D-1** (scope — R2, Pages, CF proxy on api, staging-only Tunnel)
+- **D-2** (staging on full CF stack)
+- **D-3** (per-service DNS-weighted cutover; TTL 60s ≥48h)
+- **D-4** (separate `instant-staging-data` ns — k8s-side, not here, but the staging Pages project + R2 bucket parallel it)
+- **D-7** (NS delegation is CF; already verified)
+- **D-8** (R2 env-var canonical names: `R2_HMAC_KEY_ID` / `R2_HMAC_SECRET`)
+- **D-14** (operator credentials — outputs from `tokens.tf` install via `make install-secrets`)
+
+Source: `/tmp/cf-migration/shared/DECISIONS.md`.
+
+## Bootstrap (one-time)
+
+The TF state lives in R2, which means the R2 bucket for state and the
+HMAC creds to write to it must exist BEFORE `terraform init`. Manual
+chicken-and-egg step:
+
+```bash
+# 1. Create the state bucket via wrangler (operator-side, one time).
+wrangler r2 bucket create instanode-tf-state --location wnam
+
+# 2. Create R2 HMAC for state access only (scope: instanode-tf-state).
+# Dashboard → R2 → Manage R2 API Tokens → Create:
+# - Name: "tf-state-rw"
+# - Permission: Object Read & Write
+# - Specify buckets: instanode-tf-state
+# Save the Access Key ID + Secret + Endpoint.
+
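+# 3. Export the state-backend creds, the CF auth token, and the account id
+#    for terraform (CF_ACCOUNT_ID is used by the endpoint URL in step 4).
+export AWS_ACCESS_KEY_ID="<r2-access-key-id>"
+export AWS_SECRET_ACCESS_KEY="<r2-secret-access-key>"
+export CLOUDFLARE_API_TOKEN="<cloudflare-api-token>"
+export CF_ACCOUNT_ID="<cloudflare-account-id>"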
+
+# 4. Init the backend with the env-specific account endpoint.
+terraform init \
+ -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}"
+
+# 5. Pick a workspace (staging first).
+terraform workspace new staging
+terraform workspace select staging
+
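+# 6. Plan + apply. Pass the environment's var-file explicitly (see
+#    "Workspace conventions" below).
+terraform plan -var-file=staging.auto.tfvars -out=staging.tfplan
+terraform apply staging.tfplan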
+```
+
+After `apply` succeeds you have:
+- Two CF API tokens in TF state (deploy + admin_tunnel).
+- The staging Pages project + R2 bucket + DNS records.
+- Output values for token secrets (sensitive — see next section).
+
+## Installing token secrets into k8s + GH
+
+Tokens are SENSITIVE outputs — they are stored in TF state and are printed
+in clear text whenever `terraform output -raw <output_name>` is run. To install:
+
+```bash
+# Read the tokens (do NOT redirect to a file you'll commit).
+DEPLOY_TOKEN="$(terraform output -raw deploy_token)"
+ADMIN_TUNNEL_TOKEN="$(terraform output -raw admin_tunnel_token)"
+
+# k8s — staging namespace.
+kubectl create secret generic instant-secrets-cf \
+ -n instant-staging \
+ --from-literal=CLOUDFLARE_API_TOKEN="$DEPLOY_TOKEN" \
+ --dry-run=client -o yaml | kubectl apply -f -
+
+# GH org / repo secrets — for CI auto-deploys.
+for repo in instanodedev/api instanodedev/worker instanodedev/provisioner \
+ instanodedev/instanode-web instanodedev/dashboard \
+ instanodedev/infra instanodedev/cli instanodedev/mcp; do
+ gh secret set CLOUDFLARE_API_TOKEN -b"$DEPLOY_TOKEN" -R "$repo"
+done
+
+# Admin/tunnel token: ONLY into a separate operator-local Vault, never
+# into CI. Used break-glass for Tunnel/Access changes.
+op item create --category=ApiCredential --title="cf-admin-tunnel-staging" \
+ --vault="instanode-prod" credential="$ADMIN_TUNNEL_TOKEN"
+
+unset DEPLOY_TOKEN ADMIN_TUNNEL_TOKEN
+```
+
+## Workflow during the migration
+
+1. **Plan-on-PR.** Every PR that changes a `.tf` file under this dir
+ triggers `terraform plan` in CI; diff posted as PR comment.
+2. **Apply-on-merge.** Merge to `main` triggers `terraform apply` via
+ the workflow (gated on approval — `instanodedev/infra` already has
+ manual-apply discipline; rule 15 doesn't auto-deploy `infra`).
+3. **Per-PR contract checklist (rule 22)** still applies. A TF PR that
+ adds a new host or changes the API base URL ALSO needs the
+ synchronized code edits in `api/internal/handlers/openapi.go` +
+ `content/llms.txt` + the dashboard/cli/mcp/sdk-go base-URL constants.
+4. **Per-PR observability checklist (rule 25)** still applies. New
+ resources that emit metrics need an `instant_*` Prom rule + NR alert
+ JSON + dashboard tile + METRICS-CATALOG row in the same PR.
+
+## Workspace conventions
+
+- `terraform workspace new staging` / `terraform workspace new production`
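+- `terraform workspace select <env>` before any plan/apply
+- `var.environment` comes from the `*.auto.tfvars` files, which are NOT
+  selected by workspace: Terraform auto-loads every `*.auto.tfvars` file
+  in the directory, in lexical filename order, whichever workspace is
+  selected. With both `production.auto.tfvars` and `staging.auto.tfvars`
+  present, the staging values win unless overridden, so always pass
+  `-var-file=<env>.auto.tfvars` explicitly — in CI and during interactive
+  use. Command-line var-files take precedence over auto-loaded ones.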
+
+## File layout
+
+| File | Purpose |
+|---|---|
+| `versions.tf` | TF + provider pinning, R2 backend config |
+| `providers.tf` | CF provider (reads `CLOUDFLARE_API_TOKEN` env) |
+| `variables.tf` | account_id, zone_id, environment, token expiries |
+| `tokens.tf` | `cloudflare_account_token.deploy` + `.admin_tunnel` |
+| `r2.tf` | R2 bucket + 24h-TTL lifecycle rule on `anon/` prefix |
+| `dns.tf` | DNS records (apex / www / api / staging) with TTL 60s |
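+| `pages.tf` | Pages project for `instanode-web` (Phase 2) |
+| `cache.tf` | Cache ruleset for the api host (D-12 explicit path allowlist) |
+| `staging.tf` | Staging-only subdomains, wildcards, and Workers routes |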
+| `outputs.tf` | Sensitive token outputs (consumed by `make install-secrets`) |
+| `staging.auto.tfvars` | Workspace-scoped vars for staging |
+| `production.auto.tfvars` | Workspace-scoped vars for production |
+
+## What's NOT here (yet)
+
+- **Workers** — CEO D-1 deferred until measured TTFB benefit shows up.
+- **Hyperdrive** — same; api and DO Managed PG are same-region, no win today.
+- **D1** — KILLED per D-1.
+- **CF Email Routing** — DEFERRED; outbound stays on Brevo.
+- **Tunnels** — Phase 5 staging-only; add `tunnels.tf` when that PR ships, scoped to admin_tunnel token.
+- **Load Balancers** — pending the CF Startups operator ticket (D-6, 5–10 day lead). Once enabled, add `lb.tf`.
+- **Page Rules / Cache Rules** — Phase 4 only (api orange-cloud cut). Per D-12, the rule is an explicit path-allowlist for `/healthz`, `/openapi.json`, `/llms.txt`; NEVER Authorization-header-based.
+
+## R2 HMAC keys (NOT here)
+
+The R2 HMAC Access Key ID / Secret used by `common/storageprovider/r2/`
+are SEPARATE from the CF API token and are generated via the R2 dashboard
+"Manage R2 API Tokens" UI (NOT this Terraform). Reason: the
+`cloudflare_r2_bucket` resource doesn't issue per-bucket HMAC pairs;
+that's a one-off operator action, scoped to the specific bucket.
+
+After Phase 0 creates the staging bucket, the operator runs:
+1. Dashboard → R2 → Manage R2 API Tokens → Create
+2. Permissions: Object Read & Write
+3. Specify buckets: `instant-shared-staging` (NOT *Apply to all buckets*)
+4. TTL: 180 days
+5. Save the resulting `Access Key ID` + `Secret Access Key` into
+ `instant-secrets` as `R2_HMAC_KEY_ID` + `R2_HMAC_SECRET` (D-8 names).
+
+Repeat for `instant-shared` (prod) after staging passes 48h green (D-9).
diff --git a/terraform/cloudflare/cache.tf b/terraform/cloudflare/cache.tf
new file mode 100644
index 0000000..9864c11
--- /dev/null
+++ b/terraform/cloudflare/cache.tf
@@ -0,0 +1,96 @@
+# Cache rules for api.staging.instanode.dev (and api.instanode.dev once
+# Phase 4 flips proxied=true on the api A-record).
+#
+# D-12 (LOCKED): cache scope is an EXPLICIT path allowlist — `/healthz`,
+# `/openapi.json`, `/llms.txt`. Everything else BYPASSES cache regardless
+# of Authorization header presence. The original "bypass cache when
+# Authorization header is set" approach was deleted because (a) the
+# primitive doesn't exist on our zone tier, (b) it's a footgun if an
+# authed response ever flows through cache.
+#
+# Plus: `instant_unexpected_cached_response_total` P0 metric in the api
+# code (NOT here — handler-side) trips an alert if a request OUTSIDE
+# the allowlist ever responds with cache-hit semantics. Defense in depth.
+
+# Cache settings for the api host (D-12 explicit path allowlist).
+#
+# Shape: one catch-all rule that turns cache OFF for every path on the
+# host, followed by one rule per allowlisted path that turns it back ON.
+#
+# Ordering matters. Cache rules stack: every matching rule is applied in
+# list order and, where two matching rules set the same setting, the LAST
+# one wins. The catch-all therefore stays FIRST and the allowlist entries
+# come AFTER it so that they override the bypass.
+#
+# NOTE(review): this is a zone-level ruleset and var.zone_id has the same
+# default in every workspace — confirm staging and production are not both
+# expected to own the zone's http_request_cache_settings ruleset.
+locals {
+  # api.<zone> in production, api.staging.<zone> otherwise.
+  api_cache_hostname = "api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}"
+
+  # Host predicate shared by every rule expression below.
+  api_cache_host_match = "(http.host eq \"${local.api_cache_hostname}\")"
+}
+
+resource "cloudflare_ruleset" "api_cache_rules" {
+  zone_id     = var.zone_id
+  name        = "api-cache-rules"
+  description = "D-12 explicit-path allowlist for ${local.api_cache_hostname}"
+  kind        = "zone"
+  phase       = "http_request_cache_settings"
+
+  rules = [
+    # 1. Default: no caching for anything on the api host. Must stay first
+    #    so the allowlist rules below can override it.
+    {
+      description = "bypass cache for all api.* paths by default"
+      expression  = local.api_cache_host_match
+      action      = "set_cache_settings"
+      enabled     = true
+      action_parameters = {
+        cache = false
+      }
+    },
+
+    # 2. /healthz — 30s at the edge, never cached by browsers.
+    {
+      description = "cache /healthz at edge for 30s — same SHA across instances"
+      expression  = "${local.api_cache_host_match} and (http.request.uri.path eq \"/healthz\")"
+      action      = "set_cache_settings"
+      enabled     = true
+      action_parameters = {
+        cache = true
+        edge_ttl = {
+          mode    = "override_origin"
+          default = 30
+        }
+        browser_ttl = {
+          mode    = "override_origin"
+          default = 0
+        }
+      }
+    },
+
+    # 3. /openapi.json — 5 minutes at the edge, 1 minute in browsers.
+    #    Re-fetched often by tooling, changes rarely.
+    {
+      description = "cache /openapi.json at edge for 5min"
+      expression  = "${local.api_cache_host_match} and (http.request.uri.path eq \"/openapi.json\")"
+      action      = "set_cache_settings"
+      enabled     = true
+      action_parameters = {
+        cache = true
+        edge_ttl = {
+          mode    = "override_origin"
+          default = 300
+        }
+        browser_ttl = {
+          mode    = "override_origin"
+          default = 60
+        }
+      }
+    },
+
+    # 4. /llms.txt — 1 hour at the edge, 10 minutes in browsers. Static
+    #    content from the content repo; refreshed only on a manual re-sync.
+    {
+      description = "cache /llms.txt at edge for 1h"
+      expression  = "${local.api_cache_host_match} and (http.request.uri.path eq \"/llms.txt\")"
+      action      = "set_cache_settings"
+      enabled     = true
+      action_parameters = {
+        cache = true
+        edge_ttl = {
+          mode    = "override_origin"
+          default = 3600
+        }
+        browser_ttl = {
+          mode    = "override_origin"
+          default = 600
+        }
+      }
+    },
+  ]
+}
diff --git a/terraform/cloudflare/dns.tf b/terraform/cloudflare/dns.tf
new file mode 100644
index 0000000..fbfd48c
--- /dev/null
+++ b/terraform/cloudflare/dns.tf
@@ -0,0 +1,55 @@
+# DNS records under management.
+#
+# Pre-cutover ritual (D-3): TTL must be 60s for ≥48h BEFORE any cut.
+# Setting it that low here means terraform plan/apply itself satisfies
+# the pre-step the first time we touch the record.
+#
+# `proxied = true` = CF orange-cloud; `false` = grey-cloud (DNS only, no
+# proxy). Today: marketing apex is orange (Phase 0 baseline), api is grey
+# (becomes orange in Phase 4 — flip this flag in that phase's PR).
+
+# Origins that the records below point at. Both are locals, so they have
+# the same value in every workspace.
+locals {
+ # Pages hostname the marketing apex CNAMEs to.
+ # NOTE(review): a local cannot be set from a *.auto.tfvars file; making
+ # this per-environment needs a variable or a reference to the Pages
+ # project resource.
+ marketing_origin = "instanode-web.pages.dev"
+ api_origin = "152.42.154.144" # DigitalOcean LB; replaced with LB pool resource in Phase 4
+}
+
+# Marketing apex: a proxied CNAME at the zone apex pointing at
+# local.marketing_origin (Cloudflare flattens apex CNAMEs).
+#
+# ttl = 1 means "automatic". Cloudflare fixes the TTL of proxied records to
+# Auto, so a 60s value cannot take effect here; the D-3 "TTL 60s" ritual
+# only applies to grey-cloud records.
+#
+# NOTE(review): not count-gated, so every workspace declares this record in
+# the shared zone — confirm only one workspace is meant to own it.
+resource "cloudflare_dns_record" "apex" {
+  zone_id = var.zone_id
+  name    = var.zone_name
+  type    = "CNAME"
+  content = local.marketing_origin
+  ttl     = 1
+  proxied = true
+  comment = "marketing apex; CNAME-flattened to Pages project"
+}
+
+# www host: a proxied CNAME onto the apex.
+#
+# ttl = 1 means "automatic". Cloudflare fixes the TTL of proxied records to
+# Auto, so a 60s value cannot take effect here.
+#
+# NOTE(review): not count-gated, so every workspace declares this record in
+# the shared zone — confirm only one workspace is meant to own it.
+resource "cloudflare_dns_record" "www" {
+  zone_id = var.zone_id
+  name    = "www.${var.zone_name}"
+  type    = "CNAME"
+  content = var.zone_name
+  ttl     = 1
+  proxied = true
+  comment = "www → apex redirect handled by CF page rule"
+}
+
+# api host: a grey-cloud (DNS-only) A record pointing straight at
+# local.api_origin. The 60s TTL is the D-3 pre-cutover setting.
+# NOTE(review): when Phase 4 flips proxied to true, ttl probably has to
+# become 1 (automatic) in the same change — Cloudflare manages the TTL of
+# proxied records.
+# NOTE(review): not count-gated, so every workspace declares this record in
+# the shared zone — confirm only one workspace is meant to own it.
+resource "cloudflare_dns_record" "api" {
+ zone_id = var.zone_id
+ name = "api.${var.zone_name}"
+ type = "A"
+ content = local.api_origin
+ ttl = 60
+ proxied = false # Phase 4 flips this to true after CF orange-cloud cache rules are applied
+ comment = "api; grey-cloud today, orange-cloud per Phase 4 cut (D-3)"
+}
+
+# staging.<zone>: proxied CNAME for the staging mirror (D-2). Exists only in
+# the staging workspace.
+#
+# ttl = 1 means "automatic". Cloudflare fixes the TTL of proxied records to
+# Auto, so a 60s value cannot take effect here.
+#
+# NOTE(review): `content` is a placeholder to be replaced once the Pages
+# project is up — confirm the final target hostname.
+resource "cloudflare_dns_record" "staging" {
+  count   = var.environment == "staging" ? 1 : 0
+  zone_id = var.zone_id
+  name    = "staging.${var.zone_name}"
+  type    = "CNAME"
+  content = "instant-staging.${var.zone_name}.cdn.cloudflare.net" # Pages preview hostname; replaced after Pages project is up
+  ttl     = 1
+  proxied = true
+  comment = "staging mirror per D-2"
+}
diff --git a/terraform/cloudflare/outputs.tf b/terraform/cloudflare/outputs.tf
new file mode 100644
index 0000000..3b123f3
--- /dev/null
+++ b/terraform/cloudflare/outputs.tf
@@ -0,0 +1,34 @@
+# Token VALUES are sensitive — operator must `terraform output -raw deploy_token`
+# and immediately pipe into `kubectl create secret` / `gh secret set`. Never
+# `terraform output` (no -raw) in a CI log: the redacted form ("(sensitive)")
+# is still a footgun if anyone removes `sensitive = true`.
+
+# --- Token A (deploy) --------------------------------------------------------
+
+# Identifier of the deploy token; not a secret.
+output "deploy_token_id" {
+  description = "Token A id (non-sensitive; safe in CI logs)."
+  value       = cloudflare_account_token.deploy.id
+}
+
+# Secret value of the deploy token; marked sensitive so plain
+# `terraform output` redacts it.
+output "deploy_token" {
+  description = "Token A secret. Pipe directly into k8s/GH secret; never log."
+  sensitive   = true
+  value       = cloudflare_account_token.deploy.value
+}
+
+# --- Token B (admin / tunnel) ------------------------------------------------
+
+# Identifier of the admin/tunnel token; not a secret.
+output "admin_tunnel_token_id" {
+  description = "Token B id (non-sensitive)."
+  value       = cloudflare_account_token.admin_tunnel.id
+}
+
+# Secret value of the admin/tunnel token; marked sensitive so plain
+# `terraform output` redacts it.
+output "admin_tunnel_token" {
+  description = "Token B secret. Operator-only; never put into CI."
+  sensitive   = true
+  value       = cloudflare_account_token.admin_tunnel.value
+}
+
+# Pass-through of the Cloudflare account id this configuration targets.
+output "account_id" {
+  value       = var.account_id
+  description = "Cloudflare account id in use (non-sensitive)."
+}
+
+# Pass-through of the Cloudflare zone id this configuration targets.
+output "zone_id" {
+  value       = var.zone_id
+  description = "Cloudflare zone id in use (non-sensitive)."
+}
diff --git a/terraform/cloudflare/pages.tf b/terraform/cloudflare/pages.tf
new file mode 100644
index 0000000..49f0037
--- /dev/null
+++ b/terraform/cloudflare/pages.tf
@@ -0,0 +1,61 @@
+# Cloudflare Pages project for instanode-web (marketing site).
+# Phase 2 in FINAL-PLAN.md. Dashboard-on-Pages is KILLED per D-5;
+# do NOT add a second `cloudflare_pages_project` for dashboard here.
+#
+# One project per workspace: "instanode-web" in production,
+# "instanode-web-staging" otherwise. Builds come from the `main` branch of
+# instanodedev/instanode-web (`npm run build`, output in `dist`).
+resource "cloudflare_pages_project" "instanode_web" {
+  account_id        = var.account_id
+  name              = var.environment == "production" ? "instanode-web" : "instanode-web-staging"
+  production_branch = "main"
+
+  build_config = {
+    build_command       = "npm run build"
+    destination_dir     = "dist"
+    root_dir            = ""
+    web_analytics_tag   = null
+    web_analytics_token = null
+  }
+
+  source = {
+    type = "github"
+    config = {
+      owner                         = "instanodedev"
+      repo_name                     = "instanode-web"
+      production_branch             = "main"
+      pr_comments_enabled           = true
+      production_deployment_enabled = true
+      preview_deployment_setting    = "all"
+      preview_branch_includes       = ["*"]
+      preview_branch_excludes       = []
+    }
+  }
+
+  deployment_configs = {
+    production = {
+      compatibility_date  = "2026-05-30"
+      compatibility_flags = []
+      env_vars = {
+        # API base URL baked into the build. Derived from var.zone_name so
+        # it always agrees with the api host used by the rest of this
+        # module: api.<zone> in production, api.staging.<zone> otherwise.
+        VITE_API_URL = {
+          type  = "plain_text"
+          value = "https://api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}"
+        }
+        VITE_ENV = {
+          type  = "plain_text"
+          value = var.environment
+        }
+      }
+    }
+    # Preview deployments get no env vars from this configuration.
+    preview = {
+      compatibility_date  = "2026-05-30"
+      compatibility_flags = []
+    }
+  }
+}
+
+# Custom domain binding for the PRODUCTION marketing site: attaches the
+# zone apex to the Pages project.
+#
+# Production-only. The staging attachment (staging.<zone>) is declared in
+# staging.tf as cloudflare_pages_domain.staging_marketing; declaring it
+# here too would bind the same domain to the same project twice in the
+# staging workspace.
+#
+# Rollout gate: only meant to go live after Phase 2 acceptance (D-9
+# equivalent for marketing: zero broken-link diff).
+#
+# Because of `count`, references must use
+# cloudflare_pages_domain.instanode_web[0].
+resource "cloudflare_pages_domain" "instanode_web" {
+  count        = var.environment == "production" ? 1 : 0
+  account_id   = var.account_id
+  project_name = cloudflare_pages_project.instanode_web.name
+  name         = var.zone_name
+}
diff --git a/terraform/cloudflare/production.auto.tfvars b/terraform/cloudflare/production.auto.tfvars
new file mode 100644
index 0000000..0c188fe
--- /dev/null
+++ b/terraform/cloudflare/production.auto.tfvars
@@ -0,0 +1,4 @@
+# Variable values for the production environment.
+environment = "production"
+
+# Token expiry timestamps read by tokens.tf (deploy / admin-tunnel).
+deploy_token_expires_on = "2026-11-26T23:59:59Z"
+admin_tunnel_token_expires_on = "2026-08-28T23:59:59Z"
diff --git a/terraform/cloudflare/providers.tf b/terraform/cloudflare/providers.tf
new file mode 100644
index 0000000..a89234e
--- /dev/null
+++ b/terraform/cloudflare/providers.tf
@@ -0,0 +1,8 @@
+# Cloudflare provider. Deliberately has no arguments: the credential is
+# taken from the environment, never from this file.
+provider "cloudflare" {
+ # Reads CLOUDFLARE_API_TOKEN from env. Operator uses Token A
+ # ("instanode-migration-deploy") for everything except Tunnel/Access
+ # changes — for those, switch the env var to Token B in a separate
+ # apply (see _modules/tunnel/README.md).
+ #
+ # NOTE(review): tokens.tf manages account API tokens, and the permission
+ # labels on Token A list no API-token permission — confirm which
+ # credential is expected to run applies that touch tokens.tf.
+ #
+ # Never commit a value here.
+}
diff --git a/terraform/cloudflare/r2.tf b/terraform/cloudflare/r2.tf
new file mode 100644
index 0000000..61206ba
--- /dev/null
+++ b/terraform/cloudflare/r2.tf
@@ -0,0 +1,38 @@
+# R2 buckets. Per CEO D-1 + DevOps D-4, staging gets a parallel bucket
+# (`instant-shared-staging`); production keeps the existing name and
+# moves traffic into it via the storageprovider env-flip (D-8 names).
+#
+# Lifecycle rule: anon/ prefix expires after 24h (matches the platform's
+# anon-resource TTL contract — pay-from-day-one, no trial creep).
+
+locals {
+ # Name of the shared bucket for this workspace: "instant-shared" in
+ # production, "instant-shared-staging" for any other environment.
+ bucket_name = var.environment == "production" ? "instant-shared" : "instant-shared-staging"
+}
+
+# The shared R2 bucket for this environment (name from local.bucket_name).
+# NOTE(review): the location comment below pairs a "West" region with an
+# NYC3 cluster — confirm WNAM is really the intended location hint.
+resource "cloudflare_r2_bucket" "shared" {
+ account_id = var.account_id
+ name = local.bucket_name
+ location = "WNAM" # North America West — closest to our DO NYC3 cluster latency-wise
+ storage_class = "Standard"
+}
+
+# Lifecycle policy for the shared bucket: objects under the anon/ prefix
+# are deleted once they are 24 hours old. This mirrors the platform
+# contract that anonymous resources expire after 24h (CLAUDE.md
+# "anonymous (24h TTL) is the only free tier").
+resource "cloudflare_r2_bucket_lifecycle" "shared_anon_24h" {
+  bucket_name = cloudflare_r2_bucket.shared.name
+  account_id  = var.account_id
+
+  rules = [
+    {
+      enabled = true
+      id      = "anon-24h"
+
+      # Only objects whose key starts with anon/ are affected.
+      conditions = {
+        prefix = "anon/"
+      }
+
+      # Delete by object age; max_age is expressed in seconds.
+      delete_objects_transition = {
+        condition = {
+          max_age = 86400 # 24h in seconds
+          type    = "Age"
+        }
+      }
+    },
+  ]
+}
diff --git a/terraform/cloudflare/staging.auto.tfvars b/terraform/cloudflare/staging.auto.tfvars
new file mode 100644
index 0000000..b7489f6
--- /dev/null
+++ b/terraform/cloudflare/staging.auto.tfvars
@@ -0,0 +1,6 @@
+# Variable values for the staging environment.
+environment = "staging"
+
+# Tokens rotate every 180d (deploy) / 90d (admin). Override per env
+# if staging is on a shorter cycle. Both values are read by tokens.tf.
+deploy_token_expires_on = "2026-11-26T23:59:59Z"
+admin_tunnel_token_expires_on = "2026-08-28T23:59:59Z"
diff --git a/terraform/cloudflare/staging.tf b/terraform/cloudflare/staging.tf
new file mode 100644
index 0000000..10deda6
--- /dev/null
+++ b/terraform/cloudflare/staging.tf
@@ -0,0 +1,182 @@
+# Staging-environment subdomains under staging.instanode.dev.
+#
+# All resources here are count-gated on `var.environment == "staging"` so
+# they only materialize in the staging workspace; the production workspace
+# plan shows no changes from this file.
+#
+# DIVISION OF RESPONSIBILITY between TF and wrangler:
+#
+# - **TF owns** wildcard records, env-level subdomains (dashboard, webhook),
+# and the deployment-app wildcard. These don't have a 1:1 Worker/Container
+# mapping or they're pre-deploy plumbing.
+# - **Wrangler owns** service-specific hostnames via `custom_domain = true`
+# in each wrangler.toml. wrangler auto-creates the DNS + cert + route on
+# first deploy. That covers: api.staging.instanode.dev (managed by
+# infra/wrangler/api/wrangler.toml).
+#
+# DO NOT add explicit TF records for hostnames wrangler is already
+# custom-domain-claiming — wrangler will fail to deploy with "DNS record
+# already exists" if both manage it.
+
+locals {
+  # Parent name for every staging subdomain: staging.<zone>.
+  staging_stem = "staging.${var.zone_name}"
+
+  # True only in the staging environment; every resource in this file is
+  # count-gated on it.
+  is_staging = var.environment == "staging"
+}
+
+# -----------------------------------------------------------------------------
+# Wildcards under *.staging.instanode.dev
+# -----------------------------------------------------------------------------
+#
+# Each per-tenant service in wrangler/ uses a hostname-shard pattern:
+# - pg-customer-.staging.instanode.dev (pg-customers Container)
+# - mongo-.staging.instanode.dev (mongodb Container)
+# - redis-.staging.instanode.dev (redis-provision Container)
+# - nats-.staging.instanode.dev (nats Container)
+#
+# A single proxied wildcard CNAME catches all of them; the Worker shells
+# in each wrangler service extract the tenant from the hostname and
+# dispatch to the right Durable Object via `idFromName(tenant)`.
+
+# Proxied wildcard CNAME for *.staging.<zone>; staging workspace only.
+resource "cloudflare_dns_record" "staging_wildcard" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "*.${local.staging_stem}"
+  type    = "CNAME"
+  # CF requires SOME content for proxied CNAMEs; this is a placeholder. The
+  # cloudflare_workers_route below routes traffic to the correct Worker
+  # regardless of what's here. A 404 sink is intentional — any unrouted
+  # subdomain hits CF's default 404 page.
+  content = local.staging_stem
+  # 1 = automatic. Cloudflare fixes the TTL of proxied records to Auto, so a
+  # 60s value cannot take effect on this record.
+  ttl     = 1
+  proxied = true
+  comment = "wildcard for per-tenant CF Container services in staging; routed via cloudflare_workers_route below"
+}
+
+# -----------------------------------------------------------------------------
+# Deployment-app wildcard: *.deployment.staging.instanode.dev
+# -----------------------------------------------------------------------------
+#
+# Mirror of prod's `*.deployment.instanode.dev`. Every /deploy/new staging
+# call provisions an app at `.deployment.staging.instanode.dev`.
+# Wrangler-managed Containers for the deploy compute target this wildcard;
+# the api Worker creates a DNS-less custom-domain claim per slug, but the
+# wildcard ensures any future deploy slug resolves to CF before its
+# custom-domain claim lands.
+
+# Proxied wildcard CNAME for *.deployment.staging.<zone>, pointing at the
+# deployment.staging.<zone> anchor record; staging workspace only.
+resource "cloudflare_dns_record" "staging_deployment_wildcard" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "*.deployment.${local.staging_stem}"
+  type    = "CNAME"
+  content = "deployment.${local.staging_stem}"
+  # 1 = automatic. Cloudflare fixes the TTL of proxied records to Auto, so a
+  # 60s value cannot take effect on this record.
+  ttl     = 1
+  proxied = true
+  comment = "wildcard for /deploy/new staging apps (mirrors prod *.deployment.instanode.dev)"
+}
+
+# Anchor for the deployment wildcard CNAME (the wildcard's content needs
+# a real record at the parent name). Proxied AAAA to the IPv6 discard
+# prefix, so there is no reachable origin behind it.
+resource "cloudflare_dns_record" "staging_deployment_anchor" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "deployment.${local.staging_stem}"
+  type    = "AAAA"
+  content = "100::" # IPv6 discard prefix — never reachable; CF proxied front-end terminates
+  # 1 = automatic. Cloudflare fixes the TTL of proxied records to Auto, so a
+  # 60s value cannot take effect on this record.
+  ttl     = 1
+  proxied = true
+  comment = "anchor for deployment wildcard CNAME (CF requires a real record at the parent)"
+}
+
+# -----------------------------------------------------------------------------
+# Webhook subdomain: webhook.staging.instanode.dev
+# -----------------------------------------------------------------------------
+#
+# /webhook/new staging endpoints return a URL at this host. Routed to the
+# api Container via a Worker route. Separate subdomain (vs api.staging.)
+# so customers can filter outbound webhook traffic by destination host.
+
+# webhook.staging.<zone>: proxied AAAA placeholder; staging workspace only.
+# NOTE(review): no Workers route for this host is declared in this file —
+# confirm where the route to the api Container is configured.
+resource "cloudflare_dns_record" "staging_webhook" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "webhook.${local.staging_stem}"
+  type    = "AAAA"
+  content = "100::" # placeholder; CF orange-cloud handles routing
+  # 1 = automatic. Cloudflare fixes the TTL of proxied records to Auto, so a
+  # 60s value cannot take effect on this record.
+  ttl     = 1
+  proxied = true
+  comment = "staging /webhook/new receiver subdomain"
+}
+
+# -----------------------------------------------------------------------------
+# Dashboard subdomain: dashboard.staging.instanode.dev
+# -----------------------------------------------------------------------------
+#
+# CEO killed dashboard-on-Pages for PROD (D-5) but staging dashboard is
+# useful for QA. Points at the same dashboard Pages project at the
+# `staging` branch preview hostname. NOT enabled for production — D-5
+# stands.
+
+# dashboard.staging.<zone>: proxied CNAME to the staging dashboard Pages
+# hostname; staging workspace only (D-5 keeps the prod dashboard off Pages).
+resource "cloudflare_dns_record" "staging_dashboard" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "dashboard.${local.staging_stem}"
+  type    = "CNAME"
+  content = "instanode-dashboard-staging.pages.dev" # set after dashboard Pages project is created
+  # 1 = automatic. Cloudflare fixes the TTL of proxied records to Auto, so a
+  # 60s value cannot take effect on this record.
+  ttl     = 1
+  proxied = true
+  comment = "staging dashboard — QA-only; D-5 keeps prod dashboard off Pages"
+}
+
+# -----------------------------------------------------------------------------
+# Workers Routes for per-tenant wildcards
+# -----------------------------------------------------------------------------
+#
+# `custom_domain = true` in wrangler.toml does NOT support wildcards.
+# Wildcards need cloudflare_workers_route + a wildcard DNS record (done
+# above). Each route binds a pattern to a specific Worker name; wrangler
+# deploys the Worker, TF wires the route.
+#
+# All four routes are staging-only (count-gated on local.is_staging) and
+# differ only in hostname prefix and target script.
+#
+# NOTE(review): each pattern puts `*` in the MIDDLE of the hostname
+# (`<prefix>-*.staging.<zone>`). Cloudflare's route-matching docs describe
+# wildcards at the start of the hostname and the end of the path only —
+# verify these patterns are accepted before relying on them.
+
+# pg-customer-<tenant> hostnames → pg-customers Worker.
+resource "cloudflare_workers_route" "staging_pg_customers" {
+ count = local.is_staging ? 1 : 0
+ zone_id = var.zone_id
+ pattern = "pg-customer-*.${local.staging_stem}/*"
+ script = "instanode-pg-customers-staging"
+}
+
+# mongo-<tenant> hostnames → mongodb Worker.
+resource "cloudflare_workers_route" "staging_mongodb" {
+ count = local.is_staging ? 1 : 0
+ zone_id = var.zone_id
+ pattern = "mongo-*.${local.staging_stem}/*"
+ script = "instanode-mongodb-staging"
+}
+
+# redis-<tenant> hostnames → redis-provision Worker.
+resource "cloudflare_workers_route" "staging_redis" {
+ count = local.is_staging ? 1 : 0
+ zone_id = var.zone_id
+ pattern = "redis-*.${local.staging_stem}/*"
+ script = "instanode-redis-provision-staging"
+}
+
+# nats-<tenant> hostnames → nats Worker.
+resource "cloudflare_workers_route" "staging_nats" {
+ count = local.is_staging ? 1 : 0
+ zone_id = var.zone_id
+ pattern = "nats-*.${local.staging_stem}/*"
+ script = "instanode-nats-staging"
+}
+
+# -----------------------------------------------------------------------------
+# Pages custom domain — staging marketing site
+# -----------------------------------------------------------------------------
+#
+# The Pages project itself is declared in pages.tf with the
+# `var.environment == "staging" ? "instanode-web-staging" : "instanode-web"`
+# name pattern. The custom-domain attachment is here so prod's pages.tf
+# stays simple.
+
+# Attaches staging.<zone> to the staging Pages project; staging workspace
+# only.
+resource "cloudflare_pages_domain" "staging_marketing" {
+  count      = local.is_staging ? 1 : 0
+  account_id = var.account_id
+  # Reference the project resource rather than repeating its name: the value
+  # is the same ("instanode-web-staging" in this workspace), but the
+  # reference makes Terraform create the project before the domain and keeps
+  # the two in sync if the project is ever renamed.
+  project_name = cloudflare_pages_project.instanode_web.name
+  name         = local.staging_stem
+  depends_on   = [cloudflare_dns_record.staging]
+}
diff --git a/terraform/cloudflare/tokens.tf b/terraform/cloudflare/tokens.tf
new file mode 100644
index 0000000..8a37ad0
--- /dev/null
+++ b/terraform/cloudflare/tokens.tf
@@ -0,0 +1,75 @@
+# Two scoped API tokens replace the Global API Key for CI / DevOps use.
+# Source: exported from CF dashboard 2026-05-30, renamed to avoid the
+# default `example_account_token` collision.
+#
+# WARNING — token values are SENSITIVE outputs. They appear once in TF
+# state after `apply`. Operator MUST run the `make install-secrets`
+# helper (see Makefile) to push them into k8s + GH org secrets, then
+# rotate state.
+
+# Token A — day-to-day deploy + DNS + R2 + Pages + Workers + Page Rules
+# + Load Balancing + Cache Purge + Zone Settings. Account-broad, zone-
+# narrow on instanode.dev. Used by CI.
+#
+# Two allow policies: the first is scoped to the single zone var.zone_id,
+# the second to the whole account var.account_id. The token name carries
+# the environment suffix and the expiry comes from
+# var.deploy_token_expires_on.
+#
+# NOTE(review): the permission-group labels in the trailing comments are
+# hand-written. "Account Settings:Read" is attached to e086da7e… here and
+# to 1e13c512… on the admin_tunnel token, so at least one label is wrong —
+# verify every id against the account's permission-groups list.
+resource "cloudflare_account_token" "deploy" {
+ account_id = var.account_id
+ name = "instanode-migration-deploy-${var.environment}"
+ expires_on = var.deploy_token_expires_on
+
+ policies = [
+ # Zone-scoped permissions on instanode.dev (zone_id pinned).
+ {
+ effect = "allow"
+ permission_groups = [
+ { id = "c4df38be41c247b3b4b7702e76eadae0" }, # Zone:Read
+ { id = "3030687196b94b638145a3953da2b699" }, # DNS:Edit
+ { id = "c8fed203ed3043cba015a93ad1616f1f" }, # Zone Settings:Edit
+ { id = "c03055bc037c4ea9afb9a9f104b7b721" }, # Cache Purge:Purge
+ { id = "e17beae8b8cb423a99b1730f21238bed" }, # Page Rules:Edit
+ { id = "ed07f6c337da4195b4e72a1fb2c6bcae" }, # SSL and Certificates:Edit
+ { id = "6d7f2f5f5b1d4a0e9081fdc98d432fd1" }, # Load Balancers:Edit
+ { id = "4755a26eedb94da69e1066d98aa820be" }, # Apps:Edit (zone-side)
+ ]
+ # Resource selector: exactly one zone, all of its sub-resources.
+ resources = jsonencode({
+ "com.cloudflare.api.account.zone.${var.zone_id}" = "*"
+ })
+ },
+ # Account-scoped permissions for resources that aren't zone-bound.
+ {
+ effect = "allow"
+ permission_groups = [
+ { id = "dc44f27f48ab405392a5f69fe822bd01" }, # Workers Scripts:Edit
+ { id = "8d28297797f24fb8a0c332fe0866ec89" }, # Workers KV Storage:Edit
+ { id = "bf7481a1826f439697cb59a20b22293e" }, # Workers R2 Storage:Edit
+ { id = "f7f0eda5697f475c90846e879bab8666" }, # Cloudflare Pages:Edit
+ { id = "e086da7e2179491d91ee5f35b3ca210a" }, # Account Settings:Read
+ { id = "d2a1802cc9a34e30852f8b33869b2f3c" }, # LB Monitors & Pools:Edit
+ { id = "c1fde68c7bcc44588cbb6ddbc16d6480" }, # Account Analytics:Read
+ ]
+ # Resource selector: the whole account.
+ resources = jsonencode({
+ "com.cloudflare.api.account.${var.account_id}" = "*"
+ })
+ },
+ ]
+}
+
+# Token B — break-glass / rare-use Tunnel + Access. Smaller scope, shorter
+# expiry. NOT used by CI; kept as separate apply for blast-radius isolation.
+#
+# A single allow policy scoped to the whole account var.account_id. The
+# token name carries the environment suffix and the expiry comes from
+# var.admin_tunnel_token_expires_on.
+#
+# NOTE(review): the permission-group labels in the trailing comments are
+# hand-written. "Account Settings:Read" is attached to 1e13c512… here and
+# to e086da7e… on the deploy token, so at least one label is wrong —
+# verify every id against the account's permission-groups list.
+resource "cloudflare_account_token" "admin_tunnel" {
+ account_id = var.account_id
+ name = "instanode-migration-admin-tunnel-${var.environment}"
+ expires_on = var.admin_tunnel_token_expires_on
+
+ policies = [{
+ effect = "allow"
+ permission_groups = [
+ { id = "ad7a6f88896d498f98eb30592abfbbf4" }, # Cloudflare Tunnel:Edit
+ { id = "77efc2c0724d4c4eb94bfd9656247130" }, # Access: Apps and Policies:Edit
+ { id = "db37e5f1cb1a4e1aabaef8deaea43575" }, # Access: Service Tokens:Edit
+ { id = "a1c0fec57cf94af79479a6d827fa518c" }, # Access: Organizations, Identity Providers:Edit
+ { id = "1e13c5124ca64b72b1969a67e8829049" }, # Account Settings:Read
+ ]
+ # Resource selector: the whole account.
+ resources = jsonencode({
+ "com.cloudflare.api.account.${var.account_id}" = "*"
+ })
+ }]
+}
diff --git a/terraform/cloudflare/variables.tf b/terraform/cloudflare/variables.tf
new file mode 100644
index 0000000..7e9f005
--- /dev/null
+++ b/terraform/cloudflare/variables.tf
@@ -0,0 +1,37 @@
+# Cloudflare account every resource in this module is created under.
+variable "account_id" {
+  type        = string
+  description = "Cloudflare account ID (CF for Startups credit-tagged account)."
+  default     = "613a9e74136364c781a8e258326019f9"
+}
+
+# Zone that DNS records, rulesets, and Workers routes are attached to.
+variable "zone_id" {
+  type        = string
+  description = "Cloudflare zone ID for instanode.dev."
+  default     = "08a1a569d2d6f9a713dc6d62103c5dc6"
+}
+
+# DNS name of the zone; used to build every hostname in this module
+# (api.<zone>, staging.<zone>, www.<zone>, ...).
+variable "zone_name" {
+  type        = string
+  description = "DNS name of the zone identified by zone_id; the suffix of every hostname this module declares."
+  default     = "instanode.dev"
+}
+
+# Environment switch used throughout the module to pick names, hostnames,
+# and count-gates. It has no default, so a value must always be supplied,
+# and it is NOT derived from the selected Terraform workspace.
+variable "environment" {
+  type        = string
+  description = "staging or production. Supplied via -var-file=<env>.auto.tfvars; must match the selected `terraform workspace`, which does not set it."
+  validation {
+    condition     = contains(["staging", "production"], var.environment)
+    error_message = "environment must be one of: staging, production."
+  }
+}
+
+# Expiry of the deploy token (Token A). Validated at plan time so that a
+# malformed date is rejected before the Cloudflare API sees it.
+variable "deploy_token_expires_on" {
+  type        = string
+  description = "RFC3339 expiry for the deploy token. Rotate every ≤180d."
+  default     = "2026-11-26T23:59:59Z"
+
+  validation {
+    # timeadd only accepts RFC 3339 timestamps, so can() is false for
+    # anything else; adding "0s" leaves the value itself untouched.
+    condition     = can(timeadd(var.deploy_token_expires_on, "0s"))
+    error_message = "The deploy_token_expires_on value must be an RFC 3339 timestamp, for example 2026-11-26T23:59:59Z."
+  }
+}
+
+# Expiry of the admin/tunnel token (Token B). Validated at plan time so
+# that a malformed date is rejected before the Cloudflare API sees it.
+variable "admin_tunnel_token_expires_on" {
+  type        = string
+  description = "RFC3339 expiry for the admin/tunnel token. Rotate every ≤90d."
+  default     = "2026-08-28T23:59:59Z"
+
+  validation {
+    # timeadd only accepts RFC 3339 timestamps, so can() is false for
+    # anything else; adding "0s" leaves the value itself untouched.
+    condition     = can(timeadd(var.admin_tunnel_token_expires_on, "0s"))
+    error_message = "The admin_tunnel_token_expires_on value must be an RFC 3339 timestamp, for example 2026-08-28T23:59:59Z."
+  }
+}
diff --git a/terraform/cloudflare/versions.tf b/terraform/cloudflare/versions.tf
new file mode 100644
index 0000000..942c3ae
--- /dev/null
+++ b/terraform/cloudflare/versions.tf
@@ -0,0 +1,27 @@
+# Terraform core + provider pinning and the remote state backend.
+terraform {
+  # 1.6 is the floor, not 1.4: the S3 backend arguments used below
+  # (use_path_style, skip_requesting_account_id, skip_s3_checksum) and the
+  # `endpoints` backend-config passed at init time were introduced with the
+  # S3 backend rework in Terraform 1.6. Older binaries fail at `init` with
+  # unsupported-argument errors.
+  required_version = ">= 1.6"
+
+  required_providers {
+    cloudflare = {
+      source  = "cloudflare/cloudflare"
+      version = "~> 5.0"
+    }
+  }
+
+  # State lives in R2 (S3-compatible). The bucket "instanode-tf-state" must
+  # be created out-of-band before `terraform init` — see README §Bootstrap.
+  # Operator passes -backend-config="..." at init time; we DON'T hardcode
+  # the account-specific endpoint or HMAC creds here.
+  #
+  # The skip_* flags turn off AWS-specific checks that do not apply to a
+  # non-AWS S3-compatible endpoint.
+  backend "s3" {
+    bucket                      = "instanode-tf-state"
+    key                         = "cloudflare/terraform.tfstate"
+    region                      = "auto"
+    use_path_style              = true
+    skip_credentials_validation = true
+    skip_metadata_api_check     = true
+    skip_region_validation      = true
+    skip_requesting_account_id  = true
+    skip_s3_checksum            = true
+    encrypt                     = true
+  }
+}
diff --git a/wrangler/README.md b/wrangler/README.md
new file mode 100644
index 0000000..db2b867
--- /dev/null
+++ b/wrangler/README.md
@@ -0,0 +1,97 @@
+# Wrangler — CF Containers for staging
+
+This directory deploys instanode.dev services as **Cloudflare Containers**
+to the **staging** environment. Each service has its own subdir with a
+`wrangler.toml` + a tiny Worker shell (`src/worker.ts`) that exposes the
+Container via a Durable Object binding.
+
+Production does NOT use this — see the `production-` workflow when written.
+Per user direction 2026-05-30: staging is CF-only, ephemeral state acceptable.
+
+## Why wrangler, not Terraform
+
+The `cloudflare/cloudflare` Terraform provider (v5.19.1 as of bootstrap) does
+NOT yet expose a `cloudflare_container` resource. Verified by `terraform
+providers schema -json | jq '.. | keys?' | grep container` → empty.
+
+Until the provider catches up, we manage Containers via `wrangler` and
+**Terraform manages everything else**: DNS, R2, Pages, Hyperdrive, KV,
+Queues, secrets — see `../terraform/cloudflare/`.
+
+When `cloudflare_container` ships, we'll swap in. Until then, the
+boundary is clean:
+
+| Surface | Tool |
+|---|---|
+| DNS records, R2 buckets, Pages projects, Hyperdrive config, API tokens | **Terraform** (`../terraform/cloudflare/`) |
+| CF Containers (api/worker/provisioner + stateful staging services) | **Wrangler** (this dir) |
+| k8s manifests (production data plane until that migrates) | **kubectl** (`../k8s/`) |
+
+## Ephemeral-state acceptance criterion
+
+CF Containers wipe disk every time an instance goes to sleep (which fires
+on traffic-quiet, not just intentional restart). Source:
+https://developers.cloudflare.com/containers/platform-details/
+
+This means our staging Postgres / Mongo / Redis / NATS containers WILL
+lose their data, mid-test sometimes. E2E test design MUST tolerate this:
+
+1. **Every test seeds its own fixtures** at start; no test assumes state
+ from a prior test.
+2. **No "deploy now, verify in 2h" tests** — the container may have
+ slept and lost its state in between.
+3. **Tests that span multiple HTTP calls** must complete within one
+ container-active window (typically minutes).
+4. **`/db/new` in staging** returns a connection string that may stop
+ working when the backing Container sleeps. Documented in the staging
+ API responses.
+5. **Synthetic monitors** keep the high-traffic Containers warm; cold
+ ones are accepted as ephemeral.
+
+These tradeoffs are explicit and user-blessed per the CF-only staging
+decision. Production has a different host (TBD — not in this dir).
+
+## Per-service layout
+
+Each subdir contains:
+
+```
+infra/wrangler/<service>/
+├── wrangler.toml # CF Container + Worker config
+├── src/
+│ └── worker.ts # Tiny Worker shell that wraps the Container DO
+├── Dockerfile # Optional override; defaults to ../../<service>/Dockerfile
+└── README.md # Service-specific notes (image source, env vars, ports)
+```
+
+The actual service code (api, worker, provisioner) lives in its own repo
+under `instanodedev/` and produces a Docker image that wrangler ships.
+For services without a separate repo (pg-platform, pg-customers, mongodb,
+redis-provision, nats), we use upstream public images (`postgres:16`,
+`mongo:7`, `redis:7`, `nats:2`) and a small staging-only init script.
+
+## Deploy
+
+CI auto-deploys on merge to `master` via `../.github/workflows/wrangler-deploy-staging.yml`.
+Manual deploy from an operator workstation:
+
+```bash
+cd infra/wrangler/<service>
+wrangler login # one-time
+wrangler containers deploy --env staging
+```
+
+Requires `CLOUDFLARE_API_TOKEN` env (Token A from the TF outputs).
+
+## Service inventory
+
+| Subdir | What runs | Stateful? | Public hostname (staging) | Notes |
+|---|---|---|---|---|
+| `api/` | instanode.dev api binary | no | `api.staging.instanode.dev` | HTTP only |
+| `worker/` | River job worker | no | none (cron) | Triggered by CF Cron |
+| `provisioner/` | gRPC :50051 service | no | private (Container→Container only) | api calls it |
+| `pg-platform/` | postgres:16 | **yes, ephemeral** | private | `instance_type=standard`; data wiped on sleep |
+| `pg-customers/` | postgres:16 | **yes, ephemeral** | `pg-customer-<tenant>.staging.instanode.dev` (one per tenant) | Customer-facing in staging only |
+| `mongodb/` | mongo:7 | **yes, ephemeral** | private | accessed by /nosql/new staging |
+| `redis-provision/` | redis:7 | **yes, ephemeral** | `redis-<tenant>.staging.instanode.dev` | Customer-facing |
+| `nats/` | nats:2 (no JetStream — JS needs durable disk) | **yes, ephemeral** | `nats-<tenant>.staging.instanode.dev` | Core NATS only in staging |
diff --git a/wrangler/api/README.md b/wrangler/api/README.md
new file mode 100644
index 0000000..80a190f
--- /dev/null
+++ b/wrangler/api/README.md
@@ -0,0 +1,35 @@
+# api — CF Containers staging deploy
+
+Wraps the Go api binary (port 8080) in a CF Container. Image pulled from
+`ghcr.io/instanode-dev/instant-api:staging` (the image named in `wrangler.toml`) — built by the api repo's CI on every
+push to master, tagged with `:staging` for staging deploys.
+
+## Env vars and secrets
+
+Config (committed):
+- `ENVIRONMENT=staging`
+- `OBJECT_STORE_BACKEND=r2`
+- `R2_BUCKET_NAME=instant-shared-staging`
+
+Secrets (via `wrangler secret put`):
+- `DATABASE_URL` — points at `pg-platform` Container DO via service binding
+- `CUSTOMER_DATABASE_URL` — points at `pg-customers` Container DO
+- `REDIS_URL` — service binding to `redis-platform`
+- `NATS_URL` — service binding to `nats`
+- `AES_KEY`, `JWT_SECRET`, `RAZORPAY_WEBHOOK_SECRET`, `BREVO_API_KEY` — same names as k8s prod
+- `R2_HMAC_KEY_ID`, `R2_HMAC_SECRET` — from R2 dashboard, scoped to `instant-shared-staging` bucket
+
+## Deploy
+
+```bash
+cd infra/wrangler/api
+wrangler containers deploy --env staging
+```
+
+CI auto-deploys on merge to master via the workflow in `infra/.github/workflows/`.
+
+## Known constraints
+
+- **Disk wipes on sleep** — api itself is stateless so this is fine; downstream PG/Mongo are NOT (see ../README.md acceptance criterion).
+- **HTTP only** — gRPC api→provisioner is fine (CF Containers support HTTP/2).
+- **No persistent customer port-forwards** — the dashboard's port-forward proxy is disabled on staging.
diff --git a/wrangler/api/src/worker.ts b/wrangler/api/src/worker.ts
new file mode 100644
index 0000000..7e78d5c
--- /dev/null
+++ b/wrangler/api/src/worker.ts
@@ -0,0 +1,32 @@
+// Tiny Worker shell for the api Container.
+//
+// CF Containers require a Worker entrypoint that forwards requests to
+// the Container's Durable Object. The container itself runs the actual
+// Go binary (instanodedev/api), listening on :8080.
+//
+// Every incoming HTTP request is routed to a Container instance; CF
+// handles spin-up/spin-down. Disk is ephemeral — see ../README.md.
+
+import { Container, getContainer } from "@cloudflare/containers";
+
+export class ApiContainer extends Container {
+ // The Go binary listens on :8080.
+ defaultPort = 8080;
+ // Sleep after 10 minutes of no traffic. CF will spin back up on the
+ // next request, with a fresh disk. The api is stateless (state lives
+ // in pg-platform Container), so cold-start is correctness-safe.
+ sleepAfter = "10m";
+}
+
+export default {
+  // Worker entrypoint. The return type must be Promise<Response>: a bare
+  // `Promise` is a TS error (the generic requires a type argument).
+  async fetch(request: Request, env: Env): Promise<Response> {
+    // Route every request to a single Container instance (single-shard for staging).
+    return getContainer(env.API_CONTAINER).fetch(request);
+  },
+};
+
+interface Env {
+  API_CONTAINER: DurableObjectNamespace<ApiContainer>;
+}
diff --git a/wrangler/api/wrangler.toml b/wrangler/api/wrangler.toml
new file mode 100644
index 0000000..a403a09
--- /dev/null
+++ b/wrangler/api/wrangler.toml
@@ -0,0 +1,64 @@
+# instanode-api on CF Containers (staging).
+#
+# The api is a Go binary listening on :8080. CF Containers wraps it in a
+# Durable Object; the Worker shell in src/worker.ts forwards every HTTP
+# request to the container.
+#
+# Image: pulled from GHCR (built by api repo's CI on every push to master).
+
+name = "instanode-api"
+main = "src/worker.ts"
+compatibility_date = "2026-05-30"
+
+# Per-environment config keeps the staging deploy isolated from any future
+# prod deploy (which won't live here — production goes to a non-CF k8s).
+[env.staging]
+name = "instanode-api-staging"
+routes = [
+  { pattern = "api.staging.instanode.dev", custom_domain = true }, # bare hostname: Custom Domains reject paths and wildcards
+]
+
+# Container backed by a Durable Object class.
+[[env.staging.containers]]
+class_name = "ApiContainer"
+image = "ghcr.io/instanode-dev/instant-api:staging"
+max_instances = 3
+instance_type = "standard" # 1 vCPU, 4 GiB RAM, 8 GiB ephemeral disk
+
+[[env.staging.durable_objects.bindings]]
+name = "API_CONTAINER"
+class_name = "ApiContainer"
+
+[[env.staging.migrations]]
+tag = "v1"
+new_sqlite_classes = ["ApiContainer"]
+
+# Env vars passed to the container. Secrets via `wrangler secret put`.
+[env.staging.vars]
+ENVIRONMENT = "staging"
+OBJECT_STORE_BACKEND = "r2"
+R2_BUCKET_NAME = "instant-shared-staging"
+# DATABASE_URL, REDIS_URL, NATS_URL, etc. resolve to other Container DOs
+# via service bindings — see [[env.staging.services]] block.
+
+# Service bindings — Worker can RPC into other Containers/Workers without
+# a public hostname.
+[[env.staging.services]]
+binding = "PG_PLATFORM"
+service = "instanode-pg-platform-staging"
+environment = "staging"
+
+[[env.staging.services]]
+binding = "PROVISIONER"
+service = "instanode-provisioner-staging"
+environment = "staging"
+
+[[env.staging.services]]
+binding = "REDIS_PLATFORM"
+service = "instanode-redis-platform-staging"
+environment = "staging"
+
+# Observability — send Container stdout/stderr to a CF Logpush sink.
+[env.staging.observability]
+enabled = true
+head_sampling_rate = 1.0
diff --git a/wrangler/mongodb/Dockerfile b/wrangler/mongodb/Dockerfile
new file mode 100644
index 0000000..afbe234
--- /dev/null
+++ b/wrangler/mongodb/Dockerfile
@@ -0,0 +1,30 @@
+# mongodb image for staging CF Container.
+#
+# Base: mongo:7. CF Containers' ephemeral disk means EVERY cold start
+# is a fresh init — there is no "first init vs subsequent restart"
+# distinction. The mongo image's docker-entrypoint runs initdb scripts
+# on every fresh /data/db, so the staging-bootstrap script below runs
+# every cold start.
+#
+# Why custom (vs pristine mongo:7):
+# - Bake the staging-bootstrap that ensures the admin user exists so
+# api can connect without post-deploy operator action. (The script
+# only handles the admin user; it sets no wire-compression default.)
+# - Healthcheck via `mongosh ping` for the Worker shell's wait loop.
+# - Per-tenant database names are CREATED on demand by provisioner;
+# no per-tenant schema baked in here.
+
+FROM mongo:7
+
+# Staging-bootstrap: idempotent admin user. Mongo entrypoint reads
+# MONGO_INITDB_ROOT_USERNAME / MONGO_INITDB_ROOT_PASSWORD from env on
+# first init; this script is a defence-in-depth ensure path used by
+# the api's connection-test against `db.adminCommand({ ping: 1 })`.
+COPY infra/wrangler/mongodb/docker-entrypoint-initdb.d/ /docker-entrypoint-initdb.d/
+
+# `mongosh` is in the base image; the healthcheck just exercises a
+# round-trip via the admin DB to confirm the daemon is up + responsive.
+HEALTHCHECK --interval=10s --timeout=3s --start-period=30s --retries=3 \
+ CMD mongosh --quiet --eval "db.adminCommand({ping:1}).ok" --host=localhost | grep -q '^1$' || exit 1
+
+EXPOSE 27017
diff --git a/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js b/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js
new file mode 100644
index 0000000..ef7f31e
--- /dev/null
+++ b/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js
@@ -0,0 +1,27 @@
+// Staging-bootstrap for mongodb CF Container. Runs on EVERY cold start
+// because CF Containers wipe /data/db on sleep.
+//
+// Idempotent: createUser fails with code 51003 ("user already exists")
+// if the admin already created the user in the same boot — we swallow
+// that. Other codes propagate.
+
+(function () {
+ var adminDb = db.getSiblingDB('admin');
+
+ // Mongo entrypoint already creates the root user from
+ // MONGO_INITDB_ROOT_USERNAME/MONGO_INITDB_ROOT_PASSWORD. Confirm it
+ // resolved successfully so the api connection doesn't hit "no users
+ // configured" on the first call.
+ var users = adminDb.system.users.find({ user: 'admin' }).count();
+ if (users === 0) {
+ print('00_staging_bootstrap: no admin user found, creating one from env vars');
+ adminDb.createUser({
+ user: process.env.MONGO_INITDB_ROOT_USERNAME || 'admin',
+ pwd: process.env.MONGO_INITDB_ROOT_PASSWORD || 'staging-bootstrap',
+ roles: [{ role: 'root', db: 'admin' }],
+ });
+ } else {
+ print('00_staging_bootstrap: admin user already provisioned by mongo entrypoint');
+ }
+ print('00_staging_bootstrap: complete');
+})();
diff --git a/wrangler/mongodb/src/worker.ts b/wrangler/mongodb/src/worker.ts
new file mode 100644
index 0000000..5cc2570
--- /dev/null
+++ b/wrangler/mongodb/src/worker.ts
@@ -0,0 +1,19 @@
+import { Container, getContainer } from "@cloudflare/containers";
+
+export class MongoContainer extends Container {
+ defaultPort = 27017;
+ sleepAfter = "20m";
+}
+
+export default {
+  // Route by tenant: "mongo-<tenant>.…" => getContainer(ns, tenant), i.e. one instance per tenant name.
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { hostname } = new URL(request.url);
+    const tenant = hostname.split(".")[0].replace(/^mongo-/, "");
+    return getContainer(env.MONGO_CONTAINER, tenant).fetch(request);
+  },
+};
+
+interface Env {
+  MONGO_CONTAINER: DurableObjectNamespace<MongoContainer>;
+}
diff --git a/wrangler/mongodb/wrangler.toml b/wrangler/mongodb/wrangler.toml
new file mode 100644
index 0000000..48d30dc
--- /dev/null
+++ b/wrangler/mongodb/wrangler.toml
@@ -0,0 +1,30 @@
+# mongodb — per-tenant Mongo in a CF Container (staging).
+
+name = "instanode-mongodb"
+main = "src/worker.ts"
+compatibility_date = "2026-05-30"
+
+[env.staging]
+name = "instanode-mongodb-staging"
+routes = [
+ { pattern = "mongo-*.staging.instanode.dev/*", custom_domain = true }, # NOTE(review): Custom Domains reportedly reject wildcards and paths — confirm this route deploys
+]
+
+[[env.staging.containers]]
+class_name = "MongoContainer"
+# Custom image — wraps mongo:7 with staging-bootstrap + healthcheck.
+image = "ghcr.io/instanode-dev/instant-mongodb:staging"
+max_instances = 10
+instance_type = "standard"
+
+[[env.staging.durable_objects.bindings]]
+name = "MONGO_CONTAINER"
+class_name = "MongoContainer"
+
+[[env.staging.migrations]]
+tag = "v1"
+new_sqlite_classes = ["MongoContainer"]
+
+[env.staging.observability]
+enabled = true
+head_sampling_rate = 1.0
diff --git a/wrangler/nats/Dockerfile b/wrangler/nats/Dockerfile
new file mode 100644
index 0000000..e3cd67a
--- /dev/null
+++ b/wrangler/nats/Dockerfile
@@ -0,0 +1,23 @@
+# nats image for staging CF Container.
+#
+# Base: nats:2-alpine. JetStream needs durable disk — NOT viable on
+# CF Containers' ephemeral storage — so this image runs CORE NATS ONLY
+# (no -js flag). Customer-facing /queue/new in staging returns a
+# legacy_open connection string and tests that exercise JetStream
+# features are skipped (see test guard in api/internal/handlers/queue.go).
+#
+# Auth mode: legacy_open. Per CLAUDE.md "Known Design Gaps", prod
+# serves legacy_open until the operator runs `nsc generate` for
+# operator/sys NKeys (NATS-AUTH-RUNBOOK.md). Staging matches prod's
+# current auth posture.
+
+FROM nats:2-alpine
+
+COPY infra/wrangler/nats/nats-server.conf /etc/nats/nats-server.conf
+
+HEALTHCHECK --interval=10s --timeout=3s --start-period=10s --retries=3 \
+ CMD wget -qO- http://localhost:8222/healthz | grep -q '"status":"ok"' || exit 1
+
+EXPOSE 4222 8222
+
+CMD ["-c", "/etc/nats/nats-server.conf"]
diff --git a/wrangler/nats/nats-server.conf b/wrangler/nats/nats-server.conf
new file mode 100644
index 0000000..db33a4f
--- /dev/null
+++ b/wrangler/nats/nats-server.conf
@@ -0,0 +1,33 @@
+# Staging nats-server.conf — core NATS only (no JetStream — ephemeral
+# disk on CF Containers can't satisfy JetStream's durable WAL).
+#
+# Auth mode: legacy_open. No per-tenant JWT in staging. Production
+# eventually upgrades to per-tenant JWT once an operator runs
+# `nsc generate` for operator + sys NKeys (NATS-AUTH-RUNBOOK.md).
+# This staging config DOES NOT block on that.
+
+listen: 0.0.0.0:4222
+
+# HTTP monitoring endpoint used by the Worker shell's healthcheck.
+http: 0.0.0.0:8222
+
+# Connection + payload limits matched to CF Container "basic" class.
+max_connections: 1000
+max_payload: 1MB
+max_pending: 32MB
+
+# Logging to stdout for `wrangler tail`.
+debug: false
+trace: false
+logtime: true
+
+# Auth — legacy_open: no creds required. Customers connecting via
+# /queue/new staging endpoint get an open URL.
+authorization {
+ # Empty block = no auth. Documented intentional choice.
+}
+
+# NO JetStream block — explicitly disabled because CF Container disk
+# is ephemeral. Tests that require JetStream skip on staging via the
+# `auth_mode=legacy_open` resource field (see CLAUDE.md /queue/new).
+# jetstream { ... } # DO NOT enable in staging
diff --git a/wrangler/nats/src/worker.ts b/wrangler/nats/src/worker.ts
new file mode 100644
index 0000000..45f2350
--- /dev/null
+++ b/wrangler/nats/src/worker.ts
@@ -0,0 +1,19 @@
+import { Container, getContainer } from "@cloudflare/containers";
+
+export class NatsContainer extends Container {
+ defaultPort = 4222;
+ sleepAfter = "20m";
+}
+
+export default {
+  // Route by tenant: "nats-<tenant>.…" => getContainer(ns, tenant), i.e. one instance per tenant name.
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { hostname } = new URL(request.url);
+    const tenant = hostname.split(".")[0].replace(/^nats-/, "");
+    return getContainer(env.NATS_CONTAINER, tenant).fetch(request);
+  },
+};
+
+interface Env {
+  NATS_CONTAINER: DurableObjectNamespace<NatsContainer>;
+}
diff --git a/wrangler/nats/wrangler.toml b/wrangler/nats/wrangler.toml
new file mode 100644
index 0000000..7315949
--- /dev/null
+++ b/wrangler/nats/wrangler.toml
@@ -0,0 +1,40 @@
+# nats — per-tenant NATS in a CF Container (staging).
+# NATS JetStream needs durable disk — NOT viable on ephemeral. Staging
+# runs core NATS only (no streams). /queue/new in staging returns a
+# legacy_open connection string. JetStream features test-skipped.
+
+name = "instanode-nats"
+main = "src/worker.ts"
+compatibility_date = "2026-05-30"
+
+[env.staging]
+name = "instanode-nats-staging"
+routes = [
+ { pattern = "nats-*.staging.instanode.dev/*", custom_domain = true }, # NOTE(review): Custom Domains reportedly reject wildcards and paths — confirm this route deploys
+]
+
+[[env.staging.containers]]
+class_name = "NatsContainer"
+# Custom image — wraps nats:2-alpine with /etc/nats/nats-server.conf
+# baked in (core NATS only, no JetStream, legacy_open auth — matches
+# prod's current auth posture).
+image = "ghcr.io/instanode-dev/instant-nats:staging"
+max_instances = 10
+instance_type = "basic"
+
+[[env.staging.durable_objects.bindings]]
+name = "NATS_CONTAINER"
+class_name = "NatsContainer"
+
+[[env.staging.migrations]]
+tag = "v1"
+new_sqlite_classes = ["NatsContainer"]
+
+[env.staging.vars]
+# No -js flag → core NATS only. Document that JetStream is staging-disabled
+# in /tmp/cf-migration/shared/STAGING-LIMITATIONS.md.
+NATS_ARGS = "-m 8222"
+
+[env.staging.observability]
+enabled = true
+head_sampling_rate = 1.0
diff --git a/wrangler/pg-customers/src/worker.ts b/wrangler/pg-customers/src/worker.ts
new file mode 100644
index 0000000..73ce9b0
--- /dev/null
+++ b/wrangler/pg-customers/src/worker.ts
@@ -0,0 +1,22 @@
+import { Container, getContainer } from "@cloudflare/containers";
+
+export class PgCustomersContainer extends Container {
+ defaultPort = 5432;
+ sleepAfter = "20m";
+}
+
+export default {
+  // Per-tenant routing: the first hostname label is "pg-customer-<tenant>";
+  // each tenant name maps to its own Container instance (isolated PG).
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const { hostname } = new URL(request.url);
+    const tenant = hostname.split(".")[0].replace(/^pg-customer-/, "");
+    // getContainer(ns, name) resolves ns.idFromName(name) and returns its
+    // stub, so this is the same lookup as before and the import is used.
+    return getContainer(env.PG_CUSTOMERS_CONTAINER, tenant).fetch(request);
+  },
+};
+
+interface Env {
+  PG_CUSTOMERS_CONTAINER: DurableObjectNamespace<PgCustomersContainer>;
+}
diff --git a/wrangler/pg-customers/wrangler.toml b/wrangler/pg-customers/wrangler.toml
new file mode 100644
index 0000000..65a2b52
--- /dev/null
+++ b/wrangler/pg-customers/wrangler.toml
@@ -0,0 +1,36 @@
+# pg-customers — per-tenant Postgres in a CF Container (staging only).
+# Customer-facing: /db/new in staging returns a connection string here.
+# Data is EPHEMERAL — wipes on container sleep. Documented in ../README.md.
+
+name = "instanode-pg-customers"
+main = "src/worker.ts"
+compatibility_date = "2026-05-30"
+
+[env.staging]
+name = "instanode-pg-customers-staging"
+# Public TCP exposure happens via the Worker shell; staging clients dial
+# `pg-customer-<tenant>.staging.instanode.dev:5432`.
+routes = [
+ { pattern = "pg-customer-*.staging.instanode.dev/*", custom_domain = true }, # NOTE(review): Custom Domains reportedly reject wildcards and paths — confirm this route deploys
+]
+
+[[env.staging.containers]]
+class_name = "PgCustomersContainer"
+image = "postgres:16-alpine"
+max_instances = 10 # staging cap — bump if QA needs more
+instance_type = "standard"
+
+[[env.staging.durable_objects.bindings]]
+name = "PG_CUSTOMERS_CONTAINER"
+class_name = "PgCustomersContainer"
+
+[[env.staging.migrations]]
+tag = "v1"
+new_sqlite_classes = ["PgCustomersContainer"]
+
+[env.staging.vars]
+PGDATA = "/var/lib/postgresql/data/pgdata"
+
+[env.staging.observability]
+enabled = true
+head_sampling_rate = 1.0
diff --git a/wrangler/pg-platform/00_pre.sql b/wrangler/pg-platform/00_pre.sql
new file mode 100644
index 0000000..f2c18fb
--- /dev/null
+++ b/wrangler/pg-platform/00_pre.sql
@@ -0,0 +1,25 @@
+-- Runs FIRST in /docker-entrypoint-initdb.d/ (alphabetical sort puts
+-- "00_pre.sql" ahead of "001_initial.sql"). Sets up extensions + log
+-- markers that every later migration depends on.
+--
+-- This file is staging-only — production uses different operator-run
+-- bootstrap. See infra/wrangler/pg-platform/Dockerfile for context.
+
+-- pgvector — mig 040+ does CREATE EXTENSION vector and assumes the
+-- shared library is loadable. pgvector/pgvector:pg16 ships the .so;
+-- this just registers it in the freshly-init'd database.
+CREATE EXTENSION IF NOT EXISTS vector;
+
+-- Standard extensions we use across migrations.
+CREATE EXTENSION IF NOT EXISTS "uuid-ossp";
+CREATE EXTENSION IF NOT EXISTS "pgcrypto";
+
+-- Match prod timezone — tests assume UTC. Plain SET is session-local, so also persist it for every later session.
+ALTER DATABASE :"DBNAME" SET timezone TO 'UTC'; SET TIME ZONE 'UTC';
+
+-- Log marker. Shows in `wrangler tail` so operators know this is a
+-- cold-start init (vs an unexpected mid-life restart).
+DO $$
+BEGIN
+ RAISE NOTICE 'pg-platform staging cold start — re-applying 63 migrations against fresh PGDATA';
+END $$;
diff --git a/wrangler/pg-platform/Dockerfile b/wrangler/pg-platform/Dockerfile
new file mode 100644
index 0000000..3b83f11
--- /dev/null
+++ b/wrangler/pg-platform/Dockerfile
@@ -0,0 +1,53 @@
+# pg-platform image for staging CF Container.
+#
+# Base: pgvector/pgvector:pg16 — Postgres 16 + the pgvector extension
+# that platform_db's resource embeddings table requires (extension CREATE
+# in mig 040+; without pgvector the image init fails on the first
+# `CREATE EXTENSION vector` statement).
+#
+# Migrations: the 63 *.sql files from api/internal/db/migrations/ are
+# copied into /docker-entrypoint-initdb.d/. Postgres's official
+# entrypoint runs every *.sql alphabetically on first cluster init —
+# and CF Containers' ephemeral disk means EVERY cold start IS a first
+# cluster init, so the migrations re-apply on every wake-from-sleep.
+#
+# This is the explicit, user-blessed ephemeral-state tradeoff for the
+# CF-only staging design. See ../README.md acceptance criterion.
+#
+# Build context: workspace root (../../).
+# Build command (CI runs this; not for ad-hoc local use):
+# docker buildx build \
+# -f infra/wrangler/pg-platform/Dockerfile \
+# -t ghcr.io/instanode-dev/instant-pg-platform:staging \
+# --push \
+# .
+
+FROM pgvector/pgvector:pg16
+
+# Copy every migration file in numeric (=alphabetical) order. The
+# leading 0NN_*.sql naming guarantees the entrypoint applies them in
+# the same order as `make test-db-up` does locally.
+COPY api/internal/db/migrations/*.sql /docker-entrypoint-initdb.d/
+
+# A pre-script that runs before any migration. It is installed as
+# "000_pre.sql": "000_" precedes "001_" in every collation, whereas
+# "00_pre.sql" vs "001_initial.sql" depends on how the locale ranks "_"
+# against digits (in byte order "1" < "_", so 001 would run first).
+# We use it to:
+# 1. CREATE EXTENSION pgvector (idempotent — base image has the
+# shared lib; this enables it in the freshly-init'd database).
+# 2. Set timezone to UTC to match production.
+# 3. Print a one-line marker so the logs make clear this is a fresh init.
+COPY infra/wrangler/pg-platform/00_pre.sql /docker-entrypoint-initdb.d/000_pre.sql
+
+# postgres image expects POSTGRES_PASSWORD set; staging wrangler.toml
+# wires that through `wrangler secret put POSTGRES_PASSWORD`. The
+# image also reads POSTGRES_DB / POSTGRES_USER if provided (wrangler
+# env block sets POSTGRES_DB=instant_platform).
+
+# Healthcheck — pg_isready against the local socket. Used by the
+# Worker shell's container.fetch wait-loop.
+HEALTHCHECK --interval=10s --timeout=3s --start-period=30s --retries=3 \
+ CMD pg_isready -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-instant_platform}" || exit 1
+
+EXPOSE 5432
diff --git a/wrangler/pg-platform/README.md b/wrangler/pg-platform/README.md
new file mode 100644
index 0000000..67b992a
--- /dev/null
+++ b/wrangler/pg-platform/README.md
@@ -0,0 +1,87 @@
+# pg-platform — staging CF Container
+
+Postgres 16 + pgvector. Image baked with all 63 platform migrations in
+`/docker-entrypoint-initdb.d/` so cold starts come up with a fully
+migrated schema.
+
+## Ephemeral acceptance
+
+Per the CF-only staging decision (2026-05-30): disk wipes every time the
+Container sleeps (which fires on traffic-quiet, not just intentional
+restart). Each cold start:
+
+1. CF Containers wakes the Container with a fresh disk.
+2. Postgres entrypoint sees PGDATA empty → runs `initdb`.
+3. `00_pre.sql` runs first — pgvector + uuid-ossp + pgcrypto extensions, UTC tz.
+4. The 63 migration files run in numeric order (001 → 063).
+5. Container reports healthy via `pg_isready`.
+6. api / worker / provisioner Containers can now connect via service binding.
+
+Total cold-start time: estimated 15–45s depending on Container class +
+migration count. Anything that talks to pg-platform must tolerate this
+warmup (Worker shell's `container.fetch` blocks until healthy).
+
+## Image build
+
+The image is built by `infra/.github/workflows/wrangler-build-staging-images.yml`
+on push to master that changes any of:
+- `api/internal/db/migrations/**` (cross-repo trigger via repository_dispatch — see below)
+- `infra/wrangler/pg-platform/**`
+
+Plus daily at 09:00 UTC to keep up with migrations merged in api repo without
+explicit infra commits.
+
+Manual rebuild:
+```bash
+gh workflow run wrangler-build-staging-images.yml \
+ -R instanode-dev/infra \
+ -f service=pg-platform
+```
+
+## Cross-repo migration sync
+
+Migrations live in the `api` repo, not infra. Two patterns to keep the
+image current:
+
+1. **Daily cron rebuild** — the build workflow runs daily (09:00 UTC) with a fresh
+ checkout of both repos; any new `.sql` file lands within 24h.
+2. **`api` repo notifies on migration change** — `api/.github/workflows/notify-infra-on-migration.yml`
+ sends a `repository_dispatch` event to infra when `api/internal/db/migrations/**`
+ changes, triggering an immediate build.
+
+If neither runs, staging pg-platform will be behind on migrations and
+api startup will fail with "migration not applied" — operator-visible
+via `wrangler tail instanode-pg-platform-staging`.
+
+## Secrets
+
+Set via `wrangler secret put`, scoped to `--env staging`:
+
+| Secret | Source | Purpose |
+|---|---|---|
+| `POSTGRES_USER` | operator-defined (e.g. `instanode_admin`) | role for connection |
+| `POSTGRES_PASSWORD` | random, ≥32 chars | passed to connection_url |
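+| `POSTGRES_DB` | `instant_platform` — already set as a plain var in `wrangler.toml` (`[env.staging.vars]`), so do not also `secret put` it | initial DB created at first start |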
+
+The actual connection string handed to api/worker/provisioner is built
+via service binding — they see `PG_PLATFORM` env binding, not a raw
+URL with the password.
+
+## Verifying
+
+```bash
+wrangler tail instanode-pg-platform-staging --format pretty
+# wait for: "pg-platform staging cold start — re-applying 63 migrations against fresh PGDATA"
+# then: "database system is ready to accept connections"
+
+# from a debug Worker shell:
+wrangler dev --env staging
+# Then inside the Worker: env.PG_PLATFORM.fetch("http://internal/healthz")
+```
+
+## Known limitations
+
+- **Cold-start cost is ~15-45s.** Synthetic warmer can keep it hot; without one, every traffic gap > sleepAfter (currently 30m) pays the full re-migration cost.
+- **No replication.** max_instances=1; HA is meaningless when disk is ephemeral. Production gets a different model entirely.
+- **No `pg_dump` artifacts persist.** If you need a snapshot for debugging, dump and immediately stream to R2 via the customer-backup pipeline; the local file dies on next sleep.
+- **63 migrations is the live count as of 2026-05-30.** When api repo adds mig 064+, the daily cron rebuild picks them up.
diff --git a/wrangler/pg-platform/src/worker.ts b/wrangler/pg-platform/src/worker.ts
new file mode 100644
index 0000000..7646da8
--- /dev/null
+++ b/wrangler/pg-platform/src/worker.ts
@@ -0,0 +1,25 @@
+// pg-platform Worker shell. Postgres doesn't speak HTTP, but CF
+// Containers require a Worker entrypoint. The Worker accepts a
+// service-binding RPC from other Containers and forwards a connection
+// hint; the actual TCP traffic flows over the Container DO's internal
+// network using `container.fetch(request)` with `Upgrade: tcp` semantics
+// (CF Containers' raw-TCP mode, available since the GA release).
+
+import { Container, getContainer } from "@cloudflare/containers";
+
+export class PgPlatformContainer extends Container {
+ defaultPort = 5432;
+ sleepAfter = "30m"; // Longer than api so platform_db survives test bursts.
+}
+
+export default {
+  // Forwards the incoming request to the single pg-platform Container.
+  async fetch(request: Request, env: Env): Promise<Response> {
+    // Container holds the TCP listener; CF routes the upgraded socket through.
+    return getContainer(env.PG_CONTAINER).fetch(request);
+  },
+};
+
+interface Env {
+  PG_CONTAINER: DurableObjectNamespace<PgPlatformContainer>;
+}
diff --git a/wrangler/pg-platform/wrangler.toml b/wrangler/pg-platform/wrangler.toml
new file mode 100644
index 0000000..274e033
--- /dev/null
+++ b/wrangler/pg-platform/wrangler.toml
@@ -0,0 +1,48 @@
+# pg-platform on CF Containers (staging).
+#
+# Runs Postgres 16 (custom pgvector-based image) in a CF Container. Data dir is ephemeral —
+# every sleep wipes /var/lib/postgresql/data. This is the explicit
+# user-blessed tradeoff for CF-only staging.
+#
+# Production does NOT use this; prod platform_db lives elsewhere.
+
+name = "instanode-pg-platform"
+main = "src/worker.ts"
+compatibility_date = "2026-05-30"
+
+[env.staging]
+name = "instanode-pg-platform-staging"
+# No public route — accessed only via service binding from api/worker/provisioner.
+
+[[env.staging.containers]]
+class_name = "PgPlatformContainer"
+# Custom image built by infra/.github/workflows/wrangler-build-staging-images.yml.
+# Bakes the 63 migrations from api/internal/db/migrations/*.sql into
+# /docker-entrypoint-initdb.d/ + pgvector extension. See ./Dockerfile.
+image = "ghcr.io/instanode-dev/instant-pg-platform:staging"
+max_instances = 1 # Single-writer; HA is meaningless when disk is ephemeral.
+instance_type = "standard" # 1 vCPU, 4 GiB RAM, 8 GiB ephemeral
+
+[[env.staging.durable_objects.bindings]]
+name = "PG_CONTAINER"
+class_name = "PgPlatformContainer"
+
+[[env.staging.migrations]]
+tag = "v1"
+new_sqlite_classes = ["PgPlatformContainer"]
+
+# Bootstrap secrets via wrangler secret put:
+# POSTGRES_USER, POSTGRES_PASSWORD (POSTGRES_DB is a plain var, set below).
+# The Postgres image reads these env vars on first boot to initialize the
+# cluster — which it'll redo every sleep cycle.
+[env.staging.vars]
+POSTGRES_DB = "instant_platform"
+# POSTGRES_INITDB_ARGS controls locale; staging just uses default.
+PGDATA = "/var/lib/postgresql/data/pgdata"
+# The 63 migrations are baked into /docker-entrypoint-initdb.d/ (see ./Dockerfile).
+# NOTE(review): nothing visible reads this flag, and no src/bootstrap.sh is copied by the Dockerfile — confirm it is used.
+APPLY_MIGRATIONS_ON_BOOT = "true"
+
+[env.staging.observability]
+enabled = true
+head_sampling_rate = 1.0
diff --git a/wrangler/provisioner/src/worker.ts b/wrangler/provisioner/src/worker.ts
new file mode 100644
index 0000000..72fde55
--- /dev/null
+++ b/wrangler/provisioner/src/worker.ts
@@ -0,0 +1,16 @@
+import { Container, getContainer } from "@cloudflare/containers";
+
+export class ProvisionerContainer extends Container {
+  sleepAfter = "20m"; // idle window before the container is put to sleep
+  defaultPort = 50051; // gRPC listener inside the container
+}
+
+export default {
+  async fetch(request: Request, env: Env): Promise<Response> {
+    return getContainer(env.PROVISIONER_CONTAINER).fetch(request); // no name passed → library's default instance
+  },
+};
+
+interface Env {
+  PROVISIONER_CONTAINER: DurableObjectNamespace<ProvisionerContainer>; // DO binding declared in wrangler.toml
+}
diff --git a/wrangler/provisioner/wrangler.toml b/wrangler/provisioner/wrangler.toml
new file mode 100644
index 0000000..d1c93dc
--- /dev/null
+++ b/wrangler/provisioner/wrangler.toml
@@ -0,0 +1,47 @@
+# provisioner — gRPC service in a CF Container (staging).
+# No public route; api reaches it via service binding.
+
+name = "instanode-provisioner"
+compatibility_date = "2026-05-30"
+main = "src/worker.ts"
+
+[env.staging]
+name = "instanode-provisioner-staging"
+
+[env.staging.vars]
+ENVIRONMENT = "staging"
+
+[env.staging.observability]
+head_sampling_rate = 1.0
+enabled = true
+
+[[env.staging.durable_objects.bindings]]
+class_name = "ProvisionerContainer"
+name = "PROVISIONER_CONTAINER"
+
+[[env.staging.migrations]]
+new_sqlite_classes = ["ProvisionerContainer"]
+tag = "v1"
+
+[[env.staging.containers]]
+instance_type = "standard"
+max_instances = 2
+image = "ghcr.io/instanode-dev/instant-provisioner:staging"
+class_name = "ProvisionerContainer"
+
+# Service bindings the provisioner uses to reach the customer-data Containers.
+[[env.staging.services]]
+service = "instanode-pg-customers-staging"
+binding = "PG_CUSTOMERS"
+
+[[env.staging.services]]
+service = "instanode-mongodb-staging"
+binding = "MONGODB"
+
+[[env.staging.services]]
+service = "instanode-redis-provision-staging"
+binding = "REDIS_PROVISION"
+
+[[env.staging.services]]
+service = "instanode-nats-staging"
+binding = "NATS"
diff --git a/wrangler/redis-provision/Dockerfile b/wrangler/redis-provision/Dockerfile
new file mode 100644
index 0000000..299d710
--- /dev/null
+++ b/wrangler/redis-provision/Dockerfile
@@ -0,0 +1,30 @@
+# redis-provision image for staging CF Container.
+#
+# Base: redis:7-alpine. CF Containers' ephemeral disk means RDB
+# persistence is pointless — every sleep wipes /data. We disable
+# RDB + AOF entirely and run in-memory-only with `allkeys-lru`
+# eviction so the Container can't OOM under sustained writes.
+#
+# Why custom (vs pristine redis:7-alpine):
+#   - Bake redis.conf with auth + memory + eviction policy so the
+#     Worker shell doesn't have to pass them via wrangler.toml CMD.
+#   - Healthcheck via `redis-cli ping`, authenticated through the
+#     REDISCLI_AUTH env var so the password never appears in argv.
+#   - Auth is via `requirepass` from REDIS_PASSWORD env (wrangler secret).
+
+FROM redis:7-alpine
+
+COPY infra/wrangler/redis-provision/redis.conf /etc/redis/redis.conf
+
+# Entrypoint that templates REDIS_PASSWORD env into the conf at boot.
+# Without this, the conf can't contain the secret at build time.
+COPY infra/wrangler/redis-provision/entrypoint.sh /usr/local/bin/staging-entrypoint.sh
+RUN chmod +x /usr/local/bin/staging-entrypoint.sh
+
+HEALTHCHECK --interval=10s --timeout=3s --start-period=10s --retries=3 \
+  CMD REDISCLI_AUTH="$REDIS_PASSWORD" redis-cli ping | grep -q '^PONG$' || exit 1
+
+EXPOSE 6379
+
+ENTRYPOINT ["/usr/local/bin/staging-entrypoint.sh"]
+CMD ["redis-server", "/etc/redis/redis.conf"]
diff --git a/wrangler/redis-provision/entrypoint.sh b/wrangler/redis-provision/entrypoint.sh
new file mode 100644
index 0000000..bc62464
--- /dev/null
+++ b/wrangler/redis-provision/entrypoint.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+# Templating entrypoint for staging redis. Inlines REDIS_PASSWORD into
+# /etc/redis/redis.conf at boot (the file ships with __REDIS_PASSWORD__
+# as a literal marker; we never bake a real secret into the image).
+
+set -eu
+
+if [ -z "${REDIS_PASSWORD:-}" ]; then
+  echo "redis-provision: REDIS_PASSWORD env var is required" >&2
+  exit 1
+fi
+
+# Escape \, | and & first: in a sed replacement they mean "escape", "end of
+# the s||| expression" and "matched text", so a raw secret could corrupt the conf.
+ESCAPED="$(printf '%s' "$REDIS_PASSWORD" | sed 's/[\\|&]/\\&/g')"
+TMP="$(mktemp)" # temp file + mv instead of `sed -i`, which varies by sed flavor
+sed "s|__REDIS_PASSWORD__|${ESCAPED}|" /etc/redis/redis.conf > "$TMP"
+mv "$TMP" /etc/redis/redis.conf
+chmod 600 /etc/redis/redis.conf # only root reads — defense in depth
+
+exec "$@" # hand off to the configured CMD (`redis-server /etc/redis/redis.conf`)
diff --git a/wrangler/redis-provision/redis.conf b/wrangler/redis-provision/redis.conf
new file mode 100644
index 0000000..7b423d0
--- /dev/null
+++ b/wrangler/redis-provision/redis.conf
@@ -0,0 +1,28 @@
+# Staging redis.conf — ephemeral, auth'd, LRU-capped.
+# REDIS_PASSWORD is substituted at container boot by entrypoint.sh.
+
+bind 0.0.0.0
+port 6379
+protected-mode yes
+
+# Auth — entrypoint.sh inlines REDIS_PASSWORD env value here.
+requirepass __REDIS_PASSWORD__
+
+# Memory cap + eviction. CF Container "basic" instances have 1 GiB RAM; cap
+# at 768 MiB to leave headroom for client buffers + allocator fragmentation.
+maxmemory 768mb
+maxmemory-policy allkeys-lru
+
+# No persistence — CF Containers wipe /data on sleep, so RDB snapshots
+# only waste CPU. AOF same. Staging is in-memory-only by design.
+save ""
+appendonly no
+
+# Logging to stdout for `wrangler tail`.
+logfile ""
+loglevel notice
+
+# Connection limits matched to instance class.
+maxclients 1000
+timeout 300
+tcp-keepalive 60
diff --git a/wrangler/redis-provision/src/worker.ts b/wrangler/redis-provision/src/worker.ts
new file mode 100644
index 0000000..2b77911
--- /dev/null
+++ b/wrangler/redis-provision/src/worker.ts
@@ -0,0 +1,19 @@
+import { Container, getContainer } from "@cloudflare/containers";
+
+export class RedisContainer extends Container {
+  sleepAfter = "20m"; // idle window before the container is put to sleep
+  defaultPort = 6379; // Redis port — NOTE(review): fetch() forwards HTTP here, redis-server speaks RESP; confirm
+}
+
+export default {
+  // Route redis-<tenant>.<zone> to that tenant's own container; 400 when no tenant label.
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const tenant = new URL(request.url).hostname.split(".")[0].replace(/^redis-/, "");
+    if (!tenant) return new Response("missing tenant in hostname", { status: 400 });
+    return getContainer(env.REDIS_CONTAINER, tenant).fetch(request);
+  },
+};
+
+interface Env {
+  REDIS_CONTAINER: DurableObjectNamespace<RedisContainer>; // DO binding declared in wrangler.toml
+}
diff --git a/wrangler/redis-provision/wrangler.toml b/wrangler/redis-provision/wrangler.toml
new file mode 100644
index 0000000..2896e8d
--- /dev/null
+++ b/wrangler/redis-provision/wrangler.toml
@@ -0,0 +1,32 @@
+# redis-provision — per-tenant Redis in a CF Container (staging).
+
+name = "instanode-redis-provision"
+main = "src/worker.ts"
+compatibility_date = "2026-05-30"
+
+[env.staging]
+name = "instanode-redis-provision-staging"
+routes = [ # NOTE(review): custom_domain routes are documented as exact hostnames (no "*", no path) — confirm this deploys
+  { pattern = "redis-*.staging.instanode.dev/*", custom_domain = true },
+]
+
+[[env.staging.containers]]
+class_name = "RedisContainer"
+# Custom image — wraps redis:7-alpine with auth + maxmemory + LRU
+# eviction baked into /etc/redis/redis.conf (entrypoint templates
+# REDIS_PASSWORD in at boot).
+image = "ghcr.io/instanode-dev/instant-redis-provision:staging"
+max_instances = 10
+instance_type = "basic"  # Redis is lighter than PG/Mongo; redis.conf maxmemory must fit this tier's RAM
+
+[[env.staging.durable_objects.bindings]]
+name = "REDIS_CONTAINER"
+class_name = "RedisContainer"
+
+[[env.staging.migrations]]
+tag = "v1"
+new_sqlite_classes = ["RedisContainer"]
+
+[env.staging.observability]
+enabled = true
+head_sampling_rate = 1.0
diff --git a/wrangler/worker/src/worker.ts b/wrangler/worker/src/worker.ts
new file mode 100644
index 0000000..db330bb
--- /dev/null
+++ b/wrangler/worker/src/worker.ts
@@ -0,0 +1,23 @@
+import { Container, getContainer } from "@cloudflare/containers";
+
+export class WorkerContainer extends Container {
+  sleepAfter = "20m"; // idle window before the container is put to sleep
+  defaultPort = 8091; // worker exposes /metrics + /readyz on 8091
+}
+
+export default {
+  // HTTP path: forward to container (rare; mostly metrics scrapes).
+  async fetch(request: Request, env: Env): Promise<Response> {
+    return getContainer(env.WORKER_CONTAINER).fetch(request);
+  },
+  // Cron path: wake the container so River picks up due jobs.
+  async scheduled(_event: ScheduledEvent, env: Env): Promise<void> {
+    // A no-op POST that the worker binary handles as "tick the job loop".
+    const res = await getContainer(env.WORKER_CONTAINER).fetch("http://internal/tick", { method: "POST" });
+    if (!res.ok) console.error(`worker tick failed: HTTP ${res.status}`); // log it; a failed tick was silent before
+  },
+};
+
+interface Env {
+  WORKER_CONTAINER: DurableObjectNamespace<WorkerContainer>; // DO binding declared in wrangler.toml
+}
diff --git a/wrangler/worker/wrangler.toml b/wrangler/worker/wrangler.toml
new file mode 100644
index 0000000..05b555d
--- /dev/null
+++ b/wrangler/worker/wrangler.toml
@@ -0,0 +1,40 @@
+# worker — River jobs in a CF Container (staging).
+# Cron triggers via CF Cron Triggers (no public route).
+
+name = "instanode-worker"
+compatibility_date = "2026-05-30"
+main = "src/worker.ts"
+
+[env.staging]
+name = "instanode-worker-staging"
+
+[env.staging.vars]
+R2_BUCKET_NAME = "instant-shared-staging"
+OBJECT_STORE_BACKEND = "r2"
+ENVIRONMENT = "staging"
+
+[env.staging.observability]
+head_sampling_rate = 1.0
+enabled = true
+
+# Every 5 minutes a Cron Trigger invokes the Worker shell, which wakes the Container.
+[env.staging.triggers]
+crons = ["*/5 * * * *"]
+
+[[env.staging.services]]
+service = "instanode-pg-platform-staging"
+binding = "PG_PLATFORM"
+
+[[env.staging.durable_objects.bindings]]
+class_name = "WorkerContainer"
+name = "WORKER_CONTAINER"
+
+[[env.staging.migrations]]
+new_sqlite_classes = ["WorkerContainer"]
+tag = "v1"
+
+[[env.staging.containers]]
+instance_type = "standard"
+max_instances = 2
+image = "ghcr.io/instanode-dev/instant-worker:staging"
+class_name = "WorkerContainer"