diff --git a/.github/workflows/terraform-apply-production.yml b/.github/workflows/terraform-apply-production.yml
new file mode 100644
index 0000000..415670e
--- /dev/null
+++ b/.github/workflows/terraform-apply-production.yml
@@ -0,0 +1,164 @@
+---
+# infra — gated Terraform apply for the PRODUCTION Cloudflare workspace.
+#
+# APPROVAL MODEL: workflow_dispatch + GitHub Environment "production"
+# with required reviewers. No push trigger. No "promote from staging"
+# trigger. Every production apply is a separate, deliberate decision
+# made by a human reviewer on a human-triggered run.
+#
+# Confirm phrase is stricter than staging — operator must type a
+# matching staging RUN_ID so they cannot apply prod without having
+# first applied + observed the same change in staging.
+#
+# Security note: every GHA expression consumed in a run: block is
+# wrapped through env: to prevent script injection.
+
+name: terraform-apply-production
+
+on:
+  workflow_dispatch:
+    inputs:
+      confirm:
+        description: 'Type APPLY-PRODUCTION to confirm'
+        required: true
+        type: string
+      staging_run_id:
+        description: 'GH Actions run_id of the matching staging apply (must be a numeric id)'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+  actions: read  # guard job calls `gh run view`, which reads workflow runs
+
+concurrency:
+  group: terraform-apply-production
+  cancel-in-progress: false  # never cancel an in-flight apply
+
+env:
+  TF_VERSION: '1.9.8'
+  TF_IN_AUTOMATION: 'true'
+  TF_INPUT: '0'  # fail fast instead of blocking on an interactive prompt
+  TF_ENV: 'production'
+
+jobs:
+  guard:
+    name: confirm-input + staging-precedent guard
+    runs-on: ubuntu-latest
+    env:
+      CONFIRM_INPUT: ${{ inputs.confirm }}
+      STAGING_RUN_ID: ${{ inputs.staging_run_id }}
+    steps:
+      - name: Reject if confirm phrase wrong
+        run: |
+          if [ "${CONFIRM_INPUT}" != "APPLY-PRODUCTION" ]; then
+            echo "::error::confirm input must be exactly 'APPLY-PRODUCTION'"
+            exit 1
+          fi
+
+      - name: Reject if staging_run_id is not numeric
+        run: |
+          # ref-injection mitigation: validate strictly before any use.
+          case "${STAGING_RUN_ID}" in
+            ''|*[!0-9]*)
+              echo "::error::staging_run_id must be a numeric GH Actions run id (got '${STAGING_RUN_ID}')"
+              exit 1
+              ;;
+          esac
+
+      - name: Verify staging run exists + succeeded
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        # STAGING_RUN_ID already validated as numeric above; safe to use.
+        run: |
+          # Fetch the run once; a missing/unreadable run fails the step here via bash -e.
+          run_json=$(gh run view "${STAGING_RUN_ID}" --repo "${GITHUB_REPOSITORY}" --json workflowName,conclusion,headSha)
+          name=$(jq -r '.workflowName' <<< "${run_json}")
+          conclusion=$(jq -r '.conclusion' <<< "${run_json}")
+          head_sha=$(jq -r '.headSha' <<< "${run_json}")
+          if [ "${name}" != "terraform-apply-staging" ]; then
+            echo "::error::staging_run_id ${STAGING_RUN_ID} is not a terraform-apply-staging run (got: ${name})"
+            exit 1
+          fi
+          if [ "${conclusion}" != "success" ]; then
+            echo "::error::staging_run_id ${STAGING_RUN_ID} did not succeed (conclusion: ${conclusion})"
+            exit 1
+          fi
+          # Surface (without blocking) a staging precedent that ran a different commit.
+          if [ "${head_sha}" != "${GITHUB_SHA}" ]; then
+            echo "::warning::staging run ${STAGING_RUN_ID} applied commit ${head_sha}, but this run applies ${GITHUB_SHA} — confirm staging exercised the same change"
+          fi
+          echo "staging precedent ✓ (run ${STAGING_RUN_ID} = success)"
+
+  apply:
+    name: apply production
+    needs: guard
+    runs-on: ubuntu-latest
+    # GitHub Environment "production" must be configured with Required
+    # Reviewers — operator sets this up at repo Settings → Environments →
+    # production → Deployment protection rules. This is the second gate
+    # on top of the confirm input + staging-precedent checks above.
+    environment: production
+    # Credentials are scoped to this job only, so the guard job (which
+    # handles operator-typed input) never has them in its environment.
+    env:
+      CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }}
+      AWS_REGION: 'auto'
+      CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+    defaults:
+      run:
+        working-directory: terraform/cloudflare
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: ${{ env.TF_VERSION }}
+
+      - name: Verify operator secrets are set
+        run: |
+          missing=""
+          [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN"
+          [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID"
+          [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY"
+          [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID"
+          if [ -n "${missing}" ]; then
+            echo "::error::Operator action required — these repo secrets are not set:${missing}"
+            echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time"
+            exit 1
+          fi
+
+      - name: terraform init
+        run: |
+          terraform init \
+            -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" \
+            -backend-config="workspace_key_prefix=${TF_ENV}"
+
+      - name: terraform workspace select
+        run: terraform workspace select "${TF_ENV}"
+
+      - name: terraform plan
+        run: |
+          terraform plan \
+            -var-file="${TF_ENV}.auto.tfvars" \
+            -no-color \
+            -out=tfplan.bin
+
+      - name: terraform apply
+        run: terraform apply -no-color tfplan.bin
+
+      - name: Surface non-sensitive outputs (ids only, NO token values)
+        run: |
+          terraform output -no-color account_id || true
+          terraform output -no-color zone_id || true
+          terraform output -no-color deploy_token_id || true
+          terraform output -no-color admin_tunnel_token_id || true
+
+      - name: Reminder
+        run: |
+          echo "::notice::PRODUCTION APPLY COMPLETE."
+          echo "::notice::If tokens were created or rotated, run on an operator workstation:"
+          echo "::notice::  make install-secrets ENV=production"
+          echo "::notice::Confirm the CF dashboard audit log shows the change before revoking the prior token."
diff --git a/.github/workflows/terraform-apply-staging.yml b/.github/workflows/terraform-apply-staging.yml
new file mode 100644
index 0000000..2e8ef76
--- /dev/null
+++ b/.github/workflows/terraform-apply-staging.yml
@@ -0,0 +1,120 @@
+---
+# infra — gated Terraform apply for the STAGING Cloudflare workspace.
+#
+# APPROVAL MODEL: workflow_dispatch ONLY. Never on push, never on merge,
+# never auto-promoted from a previous apply. Operator deliberately
+# triggers this from the Actions tab.
+#
+# Why split per env: staging and production must not share an apply
+# trigger. Splitting prevents a "promote-on-success" pipeline from
+# ever existing for production — every prod apply is a separate human
+# decision (see terraform-apply-production.yml).
+#
+# Security note: every GHA expression consumed in a run: block is
+# wrapped through env: to prevent script injection.
+
+name: terraform-apply-staging
+
+on:
+  workflow_dispatch:
+    inputs:
+      confirm:
+        description: 'Type APPLY-STAGING to confirm'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+concurrency:
+  group: terraform-apply-staging
+  cancel-in-progress: false  # never cancel an in-flight apply
+
+env:
+  TF_VERSION: '1.9.8'
+  TF_IN_AUTOMATION: 'true'
+  TF_INPUT: '0'  # fail fast instead of blocking on an interactive prompt
+  TF_ENV: 'staging'
+
+jobs:
+  guard:
+    name: confirm-input guard
+    runs-on: ubuntu-latest
+    env:
+      CONFIRM_INPUT: ${{ inputs.confirm }}
+    steps:
+      - name: Reject if confirm phrase wrong
+        run: |
+          if [ "${CONFIRM_INPUT}" != "APPLY-STAGING" ]; then
+            echo "::error::confirm input must be exactly 'APPLY-STAGING'"
+            exit 1
+          fi
+
+  apply:
+    name: apply staging
+    needs: guard
+    runs-on: ubuntu-latest
+    environment: staging
+    # Credentials are scoped to this job only, so the guard job (which
+    # handles operator-typed input) never has them in its environment.
+    env:
+      CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }}
+      AWS_REGION: 'auto'
+      CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+    defaults:
+      run:
+        working-directory: terraform/cloudflare
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: ${{ env.TF_VERSION }}
+
+      - name: Verify operator secrets are set
+        run: |
+          missing=""
+          [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN"
+          [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID"
+          [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY"
+          [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID"
+          if [ -n "${missing}" ]; then
+            echo "::error::Operator action required — these repo secrets are not set:${missing}"
+            echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time"
+            exit 1
+          fi
+
+      - name: terraform init
+        run: |
+          terraform init \
+            -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" \
+            -backend-config="workspace_key_prefix=${TF_ENV}"
+
+      - name: terraform workspace select
+        run: terraform workspace select "${TF_ENV}"
+
+      - name: terraform plan
+        run: |
+          terraform plan \
+            -var-file="${TF_ENV}.auto.tfvars" \
+            -no-color \
+            -out=tfplan.bin
+
+      - name: terraform apply
+        run: terraform apply -no-color tfplan.bin
+
+      - name: Surface non-sensitive outputs (ids only, NO token values)
+        run: |
+          terraform output -no-color account_id || true
+          terraform output -no-color zone_id || true
+          terraform output -no-color deploy_token_id || true
+          terraform output -no-color admin_tunnel_token_id || true
+
+      - name: Reminder
+        run: |
+          echo "::notice::STAGING APPLY COMPLETE."
+          echo "::notice::If tokens were created or rotated, run on an operator workstation:"
+          echo "::notice::  make install-secrets ENV=staging"
+          echo "::notice::Promoting to production is a SEPARATE manual decision via terraform-apply-production.yml."
diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml
new file mode 100644
index 0000000..fe9cc2c
--- /dev/null
+++ b/.github/workflows/terraform.yml
@@ -0,0 +1,174 @@
+---
+# infra — Terraform fmt + validate + plan for CF resources.
+#
+# Runs on every push to master and on PRs touching terraform/**.
+# Plan is read-only. Apply is split into per-env manual workflows
+# (terraform-apply-staging.yml, terraform-apply-production.yml).
+# This file NEVER applies — see those workflows for the apply path.
+#
+# Posts the plan diff as a PR comment so reviewers see what apply
+# would do without granting CI apply rights.
+#
+# Security note: all GHA expressions consumed in run: blocks are
+# referenced through env vars to prevent script injection.
+
+name: terraform
+
+on:
+  push:
+    branches: [master]
+    paths:
+      - 'terraform/**'
+      - '.github/workflows/terraform*.yml'
+  pull_request:
+    paths:
+      - 'terraform/**'
+      - '.github/workflows/terraform*.yml'
+  workflow_dispatch:
+
+permissions:
+  contents: read
+  pull-requests: write  # for the plan comment
+
+concurrency:
+  group: terraform-plan-${{ github.ref }}
+  cancel-in-progress: true
+
+env:
+  TF_VERSION: '1.9.8'
+  TF_IN_AUTOMATION: 'true'
+  TF_INPUT: '0'  # fail fast instead of blocking on an interactive prompt
+
+jobs:
+  fmt-validate:
+    name: fmt + validate
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: terraform/cloudflare
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: ${{ env.TF_VERSION }}
+
+      - name: terraform fmt -check
+        run: terraform fmt -check -recursive
+
+      - name: terraform init (backend-bypassed)
+        run: terraform init -backend=false
+
+      - name: terraform validate
+        run: terraform validate -no-color
+
+  plan:
+    name: plan (${{ matrix.env }})
+    needs: fmt-validate
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        env: [staging, production]
+    defaults:
+      run:
+        working-directory: terraform/cloudflare
+    # CF creds + state-backend creds passed in via env, not inlined in run:.
+    env:
+      CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
+      AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }}
+      AWS_REGION: 'auto'
+      CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+      TF_ENV: ${{ matrix.env }}
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: ${{ env.TF_VERSION }}
+
+      - name: Verify operator secrets are set
+        # Bootstrap chicken-and-egg: plan needs CF + R2-HMAC creds, but
+        # those are operator-only one-time setup (see README §Bootstrap).
+        # Without this guard the failure mode is a cryptic AWS-IAM stack
+        # trace from `terraform init`. With it, the error is one line
+        # pointing at the README and the exact missing variable names.
+        run: |
+          missing=""
+          [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN"
+          [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID"
+          [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY"
+          [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID"
+          if [ -n "${missing}" ]; then
+            echo "::error::Operator action required — these repo secrets are not set:${missing}"
+            echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time"
+            exit 1
+          fi
+          echo "all 4 operator secrets present"
+
+      - name: terraform init
+        run: |
+          terraform init \
+            -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" \
+            -backend-config="workspace_key_prefix=${TF_ENV}"
+
+      - name: terraform workspace select-or-create
+        run: terraform workspace select "${TF_ENV}" 2>/dev/null || terraform workspace new "${TF_ENV}"
+
+      - name: terraform plan
+        id: plan
+        run: |
+          set +e
+          terraform plan \
+            -var-file="${TF_ENV}.auto.tfvars" \
+            -no-color \
+            -out=tfplan.bin \
+            -detailed-exitcode 2>&1 | tee /tmp/plan.out
+          ec=${PIPESTATUS[0]}
+          echo "exitcode=${ec}" >> "$GITHUB_OUTPUT"
+          # 0 = no changes, 2 = changes; anything else (1 = error, 130+ = signal) fails.
+          { [ "${ec}" -eq 0 ] || [ "${ec}" -eq 2 ]; } || exit "${ec}"
+
+      - name: Comment plan on PR
+        # Run even when the plan step failed so the "plan failed" verdict
+        # below is reachable; skip only when the plan step never ran (no
+        # exitcode output means /tmp/plan.out was never written).
+        if: ${{ !cancelled() && github.event_name == 'pull_request' && steps.plan.outputs.exitcode != '' }}
+        uses: actions/github-script@v7
+        env:
+          PLAN_ENV: ${{ matrix.env }}
+          PLAN_CODE: ${{ steps.plan.outputs.exitcode }}
+          RUN_ID: ${{ github.run_id }}
+        with:
+          script: |
+            const fs = require('fs');
+            let plan = fs.readFileSync('/tmp/plan.out', 'utf8');
+            if (plan.length > 60000) {
+              plan = plan.slice(0, 60000) + '\n\n... (truncated; full plan in job log)';
+            }
+            const env = process.env.PLAN_ENV;
+            const code = process.env.PLAN_CODE;
+            const verdict = code === '0' ? '✅ no changes'
+              : code === '2' ? '🟡 changes present — review before manual apply'
+              : '❌ plan failed';
+            const body = [
+              `### Terraform plan — \`${env}\``,
+              verdict,
+              '',
+              '<details><summary>plan output</summary>',
+              '',
+              '```hcl',
+              plan,
+              '```',
+              '',
+              '</details>',
+              '',
+              `_Posted by terraform.yml run ${process.env.RUN_ID}. Apply requires manual trigger of terraform-apply-${env}.yml._`,
+            ].join('\n');
+            await github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body,
+            });
diff --git a/.github/workflows/wrangler-build-staging-images.yml b/.github/workflows/wrangler-build-staging-images.yml
new file mode 100644
index 0000000..c53ce26
--- /dev/null
+++ b/.github/workflows/wrangler-build-staging-images.yml
@@ -0,0 +1,203 @@
+---
+# infra — Build custom Docker images for CF Containers (staging only).
+#
+# Builds images that don't ship a usable upstream:
+#   - pg-platform: postgres + pgvector + all 63 platform migrations baked in
+#
+# api / worker / provisioner images are built by their own repos' deploy.yml
+# (which now also pushes :staging — see api/.github/workflows/deploy.yml).
+# This workflow handles only the "wrapped upstream image" cases.
+#
+# Triggers:
+#   - workflow_dispatch (with service input)
+#   - daily cron 09:00 UTC (to pick up migrations merged in api repo)
+#   - push to master touching wrangler/{pg-platform,mongodb,redis-provision,nats}/**
+#   - repository_dispatch event "migrations-changed" from the api repo
+#
+# Security: all GHA expressions consumed in run: blocks are wrapped
+# through env: to prevent script injection.
+
+name: wrangler-build-staging-images
+
+on:
+  workflow_dispatch:
+    inputs:
+      service:
+        description: 'Which custom image to build (or "all")'
+        required: true
+        type: choice
+        default: 'all'
+        options:
+          - all
+          - pg-platform
+          - mongodb
+          - redis-provision
+          - nats
+  push:
+    branches: [master]
+    paths:
+      - 'wrangler/pg-platform/**'
+      - 'wrangler/mongodb/**'
+      - 'wrangler/redis-provision/**'
+      - 'wrangler/nats/**'
+      - '.github/workflows/wrangler-build-staging-images.yml'
+  schedule:
+    - cron: '0 9 * * *'  # daily 09:00 UTC
+  repository_dispatch:
+    types: [migrations-changed]
+
+permissions:
+  contents: read
+  packages: write
+
+concurrency:
+  group: wrangler-build-staging-${{ github.event.inputs.service || 'all' }}
+  cancel-in-progress: false
+
+env:
+  REGISTRY: ghcr.io
+  ORG: instanode-dev
+
+jobs:
+  pg-platform:
+    name: build pg-platform :staging
+    if: |
+      github.event_name == 'schedule' ||
+      github.event_name == 'push' ||
+      github.event_name == 'repository_dispatch' ||
+      (github.event_name == 'workflow_dispatch' && (github.event.inputs.service == 'all' || github.event.inputs.service == 'pg-platform'))
+    runs-on: ubuntu-latest
+    env:
+      SERVICE: pg-platform
+    steps:
+      - name: Checkout infra repo
+        uses: actions/checkout@v6
+        with:
+          path: infra
+
+      - name: Checkout api repo (for the migrations)
+        uses: actions/checkout@v6
+        with:
+          repository: ${{ vars.API_REPO || format('{0}/api', github.repository_owner) }}
+          token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }}
+          path: api
+
+      - name: Verify migrations dir exists + count
+        env:
+          MIGRATIONS_DIR: api/internal/db/migrations
+        run: |
+          if [ ! -d "$MIGRATIONS_DIR" ]; then
+            echo "::error::expected migrations dir $MIGRATIONS_DIR not found"
+            exit 1
+          fi
+          count=$(find "$MIGRATIONS_DIR" -name '*.sql' | wc -l | tr -d ' ')
+          echo "migrations found: $count"
+          if [ "$count" -lt 50 ]; then
+            echo "::warning::only $count migration files — expected ≥50 (live count was 63 as of 2026-05-30)"
+          fi
+
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v4
+
+      - name: Log in to GHCR
+        uses: docker/login-action@v4
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          # GHCR_PUSH_TOKEN is a classic PAT with write:packages, same
+          # pattern as the api/worker/provisioner deploy.yml workflows.
+          password: ${{ secrets.GHCR_PUSH_TOKEN || secrets.GITHUB_TOKEN }}
+
+      - name: Build and push
+        env:
+          IMAGE: ${{ env.REGISTRY }}/${{ env.ORG }}/instant-pg-platform
+        run: |
+          docker buildx build \
+            --platform linux/amd64 \
+            -f infra/wrangler/pg-platform/Dockerfile \
+            -t "${IMAGE}:staging" \
+            -t "${IMAGE}:staging-$(date -u +%Y%m%d)" \
+            --push \
+            .
+
+      - name: Reminder
+        run: |
+          echo "::notice::pg-platform :staging image rebuilt with current migrations."
+          echo "::notice::Next CF Container cold start will re-apply them from the new image."
+          echo "::notice::Trigger a rolling restart with: wrangler deployments tail --env staging"
+
+  # ---------------------------------------------------------------------------
+  # mongodb / redis-provision / nats — small wrapped images.
+  #
+  # These don't need cross-repo migration sync (the wrapping config is fully
+  # self-contained under infra/wrangler/<svc>/). Single-repo checkout +
+  # build + push to GHCR. Same SERVICE-input gating as pg-platform.
+  # ---------------------------------------------------------------------------
+
+  small-images:
+    name: build ${{ matrix.svc }} :staging
+    if: |
+      github.event_name == 'schedule' ||
+      github.event_name == 'push' ||
+      github.event_name == 'repository_dispatch' ||
+      (github.event_name == 'workflow_dispatch' && (github.event.inputs.service == 'all' || github.event.inputs.service == 'mongodb' || github.event.inputs.service == 'redis-provision' || github.event.inputs.service == 'nats'))
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        svc: [mongodb, redis-provision, nats]
+    env:
+      SVC: ${{ matrix.svc }}
+      SERVICE_INPUT: ${{ github.event.inputs.service }}  # via env, never inlined in run:
+    steps:
+      - name: Checkout infra repo
+        uses: actions/checkout@v6
+        with:
+          path: infra
+
+      - name: Skip if matrix svc doesn't match workflow_dispatch input
+        # Avoids spurious matrix entries when operator selected a single
+        # svc via workflow_dispatch. push / cron / dispatch run all 3.
+        id: gate
+        run: |
+          if [ "${GITHUB_EVENT_NAME}" != "workflow_dispatch" ]; then
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          if [ "$SERVICE_INPUT" = "all" ] || [ "$SERVICE_INPUT" = "$SVC" ]; then
+            echo "skip=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "skip=true" >> "$GITHUB_OUTPUT"
+            echo "::notice::skipping $SVC (workflow_dispatch input was '$SERVICE_INPUT')"
+          fi
+
+      - name: Set up Docker Buildx
+        if: steps.gate.outputs.skip == 'false'
+        uses: docker/setup-buildx-action@v4
+
+      - name: Log in to GHCR
+        if: steps.gate.outputs.skip == 'false'
+        uses: docker/login-action@v4
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GHCR_PUSH_TOKEN || secrets.GITHUB_TOKEN }}
+
+      - name: Build and push
+        if: steps.gate.outputs.skip == 'false'
+        env:
+          IMAGE: ${{ env.REGISTRY }}/${{ env.ORG }}/instant-${{ matrix.svc }}
+        run: |
+          docker buildx build \
+            --platform linux/amd64 \
+            -f "infra/wrangler/${SVC}/Dockerfile" \
+            -t "${IMAGE}:staging" \
+            -t "${IMAGE}:staging-$(date -u +%Y%m%d)" \
+            --push \
+            .
+
+      - name: Reminder
+        if: steps.gate.outputs.skip == 'false'
+        run: |
+          echo "::notice::${SVC} :staging image rebuilt."
+          echo "::notice::Trigger a rolling restart with: wrangler containers deploy --env staging"
diff --git a/.github/workflows/wrangler-deploy-staging.yml b/.github/workflows/wrangler-deploy-staging.yml
new file mode 100644
index 0000000..69c63cb
--- /dev/null
+++ b/.github/workflows/wrangler-deploy-staging.yml
@@ -0,0 +1,111 @@
+---
+# infra — CF Containers deploy for staging via wrangler.
+#
+# APPROVAL MODEL: workflow_dispatch ONLY, gated on a typed confirm phrase.
+# There is no auto-run path today: promoting this to deploy on merge to
+# master would require adding a push trigger (no `auto_deploy` input exists).
+#
+# Production does NOT use this workflow — see the eventual
+# production-deploy.yml when the prod target is settled.
+#
+# Security: all GHA expressions consumed in run: blocks are wrapped
+# through env: to prevent script injection.
+
+name: wrangler-deploy-staging
+
+on:
+  workflow_dispatch:
+    inputs:
+      service:
+        description: 'Which service to deploy (or "all")'
+        required: true
+        type: choice
+        options:
+          - all
+          - api
+          - worker
+          - provisioner
+          - pg-platform
+          - pg-customers
+          - mongodb
+          - redis-provision
+          - nats
+      confirm:
+        description: 'Type DEPLOY-STAGING to confirm'
+        required: true
+        type: string
+
+permissions:
+  contents: read
+
+concurrency:
+  group: wrangler-deploy-staging-${{ inputs.service }}
+  cancel-in-progress: false
+
+jobs:
+  guard:
+    name: confirm-input guard
+    runs-on: ubuntu-latest
+    env:
+      CONFIRM_INPUT: ${{ inputs.confirm }}
+    steps:
+      - name: Reject if confirm phrase wrong
+        run: |
+          if [ "${CONFIRM_INPUT}" != "DEPLOY-STAGING" ]; then
+            echo "::error::confirm must be exactly 'DEPLOY-STAGING'"
+            exit 1
+          fi
+
+  deploy:
+    name: deploy ${{ inputs.service }}
+    needs: guard
+    runs-on: ubuntu-latest
+    environment: staging
+    env:
+      CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }}
+      CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }}
+      SERVICE_INPUT: ${{ inputs.service }}
+    steps:
+      - uses: actions/checkout@v6
+
+      - uses: actions/setup-node@v4
+        with:
+          node-version: '20'
+
+      - name: Install wrangler
+        run: npm install -g wrangler@latest
+
+      - name: Validate service name
+        run: |
+          # Whitelist enforced — never embed user input into shell paths
+          # without validating it matches a known service.
+          case "${SERVICE_INPUT}" in
+            all|api|worker|provisioner|pg-platform|pg-customers|mongodb|redis-provision|nats) : ;;
+            *)
+              echo "::error::Unknown service: ${SERVICE_INPUT}"
+              exit 1
+              ;;
+          esac
+
+      - name: Deploy
+        run: |
+          set -euo pipefail
+          if [ "${SERVICE_INPUT}" = "all" ]; then
+            SERVICES="api worker provisioner pg-platform pg-customers mongodb redis-provision nats"
+          else
+            SERVICES="${SERVICE_INPUT}"
+          fi
+          for svc in $SERVICES; do
+            echo "::group::deploying $svc"
+            # Checkout above has no `path:`, so the repo root IS the workspace:
+            # service dirs live at wrangler/<svc>, not infra/wrangler/<svc>.
+            # The subshell keeps the cd from leaking into the next iteration.
+            (cd "wrangler/$svc" && wrangler deploy --env staging)
+            echo "::endgroup::"
+          done
+
+      - name: Reminder
+        run: |
+          echo "::notice::STAGING DEPLOY COMPLETE."
+          echo "::notice::Verify with: curl https://api.staging.instanode.dev/healthz"
+          echo "::notice::Note: stateful containers (pg-*/mongodb/redis-*/nats) have ephemeral disk."
diff --git a/terraform/cloudflare/.gitignore b/terraform/cloudflare/.gitignore
new file mode 100644
index 0000000..343dfcf
--- /dev/null
+++ b/terraform/cloudflare/.gitignore
@@ -0,0 +1,27 @@
+# TF state — lives in R2 backend, never in repo.
+*.tfstate
+*.tfstate.*
+*.tfstate.backup
+.terraform/
+# .terraform.lock.hcl is deliberately NOT ignored — commit it so CI and operators resolve identical provider builds.
+
+# Per-environment variable files — committable ONLY if they contain
+# no secrets. As of bootstrap there are no secrets in any tfvars (auth
+# is via env vars), so we DO commit the .auto.tfvars files. Below
+# excludes only the local ad-hoc ones.
+*.local.tfvars
+*.local.auto.tfvars
+
+# Operator-local overrides
+override.tf
+override.tf.json
+*_override.tf
+*_override.tf.json
+
+# Plan outputs (often contain secret values) — incl. the bare tfplan.bin that make/CI write
+*.tfplan
+*tfplan.bin
+
+# crash logs from the provider
+crash.log
+crash.*.log
diff --git a/terraform/cloudflare/Makefile b/terraform/cloudflare/Makefile
new file mode 100644
index 0000000..23593ff
--- /dev/null
+++ b/terraform/cloudflare/Makefile
@@ -0,0 +1,111 @@
+# Terraform helpers for the CF migration. Run from this dir.
+#
+# Required env vars (export before any target):
+#   CLOUDFLARE_API_TOKEN   — Token A (deploy) for plan/apply
+#   AWS_ACCESS_KEY_ID      — R2 HMAC for TF state bucket
+#   AWS_SECRET_ACCESS_KEY  — R2 HMAC secret for TF state bucket
+#   CF_ACCOUNT_ID          — for backend endpoint URL
+#
+# ENV defaults to staging; pass ENV=production for prod.
+
+ENV ?= staging
+TF ?= terraform
+
+ifneq ($(filter $(ENV),staging production),$(ENV))
+$(error ENV must be 'staging' or 'production' (got '$(ENV)'))
+endif
+
+.PHONY: help init check-workspace fmt validate plan apply destroy install-secrets rotate-tokens clean
+
+help:
+	@echo "Targets:"
+	@echo "  init            — terraform init with R2 backend (one-time per workspace)"
+	@echo "  fmt             — terraform fmt -check (CI also enforces)"
+	@echo "  validate        — terraform validate (offline)"
+	@echo "  plan            — terraform plan (writes tfplan.bin)"
+	@echo "  apply           — terraform apply (reads tfplan.bin from plan target)"
+	@echo "  install-secrets — pull token outputs and push to k8s + GH org secrets"
+	@echo "  rotate-tokens   — bump expiry, plan, apply, install"
+	@echo "  destroy         — DANGEROUS, only for tearing down ephemeral staging"
+	@echo
+	@echo "Env: ENV=$(ENV) (override with ENV=production)"
+
+init:
+	@: $${CF_ACCOUNT_ID?CF_ACCOUNT_ID must be set}
+	$(TF) init \
+		-backend-config="endpoints={s3=\"https://$$CF_ACCOUNT_ID.r2.cloudflarestorage.com\"}" \
+		-backend-config="workspace_key_prefix=$(ENV)"
+	$(TF) workspace select $(ENV) 2>/dev/null || $(TF) workspace new $(ENV)
+
+# Guard: refuse to continue when the selected TF workspace differs from ENV,
+# so one env's tfvars are never planned/applied against the other env's
+# state. Prerequisite of every target that reads or writes workspace state.
+check-workspace:
+	@ws="$$($(TF) workspace show)"; \
+	if [ "$$ws" != "$(ENV)" ]; then \
+		echo "ABORTING — current workspace is '$$ws' but ENV=$(ENV). Run: make init ENV=$(ENV)"; \
+		exit 1; \
+	fi
+
+fmt:
+	$(TF) fmt -check -recursive
+
+validate:
+	$(TF) validate -no-color
+
+plan: check-workspace
+	$(TF) plan -var-file=$(ENV).auto.tfvars -out=tfplan.bin
+
+apply: check-workspace
+	$(TF) apply tfplan.bin
+	@echo
+	@echo "==> Apply complete. If tokens were created/rotated, run:"
+	@echo "    make install-secrets ENV=$(ENV)"
+
+# Pull sensitive token outputs (one-shot, never written to disk) and
+# install them as k8s + GH secrets across all consuming repos. Values are
+# piped via stdin (never argv, which `ps` exposes) and unset on exit.
+install-secrets: check-workspace
+	@: $${GH_TOKEN?GH_TOKEN must be set for 'gh secret set' calls}
+	@DEPLOY_TOKEN="$$($(TF) output -raw deploy_token)"; \
+	if [ -z "$$DEPLOY_TOKEN" ]; then echo "no deploy_token in state — apply first"; exit 1; fi; \
+	echo "==> k8s: writing CLOUDFLARE_API_TOKEN to instant-secrets-cf in instant-$(ENV)"; \
+	printf '%s' "$$DEPLOY_TOKEN" | kubectl create secret generic instant-secrets-cf \
+		-n instant-$(ENV) \
+		--from-file=CLOUDFLARE_API_TOKEN=/dev/stdin \
+		--dry-run=client -o yaml | kubectl apply -f -; \
+	echo "==> GH org secrets: CLOUDFLARE_API_TOKEN across instanodedev/{api,worker,provisioner,instanode-web,dashboard,infra,cli,mcp}"; \
+	for repo in instanodedev/api instanodedev/worker instanodedev/provisioner \
+	            instanodedev/instanode-web instanodedev/dashboard \
+	            instanodedev/infra instanodedev/cli instanodedev/mcp; do \
+		printf '%s' "$$DEPLOY_TOKEN" | gh secret set CLOUDFLARE_API_TOKEN -R "$$repo" >/dev/null \
+			&& echo "  ✓ $$repo" \
+			|| echo "  ✗ $$repo (skipped — repo missing or not authorized)"; \
+	done; \
+	unset DEPLOY_TOKEN
+	@echo
+	@echo "==> Admin/tunnel token (Token B) is operator-only — NOT pushed to CI."
+	@echo "    To install into your local 1Password vault:"
+	@echo "    $(TF) output -raw admin_tunnel_token | op item create --category=ApiCredential --title='cf-admin-tunnel-$(ENV)' credential=-"
+
+# Bump expiry by 180d (deploy) / 90d (admin) — operator edits the .auto.tfvars
+# to set new dates, then this target runs the plan/apply/install loop.
+rotate-tokens:
+	@echo "==> Edit $(ENV).auto.tfvars to set new *_expires_on dates, then:"
+	@echo "    make plan ENV=$(ENV)"
+	@echo "    make apply ENV=$(ENV)"
+	@echo "    make install-secrets ENV=$(ENV)"
+	@echo "    Confirm the rotation in the CF dashboard audit log before"
+	@echo "    revoking the previous token version."
+
+# Tearing down staging is OK (Phase 1 acceptance allows it). NEVER
+# run against production — D-3 cutover keeps state on DO throughout.
+destroy: check-workspace
+	@if [ "$(ENV)" = "production" ]; then \
+		echo "ABORTING — destroy against production is forbidden (D-1/D-3)."; \
+		exit 1; \
+	fi
+	$(TF) destroy -var-file=$(ENV).auto.tfvars
+
+clean:
+	rm -f tfplan.bin
diff --git a/terraform/cloudflare/README.md b/terraform/cloudflare/README.md
new file mode 100644
index 0000000..8f29db0
--- /dev/null
+++ b/terraform/cloudflare/README.md
@@ -0,0 +1,161 @@
+# Cloudflare resources — Terraform
+
+Source of truth for everything we declare in Cloudflare for the InstaNode
+migration: API tokens (deploy + admin/tunnel), DNS records, R2 buckets,
+Pages projects, and (later) Workers + Load Balancers + Page Rules.
+
+> **k8s is NOT in scope here.** k8s manifests stay under `../../k8s/`,
+> managed by `kubectl set image` + the existing per-service auto-deploy
+> per CLAUDE.md rule 15. This dir is for Cloudflare-managed resources only.
+
+## Decision references
+
+This module implements:
+- **D-1** (scope — R2, Pages, CF proxy on api, staging-only Tunnel)
+- **D-2** (staging on full CF stack)
+- **D-3** (per-service DNS-weighted cutover; TTL 60s ≥48h)
+- **D-4** (separate `instant-staging-data` ns — k8s-side, not here, but the staging Pages project + R2 bucket parallel it)
+- **D-7** (NS delegation is CF; already verified)
+- **D-8** (R2 env-var canonical names: `R2_HMAC_KEY_ID` / `R2_HMAC_SECRET`)
+- **D-14** (operator credentials — outputs from `tokens.tf` install via `make install-secrets`)
+
+Source: `/tmp/cf-migration/shared/DECISIONS.md`.
+
+## Bootstrap (one-time)
+
+The TF state lives in R2, which means the R2 bucket for state and the
+HMAC creds to write to it must exist BEFORE `terraform init`. Manual
+chicken-and-egg step:
+
+```bash
+# 1. Create the state bucket via wrangler (operator-side, one time).
+wrangler r2 bucket create instanode-tf-state --location wnam
+
+# 2. Create R2 HMAC for state access only (scope: instanode-tf-state).
+#    Dashboard → R2 → Manage R2 API Tokens → Create:
+#      - Name: "tf-state-rw"
+#      - Permission: Object Read & Write
+#      - Specify buckets: instanode-tf-state
+#    Save the Access Key ID + Secret + Endpoint.
+
+# 3. Export the state-backend creds, CF auth token and account id for terraform.
+export AWS_ACCESS_KEY_ID="<r2-access-key-id>"
+export AWS_SECRET_ACCESS_KEY="<r2-secret-access-key>"
+export CLOUDFLARE_API_TOKEN="<cloudflare-api-token>" CF_ACCOUNT_ID="<cloudflare-account-id>"
+
+# 4. Init the backend with the account endpoint + the same key prefix the Makefile/CI use.
+terraform init -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" \
+  -backend-config="workspace_key_prefix=staging"
+
+# 5. Pick a workspace (staging first).
+terraform workspace new staging
+terraform workspace select staging
+
+# 6. Plan + apply.
+terraform plan -var-file=staging.auto.tfvars -out=staging.tfplan
+terraform apply staging.tfplan
+```
+
+After `apply` succeeds you have:
+- Two CF API tokens in TF state (deploy + admin_tunnel).
+- The staging Pages project + R2 bucket + DNS records.
+- Output values for token secrets (sensitive — see next section).
+
+## Installing token secrets into k8s + GH
+
+Tokens are SENSITIVE outputs — they appear once in TF state and once
+when `terraform output -raw <name>` is run. To install:
+
+```bash
+# Read the tokens (do NOT redirect to a file you'll commit).
+DEPLOY_TOKEN="$(terraform output -raw deploy_token)"
+ADMIN_TUNNEL_TOKEN="$(terraform output -raw admin_tunnel_token)"
+
+# k8s — staging namespace.
+kubectl create secret generic instant-secrets-cf \
+  -n instant-staging \
+  --from-literal=CLOUDFLARE_API_TOKEN="$DEPLOY_TOKEN" \
+  --dry-run=client -o yaml | kubectl apply -f -
+
+# GH org / repo secrets — for CI auto-deploys.
+for repo in instanodedev/api instanodedev/worker instanodedev/provisioner \
+            instanodedev/instanode-web instanodedev/dashboard \
+            instanodedev/infra; do
+  gh secret set CLOUDFLARE_API_TOKEN -b"$DEPLOY_TOKEN" -R "$repo"
+done
+
+# Admin/tunnel token: ONLY into a separate operator-local Vault, never
+# into CI. Used break-glass for Tunnel/Access changes.
+op item create --category=ApiCredential --title="cf-admin-tunnel-staging" \
+  --vault="instanode-prod" credential="$ADMIN_TUNNEL_TOKEN"
+
+unset DEPLOY_TOKEN ADMIN_TUNNEL_TOKEN
+```
+
+## Workflow during the migration
+
+1. **Plan-on-PR.** Every PR that changes a `.tf` file under this dir
+   triggers `terraform plan` in CI; diff posted as PR comment.
+2. **Apply-on-merge.** Merge to `main` triggers `terraform apply` via
+   the workflow (gated on approval — `instanodedev/infra` already has
+   manual-apply discipline; rule 15 doesn't auto-deploy `infra`).
+3. **Per-PR contract checklist (rule 22)** still applies. A TF PR that
+   adds a new host or changes the API base URL ALSO needs the
+   synchronized code edits in `api/internal/handlers/openapi.go` +
+   `content/llms.txt` + the dashboard/cli/mcp/sdk-go base-URL constants.
+4. **Per-PR observability checklist (rule 25)** still applies. New
+   resources that emit metrics need an `instant_*` Prom rule + NR alert
+   JSON + dashboard tile + METRICS-CATALOG row in the same PR.
+
+## Workspace conventions
+
+- `terraform workspace new staging` / `terraform workspace new production`
+- `terraform workspace select <env>` before any plan/apply
+- `var.environment` is NOT selected by the workspace. Terraform auto-loads
+  every `*.auto.tfvars` file in this directory in lexical order, so
+  `staging.auto.tfvars` overrides `production.auto.tfvars` unless you pass
+  `-var-file=<env>.auto.tfvars` explicitly (a `-var-file` flag takes
+  precedence over auto-loaded files). Always pass it, in CI and interactively.
+ +## File layout + +| File | Purpose | +|---|---| +| `versions.tf` | TF + provider pinning, R2 backend config | +| `providers.tf` | CF provider (reads `CLOUDFLARE_API_TOKEN` env) | +| `variables.tf` | account_id, zone_id, environment, token expiries | +| `tokens.tf` | `cloudflare_account_token.deploy` + `.admin_tunnel` | +| `r2.tf` | R2 bucket + 24h-TTL lifecycle rule on `anon/` prefix | +| `dns.tf` | DNS records (apex / www / api / staging) with TTL 60s | +| `pages.tf` | Pages project for `instanode-web` (Phase 2) | +| `outputs.tf` | Sensitive token outputs (consumed by `make install-secrets`) | +| `staging.auto.tfvars` | Workspace-scoped vars for staging | +| `production.auto.tfvars` | Workspace-scoped vars for production | + +## What's NOT here (yet) + +- **Workers** — CEO D-1 deferred until measured TTFB benefit shows up. +- **Hyperdrive** — same; api and DO Managed PG are same-region, no win today. +- **D1** — KILLED per D-1. +- **CF Email Routing** — DEFERRED; outbound stays on Brevo. +- **Tunnels** — Phase 5 staging-only; add `tunnels.tf` when that PR ships, scoped to admin_tunnel token. +- **Load Balancers** — pending the CF Startups operator ticket (D-6, 5–10 day lead). Once enabled, add `lb.tf`. +- **Page Rules / Cache Rules** — Phase 4 only (api orange-cloud cut). Per D-12, the rule is an explicit path-allowlist for `/healthz`, `/openapi.json`, `/llms.txt`; NEVER Authorization-header-based. + +## R2 HMAC keys (NOT here) + +The R2 HMAC Access Key ID / Secret used by `common/storageprovider/r2/` +are SEPARATE from the CF API token and are generated via the R2 dashboard +"Manage R2 API Tokens" UI (NOT this Terraform). Reason: the +`cloudflare_r2_bucket` resource doesn't issue per-bucket HMAC pairs; +that's a one-off operator action, scoped to the specific bucket. + +After Phase 0 creates the staging bucket, the operator runs: +1. Dashboard → R2 → Manage R2 API Tokens → Create +2. Permissions: Object Read & Write +3. 
Specify buckets: `instant-shared-staging` (NOT *Apply to all buckets*)
+4. TTL: 180 days
+5. Save the resulting `Access Key ID` + `Secret Access Key` into
+   `instant-secrets` as `R2_HMAC_KEY_ID` + `R2_HMAC_SECRET` (D-8 names).
+
+Repeat for `instant-shared` (prod) after staging passes 48h green (D-9).
diff --git a/terraform/cloudflare/cache.tf b/terraform/cloudflare/cache.tf
new file mode 100644
index 0000000..9864c11
--- /dev/null
+++ b/terraform/cloudflare/cache.tf
@@ -0,0 +1,96 @@
+# Cache rules for api.staging.instanode.dev (and api.instanode.dev once
+# Phase 4 flips proxied=true on the api A-record).
+#
+# D-12 (LOCKED): cache scope is an EXPLICIT path allowlist — `/healthz`,
+# `/openapi.json`, `/llms.txt`. Everything else BYPASSES cache regardless
+# of Authorization header presence. The original "bypass cache when
+# Authorization header is set" approach was deleted because (a) the
+# primitive doesn't exist on our zone tier, (b) it's a footgun if an
+# authed response ever flows through cache.
+#
+# Plus: `instant_unexpected_cached_response_total` P0 metric in the api
+# code (NOT here — handler-side) trips an alert if a request OUTSIDE
+# the allowlist ever responds with cache-hit semantics. Defense in depth.
+
+# Cache is OFF by default (rule 1); only the allowlisted paths below re-enable it. NOTE(review): a zone presumably has a single entry-point ruleset per phase and both workspaces share var.zone_id — confirm staging and production can each own this resource.
+resource "cloudflare_ruleset" "api_cache_rules" {
+  zone_id     = var.zone_id
+  name        = "api-cache-rules"
+  description = "D-12 explicit-path allowlist for api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}"
+  kind        = "zone"
+  phase       = "http_request_cache_settings"
+
+  # Every matching rule is applied in list order; for conflicting cache settings the LAST matching rule wins.
+  rules = [
+    # Rule 1: bypass cache for every request to the api host. Listed FIRST
+    # so the path-specific allowlist rules below can override it.
+    {
+      action      = "set_cache_settings"
+      description = "bypass cache for all api.* paths by default"
+      enabled     = true
+      expression  = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\")"
+      action_parameters = {
+        cache = false
+      }
+    },
+    # Rule 2: allow cache for /healthz. Cache Rules evaluate every rule in
+    # order and, for conflicting settings, the LAST matching rule wins — so
+    # each allowlist entry must stay AFTER the catch-all bypass above,
+    # otherwise the bypass would override it.
+    {
+      action      = "set_cache_settings"
+      description = "cache /healthz at edge for 30s — same SHA across instances"
+      enabled     = true
+      expression  = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/healthz\")"
+      action_parameters = {
+        cache = true
+        edge_ttl = {
+          mode    = "override_origin"
+          default = 30
+        }
+        browser_ttl = {
+          mode    = "override_origin"
+          default = 0
+        }
+      }
+    },
+    # Rule 3: cache /openapi.json for 5 minutes — frequently re-fetched
+    # by tooling, changes rarely.
+    {
+      action      = "set_cache_settings"
+      description = "cache /openapi.json at edge for 5min"
+      enabled     = true
+      expression  = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/openapi.json\")"
+      action_parameters = {
+        cache = true
+        edge_ttl = {
+          mode    = "override_origin"
+          default = 300
+        }
+        browser_ttl = {
+          mode    = "override_origin"
+          default = 60
+        }
+      }
+    },
+    # Rule 4: cache /llms.txt for 1 hour — static content from content
+    # repo, refresh cadence is "operator manually re-syncs".
+    {
+      action      = "set_cache_settings"
+      description = "cache /llms.txt at edge for 1h"
+      enabled     = true
+      expression  = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/llms.txt\")"
+      action_parameters = {
+        cache = true
+        edge_ttl = {
+          mode    = "override_origin"
+          default = 3600
+        }
+        browser_ttl = {
+          mode    = "override_origin"
+          default = 600
+        }
+      }
+    },
+  ]
+}
diff --git a/terraform/cloudflare/dns.tf b/terraform/cloudflare/dns.tf
new file mode 100644
index 0000000..fbfd48c
--- /dev/null
+++ b/terraform/cloudflare/dns.tf
@@ -0,0 +1,65 @@
+# DNS records under management.
+#
+# Pre-cutover ritual (D-3): TTL must be 60s for ≥48h BEFORE any cut.
+# Setting it that low here means terraform plan/apply itself satisfies
+# the pre-step the first time we touch the record. This applies to
+# grey-cloud records only: proxied records use ttl = 1 (automatic), since
+# Cloudflare controls the TTL served for proxied records.
+#
+# `proxied = true` = CF orange-cloud; `false` = grey-cloud (DNS only, no
+# proxy). Today: marketing apex is orange (Phase 0 baseline), api is grey
+# (becomes orange in Phase 4 — flip this flag in that phase's PR).
+#
+# apex / www / api are zone-wide records and both workspaces share one
+# zone (var.zone_id), so they are count-gated to the production workspace.
+# Ungated, the staging workspace would own them too and
+# `make destroy ENV=staging` would delete production DNS.
+
+locals {
+  marketing_origin = "instanode-web.pages.dev" # set per environment in staging.tfvars / production.tfvars after Pages project is created
+  api_origin       = "152.42.154.144"          # DigitalOcean LB; replaced with LB pool resource in Phase 4
+}
+
+resource "cloudflare_dns_record" "apex" {
+  count   = var.environment == "production" ? 1 : 0
+  zone_id = var.zone_id
+  name    = var.zone_name
+  type    = "CNAME"
+  content = local.marketing_origin
+  ttl     = 1 # automatic — required form for proxied records
+  proxied = true
+  comment = "marketing apex; CNAME-flattened to Pages project"
+}
+
+resource "cloudflare_dns_record" "www" {
+  count   = var.environment == "production" ? 1 : 0
+  zone_id = var.zone_id
+  name    = "www.${var.zone_name}"
+  type    = "CNAME"
+  content = var.zone_name
+  ttl     = 1 # automatic — required form for proxied records
+  proxied = true
+  comment = "www → apex redirect handled by CF page rule"
+}
+
+resource "cloudflare_dns_record" "api" {
+  count   = var.environment == "production" ? 1 : 0
+  zone_id = var.zone_id
+  name    = "api.${var.zone_name}"
+  type    = "A"
+  content = local.api_origin
+  ttl     = 60    # grey-cloud: the D-3 60s pre-cutover TTL applies here
+  proxied = false # Phase 4 flips this to true after CF orange-cloud cache rules are applied
+  comment = "api; grey-cloud today, orange-cloud per Phase 4 cut (D-3)"
+}
+
+resource "cloudflare_dns_record" "staging" {
+  count   = var.environment == "staging" ? 1 : 0
+  zone_id = var.zone_id
+  name    = "staging.${var.zone_name}"
+  type    = "CNAME"
+  content = "instant-staging.${var.zone_name}.cdn.cloudflare.net" # Pages preview hostname; replaced after Pages project is up
+  ttl     = 1 # automatic — required form for proxied records
+  proxied = true
+  comment = "staging mirror per D-2"
+}
diff --git a/terraform/cloudflare/outputs.tf b/terraform/cloudflare/outputs.tf
new file mode 100644
index 0000000..3b123f3
--- /dev/null
+++ b/terraform/cloudflare/outputs.tf
@@ -0,0 +1,34 @@
+# Token VALUES are sensitive — operator must `terraform output -raw deploy_token`
+# and immediately pipe into `kubectl create secret` / `gh secret set`. Never
+# `terraform output` (no -raw) in a CI log: the redacted form ("(sensitive)")
+# is still a footgun if anyone removes `sensitive = true`.
+
+output "deploy_token_id" {
+  value       = cloudflare_account_token.deploy.id
+  description = "Token A id (non-sensitive; safe in CI logs)."
+}
+
+output "deploy_token" {
+  value       = cloudflare_account_token.deploy.value
+  description = "Token A secret. Pipe directly into k8s/GH secret; never log."
+  sensitive   = true
+}
+
+output "admin_tunnel_token_id" {
+  value       = cloudflare_account_token.admin_tunnel.id
+  description = "Token B id (non-sensitive)."
+}
+
+output "admin_tunnel_token" {
+  value       = cloudflare_account_token.admin_tunnel.value
+  description = "Token B secret. Operator-only; never put into CI."
+  sensitive   = true
+}
+
+output "account_id" {
+  value = var.account_id
+}
+
+output "zone_id" {
+  value = var.zone_id
+}
diff --git a/terraform/cloudflare/pages.tf b/terraform/cloudflare/pages.tf
new file mode 100644
index 0000000..49f0037
--- /dev/null
+++ b/terraform/cloudflare/pages.tf
@@ -0,0 +1,61 @@
+# Cloudflare Pages project for instanode-web (marketing site).
+# Phase 2 in FINAL-PLAN.md. Dashboard-on-Pages is KILLED per D-5;
+# do NOT add a second `cloudflare_pages_project` for dashboard here.
+
+resource "cloudflare_pages_project" "instanode_web" {
+  account_id        = var.account_id
+  name              = var.environment == "production" ? "instanode-web" : "instanode-web-staging"
+  production_branch = "main"
+
+  build_config = {
+    build_command       = "npm run build"
+    destination_dir     = "dist"
+    root_dir            = ""
+    web_analytics_tag   = null
+    web_analytics_token = null
+  }
+
+  source = {
+    type = "github"
+    config = {
+      owner                         = "instanodedev"
+      repo_name                     = "instanode-web"
+      production_branch             = "main"
+      pr_comments_enabled           = true
+      production_deployment_enabled = true
+      preview_deployment_setting    = "all"
+      preview_branch_includes       = ["*"]
+      preview_branch_excludes       = []
+    }
+  }
+
+  deployment_configs = {
+    production = {
+      compatibility_date  = "2026-05-30"
+      compatibility_flags = []
+      env_vars = {
+        VITE_API_URL = {
+          type  = "plain_text"
+          value = var.environment == "production" ? "https://api.instanode.dev" : "https://api.staging.instanode.dev"
+        }
+        VITE_ENV = {
+          type  = "plain_text"
+          value = var.environment
+        }
+      }
+    }
+    preview = {
+      compatibility_date  = "2026-05-30"
+      compatibility_flags = []
+    }
+  }
+}
+
+# Custom domain binding — production apex only. Staging's binding is cloudflare_pages_domain.staging_marketing in staging.tf; declaring it here as well would attach the same domain twice.
+# NOTE(review): nothing here waits for Phase 2 acceptance (D-9 equivalent for marketing: zero broken-link diff) — a production apply binds the apex immediately.
+resource "cloudflare_pages_domain" "instanode_web" {
+  count        = var.environment == "production" ? 1 : 0
+  account_id   = var.account_id
+  project_name = cloudflare_pages_project.instanode_web.name
+  name         = var.zone_name
+}
diff --git a/terraform/cloudflare/production.auto.tfvars b/terraform/cloudflare/production.auto.tfvars
new file mode 100644
index 0000000..0c188fe
--- /dev/null
+++ b/terraform/cloudflare/production.auto.tfvars
@@ -0,0 +1,4 @@
+environment = "production"
+
+deploy_token_expires_on       = "2026-11-26T23:59:59Z"
+admin_tunnel_token_expires_on = "2026-08-28T23:59:59Z"
diff --git a/terraform/cloudflare/providers.tf b/terraform/cloudflare/providers.tf
new file mode 100644
index 0000000..a89234e
--- /dev/null
+++ b/terraform/cloudflare/providers.tf
@@ -0,0 +1,8 @@
+provider "cloudflare" {
+  # Reads CLOUDFLARE_API_TOKEN from env. Operator uses Token A
+  # ("instanode-migration-deploy-<env>") for everything except Tunnel/Access
+  # changes — for those, switch the env var to Token B in a separate
+  # apply (see _modules/tunnel/README.md).
+  #
+  # Never commit a value here.
+}
diff --git a/terraform/cloudflare/r2.tf b/terraform/cloudflare/r2.tf
new file mode 100644
index 0000000..61206ba
--- /dev/null
+++ b/terraform/cloudflare/r2.tf
@@ -0,0 +1,38 @@
+# R2 buckets. Per CEO D-1 + DevOps D-4, staging gets a parallel bucket
+# (`instant-shared-staging`); production keeps the existing name and
+# moves traffic into it via the storageprovider env-flip (D-8 names).
+#
+# Lifecycle rule: anon/ prefix expires after 24h (matches the platform's
+# anon-resource TTL contract — pay-from-day-one, no trial creep).
+
+locals {
+  bucket_name = var.environment == "production" ? "instant-shared" : "instant-shared-staging"
+}
+
+resource "cloudflare_r2_bucket" "shared" {
+  account_id    = var.account_id
+  name          = local.bucket_name
+  location      = "WNAM" # Western North America. NOTE(review): the original rationale was "closest to our DO NYC3 cluster", but NYC3 is on the US east coast — confirm WNAM rather than ENAM is intended.
+  storage_class = "Standard"
+}
+
+# 24h TTL on anon/ — matches platform contract that anonymous resources
+# expire after 24h (CLAUDE.md "anonymous (24h TTL) is the only free tier").
+resource "cloudflare_r2_bucket_lifecycle" "shared_anon_24h" {
+  account_id  = var.account_id
+  bucket_name = cloudflare_r2_bucket.shared.name
+
+  rules = [{
+    id      = "anon-24h"
+    enabled = true
+    conditions = {
+      prefix = "anon/"
+    }
+    delete_objects_transition = {
+      condition = {
+        type    = "Age"
+        max_age = 86400 # 24h in seconds (24 * 60 * 60)
+      }
+    }
+  }]
+}
diff --git a/terraform/cloudflare/staging.auto.tfvars b/terraform/cloudflare/staging.auto.tfvars
new file mode 100644
index 0000000..b7489f6
--- /dev/null
+++ b/terraform/cloudflare/staging.auto.tfvars
@@ -0,0 +1,6 @@
+environment = "staging"
+
+# Tokens rotate every 180d (deploy) / 90d (admin). Override per env if staging is on a shorter cycle.
+# NOTE: Terraform auto-loads every *.auto.tfvars in lexical order, so this file is also read in the production workspace — always pass -var-file=<env>.auto.tfvars.
+deploy_token_expires_on       = "2026-11-26T23:59:59Z"
+admin_tunnel_token_expires_on = "2026-08-28T23:59:59Z"
diff --git a/terraform/cloudflare/staging.tf b/terraform/cloudflare/staging.tf
new file mode 100644
index 0000000..10deda6
--- /dev/null
+++ b/terraform/cloudflare/staging.tf
@@ -0,0 +1,182 @@
+# Staging-environment subdomains under staging.instanode.dev.
+#
+# All resources here are count-gated on `var.environment == "staging"` so
+# they only materialize in the staging workspace; the production workspace
+# plan shows no changes from this file.
+#
+# DIVISION OF RESPONSIBILITY between TF and wrangler:
+#
+# - **TF owns** wildcard records, env-level subdomains (dashboard, webhook),
+#   and the deployment-app wildcard. These don't have a 1:1 Worker/Container
+#   mapping or they're pre-deploy plumbing.
+# - **Wrangler owns** service-specific hostnames via `custom_domain = true`
+#   in each wrangler.toml. wrangler auto-creates the DNS + cert + route on
+#   first deploy. That covers: api.staging.instanode.dev (managed by
+#   infra/wrangler/api/wrangler.toml).
+#
+# DO NOT add explicit TF records for hostnames wrangler is already
+# custom-domain-claiming — wrangler will fail to deploy with "DNS record
+# already exists" if both manage it.
+
+locals {
+  is_staging = var.environment == "staging"
+  # All staging subdomains live under this stem.
+  staging_stem = "staging.${var.zone_name}"
+}
+
+# -----------------------------------------------------------------------------
+# Wildcards under *.staging.instanode.dev
+# -----------------------------------------------------------------------------
+#
+# Each per-tenant service in wrangler/ uses a hostname-shard pattern:
+#   - pg-customer-<tenant>.staging.instanode.dev (pg-customers Container)
+#   - mongo-<tenant>.staging.instanode.dev (mongodb Container)
+#   - redis-<tenant>.staging.instanode.dev (redis-provision Container)
+#   - nats-<tenant>.staging.instanode.dev (nats Container)
+#
+# A single proxied wildcard CNAME catches all of them; the Worker shells
+# in each wrangler service extract the tenant from the hostname and
+# dispatch to the right Durable Object via `idFromName(tenant)`.
+
+resource "cloudflare_dns_record" "staging_wildcard" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "*.${local.staging_stem}"
+  type    = "CNAME"
+  # CF requires SOME content for proxied CNAMEs; this is a placeholder. The
+  # cloudflare_workers_route below routes traffic to the correct Worker
+  # regardless of what's here. A 404 sink is intentional — any unrouted
+  # subdomain hits CF's default 404 page.
+  content = local.staging_stem
+  ttl     = 1 # automatic — required form for proxied records (the D-3 60s TTL only matters for grey-cloud records)
+  proxied = true
+  comment = "wildcard for per-tenant CF Container services in staging; routed via cloudflare_workers_route below"
+}
+
+# -----------------------------------------------------------------------------
+# Deployment-app wildcard: *.deployment.staging.instanode.dev
+# -----------------------------------------------------------------------------
+#
+# Mirror of prod's `*.deployment.instanode.dev`. Every /deploy/new staging
+# call provisions an app at `<slug>.deployment.staging.instanode.dev`.
+# Wrangler-managed Containers for the deploy compute target this wildcard;
+# the api Worker creates a DNS-less custom-domain claim per slug, but the
+# wildcard ensures any future deploy slug resolves to CF before its
+# custom-domain claim lands.
+
+resource "cloudflare_dns_record" "staging_deployment_wildcard" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "*.deployment.${local.staging_stem}"
+  type    = "CNAME"
+  content = "deployment.${local.staging_stem}"
+  ttl     = 1 # automatic — required form for proxied records
+  proxied = true
+  comment = "wildcard for /deploy/new staging apps (mirrors prod *.deployment.instanode.dev)"
+}
+
+# Anchor for the deployment wildcard CNAME (the wildcard's content needs
+# a real record at the parent name).
+resource "cloudflare_dns_record" "staging_deployment_anchor" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "deployment.${local.staging_stem}"
+  type    = "AAAA"
+  content = "100::" # IPv6 discard prefix — never reachable; CF proxied front-end terminates
+  ttl     = 1       # automatic — required form for proxied records
+  proxied = true
+  comment = "anchor for deployment wildcard CNAME (CF requires a real record at the parent)"
+}
+
+# -----------------------------------------------------------------------------
+# Webhook subdomain: webhook.staging.instanode.dev
+# -----------------------------------------------------------------------------
+#
+# /webhook/new staging endpoints return a URL at this host. Separate subdomain
+# (vs api.staging.) so customers can filter outbound webhook traffic by destination host.
+# NOTE(review): meant to reach the api Container via a Worker route, but no route for this host is declared in this file — confirm wrangler owns it.
+
+resource "cloudflare_dns_record" "staging_webhook" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "webhook.${local.staging_stem}"
+  type    = "AAAA"
+  content = "100::" # placeholder; CF orange-cloud handles routing
+  ttl     = 1       # automatic — required form for proxied records
+  proxied = true
+  comment = "staging /webhook/new receiver subdomain"
+}
+
+# -----------------------------------------------------------------------------
+# Dashboard subdomain: dashboard.staging.instanode.dev
+# -----------------------------------------------------------------------------
+#
+# CEO killed dashboard-on-Pages for PROD (D-5) but staging dashboard is
+# useful for QA. Points at the same dashboard Pages project at the
+# `staging` branch preview hostname. NOT enabled for production — D-5
+# stands.
+
+resource "cloudflare_dns_record" "staging_dashboard" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  name    = "dashboard.${local.staging_stem}"
+  type    = "CNAME"
+  content = "instanode-dashboard-staging.pages.dev" # set after dashboard Pages project is created
+  ttl     = 1                                       # automatic — required form for proxied records
+  proxied = true
+  comment = "staging dashboard — QA-only; D-5 keeps prod dashboard off Pages"
+}
+
+# -----------------------------------------------------------------------------
+# Workers Routes for per-tenant wildcards
+# -----------------------------------------------------------------------------
+#
+# `custom_domain = true` in wrangler.toml does NOT support wildcards.
+# Wildcards need cloudflare_workers_route + a wildcard DNS record (done
+# above). Each route binds a pattern to a specific Worker name; wrangler
+# deploys the Worker, TF wires the route.
+
+resource "cloudflare_workers_route" "staging_pg_customers" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  pattern = "pg-customer-*.${local.staging_stem}/*"
+  script  = "instanode-pg-customers-staging"
+}
+
+resource "cloudflare_workers_route" "staging_mongodb" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  pattern = "mongo-*.${local.staging_stem}/*"
+  script  = "instanode-mongodb-staging"
+}
+
+resource "cloudflare_workers_route" "staging_redis" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  pattern = "redis-*.${local.staging_stem}/*"
+  script  = "instanode-redis-provision-staging"
+}
+
+resource "cloudflare_workers_route" "staging_nats" {
+  count   = local.is_staging ? 1 : 0
+  zone_id = var.zone_id
+  pattern = "nats-*.${local.staging_stem}/*"
+  script  = "instanode-nats-staging"
+}
+
+# -----------------------------------------------------------------------------
+# Pages custom domain — staging marketing site
+# -----------------------------------------------------------------------------
+#
+# The Pages project itself is declared in pages.tf (it is named
+# "instanode-web-staging" in this workspace). Referencing its `.name` below
+# makes Terraform create the project before attaching the domain. The
+# custom-domain attachment is here so prod's pages.tf stays simple.
+
+resource "cloudflare_pages_domain" "staging_marketing" {
+  count        = local.is_staging ? 1 : 0
+  account_id   = var.account_id
+  project_name = cloudflare_pages_project.instanode_web.name
+  name         = local.staging_stem
+  depends_on   = [cloudflare_dns_record.staging]
+}
diff --git a/terraform/cloudflare/tokens.tf b/terraform/cloudflare/tokens.tf
new file mode 100644
index 0000000..8a37ad0
--- /dev/null
+++ b/terraform/cloudflare/tokens.tf
@@ -0,0 +1,75 @@
+# Two scoped API tokens replace the Global API Key for CI / DevOps use.
+# Source: exported from CF dashboard 2026-05-30, renamed to avoid the
+# default `example_account_token` collision.
+#
+# WARNING — token values are SENSITIVE outputs. They appear once in TF
+# state after `apply`. Operator MUST run the `make install-secrets`
+# helper (see Makefile) to push them into k8s + GH org secrets, then
+# rotate state.
+
+# Token A — day-to-day deploy + DNS + R2 + Pages + Workers + Page Rules
+# + Load Balancing + Cache Purge + Zone Settings. Account-broad, zone-
+# narrow on instanode.dev. Used by CI.
+resource "cloudflare_account_token" "deploy" {
+  account_id = var.account_id
+  name       = "instanode-migration-deploy-${var.environment}"
+  expires_on = var.deploy_token_expires_on
+
+  policies = [
+    # Zone-scoped permissions on instanode.dev (zone_id pinned). NOTE(review): the trailing labels are unverified — confirm every id → name pair against the account's permission_groups listing before relying on them.
+    {
+      effect = "allow"
+      permission_groups = [
+        { id = "c4df38be41c247b3b4b7702e76eadae0" }, # Zone:Read
+        { id = "3030687196b94b638145a3953da2b699" }, # DNS:Edit
+        { id = "c8fed203ed3043cba015a93ad1616f1f" }, # Zone Settings:Edit
+        { id = "c03055bc037c4ea9afb9a9f104b7b721" }, # Cache Purge:Purge
+        { id = "e17beae8b8cb423a99b1730f21238bed" }, # Page Rules:Edit
+        { id = "ed07f6c337da4195b4e72a1fb2c6bcae" }, # SSL and Certificates:Edit
+        { id = "6d7f2f5f5b1d4a0e9081fdc98d432fd1" }, # Load Balancers:Edit
+        { id = "4755a26eedb94da69e1066d98aa820be" }, # Apps:Edit (zone-side)
+      ]
+      resources = jsonencode({
+        "com.cloudflare.api.account.zone.${var.zone_id}" = "*"
+      })
+    },
+    # Account-scoped permissions for resources that aren't zone-bound.
+    {
+      effect = "allow"
+      permission_groups = [
+        { id = "dc44f27f48ab405392a5f69fe822bd01" }, # Workers Scripts:Edit
+        { id = "8d28297797f24fb8a0c332fe0866ec89" }, # Workers KV Storage:Edit
+        { id = "bf7481a1826f439697cb59a20b22293e" }, # Workers R2 Storage:Edit
+        { id = "f7f0eda5697f475c90846e879bab8666" }, # Cloudflare Pages:Edit
+        { id = "e086da7e2179491d91ee5f35b3ca210a" }, # Account Settings:Read — NOTE(review): the admin_tunnel token uses a different id under this same label; at least one label is wrong
+        { id = "d2a1802cc9a34e30852f8b33869b2f3c" }, # LB Monitors & Pools:Edit
+        { id = "c1fde68c7bcc44588cbb6ddbc16d6480" }, # Account Analytics:Read
+      ]
+      resources = jsonencode({
+        "com.cloudflare.api.account.${var.account_id}" = "*"
+      })
+    },
+  ]
+}
+
+# Token B — break-glass / rare-use Tunnel + Access. Smaller scope, shorter
+# expiry. NOT used by CI; kept as separate apply for blast-radius isolation.
+resource "cloudflare_account_token" "admin_tunnel" {
+  account_id = var.account_id
+  name       = "instanode-migration-admin-tunnel-${var.environment}"
+  expires_on = var.admin_tunnel_token_expires_on
+
+  policies = [{
+    effect = "allow"
+    permission_groups = [
+      { id = "ad7a6f88896d498f98eb30592abfbbf4" }, # Cloudflare Tunnel:Edit
+      { id = "77efc2c0724d4c4eb94bfd9656247130" }, # Access: Apps and Policies:Edit
+      { id = "db37e5f1cb1a4e1aabaef8deaea43575" }, # Access: Service Tokens:Edit
+      { id = "a1c0fec57cf94af79479a6d827fa518c" }, # Access: Organizations, Identity Providers:Edit
+      { id = "1e13c5124ca64b72b1969a67e8829049" }, # Account Settings:Read — NOTE(review): the deploy token uses a different id under this same label; verify against the permission_groups listing
+    ]
+    resources = jsonencode({
+      "com.cloudflare.api.account.${var.account_id}" = "*"
+    })
+  }]
+}
diff --git a/terraform/cloudflare/variables.tf b/terraform/cloudflare/variables.tf
new file mode 100644
index 0000000..7e9f005
--- /dev/null
+++ b/terraform/cloudflare/variables.tf
@@ -0,0 +1,37 @@
+variable "account_id" {
+  type        = string
+  description = "Cloudflare account ID (CF for Startups credit-tagged account)."
+  default     = "613a9e74136364c781a8e258326019f9"
+}
+
+variable "zone_id" {
+  type        = string
+  description = "Cloudflare zone ID for instanode.dev."
+  default     = "08a1a569d2d6f9a713dc6d62103c5dc6"
+}
+
+variable "zone_name" {
+  type    = string
+  default = "instanode.dev"
+}
+
+variable "environment" {
+  type        = string
+  description = "staging or production. Set via -var-file=<env>.auto.tfvars; NOT derived from the selected workspace."
+  validation {
+    condition     = contains(["staging", "production"], var.environment)
+    error_message = "environment must be one of: staging, production."
+  }
+}
+
+variable "deploy_token_expires_on" {
+  type        = string
+  description = "RFC3339 expiry for the deploy token. Rotate every ≤180d."
+  default     = "2026-11-26T23:59:59Z"
+}
+
+variable "admin_tunnel_token_expires_on" {
+  type        = string
+  description = "RFC3339 expiry for the admin/tunnel token. Rotate every ≤90d."
+  default     = "2026-08-28T23:59:59Z"
+}
diff --git a/terraform/cloudflare/versions.tf b/terraform/cloudflare/versions.tf
new file mode 100644
index 0000000..942c3ae
--- /dev/null
+++ b/terraform/cloudflare/versions.tf
@@ -0,0 +1,27 @@
+terraform {
+  required_version = ">= 1.6.3" # the s3 backend below uses use_path_style / skip_requesting_account_id / skip_s3_checksum (and `endpoints` at init), which older releases reject
+
+  required_providers {
+    cloudflare = {
+      source  = "cloudflare/cloudflare"
+      version = "~> 5.0"
+    }
+  }
+
+  # State lives in R2 (S3-compatible). The bucket "instanode-tf-state" must
+  # be created out-of-band before `terraform init` — see README §Bootstrap.
+  # Operator passes -backend-config="..." at init time; we DON'T hardcode
+  # the account-specific endpoint or HMAC creds here.
+  backend "s3" {
+    bucket                      = "instanode-tf-state"
+    key                         = "cloudflare/terraform.tfstate"
+    region                      = "auto"
+    use_path_style              = true
+    skip_credentials_validation = true
+    skip_metadata_api_check     = true
+    skip_region_validation      = true
+    skip_requesting_account_id  = true
+    skip_s3_checksum            = true
+    encrypt                     = true
+  }
+}
diff --git a/wrangler/README.md b/wrangler/README.md
new file mode 100644
index 0000000..db2b867
--- /dev/null
+++ b/wrangler/README.md
@@ -0,0 +1,97 @@
+# Wrangler — CF Containers for staging
+
+This directory deploys instanode.dev services as **Cloudflare Containers**
+to the **staging** environment. Each service has its own subdir with a
+`wrangler.toml` + a tiny Worker shell (`src/worker.ts`) that exposes the
+Container via a Durable Object binding.
+
+Production does NOT use this — see the `production-` workflow when written.
+Per user direction 2026-05-30: staging is CF-only, ephemeral state acceptable.
+
+## Why wrangler, not Terraform
+
+The `cloudflare/cloudflare` Terraform provider (v5.19.1 as of bootstrap) does
+NOT yet expose a `cloudflare_container` resource. Verified by `terraform
+providers schema -json | jq '.. | keys?' | grep container` → empty.
+ +Until the provider catches up, we manage Containers via `wrangler` and +**Terraform manages everything else**: DNS, R2, Pages, Hyperdrive, KV, +Queues, secrets — see `../terraform/cloudflare/`. + +When `cloudflare_container` ships, we'll swap in. Until then, the +boundary is clean: + +| Surface | Tool | +|---|---| +| DNS records, R2 buckets, Pages projects, Hyperdrive config, API tokens | **Terraform** (`../terraform/cloudflare/`) | +| CF Containers (api/worker/provisioner + stateful staging services) | **Wrangler** (this dir) | +| k8s manifests (production data plane until that migrates) | **kubectl** (`../k8s/`) | + +## Ephemeral-state acceptance criterion + +CF Containers wipe disk every time an instance goes to sleep (which fires +on traffic-quiet, not just intentional restart). Source: +https://developers.cloudflare.com/containers/platform-details/ + +This means our staging Postgres / Mongo / Redis / NATS containers WILL +lose their data, mid-test sometimes. E2E test design MUST tolerate this: + +1. **Every test seeds its own fixtures** at start; no test assumes state + from a prior test. +2. **No "deploy now, verify in 2h" tests** — the container may have + slept and lost its state in between. +3. **Tests that span multiple HTTP calls** must complete within one + container-active window (typically minutes). +4. **`/db/new` in staging** returns a connection string that may stop + working when the backing Container sleeps. Documented in the staging + API responses. +5. **Synthetic monitors** keep the high-traffic Containers warm; cold + ones are accepted as ephemeral. + +These tradeoffs are explicit and user-blessed per the CF-only staging +decision. Production has a different host (TBD — not in this dir). 
+ +## Per-service layout + +Each subdir contains: + +``` +infra/wrangler// +├── wrangler.toml # CF Container + Worker config +├── src/ +│ └── worker.ts # Tiny Worker shell that wraps the Container DO +├── Dockerfile # Optional override; defaults to ../..//Dockerfile +└── README.md # Service-specific notes (image source, env vars, ports) +``` + +The actual service code (api, worker, provisioner) lives in its own repo +under `instanodedev/` and produces a Docker image that wrangler ships. +For services without a separate repo (pg-platform, pg-customers, mongodb, +redis-provision, nats), we use upstream public images (`postgres:16`, +`mongo:7`, `redis:7`, `nats:2`) and a small staging-only init script. + +## Deploy + +CI auto-deploys on merge to `master` via `../.github/workflows/wrangler-deploy-staging.yml`. +Manual deploy from an operator workstation: + +```bash +cd infra/wrangler/ +wrangler login # one-time +wrangler containers deploy --env staging +``` + +Requires `CLOUDFLARE_API_TOKEN` env (Token A from the TF outputs). + +## Service inventory + +| Subdir | What runs | Stateful? 
| Public hostname (staging) | Notes | +|---|---|---|---|---| +| `api/` | instanode.dev api binary | no | `api.staging.instanode.dev` | HTTP only | +| `worker/` | River job worker | no | none (cron) | Triggered by CF Cron | +| `provisioner/` | gRPC :50051 service | no | private (Container→Container only) | api calls it | +| `pg-platform/` | postgres:16 | **yes, ephemeral** | private | `instance_type=standard`; data wiped on sleep | +| `pg-customers/` | postgres:16 | **yes, ephemeral** | `pg-customer-<tenant>.staging.instanode.dev` (one per tenant) | Customer-facing in staging only | +| `mongodb/` | mongo:7 | **yes, ephemeral** | private | accessed by /nosql/new staging | +| `redis-provision/` | redis:7 | **yes, ephemeral** | `redis-<tenant>.staging.instanode.dev` | Customer-facing | +| `nats/` | nats:2 (no JetStream — JS needs durable disk) | **yes, ephemeral** | `nats-<tenant>.staging.instanode.dev` | Core NATS only in staging | diff --git a/wrangler/api/README.md b/wrangler/api/README.md new file mode 100644 index 0000000..80a190f --- /dev/null +++ b/wrangler/api/README.md @@ -0,0 +1,35 @@ +# api — CF Containers staging deploy + +Wraps the Go api binary (port 8080) in a CF Container. Image pulled from +`ghcr.io/instanode-dev/instant-api:staging` — built by the api repo's CI on every +push to master, tagged with `:staging` for staging deploys. 
+ +## Env vars and secrets + +Config (committed): +- `ENVIRONMENT=staging` +- `OBJECT_STORE_BACKEND=r2` +- `R2_BUCKET_NAME=instant-shared-staging` + +Secrets (via `wrangler secret put`): +- `DATABASE_URL` — points at `pg-platform` Container DO via service binding +- `CUSTOMER_DATABASE_URL` — points at `pg-customers` Container DO +- `REDIS_URL` — service binding to `redis-platform` +- `NATS_URL` — service binding to `nats` +- `AES_KEY`, `JWT_SECRET`, `RAZORPAY_WEBHOOK_SECRET`, `BREVO_API_KEY` — same names as k8s prod +- `R2_HMAC_KEY_ID`, `R2_HMAC_SECRET` — from R2 dashboard, scoped to `instant-shared-staging` bucket + +## Deploy + +```bash +cd infra/wrangler/api +wrangler containers deploy --env staging +``` + +CI auto-deploys on merge to master via the workflow in `infra/.github/workflows/`. + +## Known constraints + +- **Disk wipes on sleep** — api itself is stateless so this is fine; downstream PG/Mongo are NOT (see ../README.md acceptance criterion). +- **HTTP only** — gRPC api→provisioner is fine (CF Containers support HTTP/2). +- **No persistent customer port-forwards** — the dashboard's port-forward proxy is disabled on staging. diff --git a/wrangler/api/src/worker.ts b/wrangler/api/src/worker.ts new file mode 100644 index 0000000..7e78d5c --- /dev/null +++ b/wrangler/api/src/worker.ts @@ -0,0 +1,32 @@ +// Tiny Worker shell for the api Container. +// +// CF Containers require a Worker entrypoint that forwards requests to +// the Container's Durable Object. The container itself runs the actual +// Go binary (instanodedev/api), listening on :8080. +// +// Every incoming HTTP request is routed to a Container instance; CF +// handles spin-up/spin-down. Disk is ephemeral — see ../README.md. + +import { Container, getContainer } from "@cloudflare/containers"; + +export class ApiContainer extends Container { + // The Go binary listens on :8080. + defaultPort = 8080; + // Sleep after 10 minutes of no traffic. 
CF will spin back up on the + // next request, with a fresh disk. The api is stateless (state lives + // in pg-platform Container), so cold-start is correctness-safe. + sleepAfter = "10m"; +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + // Route every request to a single Container instance (single-shard + // for staging; production would shard by tenant or geo). + const container = getContainer(env.API_CONTAINER); + return container.fetch(request); + }, +}; + +interface Env { + API_CONTAINER: DurableObjectNamespace<ApiContainer>; +} diff --git a/wrangler/api/wrangler.toml b/wrangler/api/wrangler.toml new file mode 100644 index 0000000..a403a09 --- /dev/null +++ b/wrangler/api/wrangler.toml @@ -0,0 +1,64 @@ +# instanode-api on CF Containers (staging). +# +# The api is a Go binary listening on :8080. CF Containers wraps it in a +# Durable Object; the Worker shell in src/worker.ts forwards every HTTP +# request to the container. +# +# Image: pulled from GHCR (built by api repo's CI on every push to master). + +name = "instanode-api" +main = "src/worker.ts" +compatibility_date = "2026-05-30" + +# Per-environment config keeps the staging deploy isolated from any future +# prod deploy (which won't live here — production goes to a non-CF k8s). +[env.staging] +name = "instanode-api-staging" +routes = [ + { pattern = "api.staging.instanode.dev", custom_domain = true }, +] + +# Container backed by a Durable Object class. +[[env.staging.containers]] +class_name = "ApiContainer" +image = "ghcr.io/instanode-dev/instant-api:staging" +max_instances = 3 +instance_type = "standard" # 1 vCPU, 4 GiB RAM, 8 GiB ephemeral disk + +[[env.staging.durable_objects.bindings]] +name = "API_CONTAINER" +class_name = "ApiContainer" + +[[env.staging.migrations]] +tag = "v1" +new_sqlite_classes = ["ApiContainer"] + +# Env vars passed to the container. Secrets via `wrangler secret put`. 
+[env.staging.vars] +ENVIRONMENT = "staging" +OBJECT_STORE_BACKEND = "r2" +R2_BUCKET_NAME = "instant-shared-staging" +# DATABASE_URL, REDIS_URL, NATS_URL, etc. resolve to other Container DOs +# via service bindings — see [[env.staging.services]] block. + +# Service bindings — Worker can RPC into other Containers/Workers without +# a public hostname. +[[env.staging.services]] +binding = "PG_PLATFORM" +service = "instanode-pg-platform-staging" +environment = "staging" + +[[env.staging.services]] +binding = "PROVISIONER" +service = "instanode-provisioner-staging" +environment = "staging" + +[[env.staging.services]] +binding = "REDIS_PLATFORM" +service = "instanode-redis-platform-staging" +environment = "staging" + +# Observability — send Container stdout/stderr to a CF Logpush sink. +[env.staging.observability] +enabled = true +head_sampling_rate = 1.0 diff --git a/wrangler/mongodb/Dockerfile b/wrangler/mongodb/Dockerfile new file mode 100644 index 0000000..afbe234 --- /dev/null +++ b/wrangler/mongodb/Dockerfile @@ -0,0 +1,30 @@ +# mongodb image for staging CF Container. +# +# Base: mongo:7. CF Containers' ephemeral disk means EVERY cold start +# is a fresh init — there is no "first init vs subsequent restart" +# distinction. The mongo image's docker-entrypoint runs initdb scripts +# on every fresh /data/db, so the staging-bootstrap script below runs +# every cold start. +# +# Why custom (vs pristine mongo:7): +# - Bake the staging-bootstrap that creates the admin user + sets +# the wire compression default so api can connect without +# post-deploy operator action. +# - Healthcheck via `mongosh ping` for the Worker shell's wait loop. +# - Per-tenant database names are CREATED on demand by provisioner; +# no per-tenant schema baked in here. + +FROM mongo:7 + +# Staging-bootstrap: idempotent admin user. 
Mongo entrypoint reads +# MONGO_INITDB_ROOT_USERNAME / MONGO_INITDB_ROOT_PASSWORD from env on +# first init; this script is a defence-in-depth ensure path used by +# the api's connection-test against `db.adminCommand({ ping: 1 })`. +COPY infra/wrangler/mongodb/docker-entrypoint-initdb.d/ /docker-entrypoint-initdb.d/ + +# `mongosh` is in the base image; the healthcheck just exercises a +# round-trip via the admin DB to confirm the daemon is up + responsive. +HEALTHCHECK --interval=10s --timeout=3s --start-period=30s --retries=3 \ + CMD mongosh --quiet --eval "db.adminCommand({ping:1}).ok" --host=localhost | grep -q '^1$' || exit 1 + +EXPOSE 27017 diff --git a/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js b/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js new file mode 100644 index 0000000..ef7f31e --- /dev/null +++ b/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js @@ -0,0 +1,27 @@ +// Staging-bootstrap for mongodb CF Container. Runs on EVERY cold start +// because CF Containers wipe /data/db on sleep. +// +// Idempotent: createUser fails with code 51003 ("user already exists") +// if the admin already created the user in the same boot — we swallow +// that. Other codes propagate. + +(function () { + var adminDb = db.getSiblingDB('admin'); + + // Mongo entrypoint already creates the root user from + // MONGO_INITDB_ROOT_USERNAME/MONGO_INITDB_ROOT_PASSWORD. Confirm it + // resolved successfully so the api connection doesn't hit "no users + // configured" on the first call. 
+ var users = adminDb.system.users.find({ user: 'admin' }).count(); + if (users === 0) { + print('00_staging_bootstrap: no admin user found, creating one from env vars'); + adminDb.createUser({ + user: process.env.MONGO_INITDB_ROOT_USERNAME || 'admin', + pwd: process.env.MONGO_INITDB_ROOT_PASSWORD || 'staging-bootstrap', + roles: [{ role: 'root', db: 'admin' }], + }); + } else { + print('00_staging_bootstrap: admin user already provisioned by mongo entrypoint'); + } + print('00_staging_bootstrap: complete'); +})(); diff --git a/wrangler/mongodb/src/worker.ts b/wrangler/mongodb/src/worker.ts new file mode 100644 index 0000000..5cc2570 --- /dev/null +++ b/wrangler/mongodb/src/worker.ts @@ -0,0 +1,19 @@ +import { Container, getContainer } from "@cloudflare/containers"; + +export class MongoContainer extends Container { + defaultPort = 27017; + sleepAfter = "20m"; +} + +export default { + async fetch(request: Request, env: Env): Promise { + const url = new URL(request.url); + const tenant = url.hostname.split(".")[0].replace(/^mongo-/, ""); + const id = env.MONGO_CONTAINER.idFromName(tenant); + return env.MONGO_CONTAINER.get(id).fetch(request); + }, +}; + +interface Env { + MONGO_CONTAINER: DurableObjectNamespace; +} diff --git a/wrangler/mongodb/wrangler.toml b/wrangler/mongodb/wrangler.toml new file mode 100644 index 0000000..48d30dc --- /dev/null +++ b/wrangler/mongodb/wrangler.toml @@ -0,0 +1,30 @@ +# mongodb — per-tenant Mongo in a CF Container (staging). + +name = "instanode-mongodb" +main = "src/worker.ts" +compatibility_date = "2026-05-30" + +[env.staging] +name = "instanode-mongodb-staging" +routes = [ + { pattern = "mongo-*.staging.instanode.dev/*", custom_domain = true }, +] + +[[env.staging.containers]] +class_name = "MongoContainer" +# Custom image — wraps mongo:7 with staging-bootstrap + healthcheck. 
+image = "ghcr.io/instanode-dev/instant-mongodb:staging" +max_instances = 10 +instance_type = "standard" + +[[env.staging.durable_objects.bindings]] +name = "MONGO_CONTAINER" +class_name = "MongoContainer" + +[[env.staging.migrations]] +tag = "v1" +new_sqlite_classes = ["MongoContainer"] + +[env.staging.observability] +enabled = true +head_sampling_rate = 1.0 diff --git a/wrangler/nats/Dockerfile b/wrangler/nats/Dockerfile new file mode 100644 index 0000000..e3cd67a --- /dev/null +++ b/wrangler/nats/Dockerfile @@ -0,0 +1,23 @@ +# nats image for staging CF Container. +# +# Base: nats:2-alpine. JetStream needs durable disk — NOT viable on +# CF Containers' ephemeral storage — so this image runs CORE NATS ONLY +# (no -js flag). Customer-facing /queue/new in staging returns a +# legacy_open connection string and tests that exercise JetStream +# features are skipped (see test guard in api/internal/handlers/queue.go). +# +# Auth mode: legacy_open. Per CLAUDE.md "Known Design Gaps", prod +# serves legacy_open until the operator runs `nsc generate` for +# operator/sys NKeys (NATS-AUTH-RUNBOOK.md). Staging matches prod's +# current auth posture. + +FROM nats:2-alpine + +COPY infra/wrangler/nats/nats-server.conf /etc/nats/nats-server.conf + +HEALTHCHECK --interval=10s --timeout=3s --start-period=10s --retries=3 \ + CMD wget -qO- http://localhost:8222/healthz | grep -q '"status":"ok"' || exit 1 + +EXPOSE 4222 8222 + +CMD ["-c", "/etc/nats/nats-server.conf"] diff --git a/wrangler/nats/nats-server.conf b/wrangler/nats/nats-server.conf new file mode 100644 index 0000000..db33a4f --- /dev/null +++ b/wrangler/nats/nats-server.conf @@ -0,0 +1,33 @@ +# Staging nats-server.conf — core NATS only (no JetStream — ephemeral +# disk on CF Containers can't satisfy JetStream's durable WAL). +# +# Auth mode: legacy_open. No per-tenant JWT in staging. Production +# eventually upgrades to per-tenant JWT once an operator runs +# `nsc generate` for operator + sys NKeys (NATS-AUTH-RUNBOOK.md). 
+# This staging config DOES NOT block on that. + +listen: 0.0.0.0:4222 + +# HTTP monitoring endpoint used by the Worker shell's healthcheck. +http: 0.0.0.0:8222 + +# Connection + payload limits matched to CF Container "basic" class. +max_connections: 1000 +max_payload: 1MB +max_pending: 32MB + +# Logging to stdout for `wrangler tail`. +debug: false +trace: false +logtime: true + +# Auth — legacy_open: no creds required. Customers connecting via +# /queue/new staging endpoint get an open URL. +authorization { + # Empty block = no auth. Documented intentional choice. +} + +# NO JetStream block — explicitly disabled because CF Container disk +# is ephemeral. Tests that require JetStream skip on staging via the +# `auth_mode=legacy_open` resource field (see CLAUDE.md /queue/new). +# jetstream { ... } # DO NOT enable in staging diff --git a/wrangler/nats/src/worker.ts b/wrangler/nats/src/worker.ts new file mode 100644 index 0000000..45f2350 --- /dev/null +++ b/wrangler/nats/src/worker.ts @@ -0,0 +1,19 @@ +import { Container, getContainer } from "@cloudflare/containers"; + +export class NatsContainer extends Container { + defaultPort = 4222; + sleepAfter = "20m"; +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const url = new URL(request.url); + const tenant = url.hostname.split(".")[0].replace(/^nats-/, ""); + const id = env.NATS_CONTAINER.idFromName(tenant); + return env.NATS_CONTAINER.get(id).fetch(request); + }, +}; + +interface Env { + NATS_CONTAINER: DurableObjectNamespace<NatsContainer>; +} diff --git a/wrangler/nats/wrangler.toml b/wrangler/nats/wrangler.toml new file mode 100644 index 0000000..7315949 --- /dev/null +++ b/wrangler/nats/wrangler.toml @@ -0,0 +1,40 @@ +# nats — per-tenant NATS in a CF Container (staging). +# NATS JetStream needs durable disk — NOT viable on ephemeral. Staging +# runs core NATS only (no streams). /queue/new in staging returns a +# legacy_open connection string. JetStream features test-skipped. 
+ +name = "instanode-nats" +main = "src/worker.ts" +compatibility_date = "2026-05-30" + +[env.staging] +name = "instanode-nats-staging" +routes = [ + { pattern = "nats-*.staging.instanode.dev/*", custom_domain = true }, +] + +[[env.staging.containers]] +class_name = "NatsContainer" +# Custom image — wraps nats:2-alpine with /etc/nats/nats-server.conf +# baked in (core NATS only, no JetStream, legacy_open auth — matches +# prod's current auth posture). +image = "ghcr.io/instanode-dev/instant-nats:staging" +max_instances = 10 +instance_type = "basic" + +[[env.staging.durable_objects.bindings]] +name = "NATS_CONTAINER" +class_name = "NatsContainer" + +[[env.staging.migrations]] +tag = "v1" +new_sqlite_classes = ["NatsContainer"] + +[env.staging.vars] +# No -js flag → core NATS only. Document that JetStream is staging-disabled +# in /tmp/cf-migration/shared/STAGING-LIMITATIONS.md. +NATS_ARGS = "-m 8222" + +[env.staging.observability] +enabled = true +head_sampling_rate = 1.0 diff --git a/wrangler/pg-customers/src/worker.ts b/wrangler/pg-customers/src/worker.ts new file mode 100644 index 0000000..73ce9b0 --- /dev/null +++ b/wrangler/pg-customers/src/worker.ts @@ -0,0 +1,22 @@ +import { Container, getContainer } from "@cloudflare/containers"; + +export class PgCustomersContainer extends Container { + defaultPort = 5432; + sleepAfter = "20m"; +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + // Per-tenant routing: extract tenant from subdomain. + const url = new URL(request.url); + const tenant = url.hostname.split(".")[0].replace(/^pg-customer-/, ""); + // ID by tenant → one DO instance per tenant (their isolated PG). 
+ const id = env.PG_CUSTOMERS_CONTAINER.idFromName(tenant); + const container = env.PG_CUSTOMERS_CONTAINER.get(id); + return container.fetch(request); + }, +}; + +interface Env { + PG_CUSTOMERS_CONTAINER: DurableObjectNamespace<PgCustomersContainer>; +} diff --git a/wrangler/pg-customers/wrangler.toml b/wrangler/pg-customers/wrangler.toml new file mode 100644 index 0000000..65a2b52 --- /dev/null +++ b/wrangler/pg-customers/wrangler.toml @@ -0,0 +1,36 @@ +# pg-customers — per-tenant Postgres in a CF Container (staging only). +# Customer-facing: /db/new in staging returns a connection string here. +# Data is EPHEMERAL — wipes on container sleep. Documented in ../README.md. + +name = "instanode-pg-customers" +main = "src/worker.ts" +compatibility_date = "2026-05-30" + +[env.staging] +name = "instanode-pg-customers-staging" +# Public TCP exposure happens via the Worker shell; staging clients dial +# `pg-customer-<tenant>.staging.instanode.dev:5432`. +routes = [ + { pattern = "pg-customer-*.staging.instanode.dev/*", custom_domain = true }, +] + +[[env.staging.containers]] +class_name = "PgCustomersContainer" +image = "postgres:16-alpine" +max_instances = 10 # staging cap — bump if QA needs more +instance_type = "standard" + +[[env.staging.durable_objects.bindings]] +name = "PG_CUSTOMERS_CONTAINER" +class_name = "PgCustomersContainer" + +[[env.staging.migrations]] +tag = "v1" +new_sqlite_classes = ["PgCustomersContainer"] + +[env.staging.vars] +PGDATA = "/var/lib/postgresql/data/pgdata" + +[env.staging.observability] +enabled = true +head_sampling_rate = 1.0 diff --git a/wrangler/pg-platform/00_pre.sql b/wrangler/pg-platform/00_pre.sql new file mode 100644 index 0000000..f2c18fb --- /dev/null +++ b/wrangler/pg-platform/00_pre.sql @@ -0,0 +1,25 @@ +-- Runs FIRST in /docker-entrypoint-initdb.d/ (alphabetical sort puts +-- "00_pre.sql" ahead of "001_initial.sql"). Sets up extensions + log +-- markers that every later migration depends on. 
+-- +-- This file is staging-only — production uses different operator-run +-- bootstrap. See infra/wrangler/pg-platform/Dockerfile for context. + +-- pgvector — mig 040+ does CREATE EXTENSION vector and assumes the +-- shared library is loadable. pgvector/pgvector:pg16 ships the .so; +-- this just registers it in the freshly-init'd database. +CREATE EXTENSION IF NOT EXISTS vector; + +-- Standard extensions we use across migrations. +CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; +CREATE EXTENSION IF NOT EXISTS "pgcrypto"; + +-- Match prod timezone — every timestamp comparison in tests assumes UTC. +SET TIME ZONE 'UTC'; + +-- Log marker. Shows in `wrangler tail` so operators know this is a +-- cold-start init (vs an unexpected mid-life restart). +DO $$ +BEGIN + RAISE NOTICE 'pg-platform staging cold start — re-applying 63 migrations against fresh PGDATA'; +END $$; diff --git a/wrangler/pg-platform/Dockerfile b/wrangler/pg-platform/Dockerfile new file mode 100644 index 0000000..3b83f11 --- /dev/null +++ b/wrangler/pg-platform/Dockerfile @@ -0,0 +1,53 @@ +# pg-platform image for staging CF Container. +# +# Base: pgvector/pgvector:pg16 — Postgres 16 + the pgvector extension +# that platform_db's resource embeddings table requires (extension CREATE +# in mig 040+; without pgvector the image init fails on the first +# `CREATE EXTENSION vector` statement). +# +# Migrations: the 63 *.sql files from api/internal/db/migrations/ are +# copied into /docker-entrypoint-initdb.d/. Postgres's official +# entrypoint runs every *.sql alphabetically on first cluster init — +# and CF Containers' ephemeral disk means EVERY cold start IS a first +# cluster init, so the migrations re-apply on every wake-from-sleep. +# +# This is the explicit, user-blessed ephemeral-state tradeoff for the +# CF-only staging design. See ../README.md acceptance criterion. +# +# Build context: workspace root (../../). 
+# Build command (CI runs this; not for ad-hoc local use): +# docker buildx build \ +# -f infra/wrangler/pg-platform/Dockerfile \ +# -t ghcr.io/instanode-dev/instant-pg-platform:staging \ +# --push \ +# . + +FROM pgvector/pgvector:pg16 + +# Copy every migration file in numeric (=alphabetical) order. The +# leading 0NN_*.sql naming guarantees the entrypoint applies them in +# the same order as `make test-db-up` does locally. +COPY api/internal/db/migrations/*.sql /docker-entrypoint-initdb.d/ + +# A pre-script that runs before any migration. Names start with "00_" +# so it sorts ahead of "001_initial.sql". +# +# We use it to: +# 1. CREATE EXTENSION pgvector (idempotent — base image has the +# shared lib; this enables it in the freshly-init'd database). +# 2. Set timezone to UTC to match production. +# 3. Print a one-line marker so the CF Container's logs make clear +# this is a fresh init (operator confidence on cold start). +COPY infra/wrangler/pg-platform/00_pre.sql /docker-entrypoint-initdb.d/00_pre.sql + +# postgres image expects POSTGRES_PASSWORD set; staging wrangler.toml +# wires that through `wrangler secret put POSTGRES_PASSWORD`. The +# image also reads POSTGRES_DB / POSTGRES_USER if provided (wrangler +# env block sets POSTGRES_DB=instant_platform). + +# Healthcheck — pg_isready against the local socket. Used by the +# Worker shell's container.fetch wait-loop. +HEALTHCHECK --interval=10s --timeout=3s --start-period=30s --retries=3 \ + CMD pg_isready -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-instant_platform}" || exit 1 + +EXPOSE 5432 diff --git a/wrangler/pg-platform/README.md b/wrangler/pg-platform/README.md new file mode 100644 index 0000000..67b992a --- /dev/null +++ b/wrangler/pg-platform/README.md @@ -0,0 +1,87 @@ +# pg-platform — staging CF Container + +Postgres 16 + pgvector. Image baked with all 63 platform migrations in +`/docker-entrypoint-initdb.d/` so cold starts come up with a fully +migrated schema. 
+ +## Ephemeral acceptance + +Per the CF-only staging decision (2026-05-30): disk wipes every time the +Container sleeps (which fires on traffic-quiet, not just intentional +restart). Each cold start: + +1. CF Containers wakes the Container with a fresh disk. +2. Postgres entrypoint sees PGDATA empty → runs `initdb`. +3. `00_pre.sql` runs first — pgvector + uuid-ossp + pgcrypto extensions, UTC tz. +4. The 63 migration files run in numeric order (001 → 063). +5. Container reports healthy via `pg_isready`. +6. api / worker / provisioner Containers can now connect via service binding. + +Total cold-start time: estimated 15–45s depending on Container class + +migration count. Anything that talks to pg-platform must tolerate this +warmup (Worker shell's `container.fetch` blocks until healthy). + +## Image build + +The image is built by `infra/.github/workflows/wrangler-build-staging-images.yml` +on push to master that changes any of: +- `api/internal/db/migrations/**` (cross-repo trigger via repository_dispatch — see below) +- `infra/wrangler/pg-platform/**` + +Plus daily at 09:00 UTC to keep up with migrations merged in api repo without +explicit infra commits. + +Manual rebuild: +```bash +gh workflow run wrangler-build-staging-images.yml \ + -R instanode-dev/infra \ + -f service=pg-platform +``` + +## Cross-repo migration sync + +Migrations live in the `api` repo, not infra. Two patterns to keep the +image current: + +1. **Daily cron rebuild** — the build workflow runs nightly with a fresh + checkout of both repos; any new `.sql` file lands within 24h. +2. **`api` repo notifies on migration change** — `api/.github/workflows/notify-infra-on-migration.yml` + sends a `repository_dispatch` event to infra when `api/internal/db/migrations/**` + changes, triggering an immediate build. 
+ +If neither runs, staging pg-platform will be behind on migrations and +api startup will fail with "migration not applied" — operator-visible +via `wrangler tail instanode-pg-platform-staging`. + +## Secrets + +Set via `wrangler secret put`, scoped to `--env staging`: + +| Secret | Source | Purpose | +|---|---|---| +| `POSTGRES_USER` | operator-defined (e.g. `instanode_admin`) | role for connection | +| `POSTGRES_PASSWORD` | random, ≥32 chars | passed to connection_url | +| `POSTGRES_DB` | `instant_platform` | initial DB created at first start | + +The actual connection string handed to api/worker/provisioner is built +via service binding — they see `PG_PLATFORM` env binding, not a raw +URL with the password. + +## Verifying + +```bash +wrangler tail instanode-pg-platform-staging --format pretty +# wait for: "pg-platform staging cold start — re-applying 63 migrations against fresh PGDATA" +# then: "database system is ready to accept connections" + +# from a debug Worker shell: +wrangler dev --env staging +# Then inside the Worker: env.PG_PLATFORM.fetch("http://internal/healthz") +``` + +## Known limitations + +- **Cold-start cost is ~15-45s.** Synthetic warmer can keep it hot; without one, every traffic gap > sleepAfter (currently 30m) pays the full re-migration cost. +- **No replication.** max_instances=1; HA is meaningless when disk is ephemeral. Production gets a different model entirely. +- **No `pg_dump` artifacts persist.** If you need a snapshot for debugging, dump and immediately stream to R2 via the customer-backup pipeline; the local file dies on next sleep. +- **63 migrations is the live count as of 2026-05-30.** When api repo adds mig 064+, the daily cron rebuild picks them up. diff --git a/wrangler/pg-platform/src/worker.ts b/wrangler/pg-platform/src/worker.ts new file mode 100644 index 0000000..7646da8 --- /dev/null +++ b/wrangler/pg-platform/src/worker.ts @@ -0,0 +1,25 @@ +// pg-platform Worker shell. 
Postgres doesn't speak HTTP, but CF +// Containers require a Worker entrypoint. The Worker accepts a +// service-binding RPC from other Containers and forwards a connection +// hint; the actual TCP traffic flows over the Container DO's internal +// network using `container.fetch(request)` with `Upgrade: tcp` semantics +// (CF Containers' raw-TCP mode, available since the GA release). + +import { Container, getContainer } from "@cloudflare/containers"; + +export class PgPlatformContainer extends Container { + defaultPort = 5432; + sleepAfter = "30m"; // Longer than api so platform_db survives test bursts. +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + const container = getContainer(env.PG_CONTAINER); + // Container holds the TCP listener; CF routes the upgraded socket through. + return container.fetch(request); + }, +}; + +interface Env { + PG_CONTAINER: DurableObjectNamespace<PgPlatformContainer>; +} diff --git a/wrangler/pg-platform/wrangler.toml b/wrangler/pg-platform/wrangler.toml new file mode 100644 index 0000000..274e033 --- /dev/null +++ b/wrangler/pg-platform/wrangler.toml @@ -0,0 +1,48 @@ +# pg-platform on CF Containers (staging). +# +# Runs Postgres 16 + pgvector in a CF Container. Data dir is ephemeral — +# every sleep wipes /var/lib/postgresql/data. This is the explicit +# user-blessed tradeoff for CF-only staging. +# +# Production does NOT use this; prod platform_db lives elsewhere. + +name = "instanode-pg-platform" +main = "src/worker.ts" +compatibility_date = "2026-05-30" + +[env.staging] +name = "instanode-pg-platform-staging" +# No public route — accessed only via service binding from api/worker/provisioner. + +[[env.staging.containers]] +class_name = "PgPlatformContainer" +# Custom image built by infra/.github/workflows/wrangler-build-staging-images.yml. +# Bakes the 63 migrations from api/internal/db/migrations/*.sql into +# /docker-entrypoint-initdb.d/ + pgvector extension. See ./Dockerfile. 
+image = "ghcr.io/instanode-dev/instant-pg-platform:staging" +max_instances = 1 # Single-writer; HA is meaningless when disk is ephemeral. +instance_type = "standard" # 1 vCPU, 4 GiB RAM, 8 GiB ephemeral + +[[env.staging.durable_objects.bindings]] +name = "PG_CONTAINER" +class_name = "PgPlatformContainer" + +[[env.staging.migrations]] +tag = "v1" +new_sqlite_classes = ["PgPlatformContainer"] + +# Bootstrap secrets via wrangler secret put: +# POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB +# The Postgres image reads these env vars on first boot to initialize the +# cluster — which it'll redo every sleep cycle. +[env.staging.vars] +POSTGRES_DB = "instant_platform" +# POSTGRES_INITDB_ARGS controls locale; staging just uses default. +PGDATA = "/var/lib/postgresql/data/pgdata" +# The 63 migrations run on container boot: ./Dockerfile bakes them into +# /docker-entrypoint-initdb.d/ and the Postgres entrypoint applies them. +APPLY_MIGRATIONS_ON_BOOT = "true" + +[env.staging.observability] +enabled = true +head_sampling_rate = 1.0 diff --git a/wrangler/provisioner/src/worker.ts b/wrangler/provisioner/src/worker.ts new file mode 100644 index 0000000..72fde55 --- /dev/null +++ b/wrangler/provisioner/src/worker.ts @@ -0,0 +1,16 @@ +import { Container, getContainer } from "@cloudflare/containers"; + +export class ProvisionerContainer extends Container { + defaultPort = 50051; // gRPC + sleepAfter = "20m"; +} + +export default { + async fetch(request: Request, env: Env): Promise<Response> { + return getContainer(env.PROVISIONER_CONTAINER).fetch(request); + }, +}; + +interface Env { + PROVISIONER_CONTAINER: DurableObjectNamespace<ProvisionerContainer>; +} diff --git a/wrangler/provisioner/wrangler.toml b/wrangler/provisioner/wrangler.toml new file mode 100644 index 0000000..d1c93dc --- /dev/null +++ b/wrangler/provisioner/wrangler.toml @@ -0,0 +1,47 @@ +# provisioner — gRPC service in a CF Container (staging). +# No public route; api reaches it via service binding. 
+ +name = "instanode-provisioner" +main = "src/worker.ts" +compatibility_date = "2026-05-30" + +[env.staging] +name = "instanode-provisioner-staging" + +[[env.staging.containers]] +class_name = "ProvisionerContainer" +image = "ghcr.io/instanode-dev/instant-provisioner:staging" +max_instances = 2 +instance_type = "standard" + +[[env.staging.durable_objects.bindings]] +name = "PROVISIONER_CONTAINER" +class_name = "ProvisionerContainer" + +[[env.staging.migrations]] +tag = "v1" +new_sqlite_classes = ["ProvisionerContainer"] + +[env.staging.vars] +ENVIRONMENT = "staging" + +# Provisioner reaches the customer-data Containers via service bindings. +[[env.staging.services]] +binding = "PG_CUSTOMERS" +service = "instanode-pg-customers-staging" + +[[env.staging.services]] +binding = "MONGODB" +service = "instanode-mongodb-staging" + +[[env.staging.services]] +binding = "REDIS_PROVISION" +service = "instanode-redis-provision-staging" + +[[env.staging.services]] +binding = "NATS" +service = "instanode-nats-staging" + +[env.staging.observability] +enabled = true +head_sampling_rate = 1.0 diff --git a/wrangler/redis-provision/Dockerfile b/wrangler/redis-provision/Dockerfile new file mode 100644 index 0000000..299d710 --- /dev/null +++ b/wrangler/redis-provision/Dockerfile @@ -0,0 +1,30 @@ +# redis-provision image for staging CF Container. +# +# Base: redis:7-alpine. CF Containers' ephemeral disk means RDB +# persistence is pointless — every sleep wipes /data. We disable +# RDB + AOF entirely and run in-memory-only with `allkeys-lru` +# eviction so the Container can't OOM under sustained writes. +# +# Why custom (vs pristine redis:7-alpine): +# - Bake redis.conf with auth + memory + eviction policy so the +# Worker shell doesn't have to pass them via wrangler.toml CMD. +# - Healthcheck via `redis-cli -a $REDIS_PASSWORD ping`. +# - Auth is via `requirepass` from REDIS_PASSWORD env (wrangler +# secret). 
+ +FROM redis:7-alpine + +COPY infra/wrangler/redis-provision/redis.conf /etc/redis/redis.conf + +# Entrypoint that templates REDIS_PASSWORD env into the conf at boot. +# Without this, the conf can't contain the secret at build time. +COPY infra/wrangler/redis-provision/entrypoint.sh /usr/local/bin/staging-entrypoint.sh +RUN chmod +x /usr/local/bin/staging-entrypoint.sh + +HEALTHCHECK --interval=10s --timeout=3s --start-period=10s --retries=3 \ + CMD redis-cli -a "$REDIS_PASSWORD" --no-auth-warning ping | grep -q '^PONG$' || exit 1 + +EXPOSE 6379 + +ENTRYPOINT ["/usr/local/bin/staging-entrypoint.sh"] +CMD ["redis-server", "/etc/redis/redis.conf"] diff --git a/wrangler/redis-provision/entrypoint.sh b/wrangler/redis-provision/entrypoint.sh new file mode 100644 index 0000000..bc62464 --- /dev/null +++ b/wrangler/redis-provision/entrypoint.sh @@ -0,0 +1,21 @@ +#!/bin/sh +# Templating entrypoint for staging redis. Inlines REDIS_PASSWORD into +# /etc/redis/redis.conf at boot (the file ships with __REDIS_PASSWORD__ +# as a literal marker; we never bake a real secret into the image). + +set -eu + +if [ -z "${REDIS_PASSWORD:-}" ]; then + echo "redis-provision: REDIS_PASSWORD env var is required" >&2 + exit 1 +fi + +# In-place substitute. Using a temp file because sed -i on alpine +# behaves differently than GNU sed; this is portable. +TMP="$(mktemp)" +sed "s|__REDIS_PASSWORD__|${REDIS_PASSWORD}|" /etc/redis/redis.conf > "$TMP" +mv "$TMP" /etc/redis/redis.conf +chmod 600 /etc/redis/redis.conf # only root reads — defense in depth + +# Hand off to the configured CMD (`redis-server /etc/redis/redis.conf`). +exec "$@" diff --git a/wrangler/redis-provision/redis.conf b/wrangler/redis-provision/redis.conf new file mode 100644 index 0000000..7b423d0 --- /dev/null +++ b/wrangler/redis-provision/redis.conf @@ -0,0 +1,28 @@ +# Staging redis.conf — ephemeral, auth'd, LRU-capped. +# REDIS_PASSWORD is substituted at container boot by entrypoint.sh. 
+
+bind 0.0.0.0
+port 6379
+protected-mode yes
+
+# Auth — entrypoint.sh inlines REDIS_PASSWORD env value here.
+requirepass __REDIS_PASSWORD__
+
+# Memory cap + eviction. CF Container "basic" tier has 1 GiB of RAM; cap at
+# 768 MiB to leave headroom for connection buffers + allocator overhead.
+maxmemory 768mb
+maxmemory-policy allkeys-lru
+
+# No persistence — CF Containers wipe /data on sleep, so RDB snapshots
+# only waste CPU. AOF same. Staging is in-memory-only by design.
+save ""
+appendonly no
+
+# Logging to stdout for `wrangler tail`.
+logfile ""
+loglevel notice
+
+# Connection limits matched to instance class.
+maxclients 1000
+timeout 300
+tcp-keepalive 60
diff --git a/wrangler/redis-provision/src/worker.ts b/wrangler/redis-provision/src/worker.ts
new file mode 100644
index 0000000..2b77911
--- /dev/null
+++ b/wrangler/redis-provision/src/worker.ts
@@ -0,0 +1,51 @@
+import { Container, getContainer } from "@cloudflare/containers";
+
+/**
+ * Per-tenant Redis container. The fields below are configuration read by the
+ * `Container` base class (presumably the proxied port and the idle timeout).
+ *
+ * NOTE(review): Redis speaks RESP over raw TCP, not HTTP — confirm what is
+ * expected to answer the forwarded HTTP requests on 6379.
+ */
+export class RedisContainer extends Container {
+  defaultPort = 6379;
+  sleepAfter = "20m";
+}
+
+const TENANT_HOST_PREFIX = "redis-";
+
+/**
+ * Extracts the tenant slug from the first DNS label of `hostname`.
+ *
+ * @param hostname - request host, e.g. `redis-acme.staging.instanode.dev`
+ * @returns the slug (`acme`), or `null` when the label lacks the `redis-`
+ *   prefix or nothing follows it
+ */
+function tenantFromHostname(hostname: string): string | null {
+  const label = hostname.split(".")[0] ?? "";
+  if (!label.startsWith(TENANT_HOST_PREFIX)) {
+    return null;
+  }
+  const tenant = label.slice(TENANT_HOST_PREFIX.length);
+  return tenant === "" ? null : tenant;
+}
+
+export default {
+  /**
+   * Forwards the request to the container instance named after the tenant in
+   * the hostname. Hosts that do not name a tenant get a 400 instead of
+   * silently creating a container keyed by an arbitrary label.
+   */
+  async fetch(request: Request, env: Env): Promise<Response> {
+    const tenant = tenantFromHostname(new URL(request.url).hostname);
+    if (tenant === null) {
+      return new Response("unknown tenant host", { status: 400 });
+    }
+    const id = env.REDIS_CONTAINER.idFromName(tenant);
+    return env.REDIS_CONTAINER.get(id).fetch(request);
+  },
+};
+
+interface Env {
+  REDIS_CONTAINER: DurableObjectNamespace<RedisContainer>;
+}
diff --git a/wrangler/redis-provision/wrangler.toml b/wrangler/redis-provision/wrangler.toml
new file mode 100644
index 0000000..2896e8d
--- /dev/null
+++ b/wrangler/redis-provision/wrangler.toml
@@ -0,0 +1,32 @@
+# redis-provision — per-tenant Redis in a CF Container (staging).
+
+name = "instanode-redis-provision"
+main = "src/worker.ts"
+compatibility_date = "2026-05-30"
+
+[env.staging]
+name = "instanode-redis-provision-staging"
+routes = [ # NOTE(review): wrangler rejects "*" and paths when custom_domain = true — confirm before deploy
+  { pattern = "redis-*.staging.instanode.dev/*", custom_domain = true },
+] # NOTE(review): a zone route would only allow a leading "*", not "redis-*" — hostname scheme needs a decision
+
+[[env.staging.containers]]
+class_name = "RedisContainer"
+# Custom image — wraps redis:7-alpine with auth + maxmemory + LRU
+# eviction baked into /etc/redis/redis.conf (entrypoint templates
+# REDIS_PASSWORD in at boot).
+image = "ghcr.io/instanode-dev/instant-redis-provision:staging"
+max_instances = 10
+instance_type = "basic" # Redis is lighter than PG/Mongo
+
+[[env.staging.durable_objects.bindings]]
+name = "REDIS_CONTAINER"
+class_name = "RedisContainer"
+
+[[env.staging.migrations]]
+tag = "v1"
+new_sqlite_classes = ["RedisContainer"]
+
+[env.staging.observability]
+enabled = true
+head_sampling_rate = 1.0
diff --git a/wrangler/worker/src/worker.ts b/wrangler/worker/src/worker.ts
new file mode 100644
index 0000000..db330bb
--- /dev/null
+++ b/wrangler/worker/src/worker.ts
@@ -0,0 +1,29 @@
+import { Container, getContainer } from "@cloudflare/containers";
+
+/** Container class for the River job worker; configured via the fields below. */
+export class WorkerContainer extends Container {
+  defaultPort = 8091; // worker exposes /metrics + /readyz on 8091
+  sleepAfter = "20m";
+}
+
+export default {
+  // HTTP path: forward to container (rare; mostly metrics scrapes).
+  async fetch(request: Request, env: Env): Promise<Response> {
+    return getContainer(env.WORKER_CONTAINER).fetch(request);
+  },
+  // Cron path: wake the container so River picks up due jobs.
+  async scheduled(_event: ScheduledEvent, env: Env): Promise<void> {
+    const container = getContainer(env.WORKER_CONTAINER);
+    // A no-op POST that the worker binary handles as "tick the job loop".
+    const res = await container.fetch("http://internal/tick", { method: "POST" });
+    // Log a rejected tick so a broken job loop shows up in the Worker's
+    // logs rather than passing silently; the cron run itself still completes.
+    if (!res.ok) {
+      console.error(`worker tick failed: HTTP ${res.status}`);
+    }
+  },
+};
+
+interface Env {
+  WORKER_CONTAINER: DurableObjectNamespace<WorkerContainer>;
+}
diff --git a/wrangler/worker/wrangler.toml b/wrangler/worker/wrangler.toml
new file mode 100644
index 0000000..05b555d
--- /dev/null
+++ b/wrangler/worker/wrangler.toml
@@ -0,0 +1,40 @@
+# worker — River jobs in a CF Container (staging).
+# Cron triggers via CF Cron Triggers (no public route).
+
+name = "instanode-worker"
+main = "src/worker.ts"
+compatibility_date = "2026-05-30"
+
+[env.staging]
+name = "instanode-worker-staging"
+
+[[env.staging.containers]]
+class_name = "WorkerContainer"
+image = "ghcr.io/instanode-dev/instant-worker:staging"
+max_instances = 2
+instance_type = "standard"
+
+[[env.staging.durable_objects.bindings]]
+name = "WORKER_CONTAINER"
+class_name = "WorkerContainer"
+
+[[env.staging.migrations]]
+tag = "v1"
+new_sqlite_classes = ["WorkerContainer"]
+
+# Cron — fires every 5 minutes; the Worker shell wakes the Container.
+[env.staging.triggers]
+crons = ["*/5 * * * *"]
+
+[env.staging.vars]
+ENVIRONMENT = "staging"
+OBJECT_STORE_BACKEND = "r2"
+R2_BUCKET_NAME = "instant-shared-staging"
+
+[[env.staging.services]]
+binding = "PG_PLATFORM"
+service = "instanode-pg-platform-staging"
+
+[env.staging.observability]
+enabled = true
+head_sampling_rate = 1.0