From 8d1e8a8fcb002307dc8a590695498be58ed5614c Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Sat, 30 May 2026 14:29:03 +0530 Subject: [PATCH 1/3] Revert "Merge pull request #34 from InstaNode-dev/cf/staging-drop-cache" This reverts commit 9d1d9ad99643a97fe463a60ca173772ec7592a0a, reversing changes made to 86922f2bb663bc8511c7f0f3831a776dc9f87297. --- .github/workflows/terraform.yml | 4 -- terraform/cloudflare/cache.tf | 96 +++++++++++++++++++++++++++++++++ 2 files changed, 96 insertions(+), 4 deletions(-) create mode 100644 terraform/cloudflare/cache.tf diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index f301985..693a808 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -173,7 +173,3 @@ jobs: repo: context.repo.repo, body, }); - -# (cf/staging-drop-cache touched this file so the Lint + schema-check -# k8s manifests required check on the infra repo fires for this PR. -# No semantic change.) diff --git a/terraform/cloudflare/cache.tf b/terraform/cloudflare/cache.tf new file mode 100644 index 0000000..9864c11 --- /dev/null +++ b/terraform/cloudflare/cache.tf @@ -0,0 +1,96 @@ +# Cache rules for api.staging.instanode.dev (and api.instanode.dev once +# Phase 4 flips proxied=true on the api A-record). +# +# D-12 (LOCKED): cache scope is an EXPLICIT path allowlist — `/healthz`, +# `/openapi.json`, `/llms.txt`. Everything else BYPASSES cache regardless +# of Authorization header presence. The original "bypass cache when +# Authorization header is set" approach was deleted because (a) the +# primitive doesn't exist on our zone tier, (b) it's a footgun if an +# authed response ever flows through cache. +# +# Plus: `instant_unexpected_cached_response_total` P0 metric in the api +# code (NOT here — handler-side) trips an alert if a request OUTSIDE +# the allowlist ever responds with cache-hit semantics. Defense in depth. + +# Catch-all bypass at top priority — cache OFF for everything by default. +resource "cloudflare_ruleset" "api_cache_rules" { + zone_id = var.zone_id + name = "api-cache-rules" + description = "D-12 explicit-path allowlist for api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}" + kind = "zone" + phase = "http_request_cache_settings" + + # Rules evaluated top-to-bottom; first match wins. + rules = [ + # Rule 1: bypass cache for everything by default (catch-all at lowest + # priority via `Last`). + { + action = "set_cache_settings" + description = "bypass cache for all api.* paths by default" + enabled = true + expression = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\")" + action_parameters = { + cache = false + } + }, + # Rule 2: allow cache for /healthz (overrides bypass via earlier + # evaluation only if listed BEFORE the catch-all; CF Rulesets evaluate + # all rules and the LAST matching action wins for `set_cache_settings`, + # so explicit allowlist comes after the catch-all). + { + action = "set_cache_settings" + description = "cache /healthz at edge for 30s — same SHA across instances" + enabled = true + expression = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/healthz\")" + action_parameters = { + cache = true + edge_ttl = { + mode = "override_origin" + default = 30 + } + browser_ttl = { + mode = "override_origin" + default = 0 + } + } + }, + # Rule 3: cache /openapi.json for 5 minutes — frequently re-fetched + # by tooling, changes rarely. + { + action = "set_cache_settings" + description = "cache /openapi.json at edge for 5min" + enabled = true + expression = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/openapi.json\")" + action_parameters = { + cache = true + edge_ttl = { + mode = "override_origin" + default = 300 + } + browser_ttl = { + mode = "override_origin" + default = 60 + } + } + }, + # Rule 4: cache /llms.txt for 1 hour — static content from content + # repo, refresh cadence is "operator manually re-syncs". + { + action = "set_cache_settings" + description = "cache /llms.txt at edge for 1h" + enabled = true + expression = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/llms.txt\")" + action_parameters = { + cache = true + edge_ttl = { + mode = "override_origin" + default = 3600 + } + browser_ttl = { + mode = "override_origin" + default = 600 + } + } + }, + ] +} From 1da613a75ce29bcc8e0fad98490b4870725a269c Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Sat, 30 May 2026 14:29:03 +0530 Subject: [PATCH 2/3] Revert "Merge pull request #33 from InstaNode-dev/cf/staging-apply-fixes" This reverts commit 86922f2bb663bc8511c7f0f3831a776dc9f87297, reversing changes made to 9448e80f16fab2d34a31b43bb839d328bc43a979. --- .../workflows/terraform-apply-production.yml | 6 ++---- .github/workflows/terraform-apply-staging.yml | 8 ++------ .github/workflows/terraform.yml | 11 +++-------- terraform/cloudflare/dns.tf | 18 ++++++++---------- terraform/cloudflare/staging.tf | 10 +++++----- 5 files changed, 20 insertions(+), 33 deletions(-) diff --git a/.github/workflows/terraform-apply-production.yml b/.github/workflows/terraform-apply-production.yml index e4322b4..415670e 100644 --- a/.github/workflows/terraform-apply-production.yml +++ b/.github/workflows/terraform-apply-production.yml @@ -38,8 +38,7 @@ env: TF_VERSION: '1.9.8' TF_IN_AUTOMATION: 'true' TF_ENV: 'production' - CLOUDFLARE_EMAIL: ${{ secrets.CLOUDFLARE_EMAIL }} - CLOUDFLARE_API_KEY: ${{ secrets.CLOUDFLARE_API_KEY }} + CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }} AWS_REGION: 'auto' @@ -109,8 +108,7 @@ jobs: - name: Verify operator secrets are set run: | missing="" - [ -z "${CLOUDFLARE_EMAIL}" ] && missing="${missing} CLOUDFLARE_EMAIL" - [ -z "${CLOUDFLARE_API_KEY}" ] && missing="${missing} CLOUDFLARE_API_KEY" + [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN" [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID" [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY" [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID" diff --git a/.github/workflows/terraform-apply-staging.yml b/.github/workflows/terraform-apply-staging.yml index bceb50b..2e8ef76 100644 --- a/.github/workflows/terraform-apply-staging.yml +++ b/.github/workflows/terraform-apply-staging.yml @@ -34,10 +34,7 @@ env: TF_VERSION: '1.9.8' TF_IN_AUTOMATION: 'true' TF_ENV: 'staging' - # Global Key via EMAIL+KEY env vars (CLOUDFLARE_API_TOKEN forces Bearer - # which Global Keys fail on Rulesets/R2/account-scoped endpoints — 9106). - CLOUDFLARE_EMAIL: ${{ secrets.CLOUDFLARE_EMAIL }} - CLOUDFLARE_API_KEY: ${{ secrets.CLOUDFLARE_API_KEY }} + CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }} AWS_REGION: 'auto' @@ -75,8 +72,7 @@ jobs: - name: Verify operator secrets are set run: | missing="" - [ -z "${CLOUDFLARE_EMAIL}" ] && missing="${missing} CLOUDFLARE_EMAIL" - [ -z "${CLOUDFLARE_API_KEY}" ] && missing="${missing} CLOUDFLARE_API_KEY" + [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN" [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID" [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY" [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID" diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml index 693a808..fe9cc2c 100644 --- a/.github/workflows/terraform.yml +++ b/.github/workflows/terraform.yml @@ -74,11 +74,7 @@ jobs: working-directory: terraform/cloudflare # CF creds + state-backend creds passed in via env, not inlined in run:. env: - # Global Key via EMAIL+KEY env vars (provider uses X-Auth-* headers). - # NOT CLOUDFLARE_API_TOKEN — that's Bearer-only and Global Keys fail - # Bearer auth on Rulesets / R2 / account-scoped endpoints (9106). - CLOUDFLARE_EMAIL: ${{ secrets.CLOUDFLARE_EMAIL }} - CLOUDFLARE_API_KEY: ${{ secrets.CLOUDFLARE_API_KEY }} + CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }} AWS_REGION: 'auto' @@ -99,8 +95,7 @@ jobs: # pointing at the README and the exact missing variable names. run: | missing="" - [ -z "${CLOUDFLARE_EMAIL}" ] && missing="${missing} CLOUDFLARE_EMAIL" - [ -z "${CLOUDFLARE_API_KEY}" ] && missing="${missing} CLOUDFLARE_API_KEY" + [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN" [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID" [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY" [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID" @@ -109,7 +104,7 @@ jobs: echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time" exit 1 fi - echo "all 5 operator secrets present" + echo "all 4 operator secrets present" - name: terraform init run: | diff --git a/terraform/cloudflare/dns.tf b/terraform/cloudflare/dns.tf index 9f3d9c4..fbfd48c 100644 --- a/terraform/cloudflare/dns.tf +++ b/terraform/cloudflare/dns.tf @@ -1,14 +1,12 @@ # DNS records under management. # # Pre-cutover ritual (D-3): TTL must be 60s for ≥48h BEFORE any cut. -# That ONLY applies to grey-cloud (proxied=false) records — CF requires -# proxied=true records to have ttl=1 (CF manages TTL internally; setting -# 60 returns a 400 "ttl must be set to 1 when `proxied` is true"). +# Setting it that low here means terraform plan/apply itself satisfies +# the pre-step the first time we touch the record. # -# `proxied = true` = CF orange-cloud (ttl=1); `false` = grey-cloud, -# DNS only (ttl=60 for cutover ramp). Today: marketing apex is orange -# (Phase 0 baseline), api is grey (becomes orange in Phase 4 — flip -# both the proxied flag AND ttl=60→1 in that phase's PR). +# `proxied = true` = CF orange-cloud; `false` = grey-cloud (DNS only, no +# proxy). Today: marketing apex is orange (Phase 0 baseline), api is grey +# (becomes orange in Phase 4 — flip this flag in that phase's PR). locals { marketing_origin = "instanode-web.pages.dev" # set per environment in staging.tfvars / production.tfvars after Pages project is created @@ -20,7 +18,7 @@ resource "cloudflare_dns_record" "apex" { name = var.zone_name type = "CNAME" content = local.marketing_origin - ttl = 1 + ttl = 60 proxied = true comment = "marketing apex; CNAME-flattened to Pages project" } @@ -30,7 +28,7 @@ resource "cloudflare_dns_record" "www" { name = "www.${var.zone_name}" type = "CNAME" content = var.zone_name - ttl = 1 + ttl = 60 proxied = true comment = "www → apex redirect handled by CF page rule" } @@ -51,7 +49,7 @@ resource "cloudflare_dns_record" "staging" { name = "staging.${var.zone_name}" type = "CNAME" content = "instant-staging.${var.zone_name}.cdn.cloudflare.net" # Pages preview hostname; replaced after Pages project is up - ttl = 1 + ttl = 60 proxied = true comment = "staging mirror per D-2" } diff --git a/terraform/cloudflare/staging.tf b/terraform/cloudflare/staging.tf index 08bc26a..10deda6 100644 --- a/terraform/cloudflare/staging.tf +++ b/terraform/cloudflare/staging.tf @@ -48,7 +48,7 @@ resource "cloudflare_dns_record" "staging_wildcard" { # regardless of what's here. A 404 sink is intentional — any unrouted # subdomain hits CF's default 404 page. content = local.staging_stem - ttl = 1 + ttl = 60 proxied = true comment = "wildcard for per-tenant CF Container services in staging; routed via cloudflare_workers_route below" } @@ -70,7 +70,7 @@ resource "cloudflare_dns_record" "staging_deployment_wildcard" { name = "*.deployment.${local.staging_stem}" type = "CNAME" content = "deployment.${local.staging_stem}" - ttl = 1 + ttl = 60 proxied = true comment = "wildcard for /deploy/new staging apps (mirrors prod *.deployment.instanode.dev)" } @@ -83,7 +83,7 @@ resource "cloudflare_dns_record" "staging_deployment_anchor" { name = "deployment.${local.staging_stem}" type = "AAAA" content = "100::" # IPv6 discard prefix — never reachable; CF proxied front-end terminates - ttl = 1 + ttl = 60 proxied = true comment = "anchor for deployment wildcard CNAME (CF requires a real record at the parent)" } @@ -102,7 +102,7 @@ resource "cloudflare_dns_record" "staging_webhook" { name = "webhook.${local.staging_stem}" type = "AAAA" content = "100::" # placeholder; CF orange-cloud handles routing - ttl = 1 + ttl = 60 proxied = true comment = "staging /webhook/new receiver subdomain" } @@ -122,7 +122,7 @@ resource "cloudflare_dns_record" "staging_dashboard" { name = "dashboard.${local.staging_stem}" type = "CNAME" content = "instanode-dashboard-staging.pages.dev" # set after dashboard Pages project is created - ttl = 1 + ttl = 60 proxied = true comment = "staging dashboard — QA-only; D-5 keeps prod dashboard off Pages" } From c26a910c38c834900909cf30170cd325ef36fe2f Mon Sep 17 00:00:00 2001 From: Manas Srivastava Date: Sat, 30 May 2026 14:29:03 +0530 Subject: [PATCH 3/3] Revert "Merge pull request #32 from InstaNode-dev/cf/staging-bootstrap" This reverts commit 9448e80f16fab2d34a31b43bb839d328bc43a979, reversing changes made to af87dfccca52b8513cb1e74d54078a4e5cc50c38. --- .../workflows/terraform-apply-production.yml | 152 ------------- .github/workflows/terraform-apply-staging.yml | 116 ---------- .github/workflows/terraform.yml | 170 --------------- .../wrangler-build-staging-images.yml | 203 ------------------ .github/workflows/wrangler-deploy-staging.yml | 110 ---------- terraform/cloudflare/.gitignore | 27 --- terraform/cloudflare/Makefile | 101 --------- terraform/cloudflare/README.md | 161 -------------- terraform/cloudflare/cache.tf | 96 --------- terraform/cloudflare/dns.tf | 55 ----- terraform/cloudflare/outputs.tf | 34 --- terraform/cloudflare/pages.tf | 61 ------ terraform/cloudflare/production.auto.tfvars | 4 - terraform/cloudflare/providers.tf | 8 - terraform/cloudflare/r2.tf | 38 ---- terraform/cloudflare/staging.auto.tfvars | 6 - terraform/cloudflare/staging.tf | 182 ---------------- terraform/cloudflare/tokens.tf | 75 ------- terraform/cloudflare/variables.tf | 37 ---- terraform/cloudflare/versions.tf | 27 --- wrangler/README.md | 97 --------- wrangler/api/README.md | 35 --- wrangler/api/src/worker.ts | 32 --- wrangler/api/wrangler.toml | 64 ------ wrangler/mongodb/Dockerfile | 30 --- .../00_staging_bootstrap.js | 27 --- wrangler/mongodb/src/worker.ts | 19 -- wrangler/mongodb/wrangler.toml | 30 --- wrangler/nats/Dockerfile | 23 -- wrangler/nats/nats-server.conf | 33 --- wrangler/nats/src/worker.ts | 19 -- wrangler/nats/wrangler.toml | 40 ---- wrangler/pg-customers/src/worker.ts | 22 -- wrangler/pg-customers/wrangler.toml | 36 ---- wrangler/pg-platform/00_pre.sql | 25 --- wrangler/pg-platform/Dockerfile | 53 ----- wrangler/pg-platform/README.md | 87 -------- wrangler/pg-platform/src/worker.ts | 25 --- wrangler/pg-platform/wrangler.toml | 48 ----- wrangler/provisioner/src/worker.ts | 16 -- wrangler/provisioner/wrangler.toml | 47 ---- wrangler/redis-provision/Dockerfile | 30 --- wrangler/redis-provision/entrypoint.sh | 21 -- wrangler/redis-provision/redis.conf | 28 --- wrangler/redis-provision/src/worker.ts | 19 -- wrangler/redis-provision/wrangler.toml | 32 --- wrangler/worker/src/worker.ts | 23 -- wrangler/worker/wrangler.toml | 40 ---- 48 files changed, 2664 deletions(-) delete mode 100644 .github/workflows/terraform-apply-production.yml delete mode 100644 .github/workflows/terraform-apply-staging.yml delete mode 100644 .github/workflows/terraform.yml delete mode 100644 .github/workflows/wrangler-build-staging-images.yml delete mode 100644 .github/workflows/wrangler-deploy-staging.yml delete mode 100644 terraform/cloudflare/.gitignore delete mode 100644 terraform/cloudflare/Makefile delete mode 100644 terraform/cloudflare/README.md delete mode 100644 terraform/cloudflare/cache.tf delete mode 100644 terraform/cloudflare/dns.tf delete mode 100644 terraform/cloudflare/outputs.tf delete mode 100644 terraform/cloudflare/pages.tf delete mode 100644 terraform/cloudflare/production.auto.tfvars delete mode 100644 terraform/cloudflare/providers.tf delete mode 100644 terraform/cloudflare/r2.tf delete mode 100644 terraform/cloudflare/staging.auto.tfvars delete mode 100644 terraform/cloudflare/staging.tf delete mode 100644 terraform/cloudflare/tokens.tf delete mode 100644 terraform/cloudflare/variables.tf delete mode 100644 terraform/cloudflare/versions.tf delete mode 100644 wrangler/README.md delete mode 100644 wrangler/api/README.md delete mode 100644 wrangler/api/src/worker.ts delete mode 100644 wrangler/api/wrangler.toml delete mode 100644 wrangler/mongodb/Dockerfile delete mode 100644 wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js delete mode 100644 wrangler/mongodb/src/worker.ts delete mode 100644 wrangler/mongodb/wrangler.toml delete mode 100644 wrangler/nats/Dockerfile delete mode 100644 wrangler/nats/nats-server.conf delete mode 100644 wrangler/nats/src/worker.ts delete mode 100644 wrangler/nats/wrangler.toml delete mode 100644 wrangler/pg-customers/src/worker.ts delete mode 100644 wrangler/pg-customers/wrangler.toml delete mode 100644 wrangler/pg-platform/00_pre.sql delete mode 100644 wrangler/pg-platform/Dockerfile delete mode 100644 wrangler/pg-platform/README.md delete mode 100644 wrangler/pg-platform/src/worker.ts delete mode 100644 wrangler/pg-platform/wrangler.toml delete mode 100644 wrangler/provisioner/src/worker.ts delete mode 100644 wrangler/provisioner/wrangler.toml delete mode 100644 wrangler/redis-provision/Dockerfile delete mode 100644 wrangler/redis-provision/entrypoint.sh delete mode 100644 wrangler/redis-provision/redis.conf delete mode 100644 wrangler/redis-provision/src/worker.ts delete mode 100644 wrangler/redis-provision/wrangler.toml delete mode 100644 wrangler/worker/src/worker.ts delete mode 100644 wrangler/worker/wrangler.toml diff --git a/.github/workflows/terraform-apply-production.yml b/.github/workflows/terraform-apply-production.yml deleted file mode 100644 index 415670e..0000000 --- a/.github/workflows/terraform-apply-production.yml +++ /dev/null @@ -1,152 +0,0 @@ ---- -# infra — gated Terraform apply for the PRODUCTION Cloudflare workspace. -# -# APPROVAL MODEL: workflow_dispatch + GitHub Environment "production" -# with required reviewers. No push trigger. No "promote from staging" -# trigger. Every production apply is a separate, deliberate decision -# made by a human reviewer on a human-triggered run. -# -# Confirm phrase is stricter than staging — operator must type a -# matching staging RUN_ID so they cannot apply prod without having -# first applied + observed the same change in staging. -# -# Security note: every GHA expression consumed in a run: block is -# wrapped through env: to prevent script injection. - -name: terraform-apply-production - -on: - workflow_dispatch: - inputs: - confirm: - description: 'Type APPLY-PRODUCTION to confirm' - required: true - type: string - staging_run_id: - description: 'GH Actions run_id of the matching staging apply (must be a numeric id)' - required: true - type: string - -permissions: - contents: read - -concurrency: - group: terraform-apply-production - cancel-in-progress: false # never cancel an in-flight apply - -env: - TF_VERSION: '1.9.8' - TF_IN_AUTOMATION: 'true' - TF_ENV: 'production' - CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }} - AWS_REGION: 'auto' - CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }} - -jobs: - guard: - name: confirm-input + staging-precedent guard - runs-on: ubuntu-latest - env: - CONFIRM_INPUT: ${{ inputs.confirm }} - STAGING_RUN_ID: ${{ inputs.staging_run_id }} - steps: - - name: Reject if confirm phrase wrong - run: | - if [ "${CONFIRM_INPUT}" != "APPLY-PRODUCTION" ]; then - echo "::error::confirm input must be exactly 'APPLY-PRODUCTION'" - exit 1 - fi - - - name: Reject if staging_run_id is not numeric - run: | - # ref-injection mitigation: validate strictly before any use. - case "${STAGING_RUN_ID}" in - ''|*[!0-9]*) - echo "::error::staging_run_id must be a numeric GH Actions run id (got '${STAGING_RUN_ID}')" - exit 1 - ;; - esac - - - name: Verify staging run exists + succeeded - env: - GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # STAGING_RUN_ID already validated as numeric above; safe to use. - run: | - conclusion=$(gh run view "${STAGING_RUN_ID}" --repo "${GITHUB_REPOSITORY}" --json conclusion --jq '.conclusion') - name=$(gh run view "${STAGING_RUN_ID}" --repo "${GITHUB_REPOSITORY}" --json name --jq '.name') - if [ "${name}" != "terraform-apply-staging" ]; then - echo "::error::staging_run_id ${STAGING_RUN_ID} is not a terraform-apply-staging run (got: ${name})" - exit 1 - fi - if [ "${conclusion}" != "success" ]; then - echo "::error::staging_run_id ${STAGING_RUN_ID} did not succeed (conclusion: ${conclusion})" - exit 1 - fi - echo "staging precedent ✓ (run ${STAGING_RUN_ID} = success)" - - apply: - name: apply production - needs: guard - runs-on: ubuntu-latest - # GitHub Environment "production" must be configured with Required - # Reviewers — operator sets this up at repo Settings → Environments → - # production → Deployment protection rules. This is the second gate - # on top of the confirm input + staging-precedent checks above. - environment: production - defaults: - run: - working-directory: terraform/cloudflare - steps: - - uses: actions/checkout@v6 - - - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: ${{ env.TF_VERSION }} - - - name: Verify operator secrets are set - run: | - missing="" - [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN" - [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID" - [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY" - [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID" - if [ -n "${missing}" ]; then - echo "::error::Operator action required — these repo secrets are not set:${missing}" - echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time" - exit 1 - fi - - - name: terraform init - run: | - terraform init \ - -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" \ - -backend-config="workspace_key_prefix=${TF_ENV}" - - - name: terraform workspace select - run: terraform workspace select "${TF_ENV}" - - - name: terraform plan - run: | - terraform plan \ - -var-file="${TF_ENV}.auto.tfvars" \ - -no-color \ - -out=tfplan.bin - - - name: terraform apply - run: terraform apply -no-color tfplan.bin - - - name: Surface non-sensitive outputs (ids only, NO token values) - run: | - terraform output -no-color account_id || true - terraform output -no-color zone_id || true - terraform output -no-color deploy_token_id || true - terraform output -no-color admin_tunnel_token_id || true - - - name: Reminder - run: | - echo "::notice::PRODUCTION APPLY COMPLETE." - echo "::notice::If tokens were created or rotated, run on an operator workstation:" - echo "::notice:: make install-secrets ENV=production" - echo "::notice::Confirm the CF dashboard audit log shows the change before revoking the prior token." diff --git a/.github/workflows/terraform-apply-staging.yml b/.github/workflows/terraform-apply-staging.yml deleted file mode 100644 index 2e8ef76..0000000 --- a/.github/workflows/terraform-apply-staging.yml +++ /dev/null @@ -1,116 +0,0 @@ ---- -# infra — gated Terraform apply for the STAGING Cloudflare workspace. -# -# APPROVAL MODEL: workflow_dispatch ONLY. Never on push, never on merge, -# never auto-promoted from a previous apply. Operator deliberately -# triggers this from the Actions tab. -# -# Why split per env: staging and production must not share an apply -# trigger. Splitting prevents a "promote-on-success" pipeline from -# ever existing for production — every prod apply is a separate human -# decision (see terraform-apply-production.yml). -# -# Security note: every GHA expression consumed in a run: block is -# wrapped through env: to prevent script injection. - -name: terraform-apply-staging - -on: - workflow_dispatch: - inputs: - confirm: - description: 'Type APPLY-STAGING to confirm' - required: true - type: string - -permissions: - contents: read - -concurrency: - group: terraform-apply-staging - cancel-in-progress: false # never cancel an in-flight apply - -env: - TF_VERSION: '1.9.8' - TF_IN_AUTOMATION: 'true' - TF_ENV: 'staging' - CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }} - AWS_REGION: 'auto' - CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }} - -jobs: - guard: - name: confirm-input guard - runs-on: ubuntu-latest - env: - CONFIRM_INPUT: ${{ inputs.confirm }} - steps: - - name: Reject if confirm phrase wrong - run: | - if [ "${CONFIRM_INPUT}" != "APPLY-STAGING" ]; then - echo "::error::confirm input must be exactly 'APPLY-STAGING'" - exit 1 - fi - - apply: - name: apply staging - needs: guard - runs-on: ubuntu-latest - environment: staging - defaults: - run: - working-directory: terraform/cloudflare - steps: - - uses: actions/checkout@v6 - - - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: ${{ env.TF_VERSION }} - - - name: Verify operator secrets are set - run: | - missing="" - [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN" - [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID" - [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY" - [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID" - if [ -n "${missing}" ]; then - echo "::error::Operator action required — these repo secrets are not set:${missing}" - echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time" - exit 1 - fi - - - name: terraform init - run: | - terraform init \ - -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" \ - -backend-config="workspace_key_prefix=${TF_ENV}" - - - name: terraform workspace select - run: terraform workspace select "${TF_ENV}" - - - name: terraform plan - run: | - terraform plan \ - -var-file="${TF_ENV}.auto.tfvars" \ - -no-color \ - -out=tfplan.bin - - - name: terraform apply - run: terraform apply -no-color tfplan.bin - - - name: Surface non-sensitive outputs (ids only, NO token values) - run: | - terraform output -no-color account_id || true - terraform output -no-color zone_id || true - terraform output -no-color deploy_token_id || true - terraform output -no-color admin_tunnel_token_id || true - - - name: Reminder - run: | - echo "::notice::STAGING APPLY COMPLETE." - echo "::notice::If tokens were created or rotated, run on an operator workstation:" - echo "::notice:: make install-secrets ENV=staging" - echo "::notice::Promoting to production is a SEPARATE manual decision via terraform-apply-production.yml." diff --git a/.github/workflows/terraform.yml b/.github/workflows/terraform.yml deleted file mode 100644 index fe9cc2c..0000000 --- a/.github/workflows/terraform.yml +++ /dev/null @@ -1,170 +0,0 @@ ---- -# infra — Terraform fmt + validate + plan for CF resources. -# -# Runs on every push to master and on PRs touching terraform/**. -# Plan is read-only. Apply is split into per-env manual workflows -# (terraform-apply-staging.yml, terraform-apply-production.yml). -# This file NEVER applies — see those workflows for the apply path. -# -# Posts the plan diff as a PR comment so reviewers see what apply -# would do without granting CI apply rights. -# -# Security note: all GHA expressions consumed in run: blocks are -# referenced through env vars to prevent script injection. - -name: terraform - -on: - push: - branches: [master] - paths: - - 'terraform/**' - - '.github/workflows/terraform*.yml' - pull_request: - paths: - - 'terraform/**' - - '.github/workflows/terraform*.yml' - workflow_dispatch: - -permissions: - contents: read - pull-requests: write # for the plan comment - -concurrency: - group: terraform-plan-${{ github.ref }} - cancel-in-progress: true - -env: - TF_VERSION: '1.9.8' - TF_IN_AUTOMATION: 'true' - -jobs: - fmt-validate: - name: fmt + validate - runs-on: ubuntu-latest - defaults: - run: - working-directory: terraform/cloudflare - steps: - - uses: actions/checkout@v6 - - - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: ${{ env.TF_VERSION }} - - - name: terraform fmt -check - run: terraform fmt -check -recursive - - - name: terraform init (backend-bypassed) - run: terraform init -backend=false - - - name: terraform validate - run: terraform validate -no-color - - plan: - name: plan (${{ matrix.env }}) - needs: fmt-validate - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - env: [staging, production] - defaults: - run: - working-directory: terraform/cloudflare - # CF creds + state-backend creds passed in via env, not inlined in run:. - env: - CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} - AWS_ACCESS_KEY_ID: ${{ secrets.TF_STATE_R2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.TF_STATE_R2_SECRET_ACCESS_KEY }} - AWS_REGION: 'auto' - CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }} - TF_ENV: ${{ matrix.env }} - steps: - - uses: actions/checkout@v6 - - - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: ${{ env.TF_VERSION }} - - - name: Verify operator secrets are set - # Bootstrap chicken-and-egg: plan needs CF + R2-HMAC creds, but - # those are operator-only one-time setup (see README §Bootstrap). - # Without this guard the failure mode is a cryptic AWS-IAM stack - # trace from `terraform init`. With it, the error is one line - # pointing at the README and the exact missing variable names. - run: | - missing="" - [ -z "${CLOUDFLARE_API_TOKEN}" ] && missing="${missing} CLOUDFLARE_API_TOKEN" - [ -z "${AWS_ACCESS_KEY_ID}" ] && missing="${missing} TF_STATE_R2_ACCESS_KEY_ID" - [ -z "${AWS_SECRET_ACCESS_KEY}" ] && missing="${missing} TF_STATE_R2_SECRET_ACCESS_KEY" - [ -z "${CF_ACCOUNT_ID}" ] && missing="${missing} CF_ACCOUNT_ID" - if [ -n "${missing}" ]; then - echo "::error::Operator action required — these repo secrets are not set:${missing}" - echo "::error::See https://github.com/InstaNode-dev/infra/blob/master/terraform/cloudflare/README.md#bootstrap-one-time" - exit 1 - fi - echo "all 4 operator secrets present" - - - name: terraform init - run: | - terraform init \ - -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" \ - -backend-config="workspace_key_prefix=${TF_ENV}" - - - name: terraform workspace select-or-create - run: terraform workspace select "${TF_ENV}" 2>/dev/null || terraform workspace new "${TF_ENV}" - - - name: terraform plan - id: plan - run: | - set +e - terraform plan \ - -var-file="${TF_ENV}.auto.tfvars" \ - -no-color \ - -out=tfplan.bin \ - -detailed-exitcode 2>&1 | tee /tmp/plan.out - ec=${PIPESTATUS[0]} - echo "exitcode=${ec}" >> "$GITHUB_OUTPUT" - # 0 = no changes, 2 = changes, 1 = error - [ "${ec}" -eq 1 ] && exit 1 || exit 0 - - - name: Comment plan on PR - if: github.event_name == 'pull_request' - uses: actions/github-script@v7 - env: - PLAN_ENV: ${{ matrix.env }} - PLAN_CODE: ${{ steps.plan.outputs.exitcode }} - RUN_ID: ${{ github.run_id }} - with: - script: | - const fs = require('fs'); - let plan = fs.readFileSync('/tmp/plan.out', 'utf8'); - if (plan.length > 60000) { - plan = plan.slice(0, 60000) + '\n\n... (truncated; full plan in job log)'; - } - const env = process.env.PLAN_ENV; - const code = process.env.PLAN_CODE; - const verdict = code === '0' ? '✅ no changes' - : code === '2' ? '🟡 changes present — review before manual apply' - : '❌ plan failed'; - const body = [ - `### Terraform plan — \`${env}\``, - verdict, - '', - '
plan output', - '', - '```hcl', - plan, - '```', - '', - '
', - '', - `_Posted by terraform.yml run ${process.env.RUN_ID}. Apply requires manual trigger of terraform-apply-${env}.yml._`, - ].join('\n'); - await github.rest.issues.createComment({ - issue_number: context.issue.number, - owner: context.repo.owner, - repo: context.repo.repo, - body, - }); diff --git a/.github/workflows/wrangler-build-staging-images.yml b/.github/workflows/wrangler-build-staging-images.yml deleted file mode 100644 index c53ce26..0000000 --- a/.github/workflows/wrangler-build-staging-images.yml +++ /dev/null @@ -1,203 +0,0 @@ ---- -# infra — Build custom Docker images for CF Containers (staging only). -# -# Builds images that don't ship a usable upstream: -# - pg-platform: postgres + pgvector + all 63 platform migrations baked in -# -# api / worker / provisioner images are built by their own repos' deploy.yml -# (which now also pushes :staging — see api/.github/workflows/deploy.yml). -# This workflow handles only the "wrapped upstream image" cases. -# -# Triggers: -# - workflow_dispatch (with service input) -# - daily cron 09:00 UTC (to pick up migrations merged in api repo) -# - push to master touching infra/wrangler/pg-platform/** -# - repository_dispatch event "migrations-changed" from the api repo -# -# Security: all GHA expressions consumed in run: blocks are wrapped -# through env: to prevent script injection. - -name: wrangler-build-staging-images - -on: - workflow_dispatch: - inputs: - service: - description: 'Which custom image to build (or "all")' - required: true - type: choice - default: 'all' - options: - - all - - pg-platform - - mongodb - - redis-provision - - nats - push: - branches: [master] - paths: - - 'wrangler/pg-platform/**' - - 'wrangler/mongodb/**' - - 'wrangler/redis-provision/**' - - 'wrangler/nats/**' - - '.github/workflows/wrangler-build-staging-images.yml' - schedule: - - cron: '0 9 * * *' # daily 09:00 UTC - repository_dispatch: - types: [migrations-changed] - -permissions: - contents: read - packages: write - -concurrency: - group: wrangler-build-staging-${{ github.event.inputs.service || 'all' }} - cancel-in-progress: false - -env: - REGISTRY: ghcr.io - ORG: instanode-dev - -jobs: - pg-platform: - name: build pg-platform :staging - if: | - github.event_name == 'schedule' || - github.event_name == 'push' || - github.event_name == 'repository_dispatch' || - (github.event_name == 'workflow_dispatch' && (github.event.inputs.service == 'all' || github.event.inputs.service == 'pg-platform')) - runs-on: ubuntu-latest - env: - SERVICE: pg-platform - steps: - - name: Checkout infra repo - uses: actions/checkout@v6 - with: - path: infra - - - name: Checkout api repo (for the migrations) - uses: actions/checkout@v6 - with: - repository: ${{ vars.API_REPO || format('{0}/api', github.repository_owner) }} - token: ${{ secrets.REPO_ACCESS_TOKEN || secrets.GITHUB_TOKEN }} - path: api - - - name: Verify migrations dir exists + count - env: - MIGRATIONS_DIR: api/internal/db/migrations - run: | - if [ ! -d "$MIGRATIONS_DIR" ]; then - echo "::error::expected migrations dir $MIGRATIONS_DIR not found" - exit 1 - fi - count=$(find "$MIGRATIONS_DIR" -name '*.sql' | wc -l | tr -d ' ') - echo "migrations found: $count" - if [ "$count" -lt 50 ]; then - echo "::warning::only $count migration files — expected ≥50 (live count was 63 as of 2026-05-30)" - fi - - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v4 - - - name: Log in to GHCR - uses: docker/login-action@v4 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - # GHCR_PUSH_TOKEN is a classic PAT with write:packages, same - # pattern as the api/worker/provisioner deploy.yml workflows. - password: ${{ secrets.GHCR_PUSH_TOKEN || secrets.GITHUB_TOKEN }} - - - name: Build and push - env: - IMAGE: ${{ env.REGISTRY }}/${{ env.ORG }}/instant-pg-platform - run: | - docker buildx build \ - --platform linux/amd64 \ - -f infra/wrangler/pg-platform/Dockerfile \ - -t "${IMAGE}:staging" \ - -t "${IMAGE}:staging-$(date -u +%Y%m%d)" \ - --push \ - . - - - name: Reminder - run: | - echo "::notice::pg-platform :staging image rebuilt with current migrations." - echo "::notice::Next CF Container cold start will re-apply them from the new image." - echo "::notice::Trigger a rolling restart with: wrangler deployments tail --env staging" - - # --------------------------------------------------------------------------- - # mongodb / redis-provision / nats — small wrapped images. - # - # These don't need cross-repo migration sync (the wrapping config is fully - # self-contained under infra/wrangler//). Single-repo checkout + - # build + push to GHCR. Same SERVICE-input gating as pg-platform. - # --------------------------------------------------------------------------- - - small-images: - name: build ${{ matrix.svc }} :staging - if: | - github.event_name == 'schedule' || - github.event_name == 'push' || - github.event_name == 'repository_dispatch' || - (github.event_name == 'workflow_dispatch' && (github.event.inputs.service == 'all' || github.event.inputs.service == 'mongodb' || github.event.inputs.service == 'redis-provision' || github.event.inputs.service == 'nats')) - runs-on: ubuntu-latest - strategy: - fail-fast: false - matrix: - svc: [mongodb, redis-provision, nats] - env: - SVC: ${{ matrix.svc }} - steps: - - name: Checkout infra repo - uses: actions/checkout@v6 - with: - path: infra - - - name: Skip if matrix svc doesn't match workflow_dispatch input - # Avoids spurious matrix entries when operator selected a single - # svc via workflow_dispatch. push / cron / dispatch run all 3. - id: gate - run: | - if [ "${{ github.event_name }}" != "workflow_dispatch" ]; then - echo "skip=false" >> "$GITHUB_OUTPUT" - exit 0 - fi - INPUT="${{ github.event.inputs.service }}" - if [ "$INPUT" = "all" ] || [ "$INPUT" = "$SVC" ]; then - echo "skip=false" >> "$GITHUB_OUTPUT" - else - echo "skip=true" >> "$GITHUB_OUTPUT" - echo "::notice::skipping $SVC (workflow_dispatch input was '$INPUT')" - fi - - - name: Set up Docker Buildx - if: steps.gate.outputs.skip == 'false' - uses: docker/setup-buildx-action@v4 - - - name: Log in to GHCR - if: steps.gate.outputs.skip == 'false' - uses: docker/login-action@v4 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GHCR_PUSH_TOKEN || secrets.GITHUB_TOKEN }} - - - name: Build and push - if: steps.gate.outputs.skip == 'false' - env: - IMAGE: ${{ env.REGISTRY }}/${{ env.ORG }}/instant-${{ matrix.svc }} - run: | - docker buildx build \ - --platform linux/amd64 \ - -f "infra/wrangler/${SVC}/Dockerfile" \ - -t "${IMAGE}:staging" \ - -t "${IMAGE}:staging-$(date -u +%Y%m%d)" \ - --push \ - . - - - name: Reminder - if: steps.gate.outputs.skip == 'false' - run: | - echo "::notice::${SVC} :staging image rebuilt." - echo "::notice::Trigger a rolling restart with: wrangler containers deploy --env staging" diff --git a/.github/workflows/wrangler-deploy-staging.yml b/.github/workflows/wrangler-deploy-staging.yml deleted file mode 100644 index 69c63cb..0000000 --- a/.github/workflows/wrangler-deploy-staging.yml +++ /dev/null @@ -1,110 +0,0 @@ ---- -# infra — CF Containers deploy for staging via wrangler. -# -# APPROVAL MODEL: workflow_dispatch ONLY for the first ~10 runs (manual -# verification). After staging stabilizes, can be promoted to auto-run on -# merge to master (controlled by the `auto_deploy` input). -# -# Production does NOT use this workflow — see the eventual -# production-deploy.yml when the prod target is settled. -# -# Security: all GHA expressions consumed in run: blocks are wrapped -# through env: to prevent script injection. - -name: wrangler-deploy-staging - -on: - workflow_dispatch: - inputs: - service: - description: 'Which service to deploy (or "all")' - required: true - type: choice - options: - - all - - api - - worker - - provisioner - - pg-platform - - pg-customers - - mongodb - - redis-provision - - nats - confirm: - description: 'Type DEPLOY-STAGING to confirm' - required: true - type: string - -permissions: - contents: read - -concurrency: - group: wrangler-deploy-staging-${{ inputs.service }} - cancel-in-progress: false - -jobs: - guard: - name: confirm-input guard - runs-on: ubuntu-latest - env: - CONFIRM_INPUT: ${{ inputs.confirm }} - steps: - - name: Reject if confirm phrase wrong - run: | - if [ "${CONFIRM_INPUT}" != "DEPLOY-STAGING" ]; then - echo "::error::confirm must be exactly 'DEPLOY-STAGING'" - exit 1 - fi - - deploy: - name: deploy ${{ inputs.service }} - needs: guard - runs-on: ubuntu-latest - environment: staging - env: - CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} - CF_ACCOUNT_ID: ${{ secrets.CF_ACCOUNT_ID }} - SERVICE_INPUT: ${{ inputs.service }} - steps: - - uses: actions/checkout@v6 - - - uses: actions/setup-node@v4 - with: - node-version: '20' - - - name: Install wrangler - run: npm install -g wrangler@latest - - - name: Validate service name - run: | - # Whitelist enforced — never embed user input into shell paths - # without validating it matches a known service. - case "${SERVICE_INPUT}" in - all|api|worker|provisioner|pg-platform|pg-customers|mongodb|redis-provision|nats) : ;; - *) - echo "::error::Unknown service: ${SERVICE_INPUT}" - exit 1 - ;; - esac - - - name: Deploy - run: | - set -euo pipefail - if [ "${SERVICE_INPUT}" = "all" ]; then - SERVICES="api worker provisioner pg-platform pg-customers mongodb redis-provision nats" - else - SERVICES="${SERVICE_INPUT}" - fi - for svc in $SERVICES; do - echo "::group::deploying $svc" - cd "infra/wrangler/$svc" - wrangler deploy --env staging - cd - >/dev/null - echo "::endgroup::" - done - - - name: Reminder - run: | - echo "::notice::STAGING DEPLOY COMPLETE." - echo "::notice::Verify with: curl https://api.staging.instanode.dev/healthz" - echo "::notice::Note: stateful containers (pg-*/mongodb/redis-*/nats) have ephemeral disk." diff --git a/terraform/cloudflare/.gitignore b/terraform/cloudflare/.gitignore deleted file mode 100644 index 343dfcf..0000000 --- a/terraform/cloudflare/.gitignore +++ /dev/null @@ -1,27 +0,0 @@ -# TF state — lives in R2 backend, never in repo. -*.tfstate -*.tfstate.* -*.tfstate.backup -.terraform/ -.terraform.lock.hcl - -# Per-environment variable files — committable ONLY if they contain -# no secrets. As of bootstrap there are no secrets in any tfvars (auth -# is via env vars), so we DO commit the .auto.tfvars files. Below -# excludes only the local ad-hoc ones. -*.local.tfvars -*.local.auto.tfvars - -# Operator-local overrides -override.tf -override.tf.json -*_override.tf -*_override.tf.json - -# Plan outputs (often contain post-apply secret values) -*.tfplan -*.tfplan.bin - -# crash logs from the provider -crash.log -crash.*.log diff --git a/terraform/cloudflare/Makefile b/terraform/cloudflare/Makefile deleted file mode 100644 index 23593ff..0000000 --- a/terraform/cloudflare/Makefile +++ /dev/null @@ -1,101 +0,0 @@ -# Terraform helpers for the CF migration. Run from this dir. -# -# Required env vars (export before any target): -# CLOUDFLARE_API_TOKEN — Token A (deploy) for plan/apply -# AWS_ACCESS_KEY_ID — R2 HMAC for TF state bucket -# AWS_SECRET_ACCESS_KEY — R2 HMAC secret for TF state bucket -# CF_ACCOUNT_ID — for backend endpoint URL -# -# ENV defaults to staging; pass ENV=production for prod. - -ENV ?= staging -TF ?= terraform - -ifneq ($(filter $(ENV),staging production),$(ENV)) -$(error ENV must be 'staging' or 'production' (got '$(ENV)')) -endif - -.PHONY: help init fmt validate plan apply destroy install-secrets rotate-tokens clean - -help: - @echo "Targets:" - @echo " init — terraform init with R2 backend (one-time per workspace)" - @echo " fmt — terraform fmt -check (CI also enforces)" - @echo " validate — terraform validate (offline)" - @echo " plan — terraform plan (writes tfplan.bin)" - @echo " apply — terraform apply (reads tfplan.bin from plan target)" - @echo " install-secrets — pull token outputs and push to k8s + GH org secrets" - @echo " rotate-tokens — bump expiry, plan, apply, install" - @echo " destroy — DANGEROUS, only for tearing down ephemeral staging" - @echo - @echo "Env: ENV=$(ENV) (override with ENV=production)" - -init: - @: $${CF_ACCOUNT_ID?CF_ACCOUNT_ID must be set} - $(TF) init \ - -backend-config="endpoints={s3=\"https://$$CF_ACCOUNT_ID.r2.cloudflarestorage.com\"}" \ - -backend-config="workspace_key_prefix=$(ENV)" - $(TF) workspace select $(ENV) 2>/dev/null || $(TF) workspace new $(ENV) - -fmt: - $(TF) fmt -check -recursive - -validate: - $(TF) validate -no-color - -plan: - $(TF) plan -var-file=$(ENV).auto.tfvars -out=tfplan.bin - -apply: - $(TF) apply tfplan.bin - @echo - @echo "==> Apply complete. If tokens were created/rotated, run:" - @echo " make install-secrets ENV=$(ENV)" - -# Pull sensitive token outputs (one-shot, never written to disk) and -# install them as k8s + GH secrets across all consuming repos. Token -# VALUES are scrubbed from the env on exit. -install-secrets: - @: $${GH_TOKEN?GH_TOKEN must be set for 'gh secret set' calls} - @DEPLOY_TOKEN="$$($(TF) output -raw deploy_token)"; \ - if [ -z "$$DEPLOY_TOKEN" ]; then echo "no deploy_token in state — apply first"; exit 1; fi; \ - echo "==> k8s: writing CLOUDFLARE_API_TOKEN to instant-secrets-cf in instant-$(ENV)"; \ - kubectl create secret generic instant-secrets-cf \ - -n instant-$(ENV) \ - --from-literal=CLOUDFLARE_API_TOKEN="$$DEPLOY_TOKEN" \ - --dry-run=client -o yaml | kubectl apply -f -; \ - echo "==> GH org secrets: CLOUDFLARE_API_TOKEN across instanodedev/{api,worker,provisioner,instanode-web,dashboard,infra,cli,mcp}"; \ - for repo in instanodedev/api instanodedev/worker instanodedev/provisioner \ - instanodedev/instanode-web instanodedev/dashboard \ - instanodedev/infra instanodedev/cli instanodedev/mcp; do \ - gh secret set CLOUDFLARE_API_TOKEN -b"$$DEPLOY_TOKEN" -R "$$repo" >/dev/null \ - && echo " ✓ $$repo" \ - || echo " ✗ $$repo (skipped — repo missing or not authorized)"; \ - done; \ - unset DEPLOY_TOKEN - @echo - @echo "==> Admin/tunnel token (Token B) is operator-only — NOT pushed to CI." - @echo " To install into your local 1Password vault:" - @echo " $(TF) output -raw admin_tunnel_token | op item create --category=ApiCredential --title='cf-admin-tunnel-$(ENV)' credential=-" - -# Bump expiry by 180d (deploy) / 90d (admin) — operator edits the .auto.tfvars -# to set new dates, then this target runs the plan/apply/install loop. -rotate-tokens: - @echo "==> Edit $(ENV).auto.tfvars to set new *_expires_on dates, then:" - @echo " make plan ENV=$(ENV)" - @echo " make apply ENV=$(ENV)" - @echo " make install-secrets ENV=$(ENV)" - @echo " Confirm the rotation in the CF dashboard audit log before" - @echo " revoking the previous token version." - -# Tearing down staging is OK (Phase 1 acceptance allows it). NEVER -# run against production — D-3 cutover keeps state on DO throughout. -destroy: - @if [ "$(ENV)" = "production" ]; then \ - echo "ABORTING — destroy against production is forbidden (D-1/D-3)."; \ - exit 1; \ - fi - $(TF) destroy -var-file=$(ENV).auto.tfvars - -clean: - rm -f tfplan.bin diff --git a/terraform/cloudflare/README.md b/terraform/cloudflare/README.md deleted file mode 100644 index 8f29db0..0000000 --- a/terraform/cloudflare/README.md +++ /dev/null @@ -1,161 +0,0 @@ -# Cloudflare resources — Terraform - -Source of truth for everything we declare in Cloudflare for the InstaNode -migration: API tokens (deploy + admin/tunnel), DNS records, R2 buckets, -Pages projects, and (later) Workers + Load Balancers + Page Rules. - -> **k8s is NOT in scope here.** k8s manifests stay under `../../k8s/`, -> managed by `kubectl set image` + the existing per-service auto-deploy -> per CLAUDE.md rule 15. This dir is for Cloudflare-managed resources only. - -## Decision references - -This module implements: -- **D-1** (scope — R2, Pages, CF proxy on api, staging-only Tunnel) -- **D-2** (staging on full CF stack) -- **D-3** (per-service DNS-weighted cutover; TTL 60s ≥48h) -- **D-4** (separate `instant-staging-data` ns — k8s-side, not here, but the staging Pages project + R2 bucket parallel it) -- **D-7** (NS delegation is CF; already verified) -- **D-8** (R2 env-var canonical names: `R2_HMAC_KEY_ID` / `R2_HMAC_SECRET`) -- **D-14** (operator credentials — outputs from `tokens.tf` install via `make install-secrets`) - -Source: `/tmp/cf-migration/shared/DECISIONS.md`. - -## Bootstrap (one-time) - -The TF state lives in R2, which means the R2 bucket for state and the -HMAC creds to write to it must exist BEFORE `terraform init`. Manual -chicken-and-egg step: - -```bash -# 1. Create the state bucket via wrangler (operator-side, one time). -wrangler r2 bucket create instanode-tf-state --location wnam - -# 2. Create R2 HMAC for state access only (scope: instanode-tf-state). -# Dashboard → R2 → Manage R2 API Tokens → Create: -# - Name: "tf-state-rw" -# - Permission: Object Read & Write -# - Specify buckets: instanode-tf-state -# Save the Access Key ID + Secret + Endpoint. - -# 3. Export the state-backend creds + CF auth token for terraform. -export AWS_ACCESS_KEY_ID="" -export AWS_SECRET_ACCESS_KEY="" -export CLOUDFLARE_API_TOKEN="" - -# 4. Init the backend with the env-specific account endpoint. -terraform init \ - -backend-config="endpoints={s3=\"https://${CF_ACCOUNT_ID}.r2.cloudflarestorage.com\"}" - -# 5. Pick a workspace (staging first). -terraform workspace new staging -terraform workspace select staging - -# 6. Plan + apply. -terraform plan -out=staging.tfplan -terraform apply staging.tfplan -``` - -After `apply` succeeds you have: -- Two CF API tokens in TF state (deploy + admin_tunnel). -- The staging Pages project + R2 bucket + DNS records. -- Output values for token secrets (sensitive — see next section). - -## Installing token secrets into k8s + GH - -Tokens are SENSITIVE outputs — they appear once in TF state and once -when `terraform output -raw ` is run. To install: - -```bash -# Read the tokens (do NOT redirect to a file you'll commit). -DEPLOY_TOKEN="$(terraform output -raw deploy_token)" -ADMIN_TUNNEL_TOKEN="$(terraform output -raw admin_tunnel_token)" - -# k8s — staging namespace. -kubectl create secret generic instant-secrets-cf \ - -n instant-staging \ - --from-literal=CLOUDFLARE_API_TOKEN="$DEPLOY_TOKEN" \ - --dry-run=client -o yaml | kubectl apply -f - - -# GH org / repo secrets — for CI auto-deploys. -for repo in instanodedev/api instanodedev/worker instanodedev/provisioner \ - instanodedev/instanode-web instanodedev/dashboard \ - instanodedev/infra; do - gh secret set CLOUDFLARE_API_TOKEN -b"$DEPLOY_TOKEN" -R "$repo" -done - -# Admin/tunnel token: ONLY into a separate operator-local Vault, never -# into CI. Used break-glass for Tunnel/Access changes. -op item create --category=ApiCredential --title="cf-admin-tunnel-staging" \ - --vault="instanode-prod" credential="$ADMIN_TUNNEL_TOKEN" - -unset DEPLOY_TOKEN ADMIN_TUNNEL_TOKEN -``` - -## Workflow during the migration - -1. **Plan-on-PR.** Every PR that changes a `.tf` file under this dir - triggers `terraform plan` in CI; diff posted as PR comment. -2. **Apply-on-merge.** Merge to `main` triggers `terraform apply` via - the workflow (gated on approval — `instanodedev/infra` already has - manual-apply discipline; rule 15 doesn't auto-deploy `infra`). -3. **Per-PR contract checklist (rule 22)** still applies. A TF PR that - adds a new host or changes the API base URL ALSO needs the - synchronized code edits in `api/internal/handlers/openapi.go` + - `content/llms.txt` + the dashboard/cli/mcp/sdk-go base-URL constants. -4. **Per-PR observability checklist (rule 25)** still applies. New - resources that emit metrics need an `instant_*` Prom rule + NR alert - JSON + dashboard tile + METRICS-CATALOG row in the same PR. - -## Workspace conventions - -- `terraform workspace new staging` / `terraform workspace new production` -- `terraform workspace select ` before any plan/apply -- `var.environment` is set automatically via `*.auto.tfvars` files - selected by workspace (TF auto-loads `staging.auto.tfvars` when the - workspace is `staging` if your CI passes `-var-file` accordingly; - during interactive use, pass `-var-file=staging.auto.tfvars` explicit- - ly to avoid surprises). - -## File layout - -| File | Purpose | -|---|---| -| `versions.tf` | TF + provider pinning, R2 backend config | -| `providers.tf` | CF provider (reads `CLOUDFLARE_API_TOKEN` env) | -| `variables.tf` | account_id, zone_id, environment, token expiries | -| `tokens.tf` | `cloudflare_account_token.deploy` + `.admin_tunnel` | -| `r2.tf` | R2 bucket + 24h-TTL lifecycle rule on `anon/` prefix | -| `dns.tf` | DNS records (apex / www / api / staging) with TTL 60s | -| `pages.tf` | Pages project for `instanode-web` (Phase 2) | -| `outputs.tf` | Sensitive token outputs (consumed by `make install-secrets`) | -| `staging.auto.tfvars` | Workspace-scoped vars for staging | -| `production.auto.tfvars` | Workspace-scoped vars for production | - -## What's NOT here (yet) - -- **Workers** — CEO D-1 deferred until measured TTFB benefit shows up. -- **Hyperdrive** — same; api and DO Managed PG are same-region, no win today. -- **D1** — KILLED per D-1. -- **CF Email Routing** — DEFERRED; outbound stays on Brevo. -- **Tunnels** — Phase 5 staging-only; add `tunnels.tf` when that PR ships, scoped to admin_tunnel token. -- **Load Balancers** — pending the CF Startups operator ticket (D-6, 5–10 day lead). Once enabled, add `lb.tf`. -- **Page Rules / Cache Rules** — Phase 4 only (api orange-cloud cut). Per D-12, the rule is an explicit path-allowlist for `/healthz`, `/openapi.json`, `/llms.txt`; NEVER Authorization-header-based. - -## R2 HMAC keys (NOT here) - -The R2 HMAC Access Key ID / Secret used by `common/storageprovider/r2/` -are SEPARATE from the CF API token and are generated via the R2 dashboard -"Manage R2 API Tokens" UI (NOT this Terraform). Reason: the -`cloudflare_r2_bucket` resource doesn't issue per-bucket HMAC pairs; -that's a one-off operator action, scoped to the specific bucket. - -After Phase 0 creates the staging bucket, the operator runs: -1. Dashboard → R2 → Manage R2 API Tokens → Create -2. Permissions: Object Read & Write -3. Specify buckets: `instant-shared-staging` (NOT *Apply to all buckets*) -4. TTL: 180 days -5. Save the resulting `Access Key ID` + `Secret Access Key` into - `instant-secrets` as `R2_HMAC_KEY_ID` + `R2_HMAC_SECRET` (D-8 names). - -Repeat for `instant-shared` (prod) after staging passes 48h green (D-9). diff --git a/terraform/cloudflare/cache.tf b/terraform/cloudflare/cache.tf deleted file mode 100644 index 9864c11..0000000 --- a/terraform/cloudflare/cache.tf +++ /dev/null @@ -1,96 +0,0 @@ -# Cache rules for api.staging.instanode.dev (and api.instanode.dev once -# Phase 4 flips proxied=true on the api A-record). -# -# D-12 (LOCKED): cache scope is an EXPLICIT path allowlist — `/healthz`, -# `/openapi.json`, `/llms.txt`. Everything else BYPASSES cache regardless -# of Authorization header presence. The original "bypass cache when -# Authorization header is set" approach was deleted because (a) the -# primitive doesn't exist on our zone tier, (b) it's a footgun if an -# authed response ever flows through cache. -# -# Plus: `instant_unexpected_cached_response_total` P0 metric in the api -# code (NOT here — handler-side) trips an alert if a request OUTSIDE -# the allowlist ever responds with cache-hit semantics. Defense in depth. - -# Catch-all bypass at top priority — cache OFF for everything by default. -resource "cloudflare_ruleset" "api_cache_rules" { - zone_id = var.zone_id - name = "api-cache-rules" - description = "D-12 explicit-path allowlist for api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}" - kind = "zone" - phase = "http_request_cache_settings" - - # Rules evaluated top-to-bottom; first match wins. - rules = [ - # Rule 1: bypass cache for everything by default (catch-all at lowest - # priority via `Last`). - { - action = "set_cache_settings" - description = "bypass cache for all api.* paths by default" - enabled = true - expression = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\")" - action_parameters = { - cache = false - } - }, - # Rule 2: allow cache for /healthz (overrides bypass via earlier - # evaluation only if listed BEFORE the catch-all; CF Rulesets evaluate - # all rules and the LAST matching action wins for `set_cache_settings`, - # so explicit allowlist comes after the catch-all). - { - action = "set_cache_settings" - description = "cache /healthz at edge for 30s — same SHA across instances" - enabled = true - expression = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/healthz\")" - action_parameters = { - cache = true - edge_ttl = { - mode = "override_origin" - default = 30 - } - browser_ttl = { - mode = "override_origin" - default = 0 - } - } - }, - # Rule 3: cache /openapi.json for 5 minutes — frequently re-fetched - # by tooling, changes rarely. - { - action = "set_cache_settings" - description = "cache /openapi.json at edge for 5min" - enabled = true - expression = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/openapi.json\")" - action_parameters = { - cache = true - edge_ttl = { - mode = "override_origin" - default = 300 - } - browser_ttl = { - mode = "override_origin" - default = 60 - } - } - }, - # Rule 4: cache /llms.txt for 1 hour — static content from content - # repo, refresh cadence is "operator manually re-syncs". - { - action = "set_cache_settings" - description = "cache /llms.txt at edge for 1h" - enabled = true - expression = "(http.host eq \"api${var.environment == "production" ? "" : ".staging"}.${var.zone_name}\") and (http.request.uri.path eq \"/llms.txt\")" - action_parameters = { - cache = true - edge_ttl = { - mode = "override_origin" - default = 3600 - } - browser_ttl = { - mode = "override_origin" - default = 600 - } - } - }, - ] -} diff --git a/terraform/cloudflare/dns.tf b/terraform/cloudflare/dns.tf deleted file mode 100644 index fbfd48c..0000000 --- a/terraform/cloudflare/dns.tf +++ /dev/null @@ -1,55 +0,0 @@ -# DNS records under management. -# -# Pre-cutover ritual (D-3): TTL must be 60s for ≥48h BEFORE any cut. -# Setting it that low here means terraform plan/apply itself satisfies -# the pre-step the first time we touch the record. -# -# `proxied = true` = CF orange-cloud; `false` = grey-cloud (DNS only, no -# proxy). Today: marketing apex is orange (Phase 0 baseline), api is grey -# (becomes orange in Phase 4 — flip this flag in that phase's PR). - -locals { - marketing_origin = "instanode-web.pages.dev" # set per environment in staging.tfvars / production.tfvars after Pages project is created - api_origin = "152.42.154.144" # DigitalOcean LB; replaced with LB pool resource in Phase 4 -} - -resource "cloudflare_dns_record" "apex" { - zone_id = var.zone_id - name = var.zone_name - type = "CNAME" - content = local.marketing_origin - ttl = 60 - proxied = true - comment = "marketing apex; CNAME-flattened to Pages project" -} - -resource "cloudflare_dns_record" "www" { - zone_id = var.zone_id - name = "www.${var.zone_name}" - type = "CNAME" - content = var.zone_name - ttl = 60 - proxied = true - comment = "www → apex redirect handled by CF page rule" -} - -resource "cloudflare_dns_record" "api" { - zone_id = var.zone_id - name = "api.${var.zone_name}" - type = "A" - content = local.api_origin - ttl = 60 - proxied = false # Phase 4 flips this to true after CF orange-cloud cache rules are applied - comment = "api; grey-cloud today, orange-cloud per Phase 4 cut (D-3)" -} - -resource "cloudflare_dns_record" "staging" { - count = var.environment == "staging" ? 1 : 0 - zone_id = var.zone_id - name = "staging.${var.zone_name}" - type = "CNAME" - content = "instant-staging.${var.zone_name}.cdn.cloudflare.net" # Pages preview hostname; replaced after Pages project is up - ttl = 60 - proxied = true - comment = "staging mirror per D-2" -} diff --git a/terraform/cloudflare/outputs.tf b/terraform/cloudflare/outputs.tf deleted file mode 100644 index 3b123f3..0000000 --- a/terraform/cloudflare/outputs.tf +++ /dev/null @@ -1,34 +0,0 @@ -# Token VALUES are sensitive — operator must `terraform output -raw deploy_token` -# and immediately pipe into `kubectl create secret` / `gh secret set`. Never -# `terraform output` (no -raw) in a CI log: the redacted form ("(sensitive)") -# is still a footgun if anyone removes `sensitive = true`. - -output "deploy_token_id" { - value = cloudflare_account_token.deploy.id - description = "Token A id (non-sensitive; safe in CI logs)." -} - -output "deploy_token" { - value = cloudflare_account_token.deploy.value - description = "Token A secret. Pipe directly into k8s/GH secret; never log." - sensitive = true -} - -output "admin_tunnel_token_id" { - value = cloudflare_account_token.admin_tunnel.id - description = "Token B id (non-sensitive)." -} - -output "admin_tunnel_token" { - value = cloudflare_account_token.admin_tunnel.value - description = "Token B secret. Operator-only; never put into CI." - sensitive = true -} - -output "account_id" { - value = var.account_id -} - -output "zone_id" { - value = var.zone_id -} diff --git a/terraform/cloudflare/pages.tf b/terraform/cloudflare/pages.tf deleted file mode 100644 index 49f0037..0000000 --- a/terraform/cloudflare/pages.tf +++ /dev/null @@ -1,61 +0,0 @@ -# Cloudflare Pages project for instanode-web (marketing site). -# Phase 2 in FINAL-PLAN.md. Dashboard-on-Pages is KILLED per D-5; -# do NOT add a second `cloudflare_pages_project` for dashboard here. - -resource "cloudflare_pages_project" "instanode_web" { - account_id = var.account_id - name = var.environment == "production" ? "instanode-web" : "instanode-web-staging" - production_branch = "main" - - build_config = { - build_command = "npm run build" - destination_dir = "dist" - root_dir = "" - web_analytics_tag = null - web_analytics_token = null - } - - source = { - type = "github" - config = { - owner = "instanodedev" - repo_name = "instanode-web" - production_branch = "main" - pr_comments_enabled = true - production_deployment_enabled = true - preview_deployment_setting = "all" - preview_branch_includes = ["*"] - preview_branch_excludes = [] - } - } - - deployment_configs = { - production = { - compatibility_date = "2026-05-30" - compatibility_flags = [] - env_vars = { - VITE_API_URL = { - type = "plain_text" - value = var.environment == "production" ? "https://api.instanode.dev" : "https://api.staging.instanode.dev" - } - VITE_ENV = { - type = "plain_text" - value = var.environment - } - } - } - preview = { - compatibility_date = "2026-05-30" - compatibility_flags = [] - } - } -} - -# Custom domain binding — only after Phase 2 acceptance (D-9 equivalent -# for marketing: zero broken-link diff). Until then, traffic stays on -# GH Pages via DNS, and this resource is dormant. -resource "cloudflare_pages_domain" "instanode_web" { - account_id = var.account_id - project_name = cloudflare_pages_project.instanode_web.name - name = var.environment == "production" ? var.zone_name : "staging.${var.zone_name}" -} diff --git a/terraform/cloudflare/production.auto.tfvars b/terraform/cloudflare/production.auto.tfvars deleted file mode 100644 index 0c188fe..0000000 --- a/terraform/cloudflare/production.auto.tfvars +++ /dev/null @@ -1,4 +0,0 @@ -environment = "production" - -deploy_token_expires_on = "2026-11-26T23:59:59Z" -admin_tunnel_token_expires_on = "2026-08-28T23:59:59Z" diff --git a/terraform/cloudflare/providers.tf b/terraform/cloudflare/providers.tf deleted file mode 100644 index a89234e..0000000 --- a/terraform/cloudflare/providers.tf +++ /dev/null @@ -1,8 +0,0 @@ -provider "cloudflare" { - # Reads CLOUDFLARE_API_TOKEN from env. Operator uses Token A - # ("instanode-migration-deploy") for everything except Tunnel/Access - # changes — for those, switch the env var to Token B in a separate - # apply (see _modules/tunnel/README.md). - # - # Never commit a value here. -} diff --git a/terraform/cloudflare/r2.tf b/terraform/cloudflare/r2.tf deleted file mode 100644 index 61206ba..0000000 --- a/terraform/cloudflare/r2.tf +++ /dev/null @@ -1,38 +0,0 @@ -# R2 buckets. Per CEO D-1 + DevOps D-4, staging gets a parallel bucket -# (`instant-shared-staging`); production keeps the existing name and -# moves traffic into it via the storageprovider env-flip (D-8 names). -# -# Lifecycle rule: anon/ prefix expires after 24h (matches the platform's -# anon-resource TTL contract — pay-from-day-one, no trial creep). - -locals { - bucket_name = var.environment == "production" ? "instant-shared" : "instant-shared-staging" -} - -resource "cloudflare_r2_bucket" "shared" { - account_id = var.account_id - name = local.bucket_name - location = "WNAM" # North America West — closest to our DO NYC3 cluster latency-wise - storage_class = "Standard" -} - -# 24h TTL on anon/ — matches platform contract that anonymous resources -# expire after 24h (CLAUDE.md "anonymous (24h TTL) is the only free tier"). -resource "cloudflare_r2_bucket_lifecycle" "shared_anon_24h" { - account_id = var.account_id - bucket_name = cloudflare_r2_bucket.shared.name - - rules = [{ - id = "anon-24h" - enabled = true - conditions = { - prefix = "anon/" - } - delete_objects_transition = { - condition = { - type = "Age" - max_age = 86400 # 24h in seconds - } - } - }] -} diff --git a/terraform/cloudflare/staging.auto.tfvars b/terraform/cloudflare/staging.auto.tfvars deleted file mode 100644 index b7489f6..0000000 --- a/terraform/cloudflare/staging.auto.tfvars +++ /dev/null @@ -1,6 +0,0 @@ -environment = "staging" - -# Tokens rotate every 180d (deploy) / 90d (admin). Override per env -# if staging is on a shorter cycle. -deploy_token_expires_on = "2026-11-26T23:59:59Z" -admin_tunnel_token_expires_on = "2026-08-28T23:59:59Z" diff --git a/terraform/cloudflare/staging.tf b/terraform/cloudflare/staging.tf deleted file mode 100644 index 10deda6..0000000 --- a/terraform/cloudflare/staging.tf +++ /dev/null @@ -1,182 +0,0 @@ -# Staging-environment subdomains under staging.instanode.dev. -# -# All resources here are count-gated on `var.environment == "staging"` so -# they only materialize in the staging workspace; the production workspace -# plan shows no changes from this file. -# -# DIVISION OF RESPONSIBILITY between TF and wrangler: -# -# - **TF owns** wildcard records, env-level subdomains (dashboard, webhook), -# and the deployment-app wildcard. These don't have a 1:1 Worker/Container -# mapping or they're pre-deploy plumbing. -# - **Wrangler owns** service-specific hostnames via `custom_domain = true` -# in each wrangler.toml. wrangler auto-creates the DNS + cert + route on -# first deploy. That covers: api.staging.instanode.dev (managed by -# infra/wrangler/api/wrangler.toml). -# -# DO NOT add explicit TF records for hostnames wrangler is already -# custom-domain-claiming — wrangler will fail to deploy with "DNS record -# already exists" if both manage it. - -locals { - is_staging = var.environment == "staging" - # All staging subdomains live under this stem. - staging_stem = "staging.${var.zone_name}" -} - -# ----------------------------------------------------------------------------- -# Wildcards under *.staging.instanode.dev -# ----------------------------------------------------------------------------- -# -# Each per-tenant service in wrangler/ uses a hostname-shard pattern: -# - pg-customer-.staging.instanode.dev (pg-customers Container) -# - mongo-.staging.instanode.dev (mongodb Container) -# - redis-.staging.instanode.dev (redis-provision Container) -# - nats-.staging.instanode.dev (nats Container) -# -# A single proxied wildcard CNAME catches all of them; the Worker shells -# in each wrangler service extract the tenant from the hostname and -# dispatch to the right Durable Object via `idFromName(tenant)`. - -resource "cloudflare_dns_record" "staging_wildcard" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - name = "*.${local.staging_stem}" - type = "CNAME" - # CF requires SOME content for proxied CNAMEs; this is a placeholder. The - # cloudflare_workers_route below routes traffic to the correct Worker - # regardless of what's here. A 404 sink is intentional — any unrouted - # subdomain hits CF's default 404 page. - content = local.staging_stem - ttl = 60 - proxied = true - comment = "wildcard for per-tenant CF Container services in staging; routed via cloudflare_workers_route below" -} - -# ----------------------------------------------------------------------------- -# Deployment-app wildcard: *.deployment.staging.instanode.dev -# ----------------------------------------------------------------------------- -# -# Mirror of prod's `*.deployment.instanode.dev`. Every /deploy/new staging -# call provisions an app at `.deployment.staging.instanode.dev`. -# Wrangler-managed Containers for the deploy compute target this wildcard; -# the api Worker creates a DNS-less custom-domain claim per slug, but the -# wildcard ensures any future deploy slug resolves to CF before its -# custom-domain claim lands. - -resource "cloudflare_dns_record" "staging_deployment_wildcard" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - name = "*.deployment.${local.staging_stem}" - type = "CNAME" - content = "deployment.${local.staging_stem}" - ttl = 60 - proxied = true - comment = "wildcard for /deploy/new staging apps (mirrors prod *.deployment.instanode.dev)" -} - -# Anchor for the deployment wildcard CNAME (the wildcard's content needs -# a real record at the parent name). -resource "cloudflare_dns_record" "staging_deployment_anchor" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - name = "deployment.${local.staging_stem}" - type = "AAAA" - content = "100::" # IPv6 discard prefix — never reachable; CF proxied front-end terminates - ttl = 60 - proxied = true - comment = "anchor for deployment wildcard CNAME (CF requires a real record at the parent)" -} - -# ----------------------------------------------------------------------------- -# Webhook subdomain: webhook.staging.instanode.dev -# ----------------------------------------------------------------------------- -# -# /webhook/new staging endpoints return a URL at this host. Routed to the -# api Container via a Worker route. Separate subdomain (vs api.staging.) -# so customers can filter outbound webhook traffic by destination host. - -resource "cloudflare_dns_record" "staging_webhook" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - name = "webhook.${local.staging_stem}" - type = "AAAA" - content = "100::" # placeholder; CF orange-cloud handles routing - ttl = 60 - proxied = true - comment = "staging /webhook/new receiver subdomain" -} - -# ----------------------------------------------------------------------------- -# Dashboard subdomain: dashboard.staging.instanode.dev -# ----------------------------------------------------------------------------- -# -# CEO killed dashboard-on-Pages for PROD (D-5) but staging dashboard is -# useful for QA. Points at the same dashboard Pages project at the -# `staging` branch preview hostname. NOT enabled for production — D-5 -# stands. - -resource "cloudflare_dns_record" "staging_dashboard" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - name = "dashboard.${local.staging_stem}" - type = "CNAME" - content = "instanode-dashboard-staging.pages.dev" # set after dashboard Pages project is created - ttl = 60 - proxied = true - comment = "staging dashboard — QA-only; D-5 keeps prod dashboard off Pages" -} - -# ----------------------------------------------------------------------------- -# Workers Routes for per-tenant wildcards -# ----------------------------------------------------------------------------- -# -# `custom_domain = true` in wrangler.toml does NOT support wildcards. -# Wildcards need cloudflare_workers_route + a wildcard DNS record (done -# above). Each route binds a pattern to a specific Worker name; wrangler -# deploys the Worker, TF wires the route. - -resource "cloudflare_workers_route" "staging_pg_customers" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - pattern = "pg-customer-*.${local.staging_stem}/*" - script = "instanode-pg-customers-staging" -} - -resource "cloudflare_workers_route" "staging_mongodb" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - pattern = "mongo-*.${local.staging_stem}/*" - script = "instanode-mongodb-staging" -} - -resource "cloudflare_workers_route" "staging_redis" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - pattern = "redis-*.${local.staging_stem}/*" - script = "instanode-redis-provision-staging" -} - -resource "cloudflare_workers_route" "staging_nats" { - count = local.is_staging ? 1 : 0 - zone_id = var.zone_id - pattern = "nats-*.${local.staging_stem}/*" - script = "instanode-nats-staging" -} - -# ----------------------------------------------------------------------------- -# Pages custom domain — staging marketing site -# ----------------------------------------------------------------------------- -# -# The Pages project itself is declared in pages.tf with the -# `var.environment == "staging" ? "instanode-web-staging" : "instanode-web"` -# name pattern. The custom-domain attachment is here so prod's pages.tf -# stays simple. - -resource "cloudflare_pages_domain" "staging_marketing" { - count = local.is_staging ? 1 : 0 - account_id = var.account_id - project_name = "instanode-web-staging" - name = local.staging_stem - depends_on = [cloudflare_dns_record.staging] -} diff --git a/terraform/cloudflare/tokens.tf b/terraform/cloudflare/tokens.tf deleted file mode 100644 index 8a37ad0..0000000 --- a/terraform/cloudflare/tokens.tf +++ /dev/null @@ -1,75 +0,0 @@ -# Two scoped API tokens replace the Global API Key for CI / DevOps use. -# Source: exported from CF dashboard 2026-05-30, renamed to avoid the -# default `example_account_token` collision. -# -# WARNING — token values are SENSITIVE outputs. They appear once in TF -# state after `apply`. Operator MUST run the `make install-secrets` -# helper (see Makefile) to push them into k8s + GH org secrets, then -# rotate state. - -# Token A — day-to-day deploy + DNS + R2 + Pages + Workers + Page Rules -# + Load Balancing + Cache Purge + Zone Settings. Account-broad, zone- -# narrow on instanode.dev. Used by CI. -resource "cloudflare_account_token" "deploy" { - account_id = var.account_id - name = "instanode-migration-deploy-${var.environment}" - expires_on = var.deploy_token_expires_on - - policies = [ - # Zone-scoped permissions on instanode.dev (zone_id pinned). - { - effect = "allow" - permission_groups = [ - { id = "c4df38be41c247b3b4b7702e76eadae0" }, # Zone:Read - { id = "3030687196b94b638145a3953da2b699" }, # DNS:Edit - { id = "c8fed203ed3043cba015a93ad1616f1f" }, # Zone Settings:Edit - { id = "c03055bc037c4ea9afb9a9f104b7b721" }, # Cache Purge:Purge - { id = "e17beae8b8cb423a99b1730f21238bed" }, # Page Rules:Edit - { id = "ed07f6c337da4195b4e72a1fb2c6bcae" }, # SSL and Certificates:Edit - { id = "6d7f2f5f5b1d4a0e9081fdc98d432fd1" }, # Load Balancers:Edit - { id = "4755a26eedb94da69e1066d98aa820be" }, # Apps:Edit (zone-side) - ] - resources = jsonencode({ - "com.cloudflare.api.account.zone.${var.zone_id}" = "*" - }) - }, - # Account-scoped permissions for resources that aren't zone-bound. - { - effect = "allow" - permission_groups = [ - { id = "dc44f27f48ab405392a5f69fe822bd01" }, # Workers Scripts:Edit - { id = "8d28297797f24fb8a0c332fe0866ec89" }, # Workers KV Storage:Edit - { id = "bf7481a1826f439697cb59a20b22293e" }, # Workers R2 Storage:Edit - { id = "f7f0eda5697f475c90846e879bab8666" }, # Cloudflare Pages:Edit - { id = "e086da7e2179491d91ee5f35b3ca210a" }, # Account Settings:Read - { id = "d2a1802cc9a34e30852f8b33869b2f3c" }, # LB Monitors & Pools:Edit - { id = "c1fde68c7bcc44588cbb6ddbc16d6480" }, # Account Analytics:Read - ] - resources = jsonencode({ - "com.cloudflare.api.account.${var.account_id}" = "*" - }) - }, - ] -} - -# Token B — break-glass / rare-use Tunnel + Access. Smaller scope, shorter -# expiry. NOT used by CI; kept as separate apply for blast-radius isolation. -resource "cloudflare_account_token" "admin_tunnel" { - account_id = var.account_id - name = "instanode-migration-admin-tunnel-${var.environment}" - expires_on = var.admin_tunnel_token_expires_on - - policies = [{ - effect = "allow" - permission_groups = [ - { id = "ad7a6f88896d498f98eb30592abfbbf4" }, # Cloudflare Tunnel:Edit - { id = "77efc2c0724d4c4eb94bfd9656247130" }, # Access: Apps and Policies:Edit - { id = "db37e5f1cb1a4e1aabaef8deaea43575" }, # Access: Service Tokens:Edit - { id = "a1c0fec57cf94af79479a6d827fa518c" }, # Access: Organizations, Identity Providers:Edit - { id = "1e13c5124ca64b72b1969a67e8829049" }, # Account Settings:Read - ] - resources = jsonencode({ - "com.cloudflare.api.account.${var.account_id}" = "*" - }) - }] -} diff --git a/terraform/cloudflare/variables.tf b/terraform/cloudflare/variables.tf deleted file mode 100644 index 7e9f005..0000000 --- a/terraform/cloudflare/variables.tf +++ /dev/null @@ -1,37 +0,0 @@ -variable "account_id" { - type = string - description = "Cloudflare account ID (CF for Startups credit-tagged account)." - default = "613a9e74136364c781a8e258326019f9" -} - -variable "zone_id" { - type = string - description = "Cloudflare zone ID for instanode.dev." - default = "08a1a569d2d6f9a713dc6d62103c5dc6" -} - -variable "zone_name" { - type = string - default = "instanode.dev" -} - -variable "environment" { - type = string - description = "staging or production. Selected via `terraform workspace`." - validation { - condition = contains(["staging", "production"], var.environment) - error_message = "environment must be one of: staging, production." - } -} - -variable "deploy_token_expires_on" { - type = string - description = "RFC3339 expiry for the deploy token. Rotate every ≤180d." - default = "2026-11-26T23:59:59Z" -} - -variable "admin_tunnel_token_expires_on" { - type = string - description = "RFC3339 expiry for the admin/tunnel token. Rotate every ≤90d." - default = "2026-08-28T23:59:59Z" -} diff --git a/terraform/cloudflare/versions.tf b/terraform/cloudflare/versions.tf deleted file mode 100644 index 942c3ae..0000000 --- a/terraform/cloudflare/versions.tf +++ /dev/null @@ -1,27 +0,0 @@ -terraform { - required_version = ">= 1.4" - - required_providers { - cloudflare = { - source = "cloudflare/cloudflare" - version = "~> 5.0" - } - } - - # State lives in R2 (S3-compatible). The bucket "instanode-tf-state" must - # be created out-of-band before `terraform init` — see README §Bootstrap. - # Operator passes -backend-config="..." at init time; we DON'T hardcode - # the account-specific endpoint or HMAC creds here. - backend "s3" { - bucket = "instanode-tf-state" - key = "cloudflare/terraform.tfstate" - region = "auto" - use_path_style = true - skip_credentials_validation = true - skip_metadata_api_check = true - skip_region_validation = true - skip_requesting_account_id = true - skip_s3_checksum = true - encrypt = true - } -} diff --git a/wrangler/README.md b/wrangler/README.md deleted file mode 100644 index db2b867..0000000 --- a/wrangler/README.md +++ /dev/null @@ -1,97 +0,0 @@ -# Wrangler — CF Containers for staging - -This directory deploys instanode.dev services as **Cloudflare Containers** -to the **staging** environment. Each service has its own subdir with a -`wrangler.toml` + a tiny Worker shell (`src/worker.ts`) that exposes the -Container via a Durable Object binding. - -Production does NOT use this — see the `production-` workflow when written. -Per user direction 2026-05-30: staging is CF-only, ephemeral state acceptable. - -## Why wrangler, not Terraform - -The `cloudflare/cloudflare` Terraform provider (v5.19.1 as of bootstrap) does -NOT yet expose a `cloudflare_container` resource. Verified by `terraform -providers schema -json | jq '.. | keys?' | grep container` → empty. - -Until the provider catches up, we manage Containers via `wrangler` and -**Terraform manages everything else**: DNS, R2, Pages, Hyperdrive, KV, -Queues, secrets — see `../terraform/cloudflare/`. - -When `cloudflare_container` ships, we'll swap in. Until then, the -boundary is clean: - -| Surface | Tool | -|---|---| -| DNS records, R2 buckets, Pages projects, Hyperdrive config, API tokens | **Terraform** (`../terraform/cloudflare/`) | -| CF Containers (api/worker/provisioner + stateful staging services) | **Wrangler** (this dir) | -| k8s manifests (production data plane until that migrates) | **kubectl** (`../k8s/`) | - -## Ephemeral-state acceptance criterion - -CF Containers wipe disk every time an instance goes to sleep (which fires -on traffic-quiet, not just intentional restart). Source: -https://developers.cloudflare.com/containers/platform-details/ - -This means our staging Postgres / Mongo / Redis / NATS containers WILL -lose their data, mid-test sometimes. E2E test design MUST tolerate this: - -1. **Every test seeds its own fixtures** at start; no test assumes state - from a prior test. -2. **No "deploy now, verify in 2h" tests** — the container may have - slept and lost its state in between. -3. **Tests that span multiple HTTP calls** must complete within one - container-active window (typically minutes). -4. **`/db/new` in staging** returns a connection string that may stop - working when the backing Container sleeps. Documented in the staging - API responses. -5. **Synthetic monitors** keep the high-traffic Containers warm; cold - ones are accepted as ephemeral. - -These tradeoffs are explicit and user-blessed per the CF-only staging -decision. Production has a different host (TBD — not in this dir). - -## Per-service layout - -Each subdir contains: - -``` -infra/wrangler// -├── wrangler.toml # CF Container + Worker config -├── src/ -│ └── worker.ts # Tiny Worker shell that wraps the Container DO -├── Dockerfile # Optional override; defaults to ../..//Dockerfile -└── README.md # Service-specific notes (image source, env vars, ports) -``` - -The actual service code (api, worker, provisioner) lives in its own repo -under `instanodedev/` and produces a Docker image that wrangler ships. -For services without a separate repo (pg-platform, pg-customers, mongodb, -redis-provision, nats), we use upstream public images (`postgres:16`, -`mongo:7`, `redis:7`, `nats:2`) and a small staging-only init script. - -## Deploy - -CI auto-deploys on merge to `master` via `../.github/workflows/wrangler-deploy-staging.yml`. -Manual deploy from an operator workstation: - -```bash -cd infra/wrangler/ -wrangler login # one-time -wrangler containers deploy --env staging -``` - -Requires `CLOUDFLARE_API_TOKEN` env (Token A from the TF outputs). - -## Service inventory - -| Subdir | What runs | Stateful? | Public hostname (staging) | Notes | -|---|---|---|---|---| -| `api/` | instanode.dev api binary | no | `api.staging.instanode.dev` | HTTP only | -| `worker/` | River job worker | no | none (cron) | Triggered by CF Cron | -| `provisioner/` | gRPC :50051 service | no | private (Container→Container only) | api calls it | -| `pg-platform/` | postgres:16 | **yes, ephemeral** | private | `instance_type=standard`; data wiped on sleep | -| `pg-customers/` | postgres:16 | **yes, ephemeral** | `pg-customer-.staging.instanode.dev` (one per tenant) | Customer-facing in staging only | -| `mongodb/` | mongo:7 | **yes, ephemeral** | private | accessed by /nosql/new staging | -| `redis-provision/` | redis:7 | **yes, ephemeral** | `redis-.staging.instanode.dev` | Customer-facing | -| `nats/` | nats:2 (no JetStream — JS needs durable disk) | **yes, ephemeral** | `nats-.staging.instanode.dev` | Core NATS only in staging | diff --git a/wrangler/api/README.md b/wrangler/api/README.md deleted file mode 100644 index 80a190f..0000000 --- a/wrangler/api/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# api — CF Containers staging deploy - -Wraps the Go api binary (port 8080) in a CF Container. Image pulled from -`ghcr.io/instanodedev/api:staging` — built by the api repo's CI on every -push to master, tagged with `:staging` for staging deploys. - -## Env vars and secrets - -Config (committed): -- `ENVIRONMENT=staging` -- `OBJECT_STORE_BACKEND=r2` -- `R2_BUCKET_NAME=instant-shared-staging` - -Secrets (via `wrangler secret put`): -- `DATABASE_URL` — points at `pg-platform` Container DO via service binding -- `CUSTOMER_DATABASE_URL` — points at `pg-customers` Container DO -- `REDIS_URL` — service binding to `redis-platform` -- `NATS_URL` — service binding to `nats` -- `AES_KEY`, `JWT_SECRET`, `RAZORPAY_WEBHOOK_SECRET`, `BREVO_API_KEY` — same names as k8s prod -- `R2_HMAC_KEY_ID`, `R2_HMAC_SECRET` — from R2 dashboard, scoped to `instant-shared-staging` bucket - -## Deploy - -```bash -cd infra/wrangler/api -wrangler containers deploy --env staging -``` - -CI auto-deploys on merge to master via the workflow in `infra/.github/workflows/`. - -## Known constraints - -- **Disk wipes on sleep** — api itself is stateless so this is fine; downstream PG/Mongo are NOT (see ../README.md acceptance criterion). -- **HTTP only** — gRPC api→provisioner is fine (CF Containers support HTTP/2). -- **No persistent customer port-forwards** — the dashboard's port-forward proxy is disabled on staging. diff --git a/wrangler/api/src/worker.ts b/wrangler/api/src/worker.ts deleted file mode 100644 index 7e78d5c..0000000 --- a/wrangler/api/src/worker.ts +++ /dev/null @@ -1,32 +0,0 @@ -// Tiny Worker shell for the api Container. -// -// CF Containers require a Worker entrypoint that forwards requests to -// the Container's Durable Object. The container itself runs the actual -// Go binary (instanodedev/api), listening on :8080. -// -// Every incoming HTTP request is routed to a Container instance; CF -// handles spin-up/spin-down. Disk is ephemeral — see ../README.md. - -import { Container, getContainer } from "@cloudflare/containers"; - -export class ApiContainer extends Container { - // The Go binary listens on :8080. - defaultPort = 8080; - // Sleep after 10 minutes of no traffic. CF will spin back up on the - // next request, with a fresh disk. The api is stateless (state lives - // in pg-platform Container), so cold-start is correctness-safe. - sleepAfter = "10m"; -} - -export default { - async fetch(request: Request, env: Env): Promise { - // Route every request to a single Container instance (single-shard - // for staging; production would shard by tenant or geo). - const container = getContainer(env.API_CONTAINER); - return container.fetch(request); - }, -}; - -interface Env { - API_CONTAINER: DurableObjectNamespace; -} diff --git a/wrangler/api/wrangler.toml b/wrangler/api/wrangler.toml deleted file mode 100644 index a403a09..0000000 --- a/wrangler/api/wrangler.toml +++ /dev/null @@ -1,64 +0,0 @@ -# instanode-api on CF Containers (staging). -# -# The api is a Go binary listening on :8080. CF Containers wraps it in a -# Durable Object; the Worker shell in src/worker.ts forwards every HTTP -# request to the container. -# -# Image: pulled from GHCR (built by api repo's CI on every push to master). - -name = "instanode-api" -main = "src/worker.ts" -compatibility_date = "2026-05-30" - -# Per-environment config keeps the staging deploy isolated from any future -# prod deploy (which won't live here — production goes to a non-CF k8s). -[env.staging] -name = "instanode-api-staging" -routes = [ - { pattern = "api.staging.instanode.dev/*", custom_domain = true }, -] - -# Container backed by a Durable Object class. -[[env.staging.containers]] -class_name = "ApiContainer" -image = "ghcr.io/instanode-dev/instant-api:staging" -max_instances = 3 -instance_type = "standard" # 1 vCPU, 4 GiB RAM, 8 GiB ephemeral disk - -[[env.staging.durable_objects.bindings]] -name = "API_CONTAINER" -class_name = "ApiContainer" - -[[env.staging.migrations]] -tag = "v1" -new_sqlite_classes = ["ApiContainer"] - -# Env vars passed to the container. Secrets via `wrangler secret put`. -[env.staging.vars] -ENVIRONMENT = "staging" -OBJECT_STORE_BACKEND = "r2" -R2_BUCKET_NAME = "instant-shared-staging" -# DATABASE_URL, REDIS_URL, NATS_URL, etc. resolve to other Container DOs -# via service bindings — see [[env.staging.services]] block. - -# Service bindings — Worker can RPC into other Containers/Workers without -# a public hostname. -[[env.staging.services]] -binding = "PG_PLATFORM" -service = "instanode-pg-platform-staging" -environment = "staging" - -[[env.staging.services]] -binding = "PROVISIONER" -service = "instanode-provisioner-staging" -environment = "staging" - -[[env.staging.services]] -binding = "REDIS_PLATFORM" -service = "instanode-redis-platform-staging" -environment = "staging" - -# Observability — send Container stdout/stderr to a CF Logpush sink. -[env.staging.observability] -enabled = true -head_sampling_rate = 1.0 diff --git a/wrangler/mongodb/Dockerfile b/wrangler/mongodb/Dockerfile deleted file mode 100644 index afbe234..0000000 --- a/wrangler/mongodb/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -# mongodb image for staging CF Container. -# -# Base: mongo:7. CF Containers' ephemeral disk means EVERY cold start -# is a fresh init — there is no "first init vs subsequent restart" -# distinction. The mongo image's docker-entrypoint runs initdb scripts -# on every fresh /data/db, so the staging-bootstrap script below runs -# every cold start. -# -# Why custom (vs pristine mongo:7): -# - Bake the staging-bootstrap that creates the admin user + sets -# the wire compression default so api can connect without -# post-deploy operator action. -# - Healthcheck via `mongosh ping` for the Worker shell's wait loop. -# - Per-tenant database names are CREATED on demand by provisioner; -# no per-tenant schema baked in here. - -FROM mongo:7 - -# Staging-bootstrap: idempotent admin user. Mongo entrypoint reads -# MONGO_INITDB_ROOT_USERNAME / MONGO_INITDB_ROOT_PASSWORD from env on -# first init; this script is a defence-in-depth ensure path used by -# the api's connection-test against `db.adminCommand({ ping: 1 })`. -COPY infra/wrangler/mongodb/docker-entrypoint-initdb.d/ /docker-entrypoint-initdb.d/ - -# `mongosh` is in the base image; the healthcheck just exercises a -# round-trip via the admin DB to confirm the daemon is up + responsive. -HEALTHCHECK --interval=10s --timeout=3s --start-period=30s --retries=3 \ - CMD mongosh --quiet --eval "db.adminCommand({ping:1}).ok" --host=localhost | grep -q '^1$' || exit 1 - -EXPOSE 27017 diff --git a/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js b/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js deleted file mode 100644 index ef7f31e..0000000 --- a/wrangler/mongodb/docker-entrypoint-initdb.d/00_staging_bootstrap.js +++ /dev/null @@ -1,27 +0,0 @@ -// Staging-bootstrap for mongodb CF Container. Runs on EVERY cold start -// because CF Containers wipe /data/db on sleep. -// -// Idempotent: createUser fails with code 51003 ("user already exists") -// if the admin already created the user in the same boot — we swallow -// that. Other codes propagate. - -(function () { - var adminDb = db.getSiblingDB('admin'); - - // Mongo entrypoint already creates the root user from - // MONGO_INITDB_ROOT_USERNAME/MONGO_INITDB_ROOT_PASSWORD. Confirm it - // resolved successfully so the api connection doesn't hit "no users - // configured" on the first call. - var users = adminDb.system.users.find({ user: 'admin' }).count(); - if (users === 0) { - print('00_staging_bootstrap: no admin user found, creating one from env vars'); - adminDb.createUser({ - user: process.env.MONGO_INITDB_ROOT_USERNAME || 'admin', - pwd: process.env.MONGO_INITDB_ROOT_PASSWORD || 'staging-bootstrap', - roles: [{ role: 'root', db: 'admin' }], - }); - } else { - print('00_staging_bootstrap: admin user already provisioned by mongo entrypoint'); - } - print('00_staging_bootstrap: complete'); -})(); diff --git a/wrangler/mongodb/src/worker.ts b/wrangler/mongodb/src/worker.ts deleted file mode 100644 index 5cc2570..0000000 --- a/wrangler/mongodb/src/worker.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { Container, getContainer } from "@cloudflare/containers"; - -export class MongoContainer extends Container { - defaultPort = 27017; - sleepAfter = "20m"; -} - -export default { - async fetch(request: Request, env: Env): Promise { - const url = new URL(request.url); - const tenant = url.hostname.split(".")[0].replace(/^mongo-/, ""); - const id = env.MONGO_CONTAINER.idFromName(tenant); - return env.MONGO_CONTAINER.get(id).fetch(request); - }, -}; - -interface Env { - MONGO_CONTAINER: DurableObjectNamespace; -} diff --git a/wrangler/mongodb/wrangler.toml b/wrangler/mongodb/wrangler.toml deleted file mode 100644 index 48d30dc..0000000 --- a/wrangler/mongodb/wrangler.toml +++ /dev/null @@ -1,30 +0,0 @@ -# mongodb — per-tenant Mongo in a CF Container (staging). - -name = "instanode-mongodb" -main = "src/worker.ts" -compatibility_date = "2026-05-30" - -[env.staging] -name = "instanode-mongodb-staging" -routes = [ - { pattern = "mongo-*.staging.instanode.dev/*", custom_domain = true }, -] - -[[env.staging.containers]] -class_name = "MongoContainer" -# Custom image — wraps mongo:7 with staging-bootstrap + healthcheck. -image = "ghcr.io/instanode-dev/instant-mongodb:staging" -max_instances = 10 -instance_type = "standard" - -[[env.staging.durable_objects.bindings]] -name = "MONGO_CONTAINER" -class_name = "MongoContainer" - -[[env.staging.migrations]] -tag = "v1" -new_sqlite_classes = ["MongoContainer"] - -[env.staging.observability] -enabled = true -head_sampling_rate = 1.0 diff --git a/wrangler/nats/Dockerfile b/wrangler/nats/Dockerfile deleted file mode 100644 index e3cd67a..0000000 --- a/wrangler/nats/Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ -# nats image for staging CF Container. -# -# Base: nats:2-alpine. JetStream needs durable disk — NOT viable on -# CF Containers' ephemeral storage — so this image runs CORE NATS ONLY -# (no -js flag). Customer-facing /queue/new in staging returns a -# legacy_open connection string and tests that exercise JetStream -# features are skipped (see test guard in api/internal/handlers/queue.go). -# -# Auth mode: legacy_open. Per CLAUDE.md "Known Design Gaps", prod -# serves legacy_open until the operator runs `nsc generate` for -# operator/sys NKeys (NATS-AUTH-RUNBOOK.md). Staging matches prod's -# current auth posture. - -FROM nats:2-alpine - -COPY infra/wrangler/nats/nats-server.conf /etc/nats/nats-server.conf - -HEALTHCHECK --interval=10s --timeout=3s --start-period=10s --retries=3 \ - CMD wget -qO- http://localhost:8222/healthz | grep -q '"status":"ok"' || exit 1 - -EXPOSE 4222 8222 - -CMD ["-c", "/etc/nats/nats-server.conf"] diff --git a/wrangler/nats/nats-server.conf b/wrangler/nats/nats-server.conf deleted file mode 100644 index db33a4f..0000000 --- a/wrangler/nats/nats-server.conf +++ /dev/null @@ -1,33 +0,0 @@ -# Staging nats-server.conf — core NATS only (no JetStream — ephemeral -# disk on CF Containers can't satisfy JetStream's durable WAL). -# -# Auth mode: legacy_open. No per-tenant JWT in staging. Production -# eventually upgrades to per-tenant JWT once an operator runs -# `nsc generate` for operator + sys NKeys (NATS-AUTH-RUNBOOK.md). -# This staging config DOES NOT block on that. - -listen: 0.0.0.0:4222 - -# HTTP monitoring endpoint used by the Worker shell's healthcheck. -http: 0.0.0.0:8222 - -# Connection + payload limits matched to CF Container "basic" class. -max_connections: 1000 -max_payload: 1MB -max_pending: 32MB - -# Logging to stdout for `wrangler tail`. -debug: false -trace: false -logtime: true - -# Auth — legacy_open: no creds required. Customers connecting via -# /queue/new staging endpoint get an open URL. -authorization { - # Empty block = no auth. Documented intentional choice. -} - -# NO JetStream block — explicitly disabled because CF Container disk -# is ephemeral. Tests that require JetStream skip on staging via the -# `auth_mode=legacy_open` resource field (see CLAUDE.md /queue/new). -# jetstream { ... } # DO NOT enable in staging diff --git a/wrangler/nats/src/worker.ts b/wrangler/nats/src/worker.ts deleted file mode 100644 index 45f2350..0000000 --- a/wrangler/nats/src/worker.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { Container, getContainer } from "@cloudflare/containers"; - -export class NatsContainer extends Container { - defaultPort = 4222; - sleepAfter = "20m"; -} - -export default { - async fetch(request: Request, env: Env): Promise { - const url = new URL(request.url); - const tenant = url.hostname.split(".")[0].replace(/^nats-/, ""); - const id = env.NATS_CONTAINER.idFromName(tenant); - return env.NATS_CONTAINER.get(id).fetch(request); - }, -}; - -interface Env { - NATS_CONTAINER: DurableObjectNamespace; -} diff --git a/wrangler/nats/wrangler.toml b/wrangler/nats/wrangler.toml deleted file mode 100644 index 7315949..0000000 --- a/wrangler/nats/wrangler.toml +++ /dev/null @@ -1,40 +0,0 @@ -# nats — per-tenant NATS in a CF Container (staging). -# NATS JetStream needs durable disk — NOT viable on ephemeral. Staging -# runs core NATS only (no streams). /queue/new in staging returns a -# legacy_open connection string. JetStream features test-skipped. - -name = "instanode-nats" -main = "src/worker.ts" -compatibility_date = "2026-05-30" - -[env.staging] -name = "instanode-nats-staging" -routes = [ - { pattern = "nats-*.staging.instanode.dev/*", custom_domain = true }, -] - -[[env.staging.containers]] -class_name = "NatsContainer" -# Custom image — wraps nats:2-alpine with /etc/nats/nats-server.conf -# baked in (core NATS only, no JetStream, legacy_open auth — matches -# prod's current auth posture). -image = "ghcr.io/instanode-dev/instant-nats:staging" -max_instances = 10 -instance_type = "basic" - -[[env.staging.durable_objects.bindings]] -name = "NATS_CONTAINER" -class_name = "NatsContainer" - -[[env.staging.migrations]] -tag = "v1" -new_sqlite_classes = ["NatsContainer"] - -[env.staging.vars] -# No -js flag → core NATS only. Document that JetStream is staging-disabled -# in /tmp/cf-migration/shared/STAGING-LIMITATIONS.md. -NATS_ARGS = "-m 8222" - -[env.staging.observability] -enabled = true -head_sampling_rate = 1.0 diff --git a/wrangler/pg-customers/src/worker.ts b/wrangler/pg-customers/src/worker.ts deleted file mode 100644 index 73ce9b0..0000000 --- a/wrangler/pg-customers/src/worker.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { Container, getContainer } from "@cloudflare/containers"; - -export class PgCustomersContainer extends Container { - defaultPort = 5432; - sleepAfter = "20m"; -} - -export default { - async fetch(request: Request, env: Env): Promise { - // Per-tenant routing: extract tenant from subdomain. - const url = new URL(request.url); - const tenant = url.hostname.split(".")[0].replace(/^pg-customer-/, ""); - // ID by tenant → one DO instance per tenant (their isolated PG). - const id = env.PG_CUSTOMERS_CONTAINER.idFromName(tenant); - const container = env.PG_CUSTOMERS_CONTAINER.get(id); - return container.fetch(request); - }, -}; - -interface Env { - PG_CUSTOMERS_CONTAINER: DurableObjectNamespace; -} diff --git a/wrangler/pg-customers/wrangler.toml b/wrangler/pg-customers/wrangler.toml deleted file mode 100644 index 65a2b52..0000000 --- a/wrangler/pg-customers/wrangler.toml +++ /dev/null @@ -1,36 +0,0 @@ -# pg-customers — per-tenant Postgres in a CF Container (staging only). -# Customer-facing: /db/new in staging returns a connection string here. -# Data is EPHEMERAL — wipes on container sleep. Documented in ../README.md. - -name = "instanode-pg-customers" -main = "src/worker.ts" -compatibility_date = "2026-05-30" - -[env.staging] -name = "instanode-pg-customers-staging" -# Public TCP exposure happens via the Worker shell; staging clients dial -# `pg-customer-.staging.instanode.dev:5432`. -routes = [ - { pattern = "pg-customer-*.staging.instanode.dev/*", custom_domain = true }, -] - -[[env.staging.containers]] -class_name = "PgCustomersContainer" -image = "postgres:16-alpine" -max_instances = 10 # staging cap — bump if QA needs more -instance_type = "standard" - -[[env.staging.durable_objects.bindings]] -name = "PG_CUSTOMERS_CONTAINER" -class_name = "PgCustomersContainer" - -[[env.staging.migrations]] -tag = "v1" -new_sqlite_classes = ["PgCustomersContainer"] - -[env.staging.vars] -PGDATA = "/var/lib/postgresql/data/pgdata" - -[env.staging.observability] -enabled = true -head_sampling_rate = 1.0 diff --git a/wrangler/pg-platform/00_pre.sql b/wrangler/pg-platform/00_pre.sql deleted file mode 100644 index f2c18fb..0000000 --- a/wrangler/pg-platform/00_pre.sql +++ /dev/null @@ -1,25 +0,0 @@ --- Runs FIRST in /docker-entrypoint-initdb.d/ (alphabetical sort puts --- "00_pre.sql" ahead of "001_initial.sql"). Sets up extensions + log --- markers that every later migration depends on. --- --- This file is staging-only — production uses different operator-run --- bootstrap. See infra/wrangler/pg-platform/Dockerfile for context. - --- pgvector — mig 040+ does CREATE EXTENSION vector and assumes the --- shared library is loadable. pgvector/pgvector:pg16 ships the .so; --- this just registers it in the freshly-init'd database. -CREATE EXTENSION IF NOT EXISTS vector; - --- Standard extensions we use across migrations. -CREATE EXTENSION IF NOT EXISTS "uuid-ossp"; -CREATE EXTENSION IF NOT EXISTS "pgcrypto"; - --- Match prod timezone — every timestamp comparison in tests assumes UTC. -SET TIME ZONE 'UTC'; - --- Log marker. Shows in `wrangler tail` so operators know this is a --- cold-start init (vs an unexpected mid-life restart). -DO $$ -BEGIN - RAISE NOTICE 'pg-platform staging cold start — re-applying 63 migrations against fresh PGDATA'; -END $$; diff --git a/wrangler/pg-platform/Dockerfile b/wrangler/pg-platform/Dockerfile deleted file mode 100644 index 3b83f11..0000000 --- a/wrangler/pg-platform/Dockerfile +++ /dev/null @@ -1,53 +0,0 @@ -# pg-platform image for staging CF Container. -# -# Base: pgvector/pgvector:pg16 — Postgres 16 + the pgvector extension -# that platform_db's resource embeddings table requires (extension CREATE -# in mig 040+; without pgvector the image init fails on the first -# `CREATE EXTENSION vector` statement). -# -# Migrations: the 63 *.sql files from api/internal/db/migrations/ are -# copied into /docker-entrypoint-initdb.d/. Postgres's official -# entrypoint runs every *.sql alphabetically on first cluster init — -# and CF Containers' ephemeral disk means EVERY cold start IS a first -# cluster init, so the migrations re-apply on every wake-from-sleep. -# -# This is the explicit, user-blessed ephemeral-state tradeoff for the -# CF-only staging design. See ../README.md acceptance criterion. -# -# Build context: workspace root (../../). -# Build command (CI runs this; not for ad-hoc local use): -# docker buildx build \ -# -f infra/wrangler/pg-platform/Dockerfile \ -# -t ghcr.io/instanode-dev/instant-pg-platform:staging \ -# --push \ -# . - -FROM pgvector/pgvector:pg16 - -# Copy every migration file in numeric (=alphabetical) order. The -# leading 0NN_*.sql naming guarantees the entrypoint applies them in -# the same order as `make test-db-up` does locally. -COPY api/internal/db/migrations/*.sql /docker-entrypoint-initdb.d/ - -# A pre-script that runs before any migration. Names start with "00_" -# so it sorts ahead of "001_initial.sql". -# -# We use it to: -# 1. CREATE EXTENSION pgvector (idempotent — base image has the -# shared lib; this enables it in the freshly-init'd database). -# 2. Set timezone to UTC to match production. -# 3. Print a one-line marker so the CF Container's logs make clear -# this is a fresh init (operator confidence on cold start). -COPY infra/wrangler/pg-platform/00_pre.sql /docker-entrypoint-initdb.d/00_pre.sql - -# postgres image expects POSTGRES_PASSWORD set; staging wrangler.toml -# wires that through `wrangler secret put POSTGRES_PASSWORD`. The -# image also reads POSTGRES_DB / POSTGRES_USER if provided (wrangler -# env block sets POSTGRES_DB=instant_platform). - -# Healthcheck — pg_isready against the local socket. Used by the -# Worker shell's container.fetch wait-loop. -HEALTHCHECK --interval=10s --timeout=3s --start-period=30s --retries=3 \ - CMD pg_isready -U "${POSTGRES_USER:-postgres}" -d "${POSTGRES_DB:-instant_platform}" || exit 1 - -EXPOSE 5432 diff --git a/wrangler/pg-platform/README.md b/wrangler/pg-platform/README.md deleted file mode 100644 index 67b992a..0000000 --- a/wrangler/pg-platform/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# pg-platform — staging CF Container - -Postgres 16 + pgvector. Image baked with all 63 platform migrations in -`/docker-entrypoint-initdb.d/` so cold starts come up with a fully -migrated schema. - -## Ephemeral acceptance - -Per the CF-only staging decision (2026-05-30): disk wipes every time the -Container sleeps (which fires on traffic-quiet, not just intentional -restart). Each cold start: - -1. CF Containers wakes the Container with a fresh disk. -2. Postgres entrypoint sees PGDATA empty → runs `initdb`. -3. `00_pre.sql` runs first — pgvector + uuid-ossp + pgcrypto extensions, UTC tz. -4. The 63 migration files run in numeric order (001 → 063). -5. Container reports healthy via `pg_isready`. -6. api / worker / provisioner Containers can now connect via service binding. - -Total cold-start time: estimated 15–45s depending on Container class + -migration count. Anything that talks to pg-platform must tolerate this -warmup (Worker shell's `container.fetch` blocks until healthy). - -## Image build - -The image is built by `infra/.github/workflows/wrangler-build-staging-images.yml` -on push to master that changes any of: -- `api/internal/db/migrations/**` (cross-repo trigger via repository_dispatch — see below) -- `infra/wrangler/pg-platform/**` - -Plus daily at 09:00 UTC to keep up with migrations merged in api repo without -explicit infra commits. - -Manual rebuild: -```bash -gh workflow run wrangler-build-staging-images.yml \ - -R instanode-dev/infra \ - -f service=pg-platform -``` - -## Cross-repo migration sync - -Migrations live in the `api` repo, not infra. Two patterns to keep the -image current: - -1. **Daily cron rebuild** — the build workflow runs nightly with a fresh - checkout of both repos; any new `.sql` file lands within 24h. -2. **`api` repo notifies on migration change** — `api/.github/workflows/notify-infra-on-migration.yml` - sends a `repository_dispatch` event to infra when `api/internal/db/migrations/**` - changes, triggering an immediate build. - -If neither runs, staging pg-platform will be behind on migrations and -api startup will fail with "migration not applied" — operator-visible -via `wrangler tail instanode-pg-platform-staging`. - -## Secrets - -Set via `wrangler secret put`, scoped to `--env staging`: - -| Secret | Source | Purpose | -|---|---|---| -| `POSTGRES_USER` | operator-defined (e.g. `instanode_admin`) | role for connection | -| `POSTGRES_PASSWORD` | random, ≥32 chars | passed to connection_url | -| `POSTGRES_DB` | `instant_platform` | initial DB created at first start | - -The actual connection string handed to api/worker/provisioner is built -via service binding — they see `PG_PLATFORM` env binding, not a raw -URL with the password. - -## Verifying - -```bash -wrangler tail instanode-pg-platform-staging --format pretty -# wait for: "pg-platform staging cold start — re-applying 63 migrations against fresh PGDATA" -# then: "database system is ready to accept connections" - -# from a debug Worker shell: -wrangler dev --env staging -# Then inside the Worker: env.PG_PLATFORM.fetch("http://internal/healthz") -``` - -## Known limitations - -- **Cold-start cost is ~15-45s.** Synthetic warmer can keep it hot; without one, every traffic gap > sleepAfter (currently 30m) pays the full re-migration cost. -- **No replication.** max_instances=1; HA is meaningless when disk is ephemeral. Production gets a different model entirely. -- **No `pg_dump` artifacts persist.** If you need a snapshot for debugging, dump and immediately stream to R2 via the customer-backup pipeline; the local file dies on next sleep. -- **63 migrations is the live count as of 2026-05-30.** When api repo adds mig 064+, the daily cron rebuild picks them up. diff --git a/wrangler/pg-platform/src/worker.ts b/wrangler/pg-platform/src/worker.ts deleted file mode 100644 index 7646da8..0000000 --- a/wrangler/pg-platform/src/worker.ts +++ /dev/null @@ -1,25 +0,0 @@ -// pg-platform Worker shell. Postgres doesn't speak HTTP, but CF -// Containers require a Worker entrypoint. The Worker accepts a -// service-binding RPC from other Containers and forwards a connection -// hint; the actual TCP traffic flows over the Container DO's internal -// network using `container.fetch(request)` with `Upgrade: tcp` semantics -// (CF Containers' raw-TCP mode, available since the GA release). - -import { Container, getContainer } from "@cloudflare/containers"; - -export class PgPlatformContainer extends Container { - defaultPort = 5432; - sleepAfter = "30m"; // Longer than api so platform_db survives test bursts. -} - -export default { - async fetch(request: Request, env: Env): Promise { - const container = getContainer(env.PG_CONTAINER); - // Container holds the TCP listener; CF routes the upgraded socket through. - return container.fetch(request); - }, -}; - -interface Env { - PG_CONTAINER: DurableObjectNamespace; -} diff --git a/wrangler/pg-platform/wrangler.toml b/wrangler/pg-platform/wrangler.toml deleted file mode 100644 index 274e033..0000000 --- a/wrangler/pg-platform/wrangler.toml +++ /dev/null @@ -1,48 +0,0 @@ -# pg-platform on CF Containers (staging). -# -# Runs `postgres:16` in a CF Container. Data dir is ephemeral — -# every sleep wipes /var/lib/postgresql/data. This is the explicit -# user-blessed tradeoff for CF-only staging. -# -# Production does NOT use this; prod platform_db lives elsewhere. - -name = "instanode-pg-platform" -main = "src/worker.ts" -compatibility_date = "2026-05-30" - -[env.staging] -name = "instanode-pg-platform-staging" -# No public route — accessed only via service binding from api/worker/provisioner. - -[[env.staging.containers]] -class_name = "PgPlatformContainer" -# Custom image built by infra/.github/workflows/wrangler-build-staging-images.yml. -# Bakes the 63 migrations from api/internal/db/migrations/*.sql into -# /docker-entrypoint-initdb.d/ + pgvector extension. See ./Dockerfile. -image = "ghcr.io/instanode-dev/instant-pg-platform:staging" -max_instances = 1 # Single-writer; HA is meaningless when disk is ephemeral. -instance_type = "standard" # 1 vCPU, 4 GiB RAM, 8 GiB ephemeral - -[[env.staging.durable_objects.bindings]] -name = "PG_CONTAINER" -class_name = "PgPlatformContainer" - -[[env.staging.migrations]] -tag = "v1" -new_sqlite_classes = ["PgPlatformContainer"] - -# Bootstrap secrets via wrangler secret put: -# POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB -# The Postgres image reads these env vars on first boot to initialize the -# cluster — which it'll redo every sleep cycle. -[env.staging.vars] -POSTGRES_DB = "instant_platform" -# POSTGRES_INITDB_ARGS controls locale; staging just uses default. -PGDATA = "/var/lib/postgresql/data/pgdata" -# Run our 62 migrations on container boot. The init script lives in -# src/bootstrap.sh and is included in the image via Dockerfile. -APPLY_MIGRATIONS_ON_BOOT = "true" - -[env.staging.observability] -enabled = true -head_sampling_rate = 1.0 diff --git a/wrangler/provisioner/src/worker.ts b/wrangler/provisioner/src/worker.ts deleted file mode 100644 index 72fde55..0000000 --- a/wrangler/provisioner/src/worker.ts +++ /dev/null @@ -1,16 +0,0 @@ -import { Container, getContainer } from "@cloudflare/containers"; - -export class ProvisionerContainer extends Container { - defaultPort = 50051; // gRPC - sleepAfter = "20m"; -} - -export default { - async fetch(request: Request, env: Env): Promise { - return getContainer(env.PROVISIONER_CONTAINER).fetch(request); - }, -}; - -interface Env { - PROVISIONER_CONTAINER: DurableObjectNamespace; -} diff --git a/wrangler/provisioner/wrangler.toml b/wrangler/provisioner/wrangler.toml deleted file mode 100644 index d1c93dc..0000000 --- a/wrangler/provisioner/wrangler.toml +++ /dev/null @@ -1,47 +0,0 @@ -# provisioner — gRPC service in a CF Container (staging). -# No public route; api reaches it via service binding. - -name = "instanode-provisioner" -main = "src/worker.ts" -compatibility_date = "2026-05-30" - -[env.staging] -name = "instanode-provisioner-staging" - -[[env.staging.containers]] -class_name = "ProvisionerContainer" -image = "ghcr.io/instanode-dev/instant-provisioner:staging" -max_instances = 2 -instance_type = "standard" - -[[env.staging.durable_objects.bindings]] -name = "PROVISIONER_CONTAINER" -class_name = "ProvisionerContainer" - -[[env.staging.migrations]] -tag = "v1" -new_sqlite_classes = ["ProvisionerContainer"] - -[env.staging.vars] -ENVIRONMENT = "staging" - -# Provisioner reaches the customer-data Containers via service bindings. -[[env.staging.services]] -binding = "PG_CUSTOMERS" -service = "instanode-pg-customers-staging" - -[[env.staging.services]] -binding = "MONGODB" -service = "instanode-mongodb-staging" - -[[env.staging.services]] -binding = "REDIS_PROVISION" -service = "instanode-redis-provision-staging" - -[[env.staging.services]] -binding = "NATS" -service = "instanode-nats-staging" - -[env.staging.observability] -enabled = true -head_sampling_rate = 1.0 diff --git a/wrangler/redis-provision/Dockerfile b/wrangler/redis-provision/Dockerfile deleted file mode 100644 index 299d710..0000000 --- a/wrangler/redis-provision/Dockerfile +++ /dev/null @@ -1,30 +0,0 @@ -# redis-provision image for staging CF Container. -# -# Base: redis:7-alpine. CF Containers' ephemeral disk means RDB -# persistence is pointless — every sleep wipes /data. We disable -# RDB + AOF entirely and run in-memory-only with `allkeys-lru` -# eviction so the Container can't OOM under sustained writes. -# -# Why custom (vs pristine redis:7-alpine): -# - Bake redis.conf with auth + memory + eviction policy so the -# Worker shell doesn't have to pass them via wrangler.toml CMD. -# - Healthcheck via `redis-cli -a $REDIS_PASSWORD ping`. -# - Auth is via `requirepass` from REDIS_PASSWORD env (wrangler -# secret). - -FROM redis:7-alpine - -COPY infra/wrangler/redis-provision/redis.conf /etc/redis/redis.conf - -# Entrypoint that templates REDIS_PASSWORD env into the conf at boot. -# Without this, the conf can't contain the secret at build time. -COPY infra/wrangler/redis-provision/entrypoint.sh /usr/local/bin/staging-entrypoint.sh -RUN chmod +x /usr/local/bin/staging-entrypoint.sh - -HEALTHCHECK --interval=10s --timeout=3s --start-period=10s --retries=3 \ - CMD redis-cli -a "$REDIS_PASSWORD" --no-auth-warning ping | grep -q '^PONG$' || exit 1 - -EXPOSE 6379 - -ENTRYPOINT ["/usr/local/bin/staging-entrypoint.sh"] -CMD ["redis-server", "/etc/redis/redis.conf"] diff --git a/wrangler/redis-provision/entrypoint.sh b/wrangler/redis-provision/entrypoint.sh deleted file mode 100644 index bc62464..0000000 --- a/wrangler/redis-provision/entrypoint.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh -# Templating entrypoint for staging redis. Inlines REDIS_PASSWORD into -# /etc/redis/redis.conf at boot (the file ships with __REDIS_PASSWORD__ -# as a literal marker; we never bake a real secret into the image). - -set -eu - -if [ -z "${REDIS_PASSWORD:-}" ]; then - echo "redis-provision: REDIS_PASSWORD env var is required" >&2 - exit 1 -fi - -# In-place substitute. Using a temp file because sed -i on alpine -# behaves differently than GNU sed; this is portable. -TMP="$(mktemp)" -sed "s|__REDIS_PASSWORD__|${REDIS_PASSWORD}|" /etc/redis/redis.conf > "$TMP" -mv "$TMP" /etc/redis/redis.conf -chmod 600 /etc/redis/redis.conf # only root reads — defense in depth - -# Hand off to the configured CMD (`redis-server /etc/redis/redis.conf`). -exec "$@" diff --git a/wrangler/redis-provision/redis.conf b/wrangler/redis-provision/redis.conf deleted file mode 100644 index 7b423d0..0000000 --- a/wrangler/redis-provision/redis.conf +++ /dev/null @@ -1,28 +0,0 @@ -# Staging redis.conf — ephemeral, auth'd, LRU-capped. -# REDIS_PASSWORD is substituted at container boot by entrypoint.sh. - -bind 0.0.0.0 -port 6379 -protected-mode yes - -# Auth — entrypoint.sh inlines REDIS_PASSWORD env value here. -requirepass __REDIS_PASSWORD__ - -# Memory cap + eviction. CF Container "basic" tier has 4 GiB; cap at 3 -# GiB to leave headroom for connection buffers + COW during eviction. -maxmemory 3gb -maxmemory-policy allkeys-lru - -# No persistence — CF Containers wipe /data on sleep, so RDB snapshots -# only waste CPU. AOF same. Staging is in-memory-only by design. -save "" -appendonly no - -# Logging to stdout for `wrangler tail`. -logfile "" -loglevel notice - -# Connection limits matched to instance class. -maxclients 1000 -timeout 300 -tcp-keepalive 60 diff --git a/wrangler/redis-provision/src/worker.ts b/wrangler/redis-provision/src/worker.ts deleted file mode 100644 index 2b77911..0000000 --- a/wrangler/redis-provision/src/worker.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { Container, getContainer } from "@cloudflare/containers"; - -export class RedisContainer extends Container { - defaultPort = 6379; - sleepAfter = "20m"; -} - -export default { - async fetch(request: Request, env: Env): Promise { - const url = new URL(request.url); - const tenant = url.hostname.split(".")[0].replace(/^redis-/, ""); - const id = env.REDIS_CONTAINER.idFromName(tenant); - return env.REDIS_CONTAINER.get(id).fetch(request); - }, -}; - -interface Env { - REDIS_CONTAINER: DurableObjectNamespace; -} diff --git a/wrangler/redis-provision/wrangler.toml b/wrangler/redis-provision/wrangler.toml deleted file mode 100644 index 2896e8d..0000000 --- a/wrangler/redis-provision/wrangler.toml +++ /dev/null @@ -1,32 +0,0 @@ -# redis-provision — per-tenant Redis in a CF Container (staging). - -name = "instanode-redis-provision" -main = "src/worker.ts" -compatibility_date = "2026-05-30" - -[env.staging] -name = "instanode-redis-provision-staging" -routes = [ - { pattern = "redis-*.staging.instanode.dev/*", custom_domain = true }, -] - -[[env.staging.containers]] -class_name = "RedisContainer" -# Custom image — wraps redis:7-alpine with auth + maxmemory + LRU -# eviction baked into /etc/redis/redis.conf (entrypoint templates -# REDIS_PASSWORD in at boot). -image = "ghcr.io/instanode-dev/instant-redis-provision:staging" -max_instances = 10 -instance_type = "basic" # Redis is lighter than PG/Mongo - -[[env.staging.durable_objects.bindings]] -name = "REDIS_CONTAINER" -class_name = "RedisContainer" - -[[env.staging.migrations]] -tag = "v1" -new_sqlite_classes = ["RedisContainer"] - -[env.staging.observability] -enabled = true -head_sampling_rate = 1.0 diff --git a/wrangler/worker/src/worker.ts b/wrangler/worker/src/worker.ts deleted file mode 100644 index db330bb..0000000 --- a/wrangler/worker/src/worker.ts +++ /dev/null @@ -1,23 +0,0 @@ -import { Container, getContainer } from "@cloudflare/containers"; - -export class WorkerContainer extends Container { - defaultPort = 8091; // worker exposes /metrics + /readyz on 8091 - sleepAfter = "20m"; -} - -export default { - // HTTP path: forward to container (rare; mostly metrics scrapes). - async fetch(request: Request, env: Env): Promise { - return getContainer(env.WORKER_CONTAINER).fetch(request); - }, - // Cron path: wake the container so River picks up due jobs. - async scheduled(_event: ScheduledEvent, env: Env): Promise { - const c = getContainer(env.WORKER_CONTAINER); - // A no-op POST that the worker binary handles as "tick the job loop". - await c.fetch("http://internal/tick", { method: "POST" }); - }, -}; - -interface Env { - WORKER_CONTAINER: DurableObjectNamespace; -} diff --git a/wrangler/worker/wrangler.toml b/wrangler/worker/wrangler.toml deleted file mode 100644 index 05b555d..0000000 --- a/wrangler/worker/wrangler.toml +++ /dev/null @@ -1,40 +0,0 @@ -# worker — River jobs in a CF Container (staging). -# Cron triggers via CF Cron Triggers (no public route). - -name = "instanode-worker" -main = "src/worker.ts" -compatibility_date = "2026-05-30" - -[env.staging] -name = "instanode-worker-staging" - -[[env.staging.containers]] -class_name = "WorkerContainer" -image = "ghcr.io/instanode-dev/instant-worker:staging" -max_instances = 2 -instance_type = "standard" - -[[env.staging.durable_objects.bindings]] -name = "WORKER_CONTAINER" -class_name = "WorkerContainer" - -[[env.staging.migrations]] -tag = "v1" -new_sqlite_classes = ["WorkerContainer"] - -# Cron — fires every 5 minutes; the Worker shell wakes the Container. -[env.staging.triggers] -crons = ["*/5 * * * *"] - -[env.staging.vars] -ENVIRONMENT = "staging" -OBJECT_STORE_BACKEND = "r2" -R2_BUCKET_NAME = "instant-shared-staging" - -[[env.staging.services]] -binding = "PG_PLATFORM" -service = "instanode-pg-platform-staging" - -[env.staging.observability] -enabled = true -head_sampling_rate = 1.0