# Source: api/.github/workflows/deploy.yml at master (InstaNode-dev/api)
# instant.dev/api — Auto-deploy on push to master
#
# Why this exists:
#   Until 2026-05-15, "shipped to master" did NOT mean "running in prod" —
#   an operator had to manually `docker buildx build && kubectl set image`.
#   A worker fix landed but never deployed; a user got a broken expiry email
#   twice as a result. This workflow eliminates that gap.
#
# Build context note:
#   The Dockerfile expects to be invoked from the parent of api/, with
#   sibling common/ and proto/ directories present (CLAUDE.md convention).
#   In CI we mirror that by checking out:
#       . (workspace root)
#       ├── api/      (this repo)
#       ├── common/   (sibling repo)
#       └── proto/    (sibling repo)
#   then `docker buildx build -f api/Dockerfile .` from the workspace root.
#
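# Required repo secrets:
#   KUBECONFIG_B64    — base64-encoded kubeconfig with permission to
#                       `kubectl set image deployment/instant-api -n instant`.
#                       See CLAUDE.md "Local Kubernetes Setup" for the cluster.
#   REPO_ACCESS_TOKEN — fine-grained PAT with read access to the private
#                       sibling repos (common, proto) checked out by this job.
#   GHCR_PUSH_TOKEN   — classic PAT with write:packages, used to push the
#                       image to ghcr.io/instanode-dev/instant-api.
#
# GHCR auth does NOT use the per-job GITHUB_TOKEN: it is scoped to this repo
# and is not authorised to push the org-owned package (see the login step).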

name: Deploy

# Triggers: every push to master (minus docs-only pushes) and manual runs.
on:
  push:
    branches:
      - master
    # CI-minute savings (2026-05-21): a push whose changed paths ALL match the
    # globs below is skipped. Markdown, CLAUDE.md, runbooks, design docs and
    # the BUGBASH ledger never change the binary, so they do not need a 7-min
    # test step + a 3-min image build + rollout. A commit that touches any
    # non-ignored path alongside a .md file still triggers Deploy normally.
    paths-ignore:
      - '**.md'
      - 'docs/**'
      - 'CLAUDE.md'
      - '.gitignore'
      - 'LICENSE'
      - 'BUGBASH-*/**'
  workflow_dispatch:

concurrency:
  # CI-minute savings (2026-05-21): a newer push to the same ref cancels the
  # Deploy that is still in flight rather than letting both finish. Five
  # pushes in ten minutes now cost roughly one final Deploy, not five.
  cancel-in-progress: true
  group: deploy-${{ github.workflow }}-${{ github.ref }}

# Least privilege for the per-job GITHUB_TOKEN. Only `contents: read` is
# needed (checkout of this repo). `packages: write` was dropped: the GHCR
# login step authenticates with secrets.GHCR_PUSH_TOKEN, and no step in this
# workflow uses GITHUB_TOKEN against the package registry.
permissions:
  contents: read

# Workflow-wide settings shared by the build, rollout and verification steps.
env:
  # Image repository pushed to and deployed from (tag is added per build).
  IMAGE_REPO: 'ghcr.io/instanode-dev/instant-api'
  # Target of `kubectl set image`: namespace / deployment / container name.
  K8S_NAMESPACE: 'instant'
  K8S_DEPLOYMENT: 'instant-api'
  K8S_CONTAINER: 'api'
  # Public endpoint polled after rollout to confirm the new commit is live.
  HEALTHZ_URL: 'https://api.instanode.dev/healthz'

jobs:
  deploy:
    runs-on: ubuntu-latest
    # Service containers for the unit tests (added 2026-05-15). The api tests
    # need a real Postgres + Redis (testhelpers.SetupTestDB / SetupTestRedis);
    # the first auto-deploy run failed because the runner could reach neither.
    # Ports and credentials mirror the defaults in
    # api/internal/testhelpers/testhelpers.go:
    #   defaultTestDBURL    = postgres://postgres:postgres@localhost:5432/instant_dev_test
    #   defaultTestRedisURL = redis://localhost:6379/15
    services:
      postgres:
        image: postgres:17-alpine
        ports:
          - '5432:5432'
        env:
          POSTGRES_DB: instant_dev_test
          POSTGRES_USER: postgres
          POSTGRES_PASSWORD: postgres
        # Container is considered ready once pg_isready succeeds.
        options: >-
          --health-cmd "pg_isready -U postgres"
          --health-interval 5s --health-timeout 3s --health-retries 12
      redis:
        image: redis:7-alpine
        ports:
          - '6379:6379'
        # Container is considered ready once redis-cli ping succeeds.
        options: >-
          --health-cmd "redis-cli ping"
          --health-interval 5s --health-timeout 3s --health-retries 12
    steps:
      - name: Checkout api (this repo) into ./api
        # Checked out into a subdirectory so the workspace root can also hold
        # the sibling repos the Dockerfile and go.mod expect next to api/.
        uses: actions/checkout@v4
        with:
          path: api
          # actions/checkout stores the token in the clone's .git/config by
          # default. No later step runs an authenticated git command, and the
          # Docker build context is the workspace root, so keep the credential
          # out of api/.git entirely.
          persist-credentials: false

      - name: Checkout common sibling into ./common
        uses: actions/checkout@v4
        with:
          # Repo is overridable through the COMMON_REPO repository variable;
          # otherwise it defaults to <owner>/common.
          repository: ${{ vars.COMMON_REPO || format('{0}/common', github.repository_owner) }}
          # 2026-05-15: GITHUB_TOKEN is scoped to THIS repo only and 404s
          # on private sibling repos in the same org. REPO_ACCESS_TOKEN
          # is a fine-grained PAT with read access to
          # InstaNode-dev/{common,proto}. Set via
          # `gh secret set REPO_ACCESS_TOKEN --repo InstaNode-dev/<name>`.
          token: ${{ secrets.REPO_ACCESS_TOKEN }}
          path: common
          # actions/checkout stores the token in the clone's .git/config by
          # default. That would leave the PAT on disk inside common/, which is
          # part of the Docker build context (workspace root). No later step
          # needs git credentials, so do not persist them.
          persist-credentials: false

      - name: Checkout proto sibling into ./proto
        uses: actions/checkout@v4
        with:
          # Repo is overridable through the PROTO_REPO repository variable;
          # otherwise it defaults to <owner>/proto.
          repository: ${{ vars.PROTO_REPO || format('{0}/proto', github.repository_owner) }}
          # PAT with read access to the private sibling repo; the per-job
          # GITHUB_TOKEN only covers this repository.
          token: ${{ secrets.REPO_ACCESS_TOKEN }}
          path: proto
          # actions/checkout stores the token in the clone's .git/config by
          # default. That would leave the PAT on disk inside proto/, which is
          # part of the Docker build context (workspace root). No later step
          # needs git credentials, so do not persist them.
          persist-credentials: false

      - name: Compute build metadata
        # Exposes three step outputs consumed by later steps:
        #   short_sha  — first 7 characters of GITHUB_SHA
        #   build_time — current UTC time, ISO-8601 with a trailing Z
        #   version    — image tag, "master-<short_sha>"
        id: meta
        run: |
          sha7="${GITHUB_SHA:0:7}"
          stamp="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
          tag="master-${sha7}"
          {
            echo "short_sha=${sha7}"
            echo "build_time=${stamp}"
            echo "version=${tag}"
          } >> "$GITHUB_OUTPUT"
          echo "Built ${tag} (${stamp})"

      - name: Set up Go (for unit tests + go.mod replace directives)
        uses: actions/setup-go@v5
        with:
          # Quoted so YAML keeps it a string rather than a float.
          go-version: '1.25'
          # setup-go keys its module/build cache on go.sum and looks for it in
          # the workspace root by default. This repo is checked out into
          # ./api, so point the cache at the module's real location; without
          # this the cache restore is skipped on every run.
          # NOTE(review): assumes api/go.sum is committed next to api/go.mod.
          cache-dependency-path: api/go.sum

      - name: Stage sibling repos for go.mod replace (../common, ../proto)
        # Nothing is moved here. api/go.mod carries
        # `replace instant.dev/common => ../common` and
        # `replace instant.dev/proto => ../proto`; with `go test` running from
        # ./api those relative paths land on ./common and ./proto in the
        # workspace root, which is where the checkouts already put them. The
        # listing below is only a diagnostic of the workspace layout.
        run: ls -la .

      - name: Apply DB migrations to the test database
        # 2026-05-16: before this step CI ran tests against a BARE Postgres
        # whose schema came ONLY from testhelpers.runMigrations — a
        # hand-maintained mirror of the prod schema. Every migration that
        # added a table/column without a matching mirror edit silently broke
        # this gate (email_events, pending_deletions, deployment_events,
        # deployments.private, …). This step applies the REAL migration
        # files, exactly like `make test-db-up` does locally, so CI runs
        # against the same schema developers do. runMigrations still runs
        # (all IF NOT EXISTS) as a harmless backstop. The TestRunMigrations-
        # MirrorsEveryMigrationTable guard keeps the mirror itself honest.
        #
        # Failure handling: psql exits 0 after a failing statement in a `-f`
        # script unless ON_ERROR_STOP is set, so every file is applied with
        # `-v ON_ERROR_STOP=1`. A broken migration now fails this step instead
        # of leaving a partial schema for the tests to trip over. The file
        # list comes from a shell glob (expanded in sorted order) rather than
        # from parsing `ls`, and an empty match is an explicit error.
        env:
          PGPASSWORD: postgres
        run: |
          shopt -s nullglob
          migrations=(api/internal/db/migrations/*.sql)
          if [ "${#migrations[@]}" -eq 0 ]; then
            echo "::error::no migration files found under api/internal/db/migrations"
            exit 1
          fi
          for f in "${migrations[@]}"; do
            echo "→ applying $(basename "$f")"
            psql -v ON_ERROR_STOP=1 -h localhost -U postgres -d instant_dev_test -f "$f" >/dev/null
          done
          echo "all migrations applied to instant_dev_test"
          # The db provider's local backend (internal/providers/db/local.go)
          # CREATEs a customer database per /db/new. In tests it connects to
          # TEST_POSTGRES_CUSTOMERS_URL — which testhelpers defaults to a
          # localhost:5434 instance that does NOT exist on the CI runner, so
          # every postgres provision (TestDBNew_*, TestBulkTwin_*) 503'd.
          # Create that database on the same service container; the test step
          # points the env var at it. It needs no migrations — it is only the
          # admin connection target for CREATE DATABASE / CREATE USER.
          psql -h localhost -U postgres -d postgres -c "CREATE DATABASE instant_customers" >/dev/null
          echo "created instant_customers (db-provider admin target)"

      - name: Run unit tests (short, no integration deps)
        # Runs the whole `./...` suite from inside ./api.
        #
        # No -skip list, on purpose (2026-05-16). The old one
        # (TestOpenAPI_CoversAllRegisteredRoutes | TestCrossTeam_ |
        # TestCustomDomainCreate_) was dropped after the real causes were
        # fixed: a stale internal-route whitelist in the OpenAPI test, a
        # second DB that TestCrossTeam_ never actually needed, and a stale
        # 5-column sqlmock row in TestCustomDomainCreate_. The full suite
        # passes — keep it that way; do not re-add a -skip list.
        #
        # `-p 1` is load-bearing. All packages share the one instant_dev_test
        # DB and redis/15. Default parallelism launches ~25 package binaries
        # at once and they trample each other's state mid-test (a handler
        # test CREATEs a real DB while a models test TRUNCATEs; one package
        # FLUSHes another's rate-limit counter; …). The Makefile's
        # `test-unit` target avoids this by running per package; `-p 1`
        # serialises package execution for the same effect in one invocation.
        working-directory: api
        env:
          # Explicit URLs matching the service containers. The testhelpers
          # defaults for DB and Redis point at the same localhost ports today;
          # pinning them here survives any future default-URL drift.
          TEST_REDIS_URL: redis://localhost:6379/15
          TEST_DATABASE_URL: postgres://postgres:postgres@localhost:5432/instant_dev_test?sslmode=disable
          # Admin target for the db provider (database created by the
          # migrations step). The default is an unreachable localhost:5434,
          # which makes every postgres-provisioning test fail with 503.
          TEST_POSTGRES_CUSTOMERS_URL: postgres://postgres:postgres@localhost:5432/instant_customers?sslmode=disable
        run: go test -p 1 -count=1 -short ./...

      # Provides the `docker buildx` builder used by the image build step.
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to GHCR
        # Authenticates with a PAT, not GITHUB_TOKEN. 2026-05-17: the per-job
        # GITHUB_TOKEN (even with packages: write) is scoped to THIS repo and
        # may not push the org-owned package
        # ghcr.io/instanode-dev/instant-api — every push 403'd.
        # GHCR_PUSH_TOKEN is a classic PAT with write:packages, owned by a
        # user with write access to that package. See task #121.
        uses: docker/login-action@v3
        with:
          password: ${{ secrets.GHCR_PUSH_TOKEN }}
          username: ${{ github.actor }}
          registry: ghcr.io

      - name: Build and push image
        # The build context is the workspace root so that the Dockerfile's
        # `COPY proto/`, `COPY common/` and `COPY api/` all resolve. Two tags
        # are pushed: the per-commit version and `latest`.
        env:
          # Build metadata from the `meta` step, handed to the shell as
          # environment variables instead of being templated into the script.
          IMAGE_GIT_SHA: ${{ steps.meta.outputs.short_sha }}
          IMAGE_BUILD_TIME: ${{ steps.meta.outputs.build_time }}
          IMAGE_VERSION: ${{ steps.meta.outputs.version }}
        run: |
          docker buildx build \
            --file api/Dockerfile \
            --platform linux/amd64 \
            --build-arg "GIT_SHA=${IMAGE_GIT_SHA}" \
            --build-arg "BUILD_TIME=${IMAGE_BUILD_TIME}" \
            --build-arg "VERSION=${IMAGE_VERSION}" \
            --tag "${IMAGE_REPO}:${IMAGE_VERSION}" \
            --tag "${IMAGE_REPO}:latest" \
            --push \
            .

      - name: Set up kubectl
        # Installs the kubectl client used by the rollout and verify steps.
        # NOTE(review): `latest` floats with upstream releases; consider
        # pinning to a version matching the cluster if skew ever bites.
        uses: azure/setup-kubectl@v3
        with:
          version: latest

      - name: Configure kubeconfig from KUBECONFIG_B64 secret
        # Decodes the base64 kubeconfig secret into ~/.kube/config (mode 600)
        # and fails fast with an annotated error when the secret is missing.
        env:
          KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
        run: |
          [ -n "${KUBECONFIG_B64}" ] || {
            echo "::error::KUBECONFIG_B64 repo secret is not set. Add it under Settings → Secrets → Actions."
            exit 1
          }
          kube_dir="$HOME/.kube"
          mkdir -p "${kube_dir}"
          base64 -d <<< "${KUBECONFIG_B64}" > "${kube_dir}/config"
          chmod 600 "${kube_dir}/config"
          # Client-only check: confirms the binary runs without touching the cluster.
          kubectl version --client=true

      - name: Roll out new image
        # Points the deployment's container at the freshly pushed tag, then
        # blocks until the rollout completes or 180s pass.
        env:
          IMAGE_VERSION: ${{ steps.meta.outputs.version }}
        run: |
          target="deployment/${K8S_DEPLOYMENT}"
          image="${IMAGE_REPO}:${IMAGE_VERSION}"
          echo "Setting ${K8S_DEPLOYMENT}.${K8S_CONTAINER} to ${image}"
          kubectl -n "${K8S_NAMESPACE}" set image "${target}" "${K8S_CONTAINER}=${image}"
          kubectl -n "${K8S_NAMESPACE}" rollout status "${target}" --timeout=180s

      - name: Verify rolled-out image tag matches built version
        # Reads the image from the deployment spec for the named container
        # and fails the job if it is not exactly the tag built in this run.
        env:
          IMAGE_VERSION: ${{ steps.meta.outputs.version }}
        run: |
          query="{.spec.template.spec.containers[?(@.name=='${K8S_CONTAINER}')].image}"
          live="$(kubectl get deployment "${K8S_DEPLOYMENT}" -n "${K8S_NAMESPACE}" -o jsonpath="${query}")"
          want="${IMAGE_REPO}:${IMAGE_VERSION}"
          echo "Live image: ${live}"
          echo "Expected:   ${want}"
          if [ "${live}" = "${want}" ]; then
            exit 0
          fi
          echo "::error::Rolled image (${live}) != expected (${want})"
          exit 1

      - name: Curl live /healthz and confirm new SHA is reported
        # Polls the public health endpoint up to six times, five seconds
        # apart (~30s of waiting), until the response body contains the short
        # SHA of this build. A failed request counts as an empty body.
        env:
          EXPECTED_SHA: ${{ steps.meta.outputs.short_sha }}
        run: |
          attempt=1
          while [ "${attempt}" -le 6 ]; do
            body="$(curl -fsSL --max-time 5 "${HEALTHZ_URL}" || true)"
            echo "Attempt ${attempt}: ${body}"
            if grep -q "${EXPECTED_SHA}" <<< "${body}"; then
              echo "Confirmed live /healthz reports commit_id=${EXPECTED_SHA}"
              exit 0
            fi
            sleep 5
            attempt=$((attempt + 1))
          done
          echo "::error::live /healthz never reported commit_id=${EXPECTED_SHA}"
          exit 1