diff --git a/controllers/sandbox/sandbox.go b/controllers/sandbox/sandbox.go index 51c14def..be98d890 100644 --- a/controllers/sandbox/sandbox.go +++ b/controllers/sandbox/sandbox.go @@ -399,13 +399,39 @@ func (c *SandboxController) reconcileSandboxesOnBoot(ctx context.Context) error } } - // Re-register with token refresher so tokens keep getting renewed + // Re-register with token refresher so tokens keep getting renewed. Only + // sandboxes that actually use workload identity have an identity-token file; + // skip the rest so the refresh loop doesn't spew write errors for sandboxes + // that predate workload identity. if c.tokenRefresher != nil { - appName := c.resolveAppName(ctx, &sb) tokenPath := c.sandboxPath(&sb, "identity-token") - c.tokenRefresher.register(sb.ID.String(), tokenPath, appName) - c.Log.Debug("re-registered sandbox for token refresh", - "sandbox_id", sb.ID, "app", appName) + if _, err := os.Stat(tokenPath); err == nil { + appName := c.resolveAppName(ctx, &sb) + c.tokenRefresher.register(sb.ID.String(), tokenPath, appName) + c.Log.Debug("re-registered sandbox for token refresh", + "sandbox_id", sb.ID, "app", appName) + } + } + + // Re-register the token-request secret so the still-running sandbox keeps + // authenticating to the token server. The in-memory registry starts empty + // after a restart; without this the sandbox's token requests 403 forever + // until it is restarted (MIR-1235). 
+ if c.tokenSecrets != nil { + secretPath := c.sandboxPath(&sb, tokenSecretFilename) + secret, ok, err := loadTokenSecret(secretPath) + switch { + case err != nil: + c.Log.Warn("failed to load persisted token secret during boot reconciliation", + "sandbox_id", sb.ID, "error", err) + case !ok: + c.Log.Debug("no persisted token secret for surviving sandbox; cannot re-register", + "sandbox_id", sb.ID) + default: + c.tokenSecrets.register(sb.ID.String(), secret) + c.Log.Debug("re-registered token secret for surviving sandbox", + "sandbox_id", sb.ID) + } } } } @@ -2137,8 +2163,16 @@ func (c *SandboxController) buildSubContainerSpec( if secretErr != nil { c.Log.Warn("failed to generate token request secret", "sandbox", sb.ID, "error", secretErr) } else { - c.tokenSecrets.register(ep.Addresses[0].Addr().String(), sb.ID.String(), secret) + c.tokenSecrets.register(sb.ID.String(), secret) envVars = append(envVars, fmt.Sprintf("MIREN_IDENTITY_TOKEN_SECRET=%s", secret)) + + // Persist the secret host-side so it can be re-registered after a + // controller/token-server restart. Without this the running sandbox's + // token requests 403 forever once the in-memory registry is lost. + secretPath := c.sandboxPath(sb, tokenSecretFilename) + if writeErr := writeTokenSecret(secretPath, secret); writeErr != nil { + c.Log.Warn("failed to persist token request secret", "sandbox", sb.ID, "error", writeErr) + } } } } @@ -2433,6 +2467,13 @@ func (c *SandboxController) Delete(ctx context.Context, id entity.Id, sb *comput } if c.tokenSecrets != nil { c.tokenSecrets.unregister(id.String()) + // Best-effort removal of the persisted secret. StopSandbox also wipes the whole + // sandbox dir, but removing the sensitive secret here ensures it doesn't linger + // if StopSandbox errors out before reaching its dir cleanup. 
+ secretPath := filepath.Join(c.Tempdir, "containerd", id.PathSafe(), tokenSecretFilename) + if err := os.Remove(secretPath); err != nil && !os.IsNotExist(err) { + c.Log.Warn("failed to remove persisted token secret", "sandbox", id, "error", err) + } } if sb != nil { c.UnconfigureFirewall(sb) diff --git a/controllers/sandbox/sandbox_frozen_test.go b/controllers/sandbox/sandbox_frozen_test.go index 27321742..bb63be4a 100644 --- a/controllers/sandbox/sandbox_frozen_test.go +++ b/controllers/sandbox/sandbox_frozen_test.go @@ -24,7 +24,7 @@ import ( // sha256sum controllers/sandbox/sandbox.go controllers/sandbox/volume.go controllers/sandbox/firewall.go func TestSandboxControllerFrozen(t *testing.T) { frozen := map[string]string{ - "sandbox.go": "9fbee5834397f3600e9706fbe78fad45e69b0fa7bc5908afb3d887ffe8fa3ef7", + "sandbox.go": "2cb139828e42cae2a41459c3e8a699dfdcd60e9e30d191abcbc4a2c432497c9e", "volume.go": "b4697764d48a90adc04ce47968ccef11ceba50da8d19c889906c5c3a539065b3", "firewall.go": "648cb5d91091d5eb7400152b19695a8045585feae59c5dd36c12d663a27bb91f", } diff --git a/controllers/sandbox/token_server.go b/controllers/sandbox/token_server.go index ecc66e3d..34463d2e 100644 --- a/controllers/sandbox/token_server.go +++ b/controllers/sandbox/token_server.go @@ -9,6 +9,7 @@ import ( "fmt" "net" "net/http" + "os" "strconv" "strings" "sync" @@ -19,6 +20,11 @@ import ( const tokenServerPort = 7123 +// tokenSecretFilename is the host-side file (under the sandbox's data dir) where a +// sandbox's token-request secret is persisted so it can be re-registered with the +// in-memory tokenSecretRegistry after a controller/token-server restart. +const tokenSecretFilename = "token-secret" + type tokenResponse struct { Value string `json:"value"` } @@ -27,39 +33,37 @@ type tokenErrorResponse struct { Error string `json:"error"` } +// tokenSecretRegistry maps a sandbox's identity to its token-request secret. 
Keying by +// sandbox identity (rather than raw source IP) means a recycled pod IP can never match a +// stale secret left behind by a previous sandbox: the caller's identity is resolved from +// the IP via the authoritative netdb lookup, and the secret is checked against that. type tokenSecretRegistry struct { mu sync.RWMutex - byAddr map[string]string // IP → secret - bySandbox map[string]string // sandboxID → IP (for cleanup) + bySandbox map[string]string // sandboxID → secret } func newTokenSecretRegistry() *tokenSecretRegistry { return &tokenSecretRegistry{ - byAddr: make(map[string]string), bySandbox: make(map[string]string), } } -func (r *tokenSecretRegistry) register(ip, sandboxID, secret string) { +func (r *tokenSecretRegistry) register(sandboxID, secret string) { r.mu.Lock() defer r.mu.Unlock() - r.byAddr[ip] = secret - r.bySandbox[sandboxID] = ip + r.bySandbox[sandboxID] = secret } func (r *tokenSecretRegistry) unregister(sandboxID string) { r.mu.Lock() defer r.mu.Unlock() - if ip, ok := r.bySandbox[sandboxID]; ok { - delete(r.byAddr, ip) - delete(r.bySandbox, sandboxID) - } + delete(r.bySandbox, sandboxID) } -func (r *tokenSecretRegistry) verify(ip, secret string) bool { +func (r *tokenSecretRegistry) verify(sandboxID, secret string) bool { r.mu.RLock() defer r.mu.RUnlock() - expected, ok := r.byAddr[ip] + expected, ok := r.bySandbox[sandboxID] if !ok { return false } @@ -74,6 +78,30 @@ func generateTokenSecret() (string, error) { return hex.EncodeToString(b), nil } +// writeTokenSecret persists a sandbox's token-request secret host-side at 0600. It is +// never bind-mounted into the container (the container receives the secret via the +// MIREN_IDENTITY_TOKEN_SECRET env var); persisting it lets the controller re-register +// the same secret after a restart so the still-running sandbox keeps authenticating. 
+func writeTokenSecret(path, secret string) error { + return atomicWriteFile(path, []byte(secret), 0600) +} + +// loadTokenSecret reads a persisted token-request secret. It returns ok=false (with a nil +// error) when no secret file exists — e.g. a sandbox started before secret persistence was +// added — so callers can skip re-registration without treating absence as a failure. +func loadTokenSecret(path string) (secret string, ok bool, err error) { + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return "", false, nil + } + return "", false, err + } + // Tolerate a trailing newline so a secret written by a text editor or + // fmt.Fprintln still matches the in-process env value. + return strings.TrimRight(string(data), "\r\n"), true, nil +} + func (c *SandboxController) startTokenServer(ctx context.Context) { listenAddr := fmt.Sprintf("%s:%d", c.Subnet.Router().Addr(), tokenServerPort) @@ -125,17 +153,17 @@ func (c *SandboxController) handleTokenRequest(w http.ResponseWriter, r *http.Re } bearerToken := strings.TrimPrefix(authHeader, "Bearer ") - if c.tokenSecrets == nil || !c.tokenSecrets.verify(remoteHost, bearerToken) { - writeTokenError(w, http.StatusForbidden, "invalid token") - return - } - sandboxID, appName, ok := c.NetServ.LookupSandboxByIP(remoteHost) if !ok { writeTokenError(w, http.StatusForbidden, "unknown source address") return } + if c.tokenSecrets == nil || !c.tokenSecrets.verify(sandboxID, bearerToken) { + writeTokenError(w, http.StatusForbidden, "invalid token") + return + } + opts := workloadidentity.TokenOptions{} if auds := r.URL.Query()["audience"]; len(auds) > 0 { diff --git a/controllers/sandbox/token_server_test.go b/controllers/sandbox/token_server_test.go index 7a6369dc..b3c7e9e4 100644 --- a/controllers/sandbox/token_server_test.go +++ b/controllers/sandbox/token_server_test.go @@ -5,6 +5,8 @@ import ( "log/slog" "net/http" "net/http/httptest" + "os" + "path/filepath" "testing" 
"github.com/golang-jwt/jwt/v5" @@ -39,7 +41,7 @@ func newTestTokenController(t *testing.T) *SandboxController { }) secrets := newTokenSecretRegistry() - secrets.register(testSandboxIP, testSandboxID, testSecret) + secrets.register(testSandboxID, testSecret) return &SandboxController{ Log: log, @@ -183,3 +185,84 @@ func TestTokenServer_InvalidTTL(t *testing.T) { assert.Equal(t, http.StatusBadRequest, w.Code) } + +// TestTokenSecretRegistry_KeyedBySandboxIdentity pins the property behind keying the +// registry by sandbox identity rather than raw IP: a secret is bound to one sandbox and +// cannot authenticate a different sandbox (e.g. one that later reused a recycled pod IP). +func TestTokenSecretRegistry_KeyedBySandboxIdentity(t *testing.T) { + r := newTokenSecretRegistry() + r.register("sandbox/old", "secret-old") + + assert.True(t, r.verify("sandbox/old", "secret-old")) + assert.False(t, r.verify("sandbox/new", "secret-old")) + + r.unregister("sandbox/old") + assert.False(t, r.verify("sandbox/old", "secret-old")) +} + +func TestWriteLoadTokenSecret_RoundTrip(t *testing.T) { + path := filepath.Join(t.TempDir(), tokenSecretFilename) + + secret, err := generateTokenSecret() + require.NoError(t, err) + + require.NoError(t, writeTokenSecret(path, secret)) + + got, ok, err := loadTokenSecret(path) + require.NoError(t, err) + assert.True(t, ok) + assert.Equal(t, secret, got) + + info, err := os.Stat(path) + require.NoError(t, err) + assert.Equal(t, os.FileMode(0600), info.Mode().Perm()) +} + +func TestLoadTokenSecret_TrimsTrailingNewline(t *testing.T) { + path := filepath.Join(t.TempDir(), tokenSecretFilename) + require.NoError(t, os.WriteFile(path, []byte("deadbeef\n"), 0600)) + + got, ok, err := loadTokenSecret(path) + require.NoError(t, err) + assert.True(t, ok) + assert.Equal(t, "deadbeef", got) +} + +func TestLoadTokenSecret_Missing(t *testing.T) { + got, ok, err := loadTokenSecret(filepath.Join(t.TempDir(), tokenSecretFilename)) + + require.NoError(t, err) + 
assert.False(t, ok) + assert.Empty(t, got) +} + +// TestTokenServer_RecoversSecretAfterRestart reproduces MIR-1235: a still-running sandbox +// 403s after the controller/token-server restarts and the in-memory registry is lost, then +// recovers once the persisted secret is reloaded and re-registered for the sandbox — +// without restarting the sandbox. +func TestTokenServer_RecoversSecretAfterRestart(t *testing.T) { + c := newTestTokenController(t) + + // Simulate a controller/token-server restart: the registry is recreated empty. + c.tokenSecrets = newTokenSecretRegistry() + + w := httptest.NewRecorder() + c.handleTokenRequest(w, authedRequest("GET", "/v1/token")) + require.Equal(t, http.StatusForbidden, w.Code) + + // On start the secret was persisted host-side; boot reconcile reloads it and + // re-registers it under the sandbox identity. We use a plain t.TempDir() rather + // than c.sandboxPath(&sb, tokenSecretFilename) because this test exercises the + // load+register handoff in isolation; sandboxPath construction is covered elsewhere. + path := filepath.Join(t.TempDir(), tokenSecretFilename) + require.NoError(t, writeTokenSecret(path, testSecret)) + + secret, ok, err := loadTokenSecret(path) + require.NoError(t, err) + require.True(t, ok) + c.tokenSecrets.register(testSandboxID, secret) + + w = httptest.NewRecorder() + c.handleTokenRequest(w, authedRequest("GET", "/v1/token")) + assert.Equal(t, http.StatusOK, w.Code) +} diff --git a/docs/docs/changelog.md b/docs/docs/changelog.md index d9c37ab0..42632c42 100644 --- a/docs/docs/changelog.md +++ b/docs/docs/changelog.md @@ -13,6 +13,27 @@ All notable changes to Miren Runtime will be documented in this file. 
--- +## v0.10.0 +*2026-06-09* + +**Features** +- **Workload identity tokens for sandboxes** - Every sandbox now receives a signed OIDC workload identity token (GitHub Actions-style) at `/var/run/miren/identity-token`, with `MIREN_IDENTITY_TOKEN_PATH`, `MIREN_OIDC_ISSUER_URL`, and `MIREN_IDENTITY_TOKEN_URL` injected into the environment. Your cluster publishes standard `/.well-known/openid-configuration` and JWKS endpoints, so external systems like AWS STS can verify tokens and federate access — no long-lived cloud credentials baked into your app. Tokens default to RS256 (universally supported by federation verifiers), auto-refresh on a background loop, and an on-demand endpoint lets a sandbox request tokens with a custom audience or TTL. Works on both embedded and distributed runners. ([#834](https://github.com/mirendev/runtime/pull/834), [#846](https://github.com/mirendev/runtime/pull/846), [#852](https://github.com/mirendev/runtime/pull/852)) +- **Admin API is now GA** - The admin API graduates out of Miren Labs and is always on — no more `--labs adminapi` flag needed. Expose and call admin methods on your app over JSON-RPC; see the [admin interface docs](https://miren.md/admin-interface) for the security model, auditing, and per-language examples. ([#832](https://github.com/mirendev/runtime/pull/832)) +- **Automatic TLS for cloud-provisioned cluster hostnames** - When a cluster has a cloud-provisioned `*.miren.systems` hostname, Miren now provisions a real ACME certificate for it on startup instead of serving the self-signed fallback. The hostname is pinned in the allowed-hosts set so route deletions can't strip its cert. ([#836](https://github.com/mirendev/runtime/pull/836)) + +**Improvements** +- **Cleaner `miren logs` output** - Structured JSON log lines from your app are now parsed at ingress: the message becomes the log body, `time`/`level` noise is stripped, and your own fields are promoted to first-class attributes. 
Internal bookkeeping is namespaced under `miren.*` and hidden from text output, and log brackets show the real short ID (e.g. `[CBZ]`) instead of a truncated entity key. Existing `--service` / `sandbox` filters keep working against both old and new entries. ([#838](https://github.com/mirendev/runtime/pull/838)) +- **`logs -f` collapses repeated lines** - When following logs, consecutive lines that differ only by their timestamp (e.g. a once-a-second health ping) now collapse into a single live-updated `[ Repeated 14x over 14s ]` summary instead of burying new output. Only engages for interactive text follow — JSON, piped, and non-follow output stay verbatim, so `grep` and machine consumers are unaffected. ([#845](https://github.com/mirendev/runtime/pull/845)) + +**Bug Fixes** +- **Fixed TLS failures reaching recreated distributed runners** - `miren app run` and `miren sandbox exec` against a distributed runner could fail with `certificate is valid for ... not <new-ip>` after the runner VM was recreated with a new internal IP but a persisted certificate. Runners now detect a stale certificate on start and re-issue it from the coordinator, self-healing on the next restart. ([#848](https://github.com/mirendev/runtime/pull/848)) + +**Documentation** +- Expanded the [admin interface](https://miren.md/admin-interface) page with the full security model, auditing behavior, JSON-RPC shape, and CLI usage. ([#831](https://github.com/mirendev/runtime/pull/831)) +- Expanded the [terminology](https://miren.md/terminology) page from 12 to 35 canonical definitions. ([#850](https://github.com/mirendev/runtime/pull/850)) + +--- + ## v0.9.1 *2026-06-04*