Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions graceful_shutdown_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,31 @@ func TestRunServerWithGracefulShutdown_TimeoutKillsStuckRequest(t *testing.T) {
}
}

// TestRunServerWithGracefulShutdown_ListenErrorReturnsBeforeSignal — when
// app.Listen fails fast (a bind error: malformed addr, port already held),
// the serve goroutine pushes the fatal error onto serveErr and the helper
// MUST return it via the serveErr-before-signal select arm, NOT block waiting
// for SIGTERM. This is the "pod CrashLoopBackoffs instead of going green with
// no listener" contract.
func TestRunServerWithGracefulShutdown_ListenErrorReturnsBeforeSignal(t *testing.T) {
app := fiber.New(fiber.Config{DisableStartupMessage: true})
// A syntactically invalid bind address makes net.Listen fail immediately
// with a non-ErrClosed error, so the goroutine takes the serveErr<-err arm.
badAddr := "256.256.256.256:99999"

done := make(chan error, 1)
go func() {
done <- runServerWithGracefulShutdown(app, badAddr, time.Second, router.ShutdownHooks{})
}()

select {
case err := <-done:
require.Error(t, err, "a fatal Listen error must propagate out of the helper")
case <-time.After(5 * time.Second):
t.Fatal("helper blocked on a bind failure — the serveErr-before-signal arm is broken")
}
}

// Compile-time guard against a regression that removes the helper or changes
// its signature in a way that would silently bypass the MR-P0-7 fix.
var _ = func(app *fiber.App) error {
Expand Down
20 changes: 18 additions & 2 deletions internal/telemetry/tracer.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

"go.opentelemetry.io/otel"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/propagation"
"go.opentelemetry.io/otel/sdk/resource"
Expand All @@ -18,6 +19,21 @@ import (
"google.golang.org/grpc/credentials"
)

// newExporter / newResource are package-level indirections over the OTel
// constructors so tests can override them to drive the otherwise-unreachable
// constructor-failure arms in InitTracer. Production behaviour is identical:
// they are plain forwards to otlptracegrpc.New / resource.New. Do NOT change
// the wired constructors (P0-2 OTel tracing contract) — these seams only let
// a test substitute an erroring stub.
var (
newExporter = func(ctx context.Context, opts ...otlptracegrpc.Option) (*otlptrace.Exporter, error) {
return otlptracegrpc.New(ctx, opts...)
}
newResource = func(ctx context.Context, opts ...resource.Option) (*resource.Resource, error) {
return resource.New(ctx, opts...)
}
)

// InitTracer configures the global OpenTelemetry tracer provider.
//
// Endpoint selection (in order of precedence):
Expand Down Expand Up @@ -90,13 +106,13 @@ func InitTracer(serviceName, otlpEndpoint string) func(context.Context) error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()

exporter, err := otlptracegrpc.New(ctx, opts...)
exporter, err := newExporter(ctx, opts...)
if err != nil {
slog.Error("telemetry.otlp_exporter_failed", "error", err, "endpoint", ep, "tls", useTLS)
return func(context.Context) error { return nil }
}

res, err := resource.New(ctx,
res, err := newResource(ctx,
resource.WithAttributes(semconv.ServiceName(serviceName)),
)
if err != nil {
Expand Down
61 changes: 61 additions & 0 deletions internal/telemetry/tracer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,70 @@ package telemetry

import (
"context"
"errors"
"testing"

"go.opentelemetry.io/otel/exporters/otlp/otlptrace"
"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc"
"go.opentelemetry.io/otel/sdk/resource"
)

// withExporterStub temporarily replaces the package-level newExporter seam,
// restoring it on cleanup. Lets a test drive the otlptracegrpc.New failure
// arm without touching production wiring.
func withExporterStub(t *testing.T, fn func(context.Context, ...otlptracegrpc.Option) (*otlptrace.Exporter, error)) {
t.Helper()
prev := newExporter
t.Cleanup(func() { newExporter = prev })
newExporter = fn
}

// withResourceStub temporarily replaces the package-level newResource seam.
func withResourceStub(t *testing.T, fn func(context.Context, ...resource.Option) (*resource.Resource, error)) {
t.Helper()
prev := newResource
t.Cleanup(func() { newResource = prev })
newResource = fn
}

// TestInitTracer_ExporterConstructionFails — when otlptracegrpc.New errors
// (network stack misconfig, bad creds object, etc.), InitTracer MUST log and
// return a working no-op shutdown rather than crash. This is the fail-open
// contract: a broken exporter can never block service boot.
func TestInitTracer_ExporterConstructionFails(t *testing.T) {
t.Setenv("NEW_RELIC_LICENSE_KEY", "")
withExporterStub(t, func(context.Context, ...otlptracegrpc.Option) (*otlptrace.Exporter, error) {
return nil, errors.New("boom: exporter construction failed")
})

shutdown := InitTracer("instant-api", "https://otlp.nr-data.net:4317")
if shutdown == nil {
t.Fatal("InitTracer must return a non-nil no-op shutdown when the exporter fails")
}
if err := shutdown(context.Background()); err != nil {
t.Fatalf("no-op shutdown after exporter failure must return nil, got %v", err)
}
}

// TestInitTracer_ResourceConstructionFails — when resource.New errors,
// InitTracer MUST shut down the already-built exporter and return a working
// no-op shutdown. Same fail-open contract as the exporter arm.
func TestInitTracer_ResourceConstructionFails(t *testing.T) {
t.Setenv("NEW_RELIC_LICENSE_KEY", "")
// Real exporter constructs fine (lazy dial); force the resource arm.
withResourceStub(t, func(context.Context, ...resource.Option) (*resource.Resource, error) {
return nil, errors.New("boom: resource construction failed")
})

shutdown := InitTracer("instant-api", "http://localhost:4317")
if shutdown == nil {
t.Fatal("InitTracer must return a non-nil no-op shutdown when resource.New fails")
}
if err := shutdown(context.Background()); err != nil {
t.Fatalf("no-op shutdown after resource failure must return nil, got %v", err)
}
}

// TestInitTracer_EmptyEndpointNoop — when the endpoint is unset, the
// returned shutdown must be a working no-op. This is the fail-open
// contract for local dev / CI runs where OTel is intentionally off.
Expand Down
70 changes: 56 additions & 14 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,53 @@ import (
// transaction all join cleanly in queries.
const serviceName = "api"

// External boundaries are routed through package-level function variables so
// the run() seam can be exercised end-to-end (boot → ready → teardown, plus
// every failure arm) in a unit test without a real Postgres, Redis, GeoIP
// volume, or a bound TCP listener. In production every var holds its real
// implementation, so behaviour is byte-for-byte identical to inlining the
// call — this is a test seam, not a behaviour change. Do NOT change what the
// production defaults point at (notably telemetry.InitTracer — P0-2 OTel
// tracing contract); only override them in tests.
var (
initTracer = telemetry.InitTracer
connectPostgres = db.ConnectPostgres
runMigrations = db.RunMigrations
startPoolStatsExporter = db.StartPoolStatsExporter
connectRedis = db.ConnectRedis
loadGeoLite2 = middleware.LoadGeoLite2
newProvisionerClient = provisioner.NewClient
newRouterWithHooks = router.NewWithHooks
serveFunc = runServerWithGracefulShutdown

// runFunc / osExit are seams so main() — the one statement that calls
// os.Exit and thus can't run in-process under `go test` — is exercised
// with a stubbed exit. In production runFunc == run and osExit ==
// os.Exit, so behaviour is identical.
runFunc = run
osExit = os.Exit
)

func main() {
if err := runFunc(); err != nil {
slog.Error("server.fatal", "error", err)
osExit(1)
}
}

// run is the extracted body of main(). It returns an error instead of
// calling os.Exit so it can be driven from a unit test; main() is the only
// production caller and turns a non-nil error into os.Exit(1). The boot
// ordering, defers, and fail-open contracts are identical to the previous
// inline main() — every external call goes through a package-level seam var
// (defaulting to the real implementation) purely so a test can substitute a
// stub. A nil return is a clean SIGTERM-triggered graceful shutdown.
func run() (runErr error) {
// Structured JSON logging — wrapped in logctx.Handler so every record
// is decorated with service, commit_id, trace_id, team_id, tid.
//
// AddSource gives file:line of the slog call site (caller field in
// the design doc). Done before any other slog call in main so even
// the design doc). Done before any other slog call in run so even
// telemetry init failures land enriched.
base := slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
Level: slog.LevelInfo,
Expand All @@ -59,7 +100,7 @@ func main() {
// contain the prefix value anyway (the prefix is unread at this point).
slog.SetDefault(slog.New(ctxH))

shutdownTracer := telemetry.InitTracer("instant-api", os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT"))
shutdownTracer := initTracer("instant-api", os.Getenv("OTEL_EXPORTER_OTLP_ENDPOINT"))
defer func() {
if err := shutdownTracer(context.Background()); err != nil {
slog.Error("telemetry.shutdown_failed", "error", err)
Expand All @@ -85,12 +126,12 @@ func main() {
// admin routes are disabled.
slog.SetDefault(slog.New(middleware.NewLogScrubber(ctxH, cfg.AdminPathPrefix)))

database := db.ConnectPostgres(cfg.DatabaseURL)
database := connectPostgres(cfg.DatabaseURL)
defer database.Close()

if err := db.RunMigrations(database); err != nil {
if err := runMigrations(database); err != nil {
slog.Error("main.migrations_failed", "error", err)
os.Exit(1)
return fmt.Errorf("migrations: %w", err)
}

// Pool-saturation observability (Wave-3 chaos verify, 2026-05-21).
Expand All @@ -103,7 +144,7 @@ func main() {
// is cancelled at shutdown (see Phase A/B handlers below).
poolStatsCtx, poolStatsCancel := context.WithCancel(context.Background())
defer poolStatsCancel()
go db.StartPoolStatsExporter(poolStatsCtx, database, "platform_db")
go startPoolStatsExporter(poolStatsCtx, database, "platform_db")

// Deploy-audit self-report. Idempotent on (service, commit_id,
// image_digest) — every pod startup of the same image is a no-op
Expand All @@ -113,10 +154,10 @@ func main() {
// must not stop the server from listening.
emitDeployAuditSelfReport(database)

rdb := db.ConnectRedis(cfg.RedisURL)
rdb := connectRedis(cfg.RedisURL)
defer rdb.Close()

geoDbs := middleware.LoadGeoLite2(cfg.GeoLite2DBPath)
geoDbs := loadGeoLite2(cfg.GeoLite2DBPath)
if geoDbs != nil && geoDbs.City != nil {
defer geoDbs.City.Close()
}
Expand Down Expand Up @@ -157,24 +198,24 @@ func main() {
// so a misconfigured prod pod surfaces as CrashLoopBackoff
// (operator-visible) instead of green /healthz with wrong limits.
slog.Error("plans.load_failed", "error", err, "path", plansPath, "environment", cfg.Environment)
os.Exit(1)
return fmt.Errorf("plans load: %w", err)
}

var provClient *provisioner.Client
if cfg.ProvisionerAddr != "" {
var conn *grpc.ClientConn
provClient, conn, err = provisioner.NewClient(cfg.ProvisionerAddr, cfg.ProvisionerSecret)
provClient, conn, err = newProvisionerClient(cfg.ProvisionerAddr, cfg.ProvisionerSecret)
if err != nil {
slog.Error("main.provisioner_connect_failed", "error", err)
os.Exit(1)
return fmt.Errorf("provisioner connect: %w", err)
}
defer conn.Close()
slog.Info("main.provisioner_connected", "addr", cfg.ProvisionerAddr)
} else {
slog.Info("main.provisioner_local", "note", "PROVISIONER_ADDR not set, using local providers")
}

app, hooks := router.NewWithHooks(cfg, database, rdb, geoDbs, emailClient, planRegistry, provClient, nrApp)
app, hooks := newRouterWithHooks(cfg, database, rdb, geoDbs, emailClient, planRegistry, provClient, nrApp)

slog.Info("server.starting",
"port", cfg.Port,
Expand All @@ -183,10 +224,11 @@ func main() {
"build_time", buildinfo.BuildTime,
"version", buildinfo.Version,
)
if err := runServerWithGracefulShutdown(app, ":"+cfg.Port, gracefulShutdownTimeout, hooks); err != nil {
if err := serveFunc(app, ":"+cfg.Port, gracefulShutdownTimeout, hooks); err != nil {
slog.Error("server.fatal", "error", err)
os.Exit(1)
return fmt.Errorf("serve: %w", err)
}
return nil
}

// gracefulShutdownTimeout is the budget Fiber gets to drain in-flight requests
Expand Down
Loading
Loading