diff --git a/CLAUDE.md b/CLAUDE.md index 995e7c74..c88029c2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -266,6 +266,7 @@ Conventions: - Suite naming: `*_public_test.go` → `{Name}PublicTestSuite`, `*_test.go` → `{Name}TestSuite` - Table-driven structure with `validateFunc` callbacks +- One suite method per function under test — all scenarios (success, errors, edge cases) as rows in one table - Avoid generic file names like `helpers.go` or `utils.go` — name files after what they contain diff --git a/README.md b/README.md index 28064460..cff63853 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ them to be used as appliances. | 🌐 **[Network Management][]** | DNS read/update, ping | | ⚙️ **[Command Execution][]** | Remote exec and shell across managed hosts | | 📊 **[System Facts][]** | Agent-collected system facts — architecture, kernel, FQDN, CPUs, network interfaces, service/package manager | +| 🔄 **[Agent Lifecycle][]** | Node conditions (memory, disk, load pressure), graceful drain/cordon for maintenance | | ⚡ **[Async Job System][]** | NATS JetStream with KV-first architecture — broadcast, load-balanced, and label-based routing across hosts | | 💚 **[Health][] & [Metrics][]** | Liveness, readiness, system status endpoints, Prometheus `/metrics` | | 📋 **[Audit Logging][]** | Structured API audit trail in NATS KV with 30-day retention and admin-only read access | @@ -77,5 +78,6 @@ them to be used as appliances. The [MIT][] License. 
+[Agent Lifecycle]: https://osapi-io.github.io/osapi/sidebar/features/agent-lifecycle [System Facts]: https://osapi-io.github.io/osapi/sidebar/features/node-management [MIT]: LICENSE diff --git a/cmd/api_helpers.go b/cmd/api_helpers.go index c07f2811..72254b8a 100644 --- a/cmd/api_helpers.go +++ b/cmd/api_helpers.go @@ -74,6 +74,7 @@ type natsBundle struct { jobsKV jetstream.KeyValue registryKV jetstream.KeyValue factsKV jetstream.KeyValue + stateKV jetstream.KeyValue } // setupAPIServer connects to NATS, creates the API server with all handlers, @@ -112,7 +113,7 @@ func setupAPIServer( checker := newHealthChecker(b.nc, b.jobsKV) auditStore, auditKV, serverOpts := createAuditStore(ctx, log, b.nc, namespace) metricsProvider := newMetricsProvider( - b.nc, b.jobsKV, b.registryKV, b.factsKV, auditKV, streamName, b.jobClient, + b.nc, b.jobsKV, b.registryKV, b.factsKV, b.stateKV, auditKV, streamName, b.jobClient, ) sm := api.New(appConfig, log, serverOpts...) @@ -163,11 +164,21 @@ func connectNATSBundle( } } + var stateKV jetstream.KeyValue + if appConfig.NATS.State.Bucket != "" { + stateKVConfig := cli.BuildStateKVConfig(namespace, appConfig.NATS.State) + stateKV, err = nc.CreateOrUpdateKVBucketWithConfig(ctx, stateKVConfig) + if err != nil { + cli.LogFatal(log, "failed to create state KV bucket", err) + } + } + jc, err := jobclient.New(log, nc, &jobclient.Options{ Timeout: 30 * time.Second, KVBucket: jobsKV, RegistryKV: registryKV, FactsKV: factsKV, + StateKV: stateKV, StreamName: streamName, }) if err != nil { @@ -180,6 +191,7 @@ func connectNATSBundle( jobsKV: jobsKV, registryKV: registryKV, factsKV: factsKV, + stateKV: stateKV, } } @@ -216,6 +228,7 @@ func newMetricsProvider( jobsKV jetstream.KeyValue, registryKV jetstream.KeyValue, factsKV jetstream.KeyValue, + stateKV jetstream.KeyValue, auditKV jetstream.KeyValue, streamName string, jc jobclient.JobClient, @@ -254,7 +267,7 @@ func newMetricsProvider( }, nil }, KVInfoFn: func(fnCtx context.Context) 
([]health.KVMetrics, error) { - buckets := []jetstream.KeyValue{jobsKV, registryKV, factsKV, auditKV} + buckets := []jetstream.KeyValue{jobsKV, registryKV, factsKV, stateKV, auditKV} results := make([]health.KVMetrics, 0, len(buckets)) for _, kv := range buckets { diff --git a/cmd/client_agent_drain.go b/cmd/client_agent_drain.go new file mode 100644 index 00000000..c59c167e --- /dev/null +++ b/cmd/client_agent_drain.go @@ -0,0 +1,61 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" + + "github.com/retr0h/osapi/internal/cli" +) + +// clientAgentDrainCmd represents the clientAgentDrain command. +var clientAgentDrainCmd = &cobra.Command{ + Use: "drain", + Short: "Drain an agent", + Long: `Stop an agent from accepting new jobs. 
In-flight jobs continue to completion.`, + Run: func(cmd *cobra.Command, _ []string) { + ctx := cmd.Context() + hostname, _ := cmd.Flags().GetString("hostname") + + resp, err := sdkClient.Agent.Drain(ctx, hostname) + if err != nil { + cli.HandleError(err, logger) + return + } + + if jsonOutput { + fmt.Println(string(resp.RawJSON())) + return + } + + fmt.Println() + cli.PrintKV("Hostname", hostname, "Status", "Draining") + cli.PrintKV("Message", resp.Data.Message) + }, +} + +func init() { + clientAgentCmd.AddCommand(clientAgentDrainCmd) + clientAgentDrainCmd.Flags().String("hostname", "", "Hostname of the agent to drain") + _ = clientAgentDrainCmd.MarkFlagRequired("hostname") +} diff --git a/cmd/client_agent_get.go b/cmd/client_agent_get.go index 52232ccc..dd3c0be9 100644 --- a/cmd/client_agent_get.go +++ b/cmd/client_agent_get.go @@ -64,6 +64,10 @@ func displayAgentGetDetail( kvArgs := []string{"Hostname", data.Hostname, "Status", data.Status} cli.PrintKV(kvArgs...) + if data.State != "" && data.State != "Ready" { + cli.PrintKV("State", data.State) + } + if len(data.Labels) > 0 { cli.PrintKV("Labels", cli.FormatLabels(data.Labels)) } @@ -138,6 +142,48 @@ func displayAgentGetDetail( cli.PrintKV("Interface "+iface.Name, strings.Join(parts, " ")) } } + + var sections []cli.Section + + if len(data.Conditions) > 0 { + condRows := make([][]string, 0, len(data.Conditions)) + for _, c := range data.Conditions { + status := "false" + if c.Status { + status = "true" + } + reason := c.Reason + since := "" + if !c.LastTransitionTime.IsZero() { + since = cli.FormatAge(time.Since(c.LastTransitionTime)) + " ago" + } + condRows = append(condRows, []string{c.Type, status, reason, since}) + } + sections = append(sections, cli.Section{ + Title: "Conditions", + Headers: []string{"TYPE", "STATUS", "REASON", "SINCE"}, + Rows: condRows, + }) + } + + if len(data.Timeline) > 0 { + timelineRows := make([][]string, 0, len(data.Timeline)) + for _, te := range data.Timeline { + timelineRows = 
append( + timelineRows, + []string{te.Timestamp, te.Event, te.Hostname, te.Message, te.Error}, + ) + } + sections = append(sections, cli.Section{ + Title: "Timeline", + Headers: []string{"TIMESTAMP", "EVENT", "HOSTNAME", "MESSAGE", "ERROR"}, + Rows: timelineRows, + }) + } + + for _, sec := range sections { + cli.PrintCompactTable([]cli.Section{sec}) + } } func init() { diff --git a/cmd/client_agent_list.go b/cmd/client_agent_list.go index 89ac4ccc..c43a5c56 100644 --- a/cmd/client_agent_list.go +++ b/cmd/client_agent_list.go @@ -22,6 +22,7 @@ package cmd import ( "fmt" + "strings" "time" "github.com/spf13/cobra" @@ -57,6 +58,22 @@ Shows each agent's hostname, status, labels, age, load, and OS.`, rows := make([][]string, 0, len(agents)) for _, a := range agents { + status := a.State + if status == "" { + status = "Ready" + } + conditions := "-" + if len(a.Conditions) > 0 { + active := make([]string, 0) + for _, c := range a.Conditions { + if c.Status { + active = append(active, c.Type) + } + } + if len(active) > 0 { + conditions = strings.Join(active, ",") + } + } labels := cli.FormatLabels(a.Labels) age := "" if !a.StartedAt.IsZero() { @@ -72,7 +89,8 @@ Shows each agent's hostname, status, labels, age, load, and OS.`, } rows = append(rows, []string{ a.Hostname, - a.Status, + status, + conditions, labels, age, loadStr, @@ -82,9 +100,17 @@ Shows each agent's hostname, status, labels, age, load, and OS.`, sections := []cli.Section{ { - Title: fmt.Sprintf("Active Agents (%d)", resp.Data.Total), - Headers: []string{"HOSTNAME", "STATUS", "LABELS", "AGE", "LOAD (1m)", "OS"}, - Rows: rows, + Title: fmt.Sprintf("Active Agents (%d)", resp.Data.Total), + Headers: []string{ + "HOSTNAME", + "STATUS", + "CONDITIONS", + "LABELS", + "AGE", + "LOAD (1m)", + "OS", + }, + Rows: rows, }, } cli.PrintCompactTable(sections) diff --git a/cmd/client_agent_undrain.go b/cmd/client_agent_undrain.go new file mode 100644 index 00000000..7f668f03 --- /dev/null +++ b/cmd/client_agent_undrain.go 
@@ -0,0 +1,61 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" + + "github.com/retr0h/osapi/internal/cli" +) + +// clientAgentUndrainCmd represents the clientAgentUndrain command. 
+var clientAgentUndrainCmd = &cobra.Command{ + Use: "undrain", + Short: "Undrain an agent", + Long: `Resume accepting jobs on a drained agent.`, + Run: func(cmd *cobra.Command, _ []string) { + ctx := cmd.Context() + hostname, _ := cmd.Flags().GetString("hostname") + + resp, err := sdkClient.Agent.Undrain(ctx, hostname) + if err != nil { + cli.HandleError(err, logger) + return + } + + if jsonOutput { + fmt.Println(string(resp.RawJSON())) + return + } + + fmt.Println() + cli.PrintKV("Hostname", hostname, "Status", "Ready") + cli.PrintKV("Message", resp.Data.Message) + }, +} + +func init() { + clientAgentCmd.AddCommand(clientAgentUndrainCmd) + clientAgentUndrainCmd.Flags().String("hostname", "", "Hostname of the agent to undrain") + _ = clientAgentUndrainCmd.MarkFlagRequired("hostname") +} diff --git a/cmd/nats_helpers.go b/cmd/nats_helpers.go index 1f189efc..2fbce3c5 100644 --- a/cmd/nats_helpers.go +++ b/cmd/nats_helpers.go @@ -172,6 +172,14 @@ func setupJetStream( } } + // Create state KV bucket with configured settings (no TTL) + if appConfig.NATS.State.Bucket != "" { + stateKVConfig := cli.BuildStateKVConfig(namespace, appConfig.NATS.State) + if _, err := nc.CreateOrUpdateKVBucketWithConfig(ctx, stateKVConfig); err != nil { + return fmt.Errorf("create state KV bucket %s: %w", stateKVConfig.Bucket, err) + } + } + // Create DLQ stream dlqMaxAge, _ := time.ParseDuration(appConfig.NATS.DLQ.MaxAge) dlqStorage := cli.ParseJetstreamStorageType(appConfig.NATS.DLQ.Storage) diff --git a/configs/osapi.yaml b/configs/osapi.yaml index e037a11a..3a09cd5a 100644 --- a/configs/osapi.yaml +++ b/configs/osapi.yaml @@ -98,6 +98,11 @@ nats: storage: file replicas: 1 + state: + bucket: agent-state + storage: file + replicas: 1 + telemetry: tracing: enabled: true @@ -133,3 +138,7 @@ agent: group: web.dev.us-east # hierarchical: --target group:web, group:web.dev, etc. 
facts: interval: 60s + conditions: + memory_pressure_threshold: 90 + high_load_multiplier: 2.0 + disk_pressure_threshold: 90 diff --git a/docs/docs/gen/api/drain-agent.api.mdx b/docs/docs/gen/api/drain-agent.api.mdx new file mode 100644 index 00000000..3e2e3d21 --- /dev/null +++ b/docs/docs/gen/api/drain-agent.api.mdx @@ -0,0 +1,525 @@ +--- +id: drain-agent +title: "Drain an agent" +description: "Stop the agent from accepting new jobs. In-flight jobs continue to completion." +sidebar_label: "Drain an agent" +hide_title: true +hide_table_of_contents: true +api: eJztVsFu20YQ/ZXFnFqAlpQ0PZQ3FakBBQhixAp6cAVjxB2Ja5O7zOzQjkrw34NZyjJt2UEOCdACPmmX3Jl5895w9ToIDTGKC35hIQfL6Px8S14gA0uxYNfoS8jhXEJjpCSD+tpsONQGi4IacX5rPN2aq7COE7PwJ5vKbUtJe1MEL863ZCSYItRNRZpv8o+HDAS3EfILSAUv36PHLdW6nJ8tLlOZywO8CKsMIhUtO9lBftHBn4RMPG+l1BzpeH7LTghW/SqDBhlrEuKYTnusCXIoQ5S0zMBpVw1KCRkwfW4dk4VcuKUMYlFSjZB3ILtG46Kw81voH7OyLMnc5TRhMyJIgklsTkDRMMUm+EhRc76ezfTnYaZEwhBinHfiUMhOIANlUAXJO8CmqVyR+JheRQ3rjqGG9RUVql/Dyp64oWhNMeKWnuipH/d/cTi46nt99Wb26hjsJ4+tlIHdv2TNiZmfLcw17cwhzQ9DTcyBjzE/lmFuRvs7HVKskRLFhKJomQc+6QvqGEIOp+gqsqoUk7CjGzJRUNo4GYQWdFX8juLWOl1iZfYxBtehlXsQT5a1w0fhSW4DXxtxNYVWUuki2LFQzgttiY8KLw9NasCDIr/PZiqeOEkl/9JTH/czCHfC/nYs7GngtbOWvDkxCx/bzcYVTgezIa5djOlLfFH3/6Dum+fuGB/EbELrf+Tt8qLkT1Tyj+eUxIoJ7c44n65eikI2UUMv0v73pe0zqEnKoMarCTExr3Ykh2myENPuzln00+QLQB0Q3wyWZmSHzlXJQayxKTqAL0Ua2Jsa3a/TIcj2i9PANQrk8O7vZXIDzm9CCt9DH4bt3p/p/z1koEAGFl5NZpOZsqZt1JjGa2+43iZDg35wRY/56+6H9KcYzKF/oS8ybSplsM+g5UoLD1TvjSNkkI+s4cD2Kkt2UQ913RojfeKq7/Xx55Z4N2hwg+xwrTRddGBd1LWFfINVpG80+8vHvVX61XyPhXymlf1D9DuVA6tWd5DBNe3GXrdf9RmUhJY4wRxezxO3o8CjS0KN62FCzz6cLyEDfDhaj0YppX8SVdcNJ5bhmnzfH0CK7hVh338FZL5baA== +sidebar_class_name: "post api-method" +info_path: gen/api/agent-management-api +custom_edit_url: null +--- + +import ApiTabs from "@theme/ApiTabs"; +import DiscriminatorTabs from "@theme/DiscriminatorTabs"; +import MethodEndpoint from "@theme/ApiExplorer/MethodEndpoint"; +import SecuritySchemes from 
"@theme/ApiExplorer/SecuritySchemes"; +import MimeTabs from "@theme/MimeTabs"; +import ParamsItem from "@theme/ParamsItem"; +import ResponseSamples from "@theme/ResponseSamples"; +import SchemaItem from "@theme/SchemaItem"; +import SchemaTabs from "@theme/SchemaTabs"; +import Heading from "@theme/Heading"; +import OperationTabs from "@theme/OperationTabs"; +import TabItem from "@theme/TabItem"; + + + + + + + + + + +Stop the agent from accepting new jobs. In-flight jobs continue to completion. + + + + + +
+ +

+ Path Parameters +

+
+ +
+
+
+ + +
+ + + Agent drain initiated. + + +
+ + + + +
+ + + Schema + +
+ +
    + + + +
+
+
+ + + + +
+
+
+
+
+
+ + + Unauthorized - API key required + + +
+ + + + +
+ + + Schema + +
+ +
    + + + + + + + +
+
+
+ + + + +
+
+
+
+
+
+ + + Forbidden - Insufficient permissions + + +
+ + + + +
+ + + Schema + +
+ +
    + + + + + + + +
+
+
+ + + + +
+
+
+
+
+
+ + + Agent not found. + + +
+ + + + +
+ + + Schema + +
+ +
    + + + + + + + +
+
+
+ + + + +
+
+
+
+
+
+ + + Agent already in requested state. + + +
+ + + + +
+ + + Schema + +
+ +
    + + + + + + + +
+
+
+ + + + +
+
+
+
+
+
+
+
+ \ No newline at end of file diff --git a/docs/docs/gen/api/get-agent-details.api.mdx b/docs/docs/gen/api/get-agent-details.api.mdx index 5769d8d5..2ef825ed 100644 --- a/docs/docs/gen/api/get-agent-details.api.mdx +++ b/docs/docs/gen/api/get-agent-details.api.mdx @@ -5,7 +5,7 @@ description: "Get detailed information about a specific agent by hostname." sidebar_label: "Get agent details" hide_title: true hide_table_of_contents: true -api: eJztWEtv2zgQ/isET7uAokiJnU10y3abIrt9BG2CHoLAoMWRzUYiFXLkxmvovy+Gkm3JdhIv2h4K9GRZmvnmLX2cBZfgUqtKVEbzhL8BZBJQqBwkUzozthD0iImxqZAJ5kpIVaZSJiagkY3nbGocalFAyANuSrBe/lLyhE8Az0nqLw/oeMBRTBxPbrm/PXontJhAQZfnV5cjjzhaQTh+F3AHaWUVznlyu+B/grBgzyucEoYXTywIye/qu4CXwooCEKzzwuQST/jSOx5wRQGWAqc84BYeKmVB8gRtBQF36RQKwZMFx3lJeg6t0hNeBxsJup7CKmJmMoZTaFOBhllAq2AGISeHLLjSaAeOYI+iiH76YD4Nbb4d5S81GkEjSYqyzFXqM3H4xZH4YttLM/4CKfKAl5byhqoxtgp6K569wwkpdIcCK7cLBXRVUBE+gpBzHvD3BpvLu10W0spairTB27aTizHkbldcQkpFQCK/6kX4QpX+gfnBTOQVsAaapUZnalJZkMzoDesWJsohWJAjgbuCbaaAJ1wKhANUvpv6Bj9PoQPLcuGQWcgsuCkNEjo2BWFxDGKVWYvf0WBpTQrOsRbX2zBuRBO8K619rA/NyOkJc3OHUHQHP9xqLqnIz3HVKO/TYN3meqt09ci6GGQBHkVR5gRzM640VuT+DKzb20QrvK+VoyiMBryuuy+C235kawfuAo4Kvd6HT5c6Mx/bwSYvq9KXZy8n2+w2Kk3jGyFHYgZWTODlOnUwSJG1io5lxrI4YMOACS1ZPGSF0hWC2y5eXKhuRnVVjMFuGXrbQffglFPf03EL3UtnFB4f1QEffiN2z+0O+NFpHfD4W9HjJ+GHm33gk9TGs7TcaQKycd6Y6HZCAYWx85er+M7LscqRi89OGhoUeQdQaYTJjqCvSY419pnSbDzfjPL0+PT0JKI8ZhZgD8gLC/As4lF09kc8pLJXjrL2IuKNA/ks4iA+GxxHg81iNEloHW+tdYrRZLNbB2HTqUJIsbJ7zOWrqxvW1ei/J0QhTwYEeg9WQz7a+5304RNrVJZvpj7uMIyHYXRwFh9MQINVKdlIy2qUmkrjHtl879ufXne5mahU5OzV1c1GPqnaD3IPZy+qPJ+zh0rkKlMgmTSFUJotSd3a7a8wPojisL0Rpqbw3zKwM5XCqJjYl21daoXtS6yP3dyTBFiK9F5M9gS8aoRZ4cmk3ahfiQRIWbSZSHvcQVgriLoohGIn9+gP41N8am0McBp5a+Vs8LxkfHYUxienYRzGS42T5zUyOI2SJPbvGZE+LxtFSRwnR0fJ8XEyGCTDIWllolD5fI/6XDEhpSUu0aj0EyorP4xL+qc0oGfWgCcERU/vNifYZ64zsu8Bvxp7f7ksynp4vZ8p/g8m2LD3fgivHxG0BMk8FsusKdham6jSTEmwLtwiAJ3DQkt9O257sk6ff17XpDiI4m1Gf6NFhVNj1b8g2QE7v7pk9zBnKyPfjeKDtWaP6Thnnf9LduR1GU4FMpN6bi77Vb5ozn+dE01L3cOGajfHuZeNr5Pe6rTHyJUT
O83KCsi0btqEEVkyVUObUyP3+Xpdr4IkhZ6RYRRR8ZZFfU1SnQZsCnu8XdgLY8dKStDsgF1qV2WZSpVn3mAL5Zw/sP6q7s9Q3cFTB3FtkGWm0r/G9Gco5HDXRsULLtNBJ1rxg1Ysvwr7gwrrT1I4Ne0CkRJPK7uEH/pSHi6WH+maN9SzWfh1doWfqIRNlbobw5XXU8SStys/+j/2QjxoLy6Wy5e/P197hrBaonSJAFsvL+kr31kXJDwOo9DzutI4LIRe00e/X+215GbuFusG/dZlbBsuwiMelrlQ2u8rrD9RNjltl6g84MmK+twFngbRw8ViLBzc2Lyu6fZDBXS+pVTPhFViTNm49QshupY8yUTuNulYN6DfPrY86He25x71iSiWFF4TgfeLPp5wTse0eXfnW9/VAZ+CkGC9p83j8zSFEjuKW+8B2t6umvDN62uin/0e2ugZj77TqcWikbg296DreuUj0n9ysK7/A+haFv8= +api: eJztWdFu47YS/RWCT72A4pUSO83qLd0mvbltd4PdBH1YBAYtjiw2EqklR+66hv79YkjZlmwncZH2ocA+xbHIMzNnhuPD0YpLcJlVNSqjecp/AmQSUKgSJFM6N7YS9IiJmWmQCeZqyFSuMibmoJHNlqwwDrWoYMQjbmqwfv2N5CmfA17Sqh89oOMRRzF3PP3M/dfTX4UWc6jo4+XtzdQjTjcQjj9E3EHWWIVLnn5e8R9AWLCXDRaE4ZenFoTkD+1DxGthRQUI1vnF5BJP+do7HnFFAdYCCx5xC18aZUHyFG0DEXdZAZXg6YrjsqZ9Dq3Sc95GOwTdFbCJmJmcYQEdFWiYBbQKFjDi5JAFVxvtwBHsaRzTnyGYp6Hj2xF/mdEIGmmlqOtSZZ6JN787Wr7a99LMfocMecRrS7yhCsY2Qe/Fc3Q4IwrdocDGHUIB3VSUhI8g5JJH/L3B8PHhkIWssZYiDXj7dkoxg9IdiktIqQhIlLeDCF/I0s+wPFmIsgEWoFlmdK7mjQXJjN6xbmGuHIIFORV4KNhwCnjKpUA4QeWraWjwtwJ6sKwUDpmF3IIr6CChYwUIizMQG2Yt/o0Ga2sycI51uN6GcVM6wYdoHWJ9CEdOz5lbOoSqf/BHe8UlFfk5a8LmYwqsX1y/KN18ZX0MsgBfRVWXBHM/azQ25P4CrDvaRLf4WCun8Sge87btN4LPw8i2DjxEHBX6fR8+3ejcfOwONnnZ1D49RznZsRu2hMI3Qk7FAqyYw8t56mHQRtZtdCw3liURm0RMaMmSCauUbhDcfvKSSvUZ1U01A7tn6JceugcnTn1NJx30gM54dHbaRnzySuyB2z3w04s24slr0ZMn4Se7deBJ6uJZW+4VAdm4DCb6lVBBZezy5Sz+6texxpGLz540NCjKHqDSCPMDQd/ROhbsM6XZbLkb5cXZxcV5TDzmFuAIyGsL8Cziafz2+2RCaW8csfYi4r0D+SziOHk7PovHu8kIJHSOd9Z6yQhs9vMgbFYohAwbe8S5fHd7z/o7hn1CVPJ8TKCPYDWU06N70odPLGxZd6Yh7mSUTEbxydvkZA4arMrIRlY308w0Go9g870vf2p3pZmrTJTs3e39Dp+U7S/yCGevm7Jcsi+NKFWuQDJpKqE0W4u6rdt/wOwkTkbdF6PMVP63DOxCZTCt5vZlWzdaYdfEhtjhO0mAtcgexfxIwNuwmFVeTNqd/NVIgMSizUU20A7CWkHSRSFUB7XH8DA+pae2xgCL2FurF+PnVyZvT0fJ+cUoGSXrHefP78jhIk7TxPcZkT2/No7TJElPT9Ozs3Q8TicT2pWLSpXLI/Jzy4SUlrRE2DIkVDb+MK7ln9KAXlkDnhMUPX3YPcGeud6RfQ/4h7GPN+ukbA+v9zPDv6AEg3ofhnD1FUFLkMxjsdyaim13k1RaKAnWbSTu4bzuKNwfrVA6PHpnrDQ69KFDgp5kumxKL6gI3hvKjA4uvKIGw9OnfQ3N8JayR
80v4v9V84J+rsh/5R43Tx4OifuZMSUIHSSxcIfanNfqDqdohXY+mulT8ueAht1r7bRl48gT0P3CMRLerWncF/1XpPgFgmTaSGBbwj3/BFYqDa9gX1XgUFT1seFGHBagDyj8NnrmiualhHNDRbh9BtaaA21xj9uNs2svekTedVxc+Qd7RHZVTKXLtulghXJo7HK0Z6t3x+5y2bPlwUg187aljeM42b8I32vRYGGs+hMkO2GXtzfsEZZsY+Rvuxk/wd4eA6z3//pS4fcyLAQyk/krrRw2x+swNukNArob7yhwHKYgLxvf9qpuTzd92Thx0KxsgEzr0F0ZFYBpsOs98hjRd7cJkjYMjEziuG23Sb2iVb2+HRJ7tp/Ya2NnSkrQ7ITdaNfkucqUv7CCrZRzvh9+y+6/Ibvjp+ZX2iDLTaO/HdN/QyInhwaRfuGaDtIt4h+aTH5L7D+UWK8asDDd3J2Ip0l3yt/4VL5ZrX+kWx5ubGFO3huxf6IUhiz1B+0brwtEEhM+014u+kU86j5crwXQ/3678wphM3vsCwG2nfnTr3xvypbyZBSP/HWoNg4robe3Lv9aYlCSu9yttgX62ncYXbgIX/FNXQrllWZj/SAmcNq9e+ARTzfS5yFIOnq4Ws2Eg3tbti19/aUBGgsR1QthlZgRG5/9HJU+S57monS7t5h+QN997HTQf9iRrx+eiGKtezWpXj8f5ynnNN1Y9l+VtHQ9KEBIsN7T8Pgyy6DG3sa9PkAvPTZF+NPVHd3ahjW0UzMe/aBTq1VYcWceQbftxkek/8nBtv0/q4g1/g== sidebar_class_name: "get api-method" info_path: gen/api/agent-management-api custom_edit_url: null @@ -594,6 +594,196 @@ Get detailed information about a specific agent by hostname. + + + +
+ + + + conditions + + object[] + + +
+
+ + + Evaluated node conditions. + + +
  • +
    + Array [ +
    +
  • + + + + + + + +
  • +
    + ] +
    +
  • +
    +
    +
    +
    + + + + timeline + + object[] + + +
    +
    + + + Agent state transition history. + + +
  • +
    + Array [ +
    +
  • + + + + + + + + + +
  • +
    + ] +
    +
  • +
    +
    @@ -602,7 +792,7 @@ Get detailed information about a specific agent by hostname. value={"Example (from schema)"} > diff --git a/docs/docs/gen/api/list-active-agents.api.mdx b/docs/docs/gen/api/list-active-agents.api.mdx index e5e56118..956db4a4 100644 --- a/docs/docs/gen/api/list-active-agents.api.mdx +++ b/docs/docs/gen/api/list-active-agents.api.mdx @@ -5,7 +5,7 @@ description: "Discover all active agents in the fleet." sidebar_label: "List active agents" hide_title: true hide_table_of_contents: true -api: eJztWEtv2zgQ/isEz44qOXY20S3bbRbZ7SNoEvRQGAYtjmw2EqmQIzdeQ/99MZRsS7KbuGgX2ENPlqWZb57kPNZcgkusKlAZzWP+h3KJWYJlIsuYSFAtgYk5aHRMaYYLYGkGgAEfcBRzx+PP/JI+T98JLeaQ0+PlzfXU80xNAVYQsuOTAXeQlFbhisef1/x3EBbsZYkLwvDksQUh+aSaDLgFVxjtwPF4zYdhSD9dRd8qh8ykXR1JrcRoBI3EIYoiU4lX4NUXR2xr7pIF5IKecFUAj7mZfYEE+YAXltRFVQutAVt0wlqx4gOuEHL3Mv/CONQihxalQ6v0nA96ltwtgG2oySJyspce8GrAHQos3SEU0GVOvvsIQpJi7w3Wj5NDEpLSWtDIarx9OZmYQXbQLiGlIiCR3XQs7OpT9YX+DauTpchKYDU0S4xO1by0IJnRPekW5sohWJBTgYeMTY3N6QuXAuEEVQ57fvy0gBYsy4RDZiG14BYgmULHFiAszkBsPWvxJwosrEnAOdbgehnGTZVOzSG3drE+1CdFz5lbOYScERtpoIwO9pJLKtJzVtbMxyRYO7neKl0+sTYGSYAnkRcZwdzPSo0lqb8E644W0RAfK2UYBuGIVz74j6WyICmZO5btFJgMOCr0fB9ur3VqPjYXBGlZFj48RynZeLdmqRPfCDkVS7BiDi/HqYVBjKxhdCw1lkUDNh4woSWLxixXukRw+8GLctX2qC7zGdg9QW9b6B6cfOpzOmqgO+4Mg9NhNeDjH8TuqN0CH55XAx79KHr0TfhxPw+8kxp7NpJbSUAyLmsR7UzIITd29XIU33k6VjpS8dmThgZF1gJUGmF+wOg7omO1fKqUs1XfyvPT8/OzkPyYWoAjIK8swLOIw/Dit2hMYS8dee1FxHsH8lnEUXQxOg1H/WDUTmgUb6S1glF7sx0HYZOFQkiwtEecy9c396zN0b0nRC7PRgT6AFZDNj36Tvpwy2qWzc3UxR0H0TgITy6ikzlosCohGUlRThNTajzCm+99+tN1l5m5SkTGXt/c9/xJ0X6URyh7VWbZij2WIlOpAsmkyYXS/truqv0VZidhFDQvgsTkvpaBXaoEpvncvizrWitsLrEudv1OEmAhkgcxPxLwpiZmue8BbS9+BRIgedGmIoEf6Km+1U/thAEuQi+tWI6ep4wuhkF0dh5EQbThOHueI4XzMI4jf8+I5HnaMIyjKB4O49PTeDSKx2PiSkWustUR8blhQkpLvUTN0nWoLP1h3LR/SgN5in7OCIq+Tvon2HuudWTfA3419uF6E5Td4fV6JvgdnSDaEvomvHlC0BIk81gstSZnO25qlZZKgnXBXgOw7Zu3rW9LbT9qUPn3bN93N+vtce2NDH0Nms5/g9+uOsqhV8G1/EXsozDan1DutShxYaz6ByQ7YZc31+wBVmwr6qeNKmCtOeKUXrLW/02X5nkZLgQyk/gZQXaz7UqoDC
RDwyygVbCEZoQI6pYfhcoOjic94bvgNzxMzEyJOyUOipUlkGhdpyujps2UdfueGHlMFb3bGkkMHSHjMPRp1ET3DVHtBfZ0P7BXxs6UlKDZCbvWrkxTlSg/AYDNlXN+3v0V3f9/dMeHFguekOYWvwehieyn7xZ+hfQ/CqkfAXBhJI/53JfFQtCCib/yMeR1owSW9lathdQtxa0OTXsttVV1gVgQryfjMZ95Ij5oHq42q4K/Pt35arId+dtli+02ZFQLWsNtzKMgDHwXUhiHudC7Zqfec3UqVt9j611aftf6rrYN4QlfFZlQ2o/S1hfU2mtNKaQKSFWZXqzXM+Hg3mZVRa8fS6Bxi3y5FFaJGZn7eVIN+AKEBOs3fQ+wIh8kCRQUAb8T8n1h7wDR3m8bvT/f3FHD0Y1Dz+8efdNJ6lULe72uKe7MA+iq4oNGCaT/vJpUVfUvDGwzlA== +api: eJztWE1v2zgQ/SsEz4orOXY28S2bJrvZ7YfRJuihCAxaHFlsJFIhR268hv/7YijZlmQncZEusIeeLIucN8P3RsMhl1yCi60qUBnNR/ytcrGZg2Uiy5iIUc2BiRlodExphimwJAPAHg84ipnjo6/8nIYn74UWM8jp8Xx8PfE2E1OAFYTs+F3AHcSlVbjgo69L/jsIC/a8xJQw/PSRBSH53eou4BZcYbQDx0dL3g9D+mkH+k45ZCZpx0hhxUYjaCQLURSZin0Ab745MltyF6eQC3rCRQF8xM30G8TIA15YChdV5bQCbMwT1ooFD7hCyN3L9qlxqEUOjZkOrdIzHnRWcpMCW8+mFRHJ3nuPrwLuUGDp9qGALnPi7hMISYF9MFg93u3zEJfWgkZW4e36ycQUsr3rElIqAhLZuLXCdjyrrtO/YXE0F1kJrIJmsdGJmpUWJDO6493CTDkEC3IicN9iE2NzGuFSIByhymGHxy8pNGBZJhwyC4kFl4JkCh1LQVicgtgwa/EnOiysicE5VuN6H8ZNlE7MPlrbWB+rL0XPmFs4hJyRGUWgjO7tJJdUFOe0rIwPSbBmcr1TunxkTQzyAI8iLzKCuZ2WGksKfw7WHeyinnyol37YCwd85cV/KJUFScncWtk2gLuAo0Jv9/HztU7Mp7pAUJRl4eU5KMia3cqkSnwj5ETMwYoZvKxTA4MMWW3oWGIsiwI2DJjQkkVDlitdIrhd8aJcNRnVZT4Fu+PoXQPdgxOnPqejGrpFZ9g77q8CPnwldivsBnj/dBXw6LXo0ZPww24eeJLq9aw9N5KAfJxXLpqZkENu7OJlFd/7eax0FOKzXxoaFFkDUGmE2Z5F39A8VvmnnXK66K7y9Pj09CQkHhMLcADklQV4FrEfnv0WDUn20hFrLyLeOpDPIg6is8FxOOiKUZFQB157a4hRsdnUQdg4VQgxlvaA7/JifMuaFu06IXJ5MiDQe7AassnBNenjZ1aZrCtTG3fYi4a98OgsOpqBBqti8hEX5SQ2pcYD2Pzg05/KXWZmKhYZuxjfdvgktR/kAcFelVm2YA+lyFSiQDJpcqG0L9vtsL/D9CiMevWLXmxyv5eBnasYJvnMvuzrWiusi1gbu3onCbAQ8b2YHQg4riaz3PeAtqNfgQRILNpExPCKnuqpfmrrDDANvbdiPnh+ZnTW70Unp72oF60tTp63SOA0HI0iX2dE/PzcMBxF0ajfHx0fjwaD0XBIVonIVbY4QJ8xE1Ja6iUqkzahsvQf47r9UxqIKfo5ISgavet+wZ65xif7AfC7sffXa1G2H6+PM8Yf6ATRltBdwuUjgpYgmcdiiTU521pTqzRXEqzbtLj7de10uG+tULoaujBWGl3VobZrfxxh1ObLMvMNFcF7R7HRVQivyMFq9OlYq2I4JvWo+AX8TzVLabui+JW734zc7Wvup8ZkIHTVEgu3r8z5Xt3hBK3Qzq9m8lT7s6eH3SntZLIJ5AnoZuIYCRdrGneb/kvq+AWCZNpIYFvCPf8ElikNr2B
f5eBQ5MWhyw04zEHv6fBXwTNHNN9KONfuCLdjYK3ZUxZ3uN0Eu46iQeRNzcWlH9ghss5iSl22lYOlyqGxi96Or81aNlo2fHkw6pq92Y+1NHqzy3VO2t0I6gPzGr/ZrCmHPgTXKDNkPgij3YP9rRYlpsaqf0CyI3Y+vmb3sGAbVz/thP+EijtKsMb/9eHG2zJMBTIT+6O1bBfpK6EykAwNs4BWwRzqk3ev0hqFyvae6jvOtzWztmFiakrcBrHXrSyBXOuqyjNKRFNiXQPlIc3nzWaRZNByMgxDn0a1upc0a0fY411hr4ydKilBsyN2rV2ZJCpW/uAMNlfO+br8S93/v7rDffdxfiId9/31Ie27P/1K7pek/5GkfrvD1Eg+4jPfTRaC7mX5G68hr84XYOm6t3GP+5l0q6Rp3uZuQk0Raevz8vrmxk/iQf1wtd6u//py43eTzU1Zc9ti24tl2gsad0IjHvXCnm/eC+MwF3p7Rqiuh1s7Vpex5TYtf+jWu1obwiO+KTKhfBNUWr+hVqzVWyHtgLQr04vlcioc3NpstaLXDyXQLQVxORdWiSkt9ys1hCkICdZfkN/DgjiIYyhIAX+V6o9TnQ+Irss36v1xeUN9eluHDu8efd1+6UUDe7msZtyYe9CrFQ/qIJD+89XdarX6F9otUpM= sidebar_class_name: "get api-method" info_path: gen/api/agent-management-api custom_edit_url: null @@ -603,6 +603,196 @@ Discover all active agents in the fleet. + + + +
    + + + + conditions + + object[] + + +
    +
    + + + Evaluated node conditions. + + +
  • +
    + Array [ +
    +
  • + + + + + + + +
  • +
    + ] +
    +
  • +
    +
    +
    +
    + + + + timeline + + object[] + + +
    +
    + + + Agent state transition history. + + +
  • +
    + Array [ +
    +
  • + + + + + + + + + +
  • +
    + ] +
    +
  • +
    +
  • diff --git a/docs/docs/gen/api/sidebar.ts b/docs/docs/gen/api/sidebar.ts index dd455ba8..9309eedc 100644 --- a/docs/docs/gen/api/sidebar.ts +++ b/docs/docs/gen/api/sidebar.ts @@ -26,6 +26,18 @@ const sidebar: SidebarsConfig = { label: "Get agent details", className: "api-method get", }, + { + type: "doc", + id: "gen/api/drain-agent", + label: "Drain an agent", + className: "api-method post", + }, + { + type: "doc", + id: "gen/api/undrain-agent", + label: "Undrain an agent", + className: "api-method post", + }, ], }, { diff --git a/docs/docs/gen/api/undrain-agent.api.mdx b/docs/docs/gen/api/undrain-agent.api.mdx new file mode 100644 index 00000000..2ba26015 --- /dev/null +++ b/docs/docs/gen/api/undrain-agent.api.mdx @@ -0,0 +1,524 @@ +--- +id: undrain-agent +title: "Undrain an agent" +description: "Resume accepting jobs on a drained agent." +sidebar_label: "Undrain an agent" +hide_title: true +hide_table_of_contents: true +api: eJztVl1r20oQ/SvLPN2CartfD1dvvtBACqUhdbgPwYSxdmxtIu2qs6O0rtB/v8xKdpw4Lb3QQgt58q52Ps8Z754OQkOM4oI/tZBD6y2j8/MNeYEMLMWCXaPHkMM5xbYmg0VBjTi/MddhFU3wBk3yImtQHSeQgeAmQn4JKdLVe/S4oVqX87PTq2R1tc8cYZlBpKJlJ1vILzv4h5CJ562UGiOZ55/ZCcGyX2bQIGNNQhyTtceaIIcyREnLDJyW26CUkAHTp9YxWciFW8ogFiXVCHkHsm3ULwo7v4H+YbuLkswupglrIyUN/RkJZgRqAloPU2yCjxQ16svZTH/ux0ow7JyM804cCllFqgheFO28A2yayhUJk+l1VMfuuNywuqZCyWlYERQ3pK0pRtzQI331hxhc7g2Xfa9Hr2cvjsu98NhKGdh9JWuem/nZqbmhrdmH+WlVE3Pg45ofUjE3B/sdF8nXSIliQlG0zAOe9AXrptJgJ+gqssoWk7CjWzJRUNo4GcgWdFX8geTWOl1iZUYfg6vQyl0Rj6a1LWlqT/I58I0RV1NoJaUugj0kynmhDfFR4sW+SXW4l+TNbKbkiZOU8q1anY9TCDtiXx0TexJ45awlb56bUx/b9doVTkezIa5djOnf+MTun8Du62/dMj6IWYfW/8zb5YnJX8jk399j0vnhcdX3NmVgG/SlVYToieHfn+E+g5qkDCqvmhAT8qpMcpgmNTHtdiKjn44CAVQO8e2gbw600UflcqDrUCHtyy9FGhgVju5XyQiycXESuEaBHN79u0iywPl1SO5j8cPU3Yk1ffghAy1kwOHFZDaZKW7aSI1pwEb1dTFqG/SDSHqIYXc3qP9PSA6tCX2RaVMpOH0GLVcaccBxFIiQQX4gAXdQLrMkDNWs61YY6YKrvtfPn1ri7QDwLbLDlWJw2YF1UdcW8jVWkb7Tx1/noyB6Zn5MLH6jnfEj+q2ijVWrO8jghraHurZf9hmUhJY4FToczxOKB45Ht4BK1P0Inn34uIAM8P7kPJiUFP7RqrpusFiEG/J9vy9SdK8
V9v1/nZdG4w== +sidebar_class_name: "post api-method" +info_path: gen/api/agent-management-api +custom_edit_url: null +--- + +import ApiTabs from "@theme/ApiTabs"; +import DiscriminatorTabs from "@theme/DiscriminatorTabs"; +import MethodEndpoint from "@theme/ApiExplorer/MethodEndpoint"; +import SecuritySchemes from "@theme/ApiExplorer/SecuritySchemes"; +import MimeTabs from "@theme/MimeTabs"; +import ParamsItem from "@theme/ParamsItem"; +import ResponseSamples from "@theme/ResponseSamples"; +import SchemaItem from "@theme/SchemaItem"; +import SchemaTabs from "@theme/SchemaTabs"; +import Heading from "@theme/Heading"; +import OperationTabs from "@theme/OperationTabs"; +import TabItem from "@theme/TabItem"; + + + + + + + + + + +Resume accepting jobs on a drained agent. + + + + +
    + +

    + Path Parameters +

    +
    +
      + + + +
    +
    +
    +
    + + +
    + + + Agent undrain initiated. + + +
    + + + + +
    + + + Schema + +
    + +
      + + + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    + + + Unauthorized - API key required + + +
    + + + + +
    + + + Schema + +
    + +
      + + + + + + + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    + + + Forbidden - Insufficient permissions + + +
    + + + + +
    + + + Schema + +
    + +
      + + + + + + + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    + + + Agent not found. + + +
    + + + + +
    + + + Schema + +
    + +
      + + + + + + + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    + + + Agent not in draining or cordoned state. + + +
    + + + + +
    + + + Schema + +
    + +
      + + + + + + + +
    +
    +
    + + + + +
    +
    +
    +
    +
    +
    +
    +
    + \ No newline at end of file diff --git a/docs/docs/sidebar/architecture/system-architecture.md b/docs/docs/sidebar/architecture/system-architecture.md index d8e9cf6e..d41b05e3 100644 --- a/docs/docs/sidebar/architecture/system-architecture.md +++ b/docs/docs/sidebar/architecture/system-architecture.md @@ -13,14 +13,14 @@ that can either hit the REST API directly or manage the job queue. The system is organized into six layers, top to bottom: -| Layer | Package | Role | -| -------------------------- | --------------------------------------- | ------------------------------------------------------------------- | -| **CLI** | `cmd/` | Cobra command tree (thin wiring) | -| **SDK Client** | `osapi-sdk` (external) | OpenAPI-generated client used by CLI | -| **REST API** | `internal/api/` | Echo server with JWT middleware | -| **Job Client** | `internal/job/client/` | Business logic for job CRUD and status | -| **NATS JetStream** | (external) | KV `job-queue`, Stream `JOBS`, KV `job-responses`, KV `agent-facts` | -| **Agent / Provider Layer** | `internal/agent/`, `internal/provider/` | Consumes jobs, executes providers, publishes system facts | +| Layer | Package | Role | +| -------------------------- | --------------------------------------- | ------------------------------------------------------------------------ | +| **CLI** | `cmd/` | Cobra command tree (thin wiring) | +| **SDK Client** | `osapi-sdk` (external) | OpenAPI-generated client used by CLI | +| **REST API** | `internal/api/` | Echo server with JWT middleware | +| **Job Client** | `internal/job/client/` | Business logic for job CRUD and status | +| **NATS JetStream** | (external) | KV `job-queue`, Stream `JOBS`, KV `job-responses`, KV `agent-facts` | +| **Agent / Provider Layer** | `internal/agent/`, `internal/provider/` | Consumes jobs, executes providers, evaluates conditions, drain lifecycle | ```mermaid graph TD @@ -113,6 +113,23 @@ Providers are stateless and platform-specific (e.g., a Ubuntu DNS 
provider vs. a generic Linux DNS provider). Adding a new operation means implementing the provider interface and registering it in the agent's processor dispatch. +### Agent Lifecycle (`internal/agent/`) + +Agents evaluate **node conditions** on each heartbeat tick (10s) and support +**graceful drain** for maintenance. Conditions are threshold-based booleans +(MemoryPressure, HighLoad, DiskPressure) computed from heartbeat metrics. + +The drain mechanism uses NATS consumer subscribe/unsubscribe. When an operator +drains an agent, the API writes a `drain.{hostname}` key to the state KV bucket +(`agent-state`, no TTL). The agent detects this on its next heartbeat, +unsubscribes from all NATS JetStream consumers (stopping new job delivery), and +transitions through `Draining` → `Cordoned` as in-flight jobs complete. Undrain +deletes the key and the agent resubscribes. + +State transitions are recorded as append-only timeline events in the state KV +bucket, following the same pattern used for job lifecycle events. See +[Agent Lifecycle](../features/agent-lifecycle.md) for details. + ### Configuration (`internal/config/`) Configuration is managed by [Viper][] and loaded from an `osapi.yaml` file. diff --git a/docs/docs/sidebar/development/development.md b/docs/docs/sidebar/development/development.md index 462e8d10..97fe0dbe 100644 --- a/docs/docs/sidebar/development/development.md +++ b/docs/docs/sidebar/development/development.md @@ -97,6 +97,12 @@ Unit tests should follow the Go convention of being located in a file named located in `test/integration/` and use a `//go:build integration` tag. They build and start a real `osapi` binary, so they require no external setup. +Use `testify/suite` with table-driven patterns and `validateFunc` callbacks. 
+**One suite method per function under test.** All scenarios for a function +(success, error codes, transport failures, nil responses) belong as rows in a +single table — never split into separate `TestFoo`, `TestFooError`, +`TestFooNilResponse` methods. + ### File naming Avoid generic file names like `helpers.go` or `utils.go`. Name files after what diff --git a/docs/docs/sidebar/development/tasks/backlog/2026-02-26-kubernetes-systemd-patterns.md b/docs/docs/sidebar/development/tasks/backlog/2026-02-26-kubernetes-systemd-patterns.md deleted file mode 100644 index 446aa048..00000000 --- a/docs/docs/sidebar/development/tasks/backlog/2026-02-26-kubernetes-systemd-patterns.md +++ /dev/null @@ -1,118 +0,0 @@ ---- -title: Kubernetes and systemd inspired patterns -status: backlog -created: 2026-02-26 -updated: 2026-02-26 ---- - -## Objective - -Adopt proven patterns from Kubernetes and systemd to make OSAPI's node -management feel more mature and operationally familiar. These are ideas to -explore beyond the initial heartbeat enrichment and `node list`/`node get` work. - -## Ideas - -### Node Conditions (Kubernetes-inspired) - -Kubernetes nodes report conditions like `MemoryPressure`, `DiskPressure`, -`PIDPressure`, and `NetworkUnavailable`. Since the heartbeat already collects -memory and load data, we could derive conditions from thresholds: - -- Memory > 90% used -> `MemoryPressure: true` -- Load 1m > 2x CPU count -> `HighLoad: true` -- Disk > 90% used -> `DiskPressure: true` (would need disk in heartbeat or a - periodic deep scan) - -Conditions would be stored in the KV registration and shown in `node list` / -`node get`. They give operators a quick "is anything wrong?" signal without -digging into raw numbers. - -### Capacity and Allocatable (Kubernetes-inspired) - -Kubernetes tracks what resources a node has vs. what's available for scheduling. -We could track: - -- `max_jobs` (configured) vs. 
`active_jobs` (current count) -- Job slot utilization per agent visible in `node get` -- Could inform smarter job routing (avoid overloaded agents) - -### Taints and Tolerations (Kubernetes-inspired) - -Kubernetes nodes can be "tainted" to repel workloads unless they explicitly -tolerate the taint. We already have label-based routing, but taints would add: - -- Mark a node as `draining` or `maintenance` so new jobs avoid it -- `NoSchedule` equivalent: agent stays registered but won't receive new jobs -- `NoExecute` equivalent: evict running jobs (graceful drain) -- CLI: - `osapi node taint --hostname web-01 --key maintenance --effect NoSchedule` - -### Node Lifecycle Events (Kubernetes-inspired) - -Kubernetes records lifecycle events per node (Joined, BecameReady, -BecameNotReady, etc.). We could store agent lifecycle events in a dedicated KV -bucket: - -- "agent started" with timestamp and version -- "agent stopped" (clean shutdown) -- "heartbeat missed" (detected by TTL expiry watcher) -- "agent restarted" (same hostname re-registers) - -Visible via `node get --hostname X` or a dedicated `node events --hostname X` -command. - -### Consistent Resource Model (Kubernetes-inspired) - -Every Kubernetes object has a uniform envelope: `apiVersion`, `kind`, `metadata` -(name, namespace, labels, annotations, creationTimestamp, uid), `spec`, -`status`. We could formalize OSAPI resources similarly: - -- Each resource type (node, job, audit entry) gets a consistent structure -- `metadata.labels`, `metadata.annotations`, `metadata.createdAt` on every - resource -- Annotations (separate from labels) for non-routing metadata -- Enables generic tooling: filtering, sorting, field selectors - -### Agent States (systemd-inspired) - -Systemd units have explicit states: Active, Inactive, Failed, Activating, -Deactivating. Currently we only have "present in KV = alive". 
Adding explicit -states would enable: - -- `Starting` - agent is initializing, not yet processing jobs -- `Ready` - agent is healthy and processing jobs -- `Draining` - agent is shutting down gracefully, finishing in-flight jobs but - not accepting new ones -- `Stopped` - clean shutdown (deregistered) - -State transitions would be visible in the registry and in lifecycle events. - -### Restart Tracking (systemd-inspired) - -Systemd tracks restart counts and restart reasons. We could add: - -- `restart_count` - how many times the agent process has started for this - hostname -- `last_restart_reason` - "clean start", "crash recovery", etc. -- Stability signal for fleet health dashboards - -### Additional State to Save - -- **First-seen timestamp** (`started_at`) distinct from last heartbeat - (`registered_at`) for true "AGE" display like `kubectl get nodes` -- **Active job count** - how busy the agent is right now -- **Agent binary version** - for fleet version tracking and rolling upgrade - visibility -- **OS kernel version** - already available from host provider - -## Notes - -- These are incremental improvements that build on the heartbeat enrichment - work. Each can be implemented independently. -- Priority should be driven by operational value: conditions and capacity - tracking are highest value for fleet operators. -- Taints and lifecycle events add complexity but enable sophisticated fleet - management workflows. -- The consistent resource model is the most ambitious change and would touch the - most code, but pays off long-term for tooling and API consistency. diff --git a/docs/docs/sidebar/features/agent-lifecycle.md b/docs/docs/sidebar/features/agent-lifecycle.md new file mode 100644 index 00000000..b3b83dde --- /dev/null +++ b/docs/docs/sidebar/features/agent-lifecycle.md @@ -0,0 +1,132 @@ +--- +sidebar_position: 4 +--- + +# Agent Lifecycle + +OSAPI agents report threshold-based **node conditions** and support graceful +**drain/cordon** for maintenance. 
Both features are inspired by Kubernetes node +management patterns. + +## Node Conditions + +Conditions are threshold-based booleans evaluated agent-side on every heartbeat +(10 seconds). They surface "is anything wrong?" at a glance without requiring +operators to interpret raw metrics. + +| Condition | Default Threshold | Data Source | +| ---------------- | -------------------- | ---------------- | +| `MemoryPressure` | Memory used > 90% | Heartbeat memory | +| `HighLoad` | Load1 > 2x CPU count | Heartbeat load | +| `DiskPressure` | Any disk > 90% used | Heartbeat disk | + +Each condition tracks: + +- **Status** -- `true` when the threshold is exceeded, `false` otherwise +- **Reason** -- human-readable explanation (e.g., "memory 94% used, 15.1/16.0 + GB") +- **LastTransitionTime** -- when the condition last flipped between true and + false + +### CLI Display + +`agent list` shows active conditions in the CONDITIONS column: + +``` +HOSTNAME STATUS CONDITIONS LABELS AGE LOAD (1m) OS +web-01 Ready HighLoad,MemoryPressure - 3d 4h 4.12 Ubuntu 24.04 +web-02 Ready - - 12h 0.31 Ubuntu 24.04 +db-01 Ready DiskPressure - 5d 1.22 Ubuntu 24.04 +``` + +`agent get` shows full condition details: + +``` +Conditions: + TYPE STATUS REASON SINCE + MemoryPressure true memory 94% used (15.1/16.0 GB) 2m ago + HighLoad true load 4.12, threshold 4.00 for 2 CPUs 5m ago + DiskPressure false +``` + +### Configuration + +Thresholds are configurable in `osapi.yaml`: + +```yaml +agent: + conditions: + memory_pressure_threshold: 90 # percent used + high_load_multiplier: 2.0 # load1 / cpu_count + disk_pressure_threshold: 90 # percent used +``` + +## Agent Drain + +Drain allows operators to gracefully remove an agent from the job routing pool +for maintenance without stopping the process. When an agent stops without +draining, it vanishes from the registry and looks identical to a crash. 
+ +### State Machine + +Agents have an explicit scheduling state with three values: + +``` +Ready ──(drain)──> Draining ──(jobs done)──> Cordoned + ^ │ + └──────────────(undrain)───────────────────────┘ +``` + +| State | Meaning | +| ---------- | ------------------------------------------- | +| `Ready` | Accepting and processing jobs (default) | +| `Draining` | Finishing in-flight jobs, not accepting new | +| `Cordoned` | Fully drained, idle, not accepting jobs | + +### How It Works + +1. Operator calls `osapi client agent drain --hostname web-01` +2. API writes a `drain.{hostname}` key to the state KV bucket +3. Agent detects the drain flag on its next heartbeat tick (10s) +4. Agent transitions to `Draining` and **unsubscribes from NATS JetStream + consumers** -- this is how it stops receiving new jobs +5. In-flight jobs continue to completion +6. Once all in-flight jobs finish, state becomes `Cordoned` +7. Operator calls `osapi client agent undrain --hostname web-01` +8. API deletes the drain key; agent resubscribes and transitions to `Ready` + +### Timeline + +Every state transition is recorded as an append-only event in the state KV +bucket (`agent-state`, no TTL). `agent get` shows the full transition history: + +``` +Timeline: + TIMESTAMP EVENT HOSTNAME MESSAGE + 2026-03-05 10:00:00 drain web-01 Drain initiated + 2026-03-05 10:05:23 cordoned web-01 All jobs completed + 2026-03-05 12:00:00 undrain web-01 Resumed accepting jobs +``` + +### CLI Commands + +```bash +osapi client agent drain --hostname web-01 # start draining +osapi client agent undrain --hostname web-01 # resume accepting jobs +``` + +Both commands return the current state and a confirmation message. + +## Permissions + +Node conditions are included in the standard `agent:read` responses. Drain and +undrain operations require the `agent:write` permission, which is included in +the `admin` role by default. 
+ +## Related + +- [Agent CLI Reference](../usage/cli/client/agent/agent.mdx) -- agent fleet + commands +- [Node Management](node-management.md) -- node queries via the job system +- [Job System](job-system.md) -- how async job processing works +- [Configuration](../usage/configuration.md) -- full configuration reference diff --git a/docs/docs/sidebar/features/authentication.md b/docs/docs/sidebar/features/authentication.md index 7654352d..88d7631d 100644 --- a/docs/docs/sidebar/features/authentication.md +++ b/docs/docs/sidebar/features/authentication.md @@ -60,11 +60,11 @@ flowchart TD Built-in roles expand to these default permissions: -| Role | Permissions | -| ------- | -------------------------------------------------------------------------------------------------- | -| `admin` | `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read`, `audit:read` | -| `write` | `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read` | -| `read` | `node:read`, `network:read`, `job:read`, `health:read` | +| Role | Permissions | +| ------- | ------------------------------------------------------------------------------------------------------------------------------- | +| `admin` | `agent:read`, `agent:write`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read`, `audit:read` | +| `write` | `agent:read`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read` | +| `read` | `agent:read`, `node:read`, `network:read`, `job:read`, `health:read` | ### Custom Roles diff --git a/docs/docs/sidebar/features/node-management.md b/docs/docs/sidebar/features/node-management.md index 9a857ebf..a9eb931d 100644 --- a/docs/docs/sidebar/features/node-management.md +++ b/docs/docs/sidebar/features/node-management.md @@ -14,13 +14,15 @@ host. 
OSAPI separates agent fleet discovery from node system queries: - **Agent** commands (`agent list`, `agent get`) read directly from the NATS KV - heartbeat registry. They show which agents are online, their labels, and - lightweight metrics from the last heartbeat. No jobs are created. Agents also - expose typed **system facts** (architecture, kernel version, FQDN, CPU count, - network interfaces, service manager, package manager) gathered every 60 - seconds via providers and stored in a separate `agent-facts` KV bucket with a - 5-minute TTL. The API merges registry and facts data into a single `AgentInfo` - response. + heartbeat registry. They show which agents are online, their labels, + lightweight metrics, and [node conditions](agent-lifecycle.md) from the last + heartbeat. No jobs are created. Agents also expose typed **system facts** + (architecture, kernel version, FQDN, CPU count, network interfaces, service + manager, package manager) gathered every 60 seconds via providers and stored + in a separate `agent-facts` KV bucket with a 5-minute TTL. The API merges + registry and facts data into a single `AgentInfo` response. Agents can be + [drained](agent-lifecycle.md#agent-drain) for maintenance without stopping the + process. - **Node** commands (`node hostname`, `node status`) dispatch jobs to agents that execute system commands and return detailed results (disk usage, full memory breakdown, etc.). diff --git a/docs/docs/sidebar/usage/cli/client/agent/drain.md b/docs/docs/sidebar/usage/cli/client/agent/drain.md new file mode 100644 index 00000000..5b35701e --- /dev/null +++ b/docs/docs/sidebar/usage/cli/client/agent/drain.md @@ -0,0 +1,24 @@ +# Drain + +Drain an agent to stop it from accepting new jobs. In-flight jobs continue to +completion: + +```bash +$ osapi client agent drain --hostname web-01 + + Hostname: web-01 + Status: Draining + Message: Agent drain initiated +``` + +The agent transitions from `Ready` to `Draining`. 
Once all in-flight jobs +finish, the state becomes `Cordoned`. The agent stays running and continues +sending heartbeats -- it just stops pulling new work from the job queue. + +Use `agent undrain` to resume accepting jobs. + +## Flags + +| Flag | Description | Required | +| ------------ | ------------------------------ | -------- | +| `--hostname` | Hostname of the agent to drain | Yes | diff --git a/docs/docs/sidebar/usage/cli/client/agent/get.md b/docs/docs/sidebar/usage/cli/client/agent/get.md index 127ed5cc..8bde755f 100644 --- a/docs/docs/sidebar/usage/cli/client/agent/get.md +++ b/docs/docs/sidebar/usage/cli/client/agent/get.md @@ -6,6 +6,7 @@ Get detailed information about a specific agent by hostname: $ osapi client agent get --hostname web-01 Hostname: web-01 Status: Ready + State: Draining Labels: group:web.dev.us-east OS: Ubuntu 24.04 Uptime: 6 days, 3 hours, 54 minutes @@ -22,29 +23,43 @@ $ osapi client agent get --hostname web-01 Interfaces: eth0: 10.0.1.10 (IPv4), fe80::1 (IPv6), MAC 00:1a:2b:3c:4d:5e lo: 127.0.0.1 (IPv4), ::1 (IPv6) + + Conditions: + TYPE STATUS REASON SINCE + MemoryPressure true memory 94% used (15.1/16.0 GB) 2m ago + HighLoad true load 4.12, threshold 4.00 for 2 CPUs 5m ago + DiskPressure false + + Timeline: + TIMESTAMP EVENT HOSTNAME MESSAGE + 2026-03-05 10:00:00 drain web-01 Drain initiated + 2026-03-05 10:05:23 cordoned web-01 All jobs completed ``` This command reads directly from the agent heartbeat registry -- no job is created. The data comes from the agent's most recent heartbeat write. 
-| Field | Description | -| ------------ | --------------------------------------------------- | -| Hostname | Agent's configured or OS hostname | -| Status | `Ready` if present in registry | -| Labels | Key-value labels from agent config | -| OS | Distribution and version | -| Uptime | System uptime reported by the agent | -| Age | Time since the agent process started | -| Last Seen | Time since the last heartbeat refresh | -| Load | 1-, 5-, and 15-minute load averages | -| Memory | Total, used, and free RAM | -| Architecture | CPU architecture (e.g., amd64) | -| Kernel | OS kernel version | -| FQDN | Fully qualified domain name | -| CPUs | Number of logical CPUs | -| Service Mgr | Init system (e.g., systemd) | -| Package Mgr | Package manager (e.g., apt) | -| Interfaces | Network interfaces with IPv4, IPv6, MAC, and family | +| Field | Description | +| ------------ | --------------------------------------------------------- | +| Hostname | Agent's configured or OS hostname | +| Status | `Ready` if present in registry | +| State | Scheduling state: `Draining` or `Cordoned` (if not Ready) | +| Labels | Key-value labels from agent config | +| OS | Distribution and version | +| Uptime | System uptime reported by the agent | +| Age | Time since the agent process started | +| Last Seen | Time since the last heartbeat refresh | +| Load | 1-, 5-, and 15-minute load averages | +| Memory | Total, used, and free RAM | +| Architecture | CPU architecture (e.g., amd64) | +| Kernel | OS kernel version | +| FQDN | Fully qualified domain name | +| CPUs | Number of logical CPUs | +| Service Mgr | Init system (e.g., systemd) | +| Package Mgr | Package manager (e.g., apt) | +| Interfaces | Network interfaces with IPv4, IPv6, MAC, and family | +| Conditions | Node conditions table (type, status, reason, since) | +| Timeline | State transition events (timestamp, event, hostname) | :::tip agent get vs. 
node status diff --git a/docs/docs/sidebar/usage/cli/client/agent/list.md b/docs/docs/sidebar/usage/cli/client/agent/list.md index 4c0f6973..172dc3e7 100644 --- a/docs/docs/sidebar/usage/cli/client/agent/list.md +++ b/docs/docs/sidebar/usage/cli/client/agent/list.md @@ -5,25 +5,27 @@ List active agents in the fleet with status, labels, age, and system metrics: ```bash $ osapi client agent list - Active Agents (2): + Active Agents (3): - HOSTNAME STATUS LABELS AGE LOAD (1m) OS - web-01 Ready group:web.dev.us-east 3d 4h 1.78 Ubuntu 24.04 - web-02 Ready group:web.dev.us-west 12h 5m 0.45 Ubuntu 24.04 + HOSTNAME STATUS CONDITIONS LABELS AGE LOAD (1m) OS + web-01 Ready HighLoad,MemoryPressure group:web.dev.us-east 3d 4h 4.12 Ubuntu 24.04 + web-02 Ready - group:web.dev.us-west 12h 5m 0.45 Ubuntu 24.04 + db-01 Cordoned DiskPressure - 5d 2h 1.22 Ubuntu 24.04 ``` This command reads directly from the agent heartbeat registry -- no job is created. Each agent writes a heartbeat every 10 seconds with a 30-second TTL. Agents that stop heartbeating disappear from the list automatically. 
-| Column | Source | -| --------- | --------------------------------------- | -| HOSTNAME | Agent's configured or OS hostname | -| STATUS | `Ready` if present in registry | -| LABELS | Key-value labels from agent config | -| AGE | Time since the agent process started | -| LOAD (1m) | 1-minute load average from heartbeat | -| OS | Distribution and version from heartbeat | +| Column | Source | +| ---------- | --------------------------------------------------------------- | +| HOSTNAME | Agent's configured or OS hostname | +| STATUS | Scheduling state: `Ready`, `Draining`, or `Cordoned` | +| CONDITIONS | Active node conditions (MemoryPressure, HighLoad, DiskPressure) | +| LABELS | Key-value labels from agent config | +| AGE | Time since the agent process started | +| LOAD (1m) | 1-minute load average from heartbeat | +| OS | Distribution and version from heartbeat | :::tip Full facts in JSON output diff --git a/docs/docs/sidebar/usage/cli/client/agent/undrain.md b/docs/docs/sidebar/usage/cli/client/agent/undrain.md new file mode 100644 index 00000000..aaa75ea2 --- /dev/null +++ b/docs/docs/sidebar/usage/cli/client/agent/undrain.md @@ -0,0 +1,20 @@ +# Undrain + +Resume accepting jobs on a drained or cordoned agent: + +```bash +$ osapi client agent undrain --hostname web-01 + + Hostname: web-01 + Status: Ready + Message: Agent undrain initiated +``` + +The agent re-subscribes to NATS JetStream consumers and transitions back to +`Ready`. + +## Flags + +| Flag | Description | Required | +| ------------ | -------------------------------- | -------- | +| `--hostname` | Hostname of the agent to undrain | Yes | diff --git a/docs/docs/sidebar/usage/configuration.md b/docs/docs/sidebar/usage/configuration.md index 46f438f2..843b07ed 100644 --- a/docs/docs/sidebar/usage/configuration.md +++ b/docs/docs/sidebar/usage/configuration.md @@ -23,47 +23,53 @@ Every config key can be overridden with an environment variable using the `OSAPI_` prefix. 
Dots and nested keys become underscores, and the name is uppercased: -| Config Key | Environment Variable | -| ---------------------------------- | ---------------------------------------- | -| `debug` | `OSAPI_DEBUG` | -| `api.server.port` | `OSAPI_API_SERVER_PORT` | -| `api.server.nats.host` | `OSAPI_API_SERVER_NATS_HOST` | -| `api.server.nats.port` | `OSAPI_API_SERVER_NATS_PORT` | -| `api.server.nats.client_name` | `OSAPI_API_SERVER_NATS_CLIENT_NAME` | -| `api.server.nats.namespace` | `OSAPI_API_SERVER_NATS_NAMESPACE` | -| `api.server.nats.auth.type` | `OSAPI_API_SERVER_NATS_AUTH_TYPE` | -| `api.server.security.signing_key` | `OSAPI_API_SERVER_SECURITY_SIGNING_KEY` | -| `api.client.security.bearer_token` | `OSAPI_API_CLIENT_SECURITY_BEARER_TOKEN` | -| `nats.server.host` | `OSAPI_NATS_SERVER_HOST` | -| `nats.server.port` | `OSAPI_NATS_SERVER_PORT` | -| `nats.server.namespace` | `OSAPI_NATS_SERVER_NAMESPACE` | -| `nats.server.auth.type` | `OSAPI_NATS_SERVER_AUTH_TYPE` | -| `nats.stream.name` | `OSAPI_NATS_STREAM_NAME` | -| `nats.kv.bucket` | `OSAPI_NATS_KV_BUCKET` | -| `nats.kv.response_bucket` | `OSAPI_NATS_KV_RESPONSE_BUCKET` | -| `nats.audit.bucket` | `OSAPI_NATS_AUDIT_BUCKET` | -| `nats.audit.ttl` | `OSAPI_NATS_AUDIT_TTL` | -| `nats.audit.max_bytes` | `OSAPI_NATS_AUDIT_MAX_BYTES` | -| `nats.audit.storage` | `OSAPI_NATS_AUDIT_STORAGE` | -| `nats.audit.replicas` | `OSAPI_NATS_AUDIT_REPLICAS` | -| `nats.registry.bucket` | `OSAPI_NATS_REGISTRY_BUCKET` | -| `nats.registry.ttl` | `OSAPI_NATS_REGISTRY_TTL` | -| `nats.registry.storage` | `OSAPI_NATS_REGISTRY_STORAGE` | -| `nats.registry.replicas` | `OSAPI_NATS_REGISTRY_REPLICAS` | -| `nats.facts.bucket` | `OSAPI_NATS_FACTS_BUCKET` | -| `nats.facts.ttl` | `OSAPI_NATS_FACTS_TTL` | -| `nats.facts.storage` | `OSAPI_NATS_FACTS_STORAGE` | -| `nats.facts.replicas` | `OSAPI_NATS_FACTS_REPLICAS` | -| `telemetry.tracing.enabled` | `OSAPI_TELEMETRY_TRACING_ENABLED` | -| `telemetry.tracing.exporter` | 
`OSAPI_TELEMETRY_TRACING_EXPORTER` | -| `telemetry.tracing.otlp_endpoint` | `OSAPI_TELEMETRY_TRACING_OTLP_ENDPOINT` | -| `agent.nats.host` | `OSAPI_AGENT_NATS_HOST` | -| `agent.nats.port` | `OSAPI_AGENT_NATS_PORT` | -| `agent.nats.client_name` | `OSAPI_AGENT_NATS_CLIENT_NAME` | -| `agent.nats.namespace` | `OSAPI_AGENT_NATS_NAMESPACE` | -| `agent.nats.auth.type` | `OSAPI_AGENT_NATS_AUTH_TYPE` | -| `agent.hostname` | `OSAPI_AGENT_HOSTNAME` | -| `agent.facts.interval` | `OSAPI_AGENT_FACTS_INTERVAL` | +| Config Key | Environment Variable | +| -------------------------------------------- | -------------------------------------------------- | +| `debug` | `OSAPI_DEBUG` | +| `api.server.port` | `OSAPI_API_SERVER_PORT` | +| `api.server.nats.host` | `OSAPI_API_SERVER_NATS_HOST` | +| `api.server.nats.port` | `OSAPI_API_SERVER_NATS_PORT` | +| `api.server.nats.client_name` | `OSAPI_API_SERVER_NATS_CLIENT_NAME` | +| `api.server.nats.namespace` | `OSAPI_API_SERVER_NATS_NAMESPACE` | +| `api.server.nats.auth.type` | `OSAPI_API_SERVER_NATS_AUTH_TYPE` | +| `api.server.security.signing_key` | `OSAPI_API_SERVER_SECURITY_SIGNING_KEY` | +| `api.client.security.bearer_token` | `OSAPI_API_CLIENT_SECURITY_BEARER_TOKEN` | +| `nats.server.host` | `OSAPI_NATS_SERVER_HOST` | +| `nats.server.port` | `OSAPI_NATS_SERVER_PORT` | +| `nats.server.namespace` | `OSAPI_NATS_SERVER_NAMESPACE` | +| `nats.server.auth.type` | `OSAPI_NATS_SERVER_AUTH_TYPE` | +| `nats.stream.name` | `OSAPI_NATS_STREAM_NAME` | +| `nats.kv.bucket` | `OSAPI_NATS_KV_BUCKET` | +| `nats.kv.response_bucket` | `OSAPI_NATS_KV_RESPONSE_BUCKET` | +| `nats.audit.bucket` | `OSAPI_NATS_AUDIT_BUCKET` | +| `nats.audit.ttl` | `OSAPI_NATS_AUDIT_TTL` | +| `nats.audit.max_bytes` | `OSAPI_NATS_AUDIT_MAX_BYTES` | +| `nats.audit.storage` | `OSAPI_NATS_AUDIT_STORAGE` | +| `nats.audit.replicas` | `OSAPI_NATS_AUDIT_REPLICAS` | +| `nats.registry.bucket` | `OSAPI_NATS_REGISTRY_BUCKET` | +| `nats.registry.ttl` | `OSAPI_NATS_REGISTRY_TTL` | +| 
`nats.registry.storage` | `OSAPI_NATS_REGISTRY_STORAGE` | +| `nats.registry.replicas` | `OSAPI_NATS_REGISTRY_REPLICAS` | +| `nats.facts.bucket` | `OSAPI_NATS_FACTS_BUCKET` | +| `nats.facts.ttl` | `OSAPI_NATS_FACTS_TTL` | +| `nats.facts.storage` | `OSAPI_NATS_FACTS_STORAGE` | +| `nats.facts.replicas` | `OSAPI_NATS_FACTS_REPLICAS` | +| `nats.state.bucket` | `OSAPI_NATS_STATE_BUCKET` | +| `nats.state.storage` | `OSAPI_NATS_STATE_STORAGE` | +| `nats.state.replicas` | `OSAPI_NATS_STATE_REPLICAS` | +| `telemetry.tracing.enabled` | `OSAPI_TELEMETRY_TRACING_ENABLED` | +| `telemetry.tracing.exporter` | `OSAPI_TELEMETRY_TRACING_EXPORTER` | +| `telemetry.tracing.otlp_endpoint` | `OSAPI_TELEMETRY_TRACING_OTLP_ENDPOINT` | +| `agent.nats.host` | `OSAPI_AGENT_NATS_HOST` | +| `agent.nats.port` | `OSAPI_AGENT_NATS_PORT` | +| `agent.nats.client_name` | `OSAPI_AGENT_NATS_CLIENT_NAME` | +| `agent.nats.namespace` | `OSAPI_AGENT_NATS_NAMESPACE` | +| `agent.nats.auth.type` | `OSAPI_AGENT_NATS_AUTH_TYPE` | +| `agent.hostname` | `OSAPI_AGENT_HOSTNAME` | +| `agent.facts.interval` | `OSAPI_AGENT_FACTS_INTERVAL` | +| `agent.conditions.memory_pressure_threshold` | `OSAPI_AGENT_CONDITIONS_MEMORY_PRESSURE_THRESHOLD` | +| `agent.conditions.high_load_multiplier` | `OSAPI_AGENT_CONDITIONS_HIGH_LOAD_MULTIPLIER` | +| `agent.conditions.disk_pressure_threshold` | `OSAPI_AGENT_CONDITIONS_DISK_PRESSURE_THRESHOLD` | Environment variables take precedence over file values. @@ -127,11 +133,11 @@ OSAPI uses fine-grained `resource:verb` permissions for access control. Each API endpoint requires a specific permission. 
Built-in roles expand to a default set of permissions: -| Role | Permissions | -| ------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `admin` | `agent:read`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read`, `audit:read`, `command:execute` | -| `write` | `agent:read`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read` | -| `read` | `agent:read`, `node:read`, `network:read`, `job:read`, `health:read` | +| Role | Permissions | +| ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | +| `admin` | `agent:read`, `agent:write`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read`, `audit:read`, `command:execute` | +| `write` | `agent:read`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read` | +| `read` | `agent:read`, `node:read`, `network:read`, `job:read`, `health:read` | ### Custom Roles @@ -226,9 +232,9 @@ api: - 'http://localhost:3001' - 'https://osapi-io.github.io' # Custom roles with fine-grained permissions. - # Permissions: agent:read, node:read, network:read, network:write, - # job:read, job:write, health:read, audit:read, - # command:execute + # Permissions: agent:read, agent:write, node:read, network:read, + # network:write, job:read, job:write, health:read, + # audit:read, command:execute # roles: # ops: # permissions: @@ -324,6 +330,16 @@ nats: # Number of KV replicas. replicas: 1 + # ── State KV bucket ────────────────────────────────────── + state: + # KV bucket for persistent agent state (drain flags, timeline events). + # No TTL — operator actions persist indefinitely. + bucket: 'agent-state' + # Storage backend: "file" or "memory". + storage: 'file' + # Number of KV replicas. 
+ replicas: 1 + # ── Dead Letter Queue ───────────────────────────────────── dlq: # Maximum age of messages in the DLQ. @@ -379,6 +395,14 @@ agent: facts: # How often the agent collects and publishes facts. interval: '60s' + # Node condition thresholds. + conditions: + # Memory pressure threshold (percent used). + memory_pressure_threshold: 90 + # High load multiplier (load1 / cpu_count). + high_load_multiplier: 2.0 + # Disk pressure threshold (percent used). + disk_pressure_threshold: 90 # Queue group for load-balanced (_any) subscriptions. queue_group: 'job-agents' # Agent hostname for direct routing. Defaults to the @@ -481,6 +505,14 @@ agent: | `storage` | string | `"file"` or `"memory"` | | `replicas` | int | Number of KV replicas | +### `nats.state` + +| Key | Type | Description | +| ---------- | ------ | --------------------------------------------- | +| `bucket` | string | KV bucket for persistent agent state (no TTL) | +| `storage` | string | `"file"` or `"memory"` | +| `replicas` | int | Number of KV replicas | + ### `nats.dlq` | Key | Type | Description | @@ -500,23 +532,26 @@ agent: ### `agent` -| Key | Type | Description | -| -------------------------- | ----------------- | ---------------------------------------- | -| `nats.host` | string | NATS server hostname | -| `nats.port` | int | NATS server port | -| `nats.client_name` | string | NATS client identification name | -| `nats.namespace` | string | Subject namespace prefix | -| `nats.auth.type` | string | Auth type: `none`, `user_pass` | -| `nats.auth.username` | string | Username for `user_pass` auth | -| `nats.auth.password` | string | Password for `user_pass` auth | -| `consumer.name` | string | Durable consumer name | -| `consumer.max_deliver` | int | Max redelivery attempts before DLQ | -| `consumer.ack_wait` | string | ACK timeout (Go duration) | -| `consumer.max_ack_pending` | int | Max outstanding unacknowledged msgs | -| `consumer.replay_policy` | string | `"instant"` or `"original"` | -| 
`consumer.back_off` | []string | Backoff durations between redeliveries | -| `queue_group` | string | Queue group for load-balanced routing | -| `hostname` | string | Agent hostname (defaults to OS hostname) | -| `max_jobs` | int | Max concurrent jobs | -| `facts.interval` | string | How often the agent collects facts | -| `labels` | map[string]string | Key-value pairs for label-based routing | +| Key | Type | Description | +| -------------------------------------- | ----------------- | ---------------------------------------------- | +| `nats.host` | string | NATS server hostname | +| `nats.port` | int | NATS server port | +| `nats.client_name` | string | NATS client identification name | +| `nats.namespace` | string | Subject namespace prefix | +| `nats.auth.type` | string | Auth type: `none`, `user_pass` | +| `nats.auth.username` | string | Username for `user_pass` auth | +| `nats.auth.password` | string | Password for `user_pass` auth | +| `consumer.name` | string | Durable consumer name | +| `consumer.max_deliver` | int | Max redelivery attempts before DLQ | +| `consumer.ack_wait` | string | ACK timeout (Go duration) | +| `consumer.max_ack_pending` | int | Max outstanding unacknowledged msgs | +| `consumer.replay_policy` | string | `"instant"` or `"original"` | +| `consumer.back_off` | []string | Backoff durations between redeliveries | +| `queue_group` | string | Queue group for load-balanced routing | +| `hostname` | string | Agent hostname (defaults to OS hostname) | +| `max_jobs` | int | Max concurrent jobs | +| `facts.interval` | string | How often the agent collects facts | +| `conditions.memory_pressure_threshold` | int | Memory pressure threshold percent (default 90) | +| `conditions.high_load_multiplier` | float | Load multiplier over CPU count (default 2.0) | +| `conditions.disk_pressure_threshold` | int | Disk pressure threshold percent (default 90) | +| `labels` | map[string]string | Key-value pairs for label-based routing | diff --git 
a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts index bb2a104a..a1927b92 100644 --- a/docs/docusaurus.config.ts +++ b/docs/docusaurus.config.ts @@ -90,6 +90,11 @@ const config: Config = { label: 'Network Management', docId: 'sidebar/features/network-management' }, + { + type: 'doc', + label: 'Agent Lifecycle', + docId: 'sidebar/features/agent-lifecycle' + }, { type: 'doc', label: 'Job System', diff --git a/docs/plans/2026-03-05-node-conditions-drain-design.md b/docs/plans/2026-03-05-node-conditions-drain-design.md new file mode 100644 index 00000000..5d784b22 --- /dev/null +++ b/docs/plans/2026-03-05-node-conditions-drain-design.md @@ -0,0 +1,336 @@ +# Node Conditions and Agent Drain + +## Context + +OSAPI agents collect rich system metrics (memory, load, disk, CPU count) via +heartbeat and facts, but operators must manually interpret raw numbers to detect +problems. Kubernetes solves this with node conditions — threshold-based booleans +that surface "is anything wrong?" at a glance. + +Additionally, there's no way to gracefully remove an agent from the job routing +pool for maintenance without stopping the process entirely. When an agent stops, +it vanishes from the registry and looks identical to a crash. Kubernetes handles +this with cordon/drain. + +This design adds both features to OSAPI. 
+ +## Node Conditions + +### Condition Types + +Three conditions derived from existing heartbeat and facts data, evaluated +agent-side on each heartbeat tick (10s): + +| Condition | Default Threshold | Data Source | +| ---------------- | -------------------- | ----------------------------------------------- | +| `MemoryPressure` | memory used > 90% | `MemoryStats` (heartbeat) | +| `HighLoad` | load1 > 2× CPU count | `LoadAverages` (heartbeat) + `CPUCount` (facts) | +| `DiskPressure` | any disk > 90% used | `DiskStats` (new in heartbeat) | + +### Condition Structure + +Each condition has: + +```go +type Condition struct { + Type string `json:"type"` + Status bool `json:"status"` + Reason string `json:"reason,omitempty"` + LastTransitionTime time.Time `json:"last_transition_time"` +} +``` + +- `Status`: `true` = condition is active (pressure/overload detected) +- `Reason`: human-readable explanation (e.g., "memory 94% used (15.1/16.0 GB)") +- `LastTransitionTime`: when the condition last changed from true→false or + false→true + +### Configuration + +Thresholds configurable in `osapi.yaml` with sensible defaults: + +```yaml +agent: + conditions: + memory_pressure_threshold: 90 # percent used + high_load_multiplier: 2.0 # load1 / cpu_count + disk_pressure_threshold: 90 # percent used +``` + +### Evaluation + +Conditions are evaluated in the agent during `writeRegistration()`. The agent +maintains previous condition state in memory to track `LastTransitionTime` — +only updated when the boolean flips. + +DiskPressure requires adding disk stats to the heartbeat. The existing +`disk.Provider` already implements `GetUsage()` so the data is available. Disk +collection joins the existing non-fatal provider pattern: if it fails, the +DiskPressure condition is simply not evaluated. + +### Storage + +Conditions are stored as part of `AgentRegistration` in the registry KV bucket. +No new KV bucket needed. + +```go +type AgentRegistration struct { + // ... existing fields ... 
+ Conditions []Condition `json:"conditions,omitempty"` +} +``` + +### CLI Display + +`agent list` gains a CONDITIONS column showing active conditions: + +``` +HOSTNAME STATUS CONDITIONS LOAD OS +web-01 Ready HighLoad,MemoryPressure 4.12 Ubuntu 24.04 +web-02 Ready - 0.31 Ubuntu 24.04 +db-01 Ready DiskPressure 1.22 Ubuntu 24.04 +``` + +`agent get` shows full condition details and state timeline: + +``` +Conditions: + MemoryPressure: true (memory 94% used, 15.1/16.0 GB) since 2m ago + HighLoad: true (load 4.12, threshold 4.00 for 2 CPUs) since 5m ago + DiskPressure: false + +Timeline: + TIMESTAMP EVENT HOSTNAME MESSAGE + 2026-03-05 10:00:00 drain web-01 Drain initiated + 2026-03-05 10:05:23 cordoned web-01 All jobs completed + 2026-03-05 12:00:00 undrain web-01 Resumed accepting jobs +``` + +## Agent Drain + +### State Machine + +Agents gain an explicit state field with three values: + +``` +Ready ──(drain)──> Draining ──(jobs done)──> Cordoned + ^ │ + └──────────────(undrain)───────────────────────┘ +``` + +| State | Meaning | +| ---------- | ------------------------------------------------ | +| `Ready` | Accepting and processing jobs (default) | +| `Draining` | Finishing in-flight jobs, not accepting new ones | +| `Cordoned` | Fully drained, idle, not accepting jobs | + +### Mechanism + +1. Operator calls `POST /agent/{hostname}/drain` +2. API writes a `drain.{hostname}` key to the state KV bucket +3. Agent checks for drain key on each heartbeat tick (10s) +4. When drain flag detected: + - Agent transitions state to `Draining` + - Agent unsubscribes from NATS consumer (stops receiving new jobs) + - In-flight jobs continue to completion +5. Once WaitGroup drains (no in-flight jobs), state becomes `Cordoned` +6. `POST /agent/{hostname}/undrain` deletes the drain key +7. 
Agent detects drain key removal on next heartbeat: + - Transitions state to `Ready` + - Re-subscribes to NATS consumer + +### API Endpoints + +``` +POST /agent/{hostname}/drain # Start draining +POST /agent/{hostname}/undrain # Resume accepting jobs +``` + +Both return 200 on success, 404 if agent not found, 409 if already in the +requested state. + +### Permission + +New `agent:write` permission. Added to the `admin` role by default. + +### Storage + +Agent state transitions are recorded as **append-only events** in the state KV +bucket (`agent-state`, no TTL), following the same pattern used for job status +events (see `WriteStatusEvent` in `internal/job/client/agent.go`). + +Events reuse the existing `TimelineEvent` type (`internal/job/types.go`) — the +same type used for job lifecycle events. This type is generic (Timestamp, Event, +Hostname, Message, Error) and not job-specific: + +``` +Key format: timeline.{sanitized_hostname}.{event}.{unix_nano} +Value: TimelineEvent JSON +``` + +Events: `ready`, `drain`, `cordoned`, `undrain` + +On the SDK side, `TimelineEvent` is promoted from `job_types.go` to a shared +top-level type in `pkg/osapi/types.go`. Both `JobDetail.Timeline` and +`Agent.Timeline` reference the same type. + +Current state is **computed from the latest event**, just like job status is +computed via `computeStatusFromEvents`. This preserves the full transition +history (Ready → Draining → Cordoned → Ready → Draining → ...) and eliminates +race conditions by never updating existing keys. + +The drain intent uses a separate key: `drain.{sanitized_hostname}`. The API +writes this key to signal drain; the agent reads it on heartbeat and writes the +state transition event. The API deletes the key on undrain. + +The `AgentRegistration` also carries the current state for quick reads without +scanning events: + +```go +type AgentRegistration struct { + // ... existing fields ... 
+ State string `json:"state,omitempty"` // Ready, Draining, Cordoned +} +``` + +### CLI Commands + +```bash +osapi client agent drain --hostname web-01 +osapi client agent undrain --hostname web-01 +``` + +`agent list` and `agent get` show the state in the STATUS column. + +## OpenAPI Changes + +### AgentInfo Schema + +Add to existing `AgentInfo`: + +```yaml +state: + type: string + enum: [Ready, Draining, Cordoned] + description: Agent scheduling state. +conditions: + type: array + items: + $ref: '#/components/schemas/NodeCondition' +``` + +New schema: + +```yaml +NodeCondition: + type: object + properties: + type: + type: string + enum: [MemoryPressure, HighLoad, DiskPressure] + status: + type: boolean + reason: + type: string + last_transition_time: + type: string + format: date-time + required: [type, status, last_transition_time] +``` + +### New Endpoints + +```yaml +/agent/{hostname}/drain: + post: + summary: Drain an agent + description: Stop the agent from accepting new jobs. + security: + - BearerAuth: [] + responses: + 200: ... + 404: ... + 409: ... + +/agent/{hostname}/undrain: + post: + summary: Undrain an agent + description: Resume accepting jobs on a drained agent. + security: + - BearerAuth: [] + responses: + 200: ... + 404: ... + 409: ... +``` + +### Permission Updates + +```yaml +# New permission +agent:write + +# Updated admin role +admin: + permissions: + - agent:read + - agent:write # new + - node:read + - ... 
+``` + +## Implementation Scope + +### Provider Changes + +- Extend heartbeat to collect disk stats (reuse existing `disk.Provider`) +- Add condition evaluation logic to agent heartbeat + +### Agent Changes + +- Add `Condition` type and evaluation functions +- Add state field to `AgentRegistration` +- Add drain flag detection on heartbeat tick +- Add consumer subscribe/unsubscribe for drain/undrain transitions +- Add condition threshold config support + +### API Changes + +- New drain/undrain endpoints in the agent API domain +- Extend `AgentInfo` schema with `state` and `conditions` +- Add `agent:write` permission and wire into scope middleware + +### CLI Changes + +- `agent drain` and `agent undrain` commands +- CONDITIONS column in `agent list` +- Condition details and state timeline in `agent get` +- State shown in STATUS column + +### SDK Changes + +- Promote `TimelineEvent` from `job_types.go` to shared `types.go` +- Both `JobDetail.Timeline` and `Agent.Timeline` use the same type +- Add `Agent.Drain()` and `Agent.Undrain()` methods +- Add conditions, state, and timeline to `Agent` type + +### Config Changes + +- `agent.conditions` section with threshold defaults + +## Testing + +- **Unit**: condition evaluation logic (threshold math, transition tracking), + state machine transitions, drain flag detection +- **HTTP wiring**: drain/undrain endpoints with RBAC (401, 403, 200, 404, 409) +- **Integration**: drain agent → submit job → verify not routed to drained agent + → undrain → verify jobs resume + +## Verification + +```bash +just generate # regenerate specs + code +go build ./... 
# compiles +just go::unit # tests pass +just go::vet # lint passes +``` diff --git a/docs/plans/2026-03-05-node-conditions-drain.md b/docs/plans/2026-03-05-node-conditions-drain.md new file mode 100644 index 00000000..5227ccdd --- /dev/null +++ b/docs/plans/2026-03-05-node-conditions-drain.md @@ -0,0 +1,1387 @@ +# Node Conditions & Agent Drain Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to +> implement this plan task-by-task. + +**Goal:** Add Kubernetes-inspired node conditions (MemoryPressure, HighLoad, +DiskPressure) and agent drain/cordon lifecycle to OSAPI. + +**Architecture:** Conditions are evaluated agent-side on each heartbeat tick +using existing provider data, stored in AgentRegistration. Drain uses +append-only timeline events in the state KV bucket (`agent-state`, per the +design doc's Storage section — NOTE(review): the code sketches below still +reference `registryKV` and should target the state bucket instead), reusing the +existing `TimelineEvent` type from job lifecycle, with a separate drain intent +key the API writes and the agent reads on heartbeat. State transitions trigger +NATS consumer subscribe/unsubscribe.
+ +**Tech Stack:** Go 1.25, NATS JetStream KV, Echo REST API, OpenAPI codegen, +testify/suite + +**Design Doc:** `docs/plans/2026-03-05-node-conditions-drain-design.md` + +--- + +## Task 1: Add Condition type and evaluation functions + +**Files:** + +- Create: `internal/agent/condition.go` +- Create: `internal/agent/condition_test.go` + +**Step 1: Write the failing tests** + +```go +// internal/agent/condition_test.go +package agent + +import ( + "testing" + "time" + + "github.com/stretchr/testify/suite" + + "github.com/retr0h/osapi/internal/job" + "github.com/retr0h/osapi/internal/provider/node/disk" + "github.com/retr0h/osapi/internal/provider/node/load" + "github.com/retr0h/osapi/internal/provider/node/mem" +) + +type ConditionTestSuite struct { + suite.Suite +} + +func TestConditionTestSuite(t *testing.T) { + suite.Run(t, new(ConditionTestSuite)) +} + +func (s *ConditionTestSuite) TestEvaluateMemoryPressure() { + tests := []struct { + name string + stats *mem.Stats + threshold int + wantStatus bool + wantReason string + }{ + { + name: "above threshold", + stats: &mem.Stats{Total: 16000000000, Used: 15000000000, Free: 1000000000}, + threshold: 90, + wantStatus: true, + }, + { + name: "below threshold", + stats: &mem.Stats{Total: 16000000000, Used: 8000000000, Free: 8000000000}, + threshold: 90, + wantStatus: false, + }, + { + name: "nil stats", + stats: nil, + threshold: 90, + wantStatus: false, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + c := evaluateMemoryPressure(tt.stats, tt.threshold, nil) + s.Equal(tt.wantStatus, c.Status) + s.Equal(job.ConditionMemoryPressure, c.Type) + }) + } +} + +func (s *ConditionTestSuite) TestEvaluateHighLoad() { + tests := []struct { + name string + loadAvg *load.AverageStats + cpuCount int + multiplier float64 + wantStatus bool + }{ + { + name: "above threshold", + loadAvg: &load.AverageStats{OneMin: 5.0}, + cpuCount: 2, + multiplier: 2.0, + wantStatus: true, + }, + { + name: "below threshold", + loadAvg: 
&load.AverageStats{OneMin: 1.0}, + cpuCount: 2, + multiplier: 2.0, + wantStatus: false, + }, + { + name: "nil load", + loadAvg: nil, + cpuCount: 2, + multiplier: 2.0, + wantStatus: false, + }, + { + name: "zero cpus", + loadAvg: &load.AverageStats{OneMin: 5.0}, + cpuCount: 0, + multiplier: 2.0, + wantStatus: false, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + c := evaluateHighLoad(tt.loadAvg, tt.cpuCount, tt.multiplier, nil) + s.Equal(tt.wantStatus, c.Status) + s.Equal(job.ConditionHighLoad, c.Type) + }) + } +} + +func (s *ConditionTestSuite) TestEvaluateDiskPressure() { + tests := []struct { + name string + disks []disk.UsageStats + threshold int + wantStatus bool + }{ + { + name: "one disk above threshold", + disks: []disk.UsageStats{ + {Name: "/dev/sda1", Total: 100000, Used: 95000, Free: 5000}, + }, + threshold: 90, + wantStatus: true, + }, + { + name: "all disks below threshold", + disks: []disk.UsageStats{ + {Name: "/dev/sda1", Total: 100000, Used: 50000, Free: 50000}, + }, + threshold: 90, + wantStatus: false, + }, + { + name: "nil disks", + disks: nil, + threshold: 90, + wantStatus: false, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + c := evaluateDiskPressure(tt.disks, tt.threshold, nil) + s.Equal(tt.wantStatus, c.Status) + s.Equal(job.ConditionDiskPressure, c.Type) + }) + } +} + +func (s *ConditionTestSuite) TestLastTransitionTimeTracking() { + prev := []job.Condition{{ + Type: job.ConditionMemoryPressure, Status: false, + LastTransitionTime: time.Now().Add(-5 * time.Minute), + }} + // Flip from false -> true: should update LastTransitionTime + c := evaluateMemoryPressure( + &mem.Stats{Total: 100, Used: 95, Free: 5}, 90, prev, + ) + s.True(c.Status) + s.True(c.LastTransitionTime.After(time.Now().Add(-1 * time.Second))) + + // Same status (true -> true): should keep old LastTransitionTime + prev2 := []job.Condition{c} + c2 := evaluateMemoryPressure( + &mem.Stats{Total: 100, Used: 95, Free: 5}, 90, prev2, + ) + 
s.True(c2.Status) + s.Equal(c.LastTransitionTime, c2.LastTransitionTime) +} +``` + +**Step 2: Run tests to verify they fail** + +Run: `go test -run TestConditionTestSuite -v ./internal/agent/` Expected: FAIL — +`evaluateMemoryPressure` not defined + +**Step 3: Write minimal implementation** + +```go +// internal/agent/condition.go +package agent + +import ( + "fmt" + "time" + + "github.com/retr0h/osapi/internal/job" + "github.com/retr0h/osapi/internal/provider/node/disk" + "github.com/retr0h/osapi/internal/provider/node/load" + "github.com/retr0h/osapi/internal/provider/node/mem" +) + +// findPrevCondition returns the previous condition of the given type, +// or nil if not found. +func findPrevCondition( + condType string, + prev []job.Condition, +) *job.Condition { + for i := range prev { + if prev[i].Type == condType { + return &prev[i] + } + } + return nil +} + +// transitionTime returns the previous LastTransitionTime if status +// hasn't changed, otherwise returns now. +func transitionTime( + condType string, + newStatus bool, + prev []job.Condition, +) time.Time { + if p := findPrevCondition(condType, prev); p != nil { + if p.Status == newStatus { + return p.LastTransitionTime + } + } + return time.Now() +} + +func evaluateMemoryPressure( + stats *mem.Stats, + threshold int, + prev []job.Condition, +) job.Condition { + c := job.Condition{Type: job.ConditionMemoryPressure} + if stats == nil || stats.Total == 0 { + c.LastTransitionTime = transitionTime(c.Type, false, prev) + return c + } + pct := float64(stats.Used) / float64(stats.Total) * 100 + c.Status = pct > float64(threshold) + if c.Status { + c.Reason = fmt.Sprintf( + "memory %.0f%% used (%.1f/%.1f GB)", + pct, + float64(stats.Used)/1024/1024/1024, + float64(stats.Total)/1024/1024/1024, + ) + } + c.LastTransitionTime = transitionTime(c.Type, c.Status, prev) + return c +} + +func evaluateHighLoad( + loadAvg *load.AverageStats, + cpuCount int, + multiplier float64, + prev []job.Condition, +) job.Condition 
{ + c := job.Condition{Type: job.ConditionHighLoad} + if loadAvg == nil || cpuCount == 0 { + c.LastTransitionTime = transitionTime(c.Type, false, prev) + return c + } + threshold := float64(cpuCount) * multiplier + c.Status = loadAvg.OneMin > threshold + if c.Status { + c.Reason = fmt.Sprintf( + "load %.2f, threshold %.2f for %d CPUs", + loadAvg.OneMin, threshold, cpuCount, + ) + } + c.LastTransitionTime = transitionTime(c.Type, c.Status, prev) + return c +} + +func evaluateDiskPressure( + disks []disk.UsageStats, + threshold int, + prev []job.Condition, +) job.Condition { + c := job.Condition{Type: job.ConditionDiskPressure} + if len(disks) == 0 { + c.LastTransitionTime = transitionTime(c.Type, false, prev) + return c + } + for _, d := range disks { + if d.Total == 0 { + continue + } + pct := float64(d.Used) / float64(d.Total) * 100 + if pct > float64(threshold) { + c.Status = true + c.Reason = fmt.Sprintf( + "%s %.0f%% used (%.1f/%.1f GB)", + d.Name, pct, + float64(d.Used)/1024/1024/1024, + float64(d.Total)/1024/1024/1024, + ) + break + } + } + c.LastTransitionTime = transitionTime(c.Type, c.Status, prev) + return c +} +``` + +**Step 4: Run tests to verify they pass** + +Run: `go test -run TestConditionTestSuite -v ./internal/agent/` Expected: PASS + +**Step 5: Commit** + +```bash +git add internal/agent/condition.go internal/agent/condition_test.go +git commit -m "feat(agent): add condition evaluation functions" +``` + +--- + +## Task 2: Add Condition and State types to job domain + +**Files:** + +- Modify: `internal/job/types.go:273-331` (AgentRegistration, AgentInfo) + +**Step 1: Write the types** + +Add to `internal/job/types.go` after existing types: + +```go +// Condition type constants. +const ( + ConditionMemoryPressure = "MemoryPressure" + ConditionHighLoad = "HighLoad" + ConditionDiskPressure = "DiskPressure" +) + +// Agent state constants. 
+const ( + AgentStateReady = "Ready" + AgentStateDraining = "Draining" + AgentStateCordoned = "Cordoned" +) + +// Condition represents a node condition evaluated agent-side. +type Condition struct { + Type string `json:"type"` + Status bool `json:"status"` + Reason string `json:"reason,omitempty"` + LastTransitionTime time.Time `json:"last_transition_time"` +} + +``` + +The existing `TimelineEvent` type (line 177) is already generic and will be +reused for agent state transitions — no new event type needed. + +Add fields to `AgentRegistration`: + +```go +Conditions []Condition `json:"conditions,omitempty"` +State string `json:"state,omitempty"` +``` + +Add fields to `AgentInfo`: + +```go +Conditions []Condition `json:"conditions,omitempty"` +State string `json:"state,omitempty"` +Timeline []TimelineEvent `json:"timeline,omitempty"` +``` + +**Step 2: Run existing tests** + +Run: `go test ./internal/job/... -count=1` Expected: PASS (additive change) + +**Step 3: Commit** + +```bash +git add internal/job/types.go +git commit -m "feat(job): add Condition type and agent state constants" +``` + +--- + +## Task 3: Add conditions config to AgentConfig + +**Files:** + +- Modify: `internal/config/types.go:262-277` +- Modify: `configs/osapi.yaml` +- Modify: `configs/osapi.local.yaml` + +**Step 1: Add config struct** + +Add to `internal/config/types.go`: + +```go +// AgentConditions holds threshold configuration for node conditions. 
+type AgentConditions struct { + MemoryPressureThreshold int `mapstructure:"memory_pressure_threshold"` + HighLoadMultiplier float64 `mapstructure:"high_load_multiplier"` + DiskPressureThreshold int `mapstructure:"disk_pressure_threshold"` +} +``` + +Add field to `AgentConfig`: + +```go +Conditions AgentConditions `mapstructure:"conditions,omitempty"` +``` + +**Step 2: Set defaults in osapi.yaml and osapi.local.yaml** + +```yaml +agent: + conditions: + memory_pressure_threshold: 90 + high_load_multiplier: 2.0 + disk_pressure_threshold: 90 +``` + +**Step 3: Verify compilation** + +Run: `go build ./...` Expected: compiles + +**Step 4: Commit** + +```bash +git add internal/config/types.go configs/osapi.yaml configs/osapi.local.yaml +git commit -m "feat(config): add agent conditions threshold configuration" +``` + +--- + +## Task 4: Add disk stats to heartbeat and evaluate conditions + +**Files:** + +- Modify: `internal/agent/heartbeat.go:88-134` (writeRegistration) +- Modify: `internal/agent/types.go:45-81` (add prevConditions, cpuCount) + +**Step 1: Add fields to Agent struct** + +In `internal/agent/types.go`, add to Agent struct: + +```go +// prevConditions tracks condition state between heartbeats. +prevConditions []job.Condition + +// cpuCount cached from facts for HighLoad evaluation. +cpuCount int +``` + +**Step 2: Extend writeRegistration** + +In `internal/agent/heartbeat.go`, after memory stats collection (~line 111), +add: + +```go +// Collect disk stats (non-fatal). +var diskStats []disk.UsageStats +if stats, err := a.diskProvider.GetLocalUsageStats(); err == nil { + diskStats = stats +} + +// Evaluate conditions. 
+conditions := []job.Condition{ + evaluateMemoryPressure( + memStats, + a.appConfig.Agent.Conditions.MemoryPressureThreshold, + a.prevConditions, + ), + evaluateHighLoad( + loadAvg, + a.cpuCount, + a.appConfig.Agent.Conditions.HighLoadMultiplier, + a.prevConditions, + ), + evaluateDiskPressure( + diskStats, + a.appConfig.Agent.Conditions.DiskPressureThreshold, + a.prevConditions, + ), +} +a.prevConditions = conditions +``` + +Add `Conditions: conditions` to the `AgentRegistration` literal. + +**Step 3: Set cpuCount from facts** + +In `internal/agent/facts.go` (the `writeFacts` function), after collecting +`CPUCount`, add: + +```go +a.cpuCount = cpuCount +``` + +**Step 4: Run tests** + +Run: `go test ./internal/agent/... -count=1` Expected: PASS + +**Step 5: Commit** + +```bash +git add internal/agent/heartbeat.go internal/agent/types.go internal/agent/facts.go +git commit -m "feat(agent): evaluate node conditions on heartbeat tick" +``` + +--- + +## Task 5: Add drain timeline event storage functions + +**Files:** + +- Modify: `internal/job/client/agent.go:39-85` +- Create: `internal/job/client/agent_timeline_test.go` + +**Step 1: Write failing tests** + +```go +// internal/job/client/agent_timeline_test.go +package client_test + +// Test WriteAgentTimelineEvent writes append-only key to registryKV. +// Test ComputeAgentState returns latest state from timeline events. +// Test GetAgentTimeline returns sorted timeline events. 
+``` + +Table-driven tests: + +- `WriteAgentTimelineEvent` writes key like + `timeline.{hostname}.{event}.{unix_nano}` +- `ComputeAgentState` with no events returns "Ready" +- `ComputeAgentState` with drain event returns "Draining" +- `ComputeAgentState` with cordoned event returns "Cordoned" +- `ComputeAgentState` with undrain event returns "Ready" + +**Step 2: Run tests to verify they fail** + +Run: `go test -run TestAgentTimeline -v ./internal/job/client/` Expected: FAIL + +**Step 3: Implement** + +Add to `internal/job/client/agent.go`: + +```go +// WriteAgentTimelineEvent writes an append-only timeline event +// for an agent state transition. Reuses the same TimelineEvent +// type used by job lifecycle events. +func (c *Client) WriteAgentTimelineEvent( + _ context.Context, + hostname, event, message string, +) error { + now := time.Now() + key := fmt.Sprintf( + "timeline.%s.%s.%d", + job.SanitizeHostname(hostname), + event, + now.UnixNano(), + ) + data, _ := json.Marshal(job.TimelineEvent{ + Timestamp: now, + Event: event, + Hostname: hostname, + Message: message, + }) + _, err := c.registryKV.Put(key, data) + return err +} + +// GetAgentTimeline returns sorted timeline events for a hostname. +func (c *Client) GetAgentTimeline( + ctx context.Context, + hostname string, +) ([]job.TimelineEvent, error) { + prefix := "timeline." + job.SanitizeHostname(hostname) + "." + // List keys with prefix, unmarshal, sort by Timestamp + // Return sorted events +} + +// ComputeAgentState returns the current state from timeline events. 
+func ComputeAgentState( + events []job.TimelineEvent, +) string { + if len(events) == 0 { + return job.AgentStateReady + } + latest := events[len(events)-1] + switch latest.Event { + case "drain": + return job.AgentStateDraining + case "cordoned": + return job.AgentStateCordoned + case "undrain", "ready": + return job.AgentStateReady + default: + return job.AgentStateReady + } +} +``` + +Add `WriteAgentTimelineEvent`, `GetAgentTimeline` to the `JobClient` interface +in `internal/job/client/types.go`. Regenerate mocks. + +**Step 4: Run tests** + +Run: `go test -run TestAgentTimeline -v ./internal/job/client/` Expected: PASS + +**Step 5: Commit** + +```bash +git add internal/job/client/agent.go internal/job/client/agent_timeline_test.go \ + internal/job/client/types.go internal/job/client/mock_*.go +git commit -m "feat(job): add append-only timeline events for agent drain" +``` + +--- + +## Task 6: Add drain/undrain API endpoints + +**Files:** + +- Modify: `internal/api/agent/gen/api.yaml` +- Create: `internal/api/agent/agent_drain.go` +- Create: `internal/api/agent/agent_drain_public_test.go` + +**Step 1: Add to OpenAPI spec** + +Add to `internal/api/agent/gen/api.yaml`: + +```yaml +/agent/{hostname}/drain: + post: + operationId: drainAgent + summary: Drain an agent + description: Stop the agent from accepting new jobs. + security: + - BearerAuth: + - 'agent:write' + parameters: + - name: hostname + in: path + required: true + schema: + type: string + responses: + '200': + description: Agent drain initiated. + content: + application/json: + schema: + type: object + properties: + message: + type: string + '404': + description: Agent not found. + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '409': + description: Agent already in requested state. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + +/agent/{hostname}/undrain: + post: + operationId: undrainAgent + summary: Undrain an agent + description: Resume accepting jobs on a drained agent. + security: + - BearerAuth: + - 'agent:write' + parameters: + - name: hostname + in: path + required: true + schema: + type: string + responses: + '200': + description: Agent undrain initiated. + content: + application/json: + schema: + type: object + properties: + message: + type: string + '404': + description: Agent not found. + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '409': + description: Agent not in draining/cordoned state. + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' +``` + +Add `agent:write` to BearerAuth scopes. Add `state` and `conditions` fields to +`AgentInfo` schema. Add `NodeCondition` schema. + +Run: `just generate` to regenerate `*.gen.go`. + +**Step 2: Write failing tests** + +```go +// internal/api/agent/agent_drain_public_test.go +// Table-driven tests for DrainAgent and UndrainAgent: +// - 200: agent found and drain initiated +// - 404: agent not found +// - 409: already draining/cordoned +// - HTTP wiring: RBAC (401, 403 without agent:write, 200 with agent:write) +``` + +**Step 3: Implement handlers** + +```go +// internal/api/agent/agent_drain.go +package agent + +func (a *Agent) DrainAgent( + ctx context.Context, + request gen.DrainAgentRequestObject, +) (gen.DrainAgentResponseObject, error) { + hostname := request.Hostname + + // 1. Verify agent exists + agentInfo, err := a.JobClient.GetAgent(ctx, hostname) + if err != nil { + return gen.DrainAgent404JSONResponse{...}, nil + } + + // 2. Check not already draining + if agentInfo.State == job.AgentStateDraining || + agentInfo.State == job.AgentStateCordoned { + return gen.DrainAgent409JSONResponse{...}, nil + } + + // 3. Write drain intent key + // 4. 
Write state event + return gen.DrainAgent200JSONResponse{...}, nil +} + +func (a *Agent) UndrainAgent( + ctx context.Context, + request gen.UndrainAgentRequestObject, +) (gen.UndrainAgentResponseObject, error) { + // Similar: verify exists, check state, delete drain key, write event +} +``` + +**Step 4: Run tests** + +Run: `go test ./internal/api/agent/... -count=1` Expected: PASS + +**Step 5: Commit** + +```bash +git add internal/api/agent/gen/api.yaml internal/api/agent/gen/*.gen.go \ + internal/api/agent/agent_drain.go internal/api/agent/agent_drain_public_test.go +git commit -m "feat(api): add drain/undrain endpoints with RBAC" +``` + +--- + +## Task 7: Add agent:write permission + +**Files:** + +- Modify: `internal/authtoken/permissions.go:27-37` (add constant) +- Modify: `internal/authtoken/permissions.go:53-81` (add to admin role) + +**Step 1: Add permission constant** + +```go +PermAgentWrite Permission = "agent:write" +``` + +**Step 2: Add to admin role** + +In `DefaultRolePermissions`, add `PermAgentWrite` to the `admin` slice. + +**Step 3: Run tests** + +Run: `go test ./internal/authtoken/... -count=1` Expected: PASS + +**Step 4: Commit** + +```bash +git add internal/authtoken/permissions.go +git commit -m "feat(auth): add agent:write permission for drain operations" +``` + +--- + +## Task 8: Wire drain endpoints into server + +**Files:** + +- Modify: `internal/api/handler_agent.go:34-61` +- Modify: `internal/api/handler_agent_public_test.go` + +**Step 1: Update handler registration** + +The `GetAgentHandler` already wires all agent gen handlers through +`scopeMiddleware`. After regenerating the OpenAPI code (Task 6), the new +`DrainAgent` and `UndrainAgent` methods on the strict server interface will be +picked up automatically by `RegisterHandlers`. + +No code change needed in `handler_agent.go` unless `unauthenticatedOperations` +needs updating (it doesn't — drain requires auth). 
+ +**Step 2: Verify compilation** + +Run: `go build ./...` Expected: compiles + +**Step 3: Add handler test cases** + +Add test cases to `handler_agent_public_test.go` for drain/undrain handler +registration. + +**Step 4: Commit** + +```bash +git add internal/api/handler_agent.go internal/api/handler_agent_public_test.go +git commit -m "feat(api): wire drain/undrain handlers into server" +``` + +--- + +## Task 9: Add drain detection to agent heartbeat + +**Files:** + +- Modify: `internal/agent/heartbeat.go:88-134` +- Modify: `internal/agent/server.go:32-61` +- Create: `internal/agent/drain.go` +- Create: `internal/agent/drain_test.go` + +**Step 1: Write failing tests** + +```go +// internal/agent/drain_test.go +// Test checkDrainFlag: returns true when drain key exists +// Test checkDrainFlag: returns false when drain key absent +// Test handleDrainTransition: unsubscribes consumers when draining +// Test handleUndrainTransition: resubscribes consumers when undrained +``` + +**Step 2: Implement drain detection** + +```go +// internal/agent/drain.go +package agent + +// checkDrainFlag reads drain.{hostname} from registryKV. +func (a *Agent) checkDrainFlag( + ctx context.Context, + hostname string, +) bool { + key := "drain." + job.SanitizeHostname(hostname) + _, err := a.registryKV.Get(ctx, key) + return err == nil +} + +// handleDrainDetection checks drain flag on each heartbeat. 
+func (a *Agent) handleDrainDetection( + ctx context.Context, + hostname string, +) { + drainRequested := a.checkDrainFlag(ctx, hostname) + + switch { + case drainRequested && a.state == job.AgentStateReady: + a.state = job.AgentStateDraining + a.unsubscribeConsumers() + // Write timeline event: "drain", "Drain initiated" + // When WaitGroup drains, transition to Cordoned + + case !drainRequested && a.state == job.AgentStateCordoned: + a.state = job.AgentStateReady + a.resubscribeConsumers(ctx, hostname) + // Write timeline event: "undrain", "Resumed accepting jobs" + } +} +``` + +**Step 3: Add state field to Agent struct** + +In `internal/agent/types.go`: + +```go +state string // Ready, Draining, Cordoned +``` + +Initialize to `job.AgentStateReady` in `Start()`. + +**Step 4: Call from heartbeat** + +In `writeRegistration()`, add `a.handleDrainDetection(ctx, hostname)` and +include `State: a.state` in the registration. + +**Step 5: Run tests** + +Run: `go test ./internal/agent/... -count=1` Expected: PASS + +**Step 6: Commit** + +```bash +git add internal/agent/drain.go internal/agent/drain_test.go \ + internal/agent/heartbeat.go internal/agent/types.go internal/agent/server.go +git commit -m "feat(agent): detect drain flag and manage consumer lifecycle" +``` + +--- + +## Task 10: Extend buildAgentInfo with conditions and state + +**Files:** + +- Modify: `internal/api/agent/agent_list.go:59-171` (buildAgentInfo) +- Modify: `internal/api/agent/agent_list_public_test.go` +- Modify: `internal/job/client/query.go:479-493` (agentInfoFromRegistration) + +**Step 1: Update agentInfoFromRegistration** + +Add to the returned `AgentInfo`: + +```go +Conditions: reg.Conditions, +State: reg.State, +``` + +**Step 2: Update buildAgentInfo** + +Map conditions and state from `job.AgentInfo` to `gen.AgentInfo`: + +```go +if len(a.Conditions) > 0 { + conditions := make([]gen.NodeCondition, 0, len(a.Conditions)) + for _, c := range a.Conditions { + nc := gen.NodeCondition{ + Type: 
gen.NodeConditionType(c.Type), + Status: c.Status, + LastTransitionTime: c.LastTransitionTime, + } + if c.Reason != "" { + nc.Reason = &c.Reason + } + conditions = append(conditions, nc) + } + info.Conditions = &conditions +} + +if a.State != "" { + state := gen.AgentInfoState(a.State) + info.State = &state +} +``` + +**Step 3: Update status derivation** + +Change status logic: if `a.State` is set, use it; otherwise default to `Ready` +(existing behavior). + +**Step 4: Add test cases** + +Add table-driven test case for agent with conditions and Draining/Cordoned +states. + +**Step 5: Run tests** + +Run: `go test ./internal/api/agent/... -count=1` Expected: PASS + +**Step 6: Commit** + +```bash +git add internal/api/agent/agent_list.go internal/api/agent/agent_list_public_test.go \ + internal/job/client/query.go +git commit -m "feat(api): expose conditions and state in agent responses" +``` + +--- + +## Task 11: Add timeline to GetAgent response + +**Files:** + +- Modify: `internal/job/client/query.go:423-445` (GetAgent) +- Modify: `internal/job/client/query_public_test.go` + +**Step 1: Extend GetAgent to fetch timeline events** + +After building `AgentInfo`, fetch timeline events: + +```go +timeline, err := c.GetAgentTimeline(ctx, hostname) +if err == nil { + info.Timeline = timeline +} +``` + +**Step 2: Add test cases** + +Test GetAgent returns timeline events when present. + +**Step 3: Run tests** + +Run: `go test ./internal/job/client/... 
-count=1` Expected: PASS + +**Step 4: Commit** + +```bash +git add internal/job/client/query.go internal/job/client/query_public_test.go +git commit -m "feat(job): include timeline events in GetAgent response" +``` + +--- + +## Task 12: Update SDK with conditions, state, drain/undrain + +**Files:** + +- Modify: `osapi-sdk/pkg/osapi/gen/agent/api.yaml` (copy from osapi) +- Modify: `osapi-sdk/pkg/osapi/agent.go` (add Drain, Undrain methods) +- Modify: `osapi-sdk/pkg/osapi/agent_types.go` (add conditions, state, timeline + to Agent type) +- Create: `osapi-sdk/pkg/osapi/types.go` (promote TimelineEvent to shared type) +- Modify: `osapi-sdk/pkg/osapi/job_types.go` (remove TimelineEvent, import from + types.go) + +**Step 1: Promote TimelineEvent to shared type** + +Move `TimelineEvent` from `job_types.go` to a new `types.go`: + +```go +// pkg/osapi/types.go + +// TimelineEvent represents a lifecycle event. Used by both job +// timelines and agent state transition history. +type TimelineEvent struct { + Timestamp string + Event string + Hostname string + Message string + Error string +} +``` + +Update `job_types.go` to remove the `TimelineEvent` definition — +`JobDetail.Timeline` now references the shared type. + +**Step 2: Sync OpenAPI spec** + +Copy `internal/api/agent/gen/api.yaml` to +`osapi-sdk/pkg/osapi/gen/agent/api.yaml`. + +Run `redocly join` + `go generate` in the SDK. + +**Step 3: Add domain types** + +```go +// In agent_types.go +type Agent struct { + // ... existing fields ... 
+	State      string
+	Conditions []Condition
+	Timeline   []TimelineEvent // shared type from types.go
+}
+
+type Condition struct {
+	Type               string
+	Status             bool
+	Reason             string
+	LastTransitionTime time.Time
+}
+```
+
+**Step 4: Add Drain/Undrain methods**
+
+```go
+func (s *AgentService) Drain(
+	ctx context.Context,
+	hostname string,
+) (*Response[any], error) {
+	// POST /agent/{hostname}/drain
+}
+
+func (s *AgentService) Undrain(
+	ctx context.Context,
+	hostname string,
+) (*Response[any], error) {
+	// POST /agent/{hostname}/undrain
+}
+```
+
+**Step 5: Run SDK tests**
+
+Run: `go test ./pkg/osapi/... -count=1` Expected: PASS
+
+**Step 6: Commit (in osapi-sdk repo)**
+
+```bash
+git add pkg/osapi/
+git commit -m "feat(agent): add conditions, state, drain/undrain support"
+```
+
+---
+
+## Task 13: Add CONDITIONS column to agent list CLI
+
+**Files:**
+
+- Modify: `cmd/client_agent_list.go`
+
+**Step 1: Add CONDITIONS column**
+
+In the table builder for `agent list`, add a column that joins active condition
+type names:
+
+```go
+conditions := "-"
+if len(agent.Conditions) > 0 {
+	active := make([]string, 0)
+	for _, c := range agent.Conditions {
+		if c.Status {
+			active = append(active, c.Type)
+		}
+	}
+	if len(active) > 0 {
+		conditions = strings.Join(active, ",")
+	}
+}
+```
+
+Headers: `HOSTNAME`, `STATUS`, `CONDITIONS`, `LABELS`, `AGE`, `LOAD`, `OS`
+
+**Step 2: Use State for STATUS column**
+
+Replace hardcoded "Ready" with `agent.State` (defaulting to "Ready" if empty).
+ +**Step 3: Run `go build ./cmd/...`** + +Expected: compiles + +**Step 4: Commit** + +```bash +git add cmd/client_agent_list.go +git commit -m "feat(cli): add CONDITIONS column and state to agent list" +``` + +--- + +## Task 14: Add conditions and timeline to agent get CLI + +**Files:** + +- Modify: `cmd/client_agent_get.go:58-141` + +**Step 1: Add state to agent get output** + +After the Status KV line, display the State: + +```go +if data.State != "" && data.State != "Ready" { + cli.PrintKV("State", data.State) +} +``` + +**Step 2: Add conditions section** + +```go +if len(data.Conditions) > 0 { + condRows := make([][]string, 0, len(data.Conditions)) + for _, c := range data.Conditions { + status := "false" + if c.Status { + status = "true" + } + reason := "" + if c.Reason != "" { + reason = c.Reason + } + since := cli.FormatAge(time.Since(c.LastTransitionTime)) + " ago" + condRows = append(condRows, []string{c.Type, status, reason, since}) + } + sections = append(sections, cli.Section{ + Title: "Conditions", + Headers: []string{"TYPE", "STATUS", "REASON", "SINCE"}, + Rows: condRows, + }) +} +``` + +**Step 3: Add timeline section** + +Same pattern as `DisplayJobDetail` in `internal/cli/ui.go:600-615`: + +```go +if len(data.Timeline) > 0 { + timelineRows := make([][]string, 0, len(data.Timeline)) + for _, te := range data.Timeline { + timelineRows = append(timelineRows, []string{ + te.Timestamp, te.Event, te.Hostname, te.Message, te.Error, + }) + } + sections = append(sections, cli.Section{ + Title: "Timeline", + Headers: []string{"TIMESTAMP", "EVENT", "HOSTNAME", "MESSAGE", "ERROR"}, + Rows: timelineRows, + }) +} +``` + +**Step 4: Run `go build ./cmd/...`** + +Expected: compiles + +**Step 5: Commit** + +```bash +git add cmd/client_agent_get.go +git commit -m "feat(cli): display conditions and timeline in agent get" +``` + +--- + +## Task 15: Add agent drain/undrain CLI commands + +**Files:** + +- Create: `cmd/client_agent_drain.go` +- Create: 
`cmd/client_agent_undrain.go` + +**Step 1: Create drain command** + +```go +// cmd/client_agent_drain.go +var clientAgentDrainCmd = &cobra.Command{ + Use: "drain", + Short: "Drain an agent", + Long: `Stop an agent from accepting new jobs. In-flight jobs complete.`, + Run: func(cmd *cobra.Command, _ []string) { + ctx := cmd.Context() + hostname, _ := cmd.Flags().GetString("hostname") + + resp, err := sdkClient.Agent.Drain(ctx, hostname) + if err != nil { + cli.HandleError(err, logger) + return + } + + if jsonOutput { + fmt.Println(string(resp.RawJSON())) + return + } + + fmt.Printf("Agent %s drain initiated\n", hostname) + }, +} +``` + +**Step 2: Create undrain command** + +Similar pattern for `undrain`. + +**Step 3: Register commands** + +```go +func init() { + clientAgentCmd.AddCommand(clientAgentDrainCmd) + clientAgentDrainCmd.Flags().String("hostname", "", "Hostname of the agent to drain") + _ = clientAgentDrainCmd.MarkFlagRequired("hostname") +} +``` + +**Step 4: Run `go build ./cmd/...`** + +Expected: compiles + +**Step 5: Commit** + +```bash +git add cmd/client_agent_drain.go cmd/client_agent_undrain.go +git commit -m "feat(cli): add agent drain and undrain commands" +``` + +--- + +## Task 16: Update documentation + +**Files:** + +- Modify: `docs/docs/sidebar/features/agent-management.md` (or create) +- Modify: `docs/docs/sidebar/usage/configuration.md` +- Modify: `docs/docs/sidebar/usage/cli/client/agent/` + +**Step 1: Add conditions and drain docs** + +Document: + +- Condition types and thresholds +- Drain lifecycle (Ready → Draining → Cordoned) +- CLI commands (`agent drain`, `agent undrain`) +- Configuration section for `agent.conditions` + +**Step 2: Update permission table** + +Add `agent:write` to the permissions table in configuration.md. 
+ +**Step 3: Commit** + +```bash +git add docs/ +git commit -m "docs: add node conditions and agent drain documentation" +``` + +--- + +## Task 17: Final verification + +**Step 1: Regenerate** + +Run: `just generate` Expected: no diff + +**Step 2: Build** + +Run: `go build ./...` Expected: compiles + +**Step 3: Unit tests** + +Run: `just go::unit` Expected: PASS + +**Step 4: Lint** + +Run: `just go::vet` Expected: clean + +**Step 5: Coverage check** + +Run: +`go test -coverprofile=coverage.out ./internal/agent/... ./internal/job/client/... ./internal/api/agent/...` +Expected: condition.go, drain.go, agent_drain.go at 100% + +--- + +## Verification + +```bash +just generate # regenerate specs + code +go build ./... # compiles +just go::unit # tests pass +just go::vet # lint passes +``` diff --git a/go.mod b/go.mod index e8c53583..b53e2db0 100644 --- a/go.mod +++ b/go.mod @@ -18,7 +18,7 @@ require ( github.com/oapi-codegen/runtime v1.2.0 github.com/osapi-io/nats-client v0.0.0-20260222233639-d0822e0a4b86 github.com/osapi-io/nats-server v0.0.0-20260216201410-1f33dfc63848 - github.com/osapi-io/osapi-sdk v0.0.0-20260305004213-6ad316fa4505 + github.com/osapi-io/osapi-sdk v0.0.0-20260306002247-11cb3395b3f9 github.com/prometheus-community/pro-bing v0.8.0 github.com/prometheus/client_golang v1.23.2 github.com/samber/slog-echo v1.21.0 diff --git a/go.sum b/go.sum index f8ce5a76..30b725b2 100644 --- a/go.sum +++ b/go.sum @@ -755,8 +755,8 @@ github.com/osapi-io/nats-client v0.0.0-20260222233639-d0822e0a4b86 h1:ML0fdgr0M4 github.com/osapi-io/nats-client v0.0.0-20260222233639-d0822e0a4b86/go.mod h1:TQqODOjF2JuAOFrLtm1ItsMzPPAizKfHo+grOMuPDyE= github.com/osapi-io/nats-server v0.0.0-20260216201410-1f33dfc63848 h1:ELW1sTVBn5JIc17mHgd5fhpO3/7btaxJpxykG2Fe0U4= github.com/osapi-io/nats-server v0.0.0-20260216201410-1f33dfc63848/go.mod h1:4rzeY9jiJF/+Ej4WNwqK5HQ2sflZrEs60GxQpg3Iya8= -github.com/osapi-io/osapi-sdk v0.0.0-20260305004213-6ad316fa4505 
h1:J7Wv551BG39Ma9LLWxvZgsaWVNkP5TkteHzExSjt9e4= -github.com/osapi-io/osapi-sdk v0.0.0-20260305004213-6ad316fa4505/go.mod h1:5Y45ymBR4BcxJTOJ7WhqYTDHXxtlQRW7Sr3G52pfMdI= +github.com/osapi-io/osapi-sdk v0.0.0-20260306002247-11cb3395b3f9 h1:v7MKMVLktP3FotS5josRw5DlOKEsIwOQFAj2cd04VwE= +github.com/osapi-io/osapi-sdk v0.0.0-20260306002247-11cb3395b3f9/go.mod h1:gL9oHgIkG+VMazSIXO4Nvwd3IXEuzRvuXstGiphSycc= github.com/otiai10/copy v1.2.0/go.mod h1:rrF5dJ5F0t/EWSYODDu4j9/vEeYHMkc8jt0zJChqQWw= github.com/otiai10/copy v1.14.0 h1:dCI/t1iTdYGtkvCuBG2BgR6KZa83PTclw4U5n2wAllU= github.com/otiai10/copy v1.14.0/go.mod h1:ECfuL02W+/FkTWZWgQqXPWZgW9oeKCSQ5qVfSc4qc4w= diff --git a/internal/agent/condition.go b/internal/agent/condition.go new file mode 100644 index 00000000..db786c99 --- /dev/null +++ b/internal/agent/condition.go @@ -0,0 +1,138 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+ +package agent + +import ( + "fmt" + "time" + + "github.com/retr0h/osapi/internal/job" + "github.com/retr0h/osapi/internal/provider/node/disk" + "github.com/retr0h/osapi/internal/provider/node/load" + "github.com/retr0h/osapi/internal/provider/node/mem" +) + +// findPrevCondition returns the previous condition of the given type, +// or nil if not found. +func findPrevCondition( + condType string, + prev []job.Condition, +) *job.Condition { + for i := range prev { + if prev[i].Type == condType { + return &prev[i] + } + } + return nil +} + +// transitionTime returns the previous LastTransitionTime if status +// hasn't changed, otherwise returns now. +func transitionTime( + condType string, + newStatus bool, + prev []job.Condition, +) time.Time { + if p := findPrevCondition(condType, prev); p != nil { + if p.Status == newStatus { + return p.LastTransitionTime + } + } + return time.Now() +} + +func evaluateMemoryPressure( + stats *mem.Stats, + threshold int, + prev []job.Condition, +) job.Condition { + c := job.Condition{Type: job.ConditionMemoryPressure} + if stats == nil || stats.Total == 0 { + c.LastTransitionTime = transitionTime(c.Type, false, prev) + return c + } + used := stats.Total - stats.Available + pct := float64(used) / float64(stats.Total) * 100 + c.Status = pct > float64(threshold) + if c.Status { + c.Reason = fmt.Sprintf( + "memory %.0f%% used (%.1f/%.1f GB)", + pct, + float64(used)/1024/1024/1024, + float64(stats.Total)/1024/1024/1024, + ) + } + c.LastTransitionTime = transitionTime(c.Type, c.Status, prev) + return c +} + +func evaluateHighLoad( + loadAvg *load.AverageStats, + cpuCount int, + multiplier float64, + prev []job.Condition, +) job.Condition { + c := job.Condition{Type: job.ConditionHighLoad} + if loadAvg == nil || cpuCount == 0 { + c.LastTransitionTime = transitionTime(c.Type, false, prev) + return c + } + threshold := float64(cpuCount) * multiplier + c.Status = float64(loadAvg.Load1) > threshold + if c.Status { + c.Reason = fmt.Sprintf( 
+ "load %.2f, threshold %.2f for %d CPUs", + loadAvg.Load1, threshold, cpuCount, + ) + } + c.LastTransitionTime = transitionTime(c.Type, c.Status, prev) + return c +} + +func evaluateDiskPressure( + disks []disk.UsageStats, + threshold int, + prev []job.Condition, +) job.Condition { + c := job.Condition{Type: job.ConditionDiskPressure} + if len(disks) == 0 { + c.LastTransitionTime = transitionTime(c.Type, false, prev) + return c + } + for _, d := range disks { + if d.Total == 0 { + continue + } + pct := float64(d.Used) / float64(d.Total) * 100 + if pct > float64(threshold) { + c.Status = true + c.Reason = fmt.Sprintf( + "%s %.0f%% used (%.1f/%.1f GB)", + d.Name, pct, + float64(d.Used)/1024/1024/1024, + float64(d.Total)/1024/1024/1024, + ) + break + } + } + c.LastTransitionTime = transitionTime(c.Type, c.Status, prev) + return c +} diff --git a/internal/agent/condition_test.go b/internal/agent/condition_test.go new file mode 100644 index 00000000..720c971e --- /dev/null +++ b/internal/agent/condition_test.go @@ -0,0 +1,619 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +package agent + +import ( + "testing" + "time" + + "github.com/stretchr/testify/suite" + + "github.com/retr0h/osapi/internal/job" + "github.com/retr0h/osapi/internal/provider/node/disk" + "github.com/retr0h/osapi/internal/provider/node/load" + "github.com/retr0h/osapi/internal/provider/node/mem" +) + +type ConditionTestSuite struct { + suite.Suite +} + +func (s *ConditionTestSuite) TestFindPrevCondition() { + tests := []struct { + name string + condType string + prev []job.Condition + validateFunc func(*job.Condition) + }{ + { + name: "when condition type is found returns pointer", + condType: job.ConditionMemoryPressure, + prev: []job.Condition{ + { + Type: job.ConditionMemoryPressure, + Status: true, + Reason: "high", + }, + { + Type: job.ConditionHighLoad, + Status: false, + }, + }, + validateFunc: func(c *job.Condition) { + s.Require().NotNil(c) + s.Equal(job.ConditionMemoryPressure, c.Type) + s.True(c.Status) + s.Equal("high", c.Reason) + }, + }, + { + name: "when condition type is not found returns nil", + condType: job.ConditionDiskPressure, + prev: []job.Condition{ + { + Type: job.ConditionMemoryPressure, + Status: true, + }, + }, + validateFunc: func(c *job.Condition) { + s.Nil(c) + }, + }, + { + name: "when prev is empty returns nil", + condType: job.ConditionHighLoad, + prev: []job.Condition{}, + validateFunc: func(c *job.Condition) { + s.Nil(c) + }, + }, + { + name: "when prev is nil returns nil", + condType: job.ConditionHighLoad, + prev: nil, + validateFunc: func(c *job.Condition) { + s.Nil(c) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + result := findPrevCondition(tt.condType, tt.prev) + tt.validateFunc(result) + }) + } +} + +func (s 
*ConditionTestSuite) TestTransitionTime() { + fixedTime := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) + + tests := []struct { + name string + condType string + newStatus bool + prev []job.Condition + validateFunc func(time.Time) + }{ + { + name: "when matching prev has same status preserves transition time", + condType: job.ConditionHighLoad, + newStatus: true, + prev: []job.Condition{ + { + Type: job.ConditionHighLoad, + Status: true, + LastTransitionTime: fixedTime, + }, + }, + validateFunc: func(t time.Time) { + s.Equal(fixedTime, t) + }, + }, + { + name: "when matching prev has different status returns now", + condType: job.ConditionHighLoad, + newStatus: true, + prev: []job.Condition{ + { + Type: job.ConditionHighLoad, + Status: false, + LastTransitionTime: fixedTime, + }, + }, + validateFunc: func(t time.Time) { + s.NotEqual(fixedTime, t) + s.WithinDuration(time.Now(), t, 2*time.Second) + }, + }, + { + name: "when no matching prev returns now", + condType: job.ConditionDiskPressure, + newStatus: true, + prev: []job.Condition{ + { + Type: job.ConditionHighLoad, + Status: true, + LastTransitionTime: fixedTime, + }, + }, + validateFunc: func(t time.Time) { + s.WithinDuration(time.Now(), t, 2*time.Second) + }, + }, + { + name: "when prev is empty returns now", + condType: job.ConditionHighLoad, + newStatus: false, + prev: []job.Condition{}, + validateFunc: func(t time.Time) { + s.WithinDuration(time.Now(), t, 2*time.Second) + }, + }, + { + name: "when prev is nil returns now", + condType: job.ConditionHighLoad, + newStatus: false, + prev: nil, + validateFunc: func(t time.Time) { + s.WithinDuration(time.Now(), t, 2*time.Second) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + result := transitionTime(tt.condType, tt.newStatus, tt.prev) + tt.validateFunc(result) + }) + } +} + +func (s *ConditionTestSuite) TestEvaluateMemoryPressure() { + tests := []struct { + name string + stats *mem.Stats + threshold int + prev []job.Condition + 
validateFunc func(job.Condition) + }{ + { + name: "when usage above threshold returns true with reason", + stats: &mem.Stats{ + Total: 8 * 1024 * 1024 * 1024, // 8 GB + Available: 1 * 1024 * 1024 * 1024, // 1 GB available = 87.5% used + }, + threshold: 80, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionMemoryPressure, c.Type) + s.True(c.Status) + s.Contains(c.Reason, "memory") + s.Contains(c.Reason, "88%") + s.Contains(c.Reason, "GB") + }, + }, + { + name: "when usage below threshold returns false", + stats: &mem.Stats{ + Total: 8 * 1024 * 1024 * 1024, // 8 GB + Available: 6 * 1024 * 1024 * 1024, // 6 GB available = 25% used + }, + threshold: 80, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionMemoryPressure, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when stats is nil returns false", + stats: nil, + threshold: 80, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionMemoryPressure, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when total is zero returns false", + stats: &mem.Stats{ + Total: 0, + Available: 0, + }, + threshold: 80, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionMemoryPressure, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when usage exactly at threshold returns false", + stats: &mem.Stats{ + Total: 100, + Available: 20, // 80% used, threshold is 80 (> not >=) + }, + threshold: 80, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionMemoryPressure, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + result := evaluateMemoryPressure(tt.stats, tt.threshold, tt.prev) + tt.validateFunc(result) + }) + } +} + +func (s *ConditionTestSuite) TestEvaluateHighLoad() { + tests := []struct { + name string + loadAvg *load.AverageStats + cpuCount int + multiplier float64 + prev 
[]job.Condition + validateFunc func(job.Condition) + }{ + { + name: "when load above threshold returns true with reason", + loadAvg: &load.AverageStats{ + Load1: 8.5, + Load5: 7.0, + Load15: 6.0, + }, + cpuCount: 4, + multiplier: 2.0, // threshold = 8.0 + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionHighLoad, c.Type) + s.True(c.Status) + s.Contains(c.Reason, "load 8.50") + s.Contains(c.Reason, "threshold 8.00") + s.Contains(c.Reason, "4 CPUs") + }, + }, + { + name: "when load below threshold returns false", + loadAvg: &load.AverageStats{ + Load1: 2.0, + Load5: 1.5, + Load15: 1.0, + }, + cpuCount: 4, + multiplier: 2.0, // threshold = 8.0 + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionHighLoad, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when load is nil returns false", + loadAvg: nil, + cpuCount: 4, + multiplier: 2.0, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionHighLoad, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when cpu count is zero returns false", + loadAvg: &load.AverageStats{ + Load1: 8.5, + Load5: 7.0, + Load15: 6.0, + }, + cpuCount: 0, + multiplier: 2.0, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionHighLoad, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when load exactly at threshold returns false", + loadAvg: &load.AverageStats{ + Load1: 8.0, + Load5: 5.0, + Load15: 3.0, + }, + cpuCount: 4, + multiplier: 2.0, // threshold = 8.0, Load1 = 8.0 (not >) + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionHighLoad, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + result := evaluateHighLoad(tt.loadAvg, tt.cpuCount, tt.multiplier, tt.prev) + tt.validateFunc(result) + }) + } +} + +func (s *ConditionTestSuite) TestEvaluateDiskPressure() { + tests := []struct { + name string + 
disks []disk.UsageStats + threshold int + prev []job.Condition + validateFunc func(job.Condition) + }{ + { + name: "when one disk above threshold returns true", + disks: []disk.UsageStats{ + { + Name: "/dev/sda1", + Total: 100 * 1024 * 1024 * 1024, // 100 GB + Used: 95 * 1024 * 1024 * 1024, // 95 GB = 95% + Free: 5 * 1024 * 1024 * 1024, + }, + }, + threshold: 90, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionDiskPressure, c.Type) + s.True(c.Status) + s.Contains(c.Reason, "/dev/sda1") + s.Contains(c.Reason, "95%") + s.Contains(c.Reason, "GB") + }, + }, + { + name: "when all disks below threshold returns false", + disks: []disk.UsageStats{ + { + Name: "/dev/sda1", + Total: 100 * 1024 * 1024 * 1024, + Used: 50 * 1024 * 1024 * 1024, // 50% + Free: 50 * 1024 * 1024 * 1024, + }, + { + Name: "/dev/sdb1", + Total: 200 * 1024 * 1024 * 1024, + Used: 60 * 1024 * 1024 * 1024, // 30% + Free: 140 * 1024 * 1024 * 1024, + }, + }, + threshold: 90, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionDiskPressure, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when disks is nil returns false", + disks: nil, + threshold: 90, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionDiskPressure, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when disks is empty returns false", + disks: []disk.UsageStats{}, + threshold: 90, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionDiskPressure, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when disk total is zero skips it", + disks: []disk.UsageStats{ + { + Name: "/dev/sda1", + Total: 0, + Used: 0, + Free: 0, + }, + }, + threshold: 90, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionDiskPressure, c.Type) + s.False(c.Status) + s.Empty(c.Reason) + }, + }, + { + name: "when second disk is above threshold reports it", + disks: []disk.UsageStats{ + { + 
Name: "/dev/sda1", + Total: 100 * 1024 * 1024 * 1024, + Used: 50 * 1024 * 1024 * 1024, // 50% + Free: 50 * 1024 * 1024 * 1024, + }, + { + Name: "/dev/sdb1", + Total: 200 * 1024 * 1024 * 1024, + Used: 195 * 1024 * 1024 * 1024, // 97.5% + Free: 5 * 1024 * 1024 * 1024, + }, + }, + threshold: 90, + prev: nil, + validateFunc: func(c job.Condition) { + s.Equal(job.ConditionDiskPressure, c.Type) + s.True(c.Status) + s.Contains(c.Reason, "/dev/sdb1") + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + result := evaluateDiskPressure(tt.disks, tt.threshold, tt.prev) + tt.validateFunc(result) + }) + } +} + +func (s *ConditionTestSuite) TestLastTransitionTimeTracking() { + fixedPast := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) + + tests := []struct { + name string + evalFunc func([]job.Condition) job.Condition + prev []job.Condition + validateFunc func(job.Condition) + }{ + { + name: "when status flips from false to true transition time updates", + evalFunc: func(prev []job.Condition) job.Condition { + return evaluateMemoryPressure( + &mem.Stats{ + Total: 100, + Available: 10, // 90% used + }, + 80, + prev, + ) + }, + prev: []job.Condition{ + { + Type: job.ConditionMemoryPressure, + Status: false, + LastTransitionTime: fixedPast, + }, + }, + validateFunc: func(c job.Condition) { + s.True(c.Status) + s.NotEqual(fixedPast, c.LastTransitionTime) + s.WithinDuration(time.Now(), c.LastTransitionTime, 2*time.Second) + }, + }, + { + name: "when status stays true transition time is preserved", + evalFunc: func(prev []job.Condition) job.Condition { + return evaluateMemoryPressure( + &mem.Stats{ + Total: 100, + Available: 10, // 90% used + }, + 80, + prev, + ) + }, + prev: []job.Condition{ + { + Type: job.ConditionMemoryPressure, + Status: true, + LastTransitionTime: fixedPast, + }, + }, + validateFunc: func(c job.Condition) { + s.True(c.Status) + s.Equal(fixedPast, c.LastTransitionTime) + }, + }, + { + name: "when status flips from true to false transition time 
updates", + evalFunc: func(prev []job.Condition) job.Condition { + return evaluateMemoryPressure( + &mem.Stats{ + Total: 100, + Available: 80, // 20% used + }, + 80, + prev, + ) + }, + prev: []job.Condition{ + { + Type: job.ConditionMemoryPressure, + Status: true, + LastTransitionTime: fixedPast, + }, + }, + validateFunc: func(c job.Condition) { + s.False(c.Status) + s.NotEqual(fixedPast, c.LastTransitionTime) + s.WithinDuration(time.Now(), c.LastTransitionTime, 2*time.Second) + }, + }, + { + name: "when status stays false transition time is preserved", + evalFunc: func(prev []job.Condition) job.Condition { + return evaluateMemoryPressure( + &mem.Stats{ + Total: 100, + Available: 80, // 20% used + }, + 80, + prev, + ) + }, + prev: []job.Condition{ + { + Type: job.ConditionMemoryPressure, + Status: false, + LastTransitionTime: fixedPast, + }, + }, + validateFunc: func(c job.Condition) { + s.False(c.Status) + s.Equal(fixedPast, c.LastTransitionTime) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + result := tt.evalFunc(tt.prev) + tt.validateFunc(result) + }) + } +} + +func TestConditionTestSuite(t *testing.T) { + suite.Run(t, new(ConditionTestSuite)) +} diff --git a/internal/agent/consumer.go b/internal/agent/consumer.go index 532b1c72..03b95168 100644 --- a/internal/agent/consumer.go +++ b/internal/agent/consumer.go @@ -102,14 +102,14 @@ func (a *Agent) consumeQueryJobs( continue } - a.wg.Add(1) + a.consumerWg.Add(1) go func(c struct { name string filter string queueGroup string }, ) { - defer a.wg.Done() + defer a.consumerWg.Done() opts := &natsclient.ConsumeOptions{ QueueGroup: c.queueGroup, @@ -194,14 +194,14 @@ func (a *Agent) consumeModifyJobs( continue } - a.wg.Add(1) + a.consumerWg.Add(1) go func(c struct { name string filter string queueGroup string }, ) { - defer a.wg.Done() + defer a.consumerWg.Done() opts := &natsclient.ConsumeOptions{ QueueGroup: c.queueGroup, @@ -222,6 +222,21 @@ func (a *Agent) consumeModifyJobs( return nil } 
+// startConsumers creates a consumer context and starts all job consumers. +func (a *Agent) startConsumers() { + a.consumerCtx, a.consumerCancel = context.WithCancel(a.ctx) + _ = a.consumeQueryJobs(a.consumerCtx, a.hostname) + _ = a.consumeModifyJobs(a.consumerCtx, a.hostname) +} + +// stopConsumers cancels the consumer context and waits for all consumer +// goroutines to finish. After this returns, the agent is no longer +// receiving new jobs. +func (a *Agent) stopConsumers() { + a.consumerCancel() + a.consumerWg.Wait() +} + // handleJobMessageJS wraps the existing handleJobMessage for JetStream compatibility. func (a *Agent) handleJobMessageJS( msg jetstream.Msg, diff --git a/internal/agent/drain.go b/internal/agent/drain.go new file mode 100644 index 00000000..47105637 --- /dev/null +++ b/internal/agent/drain.go @@ -0,0 +1,70 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+
+package agent
+
+import (
+	"context"
+
+	"github.com/retr0h/osapi/internal/job"
+)
+
+// checkDrainFlag checks the drain flag via the job client. Drain flags
+// are stored in the main KV bucket (longer TTL than registry).
+func (a *Agent) checkDrainFlag(
+	ctx context.Context,
+	hostname string,
+) bool {
+	return a.jobClient.CheckDrainFlag(ctx, hostname)
+}
+
+// handleDrainDetection checks drain flag on each heartbeat tick.
+// When drain is requested and the agent is Ready, it stops all job
+// consumers (waiting for in-flight handlers) and transitions to Cordoned.
+// When the drain flag is removed and the agent is Draining or Cordoned,
+// it restarts consumers and transitions back to Ready.
+func (a *Agent) handleDrainDetection(
+	ctx context.Context,
+	hostname string,
+) {
+	drainRequested := a.checkDrainFlag(ctx, hostname)
+
+	switch {
+	case drainRequested && a.state == job.AgentStateReady:
+		a.logger.Info("drain detected, stopping job consumption")
+		a.stopConsumers()
+		a.state = job.AgentStateCordoned
+		a.logger.Info("all consumers stopped, agent cordoned")
+		_ = a.jobClient.WriteAgentTimelineEvent(
+			ctx, hostname, "drain", "Drain initiated",
+		)
+		_ = a.jobClient.WriteAgentTimelineEvent(
+			ctx, hostname, "cordoned", "All jobs completed",
+		)
+
+	case !drainRequested && (a.state == job.AgentStateDraining || a.state == job.AgentStateCordoned):
+		a.logger.Info("undrain detected, resuming job consumption")
+		a.startConsumers()
+		a.state = job.AgentStateReady
+		_ = a.jobClient.WriteAgentTimelineEvent(
+			ctx, hostname, "undrain", "Resumed accepting jobs",
+		)
+	}
+}
diff --git a/internal/agent/drain_test.go b/internal/agent/drain_test.go
new file mode 100644
index 00000000..4d746a4f
--- /dev/null
+++ b/internal/agent/drain_test.go
@@ -0,0 +1,253 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software
without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +package agent + +import ( + "context" + "log/slog" + "testing" + + "github.com/golang/mock/gomock" + "github.com/spf13/afero" + "github.com/stretchr/testify/suite" + + "github.com/retr0h/osapi/internal/config" + "github.com/retr0h/osapi/internal/job" + "github.com/retr0h/osapi/internal/job/mocks" + commandMocks "github.com/retr0h/osapi/internal/provider/command/mocks" + dnsMocks "github.com/retr0h/osapi/internal/provider/network/dns/mocks" + netinfoMocks "github.com/retr0h/osapi/internal/provider/network/netinfo/mocks" + pingMocks "github.com/retr0h/osapi/internal/provider/network/ping/mocks" + diskMocks "github.com/retr0h/osapi/internal/provider/node/disk/mocks" + hostMocks "github.com/retr0h/osapi/internal/provider/node/host/mocks" + loadMocks "github.com/retr0h/osapi/internal/provider/node/load/mocks" + memMocks "github.com/retr0h/osapi/internal/provider/node/mem/mocks" +) + +type DrainTestSuite struct { + suite.Suite + + mockCtrl *gomock.Controller + mockJobClient *mocks.MockJobClient + mockKV *mocks.MockKeyValue + mockEntry *mocks.MockKeyValueEntry + agent *Agent +} + 
+func (s *DrainTestSuite) SetupTest() { + s.mockCtrl = gomock.NewController(s.T()) + s.mockJobClient = mocks.NewMockJobClient(s.mockCtrl) + s.mockKV = mocks.NewMockKeyValue(s.mockCtrl) + s.mockEntry = mocks.NewMockKeyValueEntry(s.mockCtrl) + + appConfig := config.Config{ + Agent: config.AgentConfig{ + Labels: map[string]string{"group": "web"}, + }, + } + + s.agent = New( + afero.NewMemMapFs(), + appConfig, + slog.Default(), + s.mockJobClient, + "test-stream", + hostMocks.NewDefaultMockProvider(s.mockCtrl), + diskMocks.NewDefaultMockProvider(s.mockCtrl), + memMocks.NewDefaultMockProvider(s.mockCtrl), + loadMocks.NewDefaultMockProvider(s.mockCtrl), + dnsMocks.NewDefaultMockProvider(s.mockCtrl), + pingMocks.NewDefaultMockProvider(s.mockCtrl), + netinfoMocks.NewDefaultMockProvider(s.mockCtrl), + commandMocks.NewDefaultMockProvider(s.mockCtrl), + s.mockKV, + nil, + ) + s.agent.state = job.AgentStateReady + s.agent.ctx, s.agent.cancel = context.WithCancel(context.Background()) + s.agent.consumerCtx, s.agent.consumerCancel = context.WithCancel(s.agent.ctx) +} + +func (s *DrainTestSuite) TearDownTest() { + s.mockCtrl.Finish() +} + +func (s *DrainTestSuite) TestCheckDrainFlag() { + tests := []struct { + name string + setupMock func() + validateFunc func(bool) + }{ + { + name: "when drain key exists returns true", + setupMock: func() { + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). + Return(true) + }, + validateFunc: func(result bool) { + s.True(result) + }, + }, + { + name: "when drain key missing returns false", + setupMock: func() { + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). 
+ Return(false) + }, + validateFunc: func(result bool) { + s.False(result) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + tt.setupMock() + result := s.agent.checkDrainFlag(context.Background(), "test-agent") + tt.validateFunc(result) + }) + } +} + +func (s *DrainTestSuite) TestHandleDrainDetection() { + tests := []struct { + name string + initialState string + setupMock func() + expectedState string + }{ + { + name: "when drain flag set and agent is Ready transitions to Cordoned", + initialState: job.AgentStateReady, + setupMock: func() { + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). + Return(true) + s.mockJobClient.EXPECT(). + WriteAgentTimelineEvent( + gomock.Any(), + "test-agent", + "drain", + "Drain initiated", + ). + Return(nil) + s.mockJobClient.EXPECT(). + WriteAgentTimelineEvent( + gomock.Any(), + "test-agent", + "cordoned", + "All jobs completed", + ). + Return(nil) + }, + expectedState: job.AgentStateCordoned, + }, + { + name: "when drain flag removed and agent is Draining transitions to Ready", + initialState: job.AgentStateDraining, + setupMock: func() { + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). + Return(false) + s.mockJobClient.EXPECT(). + WriteAgentTimelineEvent( + gomock.Any(), + "test-agent", + "undrain", + "Resumed accepting jobs", + ). + Return(nil) + // startConsumers re-creates consumers + s.mockJobClient.EXPECT(). + CreateOrUpdateConsumer(gomock.Any(), gomock.Any(), gomock.Any()). + Return(nil). + AnyTimes() + s.mockJobClient.EXPECT(). + ConsumeJobs(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + Return(context.Canceled). + AnyTimes() + }, + expectedState: job.AgentStateReady, + }, + { + name: "when drain flag removed and agent is Cordoned transitions to Ready", + initialState: job.AgentStateCordoned, + setupMock: func() { + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). 
+ Return(false) + s.mockJobClient.EXPECT(). + WriteAgentTimelineEvent( + gomock.Any(), + "test-agent", + "undrain", + "Resumed accepting jobs", + ). + Return(nil) + // startConsumers re-creates consumers + s.mockJobClient.EXPECT(). + CreateOrUpdateConsumer(gomock.Any(), gomock.Any(), gomock.Any()). + Return(nil). + AnyTimes() + s.mockJobClient.EXPECT(). + ConsumeJobs(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()). + Return(context.Canceled). + AnyTimes() + }, + expectedState: job.AgentStateReady, + }, + { + name: "when drain flag still set and agent is already Draining stays Draining", + initialState: job.AgentStateDraining, + setupMock: func() { + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). + Return(true) + }, + expectedState: job.AgentStateDraining, + }, + { + name: "when no drain flag and agent is Ready stays Ready", + initialState: job.AgentStateReady, + setupMock: func() { + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). 
+ Return(false) + }, + expectedState: job.AgentStateReady, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + s.agent.state = tt.initialState + tt.setupMock() + s.agent.handleDrainDetection(context.Background(), "test-agent") + s.Equal(tt.expectedState, s.agent.state) + }) + } +} + +func TestDrainTestSuite(t *testing.T) { + suite.Run(t, new(DrainTestSuite)) +} diff --git a/internal/agent/facts.go b/internal/agent/facts.go index d0642479..cac82974 100644 --- a/internal/agent/facts.go +++ b/internal/agent/facts.go @@ -85,6 +85,7 @@ func (a *Agent) writeFacts( if count, err := a.hostProvider.GetCPUCount(); err == nil { reg.CPUCount = count + a.cpuCount = count } if mgr, err := a.hostProvider.GetServiceManager(); err == nil { diff --git a/internal/agent/heartbeat.go b/internal/agent/heartbeat.go index f469e35e..959d7814 100644 --- a/internal/agent/heartbeat.go +++ b/internal/agent/heartbeat.go @@ -27,6 +27,9 @@ import ( "time" "github.com/retr0h/osapi/internal/job" + "github.com/retr0h/osapi/internal/provider/node/disk" + "github.com/retr0h/osapi/internal/provider/node/load" + "github.com/retr0h/osapi/internal/provider/node/mem" ) // heartbeatInterval is the interval between heartbeat refreshes. 
@@ -89,11 +92,14 @@ func (a *Agent) writeRegistration( ctx context.Context, hostname string, ) { + a.handleDrainDetection(ctx, hostname) + reg := job.AgentRegistration{ Hostname: hostname, Labels: a.appConfig.Agent.Labels, RegisteredAt: time.Now(), StartedAt: a.startedAt, + State: a.state, } if info, err := a.hostProvider.GetOSInfo(); err == nil { @@ -104,14 +110,44 @@ func (a *Agent) writeRegistration( reg.Uptime = uptime } + var loadAvg *load.AverageStats if avg, err := a.loadProvider.GetAverageStats(); err == nil { + loadAvg = avg reg.LoadAverages = avg } + var memStats *mem.Stats if stats, err := a.memProvider.GetStats(); err == nil { + memStats = stats reg.MemoryStats = stats } + var diskStats []disk.UsageStats + if stats, err := a.diskProvider.GetLocalUsageStats(); err == nil { + diskStats = stats + } + + conditions := []job.Condition{ + evaluateMemoryPressure( + memStats, + a.appConfig.Agent.Conditions.MemoryPressureThreshold, + a.prevConditions, + ), + evaluateHighLoad( + loadAvg, + a.cpuCount, + a.appConfig.Agent.Conditions.HighLoadMultiplier, + a.prevConditions, + ), + evaluateDiskPressure( + diskStats, + a.appConfig.Agent.Conditions.DiskPressureThreshold, + a.prevConditions, + ), + } + a.prevConditions = conditions + reg.Conditions = conditions + data, err := marshalJSON(reg) if err != nil { a.logger.Warn( diff --git a/internal/agent/heartbeat_public_test.go b/internal/agent/heartbeat_public_test.go index d607d56f..93db8d22 100644 --- a/internal/agent/heartbeat_public_test.go +++ b/internal/agent/heartbeat_public_test.go @@ -100,6 +100,12 @@ func (s *HeartbeatPublicTestSuite) TestStartWithHeartbeat() { { name: "when registryKV is set registers and deregisters", setupFunc: func() *agent.Agent { + // Drain check on each heartbeat tick (no drain flag present) + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). + Return(false). + AnyTimes() + // Heartbeat initial write s.mockKV.EXPECT(). 
Put(gomock.Any(), "agents.test_agent", gomock.Any()). diff --git a/internal/agent/heartbeat_test.go b/internal/agent/heartbeat_test.go index eacc6ef5..fb64492f 100644 --- a/internal/agent/heartbeat_test.go +++ b/internal/agent/heartbeat_test.go @@ -34,6 +34,7 @@ import ( "github.com/stretchr/testify/suite" "github.com/retr0h/osapi/internal/config" + "github.com/retr0h/osapi/internal/job" "github.com/retr0h/osapi/internal/job/mocks" commandMocks "github.com/retr0h/osapi/internal/provider/command/mocks" dnsMocks "github.com/retr0h/osapi/internal/provider/network/dns/mocks" @@ -83,6 +84,14 @@ func (s *HeartbeatTestSuite) SetupTest() { s.mockKV, nil, ) + s.agent.state = job.AgentStateReady + + // writeRegistration now calls handleDrainDetection which checks drain flag. + // Default: no drain flag present. + s.mockJobClient.EXPECT(). + CheckDrainFlag(gomock.Any(), "test-agent"). + Return(false). + AnyTimes() } func (s *HeartbeatTestSuite) TearDownTest() { diff --git a/internal/agent/server.go b/internal/agent/server.go index c1c0ec18..396b75b8 100644 --- a/internal/agent/server.go +++ b/internal/agent/server.go @@ -32,30 +32,29 @@ import ( func (a *Agent) Start() { a.ctx, a.cancel = context.WithCancel(context.Background()) a.startedAt = time.Now() + a.state = job.AgentStateReady a.logger.Info("starting node agent") // Determine agent hostname (GetAgentHostname always succeeds) - hostname, _ := job.GetAgentHostname(a.appConfig.Agent.Hostname) + a.hostname, _ = job.GetAgentHostname(a.appConfig.Agent.Hostname) a.logger.Info( "agent configuration", - slog.String("hostname", hostname), + slog.String("hostname", a.hostname), slog.String("queue_group", a.appConfig.Agent.QueueGroup), slog.Int("max_jobs", a.appConfig.Agent.MaxJobs), slog.Any("labels", a.appConfig.Agent.Labels), ) // Register in agent registry and start heartbeat keepalive. - a.startHeartbeat(a.ctx, hostname) + a.startHeartbeat(a.ctx, a.hostname) // Collect and publish system facts. 
- a.startFacts(a.ctx, hostname) + a.startFacts(a.ctx, a.hostname) // Start consuming messages for different job types. - // Each consume function spawns goroutines tracked by a.wg. - _ = a.consumeQueryJobs(a.ctx, hostname) - _ = a.consumeModifyJobs(a.ctx, hostname) + a.startConsumers() a.logger.Info("node agent started successfully") } @@ -70,6 +69,7 @@ func (a *Agent) Stop( done := make(chan struct{}) go func() { + a.consumerWg.Wait() a.wg.Wait() close(done) }() diff --git a/internal/agent/types.go b/internal/agent/types.go index 5e97581c..e3b31e01 100644 --- a/internal/agent/types.go +++ b/internal/agent/types.go @@ -30,6 +30,7 @@ import ( "github.com/spf13/afero" "github.com/retr0h/osapi/internal/config" + "github.com/retr0h/osapi/internal/job" "github.com/retr0h/osapi/internal/job/client" "github.com/retr0h/osapi/internal/provider/command" "github.com/retr0h/osapi/internal/provider/network/dns" @@ -74,10 +75,27 @@ type Agent struct { // startedAt records when the agent process started. startedAt time.Time + // prevConditions tracks condition state between heartbeats. + prevConditions []job.Condition + + // cpuCount cached from facts for HighLoad evaluation. + cpuCount int + + // state is the agent's scheduling state (Ready, Draining, Cordoned). + state string + + // hostname cached from Start for drain/undrain resubscribe. + hostname string + // Lifecycle management ctx context.Context cancel context.CancelFunc wg sync.WaitGroup + + // Consumer lifecycle for drain/undrain. + consumerCtx context.Context + consumerCancel context.CancelFunc + consumerWg sync.WaitGroup } // JobContext contains the context and data for a single job execution. 
diff --git a/internal/api/agent/agent_drain.go b/internal/api/agent/agent_drain.go new file mode 100644 index 00000000..ee3c1276 --- /dev/null +++ b/internal/api/agent/agent_drain.go @@ -0,0 +1,68 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +package agent + +import ( + "context" + "fmt" + "strings" + + "github.com/retr0h/osapi/internal/api/agent/gen" + "github.com/retr0h/osapi/internal/job" +) + +// DrainAgent handles POST /agent/{hostname}/drain. 
+func (a *Agent) DrainAgent( + ctx context.Context, + request gen.DrainAgentRequestObject, +) (gen.DrainAgentResponseObject, error) { + hostname := request.Hostname + + agentInfo, err := a.JobClient.GetAgent(ctx, hostname) + if err != nil { + errMsg := fmt.Sprintf("agent not found: %s", hostname) + return gen.DrainAgent404JSONResponse{Error: &errMsg}, nil + } + + if agentInfo.State == job.AgentStateDraining || agentInfo.State == job.AgentStateCordoned { + errMsg := fmt.Sprintf("agent %s is already in %s state", hostname, agentInfo.State) + return gen.DrainAgent409JSONResponse{Error: &errMsg}, nil + } + + if err := a.JobClient.SetDrainFlag(ctx, hostname); err != nil { + errMsg := fmt.Sprintf("failed to set drain flag: %s", err.Error()) + return gen.DrainAgent409JSONResponse{Error: &errMsg}, nil + } + + if err := a.JobClient.WriteAgentTimelineEvent(ctx, hostname, "drain", "Drain initiated via API"); err != nil { + if strings.Contains(err.Error(), "not found") { + errMsg := fmt.Sprintf("agent not found: %s", hostname) + return gen.DrainAgent404JSONResponse{Error: &errMsg}, nil + } + + errMsg := err.Error() + return gen.DrainAgent409JSONResponse{Error: &errMsg}, nil + } + + msg := fmt.Sprintf("drain initiated for agent %s", hostname) + + return gen.DrainAgent200JSONResponse{Message: msg}, nil +} diff --git a/internal/api/agent/agent_drain_public_test.go b/internal/api/agent/agent_drain_public_test.go new file mode 100644 index 00000000..91c99de5 --- /dev/null +++ b/internal/api/agent/agent_drain_public_test.go @@ -0,0 +1,358 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to 
do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +package agent_test + +import ( + "context" + "fmt" + "log/slog" + "net/http" + "net/http/httptest" + "os" + "testing" + + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/suite" + + "github.com/retr0h/osapi/internal/api" + apiagent "github.com/retr0h/osapi/internal/api/agent" + "github.com/retr0h/osapi/internal/api/agent/gen" + "github.com/retr0h/osapi/internal/authtoken" + "github.com/retr0h/osapi/internal/config" + jobtypes "github.com/retr0h/osapi/internal/job" + jobmocks "github.com/retr0h/osapi/internal/job/mocks" +) + +type AgentDrainPublicTestSuite struct { + suite.Suite + + mockCtrl *gomock.Controller + mockJobClient *jobmocks.MockJobClient + handler *apiagent.Agent + ctx context.Context + appConfig config.Config + logger *slog.Logger +} + +func (s *AgentDrainPublicTestSuite) SetupTest() { + s.mockCtrl = gomock.NewController(s.T()) + s.mockJobClient = jobmocks.NewMockJobClient(s.mockCtrl) + s.handler = apiagent.New(slog.Default(), s.mockJobClient) + s.ctx = context.Background() + s.appConfig = config.Config{} + s.logger = slog.New(slog.NewTextHandler(os.Stdout, nil)) +} + +func (s *AgentDrainPublicTestSuite) TearDownTest() { + s.mockCtrl.Finish() +} + +func (s *AgentDrainPublicTestSuite) TestDrainAgent() { + tests := []struct { + name string + hostname string + mockAgent 
*jobtypes.AgentInfo + mockGetErr error + mockWriteErr error + skipWrite bool + mockSetDrain bool + validateFunc func(resp gen.DrainAgentResponseObject) + }{ + { + name: "success drains agent", + hostname: "server1", + mockAgent: &jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateReady, + }, + mockSetDrain: true, + validateFunc: func(resp gen.DrainAgentResponseObject) { + r, ok := resp.(gen.DrainAgent200JSONResponse) + s.True(ok) + s.Contains(r.Message, "drain initiated for agent server1") + }, + }, + { + name: "agent not found returns 404", + hostname: "unknown", + mockGetErr: fmt.Errorf("agent not found: unknown"), + skipWrite: true, + validateFunc: func(resp gen.DrainAgentResponseObject) { + _, ok := resp.(gen.DrainAgent404JSONResponse) + s.True(ok) + }, + }, + { + name: "agent already draining returns 409", + hostname: "server1", + mockAgent: &jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateDraining, + }, + skipWrite: true, + validateFunc: func(resp gen.DrainAgentResponseObject) { + _, ok := resp.(gen.DrainAgent409JSONResponse) + s.True(ok) + }, + }, + { + name: "agent already cordoned returns 409", + hostname: "server1", + mockAgent: &jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateCordoned, + }, + skipWrite: true, + validateFunc: func(resp gen.DrainAgentResponseObject) { + _, ok := resp.(gen.DrainAgent409JSONResponse) + s.True(ok) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + s.mockJobClient.EXPECT(). + GetAgent(gomock.Any(), tt.hostname). + Return(tt.mockAgent, tt.mockGetErr) + + if tt.mockSetDrain { + s.mockJobClient.EXPECT(). + SetDrainFlag(gomock.Any(), tt.hostname). + Return(nil) + } + + if !tt.skipWrite { + s.mockJobClient.EXPECT(). + WriteAgentTimelineEvent(gomock.Any(), tt.hostname, "drain", "Drain initiated via API"). 
+ Return(tt.mockWriteErr) + } + + resp, err := s.handler.DrainAgent(s.ctx, gen.DrainAgentRequestObject{ + Hostname: tt.hostname, + }) + s.NoError(err) + tt.validateFunc(resp) + }) + } +} + +func (s *AgentDrainPublicTestSuite) TestDrainAgentValidationHTTP() { + tests := []struct { + name string + hostname string + setupJobMock func() *jobmocks.MockJobClient + wantCode int + wantContains []string + }{ + { + name: "when agent exists returns 200", + hostname: "server1", + setupJobMock: func() *jobmocks.MockJobClient { + mock := jobmocks.NewMockJobClient(s.mockCtrl) + mock.EXPECT(). + GetAgent(gomock.Any(), "server1"). + Return(&jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateReady, + }, nil) + mock.EXPECT(). + SetDrainFlag(gomock.Any(), "server1"). + Return(nil) + mock.EXPECT(). + WriteAgentTimelineEvent(gomock.Any(), "server1", "drain", "Drain initiated via API"). + Return(nil) + return mock + }, + wantCode: http.StatusOK, + wantContains: []string{`"message"`, `drain initiated`}, + }, + { + name: "when agent not found returns 404", + hostname: "unknown", + setupJobMock: func() *jobmocks.MockJobClient { + mock := jobmocks.NewMockJobClient(s.mockCtrl) + mock.EXPECT(). + GetAgent(gomock.Any(), "unknown"). + Return(nil, fmt.Errorf("agent not found: unknown")) + return mock + }, + wantCode: http.StatusNotFound, + wantContains: []string{`"error"`}, + }, + { + name: "when agent already draining returns 409", + hostname: "server1", + setupJobMock: func() *jobmocks.MockJobClient { + mock := jobmocks.NewMockJobClient(s.mockCtrl) + mock.EXPECT(). + GetAgent(gomock.Any(), "server1"). 
+ Return(&jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateDraining, + }, nil) + return mock + }, + wantCode: http.StatusConflict, + wantContains: []string{`"error"`, `already in Draining`}, + }, + } + + for _, tc := range tests { + s.Run(tc.name, func() { + jobMock := tc.setupJobMock() + + agentHandler := apiagent.New(s.logger, jobMock) + strictHandler := gen.NewStrictHandler(agentHandler, nil) + + a := api.New(s.appConfig, s.logger) + gen.RegisterHandlers(a.Echo, strictHandler) + + req := httptest.NewRequest( + http.MethodPost, + fmt.Sprintf("/agent/%s/drain", tc.hostname), + nil, + ) + rec := httptest.NewRecorder() + + a.Echo.ServeHTTP(rec, req) + + s.Equal(tc.wantCode, rec.Code) + for _, str := range tc.wantContains { + s.Contains(rec.Body.String(), str) + } + }) + } +} + +const rbacAgentDrainTestSigningKey = "test-signing-key-for-rbac-agent-drain" + +func (s *AgentDrainPublicTestSuite) TestDrainAgentRBACHTTP() { + tokenManager := authtoken.New(s.logger) + + tests := []struct { + name string + setupAuth func(req *http.Request) + setupJobMock func() *jobmocks.MockJobClient + wantCode int + wantContains []string + }{ + { + name: "when no token returns 401", + setupAuth: func(_ *http.Request) { + // No auth header set + }, + setupJobMock: func() *jobmocks.MockJobClient { + return jobmocks.NewMockJobClient(s.mockCtrl) + }, + wantCode: http.StatusUnauthorized, + wantContains: []string{"Bearer token required"}, + }, + { + name: "when insufficient permissions returns 403", + setupAuth: func(req *http.Request) { + token, err := tokenManager.Generate( + rbacAgentDrainTestSigningKey, + []string{"read"}, + "test-user", + []string{"agent:read"}, + ) + s.Require().NoError(err) + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token)) + }, + setupJobMock: func() *jobmocks.MockJobClient { + return jobmocks.NewMockJobClient(s.mockCtrl) + }, + wantCode: http.StatusForbidden, + wantContains: []string{"Insufficient permissions"}, + }, + { + name: 
"when valid token with agent:write returns 200", + setupAuth: func(req *http.Request) { + token, err := tokenManager.Generate( + rbacAgentDrainTestSigningKey, + []string{"admin"}, + "test-user", + nil, + ) + s.Require().NoError(err) + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token)) + }, + setupJobMock: func() *jobmocks.MockJobClient { + mock := jobmocks.NewMockJobClient(s.mockCtrl) + mock.EXPECT(). + GetAgent(gomock.Any(), "server1"). + Return(&jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateReady, + }, nil) + mock.EXPECT(). + SetDrainFlag(gomock.Any(), "server1"). + Return(nil) + mock.EXPECT(). + WriteAgentTimelineEvent(gomock.Any(), "server1", "drain", "Drain initiated via API"). + Return(nil) + return mock + }, + wantCode: http.StatusOK, + wantContains: []string{`"message"`, `drain initiated`}, + }, + } + + for _, tc := range tests { + s.Run(tc.name, func() { + jobMock := tc.setupJobMock() + + appConfig := config.Config{ + API: config.API{ + Server: config.Server{ + Security: config.ServerSecurity{ + SigningKey: rbacAgentDrainTestSigningKey, + }, + }, + }, + } + + server := api.New(appConfig, s.logger) + handlers := server.GetAgentHandler(jobMock) + server.RegisterHandlers(handlers) + + req := httptest.NewRequest( + http.MethodPost, + "/agent/server1/drain", + nil, + ) + tc.setupAuth(req) + rec := httptest.NewRecorder() + + server.Echo.ServeHTTP(rec, req) + + s.Equal(tc.wantCode, rec.Code) + for _, str := range tc.wantContains { + s.Contains(rec.Body.String(), str) + } + }) + } +} + +func TestAgentDrainPublicTestSuite(t *testing.T) { + suite.Run(t, new(AgentDrainPublicTestSuite)) +} diff --git a/internal/api/agent/agent_get_public_test.go b/internal/api/agent/agent_get_public_test.go index 411d2991..635958b3 100644 --- a/internal/api/agent/agent_get_public_test.go +++ b/internal/api/agent/agent_get_public_test.go @@ -94,7 +94,7 @@ func (s *AgentGetPublicTestSuite) TestGetAgentDetails() { r, ok := 
resp.(gen.GetAgentDetails200JSONResponse) s.True(ok) s.Equal("server1", r.Hostname) - s.Equal(gen.Ready, r.Status) + s.Equal(gen.AgentInfoStatusReady, r.Status) s.NotNil(r.Labels) s.NotNil(r.OsInfo) s.Equal("Ubuntu", r.OsInfo.Distribution) diff --git a/internal/api/agent/agent_list.go b/internal/api/agent/agent_list.go index b2628d53..02e0b44f 100644 --- a/internal/api/agent/agent_list.go +++ b/internal/api/agent/agent_list.go @@ -59,7 +59,7 @@ func (a *Agent) GetAgent( func buildAgentInfo( a *job.AgentInfo, ) gen.AgentInfo { - status := gen.Ready + status := gen.AgentInfoStatusReady info := gen.AgentInfo{ Hostname: a.Hostname, Status: status, @@ -167,6 +167,50 @@ func buildAgentInfo( info.Facts = &facts } + if a.State != "" { + state := gen.AgentInfoState(a.State) + info.State = &state + } + + if len(a.Conditions) > 0 { + conditions := make([]gen.NodeCondition, len(a.Conditions)) + for i, c := range a.Conditions { + conditions[i] = gen.NodeCondition{ + Type: gen.NodeConditionType(c.Type), + Status: c.Status, + LastTransitionTime: c.LastTransitionTime, + } + if c.Reason != "" { + reason := c.Reason + conditions[i].Reason = &reason + } + } + info.Conditions = &conditions + } + + if len(a.Timeline) > 0 { + timeline := make([]gen.TimelineEvent, len(a.Timeline)) + for i, te := range a.Timeline { + timeline[i] = gen.TimelineEvent{ + Timestamp: te.Timestamp, + Event: te.Event, + } + if te.Hostname != "" { + hostname := te.Hostname + timeline[i].Hostname = &hostname + } + if te.Message != "" { + message := te.Message + timeline[i].Message = &message + } + if te.Error != "" { + errStr := te.Error + timeline[i].Error = &errStr + } + } + info.Timeline = &timeline + } + return info } diff --git a/internal/api/agent/agent_list_public_test.go b/internal/api/agent/agent_list_public_test.go index 38f43a21..78b641d0 100644 --- a/internal/api/agent/agent_list_public_test.go +++ b/internal/api/agent/agent_list_public_test.go @@ -98,7 +98,7 @@ func (s *AgentListPublicTestSuite) 
TestGetAgent() { s.Equal(2, r.Total) s.Len(r.Agents, 2) s.Equal("server1", r.Agents[0].Hostname) - s.Equal(gen.Ready, r.Agents[0].Status) + s.Equal(gen.AgentInfoStatusReady, r.Agents[0].Status) s.NotNil(r.Agents[0].Labels) s.NotNil(r.Agents[0].RegisteredAt) s.NotNil(r.Agents[0].StartedAt) @@ -108,7 +108,7 @@ func (s *AgentListPublicTestSuite) TestGetAgent() { s.NotNil(r.Agents[0].Memory) s.NotNil(r.Agents[0].Uptime) s.Equal("server2", r.Agents[1].Hostname) - s.Equal(gen.Ready, r.Agents[1].Status) + s.Equal(gen.AgentInfoStatusReady, r.Agents[1].Status) }, }, { diff --git a/internal/api/agent/agent_undrain.go b/internal/api/agent/agent_undrain.go new file mode 100644 index 00000000..67bad3cc --- /dev/null +++ b/internal/api/agent/agent_undrain.go @@ -0,0 +1,72 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+ +package agent + +import ( + "context" + "fmt" + "strings" + + "github.com/retr0h/osapi/internal/api/agent/gen" + "github.com/retr0h/osapi/internal/job" +) + +// UndrainAgent handles POST /agent/{hostname}/undrain. +func (a *Agent) UndrainAgent( + ctx context.Context, + request gen.UndrainAgentRequestObject, +) (gen.UndrainAgentResponseObject, error) { + hostname := request.Hostname + + agentInfo, err := a.JobClient.GetAgent(ctx, hostname) + if err != nil { + errMsg := fmt.Sprintf("agent not found: %s", hostname) + return gen.UndrainAgent404JSONResponse{Error: &errMsg}, nil + } + + if agentInfo.State != job.AgentStateDraining && agentInfo.State != job.AgentStateCordoned { + errMsg := fmt.Sprintf( + "agent %s is not in draining or cordoned state (current: %s)", + hostname, + agentInfo.State, + ) + return gen.UndrainAgent409JSONResponse{Error: &errMsg}, nil + } + + if err := a.JobClient.DeleteDrainFlag(ctx, hostname); err != nil { + errMsg := fmt.Sprintf("failed to delete drain flag: %s", err.Error()) + return gen.UndrainAgent409JSONResponse{Error: &errMsg}, nil + } + + if err := a.JobClient.WriteAgentTimelineEvent(ctx, hostname, "undrain", "Undrain initiated via API"); err != nil { + if strings.Contains(err.Error(), "not found") { + errMsg := fmt.Sprintf("agent not found: %s", hostname) + return gen.UndrainAgent404JSONResponse{Error: &errMsg}, nil + } + + errMsg := err.Error() + return gen.UndrainAgent409JSONResponse{Error: &errMsg}, nil + } + + msg := fmt.Sprintf("undrain initiated for agent %s", hostname) + + return gen.UndrainAgent200JSONResponse{Message: msg}, nil +} diff --git a/internal/api/agent/agent_undrain_public_test.go b/internal/api/agent/agent_undrain_public_test.go new file mode 100644 index 00000000..30b55bbb --- /dev/null +++ b/internal/api/agent/agent_undrain_public_test.go @@ -0,0 +1,372 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated 
documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +package agent_test + +import ( + "context" + "fmt" + "log/slog" + "net/http" + "net/http/httptest" + "os" + "testing" + + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/suite" + + "github.com/retr0h/osapi/internal/api" + apiagent "github.com/retr0h/osapi/internal/api/agent" + "github.com/retr0h/osapi/internal/api/agent/gen" + "github.com/retr0h/osapi/internal/authtoken" + "github.com/retr0h/osapi/internal/config" + jobtypes "github.com/retr0h/osapi/internal/job" + jobmocks "github.com/retr0h/osapi/internal/job/mocks" +) + +type AgentUndrainPublicTestSuite struct { + suite.Suite + + mockCtrl *gomock.Controller + mockJobClient *jobmocks.MockJobClient + handler *apiagent.Agent + ctx context.Context + appConfig config.Config + logger *slog.Logger +} + +func (s *AgentUndrainPublicTestSuite) SetupTest() { + s.mockCtrl = gomock.NewController(s.T()) + s.mockJobClient = jobmocks.NewMockJobClient(s.mockCtrl) + s.handler = apiagent.New(slog.Default(), s.mockJobClient) + s.ctx = context.Background() + 
s.appConfig = config.Config{} + s.logger = slog.New(slog.NewTextHandler(os.Stdout, nil)) +} + +func (s *AgentUndrainPublicTestSuite) TearDownTest() { + s.mockCtrl.Finish() +} + +func (s *AgentUndrainPublicTestSuite) TestUndrainAgent() { + tests := []struct { + name string + hostname string + mockAgent *jobtypes.AgentInfo + mockGetErr error + mockWriteErr error + skipWrite bool + mockDeleteDrain bool + validateFunc func(resp gen.UndrainAgentResponseObject) + }{ + { + name: "success undrains draining agent", + hostname: "server1", + mockAgent: &jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateDraining, + }, + mockDeleteDrain: true, + validateFunc: func(resp gen.UndrainAgentResponseObject) { + r, ok := resp.(gen.UndrainAgent200JSONResponse) + s.True(ok) + s.Contains(r.Message, "undrain initiated for agent server1") + }, + }, + { + name: "success undrains cordoned agent", + hostname: "server1", + mockAgent: &jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateCordoned, + }, + mockDeleteDrain: true, + validateFunc: func(resp gen.UndrainAgentResponseObject) { + r, ok := resp.(gen.UndrainAgent200JSONResponse) + s.True(ok) + s.Contains(r.Message, "undrain initiated for agent server1") + }, + }, + { + name: "agent not found returns 404", + hostname: "unknown", + mockGetErr: fmt.Errorf("agent not found: unknown"), + skipWrite: true, + validateFunc: func(resp gen.UndrainAgentResponseObject) { + _, ok := resp.(gen.UndrainAgent404JSONResponse) + s.True(ok) + }, + }, + { + name: "agent in ready state returns 409", + hostname: "server1", + mockAgent: &jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateReady, + }, + skipWrite: true, + validateFunc: func(resp gen.UndrainAgentResponseObject) { + _, ok := resp.(gen.UndrainAgent409JSONResponse) + s.True(ok) + }, + }, + { + name: "agent with empty state returns 409", + hostname: "server1", + mockAgent: &jobtypes.AgentInfo{ + Hostname: "server1", + State: "", + }, + skipWrite: 
true, + validateFunc: func(resp gen.UndrainAgentResponseObject) { + _, ok := resp.(gen.UndrainAgent409JSONResponse) + s.True(ok) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + s.mockJobClient.EXPECT(). + GetAgent(gomock.Any(), tt.hostname). + Return(tt.mockAgent, tt.mockGetErr) + + if tt.mockDeleteDrain { + s.mockJobClient.EXPECT(). + DeleteDrainFlag(gomock.Any(), tt.hostname). + Return(nil) + } + + if !tt.skipWrite { + s.mockJobClient.EXPECT(). + WriteAgentTimelineEvent(gomock.Any(), tt.hostname, "undrain", "Undrain initiated via API"). + Return(tt.mockWriteErr) + } + + resp, err := s.handler.UndrainAgent(s.ctx, gen.UndrainAgentRequestObject{ + Hostname: tt.hostname, + }) + s.NoError(err) + tt.validateFunc(resp) + }) + } +} + +func (s *AgentUndrainPublicTestSuite) TestUndrainAgentValidationHTTP() { + tests := []struct { + name string + hostname string + setupJobMock func() *jobmocks.MockJobClient + wantCode int + wantContains []string + }{ + { + name: "when draining agent exists returns 200", + hostname: "server1", + setupJobMock: func() *jobmocks.MockJobClient { + mock := jobmocks.NewMockJobClient(s.mockCtrl) + mock.EXPECT(). + GetAgent(gomock.Any(), "server1"). + Return(&jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateDraining, + }, nil) + mock.EXPECT(). + DeleteDrainFlag(gomock.Any(), "server1"). + Return(nil) + mock.EXPECT(). + WriteAgentTimelineEvent(gomock.Any(), "server1", "undrain", "Undrain initiated via API"). + Return(nil) + return mock + }, + wantCode: http.StatusOK, + wantContains: []string{`"message"`, `undrain initiated`}, + }, + { + name: "when agent not found returns 404", + hostname: "unknown", + setupJobMock: func() *jobmocks.MockJobClient { + mock := jobmocks.NewMockJobClient(s.mockCtrl) + mock.EXPECT(). + GetAgent(gomock.Any(), "unknown"). 
+ Return(nil, fmt.Errorf("agent not found: unknown")) + return mock + }, + wantCode: http.StatusNotFound, + wantContains: []string{`"error"`}, + }, + { + name: "when agent in ready state returns 409", + hostname: "server1", + setupJobMock: func() *jobmocks.MockJobClient { + mock := jobmocks.NewMockJobClient(s.mockCtrl) + mock.EXPECT(). + GetAgent(gomock.Any(), "server1"). + Return(&jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateReady, + }, nil) + return mock + }, + wantCode: http.StatusConflict, + wantContains: []string{`"error"`, `not in draining or cordoned`}, + }, + } + + for _, tc := range tests { + s.Run(tc.name, func() { + jobMock := tc.setupJobMock() + + agentHandler := apiagent.New(s.logger, jobMock) + strictHandler := gen.NewStrictHandler(agentHandler, nil) + + a := api.New(s.appConfig, s.logger) + gen.RegisterHandlers(a.Echo, strictHandler) + + req := httptest.NewRequest( + http.MethodPost, + fmt.Sprintf("/agent/%s/undrain", tc.hostname), + nil, + ) + rec := httptest.NewRecorder() + + a.Echo.ServeHTTP(rec, req) + + s.Equal(tc.wantCode, rec.Code) + for _, str := range tc.wantContains { + s.Contains(rec.Body.String(), str) + } + }) + } +} + +const rbacAgentUndrainTestSigningKey = "test-signing-key-for-rbac-agent-undrain" + +func (s *AgentUndrainPublicTestSuite) TestUndrainAgentRBACHTTP() { + tokenManager := authtoken.New(s.logger) + + tests := []struct { + name string + setupAuth func(req *http.Request) + setupJobMock func() *jobmocks.MockJobClient + wantCode int + wantContains []string + }{ + { + name: "when no token returns 401", + setupAuth: func(_ *http.Request) { + // No auth header set + }, + setupJobMock: func() *jobmocks.MockJobClient { + return jobmocks.NewMockJobClient(s.mockCtrl) + }, + wantCode: http.StatusUnauthorized, + wantContains: []string{"Bearer token required"}, + }, + { + name: "when insufficient permissions returns 403", + setupAuth: func(req *http.Request) { + token, err := tokenManager.Generate( + 
rbacAgentUndrainTestSigningKey, + []string{"read"}, + "test-user", + []string{"agent:read"}, + ) + s.Require().NoError(err) + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token)) + }, + setupJobMock: func() *jobmocks.MockJobClient { + return jobmocks.NewMockJobClient(s.mockCtrl) + }, + wantCode: http.StatusForbidden, + wantContains: []string{"Insufficient permissions"}, + }, + { + name: "when valid token with agent:write returns 200", + setupAuth: func(req *http.Request) { + token, err := tokenManager.Generate( + rbacAgentUndrainTestSigningKey, + []string{"admin"}, + "test-user", + nil, + ) + s.Require().NoError(err) + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token)) + }, + setupJobMock: func() *jobmocks.MockJobClient { + mock := jobmocks.NewMockJobClient(s.mockCtrl) + mock.EXPECT(). + GetAgent(gomock.Any(), "server1"). + Return(&jobtypes.AgentInfo{ + Hostname: "server1", + State: jobtypes.AgentStateDraining, + }, nil) + mock.EXPECT(). + DeleteDrainFlag(gomock.Any(), "server1"). + Return(nil) + mock.EXPECT(). + WriteAgentTimelineEvent(gomock.Any(), "server1", "undrain", "Undrain initiated via API"). 
+ Return(nil) + return mock + }, + wantCode: http.StatusOK, + wantContains: []string{`"message"`, `undrain initiated`}, + }, + } + + for _, tc := range tests { + s.Run(tc.name, func() { + jobMock := tc.setupJobMock() + + appConfig := config.Config{ + API: config.API{ + Server: config.Server{ + Security: config.ServerSecurity{ + SigningKey: rbacAgentUndrainTestSigningKey, + }, + }, + }, + } + + server := api.New(appConfig, s.logger) + handlers := server.GetAgentHandler(jobMock) + server.RegisterHandlers(handlers) + + req := httptest.NewRequest( + http.MethodPost, + "/agent/server1/undrain", + nil, + ) + tc.setupAuth(req) + rec := httptest.NewRecorder() + + server.Echo.ServeHTTP(rec, req) + + s.Equal(tc.wantCode, rec.Code) + for _, str := range tc.wantContains { + s.Contains(rec.Body.String(), str) + } + }) + } +} + +func TestAgentUndrainPublicTestSuite(t *testing.T) { + suite.Run(t, new(AgentUndrainPublicTestSuite)) +} diff --git a/internal/api/agent/gen/agent.gen.go b/internal/api/agent/gen/agent.gen.go index 60acbb7c..16ef4bec 100644 --- a/internal/api/agent/gen/agent.gen.go +++ b/internal/api/agent/gen/agent.gen.go @@ -20,10 +20,17 @@ const ( BearerAuthScopes = "BearerAuth.Scopes" ) +// Defines values for AgentInfoState. +const ( + AgentInfoStateCordoned AgentInfoState = "Cordoned" + AgentInfoStateDraining AgentInfoState = "Draining" + AgentInfoStateReady AgentInfoState = "Ready" +) + // Defines values for AgentInfoStatus. const ( - NotReady AgentInfoStatus = "NotReady" - Ready AgentInfoStatus = "Ready" + AgentInfoStatusNotReady AgentInfoStatus = "NotReady" + AgentInfoStatusReady AgentInfoStatus = "Ready" ) // Defines values for NetworkInterfaceResponseFamily. @@ -33,11 +40,21 @@ const ( Inet6 NetworkInterfaceResponseFamily = "inet6" ) +// Defines values for NodeConditionType. 
+const ( + DiskPressure NodeConditionType = "DiskPressure" + HighLoad NodeConditionType = "HighLoad" + MemoryPressure NodeConditionType = "MemoryPressure" +) + // AgentInfo defines model for AgentInfo. type AgentInfo struct { // Architecture CPU architecture. Architecture *string `json:"architecture,omitempty"` + // Conditions Evaluated node conditions. + Conditions *[]NodeCondition `json:"conditions,omitempty"` + // CpuCount Number of logical CPUs. CpuCount *int `json:"cpu_count,omitempty"` @@ -78,13 +95,22 @@ type AgentInfo struct { // StartedAt When the agent process started. StartedAt *time.Time `json:"started_at,omitempty"` + // State Agent scheduling state. + State *AgentInfoState `json:"state,omitempty"` + // Status The current status of the agent. Status AgentInfoStatus `json:"status"` + // Timeline Agent state transition history. + Timeline *[]TimelineEvent `json:"timeline,omitempty"` + // Uptime The system uptime. Uptime *string `json:"uptime,omitempty"` } +// AgentInfoState Agent scheduling state. +type AgentInfoState string + // AgentInfoStatus The current status of the agent. type AgentInfoStatus string @@ -136,6 +162,17 @@ type NetworkInterfaceResponse struct { // NetworkInterfaceResponseFamily IP address family. type NetworkInterfaceResponseFamily string +// NodeCondition defines model for NodeCondition. +type NodeCondition struct { + LastTransitionTime time.Time `json:"last_transition_time"` + Reason *string `json:"reason,omitempty"` + Status bool `json:"status"` + Type NodeConditionType `json:"type"` +} + +// NodeConditionType defines model for NodeCondition.Type. +type NodeConditionType string + // OSInfoResponse Operating system information. type OSInfoResponse struct { // Distribution The name of the Linux distribution. @@ -145,6 +182,15 @@ type OSInfoResponse struct { Version string `json:"version"` } +// TimelineEvent defines model for TimelineEvent. 
+type TimelineEvent struct { + Error *string `json:"error,omitempty"` + Event string `json:"event"` + Hostname *string `json:"hostname,omitempty"` + Message *string `json:"message,omitempty"` + Timestamp time.Time `json:"timestamp"` +} + // ServerInterface represents all server handlers. type ServerInterface interface { // List active agents @@ -153,6 +199,12 @@ type ServerInterface interface { // Get agent details // (GET /agent/{hostname}) GetAgentDetails(ctx echo.Context, hostname string) error + // Drain an agent + // (POST /agent/{hostname}/drain) + DrainAgent(ctx echo.Context, hostname string) error + // Undrain an agent + // (POST /agent/{hostname}/undrain) + UndrainAgent(ctx echo.Context, hostname string) error } // ServerInterfaceWrapper converts echo contexts to parameters. @@ -189,6 +241,42 @@ func (w *ServerInterfaceWrapper) GetAgentDetails(ctx echo.Context) error { return err } +// DrainAgent converts echo context to params. +func (w *ServerInterfaceWrapper) DrainAgent(ctx echo.Context) error { + var err error + // ------------- Path parameter "hostname" ------------- + var hostname string + + err = runtime.BindStyledParameterWithOptions("simple", "hostname", ctx.Param("hostname"), &hostname, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true}) + if err != nil { + return echo.NewHTTPError(http.StatusBadRequest, fmt.Sprintf("Invalid format for parameter hostname: %s", err)) + } + + ctx.Set(BearerAuthScopes, []string{"agent:write"}) + + // Invoke the callback with all the unmarshaled arguments + err = w.Handler.DrainAgent(ctx, hostname) + return err +} + +// UndrainAgent converts echo context to params. 
+func (w *ServerInterfaceWrapper) UndrainAgent(ctx echo.Context) error { + var err error + // ------------- Path parameter "hostname" ------------- + var hostname string + + err = runtime.BindStyledParameterWithOptions("simple", "hostname", ctx.Param("hostname"), &hostname, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true}) + if err != nil { + return echo.NewHTTPError(http.StatusBadRequest, fmt.Sprintf("Invalid format for parameter hostname: %s", err)) + } + + ctx.Set(BearerAuthScopes, []string{"agent:write"}) + + // Invoke the callback with all the unmarshaled arguments + err = w.Handler.UndrainAgent(ctx, hostname) + return err +} + // This is a simple interface which specifies echo.Route addition functions which // are present on both echo.Echo and echo.Group, since we want to allow using // either of them for path registration @@ -219,6 +307,8 @@ func RegisterHandlersWithBaseURL(router EchoRouter, si ServerInterface, baseURL router.GET(baseURL+"/agent", wrapper.GetAgent) router.GET(baseURL+"/agent/:hostname", wrapper.GetAgentDetails) + router.POST(baseURL+"/agent/:hostname/drain", wrapper.DrainAgent) + router.POST(baseURL+"/agent/:hostname/undrain", wrapper.UndrainAgent) } @@ -318,6 +408,116 @@ func (response GetAgentDetails500JSONResponse) VisitGetAgentDetailsResponse(w ht return json.NewEncoder(w).Encode(response) } +type DrainAgentRequestObject struct { + Hostname string `json:"hostname"` +} + +type DrainAgentResponseObject interface { + VisitDrainAgentResponse(w http.ResponseWriter) error +} + +type DrainAgent200JSONResponse struct { + Message string `json:"message"` +} + +func (response DrainAgent200JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(200) + + return json.NewEncoder(w).Encode(response) +} + +type DrainAgent401JSONResponse externalRef0.ErrorResponse + +func (response DrainAgent401JSONResponse) 
VisitDrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(401) + + return json.NewEncoder(w).Encode(response) +} + +type DrainAgent403JSONResponse externalRef0.ErrorResponse + +func (response DrainAgent403JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(403) + + return json.NewEncoder(w).Encode(response) +} + +type DrainAgent404JSONResponse externalRef0.ErrorResponse + +func (response DrainAgent404JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(404) + + return json.NewEncoder(w).Encode(response) +} + +type DrainAgent409JSONResponse externalRef0.ErrorResponse + +func (response DrainAgent409JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(409) + + return json.NewEncoder(w).Encode(response) +} + +type UndrainAgentRequestObject struct { + Hostname string `json:"hostname"` +} + +type UndrainAgentResponseObject interface { + VisitUndrainAgentResponse(w http.ResponseWriter) error +} + +type UndrainAgent200JSONResponse struct { + Message string `json:"message"` +} + +func (response UndrainAgent200JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(200) + + return json.NewEncoder(w).Encode(response) +} + +type UndrainAgent401JSONResponse externalRef0.ErrorResponse + +func (response UndrainAgent401JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(401) + + return json.NewEncoder(w).Encode(response) +} + +type UndrainAgent403JSONResponse externalRef0.ErrorResponse + +func (response UndrainAgent403JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error { + 
w.Header().Set("Content-Type", "application/json") + w.WriteHeader(403) + + return json.NewEncoder(w).Encode(response) +} + +type UndrainAgent404JSONResponse externalRef0.ErrorResponse + +func (response UndrainAgent404JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(404) + + return json.NewEncoder(w).Encode(response) +} + +type UndrainAgent409JSONResponse externalRef0.ErrorResponse + +func (response UndrainAgent409JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(409) + + return json.NewEncoder(w).Encode(response) +} + // StrictServerInterface represents all server handlers. type StrictServerInterface interface { // List active agents @@ -326,6 +526,12 @@ type StrictServerInterface interface { // Get agent details // (GET /agent/{hostname}) GetAgentDetails(ctx context.Context, request GetAgentDetailsRequestObject) (GetAgentDetailsResponseObject, error) + // Drain an agent + // (POST /agent/{hostname}/drain) + DrainAgent(ctx context.Context, request DrainAgentRequestObject) (DrainAgentResponseObject, error) + // Undrain an agent + // (POST /agent/{hostname}/undrain) + UndrainAgent(ctx context.Context, request UndrainAgentRequestObject) (UndrainAgentResponseObject, error) } type StrictHandlerFunc = strictecho.StrictEchoHandlerFunc @@ -387,3 +593,53 @@ func (sh *strictHandler) GetAgentDetails(ctx echo.Context, hostname string) erro } return nil } + +// DrainAgent operation middleware +func (sh *strictHandler) DrainAgent(ctx echo.Context, hostname string) error { + var request DrainAgentRequestObject + + request.Hostname = hostname + + handler := func(ctx echo.Context, request interface{}) (interface{}, error) { + return sh.ssi.DrainAgent(ctx.Request().Context(), request.(DrainAgentRequestObject)) + } + for _, middleware := range sh.middlewares { + handler = middleware(handler, "DrainAgent") 
+ } + + response, err := handler(ctx, request) + + if err != nil { + return err + } else if validResponse, ok := response.(DrainAgentResponseObject); ok { + return validResponse.VisitDrainAgentResponse(ctx.Response()) + } else if response != nil { + return fmt.Errorf("unexpected response type: %T", response) + } + return nil +} + +// UndrainAgent operation middleware +func (sh *strictHandler) UndrainAgent(ctx echo.Context, hostname string) error { + var request UndrainAgentRequestObject + + request.Hostname = hostname + + handler := func(ctx echo.Context, request interface{}) (interface{}, error) { + return sh.ssi.UndrainAgent(ctx.Request().Context(), request.(UndrainAgentRequestObject)) + } + for _, middleware := range sh.middlewares { + handler = middleware(handler, "UndrainAgent") + } + + response, err := handler(ctx, request) + + if err != nil { + return err + } else if validResponse, ok := response.(UndrainAgentResponseObject); ok { + return validResponse.VisitUndrainAgentResponse(ctx.Response()) + } else if response != nil { + return fmt.Errorf("unexpected response type: %T", response) + } + return nil +} diff --git a/internal/api/agent/gen/api.yaml b/internal/api/agent/gen/api.yaml index ad1f5d42..26fe3050 100644 --- a/internal/api/agent/gen/api.yaml +++ b/internal/api/agent/gen/api.yaml @@ -110,6 +110,114 @@ paths: application/json: schema: $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' + /agent/{hostname}/drain: + post: + operationId: drainAgent + summary: Drain an agent + description: > + Stop the agent from accepting new jobs. In-flight jobs continue + to completion. + tags: + - agent_operations + security: + - BearerAuth: + - agent:write + parameters: + - name: hostname + in: path + required: true + schema: + type: string + description: The hostname of the agent to drain. + responses: + '200': + description: Agent drain initiated. 
+ content: + application/json: + schema: + type: object + properties: + message: + type: string + required: + - message + '401': + description: Unauthorized - API key required + content: + application/json: + schema: + $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' + '403': + description: Forbidden - Insufficient permissions + content: + application/json: + schema: + $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' + '404': + description: Agent not found. + content: + application/json: + schema: + $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' + '409': + description: Agent already in requested state. + content: + application/json: + schema: + $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' + /agent/{hostname}/undrain: + post: + operationId: undrainAgent + summary: Undrain an agent + description: Resume accepting jobs on a drained agent. + tags: + - agent_operations + security: + - BearerAuth: + - agent:write + parameters: + - name: hostname + in: path + required: true + schema: + type: string + description: The hostname of the agent to undrain. + responses: + '200': + description: Agent undrain initiated. + content: + application/json: + schema: + type: object + properties: + message: + type: string + required: + - message + '401': + description: Unauthorized - API key required + content: + application/json: + schema: + $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' + '403': + description: Forbidden - Insufficient permissions + content: + application/json: + schema: + $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' + '404': + description: Agent not found. + content: + application/json: + schema: + $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' + '409': + description: Agent not in draining or cordoned state. 
+ content: + application/json: + schema: + $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse' components: securitySchemes: @@ -200,6 +308,20 @@ components: type: object additionalProperties: true description: Extended facts from additional providers. + state: + type: string + enum: [Ready, Draining, Cordoned] + description: Agent scheduling state. + conditions: + type: array + items: + $ref: '#/components/schemas/NodeCondition' + description: Evaluated node conditions. + timeline: + type: array + items: + $ref: '#/components/schemas/TimelineEvent' + description: Agent state transition history. required: - hostname - status @@ -287,3 +409,39 @@ components: - dual required: - name + + NodeCondition: + type: object + properties: + type: + type: string + enum: [MemoryPressure, HighLoad, DiskPressure] + status: + type: boolean + reason: + type: string + last_transition_time: + type: string + format: date-time + required: + - type + - status + - last_transition_time + + TimelineEvent: + type: object + properties: + timestamp: + type: string + format: date-time + event: + type: string + hostname: + type: string + message: + type: string + error: + type: string + required: + - timestamp + - event diff --git a/internal/api/gen/api.yaml b/internal/api/gen/api.yaml index 0dbe61d0..104c36da 100644 --- a/internal/api/gen/api.yaml +++ b/internal/api/gen/api.yaml @@ -118,6 +118,116 @@ paths: application/json: schema: $ref: '#/components/schemas/ErrorResponse' + /agent/{hostname}/drain: + servers: [] + post: + operationId: drainAgent + summary: Drain an agent + description: > + Stop the agent from accepting new jobs. In-flight jobs continue to + completion. + tags: + - Agent_Management_API_agent_operations + security: + - BearerAuth: + - agent:write + parameters: + - name: hostname + in: path + required: true + schema: + type: string + description: The hostname of the agent to drain. + responses: + '200': + description: Agent drain initiated. 
+ content: + application/json: + schema: + type: object + properties: + message: + type: string + required: + - message + '401': + description: Unauthorized - API key required + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '403': + description: Forbidden - Insufficient permissions + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Agent not found. + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '409': + description: Agent already in requested state. + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + /agent/{hostname}/undrain: + servers: [] + post: + operationId: undrainAgent + summary: Undrain an agent + description: Resume accepting jobs on a drained agent. + tags: + - Agent_Management_API_agent_operations + security: + - BearerAuth: + - agent:write + parameters: + - name: hostname + in: path + required: true + schema: + type: string + description: The hostname of the agent to undrain. + responses: + '200': + description: Agent undrain initiated. + content: + application/json: + schema: + type: object + properties: + message: + type: string + required: + - message + '401': + description: Unauthorized - API key required + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '403': + description: Forbidden - Insufficient permissions + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '404': + description: Agent not found. + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' + '409': + description: Agent not in draining or cordoned state. + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorResponse' /audit: servers: [] get: @@ -1380,6 +1490,23 @@ components: type: object additionalProperties: true description: Extended facts from additional providers. 
+ state: + type: string + enum: + - Ready + - Draining + - Cordoned + description: Agent scheduling state. + conditions: + type: array + items: + $ref: '#/components/schemas/NodeCondition' + description: Evaluated node conditions. + timeline: + type: array + items: + $ref: '#/components/schemas/TimelineEvent' + description: Agent state transition history. required: - hostname - status @@ -1463,6 +1590,43 @@ components: - dual required: - name + NodeCondition: + type: object + properties: + type: + type: string + enum: + - MemoryPressure + - HighLoad + - DiskPressure + status: + type: boolean + reason: + type: string + last_transition_time: + type: string + format: date-time + required: + - type + - status + - last_transition_time + TimelineEvent: + type: object + properties: + timestamp: + type: string + format: date-time + event: + type: string + hostname: + type: string + message: + type: string + error: + type: string + required: + - timestamp + - event AuditEntry: type: object properties: diff --git a/internal/authtoken/permissions.go b/internal/authtoken/permissions.go index ae2dbd4e..df3f2611 100644 --- a/internal/authtoken/permissions.go +++ b/internal/authtoken/permissions.go @@ -26,6 +26,7 @@ type Permission = string // Permission constants using resource:verb format. const ( PermAgentRead Permission = "agent:read" + PermAgentWrite Permission = "agent:write" PermNodeRead Permission = "node:read" PermNetworkRead Permission = "network:read" PermNetworkWrite Permission = "network:write" @@ -39,6 +40,7 @@ const ( // AllPermissions is the full set of known permissions. 
var AllPermissions = []Permission{ PermAgentRead, + PermAgentWrite, PermNodeRead, PermNetworkRead, PermNetworkWrite, @@ -53,6 +55,7 @@ var AllPermissions = []Permission{ var DefaultRolePermissions = map[string][]Permission{ "admin": { PermAgentRead, + PermAgentWrite, PermNodeRead, PermNetworkRead, PermNetworkWrite, diff --git a/internal/cli/nats.go b/internal/cli/nats.go index 419d1543..b3d7e1c7 100644 --- a/internal/cli/nats.go +++ b/internal/cli/nats.go @@ -107,6 +107,21 @@ func BuildFactsKVConfig( } } +// BuildStateKVConfig builds a jetstream.KeyValueConfig from state config values. +// The state bucket has no TTL so drain flags and timeline events persist indefinitely. +func BuildStateKVConfig( + namespace string, + stateCfg config.NATSState, +) jetstream.KeyValueConfig { + stateBucket := job.ApplyNamespaceToInfraName(namespace, stateCfg.Bucket) + + return jetstream.KeyValueConfig{ + Bucket: stateBucket, + Storage: ParseJetstreamStorageType(stateCfg.Storage), + Replicas: stateCfg.Replicas, + } +} + // BuildAuditKVConfig builds a jetstream.KeyValueConfig from audit config values. func BuildAuditKVConfig( namespace string, diff --git a/internal/config/types.go b/internal/config/types.go index 61764d7f..afbcbfb6 100644 --- a/internal/config/types.go +++ b/internal/config/types.go @@ -93,6 +93,7 @@ type NATS struct { Audit NATSAudit `mapstructure:"audit,omitempty"` Registry NATSRegistry `mapstructure:"registry,omitempty"` Facts NATSFacts `mapstructure:"facts,omitempty"` + State NATSState `mapstructure:"state,omitempty"` } // NATSAudit configuration for the audit log KV bucket. @@ -123,6 +124,14 @@ type NATSFacts struct { Replicas int `mapstructure:"replicas"` } +// NATSState configuration for the agent state KV bucket (drain flags, timeline events). +type NATSState struct { + // Bucket is the KV bucket name for persistent agent state. 
+ Bucket string `mapstructure:"bucket"` + Storage string `mapstructure:"storage"` // "file" or "memory" + Replicas int `mapstructure:"replicas"` +} + // NATSServer configuration settings for the embedded NATS server. type NATSServer struct { // Host the server will bind to. @@ -258,6 +267,13 @@ type AgentFacts struct { Interval string `mapstructure:"interval"` // e.g. "5m", "1h" } +// AgentConditions holds threshold configuration for node conditions. +type AgentConditions struct { + MemoryPressureThreshold int `mapstructure:"memory_pressure_threshold"` + HighLoadMultiplier float64 `mapstructure:"high_load_multiplier"` + DiskPressureThreshold int `mapstructure:"disk_pressure_threshold"` +} + // AgentConfig configuration settings. type AgentConfig struct { // NATS connection settings for the agent. @@ -274,4 +290,6 @@ type AgentConfig struct { MaxJobs int `mapstructure:"max_jobs"` // Labels are key-value pairs for label-based routing (e.g., role: web, env: prod). Labels map[string]string `mapstructure:"labels"` + // Conditions holds threshold settings for node condition evaluation. + Conditions AgentConditions `mapstructure:"conditions,omitempty"` } diff --git a/internal/job/client/agent.go b/internal/job/client/agent.go index d0ab7cd8..9b9aec44 100644 --- a/internal/job/client/agent.go +++ b/internal/job/client/agent.go @@ -26,6 +26,8 @@ import ( "fmt" "log/slog" "regexp" + "sort" + "strings" "time" "github.com/nats-io/nats.go/jetstream" @@ -164,6 +166,194 @@ func (c *Client) CreateOrUpdateConsumer( return c.natsClient.CreateOrUpdateConsumerWithConfig(ctx, streamName, consumerConfig) } +// WriteAgentTimelineEvent writes an append-only timeline event +// for an agent state transition. 
+func (c *Client) WriteAgentTimelineEvent( + ctx context.Context, + hostname, event, message string, +) error { + if c.stateKV == nil { + return fmt.Errorf("agent state bucket not configured") + } + + now := time.Now() + key := fmt.Sprintf( + "timeline.%s.%s.%d", + job.SanitizeHostname(hostname), + event, + now.UnixNano(), + ) + + data, err := json.Marshal(job.TimelineEvent{ + Timestamp: now, + Event: event, + Hostname: hostname, + Message: message, + }) + if err != nil { + return fmt.Errorf("marshal timeline event: %w", err) + } + + _, err = c.stateKV.Put(ctx, key, data) + if err != nil { + return fmt.Errorf("write timeline event: %w", err) + } + + c.logger.Debug("wrote agent timeline event", + slog.String("hostname", hostname), + slog.String("event", event), + slog.String("key", key), + ) + + return nil +} + +// GetAgentTimeline returns sorted timeline events for a hostname. +func (c *Client) GetAgentTimeline( + ctx context.Context, + hostname string, +) ([]job.TimelineEvent, error) { + if c.stateKV == nil { + return nil, fmt.Errorf("agent state bucket not configured") + } + + prefix := "timeline." + job.SanitizeHostname(hostname) + "." + + keys, err := c.stateKV.Keys(ctx) + if err != nil { + // No keys found is not an error for timeline + return []job.TimelineEvent{}, nil + } + + var events []job.TimelineEvent + for _, key := range keys { + if !strings.HasPrefix(key, prefix) { + continue + } + + entry, err := c.stateKV.Get(ctx, key) + if err != nil { + continue + } + + var te job.TimelineEvent + if err := json.Unmarshal(entry.Value(), &te); err != nil { + continue + } + + events = append(events, te) + } + + // Sort by timestamp + sort.Slice(events, func(i, j int) bool { + return events[i].Timestamp.Before(events[j].Timestamp) + }) + + return events, nil +} + +// ComputeAgentState returns the current state from timeline events. 
+func ComputeAgentState( + events []job.TimelineEvent, +) string { + if len(events) == 0 { + return job.AgentStateReady + } + + latest := events[len(events)-1] + switch latest.Event { + case "drain": + return job.AgentStateDraining + case "cordoned": + return job.AgentStateCordoned + case "undrain", "ready": + return job.AgentStateReady + default: + return job.AgentStateReady + } +} + +// overlayDrainState checks if a drain flag exists for the agent and +// overrides the reported state. The agent always reports its own view +// (Ready), but the operator may have drained it via the API. Drain +// flags are stored in the agent-state KV bucket (no TTL). +func (c *Client) overlayDrainState( + ctx context.Context, + info *job.AgentInfo, +) { + if c.stateKV == nil { + return + } + + key := "drain." + job.SanitizeHostname(info.Hostname) + _, err := c.stateKV.Get(ctx, key) + if err == nil { + info.State = job.AgentStateCordoned + } +} + +// CheckDrainFlag returns true if the drain flag exists for the hostname. +func (c *Client) CheckDrainFlag( + ctx context.Context, + hostname string, +) bool { + if c.stateKV == nil { + return false + } + + key := "drain." + job.SanitizeHostname(hostname) + _, err := c.stateKV.Get(ctx, key) + return err == nil +} + +// SetDrainFlag writes the drain flag for an agent in the state KV bucket. +// The agent detects this flag on heartbeat and stops accepting jobs. +func (c *Client) SetDrainFlag( + ctx context.Context, + hostname string, +) error { + if c.stateKV == nil { + return fmt.Errorf("agent state bucket not configured") + } + + key := "drain." + job.SanitizeHostname(hostname) + _, err := c.stateKV.Put(ctx, key, []byte("1")) + if err != nil { + return fmt.Errorf("set drain flag: %w", err) + } + + c.logger.Debug("set drain flag", + slog.String("hostname", hostname), + slog.String("key", key), + ) + + return nil +} + +// DeleteDrainFlag removes the drain flag for an agent from the state KV bucket. 
+// The agent detects this on heartbeat and resumes accepting jobs. +func (c *Client) DeleteDrainFlag( + ctx context.Context, + hostname string, +) error { + if c.stateKV == nil { + return fmt.Errorf("agent state bucket not configured") + } + + key := "drain." + job.SanitizeHostname(hostname) + err := c.stateKV.Delete(ctx, key) + if err != nil { + return fmt.Errorf("delete drain flag: %w", err) + } + + c.logger.Debug("deleted drain flag", + slog.String("hostname", hostname), + slog.String("key", key), + ) + + return nil +} + // sanitizeKeyForNATS sanitizes a string for use as a NATS key. func sanitizeKeyForNATS( input string, diff --git a/internal/job/client/agent_drain_public_test.go b/internal/job/client/agent_drain_public_test.go new file mode 100644 index 00000000..9301403c --- /dev/null +++ b/internal/job/client/agent_drain_public_test.go @@ -0,0 +1,353 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+ +package client_test + +import ( + "context" + "errors" + "log/slog" + "testing" + "time" + + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/suite" + + "github.com/retr0h/osapi/internal/job" + "github.com/retr0h/osapi/internal/job/client" + jobmocks "github.com/retr0h/osapi/internal/job/mocks" +) + +type AgentDrainPublicTestSuite struct { + suite.Suite + + mockCtrl *gomock.Controller + mockNATSClient *jobmocks.MockNATSClient + mockKV *jobmocks.MockKeyValue + ctx context.Context +} + +func (s *AgentDrainPublicTestSuite) SetupTest() { + s.mockCtrl = gomock.NewController(s.T()) + s.mockNATSClient = jobmocks.NewMockNATSClient(s.mockCtrl) + s.mockKV = jobmocks.NewMockKeyValue(s.mockCtrl) + s.ctx = context.Background() +} + +func (s *AgentDrainPublicTestSuite) TearDownTest() { + s.mockCtrl.Finish() +} + +func (s *AgentDrainPublicTestSuite) newClientWithState( + stateKV *jobmocks.MockKeyValue, +) *client.Client { + opts := &client.Options{ + Timeout: 30 * time.Second, + KVBucket: s.mockKV, + StateKV: stateKV, + } + c, err := client.New(slog.Default(), s.mockNATSClient, opts) + s.Require().NoError(err) + + return c +} + +func (s *AgentDrainPublicTestSuite) newClientWithoutState() *client.Client { + opts := &client.Options{ + Timeout: 30 * time.Second, + KVBucket: s.mockKV, + } + c, err := client.New(slog.Default(), s.mockNATSClient, opts) + s.Require().NoError(err) + + return c +} + +func (s *AgentDrainPublicTestSuite) TestCheckDrainFlag() { + tests := []struct { + name string + hostname string + useState bool + setupMocks func(*jobmocks.MockKeyValue) + expected bool + }{ + { + name: "when drain flag exists returns true", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). 
+ Return(entry, nil) + }, + expected: true, + }, + { + name: "when drain flag missing returns false", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + }, + expected: false, + }, + { + name: "when stateKV is nil returns false", + hostname: "server1", + useState: false, + expected: false, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + var jobsClient *client.Client + if tt.useState { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + if tt.setupMocks != nil { + tt.setupMocks(stateKV) + } + jobsClient = s.newClientWithState(stateKV) + } else { + jobsClient = s.newClientWithoutState() + } + + result := jobsClient.CheckDrainFlag(s.ctx, tt.hostname) + s.Equal(tt.expected, result) + }) + } +} + +func (s *AgentDrainPublicTestSuite) TestSetDrainFlag() { + tests := []struct { + name string + hostname string + useState bool + setupMocks func(*jobmocks.MockKeyValue) + expectError bool + errorMsg string + }{ + { + name: "when write succeeds sets drain flag", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Put(gomock.Any(), "drain.server1", []byte("1")). + Return(uint64(1), nil) + }, + }, + { + name: "when KV put fails returns error", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Put(gomock.Any(), "drain.server1", []byte("1")). 
+ Return(uint64(0), errors.New("kv connection failed")) + }, + expectError: true, + errorMsg: "set drain flag", + }, + { + name: "when stateKV is nil returns error", + hostname: "server1", + useState: false, + expectError: true, + errorMsg: "agent state bucket not configured", + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + var jobsClient *client.Client + if tt.useState { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + if tt.setupMocks != nil { + tt.setupMocks(stateKV) + } + jobsClient = s.newClientWithState(stateKV) + } else { + jobsClient = s.newClientWithoutState() + } + + err := jobsClient.SetDrainFlag(s.ctx, tt.hostname) + + if tt.expectError { + s.Error(err) + s.Contains(err.Error(), tt.errorMsg) + } else { + s.NoError(err) + } + }) + } +} + +func (s *AgentDrainPublicTestSuite) TestDeleteDrainFlag() { + tests := []struct { + name string + hostname string + useState bool + setupMocks func(*jobmocks.MockKeyValue) + expectError bool + errorMsg string + }{ + { + name: "when delete succeeds removes drain flag", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Delete(gomock.Any(), "drain.server1"). + Return(nil) + }, + }, + { + name: "when KV delete fails returns error", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Delete(gomock.Any(), "drain.server1"). 
+ Return(errors.New("kv connection failed")) + }, + expectError: true, + errorMsg: "delete drain flag", + }, + { + name: "when stateKV is nil returns error", + hostname: "server1", + useState: false, + expectError: true, + errorMsg: "agent state bucket not configured", + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + var jobsClient *client.Client + if tt.useState { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + if tt.setupMocks != nil { + tt.setupMocks(stateKV) + } + jobsClient = s.newClientWithState(stateKV) + } else { + jobsClient = s.newClientWithoutState() + } + + err := jobsClient.DeleteDrainFlag(s.ctx, tt.hostname) + + if tt.expectError { + s.Error(err) + s.Contains(err.Error(), tt.errorMsg) + } else { + s.NoError(err) + } + }) + } +} + +func (s *AgentDrainPublicTestSuite) TestOverlayDrainState() { + tests := []struct { + name string + useState bool + setupMocks func(*jobmocks.MockKeyValue) + expectedState string + }{ + { + name: "when drain flag exists sets state to Cordoned", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(entry, nil) + }, + expectedState: job.AgentStateCordoned, + }, + { + name: "when drain flag missing keeps original state", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + }, + expectedState: "", + }, + { + name: "when stateKV is nil keeps original state", + useState: false, + expectedState: "", + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + registryKV := jobmocks.NewMockKeyValue(s.mockCtrl) + + // Set up the registry KV to return agent data + entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry.EXPECT().Value().Return( + []byte(`{"hostname":"server1","registered_at":"2026-01-01T00:00:00Z"}`), + ) + registryKV.EXPECT(). 
+ Get(gomock.Any(), "agents.server1"). + Return(entry, nil) + + opts := &client.Options{ + Timeout: 30 * time.Second, + KVBucket: s.mockKV, + RegistryKV: registryKV, + } + + if tt.useState { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + if tt.setupMocks != nil { + tt.setupMocks(stateKV) + } + opts.StateKV = stateKV + // GetAgent also calls GetAgentTimeline which uses stateKV + stateKV.EXPECT(). + Keys(gomock.Any()). + Return(nil, errors.New("nats: no keys found")) + } + + jobsClient, err := client.New( + slog.Default(), + s.mockNATSClient, + opts, + ) + s.Require().NoError(err) + + info, err := jobsClient.GetAgent(s.ctx, "server1") + s.NoError(err) + s.Equal(tt.expectedState, info.State) + }) + } +} + +func TestAgentDrainPublicTestSuite(t *testing.T) { + suite.Run(t, new(AgentDrainPublicTestSuite)) +} diff --git a/internal/job/client/agent_timeline_public_test.go b/internal/job/client/agent_timeline_public_test.go new file mode 100644 index 00000000..c3be65a4 --- /dev/null +++ b/internal/job/client/agent_timeline_public_test.go @@ -0,0 +1,484 @@ +// Copyright (c) 2026 John Dewey + +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to +// deal in the Software without restriction, including without limitation the +// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +// sell copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. + +package client_test + +import ( + "context" + "encoding/json" + "errors" + "log/slog" + "testing" + "time" + + "github.com/golang/mock/gomock" + "github.com/stretchr/testify/suite" + + "github.com/retr0h/osapi/internal/job" + "github.com/retr0h/osapi/internal/job/client" + jobmocks "github.com/retr0h/osapi/internal/job/mocks" +) + +type AgentTimelinePublicTestSuite struct { + suite.Suite + + mockCtrl *gomock.Controller + mockNATSClient *jobmocks.MockNATSClient + mockKV *jobmocks.MockKeyValue + ctx context.Context +} + +func (s *AgentTimelinePublicTestSuite) SetupTest() { + s.mockCtrl = gomock.NewController(s.T()) + s.mockNATSClient = jobmocks.NewMockNATSClient(s.mockCtrl) + s.mockKV = jobmocks.NewMockKeyValue(s.mockCtrl) + s.ctx = context.Background() +} + +func (s *AgentTimelinePublicTestSuite) TearDownTest() { + s.mockCtrl.Finish() +} + +func (s *AgentTimelinePublicTestSuite) newClientWithState( + stateKV *jobmocks.MockKeyValue, +) *client.Client { + opts := &client.Options{ + Timeout: 30 * time.Second, + KVBucket: s.mockKV, + StateKV: stateKV, + } + c, err := client.New(slog.Default(), s.mockNATSClient, opts) + s.Require().NoError(err) + + return c +} + +func (s *AgentTimelinePublicTestSuite) newClientWithoutState() *client.Client { + opts := &client.Options{ + Timeout: 30 * time.Second, + KVBucket: s.mockKV, + } + c, err := client.New(slog.Default(), s.mockNATSClient, opts) + s.Require().NoError(err) + + return c +} + +func (s *AgentTimelinePublicTestSuite) TestWriteAgentTimelineEvent() { + tests := []struct { + name string + hostname string + event string + message string + useState bool + setupMocks func(*jobmocks.MockKeyValue) + expectError bool + errorMsg string + }{ + { + name: "when 
write succeeds stores timeline event", + hostname: "server1", + event: "drain", + message: "node marked for drain", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Put(gomock.Any(), gomock.Any(), gomock.Any()). + DoAndReturn(func( + _ context.Context, + key string, + data []byte, + ) (uint64, error) { + s.Contains(key, "timeline.server1.drain.") + + var te job.TimelineEvent + err := json.Unmarshal(data, &te) + s.NoError(err) + s.Equal("drain", te.Event) + s.Equal("server1", te.Hostname) + s.Equal("node marked for drain", te.Message) + s.NotZero(te.Timestamp) + + return 1, nil + }) + }, + }, + { + name: "when KV put fails returns error", + hostname: "server1", + event: "drain", + message: "drain requested", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Put(gomock.Any(), gomock.Any(), gomock.Any()). + Return(uint64(0), errors.New("kv connection failed")) + }, + expectError: true, + errorMsg: "write timeline event", + }, + { + name: "when stateKV is nil returns error", + hostname: "server1", + event: "drain", + message: "drain requested", + useState: false, + expectError: true, + errorMsg: "agent state bucket not configured", + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + var jobsClient *client.Client + if tt.useState { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + if tt.setupMocks != nil { + tt.setupMocks(stateKV) + } + jobsClient = s.newClientWithState(stateKV) + } else { + jobsClient = s.newClientWithoutState() + } + + err := jobsClient.WriteAgentTimelineEvent( + s.ctx, + tt.hostname, + tt.event, + tt.message, + ) + + if tt.expectError { + s.Error(err) + s.Contains(err.Error(), tt.errorMsg) + } else { + s.NoError(err) + } + }) + } +} + +func (s *AgentTimelinePublicTestSuite) TestGetAgentTimeline() { + now := time.Now() + earlier := now.Add(-10 * time.Minute) + later := now.Add(10 * time.Minute) + + tests := []struct { + name string + hostname string + useState bool + 
setupMocks func(*jobmocks.MockKeyValue) + expectError bool + errorMsg string + expectedCount int + validateFunc func([]job.TimelineEvent) + }{ + { + name: "when events exist returns sorted events", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Keys(gomock.Any()). + Return([]string{ + "timeline.server1.drain.1000000000", + "timeline.server1.undrain.2000000000", + "agents.server1", + }, nil) + + drainEvent, _ := json.Marshal(job.TimelineEvent{ + Timestamp: later, + Event: "drain", + Hostname: "server1", + Message: "drain requested", + }) + entry1 := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry1.EXPECT().Value().Return(drainEvent) + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.drain.1000000000"). + Return(entry1, nil) + + undrainEvent, _ := json.Marshal(job.TimelineEvent{ + Timestamp: earlier, + Event: "undrain", + Hostname: "server1", + Message: "undrain requested", + }) + entry2 := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry2.EXPECT().Value().Return(undrainEvent) + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.undrain.2000000000"). + Return(entry2, nil) + }, + expectedCount: 2, + validateFunc: func(events []job.TimelineEvent) { + // Should be sorted by timestamp (earlier first) + s.Equal("undrain", events[0].Event) + s.Equal("drain", events[1].Event) + }, + }, + { + name: "when no keys found returns empty slice", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Keys(gomock.Any()). + Return(nil, errors.New("nats: no keys found")) + }, + expectedCount: 0, + }, + { + name: "when stateKV is nil returns error", + hostname: "server1", + useState: false, + expectError: true, + errorMsg: "agent state bucket not configured", + }, + { + name: "when Get fails for a key skips it", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Keys(gomock.Any()). 
+ Return([]string{ + "timeline.server1.drain.1000000000", + "timeline.server1.undrain.2000000000", + }, nil) + + drainEvent, _ := json.Marshal(job.TimelineEvent{ + Timestamp: now, + Event: "drain", + Hostname: "server1", + Message: "drain requested", + }) + entry1 := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry1.EXPECT().Value().Return(drainEvent) + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.drain.1000000000"). + Return(entry1, nil) + + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.undrain.2000000000"). + Return(nil, errors.New("key not found")) + }, + expectedCount: 1, + validateFunc: func(events []job.TimelineEvent) { + s.Equal("drain", events[0].Event) + }, + }, + { + name: "when unmarshal fails for a key skips it", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Keys(gomock.Any()). + Return([]string{ + "timeline.server1.drain.1000000000", + "timeline.server1.undrain.2000000000", + }, nil) + + drainEvent, _ := json.Marshal(job.TimelineEvent{ + Timestamp: now, + Event: "drain", + Hostname: "server1", + Message: "drain requested", + }) + entry1 := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry1.EXPECT().Value().Return(drainEvent) + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.drain.1000000000"). + Return(entry1, nil) + + entry2 := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry2.EXPECT().Value().Return([]byte("invalid json")) + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.undrain.2000000000"). + Return(entry2, nil) + }, + expectedCount: 1, + validateFunc: func(events []job.TimelineEvent) { + s.Equal("drain", events[0].Event) + }, + }, + { + name: "when keys exist for other hostnames filters them out", + hostname: "server1", + useState: true, + setupMocks: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Keys(gomock.Any()). 
+ Return([]string{ + "timeline.server1.drain.1000000000", + "timeline.server2.drain.2000000000", + "agents.server1", + }, nil) + + drainEvent, _ := json.Marshal(job.TimelineEvent{ + Timestamp: now, + Event: "drain", + Hostname: "server1", + Message: "drain requested", + }) + entry1 := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry1.EXPECT().Value().Return(drainEvent) + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.drain.1000000000"). + Return(entry1, nil) + }, + expectedCount: 1, + validateFunc: func(events []job.TimelineEvent) { + s.Equal("server1", events[0].Hostname) + }, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + var jobsClient *client.Client + if tt.useState { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + if tt.setupMocks != nil { + tt.setupMocks(stateKV) + } + jobsClient = s.newClientWithState(stateKV) + } else { + jobsClient = s.newClientWithoutState() + } + + events, err := jobsClient.GetAgentTimeline(s.ctx, tt.hostname) + + if tt.expectError { + s.Error(err) + s.Contains(err.Error(), tt.errorMsg) + } else { + s.NoError(err) + s.Len(events, tt.expectedCount) + if tt.validateFunc != nil { + tt.validateFunc(events) + } + } + }) + } +} + +func (s *AgentTimelinePublicTestSuite) TestComputeAgentState() { + tests := []struct { + name string + events []job.TimelineEvent + expectedState string + }{ + { + name: "when no events returns Ready", + events: []job.TimelineEvent{}, + expectedState: job.AgentStateReady, + }, + { + name: "when nil events returns Ready", + events: nil, + expectedState: job.AgentStateReady, + }, + { + name: "when latest event is drain returns Draining", + events: []job.TimelineEvent{ + { + Timestamp: time.Now(), + Event: "drain", + Hostname: "server1", + Message: "drain requested", + }, + }, + expectedState: job.AgentStateDraining, + }, + { + name: "when latest event is cordoned returns Cordoned", + events: []job.TimelineEvent{ + { + Timestamp: time.Now(), + Event: "cordoned", + Hostname: "server1", + 
Message: "node cordoned", + }, + }, + expectedState: job.AgentStateCordoned, + }, + { + name: "when latest event is undrain returns Ready", + events: []job.TimelineEvent{ + { + Timestamp: time.Now().Add(-10 * time.Minute), + Event: "drain", + Hostname: "server1", + Message: "drain requested", + }, + { + Timestamp: time.Now(), + Event: "undrain", + Hostname: "server1", + Message: "undrain requested", + }, + }, + expectedState: job.AgentStateReady, + }, + { + name: "when latest event is ready returns Ready", + events: []job.TimelineEvent{ + { + Timestamp: time.Now().Add(-10 * time.Minute), + Event: "drain", + Hostname: "server1", + Message: "drain requested", + }, + { + Timestamp: time.Now(), + Event: "ready", + Hostname: "server1", + Message: "agent ready", + }, + }, + expectedState: job.AgentStateReady, + }, + { + name: "when latest event is unknown returns Ready", + events: []job.TimelineEvent{ + { + Timestamp: time.Now(), + Event: "something-unexpected", + Hostname: "server1", + Message: "unknown event", + }, + }, + expectedState: job.AgentStateReady, + }, + } + + for _, tt := range tests { + s.Run(tt.name, func() { + state := client.ComputeAgentState(tt.events) + s.Equal(tt.expectedState, state) + }) + } +} + +func TestAgentTimelinePublicTestSuite(t *testing.T) { + suite.Run(t, new(AgentTimelinePublicTestSuite)) +} diff --git a/internal/job/client/client.go b/internal/job/client/client.go index da57a3b7..5096913b 100644 --- a/internal/job/client/client.go +++ b/internal/job/client/client.go @@ -42,6 +42,7 @@ type Client struct { kv jetstream.KeyValue registryKV jetstream.KeyValue factsKV jetstream.KeyValue + stateKV jetstream.KeyValue timeout time.Duration streamName string } @@ -56,6 +57,8 @@ type Options struct { RegistryKV jetstream.KeyValue // FactsKV is the KV bucket for agent facts (optional). FactsKV jetstream.KeyValue + // StateKV is the KV bucket for persistent agent state (drain flags, timeline). 
+ StateKV jetstream.KeyValue // StreamName is the JetStream stream name (used to derive DLQ name). StreamName string } @@ -79,6 +82,7 @@ func New( kv: opts.KVBucket, registryKV: opts.RegistryKV, factsKV: opts.FactsKV, + stateKV: opts.StateKV, streamName: opts.StreamName, timeout: opts.Timeout, }, nil diff --git a/internal/job/client/query.go b/internal/job/client/query.go index f715c712..f3c02ca5 100644 --- a/internal/job/client/query.go +++ b/internal/job/client/query.go @@ -24,6 +24,7 @@ import ( "context" "encoding/json" "fmt" + "strings" "github.com/retr0h/osapi/internal/job" "github.com/retr0h/osapi/internal/provider/network/dns" @@ -401,6 +402,10 @@ func (c *Client) ListAgents( agents := make([]job.AgentInfo, 0, len(keys)) for _, key := range keys { + if !strings.HasPrefix(key, "agents.") { + continue + } + entry, err := c.registryKV.Get(ctx, key) if err != nil { continue @@ -413,6 +418,8 @@ func (c *Client) ListAgents( info := agentInfoFromRegistration(®) c.mergeFacts(ctx, &info) + c.overlayDrainState(ctx, &info) + agents = append(agents, info) } @@ -441,6 +448,13 @@ func (c *Client) GetAgent( info := agentInfoFromRegistration(®) c.mergeFacts(ctx, &info) + c.overlayDrainState(ctx, &info) + + timeline, err := c.GetAgentTimeline(ctx, hostname) + if err == nil && len(timeline) > 0 { + info.Timeline = timeline + } + return &info, nil } @@ -489,5 +503,7 @@ func agentInfoFromRegistration( LoadAverages: reg.LoadAverages, MemoryStats: reg.MemoryStats, AgentVersion: reg.AgentVersion, + Conditions: reg.Conditions, + State: reg.State, } } diff --git a/internal/job/client/query_public_test.go b/internal/job/client/query_public_test.go index 6c07b949..d5394087 100644 --- a/internal/job/client/query_public_test.go +++ b/internal/job/client/query_public_test.go @@ -1133,6 +1133,7 @@ func (s *QueryPublicTestSuite) TestListAgents() { tests := []struct { name string setupMockKV func(*jobmocks.MockKeyValue) + setupStateKV func(*jobmocks.MockKeyValue) setupMockFactsKV 
func(*jobmocks.MockKeyValue) useRegistryKV bool useFactsKV bool @@ -1185,6 +1186,14 @@ func (s *QueryPublicTestSuite) TestListAgents() { Get(gomock.Any(), "agents.server2"). Return(entry2, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + kv.EXPECT(). + Get(gomock.Any(), "drain.server2"). + Return(nil, errors.New("key not found")) + }, expectedCount: 2, validateFunc: func(agents []job.AgentInfo) { s.Equal("server1", agents[0].Hostname) @@ -1228,6 +1237,11 @@ func (s *QueryPublicTestSuite) TestListAgents() { Get(gomock.Any(), "agents.server2"). Return(nil, errors.New("key not found")) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + }, expectedCount: 1, }, { @@ -1252,6 +1266,11 @@ func (s *QueryPublicTestSuite) TestListAgents() { Get(gomock.Any(), "agents.server2"). Return(entry2, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + }, expectedCount: 1, }, { @@ -1273,6 +1292,11 @@ func (s *QueryPublicTestSuite) TestListAgents() { Get(gomock.Any(), "agents.server1"). Return(entry, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + }, setupMockFactsKV: func(kv *jobmocks.MockKeyValue) { factsEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) factsEntry.EXPECT().Value().Return( @@ -1317,6 +1341,11 @@ func (s *QueryPublicTestSuite) TestListAgents() { Get(gomock.Any(), "agents.server1"). Return(entry, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). 
+ Return(nil, errors.New("key not found")) + }, expectedCount: 1, validateFunc: func(agents []job.AgentInfo) { s.Equal("server1", agents[0].Hostname) @@ -1347,6 +1376,11 @@ func (s *QueryPublicTestSuite) TestListAgents() { Get(gomock.Any(), "agents.server1"). Return(entry, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + }, setupMockFactsKV: func(kv *jobmocks.MockKeyValue) { kv.EXPECT(). Get(gomock.Any(), "facts.server1"). @@ -1378,6 +1412,11 @@ func (s *QueryPublicTestSuite) TestListAgents() { Get(gomock.Any(), "agents.server1"). Return(entry, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + }, setupMockFactsKV: func(kv *jobmocks.MockKeyValue) { factsEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) factsEntry.EXPECT().Value().Return([]byte(`not valid json`)) @@ -1409,6 +1448,11 @@ func (s *QueryPublicTestSuite) TestListAgents() { if tt.useRegistryKV { opts.RegistryKV = registryKV } + if tt.setupStateKV != nil { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + tt.setupStateKV(stateKV) + opts.StateKV = stateKV + } if tt.useFactsKV { factsKV := jobmocks.NewMockKeyValue(s.mockCtrl) if tt.setupMockFactsKV != nil { @@ -1444,6 +1488,7 @@ func (s *QueryPublicTestSuite) TestGetAgent() { name string hostname string setupMockKV func(*jobmocks.MockKeyValue) + setupStateKV func(*jobmocks.MockKeyValue) setupMockFactsKV func(*jobmocks.MockKeyValue) useRegistryKV bool useFactsKV bool @@ -1473,6 +1518,14 @@ func (s *QueryPublicTestSuite) TestGetAgent() { Get(gomock.Any(), "agents.server1"). Return(entry, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + kv.EXPECT(). + Keys(gomock.Any()). 
+ Return(nil, errors.New("nats: no keys found")) + }, validateFunc: func(info *job.AgentInfo) { s.Equal("server1", info.Hostname) s.Equal(map[string]string{"group": "web"}, info.Labels) @@ -1524,6 +1577,14 @@ func (s *QueryPublicTestSuite) TestGetAgent() { Get(gomock.Any(), "agents.server1"). Return(entry, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + kv.EXPECT(). + Keys(gomock.Any()). + Return(nil, errors.New("nats: no keys found")) + }, setupMockFactsKV: func(kv *jobmocks.MockKeyValue) { factsEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) factsEntry.EXPECT().Value().Return( @@ -1565,6 +1626,14 @@ func (s *QueryPublicTestSuite) TestGetAgent() { Get(gomock.Any(), "agents.server1"). Return(entry, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + kv.EXPECT(). + Keys(gomock.Any()). + Return(nil, errors.New("nats: no keys found")) + }, validateFunc: func(info *job.AgentInfo) { s.Equal("server1", info.Hostname) s.Empty(info.Architecture) @@ -1591,6 +1660,14 @@ func (s *QueryPublicTestSuite) TestGetAgent() { Get(gomock.Any(), "agents.server1"). Return(entry, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + kv.EXPECT(). + Keys(gomock.Any()). + Return(nil, errors.New("nats: no keys found")) + }, setupMockFactsKV: func(kv *jobmocks.MockKeyValue) { kv.EXPECT(). Get(gomock.Any(), "facts.server1"). 
@@ -1602,6 +1679,96 @@ func (s *QueryPublicTestSuite) TestGetAgent() { s.Empty(info.KernelVersion) }, }, + { + name: "when timeline events exist includes timeline in response", + hostname: "server1", + useRegistryKV: true, + setupMockKV: func(kv *jobmocks.MockKeyValue) { + entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry.EXPECT().Value().Return( + []byte( + `{"hostname":"server1","registered_at":"2026-01-01T00:00:00Z"}`, + ), + ) + kv.EXPECT(). + Get(gomock.Any(), "agents.server1"). + Return(entry, nil) + }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + + // GetAgentTimeline calls Keys then Get for matching keys + kv.EXPECT(). + Keys(gomock.Any()). + Return([]string{ + "agents.server1", + "timeline.server1.drain.1000000000", + "timeline.server1.undrain.2000000000", + }, nil) + + drainEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + drainEntry.EXPECT().Value().Return( + []byte( + `{"timestamp":"2026-01-01T01:00:00Z","event":"drain","hostname":"server1","message":"node draining"}`, + ), + ) + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.drain.1000000000"). + Return(drainEntry, nil) + + undrainEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + undrainEntry.EXPECT().Value().Return( + []byte( + `{"timestamp":"2026-01-01T02:00:00Z","event":"undrain","hostname":"server1","message":"node undrained"}`, + ), + ) + kv.EXPECT(). + Get(gomock.Any(), "timeline.server1.undrain.2000000000"). 
+ Return(undrainEntry, nil) + }, + validateFunc: func(info *job.AgentInfo) { + s.Equal("server1", info.Hostname) + s.Len(info.Timeline, 2) + s.Equal("drain", info.Timeline[0].Event) + s.Equal("node draining", info.Timeline[0].Message) + s.Equal("undrain", info.Timeline[1].Event) + s.Equal("node undrained", info.Timeline[1].Message) + }, + }, + { + name: "when conditions and state set includes them in response", + hostname: "server1", + useRegistryKV: true, + setupMockKV: func(kv *jobmocks.MockKeyValue) { + entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl) + entry.EXPECT().Value().Return( + []byte( + `{"hostname":"server1","registered_at":"2026-01-01T00:00:00Z","state":"Draining","conditions":[{"type":"DiskPressure","status":true,"reason":"disk usage 92%","last_transition_time":"2026-01-01T00:00:00Z"}]}`, + ), + ) + kv.EXPECT(). + Get(gomock.Any(), "agents.server1"). + Return(entry, nil) + }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + kv.EXPECT(). + Keys(gomock.Any()). 
+ Return(nil, errors.New("nats: no keys found")) + }, + validateFunc: func(info *job.AgentInfo) { + s.Equal("server1", info.Hostname) + s.Equal("Draining", info.State) + s.Len(info.Conditions, 1) + s.Equal("DiskPressure", info.Conditions[0].Type) + s.True(info.Conditions[0].Status) + s.Equal("disk usage 92%", info.Conditions[0].Reason) + }, + }, } for _, tt := range tests { @@ -1618,6 +1785,11 @@ func (s *QueryPublicTestSuite) TestGetAgent() { if tt.useRegistryKV { opts.RegistryKV = registryKV } + if tt.setupStateKV != nil { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + tt.setupStateKV(stateKV) + opts.StateKV = stateKV + } if tt.useFactsKV { factsKV := jobmocks.NewMockKeyValue(s.mockCtrl) if tt.setupMockFactsKV != nil { @@ -1730,6 +1902,7 @@ func (s *QueryPublicTestSuite) TestQueryNodeDiskBroadcast() { timeout time.Duration opts *publishAndCollectMockOpts setupRegistryKV func(*jobmocks.MockKeyValue) + setupStateKV func(*jobmocks.MockKeyValue) expectError bool errorContains string expectedCount int @@ -1771,6 +1944,14 @@ func (s *QueryPublicTestSuite) TestQueryNodeDiskBroadcast() { Get(gomock.Any(), "agents.server2"). Return(entry2, nil) }, + setupStateKV: func(kv *jobmocks.MockKeyValue) { + kv.EXPECT(). + Get(gomock.Any(), "drain.server1"). + Return(nil, errors.New("key not found")) + kv.EXPECT(). + Get(gomock.Any(), "drain.server2"). 
+ Return(nil, errors.New("key not found")) + }, expectedCount: 2, }, { @@ -1834,6 +2015,12 @@ func (s *QueryPublicTestSuite) TestQueryNodeDiskBroadcast() { opts.RegistryKV = mockRegistryKV } + if tt.setupStateKV != nil { + stateKV := jobmocks.NewMockKeyValue(s.mockCtrl) + tt.setupStateKV(stateKV) + opts.StateKV = stateKV + } + jobsClient, err := client.New(slog.Default(), s.mockNATSClient, opts) s.Require().NoError(err) diff --git a/internal/job/client/types.go b/internal/job/client/types.go index 4da2e789..5723b5dc 100644 --- a/internal/job/client/types.go +++ b/internal/job/client/types.go @@ -228,6 +228,30 @@ type JobClient interface { hostname string, ) (*job.AgentInfo, error) + // Agent timeline + WriteAgentTimelineEvent( + ctx context.Context, + hostname, event, message string, + ) error + GetAgentTimeline( + ctx context.Context, + hostname string, + ) ([]job.TimelineEvent, error) + + // Agent drain flag + CheckDrainFlag( + ctx context.Context, + hostname string, + ) bool + SetDrainFlag( + ctx context.Context, + hostname string, + ) error + DeleteDrainFlag( + ctx context.Context, + hostname string, + ) error + // Job deletion DeleteJob( ctx context.Context, diff --git a/internal/job/mocks/job_client.gen.go b/internal/job/mocks/job_client.gen.go index 2ea4cb88..60a0265d 100644 --- a/internal/job/mocks/job_client.gen.go +++ b/internal/job/mocks/job_client.gen.go @@ -44,6 +44,20 @@ func (m *MockJobClient) EXPECT() *MockJobClientMockRecorder { return m.recorder } +// CheckDrainFlag mocks base method. +func (m *MockJobClient) CheckDrainFlag(arg0 context.Context, arg1 string) bool { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "CheckDrainFlag", arg0, arg1) + ret0, _ := ret[0].(bool) + return ret0 +} + +// CheckDrainFlag indicates an expected call of CheckDrainFlag. 
+func (mr *MockJobClientMockRecorder) CheckDrainFlag(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckDrainFlag", reflect.TypeOf((*MockJobClient)(nil).CheckDrainFlag), arg0, arg1) +} + // ConsumeJobs mocks base method. func (m *MockJobClient) ConsumeJobs(arg0 context.Context, arg1, arg2 string, arg3 func(jetstream.Msg) error, arg4 *client.ConsumeOptions) error { m.ctrl.T.Helper() @@ -87,6 +101,20 @@ func (mr *MockJobClientMockRecorder) CreateOrUpdateConsumer(arg0, arg1, arg2 int return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateOrUpdateConsumer", reflect.TypeOf((*MockJobClient)(nil).CreateOrUpdateConsumer), arg0, arg1, arg2) } +// DeleteDrainFlag mocks base method. +func (m *MockJobClient) DeleteDrainFlag(arg0 context.Context, arg1 string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "DeleteDrainFlag", arg0, arg1) + ret0, _ := ret[0].(error) + return ret0 +} + +// DeleteDrainFlag indicates an expected call of DeleteDrainFlag. +func (mr *MockJobClientMockRecorder) DeleteDrainFlag(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteDrainFlag", reflect.TypeOf((*MockJobClient)(nil).DeleteDrainFlag), arg0, arg1) +} + // DeleteJob mocks base method. func (m *MockJobClient) DeleteJob(arg0 context.Context, arg1 string) error { m.ctrl.T.Helper() @@ -116,6 +144,21 @@ func (mr *MockJobClientMockRecorder) GetAgent(arg0, arg1 interface{}) *gomock.Ca return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAgent", reflect.TypeOf((*MockJobClient)(nil).GetAgent), arg0, arg1) } +// GetAgentTimeline mocks base method. 
+func (m *MockJobClient) GetAgentTimeline(arg0 context.Context, arg1 string) ([]job.TimelineEvent, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetAgentTimeline", arg0, arg1) + ret0, _ := ret[0].([]job.TimelineEvent) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetAgentTimeline indicates an expected call of GetAgentTimeline. +func (mr *MockJobClientMockRecorder) GetAgentTimeline(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAgentTimeline", reflect.TypeOf((*MockJobClient)(nil).GetAgentTimeline), arg0, arg1) +} + // GetJobData mocks base method. func (m *MockJobClient) GetJobData(arg0 context.Context, arg1 string) ([]byte, error) { m.ctrl.T.Helper() @@ -763,6 +806,34 @@ func (mr *MockJobClientMockRecorder) RetryJob(arg0, arg1, arg2 interface{}) *gom return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RetryJob", reflect.TypeOf((*MockJobClient)(nil).RetryJob), arg0, arg1, arg2) } +// SetDrainFlag mocks base method. +func (m *MockJobClient) SetDrainFlag(arg0 context.Context, arg1 string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "SetDrainFlag", arg0, arg1) + ret0, _ := ret[0].(error) + return ret0 +} + +// SetDrainFlag indicates an expected call of SetDrainFlag. +func (mr *MockJobClientMockRecorder) SetDrainFlag(arg0, arg1 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetDrainFlag", reflect.TypeOf((*MockJobClient)(nil).SetDrainFlag), arg0, arg1) +} + +// WriteAgentTimelineEvent mocks base method. +func (m *MockJobClient) WriteAgentTimelineEvent(arg0 context.Context, arg1, arg2, arg3 string) error { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "WriteAgentTimelineEvent", arg0, arg1, arg2, arg3) + ret0, _ := ret[0].(error) + return ret0 +} + +// WriteAgentTimelineEvent indicates an expected call of WriteAgentTimelineEvent. 
+func (mr *MockJobClientMockRecorder) WriteAgentTimelineEvent(arg0, arg1, arg2, arg3 interface{}) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WriteAgentTimelineEvent", reflect.TypeOf((*MockJobClient)(nil).WriteAgentTimelineEvent), arg0, arg1, arg2, arg3) +} + // WriteJobResponse mocks base method. func (m *MockJobClient) WriteJobResponse(arg0 context.Context, arg1, arg2 string, arg3 []byte, arg4, arg5 string, arg6 *bool) error { m.ctrl.T.Helper() diff --git a/internal/job/subjects.go b/internal/job/subjects.go index b6bab149..0085bf64 100644 --- a/internal/job/subjects.go +++ b/internal/job/subjects.go @@ -339,10 +339,19 @@ func CountExpectedAgents( switch routingType { case BroadcastHost: - return len(agents) + count := 0 + for i := range agents { + if agents[i].State != AgentStateCordoned && agents[i].State != AgentStateDraining { + count++ + } + } + return count case "label": count := 0 for i := range agents { + if agents[i].State == AgentStateCordoned || agents[i].State == AgentStateDraining { + continue + } if agentVal, ok := agents[i].Labels[key]; ok { if agentVal == value || strings.HasPrefix(agentVal, value+".") { count++ diff --git a/internal/job/subjects_public_test.go b/internal/job/subjects_public_test.go index 55b0cc6a..b16315ea 100644 --- a/internal/job/subjects_public_test.go +++ b/internal/job/subjects_public_test.go @@ -956,6 +956,51 @@ func (suite *SubjectsPublicTestSuite) TestCountExpectedAgents() { target: "_any", want: 0, }, + { + name: "when _all excludes cordoned agents", + agents: []job.AgentInfo{ + {Hostname: "web-01"}, + {Hostname: "web-02", State: job.AgentStateCordoned}, + {Hostname: "web-03"}, + }, + target: "_all", + want: 2, + }, + { + name: "when _all excludes draining agents", + agents: []job.AgentInfo{ + {Hostname: "web-01"}, + {Hostname: "web-02", State: job.AgentStateDraining}, + }, + target: "_all", + want: 1, + }, + { + name: "when label match excludes cordoned agents", + 
agents: []job.AgentInfo{ + { + Hostname: "web-01", + Labels: map[string]string{"group": "web.dev"}, + State: job.AgentStateCordoned, + }, + {Hostname: "web-02", Labels: map[string]string{"group": "web.dev"}}, + }, + target: "group:web", + want: 1, + }, + { + name: "when label match excludes draining agents", + agents: []job.AgentInfo{ + { + Hostname: "web-01", + Labels: map[string]string{"group": "web.dev"}, + State: job.AgentStateDraining, + }, + {Hostname: "web-02", Labels: map[string]string{"group": "web.dev"}}, + }, + target: "group:web", + want: 1, + }, } for _, tt := range tests { diff --git a/internal/job/types.go b/internal/job/types.go index 77c87403..753a9be3 100644 --- a/internal/job/types.go +++ b/internal/job/types.go @@ -270,6 +270,28 @@ type FactsRegistration struct { Facts map[string]any `json:"facts,omitempty"` } +// Condition type constants. +const ( + ConditionMemoryPressure = "MemoryPressure" + ConditionHighLoad = "HighLoad" + ConditionDiskPressure = "DiskPressure" +) + +// Agent state constants. +const ( + AgentStateReady = "Ready" + AgentStateDraining = "Draining" + AgentStateCordoned = "Cordoned" +) + +// Condition represents a node condition evaluated agent-side. +type Condition struct { + Type string `json:"type"` + Status bool `json:"status"` + Reason string `json:"reason,omitempty"` + LastTransitionTime time.Time `json:"last_transition_time"` +} + // AgentRegistration represents an agent's registration entry in the KV registry. type AgentRegistration struct { // Hostname is the hostname of the agent. @@ -290,6 +312,10 @@ type AgentRegistration struct { MemoryStats *mem.Stats `json:"memory_stats,omitempty"` // AgentVersion is the version of the agent binary. AgentVersion string `json:"agent_version,omitempty"` + // Conditions contains the evaluated node conditions. + Conditions []Condition `json:"conditions,omitempty"` + // State is the agent's scheduling state (Ready, Draining, Cordoned). 
+ State string `json:"state,omitempty"` } // AgentInfo represents information about an active agent. @@ -328,6 +354,12 @@ type AgentInfo struct { Interfaces []NetworkInterface `json:"interfaces,omitempty"` // Facts contains arbitrary key-value facts collected by the agent. Facts map[string]any `json:"facts,omitempty"` + // Conditions contains the evaluated node conditions. + Conditions []Condition `json:"conditions,omitempty"` + // State is the agent's scheduling state (Ready, Draining, Cordoned). + State string `json:"state,omitempty"` + // Timeline contains the chronological sequence of state transition events. + Timeline []TimelineEvent `json:"timeline,omitempty"` } // NodeDiskResponse represents the response for node.disk.get operations. diff --git a/internal/provider/node/mem/darwin_get_vm.go b/internal/provider/node/mem/darwin_get_vm.go index 819dd072..6df705a1 100644 --- a/internal/provider/node/mem/darwin_get_vm.go +++ b/internal/provider/node/mem/darwin_get_vm.go @@ -30,8 +30,9 @@ func (d *Darwin) GetStats() (*Stats, error) { } return &Stats{ - Total: memInfo.Total, - Free: memInfo.Free, - Cached: memInfo.Cached, + Total: memInfo.Total, + Available: memInfo.Available, + Free: memInfo.Free, + Cached: memInfo.Cached, }, nil } diff --git a/internal/provider/node/mem/types.go b/internal/provider/node/mem/types.go index 2f242555..8a592d80 100644 --- a/internal/provider/node/mem/types.go +++ b/internal/provider/node/mem/types.go @@ -30,6 +30,8 @@ type Provider interface { type Stats struct { // Total memory in bytes Total uint64 + // Available memory in bytes (free + reclaimable) + Available uint64 // Free memory in bytes Free uint64 // Cached memory in bytes diff --git a/internal/provider/node/mem/ubuntu_get_vm.go b/internal/provider/node/mem/ubuntu_get_vm.go index ae4e1982..84411367 100644 --- a/internal/provider/node/mem/ubuntu_get_vm.go +++ b/internal/provider/node/mem/ubuntu_get_vm.go @@ -30,8 +30,9 @@ func (u *Ubuntu) GetStats() (*Stats, error) { } return 
&Stats{ - Total: memInfo.Total, - Free: memInfo.Free, - Cached: memInfo.Cached, + Total: memInfo.Total, + Available: memInfo.Available, + Free: memInfo.Free, + Cached: memInfo.Cached, }, nil }