diff --git a/CLAUDE.md b/CLAUDE.md
index 995e7c74..c88029c2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -266,6 +266,7 @@ Conventions:
- Suite naming: `*_public_test.go` → `{Name}PublicTestSuite`,
`*_test.go` → `{Name}TestSuite`
- Table-driven structure with `validateFunc` callbacks
+- One suite method per function under test — all scenarios (success, errors, edge cases) as rows in one table
- Avoid generic file names like `helpers.go` or `utils.go` — name
files after what they contain
diff --git a/README.md b/README.md
index 28064460..cff63853 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,7 @@ them to be used as appliances.
| 🌐 **[Network Management][]** | DNS read/update, ping |
| ⚙️ **[Command Execution][]** | Remote exec and shell across managed hosts |
| 📊 **[System Facts][]** | Agent-collected system facts — architecture, kernel, FQDN, CPUs, network interfaces, service/package manager |
+| 🔄 **[Agent Lifecycle][]** | Node conditions (memory, disk, load pressure), graceful drain/cordon for maintenance |
| ⚡ **[Async Job System][]** | NATS JetStream with KV-first architecture — broadcast, load-balanced, and label-based routing across hosts |
| 💚 **[Health][] & [Metrics][]** | Liveness, readiness, system status endpoints, Prometheus `/metrics` |
| 📋 **[Audit Logging][]** | Structured API audit trail in NATS KV with 30-day retention and admin-only read access |
@@ -77,5 +78,6 @@ them to be used as appliances.
The [MIT][] License.
+[Agent Lifecycle]: https://osapi-io.github.io/osapi/sidebar/features/agent-lifecycle
[System Facts]: https://osapi-io.github.io/osapi/sidebar/features/node-management
[MIT]: LICENSE
diff --git a/cmd/api_helpers.go b/cmd/api_helpers.go
index c07f2811..72254b8a 100644
--- a/cmd/api_helpers.go
+++ b/cmd/api_helpers.go
@@ -74,6 +74,7 @@ type natsBundle struct {
jobsKV jetstream.KeyValue
registryKV jetstream.KeyValue
factsKV jetstream.KeyValue
+ stateKV jetstream.KeyValue
}
// setupAPIServer connects to NATS, creates the API server with all handlers,
@@ -112,7 +113,7 @@ func setupAPIServer(
checker := newHealthChecker(b.nc, b.jobsKV)
auditStore, auditKV, serverOpts := createAuditStore(ctx, log, b.nc, namespace)
metricsProvider := newMetricsProvider(
- b.nc, b.jobsKV, b.registryKV, b.factsKV, auditKV, streamName, b.jobClient,
+ b.nc, b.jobsKV, b.registryKV, b.factsKV, b.stateKV, auditKV, streamName, b.jobClient,
)
sm := api.New(appConfig, log, serverOpts...)
@@ -163,11 +164,21 @@ func connectNATSBundle(
}
}
+ var stateKV jetstream.KeyValue
+ if appConfig.NATS.State.Bucket != "" {
+ stateKVConfig := cli.BuildStateKVConfig(namespace, appConfig.NATS.State)
+ stateKV, err = nc.CreateOrUpdateKVBucketWithConfig(ctx, stateKVConfig)
+ if err != nil {
+ cli.LogFatal(log, "failed to create state KV bucket", err)
+ }
+ }
+
jc, err := jobclient.New(log, nc, &jobclient.Options{
Timeout: 30 * time.Second,
KVBucket: jobsKV,
RegistryKV: registryKV,
FactsKV: factsKV,
+ StateKV: stateKV,
StreamName: streamName,
})
if err != nil {
@@ -180,6 +191,7 @@ func connectNATSBundle(
jobsKV: jobsKV,
registryKV: registryKV,
factsKV: factsKV,
+ stateKV: stateKV,
}
}
@@ -216,6 +228,7 @@ func newMetricsProvider(
jobsKV jetstream.KeyValue,
registryKV jetstream.KeyValue,
factsKV jetstream.KeyValue,
+ stateKV jetstream.KeyValue,
auditKV jetstream.KeyValue,
streamName string,
jc jobclient.JobClient,
@@ -254,7 +267,7 @@ func newMetricsProvider(
}, nil
},
KVInfoFn: func(fnCtx context.Context) ([]health.KVMetrics, error) {
- buckets := []jetstream.KeyValue{jobsKV, registryKV, factsKV, auditKV}
+ buckets := []jetstream.KeyValue{jobsKV, registryKV, factsKV, stateKV, auditKV}
results := make([]health.KVMetrics, 0, len(buckets))
for _, kv := range buckets {
diff --git a/cmd/client_agent_drain.go b/cmd/client_agent_drain.go
new file mode 100644
index 00000000..c59c167e
--- /dev/null
+++ b/cmd/client_agent_drain.go
@@ -0,0 +1,61 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package cmd
+
+import (
+ "fmt"
+
+ "github.com/spf13/cobra"
+
+ "github.com/retr0h/osapi/internal/cli"
+)
+
+// clientAgentDrainCmd represents the clientAgentDrain command.
+var clientAgentDrainCmd = &cobra.Command{
+ Use: "drain",
+ Short: "Drain an agent",
+ Long: `Stop an agent from accepting new jobs. In-flight jobs continue to completion.`,
+ Run: func(cmd *cobra.Command, _ []string) {
+ ctx := cmd.Context()
+ hostname, _ := cmd.Flags().GetString("hostname")
+
+ resp, err := sdkClient.Agent.Drain(ctx, hostname)
+ if err != nil {
+ cli.HandleError(err, logger)
+ return
+ }
+
+ if jsonOutput {
+ fmt.Println(string(resp.RawJSON()))
+ return
+ }
+
+ fmt.Println()
+ cli.PrintKV("Hostname", hostname, "Status", "Draining")
+ cli.PrintKV("Message", resp.Data.Message)
+ },
+}
+
+func init() {
+ clientAgentCmd.AddCommand(clientAgentDrainCmd)
+ clientAgentDrainCmd.Flags().String("hostname", "", "Hostname of the agent to drain")
+ _ = clientAgentDrainCmd.MarkFlagRequired("hostname")
+}
diff --git a/cmd/client_agent_get.go b/cmd/client_agent_get.go
index 52232ccc..dd3c0be9 100644
--- a/cmd/client_agent_get.go
+++ b/cmd/client_agent_get.go
@@ -64,6 +64,10 @@ func displayAgentGetDetail(
kvArgs := []string{"Hostname", data.Hostname, "Status", data.Status}
cli.PrintKV(kvArgs...)
+ if data.State != "" && data.State != "Ready" {
+ cli.PrintKV("State", data.State)
+ }
+
if len(data.Labels) > 0 {
cli.PrintKV("Labels", cli.FormatLabels(data.Labels))
}
@@ -138,6 +142,48 @@ func displayAgentGetDetail(
cli.PrintKV("Interface "+iface.Name, strings.Join(parts, " "))
}
}
+
+ var sections []cli.Section
+
+ if len(data.Conditions) > 0 {
+ condRows := make([][]string, 0, len(data.Conditions))
+ for _, c := range data.Conditions {
+ status := "false"
+ if c.Status {
+ status = "true"
+ }
+ reason := c.Reason
+ since := ""
+ if !c.LastTransitionTime.IsZero() {
+ since = cli.FormatAge(time.Since(c.LastTransitionTime)) + " ago"
+ }
+ condRows = append(condRows, []string{c.Type, status, reason, since})
+ }
+ sections = append(sections, cli.Section{
+ Title: "Conditions",
+ Headers: []string{"TYPE", "STATUS", "REASON", "SINCE"},
+ Rows: condRows,
+ })
+ }
+
+ if len(data.Timeline) > 0 {
+ timelineRows := make([][]string, 0, len(data.Timeline))
+ for _, te := range data.Timeline {
+ timelineRows = append(
+ timelineRows,
+ []string{te.Timestamp, te.Event, te.Hostname, te.Message, te.Error},
+ )
+ }
+ sections = append(sections, cli.Section{
+ Title: "Timeline",
+ Headers: []string{"TIMESTAMP", "EVENT", "HOSTNAME", "MESSAGE", "ERROR"},
+ Rows: timelineRows,
+ })
+ }
+
+ for _, sec := range sections {
+ cli.PrintCompactTable([]cli.Section{sec})
+ }
}
func init() {
diff --git a/cmd/client_agent_list.go b/cmd/client_agent_list.go
index 89ac4ccc..c43a5c56 100644
--- a/cmd/client_agent_list.go
+++ b/cmd/client_agent_list.go
@@ -22,6 +22,7 @@ package cmd
import (
"fmt"
+ "strings"
"time"
"github.com/spf13/cobra"
@@ -57,6 +58,22 @@ Shows each agent's hostname, status, labels, age, load, and OS.`,
rows := make([][]string, 0, len(agents))
for _, a := range agents {
+ status := a.State
+ if status == "" {
+ status = "Ready"
+ }
+ conditions := "-"
+ if len(a.Conditions) > 0 {
+ active := make([]string, 0)
+ for _, c := range a.Conditions {
+ if c.Status {
+ active = append(active, c.Type)
+ }
+ }
+ if len(active) > 0 {
+ conditions = strings.Join(active, ",")
+ }
+ }
labels := cli.FormatLabels(a.Labels)
age := ""
if !a.StartedAt.IsZero() {
@@ -72,7 +89,8 @@ Shows each agent's hostname, status, labels, age, load, and OS.`,
}
rows = append(rows, []string{
a.Hostname,
- a.Status,
+ status,
+ conditions,
labels,
age,
loadStr,
@@ -82,9 +100,17 @@ Shows each agent's hostname, status, labels, age, load, and OS.`,
sections := []cli.Section{
{
- Title: fmt.Sprintf("Active Agents (%d)", resp.Data.Total),
- Headers: []string{"HOSTNAME", "STATUS", "LABELS", "AGE", "LOAD (1m)", "OS"},
- Rows: rows,
+ Title: fmt.Sprintf("Active Agents (%d)", resp.Data.Total),
+ Headers: []string{
+ "HOSTNAME",
+ "STATUS",
+ "CONDITIONS",
+ "LABELS",
+ "AGE",
+ "LOAD (1m)",
+ "OS",
+ },
+ Rows: rows,
},
}
cli.PrintCompactTable(sections)
diff --git a/cmd/client_agent_undrain.go b/cmd/client_agent_undrain.go
new file mode 100644
index 00000000..7f668f03
--- /dev/null
+++ b/cmd/client_agent_undrain.go
@@ -0,0 +1,61 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package cmd
+
+import (
+ "fmt"
+
+ "github.com/spf13/cobra"
+
+ "github.com/retr0h/osapi/internal/cli"
+)
+
+// clientAgentUndrainCmd represents the clientAgentUndrain command.
+var clientAgentUndrainCmd = &cobra.Command{
+ Use: "undrain",
+ Short: "Undrain an agent",
+ Long: `Resume accepting jobs on a drained agent.`,
+ Run: func(cmd *cobra.Command, _ []string) {
+ ctx := cmd.Context()
+ hostname, _ := cmd.Flags().GetString("hostname")
+
+ resp, err := sdkClient.Agent.Undrain(ctx, hostname)
+ if err != nil {
+ cli.HandleError(err, logger)
+ return
+ }
+
+ if jsonOutput {
+ fmt.Println(string(resp.RawJSON()))
+ return
+ }
+
+ fmt.Println()
+ cli.PrintKV("Hostname", hostname, "Status", "Ready")
+ cli.PrintKV("Message", resp.Data.Message)
+ },
+}
+
+func init() {
+ clientAgentCmd.AddCommand(clientAgentUndrainCmd)
+ clientAgentUndrainCmd.Flags().String("hostname", "", "Hostname of the agent to undrain")
+ _ = clientAgentUndrainCmd.MarkFlagRequired("hostname")
+}
diff --git a/cmd/nats_helpers.go b/cmd/nats_helpers.go
index 1f189efc..2fbce3c5 100644
--- a/cmd/nats_helpers.go
+++ b/cmd/nats_helpers.go
@@ -172,6 +172,14 @@ func setupJetStream(
}
}
+ // Create state KV bucket with configured settings (no TTL)
+ if appConfig.NATS.State.Bucket != "" {
+ stateKVConfig := cli.BuildStateKVConfig(namespace, appConfig.NATS.State)
+ if _, err := nc.CreateOrUpdateKVBucketWithConfig(ctx, stateKVConfig); err != nil {
+ return fmt.Errorf("create state KV bucket %s: %w", stateKVConfig.Bucket, err)
+ }
+ }
+
// Create DLQ stream
dlqMaxAge, _ := time.ParseDuration(appConfig.NATS.DLQ.MaxAge)
dlqStorage := cli.ParseJetstreamStorageType(appConfig.NATS.DLQ.Storage)
diff --git a/configs/osapi.yaml b/configs/osapi.yaml
index e037a11a..3a09cd5a 100644
--- a/configs/osapi.yaml
+++ b/configs/osapi.yaml
@@ -98,6 +98,11 @@ nats:
storage: file
replicas: 1
+ state:
+ bucket: agent-state
+ storage: file
+ replicas: 1
+
telemetry:
tracing:
enabled: true
@@ -133,3 +138,7 @@ agent:
group: web.dev.us-east # hierarchical: --target group:web, group:web.dev, etc.
facts:
interval: 60s
+ conditions:
+ memory_pressure_threshold: 90
+ high_load_multiplier: 2.0
+ disk_pressure_threshold: 90
diff --git a/docs/docs/gen/api/drain-agent.api.mdx b/docs/docs/gen/api/drain-agent.api.mdx
new file mode 100644
index 00000000..3e2e3d21
--- /dev/null
+++ b/docs/docs/gen/api/drain-agent.api.mdx
@@ -0,0 +1,525 @@
+---
+id: drain-agent
+title: "Drain an agent"
+description: "Stop the agent from accepting new jobs. In-flight jobs continue to completion."
+sidebar_label: "Drain an agent"
+hide_title: true
+hide_table_of_contents: true
+api: eJztVsFu20YQ/ZXFnFqAlpQ0PZQ3FakBBQhixAp6cAVjxB2Ja5O7zOzQjkrw34NZyjJt2UEOCdACPmmX3Jl5895w9ToIDTGKC35hIQfL6Px8S14gA0uxYNfoS8jhXEJjpCSD+tpsONQGi4IacX5rPN2aq7COE7PwJ5vKbUtJe1MEL863ZCSYItRNRZpv8o+HDAS3EfILSAUv36PHLdW6nJ8tLlOZywO8CKsMIhUtO9lBftHBn4RMPG+l1BzpeH7LTghW/SqDBhlrEuKYTnusCXIoQ5S0zMBpVw1KCRkwfW4dk4VcuKUMYlFSjZB3ILtG46Kw81voH7OyLMnc5TRhMyJIgklsTkDRMMUm+EhRc76ezfTnYaZEwhBinHfiUMhOIANlUAXJO8CmqVyR+JheRQ3rjqGG9RUVql/Dyp64oWhNMeKWnuipH/d/cTi46nt99Wb26hjsJ4+tlIHdv2TNiZmfLcw17cwhzQ9DTcyBjzE/lmFuRvs7HVKskRLFhKJomQc+6QvqGEIOp+gqsqoUk7CjGzJRUNo4GYQWdFX8juLWOl1iZfYxBtehlXsQT5a1w0fhSW4DXxtxNYVWUuki2LFQzgttiY8KLw9NasCDIr/PZiqeOEkl/9JTH/czCHfC/nYs7GngtbOWvDkxCx/bzcYVTgezIa5djOlLfFH3/6Dum+fuGB/EbELrf+Tt8qLkT1Tyj+eUxIoJ7c44n65eikI2UUMv0v73pe0zqEnKoMarCTExr3Ykh2myENPuzln00+QLQB0Q3wyWZmSHzlXJQayxKTqAL0Ua2Jsa3a/TIcj2i9PANQrk8O7vZXIDzm9CCt9DH4bt3p/p/z1koEAGFl5NZpOZsqZt1JjGa2+43iZDg35wRY/56+6H9KcYzKF/oS8ybSplsM+g5UoLD1TvjSNkkI+s4cD2Kkt2UQ913RojfeKq7/Xx55Z4N2hwg+xwrTRddGBd1LWFfINVpG80+8vHvVX61XyPhXymlf1D9DuVA6tWd5DBNe3GXrdf9RmUhJY4wRxezxO3o8CjS0KN62FCzz6cLyEDfDhaj0YppX8SVdcNJ5bhmnzfH0CK7hVh338FZL5baA==
+sidebar_class_name: "post api-method"
+info_path: gen/api/agent-management-api
+custom_edit_url: null
+---
+
+import ApiTabs from "@theme/ApiTabs";
+import DiscriminatorTabs from "@theme/DiscriminatorTabs";
+import MethodEndpoint from "@theme/ApiExplorer/MethodEndpoint";
+import SecuritySchemes from "@theme/ApiExplorer/SecuritySchemes";
+import MimeTabs from "@theme/MimeTabs";
+import ParamsItem from "@theme/ParamsItem";
+import ResponseSamples from "@theme/ResponseSamples";
+import SchemaItem from "@theme/SchemaItem";
+import SchemaTabs from "@theme/SchemaTabs";
+import Heading from "@theme/Heading";
+import OperationTabs from "@theme/OperationTabs";
+import TabItem from "@theme/TabItem";
+
+
+
+
+
+
+
+
+
+
+Stop the agent from accepting new jobs. In-flight jobs continue to completion.
+
+
+
+
+
+
+
+
+ Path Parameters
+
+
+
+
+
+
+
+
+
+ Agent drain initiated.
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unauthorized - API key required
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Forbidden - Insufficient permissions
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Agent not found.
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Agent already in requested state.
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/docs/gen/api/get-agent-details.api.mdx b/docs/docs/gen/api/get-agent-details.api.mdx
index 5769d8d5..2ef825ed 100644
--- a/docs/docs/gen/api/get-agent-details.api.mdx
+++ b/docs/docs/gen/api/get-agent-details.api.mdx
@@ -5,7 +5,7 @@ description: "Get detailed information about a specific agent by hostname."
sidebar_label: "Get agent details"
hide_title: true
hide_table_of_contents: true
-api: eJztWEtv2zgQ/isET7uAokiJnU10y3abIrt9BG2CHoLAoMWRzUYiFXLkxmvovy+Gkm3JdhIv2h4K9GRZmvnmLX2cBZfgUqtKVEbzhL8BZBJQqBwkUzozthD0iImxqZAJ5kpIVaZSJiagkY3nbGocalFAyANuSrBe/lLyhE8Az0nqLw/oeMBRTBxPbrm/PXontJhAQZfnV5cjjzhaQTh+F3AHaWUVznlyu+B/grBgzyucEoYXTywIye/qu4CXwooCEKzzwuQST/jSOx5wRQGWAqc84BYeKmVB8gRtBQF36RQKwZMFx3lJeg6t0hNeBxsJup7CKmJmMoZTaFOBhllAq2AGISeHLLjSaAeOYI+iiH76YD4Nbb4d5S81GkEjSYqyzFXqM3H4xZH4YttLM/4CKfKAl5byhqoxtgp6K569wwkpdIcCK7cLBXRVUBE+gpBzHvD3BpvLu10W0spairTB27aTizHkbldcQkpFQCK/6kX4QpX+gfnBTOQVsAaapUZnalJZkMzoDesWJsohWJAjgbuCbaaAJ1wKhANUvpv6Bj9PoQPLcuGQWcgsuCkNEjo2BWFxDGKVWYvf0WBpTQrOsRbX2zBuRBO8K619rA/NyOkJc3OHUHQHP9xqLqnIz3HVKO/TYN3meqt09ci6GGQBHkVR5gRzM640VuT+DKzb20QrvK+VoyiMBryuuy+C235kawfuAo4Kvd6HT5c6Mx/bwSYvq9KXZy8n2+w2Kk3jGyFHYgZWTODlOnUwSJG1io5lxrI4YMOACS1ZPGSF0hWC2y5eXKhuRnVVjMFuGXrbQffglFPf03EL3UtnFB4f1QEffiN2z+0O+NFpHfD4W9HjJ+GHm33gk9TGs7TcaQKycd6Y6HZCAYWx85er+M7LscqRi89OGhoUeQdQaYTJjqCvSY419pnSbDzfjPL0+PT0JKI8ZhZgD8gLC/As4lF09kc8pLJXjrL2IuKNA/ks4iA+GxxHg81iNEloHW+tdYrRZLNbB2HTqUJIsbJ7zOWrqxvW1ei/J0QhTwYEeg9WQz7a+5304RNrVJZvpj7uMIyHYXRwFh9MQINVKdlIy2qUmkrjHtl879ufXne5mahU5OzV1c1GPqnaD3IPZy+qPJ+zh0rkKlMgmTSFUJotSd3a7a8wPojisL0Rpqbw3zKwM5XCqJjYl21daoXtS6yP3dyTBFiK9F5M9gS8aoRZ4cmk3ahfiQRIWbSZSHvcQVgriLoohGIn9+gP41N8am0McBp5a+Vs8LxkfHYUxienYRzGS42T5zUyOI2SJPbvGZE+LxtFSRwnR0fJ8XEyGCTDIWllolD5fI/6XDEhpSUu0aj0EyorP4xL+qc0oGfWgCcERU/vNifYZ64zsu8Bvxp7f7ksynp4vZ8p/g8m2LD3fgivHxG0BMk8FsusKdham6jSTEmwLtwiAJ3DQkt9O257sk6ff17XpDiI4m1Gf6NFhVNj1b8g2QE7v7pk9zBnKyPfjeKDtWaP6Thnnf9LduR1GU4FMpN6bi77Vb5ozn+dE01L3cOGajfHuZeNr5Pe6rTHyJUTO83KCsi0btqEEVkyVUObUyP3+Xpdr4IkhZ6RYRRR8ZZFfU1SnQZsCnu8XdgLY8dKStDsgF1qV2WZSpVn3mAL5Zw/sP6q7s9Q3cFTB3FtkGWm0r/G9Gco5HDXRsULLtNBJ1rxg1Ysvwr7gwrrT1I4Ne0CkRJPK7uEH/pSHi6WH+maN9SzWfh1doWfqIRNlbobw5XXU8SStys/+j/2QjxoLy6Wy5e/P197hrBaonSJAFsvL+kr31kXJDwOo9DzutI4LIRe00e/X+215GbuFusG/dZlbBsuwiMelrlQ2u8rrD9RNjltl6g84MmK+twFngbRw8ViLBzc2Lyu6fZDBXS+pVTPhFViTNm49QshupY8yUTuNulYN6DfPrY86He25x71iSiWFF4TgfeLPp5wTse0eXfnW9/VAZ+CkGC9p83j8zSFEjuKW+8B2t
6umvDN62uin/0e2ugZj77TqcWikbg296DreuUj0n9ysK7/A+haFv8=
+api: eJztWdFu47YS/RWCT72A4pUSO83qLd0mvbltd4PdBH1YBAYtjiw2EqklR+66hv79YkjZlmwncZH2ocA+xbHIMzNnhuPD0YpLcJlVNSqjecp/AmQSUKgSJFM6N7YS9IiJmWmQCeZqyFSuMibmoJHNlqwwDrWoYMQjbmqwfv2N5CmfA17Sqh89oOMRRzF3PP3M/dfTX4UWc6jo4+XtzdQjTjcQjj9E3EHWWIVLnn5e8R9AWLCXDRaE4ZenFoTkD+1DxGthRQUI1vnF5BJP+do7HnFFAdYCCx5xC18aZUHyFG0DEXdZAZXg6YrjsqZ9Dq3Sc95GOwTdFbCJmJmcYQEdFWiYBbQKFjDi5JAFVxvtwBHsaRzTnyGYp6Hj2xF/mdEIGmmlqOtSZZ6JN787Wr7a99LMfocMecRrS7yhCsY2Qe/Fc3Q4IwrdocDGHUIB3VSUhI8g5JJH/L3B8PHhkIWssZYiDXj7dkoxg9IdiktIqQhIlLeDCF/I0s+wPFmIsgEWoFlmdK7mjQXJjN6xbmGuHIIFORV4KNhwCnjKpUA4QeWraWjwtwJ6sKwUDpmF3IIr6CChYwUIizMQG2Yt/o0Ga2sycI51uN6GcVM6wYdoHWJ9CEdOz5lbOoSqf/BHe8UlFfk5a8LmYwqsX1y/KN18ZX0MsgBfRVWXBHM/azQ25P4CrDvaRLf4WCun8Sge87btN4LPw8i2DjxEHBX6fR8+3ejcfOwONnnZ1D49RznZsRu2hMI3Qk7FAqyYw8t56mHQRtZtdCw3liURm0RMaMmSCauUbhDcfvKSSvUZ1U01A7tn6JceugcnTn1NJx30gM54dHbaRnzySuyB2z3w04s24slr0ZMn4Se7deBJ6uJZW+4VAdm4DCb6lVBBZezy5Sz+6texxpGLz540NCjKHqDSCPMDQd/ROhbsM6XZbLkb5cXZxcV5TDzmFuAIyGsL8Cziafz2+2RCaW8csfYi4r0D+SziOHk7PovHu8kIJHSOd9Z6yQhs9vMgbFYohAwbe8S5fHd7z/o7hn1CVPJ8TKCPYDWU06N70odPLGxZd6Yh7mSUTEbxydvkZA4arMrIRlY308w0Go9g870vf2p3pZmrTJTs3e39Dp+U7S/yCGevm7Jcsi+NKFWuQDJpKqE0W4u6rdt/wOwkTkbdF6PMVP63DOxCZTCt5vZlWzdaYdfEhtjhO0mAtcgexfxIwNuwmFVeTNqd/NVIgMSizUU20A7CWkHSRSFUB7XH8DA+pae2xgCL2FurF+PnVyZvT0fJ+cUoGSXrHefP78jhIk7TxPcZkT2/No7TJElPT9Ozs3Q8TicT2pWLSpXLI/Jzy4SUlrRE2DIkVDb+MK7ln9KAXlkDnhMUPX3YPcGeud6RfQ/4h7GPN+ukbA+v9zPDv6AEg3ofhnD1FUFLkMxjsdyaim13k1RaKAnWbSTu4bzuKNwfrVA6PHpnrDQ69KFDgp5kumxKL6gI3hvKjA4uvKIGw9OnfQ3N8JayR80v4v9V84J+rsh/5R43Tx4OifuZMSUIHSSxcIfanNfqDqdohXY+mulT8ueAht1r7bRl48gT0P3CMRLerWncF/1XpPgFgmTaSGBbwj3/BFYqDa9gX1XgUFT1seFGHBagDyj8NnrmiualhHNDRbh9BtaaA21xj9uNs2svekTedVxc+Qd7RHZVTKXLtulghXJo7HK0Z6t3x+5y2bPlwUg187aljeM42b8I32vRYGGs+hMkO2GXtzfsEZZsY+Rvuxk/wd4eA6z3//pS4fcyLAQyk/krrRw2x+swNukNArob7yhwHKYgLxvf9qpuTzd92Thx0KxsgEzr0F0ZFYBpsOs98hjRd7cJkjYMjEziuG23Sb2iVb2+HRJ7tp/Ya2NnSkrQ7ITdaNfkucqUv7CCrZRzvh9+y+6/Ibvjp+ZX2iDLTaO/HdN/QyInhwaRfuGaDtIt4h+aTH5L7D+UWK8asDDd3J2Ip0l3yt/4VL5ZrX+kWx5ubG
FO3huxf6IUhiz1B+0brwtEEhM+014u+kU86j5crwXQ/3678wphM3vsCwG2nfnTr3xvypbyZBSP/HWoNg4robe3Lv9aYlCSu9yttgX62ncYXbgIX/FNXQrllWZj/SAmcNq9e+ARTzfS5yFIOnq4Ws2Eg3tbti19/aUBGgsR1QthlZgRG5/9HJU+S57monS7t5h+QN997HTQf9iRrx+eiGKtezWpXj8f5ynnNN1Y9l+VtHQ9KEBIsN7T8Pgyy6DG3sa9PkAvPTZF+NPVHd3ahjW0UzMe/aBTq1VYcWceQbftxkek/8nBtv0/q4g1/g==
sidebar_class_name: "get api-method"
info_path: gen/api/agent-management-api
custom_edit_url: null
@@ -594,6 +594,196 @@ Get detailed information about a specific agent by hostname.
+
+
+
+
+
+
+
+ conditions
+
+ object[]
+
+
+
+
+
+
+ Evaluated node conditions.
+
+
+
+
+ Array [
+
+
+
+
+
+
+
+
+
+
+
+ ]
+
+
+
+
+
+
+
+
+
+ timeline
+
+ object[]
+
+
+
+
+
+
+ Agent state transition history.
+
+
+
+
+ Array [
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ]
+
+
+
+
@@ -602,7 +792,7 @@ Get detailed information about a specific agent by hostname.
value={"Example (from schema)"}
>
diff --git a/docs/docs/gen/api/list-active-agents.api.mdx b/docs/docs/gen/api/list-active-agents.api.mdx
index e5e56118..956db4a4 100644
--- a/docs/docs/gen/api/list-active-agents.api.mdx
+++ b/docs/docs/gen/api/list-active-agents.api.mdx
@@ -5,7 +5,7 @@ description: "Discover all active agents in the fleet."
sidebar_label: "List active agents"
hide_title: true
hide_table_of_contents: true
-api: eJztWEtv2zgQ/isEz44qOXY20S3bbRbZ7SNoEvRQGAYtjmw2EqmQIzdeQ/99MZRsS7KbuGgX2ENPlqWZb57kPNZcgkusKlAZzWP+h3KJWYJlIsuYSFAtgYk5aHRMaYYLYGkGgAEfcBRzx+PP/JI+T98JLeaQ0+PlzfXU80xNAVYQsuOTAXeQlFbhisef1/x3EBbsZYkLwvDksQUh+aSaDLgFVxjtwPF4zYdhSD9dRd8qh8ykXR1JrcRoBI3EIYoiU4lX4NUXR2xr7pIF5IKecFUAj7mZfYEE+YAXltRFVQutAVt0wlqx4gOuEHL3Mv/CONQihxalQ6v0nA96ltwtgG2oySJyspce8GrAHQos3SEU0GVOvvsIQpJi7w3Wj5NDEpLSWtDIarx9OZmYQXbQLiGlIiCR3XQs7OpT9YX+DauTpchKYDU0S4xO1by0IJnRPekW5sohWJBTgYeMTY3N6QuXAuEEVQ57fvy0gBYsy4RDZiG14BYgmULHFiAszkBsPWvxJwosrEnAOdbgehnGTZVOzSG3drE+1CdFz5lbOYScERtpoIwO9pJLKtJzVtbMxyRYO7neKl0+sTYGSYAnkRcZwdzPSo0lqb8E644W0RAfK2UYBuGIVz74j6WyICmZO5btFJgMOCr0fB9ur3VqPjYXBGlZFj48RynZeLdmqRPfCDkVS7BiDi/HqYVBjKxhdCw1lkUDNh4woSWLxixXukRw+8GLctX2qC7zGdg9QW9b6B6cfOpzOmqgO+4Mg9NhNeDjH8TuqN0CH55XAx79KHr0TfhxPw+8kxp7NpJbSUAyLmsR7UzIITd29XIU33k6VjpS8dmThgZF1gJUGmF+wOg7omO1fKqUs1XfyvPT8/OzkPyYWoAjIK8swLOIw/Dit2hMYS8dee1FxHsH8lnEUXQxOg1H/WDUTmgUb6S1glF7sx0HYZOFQkiwtEecy9c396zN0b0nRC7PRgT6AFZDNj36Tvpwy2qWzc3UxR0H0TgITy6ikzlosCohGUlRThNTajzCm+99+tN1l5m5SkTGXt/c9/xJ0X6URyh7VWbZij2WIlOpAsmkyYXS/truqv0VZidhFDQvgsTkvpaBXaoEpvncvizrWitsLrEudv1OEmAhkgcxPxLwpiZmue8BbS9+BRIgedGmIoEf6Km+1U/thAEuQi+tWI6ep4wuhkF0dh5EQbThOHueI4XzMI4jf8+I5HnaMIyjKB4O49PTeDSKx2PiSkWustUR8blhQkpLvUTN0nWoLP1h3LR/SgN5in7OCIq+Tvon2HuudWTfA3419uF6E5Td4fV6JvgdnSDaEvomvHlC0BIk81gstSZnO25qlZZKgnXBXgOw7Zu3rW9LbT9qUPn3bN93N+vtce2NDH0Nms5/g9+uOsqhV8G1/EXsozDan1DutShxYaz6ByQ7YZc31+wBVmwr6qeNKmCtOeKUXrLW/02X5nkZLgQyk/gZQXaz7UqoDCRDwyygVbCEZoQI6pYfhcoOjic94bvgNzxMzEyJOyUOipUlkGhdpyujps2UdfueGHlMFb3bGkkMHSHjMPRp1ET3DVHtBfZ0P7BXxs6UlKDZCbvWrkxTlSg/AYDNlXN+3v0V3f9/dMeHFguekOYWvwehieyn7xZ+hfQ/CqkfAXBhJI/53JfFQtCCib/yMeR1owSW9lathdQtxa0OTXsttVV1gVgQryfjMZ95Ij5oHq42q4K/Pt35arId+dtli+02ZFQLWsNtzKMgDHwXUhiHudC7Zqfec3UqVt9j611aftf6rrYN4QlfFZlQ2o/S1hfU2mtNKaQKSFWZXqzXM+Hg3mZVRa8fS6Bxi3y5FFaJGZn7eVIN+AKEBOs3fQ+wIh8kCRQUAb8T8n1h7wDR3m8bvT/f3FHD0Y1Dz+8efdNJ6lULe72uKe7MA+iq4oNGCaT/vJpUVfUvDGwzlA==
+api: eJztWE1v2zgQ/SsEz4orOXY28S2bJrvZ7YfRJuihCAxaHFlsJFIhR268hv/7YijZlmQncZEusIeeLIucN8P3RsMhl1yCi60qUBnNR/ytcrGZg2Uiy5iIUc2BiRlodExphimwJAPAHg84ipnjo6/8nIYn74UWM8jp8Xx8PfE2E1OAFYTs+F3AHcSlVbjgo69L/jsIC/a8xJQw/PSRBSH53eou4BZcYbQDx0dL3g9D+mkH+k45ZCZpx0hhxUYjaCQLURSZin0Ab745MltyF6eQC3rCRQF8xM30G8TIA15YChdV5bQCbMwT1ooFD7hCyN3L9qlxqEUOjZkOrdIzHnRWcpMCW8+mFRHJ3nuPrwLuUGDp9qGALnPi7hMISYF9MFg93u3zEJfWgkZW4e36ycQUsr3rElIqAhLZuLXCdjyrrtO/YXE0F1kJrIJmsdGJmpUWJDO6493CTDkEC3IicN9iE2NzGuFSIByhymGHxy8pNGBZJhwyC4kFl4JkCh1LQVicgtgwa/EnOiysicE5VuN6H8ZNlE7MPlrbWB+rL0XPmFs4hJyRGUWgjO7tJJdUFOe0rIwPSbBmcr1TunxkTQzyAI8iLzKCuZ2WGksKfw7WHeyinnyol37YCwd85cV/KJUFScncWtk2gLuAo0Jv9/HztU7Mp7pAUJRl4eU5KMia3cqkSnwj5ETMwYoZvKxTA4MMWW3oWGIsiwI2DJjQkkVDlitdIrhd8aJcNRnVZT4Fu+PoXQPdgxOnPqejGrpFZ9g77q8CPnwldivsBnj/dBXw6LXo0ZPww24eeJLq9aw9N5KAfJxXLpqZkENu7OJlFd/7eax0FOKzXxoaFFkDUGmE2Z5F39A8VvmnnXK66K7y9Pj09CQkHhMLcADklQV4FrEfnv0WDUn20hFrLyLeOpDPIg6is8FxOOiKUZFQB157a4hRsdnUQdg4VQgxlvaA7/JifMuaFu06IXJ5MiDQe7AassnBNenjZ1aZrCtTG3fYi4a98OgsOpqBBqti8hEX5SQ2pcYD2Pzg05/KXWZmKhYZuxjfdvgktR/kAcFelVm2YA+lyFSiQDJpcqG0L9vtsL/D9CiMevWLXmxyv5eBnasYJvnMvuzrWiusi1gbu3onCbAQ8b2YHQg4riaz3PeAtqNfgQRILNpExPCKnuqpfmrrDDANvbdiPnh+ZnTW70Unp72oF60tTp63SOA0HI0iX2dE/PzcMBxF0ajfHx0fjwaD0XBIVonIVbY4QJ8xE1Ja6iUqkzahsvQf47r9UxqIKfo5ISgavet+wZ65xif7AfC7sffXa1G2H6+PM8Yf6ATRltBdwuUjgpYgmcdiiTU521pTqzRXEqzbtLj7de10uG+tULoaujBWGl3VobZrfxxh1ObLMvMNFcF7R7HRVQivyMFq9OlYq2I4JvWo+AX8TzVLabui+JW734zc7Wvup8ZkIHTVEgu3r8z5Xt3hBK3Qzq9m8lT7s6eH3SntZLIJ5AnoZuIYCRdrGneb/kvq+AWCZNpIYFvCPf8ElikNr2Bf5eBQ5MWhyw04zEHv6fBXwTNHNN9KONfuCLdjYK3ZUxZ3uN0Eu46iQeRNzcWlH9ghss5iSl22lYOlyqGxi96Or81aNlo2fHkw6pq92Y+1NHqzy3VO2t0I6gPzGr/ZrCmHPgTXKDNkPgij3YP9rRYlpsaqf0CyI3Y+vmb3sGAbVz/thP+EijtKsMb/9eHG2zJMBTIT+6O1bBfpK6EykAwNs4BWwRzqk3ev0hqFyvae6jvOtzWztmFiakrcBrHXrSyBXOuqyjNKRFNiXQPlIc3nzWaRZNByMgxDn0a1upc0a0fY411hr4ydKilBsyN2rV2ZJCpW/uAMNlfO+br8S93/v7rDffdxfiId9/31Ie27P/1K7pek/5GkfrvD1Eg+4jPfTRaC7mX5G68hr84XYOm6t3GP+5l0q6Rp3uZuQk0Raevz8vrmxk/iQf1wtd6u//py43eTzU1Zc9ti24
tl2gsad0IjHvXCnm/eC+MwF3p7Rqiuh1s7Vpex5TYtf+jWu1obwiO+KTKhfBNUWr+hVqzVWyHtgLQr04vlcioc3NpstaLXDyXQLQVxORdWiSkt9ys1hCkICdZfkN/DgjiIYyhIAX+V6o9TnQ+Irss36v1xeUN9eluHDu8efd1+6UUDe7msZtyYe9CrFQ/qIJD+89XdarX6F9otUpM=
sidebar_class_name: "get api-method"
info_path: gen/api/agent-management-api
custom_edit_url: null
@@ -603,6 +603,196 @@ Discover all active agents in the fleet.
+
+
+
+
+
+
+
+ conditions
+
+ object[]
+
+
+
+
+
+
+ Evaluated node conditions.
+
+
+
+
+ Array [
+
+
+
+
+
+
+
+
+
+
+
+ ]
+
+
+
+
+
+
+
+
+
+ timeline
+
+ object[]
+
+
+
+
+
+
+ Agent state transition history.
+
+
+
+
+ Array [
+
+
+
+
+
+
+
+
+
+
+
+
+
+ ]
+
+
+
+
diff --git a/docs/docs/gen/api/sidebar.ts b/docs/docs/gen/api/sidebar.ts
index dd455ba8..9309eedc 100644
--- a/docs/docs/gen/api/sidebar.ts
+++ b/docs/docs/gen/api/sidebar.ts
@@ -26,6 +26,18 @@ const sidebar: SidebarsConfig = {
label: "Get agent details",
className: "api-method get",
},
+ {
+ type: "doc",
+ id: "gen/api/drain-agent",
+ label: "Drain an agent",
+ className: "api-method post",
+ },
+ {
+ type: "doc",
+ id: "gen/api/undrain-agent",
+ label: "Undrain an agent",
+ className: "api-method post",
+ },
],
},
{
diff --git a/docs/docs/gen/api/undrain-agent.api.mdx b/docs/docs/gen/api/undrain-agent.api.mdx
new file mode 100644
index 00000000..2ba26015
--- /dev/null
+++ b/docs/docs/gen/api/undrain-agent.api.mdx
@@ -0,0 +1,524 @@
+---
+id: undrain-agent
+title: "Undrain an agent"
+description: "Resume accepting jobs on a drained agent."
+sidebar_label: "Undrain an agent"
+hide_title: true
+hide_table_of_contents: true
+api: eJztVl1r20oQ/SvLPN2CartfD1dvvtBACqUhdbgPwYSxdmxtIu2qs6O0rtB/v8xKdpw4Lb3QQgt58q52Ps8Z754OQkOM4oI/tZBD6y2j8/MNeYEMLMWCXaPHkMM5xbYmg0VBjTi/MddhFU3wBk3yImtQHSeQgeAmQn4JKdLVe/S4oVqX87PTq2R1tc8cYZlBpKJlJ1vILzv4h5CJ562UGiOZ55/ZCcGyX2bQIGNNQhyTtceaIIcyREnLDJyW26CUkAHTp9YxWciFW8ogFiXVCHkHsm3ULwo7v4H+YbuLkswupglrIyUN/RkJZgRqAloPU2yCjxQ16svZTH/ux0ow7JyM804cCllFqgheFO28A2yayhUJk+l1VMfuuNywuqZCyWlYERQ3pK0pRtzQI331hxhc7g2Xfa9Hr2cvjsu98NhKGdh9JWuem/nZqbmhrdmH+WlVE3Pg45ofUjE3B/sdF8nXSIliQlG0zAOe9AXrptJgJ+gqssoWk7CjWzJRUNo4GcgWdFX8geTWOl1iZUYfg6vQyl0Rj6a1LWlqT/I58I0RV1NoJaUugj0kynmhDfFR4sW+SXW4l+TNbKbkiZOU8q1anY9TCDtiXx0TexJ45awlb56bUx/b9doVTkezIa5djOnf+MTun8Du62/dMj6IWYfW/8zb5YnJX8jk399j0vnhcdX3NmVgG/SlVYToieHfn+E+g5qkDCqvmhAT8qpMcpgmNTHtdiKjn44CAVQO8e2gbw600UflcqDrUCHtyy9FGhgVju5XyQiycXESuEaBHN79u0iywPl1SO5j8cPU3Yk1ffghAy1kwOHFZDaZKW7aSI1pwEb1dTFqG/SDSHqIYXc3qP9PSA6tCX2RaVMpOH0GLVcaccBxFIiQQX4gAXdQLrMkDNWs61YY6YKrvtfPn1ri7QDwLbLDlWJw2YF1UdcW8jVWkb7Tx1/noyB6Zn5MLH6jnfEj+q2ijVWrO8jghraHurZf9hmUhJY4FToczxOKB45Ht4BK1P0Inn34uIAM8P7kPJiUFP7RqrpusFiEG/J9vy9SdK8V9v1/nZdG4w==
+sidebar_class_name: "post api-method"
+info_path: gen/api/agent-management-api
+custom_edit_url: null
+---
+
+import ApiTabs from "@theme/ApiTabs";
+import DiscriminatorTabs from "@theme/DiscriminatorTabs";
+import MethodEndpoint from "@theme/ApiExplorer/MethodEndpoint";
+import SecuritySchemes from "@theme/ApiExplorer/SecuritySchemes";
+import MimeTabs from "@theme/MimeTabs";
+import ParamsItem from "@theme/ParamsItem";
+import ResponseSamples from "@theme/ResponseSamples";
+import SchemaItem from "@theme/SchemaItem";
+import SchemaTabs from "@theme/SchemaTabs";
+import Heading from "@theme/Heading";
+import OperationTabs from "@theme/OperationTabs";
+import TabItem from "@theme/TabItem";
+
+
+
+
+
+
+
+
+
+
+Resume accepting jobs on a drained agent.
+
+
+
+
+
+
+
+ Path Parameters
+
+
+
+
+
+
+
+
+
+ Agent undrain initiated.
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Unauthorized - API key required
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Forbidden - Insufficient permissions
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Agent not found.
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Agent not in draining or cordoned state.
+
+
+
+
+
+
+
+
+
+
+ Schema
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/docs/docs/sidebar/architecture/system-architecture.md b/docs/docs/sidebar/architecture/system-architecture.md
index d8e9cf6e..d41b05e3 100644
--- a/docs/docs/sidebar/architecture/system-architecture.md
+++ b/docs/docs/sidebar/architecture/system-architecture.md
@@ -13,14 +13,14 @@ that can either hit the REST API directly or manage the job queue.
The system is organized into six layers, top to bottom:
-| Layer | Package | Role |
-| -------------------------- | --------------------------------------- | ------------------------------------------------------------------- |
-| **CLI** | `cmd/` | Cobra command tree (thin wiring) |
-| **SDK Client** | `osapi-sdk` (external) | OpenAPI-generated client used by CLI |
-| **REST API** | `internal/api/` | Echo server with JWT middleware |
-| **Job Client** | `internal/job/client/` | Business logic for job CRUD and status |
-| **NATS JetStream** | (external) | KV `job-queue`, Stream `JOBS`, KV `job-responses`, KV `agent-facts` |
-| **Agent / Provider Layer** | `internal/agent/`, `internal/provider/` | Consumes jobs, executes providers, publishes system facts |
+| Layer | Package | Role |
+| -------------------------- | --------------------------------------- | ------------------------------------------------------------------------ |
+| **CLI** | `cmd/` | Cobra command tree (thin wiring) |
+| **SDK Client** | `osapi-sdk` (external) | OpenAPI-generated client used by CLI |
+| **REST API** | `internal/api/` | Echo server with JWT middleware |
+| **Job Client** | `internal/job/client/` | Business logic for job CRUD and status |
+| **NATS JetStream**         | (external)                              | KV `job-queue`, Stream `JOBS`, KV `job-responses`, KV `agent-facts`, KV `agent-state` |
+| **Agent / Provider Layer** | `internal/agent/`, `internal/provider/` | Consumes jobs, executes providers, evaluates conditions, drain lifecycle |
```mermaid
graph TD
@@ -113,6 +113,23 @@ Providers are stateless and platform-specific (e.g., a Ubuntu DNS provider vs. a
generic Linux DNS provider). Adding a new operation means implementing the
provider interface and registering it in the agent's processor dispatch.
+### Agent Lifecycle (`internal/agent/`)
+
+Agents evaluate **node conditions** on each heartbeat tick (10s) and support
+**graceful drain** for maintenance. Conditions are threshold-based booleans
+(MemoryPressure, HighLoad, DiskPressure) computed from heartbeat metrics.
+
+The drain mechanism uses NATS consumer subscribe/unsubscribe. When an operator
+drains an agent, the API writes a `drain.{hostname}` key to the state KV bucket
+(`agent-state`, no TTL). The agent detects this on its next heartbeat,
+unsubscribes from all NATS JetStream consumers (stopping new job delivery), and
+transitions through `Draining` → `Cordoned` as in-flight jobs complete. Undrain
+deletes the key and the agent resubscribes.
+
+State transitions are recorded as append-only timeline events in the state KV
+bucket, following the same pattern used for job lifecycle events. See
+[Agent Lifecycle](../features/agent-lifecycle.md) for details.
+
### Configuration (`internal/config/`)
Configuration is managed by [Viper][] and loaded from an `osapi.yaml` file.
diff --git a/docs/docs/sidebar/development/development.md b/docs/docs/sidebar/development/development.md
index 462e8d10..97fe0dbe 100644
--- a/docs/docs/sidebar/development/development.md
+++ b/docs/docs/sidebar/development/development.md
@@ -97,6 +97,12 @@ Unit tests should follow the Go convention of being located in a file named
located in `test/integration/` and use a `//go:build integration` tag. They
build and start a real `osapi` binary, so they require no external setup.
+Use `testify/suite` with table-driven patterns and `validateFunc` callbacks.
+**One suite method per function under test.** All scenarios for a function
+(success, error codes, transport failures, nil responses) belong as rows in a
+single table — never split into separate `TestFoo`, `TestFooError`,
+`TestFooNilResponse` methods.
+
### File naming
Avoid generic file names like `helpers.go` or `utils.go`. Name files after what
diff --git a/docs/docs/sidebar/development/tasks/backlog/2026-02-26-kubernetes-systemd-patterns.md b/docs/docs/sidebar/development/tasks/backlog/2026-02-26-kubernetes-systemd-patterns.md
deleted file mode 100644
index 446aa048..00000000
--- a/docs/docs/sidebar/development/tasks/backlog/2026-02-26-kubernetes-systemd-patterns.md
+++ /dev/null
@@ -1,118 +0,0 @@
----
-title: Kubernetes and systemd inspired patterns
-status: backlog
-created: 2026-02-26
-updated: 2026-02-26
----
-
-## Objective
-
-Adopt proven patterns from Kubernetes and systemd to make OSAPI's node
-management feel more mature and operationally familiar. These are ideas to
-explore beyond the initial heartbeat enrichment and `node list`/`node get` work.
-
-## Ideas
-
-### Node Conditions (Kubernetes-inspired)
-
-Kubernetes nodes report conditions like `MemoryPressure`, `DiskPressure`,
-`PIDPressure`, and `NetworkUnavailable`. Since the heartbeat already collects
-memory and load data, we could derive conditions from thresholds:
-
-- Memory > 90% used -> `MemoryPressure: true`
-- Load 1m > 2x CPU count -> `HighLoad: true`
-- Disk > 90% used -> `DiskPressure: true` (would need disk in heartbeat or a
- periodic deep scan)
-
-Conditions would be stored in the KV registration and shown in `node list` /
-`node get`. They give operators a quick "is anything wrong?" signal without
-digging into raw numbers.
-
-### Capacity and Allocatable (Kubernetes-inspired)
-
-Kubernetes tracks what resources a node has vs. what's available for scheduling.
-We could track:
-
-- `max_jobs` (configured) vs. `active_jobs` (current count)
-- Job slot utilization per agent visible in `node get`
-- Could inform smarter job routing (avoid overloaded agents)
-
-### Taints and Tolerations (Kubernetes-inspired)
-
-Kubernetes nodes can be "tainted" to repel workloads unless they explicitly
-tolerate the taint. We already have label-based routing, but taints would add:
-
-- Mark a node as `draining` or `maintenance` so new jobs avoid it
-- `NoSchedule` equivalent: agent stays registered but won't receive new jobs
-- `NoExecute` equivalent: evict running jobs (graceful drain)
-- CLI:
- `osapi node taint --hostname web-01 --key maintenance --effect NoSchedule`
-
-### Node Lifecycle Events (Kubernetes-inspired)
-
-Kubernetes records lifecycle events per node (Joined, BecameReady,
-BecameNotReady, etc.). We could store agent lifecycle events in a dedicated KV
-bucket:
-
-- "agent started" with timestamp and version
-- "agent stopped" (clean shutdown)
-- "heartbeat missed" (detected by TTL expiry watcher)
-- "agent restarted" (same hostname re-registers)
-
-Visible via `node get --hostname X` or a dedicated `node events --hostname X`
-command.
-
-### Consistent Resource Model (Kubernetes-inspired)
-
-Every Kubernetes object has a uniform envelope: `apiVersion`, `kind`, `metadata`
-(name, namespace, labels, annotations, creationTimestamp, uid), `spec`,
-`status`. We could formalize OSAPI resources similarly:
-
-- Each resource type (node, job, audit entry) gets a consistent structure
-- `metadata.labels`, `metadata.annotations`, `metadata.createdAt` on every
- resource
-- Annotations (separate from labels) for non-routing metadata
-- Enables generic tooling: filtering, sorting, field selectors
-
-### Agent States (systemd-inspired)
-
-Systemd units have explicit states: Active, Inactive, Failed, Activating,
-Deactivating. Currently we only have "present in KV = alive". Adding explicit
-states would enable:
-
-- `Starting` - agent is initializing, not yet processing jobs
-- `Ready` - agent is healthy and processing jobs
-- `Draining` - agent is shutting down gracefully, finishing in-flight jobs but
- not accepting new ones
-- `Stopped` - clean shutdown (deregistered)
-
-State transitions would be visible in the registry and in lifecycle events.
-
-### Restart Tracking (systemd-inspired)
-
-Systemd tracks restart counts and restart reasons. We could add:
-
-- `restart_count` - how many times the agent process has started for this
- hostname
-- `last_restart_reason` - "clean start", "crash recovery", etc.
-- Stability signal for fleet health dashboards
-
-### Additional State to Save
-
-- **First-seen timestamp** (`started_at`) distinct from last heartbeat
- (`registered_at`) for true "AGE" display like `kubectl get nodes`
-- **Active job count** - how busy the agent is right now
-- **Agent binary version** - for fleet version tracking and rolling upgrade
- visibility
-- **OS kernel version** - already available from host provider
-
-## Notes
-
-- These are incremental improvements that build on the heartbeat enrichment
- work. Each can be implemented independently.
-- Priority should be driven by operational value: conditions and capacity
- tracking are highest value for fleet operators.
-- Taints and lifecycle events add complexity but enable sophisticated fleet
- management workflows.
-- The consistent resource model is the most ambitious change and would touch the
- most code, but pays off long-term for tooling and API consistency.
diff --git a/docs/docs/sidebar/features/agent-lifecycle.md b/docs/docs/sidebar/features/agent-lifecycle.md
new file mode 100644
index 00000000..b3b83dde
--- /dev/null
+++ b/docs/docs/sidebar/features/agent-lifecycle.md
@@ -0,0 +1,132 @@
+---
+sidebar_position: 4
+---
+
+# Agent Lifecycle
+
+OSAPI agents report threshold-based **node conditions** and support graceful
+**drain/cordon** for maintenance. Both features are inspired by Kubernetes node
+management patterns.
+
+## Node Conditions
+
+Conditions are threshold-based booleans evaluated agent-side on every heartbeat
+(10 seconds). They surface "is anything wrong?" at a glance without requiring
+operators to interpret raw metrics.
+
+| Condition | Default Threshold | Data Source |
+| ---------------- | -------------------- | ---------------- |
+| `MemoryPressure` | Memory used > 90% | Heartbeat memory |
+| `HighLoad` | Load1 > 2x CPU count | Heartbeat load |
+| `DiskPressure` | Any disk > 90% used | Heartbeat disk |
+
+Each condition tracks:
+
+- **Status** -- `true` when the threshold is exceeded, `false` otherwise
+- **Reason** -- human-readable explanation (e.g., "memory 94% used (15.1/16.0
+  GB)")
+- **LastTransitionTime** -- when the condition last flipped between true and
+ false
+
+### CLI Display
+
+`agent list` shows active conditions in the CONDITIONS column:
+
+```
+HOSTNAME STATUS CONDITIONS LABELS AGE LOAD (1m) OS
+web-01 Ready HighLoad,MemoryPressure - 3d 4h 4.12 Ubuntu 24.04
+web-02 Ready - - 12h 0.31 Ubuntu 24.04
+db-01 Ready DiskPressure - 5d 1.22 Ubuntu 24.04
+```
+
+`agent get` shows full condition details:
+
+```
+Conditions:
+ TYPE STATUS REASON SINCE
+ MemoryPressure true memory 94% used (15.1/16.0 GB) 2m ago
+ HighLoad true load 4.12, threshold 4.00 for 2 CPUs 5m ago
+ DiskPressure false
+```
+
+### Configuration
+
+Thresholds are configurable in `osapi.yaml`:
+
+```yaml
+agent:
+ conditions:
+ memory_pressure_threshold: 90 # percent used
+ high_load_multiplier: 2.0 # load1 / cpu_count
+ disk_pressure_threshold: 90 # percent used
+```
+
+## Agent Drain
+
+Drain allows operators to gracefully remove an agent from the job routing pool
+for maintenance without stopping the process. When an agent stops without
+draining, it vanishes from the registry and looks identical to a crash.
+
+### State Machine
+
+Agents have an explicit scheduling state with three values:
+
+```
+Ready ──(drain)──> Draining ──(jobs done)──> Cordoned
+  ^                                             │
+  └──────────────(undrain)──────────────────────┘
+```
+
+| State | Meaning |
+| ---------- | ------------------------------------------- |
+| `Ready` | Accepting and processing jobs (default) |
+| `Draining` | Finishing in-flight jobs, not accepting new |
+| `Cordoned` | Fully drained, idle, not accepting jobs |
+
+### How It Works
+
+1. Operator calls `osapi client agent drain --hostname web-01`
+2. API writes a `drain.{hostname}` key to the state KV bucket
+3. Agent detects the drain flag on its next heartbeat tick (10s)
+4. Agent transitions to `Draining` and **unsubscribes from NATS JetStream
+ consumers** -- this is how it stops receiving new jobs
+5. In-flight jobs continue to completion
+6. Once all in-flight jobs finish, state becomes `Cordoned`
+7. Operator calls `osapi client agent undrain --hostname web-01`
+8. API deletes the drain key; agent resubscribes and transitions to `Ready`
+
+### Timeline
+
+Every state transition is recorded as an append-only event in the state KV
+bucket (`agent-state`, no TTL). `agent get` shows the full transition history:
+
+```
+Timeline:
+ TIMESTAMP EVENT HOSTNAME MESSAGE
+ 2026-03-05 10:00:00 drain web-01 Drain initiated
+ 2026-03-05 10:05:23 cordoned web-01 All jobs completed
+ 2026-03-05 12:00:00 undrain web-01 Resumed accepting jobs
+```
+
+### CLI Commands
+
+```bash
+osapi client agent drain --hostname web-01 # start draining
+osapi client agent undrain --hostname web-01 # resume accepting jobs
+```
+
+Both commands return the current state and a confirmation message.
+
+## Permissions
+
+Node conditions are included in the standard `agent:read` responses. Drain and
+undrain operations require the `agent:write` permission, which is included in
+the `admin` role by default.
+
+## Related
+
+- [Agent CLI Reference](../usage/cli/client/agent/agent.mdx) -- agent fleet
+ commands
+- [Node Management](node-management.md) -- node queries via the job system
+- [Job System](job-system.md) -- how async job processing works
+- [Configuration](../usage/configuration.md) -- full configuration reference
diff --git a/docs/docs/sidebar/features/authentication.md b/docs/docs/sidebar/features/authentication.md
index 7654352d..88d7631d 100644
--- a/docs/docs/sidebar/features/authentication.md
+++ b/docs/docs/sidebar/features/authentication.md
@@ -60,11 +60,11 @@ flowchart TD
Built-in roles expand to these default permissions:
-| Role | Permissions |
-| ------- | -------------------------------------------------------------------------------------------------- |
-| `admin` | `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read`, `audit:read` |
-| `write` | `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read` |
-| `read` | `node:read`, `network:read`, `job:read`, `health:read` |
+| Role | Permissions |
+| ------- | ------------------------------------------------------------------------------------------------------------------------------- |
+| `admin` | `agent:read`, `agent:write`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read`, `audit:read` |
+| `write` | `agent:read`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read` |
+| `read` | `agent:read`, `node:read`, `network:read`, `job:read`, `health:read` |
### Custom Roles
diff --git a/docs/docs/sidebar/features/node-management.md b/docs/docs/sidebar/features/node-management.md
index 9a857ebf..a9eb931d 100644
--- a/docs/docs/sidebar/features/node-management.md
+++ b/docs/docs/sidebar/features/node-management.md
@@ -14,13 +14,15 @@ host.
OSAPI separates agent fleet discovery from node system queries:
- **Agent** commands (`agent list`, `agent get`) read directly from the NATS KV
- heartbeat registry. They show which agents are online, their labels, and
- lightweight metrics from the last heartbeat. No jobs are created. Agents also
- expose typed **system facts** (architecture, kernel version, FQDN, CPU count,
- network interfaces, service manager, package manager) gathered every 60
- seconds via providers and stored in a separate `agent-facts` KV bucket with a
- 5-minute TTL. The API merges registry and facts data into a single `AgentInfo`
- response.
+ heartbeat registry. They show which agents are online, their labels,
+ lightweight metrics, and [node conditions](agent-lifecycle.md) from the last
+ heartbeat. No jobs are created. Agents also expose typed **system facts**
+ (architecture, kernel version, FQDN, CPU count, network interfaces, service
+ manager, package manager) gathered every 60 seconds via providers and stored
+ in a separate `agent-facts` KV bucket with a 5-minute TTL. The API merges
+ registry and facts data into a single `AgentInfo` response. Agents can be
+ [drained](agent-lifecycle.md#agent-drain) for maintenance without stopping the
+ process.
- **Node** commands (`node hostname`, `node status`) dispatch jobs to agents
that execute system commands and return detailed results (disk usage, full
memory breakdown, etc.).
diff --git a/docs/docs/sidebar/usage/cli/client/agent/drain.md b/docs/docs/sidebar/usage/cli/client/agent/drain.md
new file mode 100644
index 00000000..5b35701e
--- /dev/null
+++ b/docs/docs/sidebar/usage/cli/client/agent/drain.md
@@ -0,0 +1,24 @@
+# Drain
+
+Drain an agent to stop it from accepting new jobs. In-flight jobs continue to
+completion:
+
+```bash
+$ osapi client agent drain --hostname web-01
+
+ Hostname: web-01
+ Status: Draining
+ Message: Agent drain initiated
+```
+
+The agent transitions from `Ready` to `Draining`. Once all in-flight jobs
+finish, the state becomes `Cordoned`. The agent stays running and continues
+sending heartbeats -- it just stops pulling new work from the job queue.
+
+Use `agent undrain` to resume accepting jobs.
+
+## Flags
+
+| Flag | Description | Required |
+| ------------ | ------------------------------ | -------- |
+| `--hostname` | Hostname of the agent to drain | Yes |
diff --git a/docs/docs/sidebar/usage/cli/client/agent/get.md b/docs/docs/sidebar/usage/cli/client/agent/get.md
index 127ed5cc..8bde755f 100644
--- a/docs/docs/sidebar/usage/cli/client/agent/get.md
+++ b/docs/docs/sidebar/usage/cli/client/agent/get.md
@@ -6,6 +6,7 @@ Get detailed information about a specific agent by hostname:
$ osapi client agent get --hostname web-01
Hostname: web-01 Status: Ready
+ State: Draining
Labels: group:web.dev.us-east
OS: Ubuntu 24.04
Uptime: 6 days, 3 hours, 54 minutes
@@ -22,29 +23,43 @@ $ osapi client agent get --hostname web-01
Interfaces:
eth0: 10.0.1.10 (IPv4), fe80::1 (IPv6), MAC 00:1a:2b:3c:4d:5e
lo: 127.0.0.1 (IPv4), ::1 (IPv6)
+
+ Conditions:
+ TYPE STATUS REASON SINCE
+ MemoryPressure true memory 94% used (15.1/16.0 GB) 2m ago
+ HighLoad true load 4.12, threshold 4.00 for 2 CPUs 5m ago
+ DiskPressure false
+
+ Timeline:
+ TIMESTAMP EVENT HOSTNAME MESSAGE
+ 2026-03-05 10:00:00 drain web-01 Drain initiated
+ 2026-03-05 10:05:23 cordoned web-01 All jobs completed
```
This command reads directly from the agent heartbeat registry -- no job is
created. The data comes from the agent's most recent heartbeat write.
-| Field | Description |
-| ------------ | --------------------------------------------------- |
-| Hostname | Agent's configured or OS hostname |
-| Status | `Ready` if present in registry |
-| Labels | Key-value labels from agent config |
-| OS | Distribution and version |
-| Uptime | System uptime reported by the agent |
-| Age | Time since the agent process started |
-| Last Seen | Time since the last heartbeat refresh |
-| Load | 1-, 5-, and 15-minute load averages |
-| Memory | Total, used, and free RAM |
-| Architecture | CPU architecture (e.g., amd64) |
-| Kernel | OS kernel version |
-| FQDN | Fully qualified domain name |
-| CPUs | Number of logical CPUs |
-| Service Mgr | Init system (e.g., systemd) |
-| Package Mgr | Package manager (e.g., apt) |
-| Interfaces | Network interfaces with IPv4, IPv6, MAC, and family |
+| Field | Description |
+| ------------ | --------------------------------------------------------- |
+| Hostname | Agent's configured or OS hostname |
+| Status | `Ready` if present in registry |
+| State        | Scheduling state; shown only when not `Ready` (`Draining` or `Cordoned`) |
+| Labels | Key-value labels from agent config |
+| OS | Distribution and version |
+| Uptime | System uptime reported by the agent |
+| Age | Time since the agent process started |
+| Last Seen | Time since the last heartbeat refresh |
+| Load | 1-, 5-, and 15-minute load averages |
+| Memory | Total, used, and free RAM |
+| Architecture | CPU architecture (e.g., amd64) |
+| Kernel | OS kernel version |
+| FQDN | Fully qualified domain name |
+| CPUs | Number of logical CPUs |
+| Service Mgr | Init system (e.g., systemd) |
+| Package Mgr | Package manager (e.g., apt) |
+| Interfaces | Network interfaces with IPv4, IPv6, MAC, and family |
+| Conditions | Node conditions table (type, status, reason, since) |
+| Timeline     | State transition events (timestamp, event, hostname, message) |
:::tip agent get vs. node status
diff --git a/docs/docs/sidebar/usage/cli/client/agent/list.md b/docs/docs/sidebar/usage/cli/client/agent/list.md
index 4c0f6973..172dc3e7 100644
--- a/docs/docs/sidebar/usage/cli/client/agent/list.md
+++ b/docs/docs/sidebar/usage/cli/client/agent/list.md
@@ -5,25 +5,27 @@ List active agents in the fleet with status, labels, age, and system metrics:
```bash
$ osapi client agent list
- Active Agents (2):
+ Active Agents (3):
- HOSTNAME STATUS LABELS AGE LOAD (1m) OS
- web-01 Ready group:web.dev.us-east 3d 4h 1.78 Ubuntu 24.04
- web-02 Ready group:web.dev.us-west 12h 5m 0.45 Ubuntu 24.04
+ HOSTNAME STATUS CONDITIONS LABELS AGE LOAD (1m) OS
+ web-01 Ready HighLoad,MemoryPressure group:web.dev.us-east 3d 4h 4.12 Ubuntu 24.04
+ web-02 Ready - group:web.dev.us-west 12h 5m 0.45 Ubuntu 24.04
+ db-01 Cordoned DiskPressure - 5d 2h 1.22 Ubuntu 24.04
```
This command reads directly from the agent heartbeat registry -- no job is
created. Each agent writes a heartbeat every 10 seconds with a 30-second TTL.
Agents that stop heartbeating disappear from the list automatically.
-| Column | Source |
-| --------- | --------------------------------------- |
-| HOSTNAME | Agent's configured or OS hostname |
-| STATUS | `Ready` if present in registry |
-| LABELS | Key-value labels from agent config |
-| AGE | Time since the agent process started |
-| LOAD (1m) | 1-minute load average from heartbeat |
-| OS | Distribution and version from heartbeat |
+| Column | Source |
+| ---------- | --------------------------------------------------------------- |
+| HOSTNAME | Agent's configured or OS hostname |
+| STATUS | Scheduling state: `Ready`, `Draining`, or `Cordoned` |
+| CONDITIONS | Active node conditions (MemoryPressure, HighLoad, DiskPressure) |
+| LABELS | Key-value labels from agent config |
+| AGE | Time since the agent process started |
+| LOAD (1m) | 1-minute load average from heartbeat |
+| OS | Distribution and version from heartbeat |
:::tip Full facts in JSON output
diff --git a/docs/docs/sidebar/usage/cli/client/agent/undrain.md b/docs/docs/sidebar/usage/cli/client/agent/undrain.md
new file mode 100644
index 00000000..aaa75ea2
--- /dev/null
+++ b/docs/docs/sidebar/usage/cli/client/agent/undrain.md
@@ -0,0 +1,20 @@
+# Undrain
+
+Resume accepting jobs on a drained or cordoned agent:
+
+```bash
+$ osapi client agent undrain --hostname web-01
+
+ Hostname: web-01
+ Status: Ready
+ Message: Agent undrain initiated
+```
+
+The agent resubscribes to NATS JetStream consumers and transitions back to
+`Ready`.
+
+## Flags
+
+| Flag | Description | Required |
+| ------------ | -------------------------------- | -------- |
+| `--hostname` | Hostname of the agent to undrain | Yes |
diff --git a/docs/docs/sidebar/usage/configuration.md b/docs/docs/sidebar/usage/configuration.md
index 46f438f2..843b07ed 100644
--- a/docs/docs/sidebar/usage/configuration.md
+++ b/docs/docs/sidebar/usage/configuration.md
@@ -23,47 +23,53 @@ Every config key can be overridden with an environment variable using the
`OSAPI_` prefix. Dots and nested keys become underscores, and the name is
uppercased:
-| Config Key | Environment Variable |
-| ---------------------------------- | ---------------------------------------- |
-| `debug` | `OSAPI_DEBUG` |
-| `api.server.port` | `OSAPI_API_SERVER_PORT` |
-| `api.server.nats.host` | `OSAPI_API_SERVER_NATS_HOST` |
-| `api.server.nats.port` | `OSAPI_API_SERVER_NATS_PORT` |
-| `api.server.nats.client_name` | `OSAPI_API_SERVER_NATS_CLIENT_NAME` |
-| `api.server.nats.namespace` | `OSAPI_API_SERVER_NATS_NAMESPACE` |
-| `api.server.nats.auth.type` | `OSAPI_API_SERVER_NATS_AUTH_TYPE` |
-| `api.server.security.signing_key` | `OSAPI_API_SERVER_SECURITY_SIGNING_KEY` |
-| `api.client.security.bearer_token` | `OSAPI_API_CLIENT_SECURITY_BEARER_TOKEN` |
-| `nats.server.host` | `OSAPI_NATS_SERVER_HOST` |
-| `nats.server.port` | `OSAPI_NATS_SERVER_PORT` |
-| `nats.server.namespace` | `OSAPI_NATS_SERVER_NAMESPACE` |
-| `nats.server.auth.type` | `OSAPI_NATS_SERVER_AUTH_TYPE` |
-| `nats.stream.name` | `OSAPI_NATS_STREAM_NAME` |
-| `nats.kv.bucket` | `OSAPI_NATS_KV_BUCKET` |
-| `nats.kv.response_bucket` | `OSAPI_NATS_KV_RESPONSE_BUCKET` |
-| `nats.audit.bucket` | `OSAPI_NATS_AUDIT_BUCKET` |
-| `nats.audit.ttl` | `OSAPI_NATS_AUDIT_TTL` |
-| `nats.audit.max_bytes` | `OSAPI_NATS_AUDIT_MAX_BYTES` |
-| `nats.audit.storage` | `OSAPI_NATS_AUDIT_STORAGE` |
-| `nats.audit.replicas` | `OSAPI_NATS_AUDIT_REPLICAS` |
-| `nats.registry.bucket` | `OSAPI_NATS_REGISTRY_BUCKET` |
-| `nats.registry.ttl` | `OSAPI_NATS_REGISTRY_TTL` |
-| `nats.registry.storage` | `OSAPI_NATS_REGISTRY_STORAGE` |
-| `nats.registry.replicas` | `OSAPI_NATS_REGISTRY_REPLICAS` |
-| `nats.facts.bucket` | `OSAPI_NATS_FACTS_BUCKET` |
-| `nats.facts.ttl` | `OSAPI_NATS_FACTS_TTL` |
-| `nats.facts.storage` | `OSAPI_NATS_FACTS_STORAGE` |
-| `nats.facts.replicas` | `OSAPI_NATS_FACTS_REPLICAS` |
-| `telemetry.tracing.enabled` | `OSAPI_TELEMETRY_TRACING_ENABLED` |
-| `telemetry.tracing.exporter` | `OSAPI_TELEMETRY_TRACING_EXPORTER` |
-| `telemetry.tracing.otlp_endpoint` | `OSAPI_TELEMETRY_TRACING_OTLP_ENDPOINT` |
-| `agent.nats.host` | `OSAPI_AGENT_NATS_HOST` |
-| `agent.nats.port` | `OSAPI_AGENT_NATS_PORT` |
-| `agent.nats.client_name` | `OSAPI_AGENT_NATS_CLIENT_NAME` |
-| `agent.nats.namespace` | `OSAPI_AGENT_NATS_NAMESPACE` |
-| `agent.nats.auth.type` | `OSAPI_AGENT_NATS_AUTH_TYPE` |
-| `agent.hostname` | `OSAPI_AGENT_HOSTNAME` |
-| `agent.facts.interval` | `OSAPI_AGENT_FACTS_INTERVAL` |
+| Config Key | Environment Variable |
+| -------------------------------------------- | -------------------------------------------------- |
+| `debug` | `OSAPI_DEBUG` |
+| `api.server.port` | `OSAPI_API_SERVER_PORT` |
+| `api.server.nats.host` | `OSAPI_API_SERVER_NATS_HOST` |
+| `api.server.nats.port` | `OSAPI_API_SERVER_NATS_PORT` |
+| `api.server.nats.client_name` | `OSAPI_API_SERVER_NATS_CLIENT_NAME` |
+| `api.server.nats.namespace` | `OSAPI_API_SERVER_NATS_NAMESPACE` |
+| `api.server.nats.auth.type` | `OSAPI_API_SERVER_NATS_AUTH_TYPE` |
+| `api.server.security.signing_key` | `OSAPI_API_SERVER_SECURITY_SIGNING_KEY` |
+| `api.client.security.bearer_token` | `OSAPI_API_CLIENT_SECURITY_BEARER_TOKEN` |
+| `nats.server.host` | `OSAPI_NATS_SERVER_HOST` |
+| `nats.server.port` | `OSAPI_NATS_SERVER_PORT` |
+| `nats.server.namespace` | `OSAPI_NATS_SERVER_NAMESPACE` |
+| `nats.server.auth.type` | `OSAPI_NATS_SERVER_AUTH_TYPE` |
+| `nats.stream.name` | `OSAPI_NATS_STREAM_NAME` |
+| `nats.kv.bucket` | `OSAPI_NATS_KV_BUCKET` |
+| `nats.kv.response_bucket` | `OSAPI_NATS_KV_RESPONSE_BUCKET` |
+| `nats.audit.bucket` | `OSAPI_NATS_AUDIT_BUCKET` |
+| `nats.audit.ttl` | `OSAPI_NATS_AUDIT_TTL` |
+| `nats.audit.max_bytes` | `OSAPI_NATS_AUDIT_MAX_BYTES` |
+| `nats.audit.storage` | `OSAPI_NATS_AUDIT_STORAGE` |
+| `nats.audit.replicas` | `OSAPI_NATS_AUDIT_REPLICAS` |
+| `nats.registry.bucket` | `OSAPI_NATS_REGISTRY_BUCKET` |
+| `nats.registry.ttl` | `OSAPI_NATS_REGISTRY_TTL` |
+| `nats.registry.storage` | `OSAPI_NATS_REGISTRY_STORAGE` |
+| `nats.registry.replicas` | `OSAPI_NATS_REGISTRY_REPLICAS` |
+| `nats.facts.bucket` | `OSAPI_NATS_FACTS_BUCKET` |
+| `nats.facts.ttl` | `OSAPI_NATS_FACTS_TTL` |
+| `nats.facts.storage` | `OSAPI_NATS_FACTS_STORAGE` |
+| `nats.facts.replicas` | `OSAPI_NATS_FACTS_REPLICAS` |
+| `nats.state.bucket` | `OSAPI_NATS_STATE_BUCKET` |
+| `nats.state.storage` | `OSAPI_NATS_STATE_STORAGE` |
+| `nats.state.replicas` | `OSAPI_NATS_STATE_REPLICAS` |
+| `telemetry.tracing.enabled` | `OSAPI_TELEMETRY_TRACING_ENABLED` |
+| `telemetry.tracing.exporter` | `OSAPI_TELEMETRY_TRACING_EXPORTER` |
+| `telemetry.tracing.otlp_endpoint` | `OSAPI_TELEMETRY_TRACING_OTLP_ENDPOINT` |
+| `agent.nats.host` | `OSAPI_AGENT_NATS_HOST` |
+| `agent.nats.port` | `OSAPI_AGENT_NATS_PORT` |
+| `agent.nats.client_name` | `OSAPI_AGENT_NATS_CLIENT_NAME` |
+| `agent.nats.namespace` | `OSAPI_AGENT_NATS_NAMESPACE` |
+| `agent.nats.auth.type` | `OSAPI_AGENT_NATS_AUTH_TYPE` |
+| `agent.hostname` | `OSAPI_AGENT_HOSTNAME` |
+| `agent.facts.interval` | `OSAPI_AGENT_FACTS_INTERVAL` |
+| `agent.conditions.memory_pressure_threshold` | `OSAPI_AGENT_CONDITIONS_MEMORY_PRESSURE_THRESHOLD` |
+| `agent.conditions.high_load_multiplier` | `OSAPI_AGENT_CONDITIONS_HIGH_LOAD_MULTIPLIER` |
+| `agent.conditions.disk_pressure_threshold` | `OSAPI_AGENT_CONDITIONS_DISK_PRESSURE_THRESHOLD` |
Environment variables take precedence over file values.
@@ -127,11 +133,11 @@ OSAPI uses fine-grained `resource:verb` permissions for access control. Each API
endpoint requires a specific permission. Built-in roles expand to a default set
of permissions:
-| Role | Permissions |
-| ------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `admin` | `agent:read`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read`, `audit:read`, `command:execute` |
-| `write` | `agent:read`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read` |
-| `read` | `agent:read`, `node:read`, `network:read`, `job:read`, `health:read` |
+| Role | Permissions |
+| ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `admin` | `agent:read`, `agent:write`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read`, `audit:read`, `command:execute` |
+| `write` | `agent:read`, `node:read`, `network:read`, `network:write`, `job:read`, `job:write`, `health:read` |
+| `read` | `agent:read`, `node:read`, `network:read`, `job:read`, `health:read` |
### Custom Roles
@@ -226,9 +232,9 @@ api:
- 'http://localhost:3001'
- 'https://osapi-io.github.io'
# Custom roles with fine-grained permissions.
- # Permissions: agent:read, node:read, network:read, network:write,
- # job:read, job:write, health:read, audit:read,
- # command:execute
+ # Permissions: agent:read, agent:write, node:read, network:read,
+ # network:write, job:read, job:write, health:read,
+ # audit:read, command:execute
# roles:
# ops:
# permissions:
@@ -324,6 +330,16 @@ nats:
# Number of KV replicas.
replicas: 1
+ # ── State KV bucket ──────────────────────────────────────
+ state:
+ # KV bucket for persistent agent state (drain flags, timeline events).
+ # No TTL — operator actions persist indefinitely.
+ bucket: 'agent-state'
+ # Storage backend: "file" or "memory".
+ storage: 'file'
+ # Number of KV replicas.
+ replicas: 1
+
# ── Dead Letter Queue ─────────────────────────────────────
dlq:
# Maximum age of messages in the DLQ.
@@ -379,6 +395,14 @@ agent:
facts:
# How often the agent collects and publishes facts.
interval: '60s'
+ # Node condition thresholds.
+ conditions:
+ # Memory pressure threshold (percent used).
+ memory_pressure_threshold: 90
+ # High load multiplier (load1 / cpu_count).
+ high_load_multiplier: 2.0
+ # Disk pressure threshold (percent used).
+ disk_pressure_threshold: 90
# Queue group for load-balanced (_any) subscriptions.
queue_group: 'job-agents'
# Agent hostname for direct routing. Defaults to the
@@ -481,6 +505,14 @@ agent:
| `storage` | string | `"file"` or `"memory"` |
| `replicas` | int | Number of KV replicas |
+### `nats.state`
+
+| Key | Type | Description |
+| ---------- | ------ | --------------------------------------------- |
+| `bucket` | string | KV bucket for persistent agent state (no TTL) |
+| `storage` | string | `"file"` or `"memory"` |
+| `replicas` | int | Number of KV replicas |
+
### `nats.dlq`
| Key | Type | Description |
@@ -500,23 +532,26 @@ agent:
### `agent`
-| Key | Type | Description |
-| -------------------------- | ----------------- | ---------------------------------------- |
-| `nats.host` | string | NATS server hostname |
-| `nats.port` | int | NATS server port |
-| `nats.client_name` | string | NATS client identification name |
-| `nats.namespace` | string | Subject namespace prefix |
-| `nats.auth.type` | string | Auth type: `none`, `user_pass` |
-| `nats.auth.username` | string | Username for `user_pass` auth |
-| `nats.auth.password` | string | Password for `user_pass` auth |
-| `consumer.name` | string | Durable consumer name |
-| `consumer.max_deliver` | int | Max redelivery attempts before DLQ |
-| `consumer.ack_wait` | string | ACK timeout (Go duration) |
-| `consumer.max_ack_pending` | int | Max outstanding unacknowledged msgs |
-| `consumer.replay_policy` | string | `"instant"` or `"original"` |
-| `consumer.back_off` | []string | Backoff durations between redeliveries |
-| `queue_group` | string | Queue group for load-balanced routing |
-| `hostname` | string | Agent hostname (defaults to OS hostname) |
-| `max_jobs` | int | Max concurrent jobs |
-| `facts.interval` | string | How often the agent collects facts |
-| `labels` | map[string]string | Key-value pairs for label-based routing |
+| Key | Type | Description |
+| -------------------------------------- | ----------------- | ---------------------------------------------- |
+| `nats.host` | string | NATS server hostname |
+| `nats.port` | int | NATS server port |
+| `nats.client_name` | string | NATS client identification name |
+| `nats.namespace` | string | Subject namespace prefix |
+| `nats.auth.type` | string | Auth type: `none`, `user_pass` |
+| `nats.auth.username` | string | Username for `user_pass` auth |
+| `nats.auth.password` | string | Password for `user_pass` auth |
+| `consumer.name` | string | Durable consumer name |
+| `consumer.max_deliver` | int | Max redelivery attempts before DLQ |
+| `consumer.ack_wait` | string | ACK timeout (Go duration) |
+| `consumer.max_ack_pending` | int | Max outstanding unacknowledged msgs |
+| `consumer.replay_policy` | string | `"instant"` or `"original"` |
+| `consumer.back_off` | []string | Backoff durations between redeliveries |
+| `queue_group` | string | Queue group for load-balanced routing |
+| `hostname` | string | Agent hostname (defaults to OS hostname) |
+| `max_jobs` | int | Max concurrent jobs |
+| `facts.interval` | string | How often the agent collects facts |
+| `conditions.memory_pressure_threshold` | int | Memory pressure threshold percent (default 90) |
+| `conditions.high_load_multiplier` | float | Load multiplier over CPU count (default 2.0) |
+| `conditions.disk_pressure_threshold` | int | Disk pressure threshold percent (default 90) |
+| `labels` | map[string]string | Key-value pairs for label-based routing |
diff --git a/docs/docusaurus.config.ts b/docs/docusaurus.config.ts
index bb2a104a..a1927b92 100644
--- a/docs/docusaurus.config.ts
+++ b/docs/docusaurus.config.ts
@@ -90,6 +90,11 @@ const config: Config = {
label: 'Network Management',
docId: 'sidebar/features/network-management'
},
+ {
+ type: 'doc',
+ label: 'Agent Lifecycle',
+ docId: 'sidebar/features/agent-lifecycle'
+ },
{
type: 'doc',
label: 'Job System',
diff --git a/docs/plans/2026-03-05-node-conditions-drain-design.md b/docs/plans/2026-03-05-node-conditions-drain-design.md
new file mode 100644
index 00000000..5d784b22
--- /dev/null
+++ b/docs/plans/2026-03-05-node-conditions-drain-design.md
@@ -0,0 +1,336 @@
+# Node Conditions and Agent Drain
+
+## Context
+
+OSAPI agents collect rich system metrics (memory, load, disk, CPU count) via
+heartbeat and facts, but operators must manually interpret raw numbers to detect
+problems. Kubernetes solves this with node conditions — threshold-based booleans
+that surface "is anything wrong?" at a glance.
+
+Additionally, there's no way to gracefully remove an agent from the job routing
+pool for maintenance without stopping the process entirely. When an agent stops,
+it vanishes from the registry and looks identical to a crash. Kubernetes handles
+this with cordon/drain.
+
+This design adds both features to OSAPI.
+
+## Node Conditions
+
+### Condition Types
+
+Three conditions derived from existing heartbeat and facts data, evaluated
+agent-side on each heartbeat tick (10s):
+
+| Condition | Default Threshold | Data Source |
+| ---------------- | -------------------- | ----------------------------------------------- |
+| `MemoryPressure` | memory used > 90% | `MemoryStats` (heartbeat) |
+| `HighLoad` | load1 > 2× CPU count | `LoadAverages` (heartbeat) + `CPUCount` (facts) |
+| `DiskPressure` | any disk > 90% used | `DiskStats` (new in heartbeat) |
+
+### Condition Structure
+
+Each condition has:
+
+```go
+type Condition struct {
+ Type string `json:"type"`
+ Status bool `json:"status"`
+ Reason string `json:"reason,omitempty"`
+ LastTransitionTime time.Time `json:"last_transition_time"`
+}
+```
+
+- `Status`: `true` = condition is active (pressure/overload detected)
+- `Reason`: human-readable explanation (e.g., "memory 94% used (15.1/16.0 GB)")
+- `LastTransitionTime`: when the condition last changed from true→false or
+ false→true
+
+### Configuration
+
+Thresholds configurable in `osapi.yaml` with sensible defaults:
+
+```yaml
+agent:
+ conditions:
+ memory_pressure_threshold: 90 # percent used
+ high_load_multiplier: 2.0 # load1 / cpu_count
+ disk_pressure_threshold: 90 # percent used
+```
+
+### Evaluation
+
+Conditions are evaluated in the agent during `writeRegistration()`. The agent
+maintains previous condition state in memory to track `LastTransitionTime` —
+only updated when the boolean flips.
+
+DiskPressure requires adding disk stats to the heartbeat. The existing
+`disk.Provider` already implements `GetLocalUsageStats()` so the data is available. Disk
+collection joins the existing non-fatal provider pattern: if it fails, the
+DiskPressure condition is simply not evaluated.
+
+### Storage
+
+Conditions are stored as part of `AgentRegistration` in the registry KV bucket.
+No new KV bucket needed.
+
+```go
+type AgentRegistration struct {
+ // ... existing fields ...
+ Conditions []Condition `json:"conditions,omitempty"`
+}
+```
+
+### CLI Display
+
+`agent list` gains a CONDITIONS column showing active conditions:
+
+```
+HOSTNAME STATUS CONDITIONS LOAD OS
+web-01 Ready HighLoad,MemoryPressure 4.12 Ubuntu 24.04
+web-02 Ready - 0.31 Ubuntu 24.04
+db-01 Ready DiskPressure 1.22 Ubuntu 24.04
+```
+
+`agent get` shows full condition details and state timeline:
+
+```
+Conditions:
+ MemoryPressure: true (memory 94% used, 15.1/16.0 GB) since 2m ago
+ HighLoad: true (load 4.12, threshold 4.00 for 2 CPUs) since 5m ago
+ DiskPressure: false
+
+Timeline:
+ TIMESTAMP EVENT HOSTNAME MESSAGE
+ 2026-03-05 10:00:00 drain web-01 Drain initiated
+ 2026-03-05 10:05:23 cordoned web-01 All jobs completed
+ 2026-03-05 12:00:00 undrain web-01 Resumed accepting jobs
+```
+
+## Agent Drain
+
+### State Machine
+
+Agents gain an explicit state field with three values:
+
+```
+Ready ──(drain)──> Draining ──(jobs done)──> Cordoned
+ ^ │
+ └──────────────(undrain)───────────────────────┘
+```
+
+| State | Meaning |
+| ---------- | ------------------------------------------------ |
+| `Ready` | Accepting and processing jobs (default) |
+| `Draining` | Finishing in-flight jobs, not accepting new ones |
+| `Cordoned` | Fully drained, idle, not accepting jobs |
+
+### Mechanism
+
+1. Operator calls `POST /agent/{hostname}/drain`
+2. API writes a `drain.{hostname}` key to the state KV bucket
+3. Agent checks for drain key on each heartbeat tick (10s)
+4. When drain flag detected:
+ - Agent transitions state to `Draining`
+ - Agent unsubscribes from NATS consumer (stops receiving new jobs)
+ - In-flight jobs continue to completion
+5. Once WaitGroup drains (no in-flight jobs), state becomes `Cordoned`
+6. `POST /agent/{hostname}/undrain` deletes the drain key
+7. Agent detects drain key removal on next heartbeat:
+ - Transitions state to `Ready`
+ - Re-subscribes to NATS consumer
+
+### API Endpoints
+
+```
+POST /agent/{hostname}/drain # Start draining
+POST /agent/{hostname}/undrain # Resume accepting jobs
+```
+
+Both return 200 on success, 404 if agent not found, 409 if already in the
+requested state.
+
+### Permission
+
+New `agent:write` permission. Added to the `admin` role by default.
+
+### Storage
+
+Agent state transitions are recorded as **append-only events** in the state KV
+bucket (`agent-state`, no TTL), following the same pattern used for job status
+events (see `WriteStatusEvent` in `internal/job/client/agent.go`).
+
+Events reuse the existing `TimelineEvent` type (`internal/job/types.go`) — the
+same type used for job lifecycle events. This type is generic (Timestamp, Event,
+Hostname, Message, Error) and not job-specific:
+
+```
+Key format: timeline.{sanitized_hostname}.{event}.{unix_nano}
+Value: TimelineEvent JSON
+```
+
+Events: `ready`, `drain`, `cordoned`, `undrain`
+
+On the SDK side, `TimelineEvent` is promoted from `job_types.go` to a shared
+top-level type in `pkg/osapi/types.go`. Both `JobDetail.Timeline` and
+`Agent.Timeline` reference the same type.
+
+Current state is **computed from the latest event**, just like job status is
+computed via `computeStatusFromEvents`. This preserves the full transition
+history (Ready → Draining → Cordoned → Ready → Draining → ...) and eliminates
+race conditions by never updating existing keys.
+
+The drain intent uses a separate key: `drain.{sanitized_hostname}`. The API
+writes this key to signal drain; the agent reads it on heartbeat and writes the
+state transition event. The API deletes the key on undrain.
+
+The `AgentRegistration` also carries the current state for quick reads without
+scanning events:
+
+```go
+type AgentRegistration struct {
+ // ... existing fields ...
+ State string `json:"state,omitempty"` // Ready, Draining, Cordoned
+}
+```
+
+### CLI Commands
+
+```bash
+osapi client agent drain --hostname web-01
+osapi client agent undrain --hostname web-01
+```
+
+`agent list` and `agent get` show the state in the STATUS column.
+
+## OpenAPI Changes
+
+### AgentInfo Schema
+
+Add to existing `AgentInfo`:
+
+```yaml
+state:
+ type: string
+ enum: [Ready, Draining, Cordoned]
+ description: Agent scheduling state.
+conditions:
+ type: array
+ items:
+ $ref: '#/components/schemas/NodeCondition'
+```
+
+New schema:
+
+```yaml
+NodeCondition:
+ type: object
+ properties:
+ type:
+ type: string
+ enum: [MemoryPressure, HighLoad, DiskPressure]
+ status:
+ type: boolean
+ reason:
+ type: string
+ last_transition_time:
+ type: string
+ format: date-time
+ required: [type, status, last_transition_time]
+```
+
+### New Endpoints
+
+```yaml
+/agent/{hostname}/drain:
+ post:
+ summary: Drain an agent
+ description: Stop the agent from accepting new jobs.
+ security:
+ - BearerAuth: []
+ responses:
+ 200: ...
+ 404: ...
+ 409: ...
+
+/agent/{hostname}/undrain:
+ post:
+ summary: Undrain an agent
+ description: Resume accepting jobs on a drained agent.
+ security:
+ - BearerAuth: []
+ responses:
+ 200: ...
+ 404: ...
+ 409: ...
+```
+
+### Permission Updates
+
+```yaml
+# New permission
+agent:write
+
+# Updated admin role
+admin:
+ permissions:
+ - agent:read
+ - agent:write # new
+ - node:read
+ - ...
+```
+
+## Implementation Scope
+
+### Provider Changes
+
+- Extend heartbeat to collect disk stats (reuse existing `disk.Provider`)
+- Add condition evaluation logic to agent heartbeat
+
+### Agent Changes
+
+- Add `Condition` type and evaluation functions
+- Add state field to `AgentRegistration`
+- Add drain flag detection on heartbeat tick
+- Add consumer subscribe/unsubscribe for drain/undrain transitions
+- Add condition threshold config support
+
+### API Changes
+
+- New drain/undrain endpoints in the agent API domain
+- Extend `AgentInfo` schema with `state` and `conditions`
+- Add `agent:write` permission and wire into scope middleware
+
+### CLI Changes
+
+- `agent drain` and `agent undrain` commands
+- CONDITIONS column in `agent list`
+- Condition details and state timeline in `agent get`
+- State shown in STATUS column
+
+### SDK Changes
+
+- Promote `TimelineEvent` from `job_types.go` to shared `types.go`
+- Both `JobDetail.Timeline` and `Agent.Timeline` use the same type
+- Add `Agent.Drain()` and `Agent.Undrain()` methods
+- Add conditions, state, and timeline to `Agent` type
+
+### Config Changes
+
+- `agent.conditions` section with threshold defaults
+
+## Testing
+
+- **Unit**: condition evaluation logic (threshold math, transition tracking),
+ state machine transitions, drain flag detection
+- **HTTP wiring**: drain/undrain endpoints with RBAC (401, 403, 200, 404, 409)
+- **Integration**: drain agent → submit job → verify not routed to drained agent
+ → undrain → verify jobs resume
+
+## Verification
+
+```bash
+just generate # regenerate specs + code
+go build ./... # compiles
+just go::unit # tests pass
+just go::vet # lint passes
+```
diff --git a/docs/plans/2026-03-05-node-conditions-drain.md b/docs/plans/2026-03-05-node-conditions-drain.md
new file mode 100644
index 00000000..5227ccdd
--- /dev/null
+++ b/docs/plans/2026-03-05-node-conditions-drain.md
@@ -0,0 +1,1387 @@
+# Node Conditions & Agent Drain Implementation Plan
+
+> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to
+> implement this plan task-by-task.
+
+**Goal:** Add Kubernetes-inspired node conditions (MemoryPressure, HighLoad,
+DiskPressure) and agent drain/cordon lifecycle to OSAPI.
+
+**Architecture:** Conditions are evaluated agent-side on each heartbeat tick
+using existing provider data, stored in AgentRegistration. Drain uses
+append-only timeline events in the registry KV bucket (reusing the existing
+`TimelineEvent` type from job lifecycle), with a separate drain intent key the
+API writes and the agent reads on heartbeat. State transitions trigger NATS
+consumer subscribe/unsubscribe.
+
+**Tech Stack:** Go 1.25, NATS JetStream KV, Echo REST API, OpenAPI codegen,
+testify/suite
+
+**Design Doc:** `docs/plans/2026-03-05-node-conditions-drain-design.md`
+
+---
+
+## Task 1: Add Condition type and evaluation functions
+
+**Files:**
+
+- Create: `internal/agent/condition.go`
+- Create: `internal/agent/condition_test.go`
+
+**Step 1: Write the failing tests**
+
+```go
+// internal/agent/condition_test.go
+package agent
+
+import (
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/suite"
+
+ "github.com/retr0h/osapi/internal/job"
+ "github.com/retr0h/osapi/internal/provider/node/disk"
+ "github.com/retr0h/osapi/internal/provider/node/load"
+ "github.com/retr0h/osapi/internal/provider/node/mem"
+)
+
+type ConditionTestSuite struct {
+ suite.Suite
+}
+
+func TestConditionTestSuite(t *testing.T) {
+ suite.Run(t, new(ConditionTestSuite))
+}
+
+func (s *ConditionTestSuite) TestEvaluateMemoryPressure() {
+ tests := []struct {
+ name string
+ stats *mem.Stats
+ threshold int
+ wantStatus bool
+ wantReason string
+ }{
+ {
+ name: "above threshold",
+ stats: &mem.Stats{Total: 16000000000, Used: 15000000000, Free: 1000000000},
+ threshold: 90,
+ wantStatus: true,
+ },
+ {
+ name: "below threshold",
+ stats: &mem.Stats{Total: 16000000000, Used: 8000000000, Free: 8000000000},
+ threshold: 90,
+ wantStatus: false,
+ },
+ {
+ name: "nil stats",
+ stats: nil,
+ threshold: 90,
+ wantStatus: false,
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ c := evaluateMemoryPressure(tt.stats, tt.threshold, nil)
+ s.Equal(tt.wantStatus, c.Status)
+ s.Equal(job.ConditionMemoryPressure, c.Type)
+ })
+ }
+}
+
+func (s *ConditionTestSuite) TestEvaluateHighLoad() {
+ tests := []struct {
+ name string
+ loadAvg *load.AverageStats
+ cpuCount int
+ multiplier float64
+ wantStatus bool
+ }{
+ {
+ name: "above threshold",
+ loadAvg: &load.AverageStats{OneMin: 5.0},
+ cpuCount: 2,
+ multiplier: 2.0,
+ wantStatus: true,
+ },
+ {
+ name: "below threshold",
+ loadAvg: &load.AverageStats{OneMin: 1.0},
+ cpuCount: 2,
+ multiplier: 2.0,
+ wantStatus: false,
+ },
+ {
+ name: "nil load",
+ loadAvg: nil,
+ cpuCount: 2,
+ multiplier: 2.0,
+ wantStatus: false,
+ },
+ {
+ name: "zero cpus",
+ loadAvg: &load.AverageStats{OneMin: 5.0},
+ cpuCount: 0,
+ multiplier: 2.0,
+ wantStatus: false,
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ c := evaluateHighLoad(tt.loadAvg, tt.cpuCount, tt.multiplier, nil)
+ s.Equal(tt.wantStatus, c.Status)
+ s.Equal(job.ConditionHighLoad, c.Type)
+ })
+ }
+}
+
+func (s *ConditionTestSuite) TestEvaluateDiskPressure() {
+ tests := []struct {
+ name string
+ disks []disk.UsageStats
+ threshold int
+ wantStatus bool
+ }{
+ {
+ name: "one disk above threshold",
+ disks: []disk.UsageStats{
+ {Name: "/dev/sda1", Total: 100000, Used: 95000, Free: 5000},
+ },
+ threshold: 90,
+ wantStatus: true,
+ },
+ {
+ name: "all disks below threshold",
+ disks: []disk.UsageStats{
+ {Name: "/dev/sda1", Total: 100000, Used: 50000, Free: 50000},
+ },
+ threshold: 90,
+ wantStatus: false,
+ },
+ {
+ name: "nil disks",
+ disks: nil,
+ threshold: 90,
+ wantStatus: false,
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ c := evaluateDiskPressure(tt.disks, tt.threshold, nil)
+ s.Equal(tt.wantStatus, c.Status)
+ s.Equal(job.ConditionDiskPressure, c.Type)
+ })
+ }
+}
+
+func (s *ConditionTestSuite) TestLastTransitionTimeTracking() {
+ prev := []job.Condition{{
+ Type: job.ConditionMemoryPressure, Status: false,
+ LastTransitionTime: time.Now().Add(-5 * time.Minute),
+ }}
+ // Flip from false -> true: should update LastTransitionTime
+ c := evaluateMemoryPressure(
+ &mem.Stats{Total: 100, Used: 95, Free: 5}, 90, prev,
+ )
+ s.True(c.Status)
+ s.True(c.LastTransitionTime.After(time.Now().Add(-1 * time.Second)))
+
+ // Same status (true -> true): should keep old LastTransitionTime
+ prev2 := []job.Condition{c}
+ c2 := evaluateMemoryPressure(
+ &mem.Stats{Total: 100, Used: 95, Free: 5}, 90, prev2,
+ )
+ s.True(c2.Status)
+ s.Equal(c.LastTransitionTime, c2.LastTransitionTime)
+}
+```
+
+**Step 2: Run tests to verify they fail**
+
+Run: `go test -run TestConditionTestSuite -v ./internal/agent/` Expected: FAIL —
+`evaluateMemoryPressure` not defined
+
+**Step 3: Write minimal implementation**
+
+```go
+// internal/agent/condition.go
+package agent
+
+import (
+ "fmt"
+ "time"
+
+ "github.com/retr0h/osapi/internal/job"
+ "github.com/retr0h/osapi/internal/provider/node/disk"
+ "github.com/retr0h/osapi/internal/provider/node/load"
+ "github.com/retr0h/osapi/internal/provider/node/mem"
+)
+
+// findPrevCondition returns the previous condition of the given type,
+// or nil if not found.
+func findPrevCondition(
+ condType string,
+ prev []job.Condition,
+) *job.Condition {
+ for i := range prev {
+ if prev[i].Type == condType {
+ return &prev[i]
+ }
+ }
+ return nil
+}
+
+// transitionTime returns the previous LastTransitionTime if status
+// hasn't changed, otherwise returns now.
+func transitionTime(
+ condType string,
+ newStatus bool,
+ prev []job.Condition,
+) time.Time {
+ if p := findPrevCondition(condType, prev); p != nil {
+ if p.Status == newStatus {
+ return p.LastTransitionTime
+ }
+ }
+ return time.Now()
+}
+
+func evaluateMemoryPressure(
+ stats *mem.Stats,
+ threshold int,
+ prev []job.Condition,
+) job.Condition {
+ c := job.Condition{Type: job.ConditionMemoryPressure}
+ if stats == nil || stats.Total == 0 {
+ c.LastTransitionTime = transitionTime(c.Type, false, prev)
+ return c
+ }
+ pct := float64(stats.Used) / float64(stats.Total) * 100
+ c.Status = pct > float64(threshold)
+ if c.Status {
+ c.Reason = fmt.Sprintf(
+ "memory %.0f%% used (%.1f/%.1f GB)",
+ pct,
+ float64(stats.Used)/1024/1024/1024,
+ float64(stats.Total)/1024/1024/1024,
+ )
+ }
+ c.LastTransitionTime = transitionTime(c.Type, c.Status, prev)
+ return c
+}
+
+func evaluateHighLoad(
+ loadAvg *load.AverageStats,
+ cpuCount int,
+ multiplier float64,
+ prev []job.Condition,
+) job.Condition {
+ c := job.Condition{Type: job.ConditionHighLoad}
+ if loadAvg == nil || cpuCount == 0 {
+ c.LastTransitionTime = transitionTime(c.Type, false, prev)
+ return c
+ }
+ threshold := float64(cpuCount) * multiplier
+ c.Status = loadAvg.OneMin > threshold
+ if c.Status {
+ c.Reason = fmt.Sprintf(
+ "load %.2f, threshold %.2f for %d CPUs",
+ loadAvg.OneMin, threshold, cpuCount,
+ )
+ }
+ c.LastTransitionTime = transitionTime(c.Type, c.Status, prev)
+ return c
+}
+
+func evaluateDiskPressure(
+ disks []disk.UsageStats,
+ threshold int,
+ prev []job.Condition,
+) job.Condition {
+ c := job.Condition{Type: job.ConditionDiskPressure}
+ if len(disks) == 0 {
+ c.LastTransitionTime = transitionTime(c.Type, false, prev)
+ return c
+ }
+ for _, d := range disks {
+ if d.Total == 0 {
+ continue
+ }
+ pct := float64(d.Used) / float64(d.Total) * 100
+ if pct > float64(threshold) {
+ c.Status = true
+ c.Reason = fmt.Sprintf(
+ "%s %.0f%% used (%.1f/%.1f GB)",
+ d.Name, pct,
+ float64(d.Used)/1024/1024/1024,
+ float64(d.Total)/1024/1024/1024,
+ )
+ break
+ }
+ }
+ c.LastTransitionTime = transitionTime(c.Type, c.Status, prev)
+ return c
+}
+```
+
+**Step 4: Run tests to verify they pass**
+
+Run: `go test -run TestConditionTestSuite -v ./internal/agent/` Expected: PASS
+
+**Step 5: Commit**
+
+```bash
+git add internal/agent/condition.go internal/agent/condition_test.go
+git commit -m "feat(agent): add condition evaluation functions"
+```
+
+---
+
+## Task 2: Add Condition and State types to job domain
+
+**Files:**
+
+- Modify: `internal/job/types.go:273-331` (AgentRegistration, AgentInfo)
+
+**Step 1: Write the types**
+
+Add to `internal/job/types.go` after existing types:
+
+```go
+// Condition type constants.
+const (
+ ConditionMemoryPressure = "MemoryPressure"
+ ConditionHighLoad = "HighLoad"
+ ConditionDiskPressure = "DiskPressure"
+)
+
+// Agent state constants.
+const (
+ AgentStateReady = "Ready"
+ AgentStateDraining = "Draining"
+ AgentStateCordoned = "Cordoned"
+)
+
+// Condition represents a node condition evaluated agent-side.
+type Condition struct {
+ Type string `json:"type"`
+ Status bool `json:"status"`
+ Reason string `json:"reason,omitempty"`
+ LastTransitionTime time.Time `json:"last_transition_time"`
+}
+
+```
+
+The existing `TimelineEvent` type (line 177) is already generic and will be
+reused for agent state transitions — no new event type needed.
+
+Add fields to `AgentRegistration`:
+
+```go
+Conditions []Condition `json:"conditions,omitempty"`
+State string `json:"state,omitempty"`
+```
+
+Add fields to `AgentInfo`:
+
+```go
+Conditions []Condition `json:"conditions,omitempty"`
+State string `json:"state,omitempty"`
+Timeline []TimelineEvent `json:"timeline,omitempty"`
+```
+
+**Step 2: Run existing tests**
+
+Run: `go test ./internal/job/... -count=1` Expected: PASS (additive change)
+
+**Step 3: Commit**
+
+```bash
+git add internal/job/types.go
+git commit -m "feat(job): add Condition type and agent state constants"
+```
+
+---
+
+## Task 3: Add conditions config to AgentConfig
+
+**Files:**
+
+- Modify: `internal/config/types.go:262-277`
+- Modify: `configs/osapi.yaml`
+- Modify: `configs/osapi.local.yaml`
+
+**Step 1: Add config struct**
+
+Add to `internal/config/types.go`:
+
+```go
+// AgentConditions holds threshold configuration for node conditions.
+type AgentConditions struct {
+ MemoryPressureThreshold int `mapstructure:"memory_pressure_threshold"`
+ HighLoadMultiplier float64 `mapstructure:"high_load_multiplier"`
+ DiskPressureThreshold int `mapstructure:"disk_pressure_threshold"`
+}
+```
+
+Add field to `AgentConfig`:
+
+```go
+Conditions AgentConditions `mapstructure:"conditions,omitempty"`
+```
+
+**Step 2: Set defaults in osapi.yaml and osapi.local.yaml**
+
+```yaml
+agent:
+ conditions:
+ memory_pressure_threshold: 90
+ high_load_multiplier: 2.0
+ disk_pressure_threshold: 90
+```
+
+**Step 3: Verify compilation**
+
+Run: `go build ./...` Expected: compiles
+
+**Step 4: Commit**
+
+```bash
+git add internal/config/types.go configs/osapi.yaml configs/osapi.local.yaml
+git commit -m "feat(config): add agent conditions threshold configuration"
+```
+
+---
+
+## Task 4: Add disk stats to heartbeat and evaluate conditions
+
+**Files:**
+
+- Modify: `internal/agent/heartbeat.go:88-134` (writeRegistration)
+- Modify: `internal/agent/types.go:45-81` (add prevConditions, cpuCount)
+
+**Step 1: Add fields to Agent struct**
+
+In `internal/agent/types.go`, add to Agent struct:
+
+```go
+// prevConditions tracks condition state between heartbeats.
+prevConditions []job.Condition
+
+// cpuCount cached from facts for HighLoad evaluation.
+cpuCount int
+```
+
+**Step 2: Extend writeRegistration**
+
+In `internal/agent/heartbeat.go`, after memory stats collection (~line 111),
+add:
+
+```go
+// Collect disk stats (non-fatal).
+var diskStats []disk.UsageStats
+if stats, err := a.diskProvider.GetLocalUsageStats(); err == nil {
+ diskStats = stats
+}
+
+// Evaluate conditions.
+conditions := []job.Condition{
+ evaluateMemoryPressure(
+ memStats,
+ a.appConfig.Agent.Conditions.MemoryPressureThreshold,
+ a.prevConditions,
+ ),
+ evaluateHighLoad(
+ loadAvg,
+ a.cpuCount,
+ a.appConfig.Agent.Conditions.HighLoadMultiplier,
+ a.prevConditions,
+ ),
+ evaluateDiskPressure(
+ diskStats,
+ a.appConfig.Agent.Conditions.DiskPressureThreshold,
+ a.prevConditions,
+ ),
+}
+a.prevConditions = conditions
+```
+
+Add `Conditions: conditions` to the `AgentRegistration` literal.
+
+**Step 3: Set cpuCount from facts**
+
+In `internal/agent/facts.go` (the `writeFacts` function), after collecting
+`CPUCount`, add:
+
+```go
+a.cpuCount = cpuCount
+```
+
+**Step 4: Run tests**
+
+Run: `go test ./internal/agent/... -count=1` Expected: PASS
+
+**Step 5: Commit**
+
+```bash
+git add internal/agent/heartbeat.go internal/agent/types.go internal/agent/facts.go
+git commit -m "feat(agent): evaluate node conditions on heartbeat tick"
+```
+
+---
+
+## Task 5: Add drain timeline event storage functions
+
+**Files:**
+
+- Modify: `internal/job/client/agent.go:39-85`
+- Create: `internal/job/client/agent_timeline_test.go`
+
+**Step 1: Write failing tests**
+
+```go
+// internal/job/client/agent_timeline_test.go
+package client_test
+
+// Test WriteAgentTimelineEvent writes append-only key to stateKV.
+// Test ComputeAgentState returns latest state from timeline events.
+// Test GetAgentTimeline returns sorted timeline events.
+```
+
+Table-driven tests:
+
+- `WriteAgentTimelineEvent` writes key like
+ `timeline.{hostname}.{event}.{unix_nano}`
+- `ComputeAgentState` with no events returns "Ready"
+- `ComputeAgentState` with drain event returns "Draining"
+- `ComputeAgentState` with cordoned event returns "Cordoned"
+- `ComputeAgentState` with undrain event returns "Ready"
+
+**Step 2: Run tests to verify they fail**
+
+Run: `go test -run TestAgentTimeline -v ./internal/job/client/` Expected: FAIL
+
+**Step 3: Implement**
+
+Add to `internal/job/client/agent.go`:
+
+```go
+// WriteAgentTimelineEvent writes an append-only timeline event
+// for an agent state transition. Reuses the same TimelineEvent
+// type used by job lifecycle events.
+func (c *Client) WriteAgentTimelineEvent(
+	ctx context.Context,
+	hostname, event, message string,
+) error {
+	now := time.Now()
+	key := fmt.Sprintf(
+		"timeline.%s.%s.%d",
+		job.SanitizeHostname(hostname),
+		event,
+		now.UnixNano(),
+	)
+	data, _ := json.Marshal(job.TimelineEvent{
+		Timestamp: now,
+		Event:     event,
+		Hostname:  hostname,
+		Message:   message,
+	})
+	_, err := c.stateKV.Put(ctx, key, data)
+ return err
+}
+
+// GetAgentTimeline returns sorted timeline events for a hostname.
+func (c *Client) GetAgentTimeline(
+ ctx context.Context,
+ hostname string,
+) ([]job.TimelineEvent, error) {
+ prefix := "timeline." + job.SanitizeHostname(hostname) + "."
+ // List keys with prefix, unmarshal, sort by Timestamp
+ // Return sorted events
+}
+
+// ComputeAgentState returns the current state from timeline events.
+func ComputeAgentState(
+ events []job.TimelineEvent,
+) string {
+ if len(events) == 0 {
+ return job.AgentStateReady
+ }
+ latest := events[len(events)-1]
+ switch latest.Event {
+ case "drain":
+ return job.AgentStateDraining
+ case "cordoned":
+ return job.AgentStateCordoned
+ case "undrain", "ready":
+ return job.AgentStateReady
+ default:
+ return job.AgentStateReady
+ }
+}
+```
+
+Add `WriteAgentTimelineEvent`, `GetAgentTimeline` to the `JobClient` interface
+in `internal/job/client/types.go`. Regenerate mocks.
+
+**Step 4: Run tests**
+
+Run: `go test -run TestAgentTimeline -v ./internal/job/client/` Expected: PASS
+
+**Step 5: Commit**
+
+```bash
+git add internal/job/client/agent.go internal/job/client/agent_timeline_test.go \
+ internal/job/client/types.go internal/job/client/mock_*.go
+git commit -m "feat(job): add append-only timeline events for agent drain"
+```
+
+---
+
+## Task 6: Add drain/undrain API endpoints
+
+**Files:**
+
+- Modify: `internal/api/agent/gen/api.yaml`
+- Create: `internal/api/agent/agent_drain.go`
+- Create: `internal/api/agent/agent_drain_public_test.go`
+
+**Step 1: Add to OpenAPI spec**
+
+Add to `internal/api/agent/gen/api.yaml`:
+
+```yaml
+/agent/{hostname}/drain:
+ post:
+ operationId: drainAgent
+ summary: Drain an agent
+ description: Stop the agent from accepting new jobs.
+ security:
+ - BearerAuth:
+ - 'agent:write'
+ parameters:
+ - name: hostname
+ in: path
+ required: true
+ schema:
+ type: string
+ responses:
+ '200':
+ description: Agent drain initiated.
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ message:
+ type: string
+ '404':
+ description: Agent not found.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '409':
+ description: Agent already in requested state.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+
+/agent/{hostname}/undrain:
+ post:
+ operationId: undrainAgent
+ summary: Undrain an agent
+ description: Resume accepting jobs on a drained agent.
+ security:
+ - BearerAuth:
+ - 'agent:write'
+ parameters:
+ - name: hostname
+ in: path
+ required: true
+ schema:
+ type: string
+ responses:
+ '200':
+ description: Agent undrain initiated.
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ message:
+ type: string
+ '404':
+ description: Agent not found.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '409':
+ description: Agent not in draining/cordoned state.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+```
+
+Add `agent:write` to BearerAuth scopes. Add `state` and `conditions` fields to
+`AgentInfo` schema. Add `NodeCondition` schema.
+
+Run: `just generate` to regenerate `*.gen.go`.
+
+**Step 2: Write failing tests**
+
+```go
+// internal/api/agent/agent_drain_public_test.go
+// Table-driven tests for DrainAgent and UndrainAgent:
+// - 200: agent found and drain initiated
+// - 404: agent not found
+// - 409: already draining/cordoned
+// - HTTP wiring: RBAC (401, 403 without agent:write, 200 with agent:write)
+```
+
+**Step 3: Implement handlers**
+
+```go
+// internal/api/agent/agent_drain.go
+package agent
+
+func (a *Agent) DrainAgent(
+ ctx context.Context,
+ request gen.DrainAgentRequestObject,
+) (gen.DrainAgentResponseObject, error) {
+ hostname := request.Hostname
+
+ // 1. Verify agent exists
+ agentInfo, err := a.JobClient.GetAgent(ctx, hostname)
+ if err != nil {
+ return gen.DrainAgent404JSONResponse{...}, nil
+ }
+
+ // 2. Check not already draining
+ if agentInfo.State == job.AgentStateDraining ||
+ agentInfo.State == job.AgentStateCordoned {
+ return gen.DrainAgent409JSONResponse{...}, nil
+ }
+
+ // 3. Write drain intent key
+ // 4. Write state event
+ return gen.DrainAgent200JSONResponse{...}, nil
+}
+
+func (a *Agent) UndrainAgent(
+ ctx context.Context,
+ request gen.UndrainAgentRequestObject,
+) (gen.UndrainAgentResponseObject, error) {
+ // Similar: verify exists, check state, delete drain key, write event
+}
+```
+
+**Step 4: Run tests**
+
+Run: `go test ./internal/api/agent/... -count=1` Expected: PASS
+
+**Step 5: Commit**
+
+```bash
+git add internal/api/agent/gen/api.yaml internal/api/agent/gen/*.gen.go \
+ internal/api/agent/agent_drain.go internal/api/agent/agent_drain_public_test.go
+git commit -m "feat(api): add drain/undrain endpoints with RBAC"
+```
+
+---
+
+## Task 7: Add agent:write permission
+
+**Files:**
+
+- Modify: `internal/authtoken/permissions.go:27-37` (add constant)
+- Modify: `internal/authtoken/permissions.go:53-81` (add to admin role)
+
+**Step 1: Add permission constant**
+
+```go
+PermAgentWrite Permission = "agent:write"
+```
+
+**Step 2: Add to admin role**
+
+In `DefaultRolePermissions`, add `PermAgentWrite` to the `admin` slice.
+
+**Step 3: Run tests**
+
+Run: `go test ./internal/authtoken/... -count=1` Expected: PASS
+
+**Step 4: Commit**
+
+```bash
+git add internal/authtoken/permissions.go
+git commit -m "feat(auth): add agent:write permission for drain operations"
+```
+
+---
+
+## Task 8: Wire drain endpoints into server
+
+**Files:**
+
+- Modify: `internal/api/handler_agent.go:34-61`
+- Modify: `internal/api/handler_agent_public_test.go`
+
+**Step 1: Update handler registration**
+
+The `GetAgentHandler` already wires all agent gen handlers through
+`scopeMiddleware`. After regenerating the OpenAPI code (Task 6), the new
+`DrainAgent` and `UndrainAgent` methods on the strict server interface will be
+picked up automatically by `RegisterHandlers`.
+
+No code change needed in `handler_agent.go` unless `unauthenticatedOperations`
+needs updating (it doesn't — drain requires auth).
+
+**Step 2: Verify compilation**
+
+Run: `go build ./...` Expected: compiles
+
+**Step 3: Add handler test cases**
+
+Add test cases to `handler_agent_public_test.go` for drain/undrain handler
+registration.
+
+**Step 4: Commit**
+
+```bash
+git add internal/api/handler_agent.go internal/api/handler_agent_public_test.go
+git commit -m "feat(api): wire drain/undrain handlers into server"
+```
+
+---
+
+## Task 9: Add drain detection to agent heartbeat
+
+**Files:**
+
+- Modify: `internal/agent/heartbeat.go:88-134`
+- Modify: `internal/agent/server.go:32-61`
+- Create: `internal/agent/drain.go`
+- Create: `internal/agent/drain_test.go`
+
+**Step 1: Write failing tests**
+
+```go
+// internal/agent/drain_test.go
+// Test checkDrainFlag: returns true when drain key exists
+// Test checkDrainFlag: returns false when drain key absent
+// Test handleDrainTransition: unsubscribes consumers when draining
+// Test handleUndrainTransition: resubscribes consumers when undrained
+```
+
+**Step 2: Implement drain detection**
+
+```go
+// internal/agent/drain.go
+package agent
+
+// checkDrainFlag reads drain.{hostname} from stateKV.
+func (a *Agent) checkDrainFlag(
+	ctx context.Context,
+	hostname string,
+) bool {
+	key := "drain." + job.SanitizeHostname(hostname)
+	_, err := a.stateKV.Get(ctx, key)
+ return err == nil
+}
+
+// handleDrainDetection checks drain flag on each heartbeat.
+func (a *Agent) handleDrainDetection(
+ ctx context.Context,
+ hostname string,
+) {
+ drainRequested := a.checkDrainFlag(ctx, hostname)
+
+ switch {
+ case drainRequested && a.state == job.AgentStateReady:
+ a.state = job.AgentStateDraining
+ a.unsubscribeConsumers()
+ // Write timeline event: "drain", "Drain initiated"
+ // When WaitGroup drains, transition to Cordoned
+
+ case !drainRequested && a.state == job.AgentStateCordoned:
+ a.state = job.AgentStateReady
+ a.resubscribeConsumers(ctx, hostname)
+ // Write timeline event: "undrain", "Resumed accepting jobs"
+ }
+}
+```
+
+**Step 3: Add state field to Agent struct**
+
+In `internal/agent/types.go`:
+
+```go
+state string // Ready, Draining, Cordoned
+```
+
+Initialize to `job.AgentStateReady` in `Start()`.
+
+**Step 4: Call from heartbeat**
+
+In `writeRegistration()`, add `a.handleDrainDetection(ctx, hostname)` and
+include `State: a.state` in the registration.
+
+**Step 5: Run tests**
+
+Run: `go test ./internal/agent/... -count=1` Expected: PASS
+
+**Step 6: Commit**
+
+```bash
+git add internal/agent/drain.go internal/agent/drain_test.go \
+ internal/agent/heartbeat.go internal/agent/types.go internal/agent/server.go
+git commit -m "feat(agent): detect drain flag and manage consumer lifecycle"
+```
+
+---
+
+## Task 10: Extend buildAgentInfo with conditions and state
+
+**Files:**
+
+- Modify: `internal/api/agent/agent_list.go:59-171` (buildAgentInfo)
+- Modify: `internal/api/agent/agent_list_public_test.go`
+- Modify: `internal/job/client/query.go:479-493` (agentInfoFromRegistration)
+
+**Step 1: Update agentInfoFromRegistration**
+
+Add to the returned `AgentInfo`:
+
+```go
+Conditions: reg.Conditions,
+State: reg.State,
+```
+
+**Step 2: Update buildAgentInfo**
+
+Map conditions and state from `job.AgentInfo` to `gen.AgentInfo`:
+
+```go
+if len(a.Conditions) > 0 {
+ conditions := make([]gen.NodeCondition, 0, len(a.Conditions))
+ for _, c := range a.Conditions {
+ nc := gen.NodeCondition{
+ Type: gen.NodeConditionType(c.Type),
+ Status: c.Status,
+ LastTransitionTime: c.LastTransitionTime,
+ }
+ if c.Reason != "" {
+ nc.Reason = &c.Reason
+ }
+ conditions = append(conditions, nc)
+ }
+ info.Conditions = &conditions
+}
+
+if a.State != "" {
+ state := gen.AgentInfoState(a.State)
+ info.State = &state
+}
+```
+
+**Step 3: Update status derivation**
+
+Change status logic: if `a.State` is set, use it; otherwise default to `Ready`
+(existing behavior).
+
+**Step 4: Add test cases**
+
+Add table-driven test case for agent with conditions and Draining/Cordoned
+states.
+
+**Step 5: Run tests**
+
+Run: `go test ./internal/api/agent/... -count=1` Expected: PASS
+
+**Step 6: Commit**
+
+```bash
+git add internal/api/agent/agent_list.go internal/api/agent/agent_list_public_test.go \
+ internal/job/client/query.go
+git commit -m "feat(api): expose conditions and state in agent responses"
+```
+
+---
+
+## Task 11: Add timeline to GetAgent response
+
+**Files:**
+
+- Modify: `internal/job/client/query.go:423-445` (GetAgent)
+- Modify: `internal/job/client/query_public_test.go`
+
+**Step 1: Extend GetAgent to fetch timeline events**
+
+After building `AgentInfo`, fetch timeline events:
+
+```go
+timeline, err := c.GetAgentTimeline(ctx, hostname)
+if err == nil {
+ info.Timeline = timeline
+}
+```
+
+**Step 2: Add test cases**
+
+Test GetAgent returns timeline events when present.
+
+**Step 3: Run tests**
+
+Run: `go test ./internal/job/client/... -count=1` Expected: PASS
+
+**Step 4: Commit**
+
+```bash
+git add internal/job/client/query.go internal/job/client/query_public_test.go
+git commit -m "feat(job): include timeline events in GetAgent response"
+```
+
+---
+
+## Task 12: Update SDK with conditions, state, drain/undrain
+
+**Files:**
+
+- Modify: `osapi-sdk/pkg/osapi/gen/agent/api.yaml` (copy from osapi)
+- Modify: `osapi-sdk/pkg/osapi/agent.go` (add Drain, Undrain methods)
+- Modify: `osapi-sdk/pkg/osapi/agent_types.go` (add conditions, state, timeline
+ to Agent type)
+- Create: `osapi-sdk/pkg/osapi/types.go` (promote TimelineEvent to shared type)
+- Modify: `osapi-sdk/pkg/osapi/job_types.go` (remove TimelineEvent, import from
+ types.go)
+
+**Step 1: Promote TimelineEvent to shared type**
+
+Move `TimelineEvent` from `job_types.go` to a new `types.go`:
+
+```go
+// pkg/osapi/types.go
+
+// TimelineEvent represents a lifecycle event. Used by both job
+// timelines and agent state transition history.
+type TimelineEvent struct {
+ Timestamp string
+ Event string
+ Hostname string
+ Message string
+ Error string
+}
+```
+
+Update `job_types.go` to remove the `TimelineEvent` definition —
+`JobDetail.Timeline` now references the shared type.
+
+**Step 2: Sync OpenAPI spec**
+
+Copy `internal/api/agent/gen/api.yaml` to
+`osapi-sdk/pkg/osapi/gen/agent/api.yaml`.
+
+Run `redocly join` + `go generate` in the SDK.
+
+**Step 3: Add domain types**
+
+```go
+// In agent_types.go
+type Agent struct {
+ // ... existing fields ...
+ State string
+ Conditions []Condition
+ Timeline []TimelineEvent // shared type from types.go
+}
+
+type Condition struct {
+ Type string
+ Status bool
+ Reason string
+ LastTransitionTime time.Time
+}
+```
+
+**Step 4: Add Drain/Undrain methods**
+
+```go
+func (s *AgentService) Drain(
+ ctx context.Context,
+ hostname string,
+) (*Response[any], error) {
+ // POST /agent/{hostname}/drain
+}
+
+func (s *AgentService) Undrain(
+ ctx context.Context,
+ hostname string,
+) (*Response[any], error) {
+ // POST /agent/{hostname}/undrain
+}
+```
+
+**Step 5: Run SDK tests**
+
+Run: `go test ./pkg/osapi/... -count=1` Expected: PASS
+
+**Step 6: Commit (in osapi-sdk repo)**
+
+```bash
+git add pkg/osapi/
+git commit -m "feat(agent): add conditions, state, drain/undrain support"
+```
+
+---
+
+## Task 13: Add CONDITIONS column to agent list CLI
+
+**Files:**
+
+- Modify: `cmd/client_agent_list.go`
+
+**Step 1: Add CONDITIONS column**
+
+In the table builder for `agent list`, add a column that joins active condition
+type names:
+
+```go
+conditions := "-"
+if len(agent.Conditions) > 0 {
+ active := make([]string, 0)
+ for _, c := range agent.Conditions {
+ if c.Status {
+ active = append(active, c.Type)
+ }
+ }
+ if len(active) > 0 {
+ conditions = strings.Join(active, ",")
+ }
+}
+```
+
+Headers: `HOSTNAME`, `STATUS`, `CONDITIONS`, `LABELS`, `AGE`, `LOAD`, `OS`
+
+**Step 2: Use State for STATUS column**
+
+Replace hardcoded "Ready" with `agent.State` (defaulting to "Ready" if empty).
+
+**Step 3: Run `go build ./cmd/...`**
+
+Expected: compiles
+
+**Step 4: Commit**
+
+```bash
+git add cmd/client_agent_list.go
+git commit -m "feat(cli): add CONDITIONS column and state to agent list"
+```
+
+---
+
+## Task 14: Add conditions and timeline to agent get CLI
+
+**Files:**
+
+- Modify: `cmd/client_agent_get.go:58-141`
+
+**Step 1: Add state to agent get output**
+
+After the Status KV line, display the State:
+
+```go
+if data.State != "" && data.State != "Ready" {
+ cli.PrintKV("State", data.State)
+}
+```
+
+**Step 2: Add conditions section**
+
+```go
+if len(data.Conditions) > 0 {
+ condRows := make([][]string, 0, len(data.Conditions))
+ for _, c := range data.Conditions {
+ status := "false"
+ if c.Status {
+ status = "true"
+ }
+ reason := ""
+ if c.Reason != "" {
+ reason = c.Reason
+ }
+ since := cli.FormatAge(time.Since(c.LastTransitionTime)) + " ago"
+ condRows = append(condRows, []string{c.Type, status, reason, since})
+ }
+ sections = append(sections, cli.Section{
+ Title: "Conditions",
+ Headers: []string{"TYPE", "STATUS", "REASON", "SINCE"},
+ Rows: condRows,
+ })
+}
+```
+
+**Step 3: Add timeline section**
+
+Same pattern as `DisplayJobDetail` in `internal/cli/ui.go:600-615`:
+
+```go
+if len(data.Timeline) > 0 {
+ timelineRows := make([][]string, 0, len(data.Timeline))
+ for _, te := range data.Timeline {
+ timelineRows = append(timelineRows, []string{
+ te.Timestamp, te.Event, te.Hostname, te.Message, te.Error,
+ })
+ }
+ sections = append(sections, cli.Section{
+ Title: "Timeline",
+ Headers: []string{"TIMESTAMP", "EVENT", "HOSTNAME", "MESSAGE", "ERROR"},
+ Rows: timelineRows,
+ })
+}
+```
+
+**Step 4: Run `go build ./cmd/...`**
+
+Expected: compiles
+
+**Step 5: Commit**
+
+```bash
+git add cmd/client_agent_get.go
+git commit -m "feat(cli): display conditions and timeline in agent get"
+```
+
+---
+
+## Task 15: Add agent drain/undrain CLI commands
+
+**Files:**
+
+- Create: `cmd/client_agent_drain.go`
+- Create: `cmd/client_agent_undrain.go`
+
+**Step 1: Create drain command**
+
+```go
+// cmd/client_agent_drain.go
+var clientAgentDrainCmd = &cobra.Command{
+ Use: "drain",
+ Short: "Drain an agent",
+ Long: `Stop an agent from accepting new jobs. In-flight jobs complete.`,
+ Run: func(cmd *cobra.Command, _ []string) {
+ ctx := cmd.Context()
+ hostname, _ := cmd.Flags().GetString("hostname")
+
+ resp, err := sdkClient.Agent.Drain(ctx, hostname)
+ if err != nil {
+ cli.HandleError(err, logger)
+ return
+ }
+
+ if jsonOutput {
+ fmt.Println(string(resp.RawJSON()))
+ return
+ }
+
+ fmt.Printf("Agent %s drain initiated\n", hostname)
+ },
+}
+```
+
+**Step 2: Create undrain command**
+
+Similar pattern for `undrain`.
+
+**Step 3: Register commands**
+
+```go
+func init() {
+ clientAgentCmd.AddCommand(clientAgentDrainCmd)
+ clientAgentDrainCmd.Flags().String("hostname", "", "Hostname of the agent to drain")
+ _ = clientAgentDrainCmd.MarkFlagRequired("hostname")
+}
+```
+
+**Step 4: Run `go build ./cmd/...`**
+
+Expected: compiles
+
+**Step 5: Commit**
+
+```bash
+git add cmd/client_agent_drain.go cmd/client_agent_undrain.go
+git commit -m "feat(cli): add agent drain and undrain commands"
+```
+
+---
+
+## Task 16: Update documentation
+
+**Files:**
+
+- Modify: `docs/docs/sidebar/features/agent-management.md` (or create)
+- Modify: `docs/docs/sidebar/usage/configuration.md`
+- Modify: `docs/docs/sidebar/usage/cli/client/agent/`
+
+**Step 1: Add conditions and drain docs**
+
+Document:
+
+- Condition types and thresholds
+- Drain lifecycle (Ready → Draining → Cordoned)
+- CLI commands (`agent drain`, `agent undrain`)
+- Configuration section for `agent.conditions`
+
+**Step 2: Update permission table**
+
+Add `agent:write` to the permissions table in configuration.md.
+
+**Step 3: Commit**
+
+```bash
+git add docs/
+git commit -m "docs: add node conditions and agent drain documentation"
+```
+
+---
+
+## Task 17: Final verification
+
+**Step 1: Regenerate**
+
+Run: `just generate` Expected: no diff
+
+**Step 2: Build**
+
+Run: `go build ./...` Expected: compiles
+
+**Step 3: Unit tests**
+
+Run: `just go::unit` Expected: PASS
+
+**Step 4: Lint**
+
+Run: `just go::vet` Expected: clean
+
+**Step 5: Coverage check**
+
+Run:
+`go test -coverprofile=coverage.out ./internal/agent/... ./internal/job/client/... ./internal/api/agent/...`
+Expected: condition.go, drain.go, agent_drain.go at 100%
+
+---
+
+## Verification
+
+```bash
+just generate # regenerate specs + code
+go build ./... # compiles
+just go::unit # tests pass
+just go::vet # lint passes
+```
diff --git a/go.mod b/go.mod
index e8c53583..b53e2db0 100644
--- a/go.mod
+++ b/go.mod
@@ -18,7 +18,7 @@ require (
github.com/oapi-codegen/runtime v1.2.0
github.com/osapi-io/nats-client v0.0.0-20260222233639-d0822e0a4b86
github.com/osapi-io/nats-server v0.0.0-20260216201410-1f33dfc63848
- github.com/osapi-io/osapi-sdk v0.0.0-20260305004213-6ad316fa4505
+ github.com/osapi-io/osapi-sdk v0.0.0-20260306002247-11cb3395b3f9
github.com/prometheus-community/pro-bing v0.8.0
github.com/prometheus/client_golang v1.23.2
github.com/samber/slog-echo v1.21.0
diff --git a/go.sum b/go.sum
index f8ce5a76..30b725b2 100644
--- a/go.sum
+++ b/go.sum
@@ -755,8 +755,8 @@ github.com/osapi-io/nats-client v0.0.0-20260222233639-d0822e0a4b86 h1:ML0fdgr0M4
github.com/osapi-io/nats-client v0.0.0-20260222233639-d0822e0a4b86/go.mod h1:TQqODOjF2JuAOFrLtm1ItsMzPPAizKfHo+grOMuPDyE=
github.com/osapi-io/nats-server v0.0.0-20260216201410-1f33dfc63848 h1:ELW1sTVBn5JIc17mHgd5fhpO3/7btaxJpxykG2Fe0U4=
github.com/osapi-io/nats-server v0.0.0-20260216201410-1f33dfc63848/go.mod h1:4rzeY9jiJF/+Ej4WNwqK5HQ2sflZrEs60GxQpg3Iya8=
-github.com/osapi-io/osapi-sdk v0.0.0-20260305004213-6ad316fa4505 h1:J7Wv551BG39Ma9LLWxvZgsaWVNkP5TkteHzExSjt9e4=
-github.com/osapi-io/osapi-sdk v0.0.0-20260305004213-6ad316fa4505/go.mod h1:5Y45ymBR4BcxJTOJ7WhqYTDHXxtlQRW7Sr3G52pfMdI=
+github.com/osapi-io/osapi-sdk v0.0.0-20260306002247-11cb3395b3f9 h1:v7MKMVLktP3FotS5josRw5DlOKEsIwOQFAj2cd04VwE=
+github.com/osapi-io/osapi-sdk v0.0.0-20260306002247-11cb3395b3f9/go.mod h1:gL9oHgIkG+VMazSIXO4Nvwd3IXEuzRvuXstGiphSycc=
github.com/otiai10/copy v1.2.0/go.mod h1:rrF5dJ5F0t/EWSYODDu4j9/vEeYHMkc8jt0zJChqQWw=
github.com/otiai10/copy v1.14.0 h1:dCI/t1iTdYGtkvCuBG2BgR6KZa83PTclw4U5n2wAllU=
github.com/otiai10/copy v1.14.0/go.mod h1:ECfuL02W+/FkTWZWgQqXPWZgW9oeKCSQ5qVfSc4qc4w=
diff --git a/internal/agent/condition.go b/internal/agent/condition.go
new file mode 100644
index 00000000..db786c99
--- /dev/null
+++ b/internal/agent/condition.go
@@ -0,0 +1,138 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package agent
+
+import (
+ "fmt"
+ "time"
+
+ "github.com/retr0h/osapi/internal/job"
+ "github.com/retr0h/osapi/internal/provider/node/disk"
+ "github.com/retr0h/osapi/internal/provider/node/load"
+ "github.com/retr0h/osapi/internal/provider/node/mem"
+)
+
+// findPrevCondition returns the previous condition of the given type,
+// or nil if not found.
+func findPrevCondition(
+ condType string,
+ prev []job.Condition,
+) *job.Condition {
+ for i := range prev {
+ if prev[i].Type == condType {
+ return &prev[i]
+ }
+ }
+ return nil
+}
+
+// transitionTime returns the previous LastTransitionTime if status
+// hasn't changed, otherwise returns now.
+func transitionTime(
+ condType string,
+ newStatus bool,
+ prev []job.Condition,
+) time.Time {
+ if p := findPrevCondition(condType, prev); p != nil {
+ if p.Status == newStatus {
+ return p.LastTransitionTime
+ }
+ }
+ return time.Now()
+}
+
+func evaluateMemoryPressure(
+ stats *mem.Stats,
+ threshold int,
+ prev []job.Condition,
+) job.Condition {
+ c := job.Condition{Type: job.ConditionMemoryPressure}
+ if stats == nil || stats.Total == 0 {
+ c.LastTransitionTime = transitionTime(c.Type, false, prev)
+ return c
+ }
+ used := stats.Total - stats.Available
+ pct := float64(used) / float64(stats.Total) * 100
+ c.Status = pct > float64(threshold)
+ if c.Status {
+ c.Reason = fmt.Sprintf(
+ "memory %.0f%% used (%.1f/%.1f GB)",
+ pct,
+ float64(used)/1024/1024/1024,
+ float64(stats.Total)/1024/1024/1024,
+ )
+ }
+ c.LastTransitionTime = transitionTime(c.Type, c.Status, prev)
+ return c
+}
+
+func evaluateHighLoad(
+ loadAvg *load.AverageStats,
+ cpuCount int,
+ multiplier float64,
+ prev []job.Condition,
+) job.Condition {
+ c := job.Condition{Type: job.ConditionHighLoad}
+ if loadAvg == nil || cpuCount == 0 {
+ c.LastTransitionTime = transitionTime(c.Type, false, prev)
+ return c
+ }
+ threshold := float64(cpuCount) * multiplier
+ c.Status = float64(loadAvg.Load1) > threshold
+ if c.Status {
+ c.Reason = fmt.Sprintf(
+ "load %.2f, threshold %.2f for %d CPUs",
+ loadAvg.Load1, threshold, cpuCount,
+ )
+ }
+ c.LastTransitionTime = transitionTime(c.Type, c.Status, prev)
+ return c
+}
+
+func evaluateDiskPressure(
+ disks []disk.UsageStats,
+ threshold int,
+ prev []job.Condition,
+) job.Condition {
+ c := job.Condition{Type: job.ConditionDiskPressure}
+ if len(disks) == 0 {
+ c.LastTransitionTime = transitionTime(c.Type, false, prev)
+ return c
+ }
+ for _, d := range disks {
+ if d.Total == 0 {
+ continue
+ }
+ pct := float64(d.Used) / float64(d.Total) * 100
+ if pct > float64(threshold) {
+ c.Status = true
+ c.Reason = fmt.Sprintf(
+ "%s %.0f%% used (%.1f/%.1f GB)",
+ d.Name, pct,
+ float64(d.Used)/1024/1024/1024,
+ float64(d.Total)/1024/1024/1024,
+ )
+ break
+ }
+ }
+ c.LastTransitionTime = transitionTime(c.Type, c.Status, prev)
+ return c
+}
diff --git a/internal/agent/condition_test.go b/internal/agent/condition_test.go
new file mode 100644
index 00000000..720c971e
--- /dev/null
+++ b/internal/agent/condition_test.go
@@ -0,0 +1,619 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package agent
+
+import (
+ "testing"
+ "time"
+
+ "github.com/stretchr/testify/suite"
+
+ "github.com/retr0h/osapi/internal/job"
+ "github.com/retr0h/osapi/internal/provider/node/disk"
+ "github.com/retr0h/osapi/internal/provider/node/load"
+ "github.com/retr0h/osapi/internal/provider/node/mem"
+)
+
+type ConditionTestSuite struct {
+ suite.Suite
+}
+
+func (s *ConditionTestSuite) TestFindPrevCondition() {
+ tests := []struct {
+ name string
+ condType string
+ prev []job.Condition
+ validateFunc func(*job.Condition)
+ }{
+ {
+ name: "when condition type is found returns pointer",
+ condType: job.ConditionMemoryPressure,
+ prev: []job.Condition{
+ {
+ Type: job.ConditionMemoryPressure,
+ Status: true,
+ Reason: "high",
+ },
+ {
+ Type: job.ConditionHighLoad,
+ Status: false,
+ },
+ },
+ validateFunc: func(c *job.Condition) {
+ s.Require().NotNil(c)
+ s.Equal(job.ConditionMemoryPressure, c.Type)
+ s.True(c.Status)
+ s.Equal("high", c.Reason)
+ },
+ },
+ {
+ name: "when condition type is not found returns nil",
+ condType: job.ConditionDiskPressure,
+ prev: []job.Condition{
+ {
+ Type: job.ConditionMemoryPressure,
+ Status: true,
+ },
+ },
+ validateFunc: func(c *job.Condition) {
+ s.Nil(c)
+ },
+ },
+ {
+ name: "when prev is empty returns nil",
+ condType: job.ConditionHighLoad,
+ prev: []job.Condition{},
+ validateFunc: func(c *job.Condition) {
+ s.Nil(c)
+ },
+ },
+ {
+ name: "when prev is nil returns nil",
+ condType: job.ConditionHighLoad,
+ prev: nil,
+ validateFunc: func(c *job.Condition) {
+ s.Nil(c)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ result := findPrevCondition(tt.condType, tt.prev)
+ tt.validateFunc(result)
+ })
+ }
+}
+
+func (s *ConditionTestSuite) TestTransitionTime() {
+ fixedTime := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)
+
+ tests := []struct {
+ name string
+ condType string
+ newStatus bool
+ prev []job.Condition
+ validateFunc func(time.Time)
+ }{
+ {
+ name: "when matching prev has same status preserves transition time",
+ condType: job.ConditionHighLoad,
+ newStatus: true,
+ prev: []job.Condition{
+ {
+ Type: job.ConditionHighLoad,
+ Status: true,
+ LastTransitionTime: fixedTime,
+ },
+ },
+ validateFunc: func(t time.Time) {
+ s.Equal(fixedTime, t)
+ },
+ },
+ {
+ name: "when matching prev has different status returns now",
+ condType: job.ConditionHighLoad,
+ newStatus: true,
+ prev: []job.Condition{
+ {
+ Type: job.ConditionHighLoad,
+ Status: false,
+ LastTransitionTime: fixedTime,
+ },
+ },
+ validateFunc: func(t time.Time) {
+ s.NotEqual(fixedTime, t)
+ s.WithinDuration(time.Now(), t, 2*time.Second)
+ },
+ },
+ {
+ name: "when no matching prev returns now",
+ condType: job.ConditionDiskPressure,
+ newStatus: true,
+ prev: []job.Condition{
+ {
+ Type: job.ConditionHighLoad,
+ Status: true,
+ LastTransitionTime: fixedTime,
+ },
+ },
+ validateFunc: func(t time.Time) {
+ s.WithinDuration(time.Now(), t, 2*time.Second)
+ },
+ },
+ {
+ name: "when prev is empty returns now",
+ condType: job.ConditionHighLoad,
+ newStatus: false,
+ prev: []job.Condition{},
+ validateFunc: func(t time.Time) {
+ s.WithinDuration(time.Now(), t, 2*time.Second)
+ },
+ },
+ {
+ name: "when prev is nil returns now",
+ condType: job.ConditionHighLoad,
+ newStatus: false,
+ prev: nil,
+ validateFunc: func(t time.Time) {
+ s.WithinDuration(time.Now(), t, 2*time.Second)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ result := transitionTime(tt.condType, tt.newStatus, tt.prev)
+ tt.validateFunc(result)
+ })
+ }
+}
+
+func (s *ConditionTestSuite) TestEvaluateMemoryPressure() {
+ tests := []struct {
+ name string
+ stats *mem.Stats
+ threshold int
+ prev []job.Condition
+ validateFunc func(job.Condition)
+ }{
+ {
+ name: "when usage above threshold returns true with reason",
+ stats: &mem.Stats{
+ Total: 8 * 1024 * 1024 * 1024, // 8 GB
+ Available: 1 * 1024 * 1024 * 1024, // 1 GB available = 87.5% used
+ },
+ threshold: 80,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionMemoryPressure, c.Type)
+ s.True(c.Status)
+ s.Contains(c.Reason, "memory")
+ s.Contains(c.Reason, "88%")
+ s.Contains(c.Reason, "GB")
+ },
+ },
+ {
+ name: "when usage below threshold returns false",
+ stats: &mem.Stats{
+ Total: 8 * 1024 * 1024 * 1024, // 8 GB
+ Available: 6 * 1024 * 1024 * 1024, // 6 GB available = 25% used
+ },
+ threshold: 80,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionMemoryPressure, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when stats is nil returns false",
+ stats: nil,
+ threshold: 80,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionMemoryPressure, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when total is zero returns false",
+ stats: &mem.Stats{
+ Total: 0,
+ Available: 0,
+ },
+ threshold: 80,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionMemoryPressure, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when usage exactly at threshold returns false",
+ stats: &mem.Stats{
+ Total: 100,
+ Available: 20, // 80% used, threshold is 80 (> not >=)
+ },
+ threshold: 80,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionMemoryPressure, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ result := evaluateMemoryPressure(tt.stats, tt.threshold, tt.prev)
+ tt.validateFunc(result)
+ })
+ }
+}
+
+func (s *ConditionTestSuite) TestEvaluateHighLoad() {
+ tests := []struct {
+ name string
+ loadAvg *load.AverageStats
+ cpuCount int
+ multiplier float64
+ prev []job.Condition
+ validateFunc func(job.Condition)
+ }{
+ {
+ name: "when load above threshold returns true with reason",
+ loadAvg: &load.AverageStats{
+ Load1: 8.5,
+ Load5: 7.0,
+ Load15: 6.0,
+ },
+ cpuCount: 4,
+ multiplier: 2.0, // threshold = 8.0
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionHighLoad, c.Type)
+ s.True(c.Status)
+ s.Contains(c.Reason, "load 8.50")
+ s.Contains(c.Reason, "threshold 8.00")
+ s.Contains(c.Reason, "4 CPUs")
+ },
+ },
+ {
+ name: "when load below threshold returns false",
+ loadAvg: &load.AverageStats{
+ Load1: 2.0,
+ Load5: 1.5,
+ Load15: 1.0,
+ },
+ cpuCount: 4,
+ multiplier: 2.0, // threshold = 8.0
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionHighLoad, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when load is nil returns false",
+ loadAvg: nil,
+ cpuCount: 4,
+ multiplier: 2.0,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionHighLoad, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when cpu count is zero returns false",
+ loadAvg: &load.AverageStats{
+ Load1: 8.5,
+ Load5: 7.0,
+ Load15: 6.0,
+ },
+ cpuCount: 0,
+ multiplier: 2.0,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionHighLoad, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when load exactly at threshold returns false",
+ loadAvg: &load.AverageStats{
+ Load1: 8.0,
+ Load5: 5.0,
+ Load15: 3.0,
+ },
+ cpuCount: 4,
+ multiplier: 2.0, // threshold = 8.0, Load1 = 8.0 (not >)
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionHighLoad, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ result := evaluateHighLoad(tt.loadAvg, tt.cpuCount, tt.multiplier, tt.prev)
+ tt.validateFunc(result)
+ })
+ }
+}
+
+func (s *ConditionTestSuite) TestEvaluateDiskPressure() {
+ tests := []struct {
+ name string
+ disks []disk.UsageStats
+ threshold int
+ prev []job.Condition
+ validateFunc func(job.Condition)
+ }{
+ {
+ name: "when one disk above threshold returns true",
+ disks: []disk.UsageStats{
+ {
+ Name: "/dev/sda1",
+ Total: 100 * 1024 * 1024 * 1024, // 100 GB
+ Used: 95 * 1024 * 1024 * 1024, // 95 GB = 95%
+ Free: 5 * 1024 * 1024 * 1024,
+ },
+ },
+ threshold: 90,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionDiskPressure, c.Type)
+ s.True(c.Status)
+ s.Contains(c.Reason, "/dev/sda1")
+ s.Contains(c.Reason, "95%")
+ s.Contains(c.Reason, "GB")
+ },
+ },
+ {
+ name: "when all disks below threshold returns false",
+ disks: []disk.UsageStats{
+ {
+ Name: "/dev/sda1",
+ Total: 100 * 1024 * 1024 * 1024,
+ Used: 50 * 1024 * 1024 * 1024, // 50%
+ Free: 50 * 1024 * 1024 * 1024,
+ },
+ {
+ Name: "/dev/sdb1",
+ Total: 200 * 1024 * 1024 * 1024,
+ Used: 60 * 1024 * 1024 * 1024, // 30%
+ Free: 140 * 1024 * 1024 * 1024,
+ },
+ },
+ threshold: 90,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionDiskPressure, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when disks is nil returns false",
+ disks: nil,
+ threshold: 90,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionDiskPressure, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when disks is empty returns false",
+ disks: []disk.UsageStats{},
+ threshold: 90,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionDiskPressure, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when disk total is zero skips it",
+ disks: []disk.UsageStats{
+ {
+ Name: "/dev/sda1",
+ Total: 0,
+ Used: 0,
+ Free: 0,
+ },
+ },
+ threshold: 90,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionDiskPressure, c.Type)
+ s.False(c.Status)
+ s.Empty(c.Reason)
+ },
+ },
+ {
+ name: "when second disk is above threshold reports it",
+ disks: []disk.UsageStats{
+ {
+ Name: "/dev/sda1",
+ Total: 100 * 1024 * 1024 * 1024,
+ Used: 50 * 1024 * 1024 * 1024, // 50%
+ Free: 50 * 1024 * 1024 * 1024,
+ },
+ {
+ Name: "/dev/sdb1",
+ Total: 200 * 1024 * 1024 * 1024,
+ Used: 195 * 1024 * 1024 * 1024, // 97.5%
+ Free: 5 * 1024 * 1024 * 1024,
+ },
+ },
+ threshold: 90,
+ prev: nil,
+ validateFunc: func(c job.Condition) {
+ s.Equal(job.ConditionDiskPressure, c.Type)
+ s.True(c.Status)
+ s.Contains(c.Reason, "/dev/sdb1")
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ result := evaluateDiskPressure(tt.disks, tt.threshold, tt.prev)
+ tt.validateFunc(result)
+ })
+ }
+}
+
+func (s *ConditionTestSuite) TestLastTransitionTimeTracking() {
+ fixedPast := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC)
+
+ tests := []struct {
+ name string
+ evalFunc func([]job.Condition) job.Condition
+ prev []job.Condition
+ validateFunc func(job.Condition)
+ }{
+ {
+ name: "when status flips from false to true transition time updates",
+ evalFunc: func(prev []job.Condition) job.Condition {
+ return evaluateMemoryPressure(
+ &mem.Stats{
+ Total: 100,
+ Available: 10, // 90% used
+ },
+ 80,
+ prev,
+ )
+ },
+ prev: []job.Condition{
+ {
+ Type: job.ConditionMemoryPressure,
+ Status: false,
+ LastTransitionTime: fixedPast,
+ },
+ },
+ validateFunc: func(c job.Condition) {
+ s.True(c.Status)
+ s.NotEqual(fixedPast, c.LastTransitionTime)
+ s.WithinDuration(time.Now(), c.LastTransitionTime, 2*time.Second)
+ },
+ },
+ {
+ name: "when status stays true transition time is preserved",
+ evalFunc: func(prev []job.Condition) job.Condition {
+ return evaluateMemoryPressure(
+ &mem.Stats{
+ Total: 100,
+ Available: 10, // 90% used
+ },
+ 80,
+ prev,
+ )
+ },
+ prev: []job.Condition{
+ {
+ Type: job.ConditionMemoryPressure,
+ Status: true,
+ LastTransitionTime: fixedPast,
+ },
+ },
+ validateFunc: func(c job.Condition) {
+ s.True(c.Status)
+ s.Equal(fixedPast, c.LastTransitionTime)
+ },
+ },
+ {
+ name: "when status flips from true to false transition time updates",
+ evalFunc: func(prev []job.Condition) job.Condition {
+ return evaluateMemoryPressure(
+ &mem.Stats{
+ Total: 100,
+ Available: 80, // 20% used
+ },
+ 80,
+ prev,
+ )
+ },
+ prev: []job.Condition{
+ {
+ Type: job.ConditionMemoryPressure,
+ Status: true,
+ LastTransitionTime: fixedPast,
+ },
+ },
+ validateFunc: func(c job.Condition) {
+ s.False(c.Status)
+ s.NotEqual(fixedPast, c.LastTransitionTime)
+ s.WithinDuration(time.Now(), c.LastTransitionTime, 2*time.Second)
+ },
+ },
+ {
+ name: "when status stays false transition time is preserved",
+ evalFunc: func(prev []job.Condition) job.Condition {
+ return evaluateMemoryPressure(
+ &mem.Stats{
+ Total: 100,
+ Available: 80, // 20% used
+ },
+ 80,
+ prev,
+ )
+ },
+ prev: []job.Condition{
+ {
+ Type: job.ConditionMemoryPressure,
+ Status: false,
+ LastTransitionTime: fixedPast,
+ },
+ },
+ validateFunc: func(c job.Condition) {
+ s.False(c.Status)
+ s.Equal(fixedPast, c.LastTransitionTime)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ result := tt.evalFunc(tt.prev)
+ tt.validateFunc(result)
+ })
+ }
+}
+
+func TestConditionTestSuite(t *testing.T) {
+ suite.Run(t, new(ConditionTestSuite))
+}
diff --git a/internal/agent/consumer.go b/internal/agent/consumer.go
index 532b1c72..03b95168 100644
--- a/internal/agent/consumer.go
+++ b/internal/agent/consumer.go
@@ -102,14 +102,14 @@ func (a *Agent) consumeQueryJobs(
continue
}
- a.wg.Add(1)
+ a.consumerWg.Add(1)
go func(c struct {
name string
filter string
queueGroup string
},
) {
- defer a.wg.Done()
+ defer a.consumerWg.Done()
opts := &natsclient.ConsumeOptions{
QueueGroup: c.queueGroup,
@@ -194,14 +194,14 @@ func (a *Agent) consumeModifyJobs(
continue
}
- a.wg.Add(1)
+ a.consumerWg.Add(1)
go func(c struct {
name string
filter string
queueGroup string
},
) {
- defer a.wg.Done()
+ defer a.consumerWg.Done()
opts := &natsclient.ConsumeOptions{
QueueGroup: c.queueGroup,
@@ -222,6 +222,21 @@ func (a *Agent) consumeModifyJobs(
return nil
}
+// startConsumers creates a consumer context (derived from a.ctx) and starts all job consumers; setup errors from the consume calls are discarded — NOTE(review): confirm this is intentional.
+func (a *Agent) startConsumers() {
+ a.consumerCtx, a.consumerCancel = context.WithCancel(a.ctx)
+ _ = a.consumeQueryJobs(a.consumerCtx, a.hostname)
+ _ = a.consumeModifyJobs(a.consumerCtx, a.hostname)
+}
+
+// stopConsumers cancels the consumer context and waits for all consumer
+// goroutines to finish. After this returns, the agent is no longer
+// receiving new jobs.
+func (a *Agent) stopConsumers() {
+ a.consumerCancel()
+ a.consumerWg.Wait()
+}
+
// handleJobMessageJS wraps the existing handleJobMessage for JetStream compatibility.
func (a *Agent) handleJobMessageJS(
msg jetstream.Msg,
diff --git a/internal/agent/drain.go b/internal/agent/drain.go
new file mode 100644
index 00000000..47105637
--- /dev/null
+++ b/internal/agent/drain.go
@@ -0,0 +1,70 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package agent
+
+import (
+ "context"
+
+ "github.com/retr0h/osapi/internal/job"
+)
+
+// checkDrainFlag checks the drain flag via the job client. Drain flags
+// are stored in the main KV bucket (longer TTL than registry).
+func (a *Agent) checkDrainFlag(
+ ctx context.Context,
+ hostname string,
+) bool {
+ return a.jobClient.CheckDrainFlag(ctx, hostname)
+}
+
+// handleDrainDetection checks the drain flag on each heartbeat tick.
+// When drain is requested and the agent is Ready, it stops all consumers
+// (waiting for in-flight jobs to finish) and transitions to Cordoned.
+// When the drain flag is removed and the agent is Draining or Cordoned,
+// it restarts the consumers and transitions back to Ready.
+func (a *Agent) handleDrainDetection(
+ ctx context.Context,
+ hostname string,
+) {
+ drainRequested := a.checkDrainFlag(ctx, hostname)
+
+ switch {
+ case drainRequested && a.state == job.AgentStateReady:
+ a.logger.Info("drain detected, stopping job consumption")
+ a.stopConsumers()
+ a.state = job.AgentStateCordoned
+ a.logger.Info("all consumers stopped, agent cordoned")
+ _ = a.jobClient.WriteAgentTimelineEvent(
+ ctx, hostname, "drain", "Drain initiated",
+ )
+ _ = a.jobClient.WriteAgentTimelineEvent(
+ ctx, hostname, "cordoned", "All jobs completed",
+ )
+
+ case !drainRequested && (a.state == job.AgentStateDraining || a.state == job.AgentStateCordoned):
+ a.logger.Info("undrain detected, resuming job consumption")
+ a.startConsumers()
+ a.state = job.AgentStateReady
+ _ = a.jobClient.WriteAgentTimelineEvent(
+ ctx, hostname, "undrain", "Resumed accepting jobs",
+ )
+ }
+}
diff --git a/internal/agent/drain_test.go b/internal/agent/drain_test.go
new file mode 100644
index 00000000..4d746a4f
--- /dev/null
+++ b/internal/agent/drain_test.go
@@ -0,0 +1,253 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package agent
+
+import (
+ "context"
+ "log/slog"
+ "testing"
+
+ "github.com/golang/mock/gomock"
+ "github.com/spf13/afero"
+ "github.com/stretchr/testify/suite"
+
+ "github.com/retr0h/osapi/internal/config"
+ "github.com/retr0h/osapi/internal/job"
+ "github.com/retr0h/osapi/internal/job/mocks"
+ commandMocks "github.com/retr0h/osapi/internal/provider/command/mocks"
+ dnsMocks "github.com/retr0h/osapi/internal/provider/network/dns/mocks"
+ netinfoMocks "github.com/retr0h/osapi/internal/provider/network/netinfo/mocks"
+ pingMocks "github.com/retr0h/osapi/internal/provider/network/ping/mocks"
+ diskMocks "github.com/retr0h/osapi/internal/provider/node/disk/mocks"
+ hostMocks "github.com/retr0h/osapi/internal/provider/node/host/mocks"
+ loadMocks "github.com/retr0h/osapi/internal/provider/node/load/mocks"
+ memMocks "github.com/retr0h/osapi/internal/provider/node/mem/mocks"
+)
+
+type DrainTestSuite struct {
+ suite.Suite
+
+ mockCtrl *gomock.Controller
+ mockJobClient *mocks.MockJobClient
+ mockKV *mocks.MockKeyValue
+ mockEntry *mocks.MockKeyValueEntry
+ agent *Agent
+}
+
+func (s *DrainTestSuite) SetupTest() {
+ s.mockCtrl = gomock.NewController(s.T())
+ s.mockJobClient = mocks.NewMockJobClient(s.mockCtrl)
+ s.mockKV = mocks.NewMockKeyValue(s.mockCtrl)
+ s.mockEntry = mocks.NewMockKeyValueEntry(s.mockCtrl)
+
+ appConfig := config.Config{
+ Agent: config.AgentConfig{
+ Labels: map[string]string{"group": "web"},
+ },
+ }
+
+ s.agent = New(
+ afero.NewMemMapFs(),
+ appConfig,
+ slog.Default(),
+ s.mockJobClient,
+ "test-stream",
+ hostMocks.NewDefaultMockProvider(s.mockCtrl),
+ diskMocks.NewDefaultMockProvider(s.mockCtrl),
+ memMocks.NewDefaultMockProvider(s.mockCtrl),
+ loadMocks.NewDefaultMockProvider(s.mockCtrl),
+ dnsMocks.NewDefaultMockProvider(s.mockCtrl),
+ pingMocks.NewDefaultMockProvider(s.mockCtrl),
+ netinfoMocks.NewDefaultMockProvider(s.mockCtrl),
+ commandMocks.NewDefaultMockProvider(s.mockCtrl),
+ s.mockKV,
+ nil,
+ )
+ s.agent.state = job.AgentStateReady
+ s.agent.ctx, s.agent.cancel = context.WithCancel(context.Background())
+ s.agent.consumerCtx, s.agent.consumerCancel = context.WithCancel(s.agent.ctx)
+}
+
+func (s *DrainTestSuite) TearDownTest() {
+ s.mockCtrl.Finish()
+}
+
+func (s *DrainTestSuite) TestCheckDrainFlag() {
+ tests := []struct {
+ name string
+ setupMock func()
+ validateFunc func(bool)
+ }{
+ {
+ name: "when drain key exists returns true",
+ setupMock: func() {
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(true)
+ },
+ validateFunc: func(result bool) {
+ s.True(result)
+ },
+ },
+ {
+ name: "when drain key missing returns false",
+ setupMock: func() {
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(false)
+ },
+ validateFunc: func(result bool) {
+ s.False(result)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ tt.setupMock()
+ result := s.agent.checkDrainFlag(context.Background(), "test-agent")
+ tt.validateFunc(result)
+ })
+ }
+}
+
+func (s *DrainTestSuite) TestHandleDrainDetection() {
+ tests := []struct {
+ name string
+ initialState string
+ setupMock func()
+ expectedState string
+ }{
+ {
+ name: "when drain flag set and agent is Ready transitions to Cordoned",
+ initialState: job.AgentStateReady,
+ setupMock: func() {
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(true)
+ s.mockJobClient.EXPECT().
+ WriteAgentTimelineEvent(
+ gomock.Any(),
+ "test-agent",
+ "drain",
+ "Drain initiated",
+ ).
+ Return(nil)
+ s.mockJobClient.EXPECT().
+ WriteAgentTimelineEvent(
+ gomock.Any(),
+ "test-agent",
+ "cordoned",
+ "All jobs completed",
+ ).
+ Return(nil)
+ },
+ expectedState: job.AgentStateCordoned,
+ },
+ {
+ name: "when drain flag removed and agent is Draining transitions to Ready",
+ initialState: job.AgentStateDraining,
+ setupMock: func() {
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(false)
+ s.mockJobClient.EXPECT().
+ WriteAgentTimelineEvent(
+ gomock.Any(),
+ "test-agent",
+ "undrain",
+ "Resumed accepting jobs",
+ ).
+ Return(nil)
+ // startConsumers re-creates consumers
+ s.mockJobClient.EXPECT().
+ CreateOrUpdateConsumer(gomock.Any(), gomock.Any(), gomock.Any()).
+ Return(nil).
+ AnyTimes()
+ s.mockJobClient.EXPECT().
+ ConsumeJobs(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
+ Return(context.Canceled).
+ AnyTimes()
+ },
+ expectedState: job.AgentStateReady,
+ },
+ {
+ name: "when drain flag removed and agent is Cordoned transitions to Ready",
+ initialState: job.AgentStateCordoned,
+ setupMock: func() {
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(false)
+ s.mockJobClient.EXPECT().
+ WriteAgentTimelineEvent(
+ gomock.Any(),
+ "test-agent",
+ "undrain",
+ "Resumed accepting jobs",
+ ).
+ Return(nil)
+ // startConsumers re-creates consumers
+ s.mockJobClient.EXPECT().
+ CreateOrUpdateConsumer(gomock.Any(), gomock.Any(), gomock.Any()).
+ Return(nil).
+ AnyTimes()
+ s.mockJobClient.EXPECT().
+ ConsumeJobs(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).
+ Return(context.Canceled).
+ AnyTimes()
+ },
+ expectedState: job.AgentStateReady,
+ },
+ {
+ name: "when drain flag still set and agent is already Draining stays Draining",
+ initialState: job.AgentStateDraining,
+ setupMock: func() {
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(true)
+ },
+ expectedState: job.AgentStateDraining,
+ },
+ {
+ name: "when no drain flag and agent is Ready stays Ready",
+ initialState: job.AgentStateReady,
+ setupMock: func() {
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(false)
+ },
+ expectedState: job.AgentStateReady,
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ s.agent.state = tt.initialState
+ tt.setupMock()
+ s.agent.handleDrainDetection(context.Background(), "test-agent")
+ s.Equal(tt.expectedState, s.agent.state)
+ })
+ }
+}
+
+func TestDrainTestSuite(t *testing.T) {
+ suite.Run(t, new(DrainTestSuite))
+}
diff --git a/internal/agent/facts.go b/internal/agent/facts.go
index d0642479..cac82974 100644
--- a/internal/agent/facts.go
+++ b/internal/agent/facts.go
@@ -85,6 +85,7 @@ func (a *Agent) writeFacts(
if count, err := a.hostProvider.GetCPUCount(); err == nil {
reg.CPUCount = count
+ a.cpuCount = count
}
if mgr, err := a.hostProvider.GetServiceManager(); err == nil {
diff --git a/internal/agent/heartbeat.go b/internal/agent/heartbeat.go
index f469e35e..959d7814 100644
--- a/internal/agent/heartbeat.go
+++ b/internal/agent/heartbeat.go
@@ -27,6 +27,9 @@ import (
"time"
"github.com/retr0h/osapi/internal/job"
+ "github.com/retr0h/osapi/internal/provider/node/disk"
+ "github.com/retr0h/osapi/internal/provider/node/load"
+ "github.com/retr0h/osapi/internal/provider/node/mem"
)
// heartbeatInterval is the interval between heartbeat refreshes.
@@ -89,11 +92,14 @@ func (a *Agent) writeRegistration(
ctx context.Context,
hostname string,
) {
+ a.handleDrainDetection(ctx, hostname)
+
reg := job.AgentRegistration{
Hostname: hostname,
Labels: a.appConfig.Agent.Labels,
RegisteredAt: time.Now(),
StartedAt: a.startedAt,
+ State: a.state,
}
if info, err := a.hostProvider.GetOSInfo(); err == nil {
@@ -104,14 +110,44 @@ func (a *Agent) writeRegistration(
reg.Uptime = uptime
}
+ var loadAvg *load.AverageStats
if avg, err := a.loadProvider.GetAverageStats(); err == nil {
+ loadAvg = avg
reg.LoadAverages = avg
}
+ var memStats *mem.Stats
if stats, err := a.memProvider.GetStats(); err == nil {
+ memStats = stats
reg.MemoryStats = stats
}
+ var diskStats []disk.UsageStats
+ if stats, err := a.diskProvider.GetLocalUsageStats(); err == nil {
+ diskStats = stats
+ }
+
+ conditions := []job.Condition{
+ evaluateMemoryPressure(
+ memStats,
+ a.appConfig.Agent.Conditions.MemoryPressureThreshold,
+ a.prevConditions,
+ ),
+ evaluateHighLoad(
+ loadAvg,
+ a.cpuCount,
+ a.appConfig.Agent.Conditions.HighLoadMultiplier,
+ a.prevConditions,
+ ),
+ evaluateDiskPressure(
+ diskStats,
+ a.appConfig.Agent.Conditions.DiskPressureThreshold,
+ a.prevConditions,
+ ),
+ }
+ a.prevConditions = conditions
+ reg.Conditions = conditions
+
data, err := marshalJSON(reg)
if err != nil {
a.logger.Warn(
diff --git a/internal/agent/heartbeat_public_test.go b/internal/agent/heartbeat_public_test.go
index d607d56f..93db8d22 100644
--- a/internal/agent/heartbeat_public_test.go
+++ b/internal/agent/heartbeat_public_test.go
@@ -100,6 +100,12 @@ func (s *HeartbeatPublicTestSuite) TestStartWithHeartbeat() {
{
name: "when registryKV is set registers and deregisters",
setupFunc: func() *agent.Agent {
+ // Drain check on each heartbeat tick (no drain flag present)
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(false).
+ AnyTimes()
+
// Heartbeat initial write
s.mockKV.EXPECT().
Put(gomock.Any(), "agents.test_agent", gomock.Any()).
diff --git a/internal/agent/heartbeat_test.go b/internal/agent/heartbeat_test.go
index eacc6ef5..fb64492f 100644
--- a/internal/agent/heartbeat_test.go
+++ b/internal/agent/heartbeat_test.go
@@ -34,6 +34,7 @@ import (
"github.com/stretchr/testify/suite"
"github.com/retr0h/osapi/internal/config"
+ "github.com/retr0h/osapi/internal/job"
"github.com/retr0h/osapi/internal/job/mocks"
commandMocks "github.com/retr0h/osapi/internal/provider/command/mocks"
dnsMocks "github.com/retr0h/osapi/internal/provider/network/dns/mocks"
@@ -83,6 +84,14 @@ func (s *HeartbeatTestSuite) SetupTest() {
s.mockKV,
nil,
)
+ s.agent.state = job.AgentStateReady
+
+ // writeRegistration now calls handleDrainDetection which checks drain flag.
+ // Default: no drain flag present.
+ s.mockJobClient.EXPECT().
+ CheckDrainFlag(gomock.Any(), "test-agent").
+ Return(false).
+ AnyTimes()
}
func (s *HeartbeatTestSuite) TearDownTest() {
diff --git a/internal/agent/server.go b/internal/agent/server.go
index c1c0ec18..396b75b8 100644
--- a/internal/agent/server.go
+++ b/internal/agent/server.go
@@ -32,30 +32,29 @@ import (
func (a *Agent) Start() {
a.ctx, a.cancel = context.WithCancel(context.Background())
a.startedAt = time.Now()
+ a.state = job.AgentStateReady
a.logger.Info("starting node agent")
// Determine agent hostname (GetAgentHostname always succeeds)
- hostname, _ := job.GetAgentHostname(a.appConfig.Agent.Hostname)
+ a.hostname, _ = job.GetAgentHostname(a.appConfig.Agent.Hostname)
a.logger.Info(
"agent configuration",
- slog.String("hostname", hostname),
+ slog.String("hostname", a.hostname),
slog.String("queue_group", a.appConfig.Agent.QueueGroup),
slog.Int("max_jobs", a.appConfig.Agent.MaxJobs),
slog.Any("labels", a.appConfig.Agent.Labels),
)
// Register in agent registry and start heartbeat keepalive.
- a.startHeartbeat(a.ctx, hostname)
+ a.startHeartbeat(a.ctx, a.hostname)
// Collect and publish system facts.
- a.startFacts(a.ctx, hostname)
+ a.startFacts(a.ctx, a.hostname)
// Start consuming messages for different job types.
- // Each consume function spawns goroutines tracked by a.wg.
- _ = a.consumeQueryJobs(a.ctx, hostname)
- _ = a.consumeModifyJobs(a.ctx, hostname)
+ a.startConsumers()
a.logger.Info("node agent started successfully")
}
@@ -70,6 +69,7 @@ func (a *Agent) Stop(
done := make(chan struct{})
go func() {
+ a.consumerWg.Wait()
a.wg.Wait()
close(done)
}()
diff --git a/internal/agent/types.go b/internal/agent/types.go
index 5e97581c..e3b31e01 100644
--- a/internal/agent/types.go
+++ b/internal/agent/types.go
@@ -30,6 +30,7 @@ import (
"github.com/spf13/afero"
"github.com/retr0h/osapi/internal/config"
+ "github.com/retr0h/osapi/internal/job"
"github.com/retr0h/osapi/internal/job/client"
"github.com/retr0h/osapi/internal/provider/command"
"github.com/retr0h/osapi/internal/provider/network/dns"
@@ -74,10 +75,27 @@ type Agent struct {
// startedAt records when the agent process started.
startedAt time.Time
+ // prevConditions tracks condition state between heartbeats.
+ prevConditions []job.Condition
+
+ // cpuCount cached from facts for HighLoad evaluation.
+ cpuCount int
+
+ // state is the agent's scheduling state (Ready, Draining, Cordoned).
+ state string
+
+ // hostname cached from Start for drain/undrain resubscribe.
+ hostname string
+
// Lifecycle management
ctx context.Context
cancel context.CancelFunc
wg sync.WaitGroup
+
+ // Consumer lifecycle for drain/undrain.
+ consumerCtx context.Context
+ consumerCancel context.CancelFunc
+ consumerWg sync.WaitGroup
}
// JobContext contains the context and data for a single job execution.
diff --git a/internal/api/agent/agent_drain.go b/internal/api/agent/agent_drain.go
new file mode 100644
index 00000000..ee3c1276
--- /dev/null
+++ b/internal/api/agent/agent_drain.go
@@ -0,0 +1,68 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package agent
+
+import (
+ "context"
+ "fmt"
+ "strings"
+
+ "github.com/retr0h/osapi/internal/api/agent/gen"
+ "github.com/retr0h/osapi/internal/job"
+)
+
+// DrainAgent handles POST /agent/{hostname}/drain.
+func (a *Agent) DrainAgent(
+ ctx context.Context,
+ request gen.DrainAgentRequestObject,
+) (gen.DrainAgentResponseObject, error) {
+ hostname := request.Hostname
+
+ agentInfo, err := a.JobClient.GetAgent(ctx, hostname)
+ if err != nil {
+ errMsg := fmt.Sprintf("agent not found: %s", hostname)
+ return gen.DrainAgent404JSONResponse{Error: &errMsg}, nil
+ }
+
+ if agentInfo.State == job.AgentStateDraining || agentInfo.State == job.AgentStateCordoned {
+ errMsg := fmt.Sprintf("agent %s is already in %s state", hostname, agentInfo.State)
+ return gen.DrainAgent409JSONResponse{Error: &errMsg}, nil
+ }
+
+ if err := a.JobClient.SetDrainFlag(ctx, hostname); err != nil {
+ errMsg := fmt.Sprintf("failed to set drain flag: %s", err.Error())
+ return gen.DrainAgent409JSONResponse{Error: &errMsg}, nil
+ }
+
+ if err := a.JobClient.WriteAgentTimelineEvent(ctx, hostname, "drain", "Drain initiated via API"); err != nil {
+ if strings.Contains(err.Error(), "not found") {
+ errMsg := fmt.Sprintf("agent not found: %s", hostname)
+ return gen.DrainAgent404JSONResponse{Error: &errMsg}, nil
+ }
+
+ errMsg := err.Error()
+ return gen.DrainAgent409JSONResponse{Error: &errMsg}, nil
+ }
+
+ msg := fmt.Sprintf("drain initiated for agent %s", hostname)
+
+ return gen.DrainAgent200JSONResponse{Message: msg}, nil
+}
diff --git a/internal/api/agent/agent_drain_public_test.go b/internal/api/agent/agent_drain_public_test.go
new file mode 100644
index 00000000..91c99de5
--- /dev/null
+++ b/internal/api/agent/agent_drain_public_test.go
@@ -0,0 +1,358 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package agent_test
+
+import (
+ "context"
+ "fmt"
+ "log/slog"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "testing"
+
+ "github.com/golang/mock/gomock"
+ "github.com/stretchr/testify/suite"
+
+ "github.com/retr0h/osapi/internal/api"
+ apiagent "github.com/retr0h/osapi/internal/api/agent"
+ "github.com/retr0h/osapi/internal/api/agent/gen"
+ "github.com/retr0h/osapi/internal/authtoken"
+ "github.com/retr0h/osapi/internal/config"
+ jobtypes "github.com/retr0h/osapi/internal/job"
+ jobmocks "github.com/retr0h/osapi/internal/job/mocks"
+)
+
+type AgentDrainPublicTestSuite struct {
+ suite.Suite
+
+ mockCtrl *gomock.Controller
+ mockJobClient *jobmocks.MockJobClient
+ handler *apiagent.Agent
+ ctx context.Context
+ appConfig config.Config
+ logger *slog.Logger
+}
+
+func (s *AgentDrainPublicTestSuite) SetupTest() {
+ s.mockCtrl = gomock.NewController(s.T())
+ s.mockJobClient = jobmocks.NewMockJobClient(s.mockCtrl)
+ s.handler = apiagent.New(slog.Default(), s.mockJobClient)
+ s.ctx = context.Background()
+ s.appConfig = config.Config{}
+ s.logger = slog.New(slog.NewTextHandler(os.Stdout, nil))
+}
+
+func (s *AgentDrainPublicTestSuite) TearDownTest() {
+ s.mockCtrl.Finish()
+}
+
+func (s *AgentDrainPublicTestSuite) TestDrainAgent() {
+ tests := []struct {
+ name string
+ hostname string
+ mockAgent *jobtypes.AgentInfo
+ mockGetErr error
+ mockWriteErr error
+ skipWrite bool
+ mockSetDrain bool
+ validateFunc func(resp gen.DrainAgentResponseObject)
+ }{
+ {
+ name: "success drains agent",
+ hostname: "server1",
+ mockAgent: &jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateReady,
+ },
+ mockSetDrain: true,
+ validateFunc: func(resp gen.DrainAgentResponseObject) {
+ r, ok := resp.(gen.DrainAgent200JSONResponse)
+ s.True(ok)
+ s.Contains(r.Message, "drain initiated for agent server1")
+ },
+ },
+ {
+ name: "agent not found returns 404",
+ hostname: "unknown",
+ mockGetErr: fmt.Errorf("agent not found: unknown"),
+ skipWrite: true,
+ validateFunc: func(resp gen.DrainAgentResponseObject) {
+ _, ok := resp.(gen.DrainAgent404JSONResponse)
+ s.True(ok)
+ },
+ },
+ {
+ name: "agent already draining returns 409",
+ hostname: "server1",
+ mockAgent: &jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateDraining,
+ },
+ skipWrite: true,
+ validateFunc: func(resp gen.DrainAgentResponseObject) {
+ _, ok := resp.(gen.DrainAgent409JSONResponse)
+ s.True(ok)
+ },
+ },
+ {
+ name: "agent already cordoned returns 409",
+ hostname: "server1",
+ mockAgent: &jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateCordoned,
+ },
+ skipWrite: true,
+ validateFunc: func(resp gen.DrainAgentResponseObject) {
+ _, ok := resp.(gen.DrainAgent409JSONResponse)
+ s.True(ok)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ s.mockJobClient.EXPECT().
+ GetAgent(gomock.Any(), tt.hostname).
+ Return(tt.mockAgent, tt.mockGetErr)
+
+ if tt.mockSetDrain {
+ s.mockJobClient.EXPECT().
+ SetDrainFlag(gomock.Any(), tt.hostname).
+ Return(nil)
+ }
+
+ if !tt.skipWrite {
+ s.mockJobClient.EXPECT().
+ WriteAgentTimelineEvent(gomock.Any(), tt.hostname, "drain", "Drain initiated via API").
+ Return(tt.mockWriteErr)
+ }
+
+ resp, err := s.handler.DrainAgent(s.ctx, gen.DrainAgentRequestObject{
+ Hostname: tt.hostname,
+ })
+ s.NoError(err)
+ tt.validateFunc(resp)
+ })
+ }
+}
+
+func (s *AgentDrainPublicTestSuite) TestDrainAgentValidationHTTP() {
+ tests := []struct {
+ name string
+ hostname string
+ setupJobMock func() *jobmocks.MockJobClient
+ wantCode int
+ wantContains []string
+ }{
+ {
+ name: "when agent exists returns 200",
+ hostname: "server1",
+ setupJobMock: func() *jobmocks.MockJobClient {
+ mock := jobmocks.NewMockJobClient(s.mockCtrl)
+ mock.EXPECT().
+ GetAgent(gomock.Any(), "server1").
+ Return(&jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateReady,
+ }, nil)
+ mock.EXPECT().
+ SetDrainFlag(gomock.Any(), "server1").
+ Return(nil)
+ mock.EXPECT().
+ WriteAgentTimelineEvent(gomock.Any(), "server1", "drain", "Drain initiated via API").
+ Return(nil)
+ return mock
+ },
+ wantCode: http.StatusOK,
+ wantContains: []string{`"message"`, `drain initiated`},
+ },
+ {
+ name: "when agent not found returns 404",
+ hostname: "unknown",
+ setupJobMock: func() *jobmocks.MockJobClient {
+ mock := jobmocks.NewMockJobClient(s.mockCtrl)
+ mock.EXPECT().
+ GetAgent(gomock.Any(), "unknown").
+ Return(nil, fmt.Errorf("agent not found: unknown"))
+ return mock
+ },
+ wantCode: http.StatusNotFound,
+ wantContains: []string{`"error"`},
+ },
+ {
+ name: "when agent already draining returns 409",
+ hostname: "server1",
+ setupJobMock: func() *jobmocks.MockJobClient {
+ mock := jobmocks.NewMockJobClient(s.mockCtrl)
+ mock.EXPECT().
+ GetAgent(gomock.Any(), "server1").
+ Return(&jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateDraining,
+ }, nil)
+ return mock
+ },
+ wantCode: http.StatusConflict,
+ wantContains: []string{`"error"`, `already in Draining`},
+ },
+ }
+
+ for _, tc := range tests {
+ s.Run(tc.name, func() {
+ jobMock := tc.setupJobMock()
+
+ agentHandler := apiagent.New(s.logger, jobMock)
+ strictHandler := gen.NewStrictHandler(agentHandler, nil)
+
+ a := api.New(s.appConfig, s.logger)
+ gen.RegisterHandlers(a.Echo, strictHandler)
+
+ req := httptest.NewRequest(
+ http.MethodPost,
+ fmt.Sprintf("/agent/%s/drain", tc.hostname),
+ nil,
+ )
+ rec := httptest.NewRecorder()
+
+ a.Echo.ServeHTTP(rec, req)
+
+ s.Equal(tc.wantCode, rec.Code)
+ for _, str := range tc.wantContains {
+ s.Contains(rec.Body.String(), str)
+ }
+ })
+ }
+}
+
+const rbacAgentDrainTestSigningKey = "test-signing-key-for-rbac-agent-drain"
+
+func (s *AgentDrainPublicTestSuite) TestDrainAgentRBACHTTP() {
+ tokenManager := authtoken.New(s.logger)
+
+ tests := []struct {
+ name string
+ setupAuth func(req *http.Request)
+ setupJobMock func() *jobmocks.MockJobClient
+ wantCode int
+ wantContains []string
+ }{
+ {
+ name: "when no token returns 401",
+ setupAuth: func(_ *http.Request) {
+ // No auth header set
+ },
+ setupJobMock: func() *jobmocks.MockJobClient {
+ return jobmocks.NewMockJobClient(s.mockCtrl)
+ },
+ wantCode: http.StatusUnauthorized,
+ wantContains: []string{"Bearer token required"},
+ },
+ {
+ name: "when insufficient permissions returns 403",
+ setupAuth: func(req *http.Request) {
+ token, err := tokenManager.Generate(
+ rbacAgentDrainTestSigningKey,
+ []string{"read"},
+ "test-user",
+ []string{"agent:read"},
+ )
+ s.Require().NoError(err)
+ req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token))
+ },
+ setupJobMock: func() *jobmocks.MockJobClient {
+ return jobmocks.NewMockJobClient(s.mockCtrl)
+ },
+ wantCode: http.StatusForbidden,
+ wantContains: []string{"Insufficient permissions"},
+ },
+ {
+ name: "when valid token with agent:write returns 200",
+ setupAuth: func(req *http.Request) {
+ token, err := tokenManager.Generate(
+ rbacAgentDrainTestSigningKey,
+ []string{"admin"},
+ "test-user",
+ nil,
+ )
+ s.Require().NoError(err)
+ req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token))
+ },
+ setupJobMock: func() *jobmocks.MockJobClient {
+ mock := jobmocks.NewMockJobClient(s.mockCtrl)
+ mock.EXPECT().
+ GetAgent(gomock.Any(), "server1").
+ Return(&jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateReady,
+ }, nil)
+ mock.EXPECT().
+ SetDrainFlag(gomock.Any(), "server1").
+ Return(nil)
+ mock.EXPECT().
+ WriteAgentTimelineEvent(gomock.Any(), "server1", "drain", "Drain initiated via API").
+ Return(nil)
+ return mock
+ },
+ wantCode: http.StatusOK,
+ wantContains: []string{`"message"`, `drain initiated`},
+ },
+ }
+
+ for _, tc := range tests {
+ s.Run(tc.name, func() {
+ jobMock := tc.setupJobMock()
+
+ appConfig := config.Config{
+ API: config.API{
+ Server: config.Server{
+ Security: config.ServerSecurity{
+ SigningKey: rbacAgentDrainTestSigningKey,
+ },
+ },
+ },
+ }
+
+ server := api.New(appConfig, s.logger)
+ handlers := server.GetAgentHandler(jobMock)
+ server.RegisterHandlers(handlers)
+
+ req := httptest.NewRequest(
+ http.MethodPost,
+ "/agent/server1/drain",
+ nil,
+ )
+ tc.setupAuth(req)
+ rec := httptest.NewRecorder()
+
+ server.Echo.ServeHTTP(rec, req)
+
+ s.Equal(tc.wantCode, rec.Code)
+ for _, str := range tc.wantContains {
+ s.Contains(rec.Body.String(), str)
+ }
+ })
+ }
+}
+
+func TestAgentDrainPublicTestSuite(t *testing.T) {
+ suite.Run(t, new(AgentDrainPublicTestSuite))
+}
diff --git a/internal/api/agent/agent_get_public_test.go b/internal/api/agent/agent_get_public_test.go
index 411d2991..635958b3 100644
--- a/internal/api/agent/agent_get_public_test.go
+++ b/internal/api/agent/agent_get_public_test.go
@@ -94,7 +94,7 @@ func (s *AgentGetPublicTestSuite) TestGetAgentDetails() {
r, ok := resp.(gen.GetAgentDetails200JSONResponse)
s.True(ok)
s.Equal("server1", r.Hostname)
- s.Equal(gen.Ready, r.Status)
+ s.Equal(gen.AgentInfoStatusReady, r.Status)
s.NotNil(r.Labels)
s.NotNil(r.OsInfo)
s.Equal("Ubuntu", r.OsInfo.Distribution)
diff --git a/internal/api/agent/agent_list.go b/internal/api/agent/agent_list.go
index b2628d53..02e0b44f 100644
--- a/internal/api/agent/agent_list.go
+++ b/internal/api/agent/agent_list.go
@@ -59,7 +59,7 @@ func (a *Agent) GetAgent(
func buildAgentInfo(
a *job.AgentInfo,
) gen.AgentInfo {
- status := gen.Ready
+ status := gen.AgentInfoStatusReady
info := gen.AgentInfo{
Hostname: a.Hostname,
Status: status,
@@ -167,6 +167,50 @@ func buildAgentInfo(
info.Facts = &facts
}
+ if a.State != "" {
+ state := gen.AgentInfoState(a.State)
+ info.State = &state
+ }
+
+ if len(a.Conditions) > 0 {
+ conditions := make([]gen.NodeCondition, len(a.Conditions))
+ for i, c := range a.Conditions {
+ conditions[i] = gen.NodeCondition{
+ Type: gen.NodeConditionType(c.Type),
+ Status: c.Status,
+ LastTransitionTime: c.LastTransitionTime,
+ }
+ if c.Reason != "" {
+ reason := c.Reason
+ conditions[i].Reason = &reason
+ }
+ }
+ info.Conditions = &conditions
+ }
+
+ if len(a.Timeline) > 0 {
+ timeline := make([]gen.TimelineEvent, len(a.Timeline))
+ for i, te := range a.Timeline {
+ timeline[i] = gen.TimelineEvent{
+ Timestamp: te.Timestamp,
+ Event: te.Event,
+ }
+ if te.Hostname != "" {
+ hostname := te.Hostname
+ timeline[i].Hostname = &hostname
+ }
+ if te.Message != "" {
+ message := te.Message
+ timeline[i].Message = &message
+ }
+ if te.Error != "" {
+ errStr := te.Error
+ timeline[i].Error = &errStr
+ }
+ }
+ info.Timeline = &timeline
+ }
+
return info
}
diff --git a/internal/api/agent/agent_list_public_test.go b/internal/api/agent/agent_list_public_test.go
index 38f43a21..78b641d0 100644
--- a/internal/api/agent/agent_list_public_test.go
+++ b/internal/api/agent/agent_list_public_test.go
@@ -98,7 +98,7 @@ func (s *AgentListPublicTestSuite) TestGetAgent() {
s.Equal(2, r.Total)
s.Len(r.Agents, 2)
s.Equal("server1", r.Agents[0].Hostname)
- s.Equal(gen.Ready, r.Agents[0].Status)
+ s.Equal(gen.AgentInfoStatusReady, r.Agents[0].Status)
s.NotNil(r.Agents[0].Labels)
s.NotNil(r.Agents[0].RegisteredAt)
s.NotNil(r.Agents[0].StartedAt)
@@ -108,7 +108,7 @@ func (s *AgentListPublicTestSuite) TestGetAgent() {
s.NotNil(r.Agents[0].Memory)
s.NotNil(r.Agents[0].Uptime)
s.Equal("server2", r.Agents[1].Hostname)
- s.Equal(gen.Ready, r.Agents[1].Status)
+ s.Equal(gen.AgentInfoStatusReady, r.Agents[1].Status)
},
},
{
diff --git a/internal/api/agent/agent_undrain.go b/internal/api/agent/agent_undrain.go
new file mode 100644
index 00000000..67bad3cc
--- /dev/null
+++ b/internal/api/agent/agent_undrain.go
@@ -0,0 +1,72 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package agent
+
+import (
+ "context"
+ "fmt"
+ "strings"
+
+ "github.com/retr0h/osapi/internal/api/agent/gen"
+ "github.com/retr0h/osapi/internal/job"
+)
+
+// UndrainAgent handles POST /agent/{hostname}/undrain.
+func (a *Agent) UndrainAgent(
+ ctx context.Context,
+ request gen.UndrainAgentRequestObject,
+) (gen.UndrainAgentResponseObject, error) {
+ hostname := request.Hostname
+
+ agentInfo, err := a.JobClient.GetAgent(ctx, hostname)
+ if err != nil {
+ errMsg := fmt.Sprintf("agent not found: %s", hostname)
+ return gen.UndrainAgent404JSONResponse{Error: &errMsg}, nil
+ }
+
+ if agentInfo.State != job.AgentStateDraining && agentInfo.State != job.AgentStateCordoned {
+ errMsg := fmt.Sprintf(
+ "agent %s is not in draining or cordoned state (current: %s)",
+ hostname,
+ agentInfo.State,
+ )
+ return gen.UndrainAgent409JSONResponse{Error: &errMsg}, nil
+ }
+
+ if err := a.JobClient.DeleteDrainFlag(ctx, hostname); err != nil {
+ errMsg := fmt.Sprintf("failed to delete drain flag: %s", err.Error())
+ return gen.UndrainAgent409JSONResponse{Error: &errMsg}, nil
+ }
+
+ if err := a.JobClient.WriteAgentTimelineEvent(ctx, hostname, "undrain", "Undrain initiated via API"); err != nil {
+ if strings.Contains(err.Error(), "not found") {
+ errMsg := fmt.Sprintf("agent not found: %s", hostname)
+ return gen.UndrainAgent404JSONResponse{Error: &errMsg}, nil
+ }
+
+ errMsg := err.Error()
+ return gen.UndrainAgent409JSONResponse{Error: &errMsg}, nil
+ }
+
+ msg := fmt.Sprintf("undrain initiated for agent %s", hostname)
+
+ return gen.UndrainAgent200JSONResponse{Message: msg}, nil
+}
diff --git a/internal/api/agent/agent_undrain_public_test.go b/internal/api/agent/agent_undrain_public_test.go
new file mode 100644
index 00000000..30b55bbb
--- /dev/null
+++ b/internal/api/agent/agent_undrain_public_test.go
@@ -0,0 +1,372 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package agent_test
+
+import (
+ "context"
+ "fmt"
+ "log/slog"
+ "net/http"
+ "net/http/httptest"
+ "os"
+ "testing"
+
+ "github.com/golang/mock/gomock"
+ "github.com/stretchr/testify/suite"
+
+ "github.com/retr0h/osapi/internal/api"
+ apiagent "github.com/retr0h/osapi/internal/api/agent"
+ "github.com/retr0h/osapi/internal/api/agent/gen"
+ "github.com/retr0h/osapi/internal/authtoken"
+ "github.com/retr0h/osapi/internal/config"
+ jobtypes "github.com/retr0h/osapi/internal/job"
+ jobmocks "github.com/retr0h/osapi/internal/job/mocks"
+)
+
+type AgentUndrainPublicTestSuite struct {
+ suite.Suite
+
+ mockCtrl *gomock.Controller
+ mockJobClient *jobmocks.MockJobClient
+ handler *apiagent.Agent
+ ctx context.Context
+ appConfig config.Config
+ logger *slog.Logger
+}
+
+func (s *AgentUndrainPublicTestSuite) SetupTest() {
+ s.mockCtrl = gomock.NewController(s.T())
+ s.mockJobClient = jobmocks.NewMockJobClient(s.mockCtrl)
+ s.handler = apiagent.New(slog.Default(), s.mockJobClient)
+ s.ctx = context.Background()
+ s.appConfig = config.Config{}
+ s.logger = slog.New(slog.NewTextHandler(os.Stdout, nil))
+}
+
+func (s *AgentUndrainPublicTestSuite) TearDownTest() {
+ s.mockCtrl.Finish()
+}
+
+func (s *AgentUndrainPublicTestSuite) TestUndrainAgent() {
+ tests := []struct {
+ name string
+ hostname string
+ mockAgent *jobtypes.AgentInfo
+ mockGetErr error
+ mockWriteErr error
+ skipWrite bool
+ mockDeleteDrain bool
+ validateFunc func(resp gen.UndrainAgentResponseObject)
+ }{
+ {
+ name: "success undrains draining agent",
+ hostname: "server1",
+ mockAgent: &jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateDraining,
+ },
+ mockDeleteDrain: true,
+ validateFunc: func(resp gen.UndrainAgentResponseObject) {
+ r, ok := resp.(gen.UndrainAgent200JSONResponse)
+ s.True(ok)
+ s.Contains(r.Message, "undrain initiated for agent server1")
+ },
+ },
+ {
+ name: "success undrains cordoned agent",
+ hostname: "server1",
+ mockAgent: &jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateCordoned,
+ },
+ mockDeleteDrain: true,
+ validateFunc: func(resp gen.UndrainAgentResponseObject) {
+ r, ok := resp.(gen.UndrainAgent200JSONResponse)
+ s.True(ok)
+ s.Contains(r.Message, "undrain initiated for agent server1")
+ },
+ },
+ {
+ name: "agent not found returns 404",
+ hostname: "unknown",
+ mockGetErr: fmt.Errorf("agent not found: unknown"),
+ skipWrite: true,
+ validateFunc: func(resp gen.UndrainAgentResponseObject) {
+ _, ok := resp.(gen.UndrainAgent404JSONResponse)
+ s.True(ok)
+ },
+ },
+ {
+ name: "agent in ready state returns 409",
+ hostname: "server1",
+ mockAgent: &jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateReady,
+ },
+ skipWrite: true,
+ validateFunc: func(resp gen.UndrainAgentResponseObject) {
+ _, ok := resp.(gen.UndrainAgent409JSONResponse)
+ s.True(ok)
+ },
+ },
+ {
+ name: "agent with empty state returns 409",
+ hostname: "server1",
+ mockAgent: &jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: "",
+ },
+ skipWrite: true,
+ validateFunc: func(resp gen.UndrainAgentResponseObject) {
+ _, ok := resp.(gen.UndrainAgent409JSONResponse)
+ s.True(ok)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ s.mockJobClient.EXPECT().
+ GetAgent(gomock.Any(), tt.hostname).
+ Return(tt.mockAgent, tt.mockGetErr)
+
+ if tt.mockDeleteDrain {
+ s.mockJobClient.EXPECT().
+ DeleteDrainFlag(gomock.Any(), tt.hostname).
+ Return(nil)
+ }
+
+ if !tt.skipWrite {
+ s.mockJobClient.EXPECT().
+ WriteAgentTimelineEvent(gomock.Any(), tt.hostname, "undrain", "Undrain initiated via API").
+ Return(tt.mockWriteErr)
+ }
+
+ resp, err := s.handler.UndrainAgent(s.ctx, gen.UndrainAgentRequestObject{
+ Hostname: tt.hostname,
+ })
+ s.NoError(err)
+ tt.validateFunc(resp)
+ })
+ }
+}
+
+func (s *AgentUndrainPublicTestSuite) TestUndrainAgentValidationHTTP() {
+ tests := []struct {
+ name string
+ hostname string
+ setupJobMock func() *jobmocks.MockJobClient
+ wantCode int
+ wantContains []string
+ }{
+ {
+ name: "when draining agent exists returns 200",
+ hostname: "server1",
+ setupJobMock: func() *jobmocks.MockJobClient {
+ mock := jobmocks.NewMockJobClient(s.mockCtrl)
+ mock.EXPECT().
+ GetAgent(gomock.Any(), "server1").
+ Return(&jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateDraining,
+ }, nil)
+ mock.EXPECT().
+ DeleteDrainFlag(gomock.Any(), "server1").
+ Return(nil)
+ mock.EXPECT().
+ WriteAgentTimelineEvent(gomock.Any(), "server1", "undrain", "Undrain initiated via API").
+ Return(nil)
+ return mock
+ },
+ wantCode: http.StatusOK,
+ wantContains: []string{`"message"`, `undrain initiated`},
+ },
+ {
+ name: "when agent not found returns 404",
+ hostname: "unknown",
+ setupJobMock: func() *jobmocks.MockJobClient {
+ mock := jobmocks.NewMockJobClient(s.mockCtrl)
+ mock.EXPECT().
+ GetAgent(gomock.Any(), "unknown").
+ Return(nil, fmt.Errorf("agent not found: unknown"))
+ return mock
+ },
+ wantCode: http.StatusNotFound,
+ wantContains: []string{`"error"`},
+ },
+ {
+ name: "when agent in ready state returns 409",
+ hostname: "server1",
+ setupJobMock: func() *jobmocks.MockJobClient {
+ mock := jobmocks.NewMockJobClient(s.mockCtrl)
+ mock.EXPECT().
+ GetAgent(gomock.Any(), "server1").
+ Return(&jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateReady,
+ }, nil)
+ return mock
+ },
+ wantCode: http.StatusConflict,
+ wantContains: []string{`"error"`, `not in draining or cordoned`},
+ },
+ }
+
+ for _, tc := range tests {
+ s.Run(tc.name, func() {
+ jobMock := tc.setupJobMock()
+
+ agentHandler := apiagent.New(s.logger, jobMock)
+ strictHandler := gen.NewStrictHandler(agentHandler, nil)
+
+ a := api.New(s.appConfig, s.logger)
+ gen.RegisterHandlers(a.Echo, strictHandler)
+
+ req := httptest.NewRequest(
+ http.MethodPost,
+ fmt.Sprintf("/agent/%s/undrain", tc.hostname),
+ nil,
+ )
+ rec := httptest.NewRecorder()
+
+ a.Echo.ServeHTTP(rec, req)
+
+ s.Equal(tc.wantCode, rec.Code)
+ for _, str := range tc.wantContains {
+ s.Contains(rec.Body.String(), str)
+ }
+ })
+ }
+}
+
+const rbacAgentUndrainTestSigningKey = "test-signing-key-for-rbac-agent-undrain"
+
+func (s *AgentUndrainPublicTestSuite) TestUndrainAgentRBACHTTP() {
+ tokenManager := authtoken.New(s.logger)
+
+ tests := []struct {
+ name string
+ setupAuth func(req *http.Request)
+ setupJobMock func() *jobmocks.MockJobClient
+ wantCode int
+ wantContains []string
+ }{
+ {
+ name: "when no token returns 401",
+ setupAuth: func(_ *http.Request) {
+ // No auth header set
+ },
+ setupJobMock: func() *jobmocks.MockJobClient {
+ return jobmocks.NewMockJobClient(s.mockCtrl)
+ },
+ wantCode: http.StatusUnauthorized,
+ wantContains: []string{"Bearer token required"},
+ },
+ {
+ name: "when insufficient permissions returns 403",
+ setupAuth: func(req *http.Request) {
+ token, err := tokenManager.Generate(
+ rbacAgentUndrainTestSigningKey,
+ []string{"read"},
+ "test-user",
+ []string{"agent:read"},
+ )
+ s.Require().NoError(err)
+ req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token))
+ },
+ setupJobMock: func() *jobmocks.MockJobClient {
+ return jobmocks.NewMockJobClient(s.mockCtrl)
+ },
+ wantCode: http.StatusForbidden,
+ wantContains: []string{"Insufficient permissions"},
+ },
+ {
+ name: "when valid token with agent:write returns 200",
+ setupAuth: func(req *http.Request) {
+ token, err := tokenManager.Generate(
+ rbacAgentUndrainTestSigningKey,
+ []string{"admin"},
+ "test-user",
+ nil,
+ )
+ s.Require().NoError(err)
+ req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", token))
+ },
+ setupJobMock: func() *jobmocks.MockJobClient {
+ mock := jobmocks.NewMockJobClient(s.mockCtrl)
+ mock.EXPECT().
+ GetAgent(gomock.Any(), "server1").
+ Return(&jobtypes.AgentInfo{
+ Hostname: "server1",
+ State: jobtypes.AgentStateDraining,
+ }, nil)
+ mock.EXPECT().
+ DeleteDrainFlag(gomock.Any(), "server1").
+ Return(nil)
+ mock.EXPECT().
+ WriteAgentTimelineEvent(gomock.Any(), "server1", "undrain", "Undrain initiated via API").
+ Return(nil)
+ return mock
+ },
+ wantCode: http.StatusOK,
+ wantContains: []string{`"message"`, `undrain initiated`},
+ },
+ }
+
+ for _, tc := range tests {
+ s.Run(tc.name, func() {
+ jobMock := tc.setupJobMock()
+
+ appConfig := config.Config{
+ API: config.API{
+ Server: config.Server{
+ Security: config.ServerSecurity{
+ SigningKey: rbacAgentUndrainTestSigningKey,
+ },
+ },
+ },
+ }
+
+ server := api.New(appConfig, s.logger)
+ handlers := server.GetAgentHandler(jobMock)
+ server.RegisterHandlers(handlers)
+
+ req := httptest.NewRequest(
+ http.MethodPost,
+ "/agent/server1/undrain",
+ nil,
+ )
+ tc.setupAuth(req)
+ rec := httptest.NewRecorder()
+
+ server.Echo.ServeHTTP(rec, req)
+
+ s.Equal(tc.wantCode, rec.Code)
+ for _, str := range tc.wantContains {
+ s.Contains(rec.Body.String(), str)
+ }
+ })
+ }
+}
+
+func TestAgentUndrainPublicTestSuite(t *testing.T) {
+ suite.Run(t, new(AgentUndrainPublicTestSuite))
+}
diff --git a/internal/api/agent/gen/agent.gen.go b/internal/api/agent/gen/agent.gen.go
index 60acbb7c..16ef4bec 100644
--- a/internal/api/agent/gen/agent.gen.go
+++ b/internal/api/agent/gen/agent.gen.go
@@ -20,10 +20,17 @@ const (
BearerAuthScopes = "BearerAuth.Scopes"
)
+// Defines values for AgentInfoState.
+const (
+ AgentInfoStateCordoned AgentInfoState = "Cordoned"
+ AgentInfoStateDraining AgentInfoState = "Draining"
+ AgentInfoStateReady AgentInfoState = "Ready"
+)
+
// Defines values for AgentInfoStatus.
const (
- NotReady AgentInfoStatus = "NotReady"
- Ready AgentInfoStatus = "Ready"
+ AgentInfoStatusNotReady AgentInfoStatus = "NotReady"
+ AgentInfoStatusReady AgentInfoStatus = "Ready"
)
// Defines values for NetworkInterfaceResponseFamily.
@@ -33,11 +40,21 @@ const (
Inet6 NetworkInterfaceResponseFamily = "inet6"
)
+// Defines values for NodeConditionType.
+const (
+ DiskPressure NodeConditionType = "DiskPressure"
+ HighLoad NodeConditionType = "HighLoad"
+ MemoryPressure NodeConditionType = "MemoryPressure"
+)
+
// AgentInfo defines model for AgentInfo.
type AgentInfo struct {
// Architecture CPU architecture.
Architecture *string `json:"architecture,omitempty"`
+ // Conditions Evaluated node conditions.
+ Conditions *[]NodeCondition `json:"conditions,omitempty"`
+
// CpuCount Number of logical CPUs.
CpuCount *int `json:"cpu_count,omitempty"`
@@ -78,13 +95,22 @@ type AgentInfo struct {
// StartedAt When the agent process started.
StartedAt *time.Time `json:"started_at,omitempty"`
+ // State Agent scheduling state.
+ State *AgentInfoState `json:"state,omitempty"`
+
// Status The current status of the agent.
Status AgentInfoStatus `json:"status"`
+ // Timeline Agent state transition history.
+ Timeline *[]TimelineEvent `json:"timeline,omitempty"`
+
// Uptime The system uptime.
Uptime *string `json:"uptime,omitempty"`
}
+// AgentInfoState Agent scheduling state.
+type AgentInfoState string
+
// AgentInfoStatus The current status of the agent.
type AgentInfoStatus string
@@ -136,6 +162,17 @@ type NetworkInterfaceResponse struct {
// NetworkInterfaceResponseFamily IP address family.
type NetworkInterfaceResponseFamily string
+// NodeCondition defines model for NodeCondition.
+type NodeCondition struct {
+ LastTransitionTime time.Time `json:"last_transition_time"`
+ Reason *string `json:"reason,omitempty"`
+ Status bool `json:"status"`
+ Type NodeConditionType `json:"type"`
+}
+
+// NodeConditionType defines model for NodeCondition.Type.
+type NodeConditionType string
+
// OSInfoResponse Operating system information.
type OSInfoResponse struct {
// Distribution The name of the Linux distribution.
@@ -145,6 +182,15 @@ type OSInfoResponse struct {
Version string `json:"version"`
}
+// TimelineEvent defines model for TimelineEvent.
+type TimelineEvent struct {
+ Error *string `json:"error,omitempty"`
+ Event string `json:"event"`
+ Hostname *string `json:"hostname,omitempty"`
+ Message *string `json:"message,omitempty"`
+ Timestamp time.Time `json:"timestamp"`
+}
+
// ServerInterface represents all server handlers.
type ServerInterface interface {
// List active agents
@@ -153,6 +199,12 @@ type ServerInterface interface {
// Get agent details
// (GET /agent/{hostname})
GetAgentDetails(ctx echo.Context, hostname string) error
+ // Drain an agent
+ // (POST /agent/{hostname}/drain)
+ DrainAgent(ctx echo.Context, hostname string) error
+ // Undrain an agent
+ // (POST /agent/{hostname}/undrain)
+ UndrainAgent(ctx echo.Context, hostname string) error
}
// ServerInterfaceWrapper converts echo contexts to parameters.
@@ -189,6 +241,42 @@ func (w *ServerInterfaceWrapper) GetAgentDetails(ctx echo.Context) error {
return err
}
+// DrainAgent converts echo context to params.
+func (w *ServerInterfaceWrapper) DrainAgent(ctx echo.Context) error {
+ var err error
+ // ------------- Path parameter "hostname" -------------
+ var hostname string
+
+ err = runtime.BindStyledParameterWithOptions("simple", "hostname", ctx.Param("hostname"), &hostname, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true})
+ if err != nil {
+ return echo.NewHTTPError(http.StatusBadRequest, fmt.Sprintf("Invalid format for parameter hostname: %s", err))
+ }
+
+ ctx.Set(BearerAuthScopes, []string{"agent:write"})
+
+ // Invoke the callback with all the unmarshaled arguments
+ err = w.Handler.DrainAgent(ctx, hostname)
+ return err
+}
+
+// UndrainAgent converts echo context to params.
+func (w *ServerInterfaceWrapper) UndrainAgent(ctx echo.Context) error {
+ var err error
+ // ------------- Path parameter "hostname" -------------
+ var hostname string
+
+ err = runtime.BindStyledParameterWithOptions("simple", "hostname", ctx.Param("hostname"), &hostname, runtime.BindStyledParameterOptions{ParamLocation: runtime.ParamLocationPath, Explode: false, Required: true})
+ if err != nil {
+ return echo.NewHTTPError(http.StatusBadRequest, fmt.Sprintf("Invalid format for parameter hostname: %s", err))
+ }
+
+ ctx.Set(BearerAuthScopes, []string{"agent:write"})
+
+ // Invoke the callback with all the unmarshaled arguments
+ err = w.Handler.UndrainAgent(ctx, hostname)
+ return err
+}
+
// This is a simple interface which specifies echo.Route addition functions which
// are present on both echo.Echo and echo.Group, since we want to allow using
// either of them for path registration
@@ -219,6 +307,8 @@ func RegisterHandlersWithBaseURL(router EchoRouter, si ServerInterface, baseURL
router.GET(baseURL+"/agent", wrapper.GetAgent)
router.GET(baseURL+"/agent/:hostname", wrapper.GetAgentDetails)
+ router.POST(baseURL+"/agent/:hostname/drain", wrapper.DrainAgent)
+ router.POST(baseURL+"/agent/:hostname/undrain", wrapper.UndrainAgent)
}
@@ -318,6 +408,116 @@ func (response GetAgentDetails500JSONResponse) VisitGetAgentDetailsResponse(w ht
return json.NewEncoder(w).Encode(response)
}
+type DrainAgentRequestObject struct {
+ Hostname string `json:"hostname"`
+}
+
+type DrainAgentResponseObject interface {
+ VisitDrainAgentResponse(w http.ResponseWriter) error
+}
+
+type DrainAgent200JSONResponse struct {
+ Message string `json:"message"`
+}
+
+func (response DrainAgent200JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(200)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type DrainAgent401JSONResponse externalRef0.ErrorResponse
+
+func (response DrainAgent401JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(401)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type DrainAgent403JSONResponse externalRef0.ErrorResponse
+
+func (response DrainAgent403JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(403)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type DrainAgent404JSONResponse externalRef0.ErrorResponse
+
+func (response DrainAgent404JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(404)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type DrainAgent409JSONResponse externalRef0.ErrorResponse
+
+func (response DrainAgent409JSONResponse) VisitDrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(409)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type UndrainAgentRequestObject struct {
+ Hostname string `json:"hostname"`
+}
+
+type UndrainAgentResponseObject interface {
+ VisitUndrainAgentResponse(w http.ResponseWriter) error
+}
+
+type UndrainAgent200JSONResponse struct {
+ Message string `json:"message"`
+}
+
+func (response UndrainAgent200JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(200)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type UndrainAgent401JSONResponse externalRef0.ErrorResponse
+
+func (response UndrainAgent401JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(401)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type UndrainAgent403JSONResponse externalRef0.ErrorResponse
+
+func (response UndrainAgent403JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(403)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type UndrainAgent404JSONResponse externalRef0.ErrorResponse
+
+func (response UndrainAgent404JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(404)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
+type UndrainAgent409JSONResponse externalRef0.ErrorResponse
+
+func (response UndrainAgent409JSONResponse) VisitUndrainAgentResponse(w http.ResponseWriter) error {
+ w.Header().Set("Content-Type", "application/json")
+ w.WriteHeader(409)
+
+ return json.NewEncoder(w).Encode(response)
+}
+
// StrictServerInterface represents all server handlers.
type StrictServerInterface interface {
// List active agents
@@ -326,6 +526,12 @@ type StrictServerInterface interface {
// Get agent details
// (GET /agent/{hostname})
GetAgentDetails(ctx context.Context, request GetAgentDetailsRequestObject) (GetAgentDetailsResponseObject, error)
+ // Drain an agent
+ // (POST /agent/{hostname}/drain)
+ DrainAgent(ctx context.Context, request DrainAgentRequestObject) (DrainAgentResponseObject, error)
+ // Undrain an agent
+ // (POST /agent/{hostname}/undrain)
+ UndrainAgent(ctx context.Context, request UndrainAgentRequestObject) (UndrainAgentResponseObject, error)
}
type StrictHandlerFunc = strictecho.StrictEchoHandlerFunc
@@ -387,3 +593,53 @@ func (sh *strictHandler) GetAgentDetails(ctx echo.Context, hostname string) erro
}
return nil
}
+
+// DrainAgent operation middleware
+func (sh *strictHandler) DrainAgent(ctx echo.Context, hostname string) error {
+ var request DrainAgentRequestObject
+
+ request.Hostname = hostname
+
+ handler := func(ctx echo.Context, request interface{}) (interface{}, error) {
+ return sh.ssi.DrainAgent(ctx.Request().Context(), request.(DrainAgentRequestObject))
+ }
+ for _, middleware := range sh.middlewares {
+ handler = middleware(handler, "DrainAgent")
+ }
+
+ response, err := handler(ctx, request)
+
+ if err != nil {
+ return err
+ } else if validResponse, ok := response.(DrainAgentResponseObject); ok {
+ return validResponse.VisitDrainAgentResponse(ctx.Response())
+ } else if response != nil {
+ return fmt.Errorf("unexpected response type: %T", response)
+ }
+ return nil
+}
+
+// UndrainAgent operation middleware
+func (sh *strictHandler) UndrainAgent(ctx echo.Context, hostname string) error {
+ var request UndrainAgentRequestObject
+
+ request.Hostname = hostname
+
+ handler := func(ctx echo.Context, request interface{}) (interface{}, error) {
+ return sh.ssi.UndrainAgent(ctx.Request().Context(), request.(UndrainAgentRequestObject))
+ }
+ for _, middleware := range sh.middlewares {
+ handler = middleware(handler, "UndrainAgent")
+ }
+
+ response, err := handler(ctx, request)
+
+ if err != nil {
+ return err
+ } else if validResponse, ok := response.(UndrainAgentResponseObject); ok {
+ return validResponse.VisitUndrainAgentResponse(ctx.Response())
+ } else if response != nil {
+ return fmt.Errorf("unexpected response type: %T", response)
+ }
+ return nil
+}
diff --git a/internal/api/agent/gen/api.yaml b/internal/api/agent/gen/api.yaml
index ad1f5d42..26fe3050 100644
--- a/internal/api/agent/gen/api.yaml
+++ b/internal/api/agent/gen/api.yaml
@@ -110,6 +110,114 @@ paths:
application/json:
schema:
$ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
+ /agent/{hostname}/drain:
+ post:
+ operationId: drainAgent
+ summary: Drain an agent
+ description: >
+ Stop the agent from accepting new jobs. In-flight jobs continue
+ to completion.
+ tags:
+ - agent_operations
+ security:
+ - BearerAuth:
+ - agent:write
+ parameters:
+ - name: hostname
+ in: path
+ required: true
+ schema:
+ type: string
+ description: The hostname of the agent to drain.
+ responses:
+ '200':
+ description: Agent drain initiated.
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ message:
+ type: string
+ required:
+ - message
+ '401':
+ description: Unauthorized - Bearer token required
+ content:
+ application/json:
+ schema:
+ $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
+ '403':
+ description: Forbidden - Insufficient permissions
+ content:
+ application/json:
+ schema:
+ $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
+ '404':
+ description: Agent not found.
+ content:
+ application/json:
+ schema:
+ $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
+ '409':
+ description: Agent already in requested state.
+ content:
+ application/json:
+ schema:
+ $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
+ /agent/{hostname}/undrain:
+ post:
+ operationId: undrainAgent
+ summary: Undrain an agent
+ description: Resume accepting jobs on a drained or cordoned agent.
+ tags:
+ - agent_operations
+ security:
+ - BearerAuth:
+ - agent:write
+ parameters:
+ - name: hostname
+ in: path
+ required: true
+ schema:
+ type: string
+ description: The hostname of the agent to undrain.
+ responses:
+ '200':
+ description: Agent undrain initiated.
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ message:
+ type: string
+ required:
+ - message
+ '401':
+ description: Unauthorized - Bearer token required
+ content:
+ application/json:
+ schema:
+ $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
+ '403':
+ description: Forbidden - Insufficient permissions
+ content:
+ application/json:
+ schema:
+ $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
+ '404':
+ description: Agent not found.
+ content:
+ application/json:
+ schema:
+ $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
+ '409':
+ description: Agent not in draining or cordoned state.
+ content:
+ application/json:
+ schema:
+ $ref: '../../common/gen/api.yaml#/components/schemas/ErrorResponse'
components:
securitySchemes:
@@ -200,6 +308,20 @@ components:
type: object
additionalProperties: true
description: Extended facts from additional providers.
+ state:
+ type: string
+ enum: [Ready, Draining, Cordoned]
+ description: Agent scheduling state.
+ conditions:
+ type: array
+ items:
+ $ref: '#/components/schemas/NodeCondition'
+ description: Evaluated node conditions.
+ timeline:
+ type: array
+ items:
+ $ref: '#/components/schemas/TimelineEvent'
+ description: Agent state transition history.
required:
- hostname
- status
@@ -287,3 +409,39 @@ components:
- dual
required:
- name
+
+ NodeCondition:
+ type: object
+ properties:
+ type:
+ type: string
+ enum: [MemoryPressure, HighLoad, DiskPressure]
+ status:
+ type: boolean
+ reason:
+ type: string
+ last_transition_time:
+ type: string
+ format: date-time
+ required:
+ - type
+ - status
+ - last_transition_time
+
+ TimelineEvent:
+ type: object
+ properties:
+ timestamp:
+ type: string
+ format: date-time
+ event:
+ type: string
+ hostname:
+ type: string
+ message:
+ type: string
+ error:
+ type: string
+ required:
+ - timestamp
+ - event
diff --git a/internal/api/gen/api.yaml b/internal/api/gen/api.yaml
index 0dbe61d0..104c36da 100644
--- a/internal/api/gen/api.yaml
+++ b/internal/api/gen/api.yaml
@@ -118,6 +118,116 @@ paths:
application/json:
schema:
$ref: '#/components/schemas/ErrorResponse'
+ /agent/{hostname}/drain:
+ servers: []
+ post:
+ operationId: drainAgent
+ summary: Drain an agent
+ description: >
+ Stop the agent from accepting new jobs. In-flight jobs continue to
+ completion.
+ tags:
+ - Agent_Management_API_agent_operations
+ security:
+ - BearerAuth:
+ - agent:write
+ parameters:
+ - name: hostname
+ in: path
+ required: true
+ schema:
+ type: string
+ description: The hostname of the agent to drain.
+ responses:
+ '200':
+ description: Agent drain initiated.
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ message:
+ type: string
+ required:
+ - message
+ '401':
+ description: Unauthorized - API key required
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '403':
+ description: Forbidden - Insufficient permissions
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Agent not found.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '409':
+ description: Agent already in requested state.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ /agent/{hostname}/undrain:
+ servers: []
+ post:
+ operationId: undrainAgent
+ summary: Undrain an agent
+ description: Resume accepting jobs on a drained or cordoned agent.
+ tags:
+ - Agent_Management_API_agent_operations
+ security:
+ - BearerAuth:
+ - agent:write
+ parameters:
+ - name: hostname
+ in: path
+ required: true
+ schema:
+ type: string
+ description: The hostname of the agent to undrain.
+ responses:
+ '200':
+ description: Agent undrain initiated.
+ content:
+ application/json:
+ schema:
+ type: object
+ properties:
+ message:
+ type: string
+ required:
+ - message
+ '401':
+ description: Unauthorized - Bearer token required
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '403':
+ description: Forbidden - Insufficient permissions
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '404':
+ description: Agent not found.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
+ '409':
+ description: Agent not in draining or cordoned state.
+ content:
+ application/json:
+ schema:
+ $ref: '#/components/schemas/ErrorResponse'
/audit:
servers: []
get:
@@ -1380,6 +1490,23 @@ components:
type: object
additionalProperties: true
description: Extended facts from additional providers.
+ state:
+ type: string
+ enum:
+ - Ready
+ - Draining
+ - Cordoned
+ description: Agent scheduling state.
+ conditions:
+ type: array
+ items:
+ $ref: '#/components/schemas/NodeCondition'
+ description: Evaluated node conditions.
+ timeline:
+ type: array
+ items:
+ $ref: '#/components/schemas/TimelineEvent'
+ description: Agent state transition history.
required:
- hostname
- status
@@ -1463,6 +1590,43 @@ components:
- dual
required:
- name
+ NodeCondition:
+ type: object
+ properties:
+ type:
+ type: string
+ enum:
+ - MemoryPressure
+ - HighLoad
+ - DiskPressure
+ status:
+ type: boolean
+ reason:
+ type: string
+ last_transition_time:
+ type: string
+ format: date-time
+ required:
+ - type
+ - status
+ - last_transition_time
+ TimelineEvent:
+ type: object
+ properties:
+ timestamp:
+ type: string
+ format: date-time
+ event:
+ type: string
+ hostname:
+ type: string
+ message:
+ type: string
+ error:
+ type: string
+ required:
+ - timestamp
+ - event
AuditEntry:
type: object
properties:
diff --git a/internal/authtoken/permissions.go b/internal/authtoken/permissions.go
index ae2dbd4e..df3f2611 100644
--- a/internal/authtoken/permissions.go
+++ b/internal/authtoken/permissions.go
@@ -26,6 +26,7 @@ type Permission = string
// Permission constants using resource:verb format.
const (
PermAgentRead Permission = "agent:read"
+ PermAgentWrite Permission = "agent:write"
PermNodeRead Permission = "node:read"
PermNetworkRead Permission = "network:read"
PermNetworkWrite Permission = "network:write"
@@ -39,6 +40,7 @@ const (
// AllPermissions is the full set of known permissions.
var AllPermissions = []Permission{
PermAgentRead,
+ PermAgentWrite,
PermNodeRead,
PermNetworkRead,
PermNetworkWrite,
@@ -53,6 +55,7 @@ var AllPermissions = []Permission{
var DefaultRolePermissions = map[string][]Permission{
"admin": {
PermAgentRead,
+ PermAgentWrite,
PermNodeRead,
PermNetworkRead,
PermNetworkWrite,
diff --git a/internal/cli/nats.go b/internal/cli/nats.go
index 419d1543..b3d7e1c7 100644
--- a/internal/cli/nats.go
+++ b/internal/cli/nats.go
@@ -107,6 +107,21 @@ func BuildFactsKVConfig(
}
}
+// BuildStateKVConfig builds a jetstream.KeyValueConfig from state config values.
+// The state bucket has no TTL so drain flags and timeline events persist indefinitely.
+func BuildStateKVConfig(
+ namespace string,
+ stateCfg config.NATSState,
+) jetstream.KeyValueConfig {
+ stateBucket := job.ApplyNamespaceToInfraName(namespace, stateCfg.Bucket)
+
+ return jetstream.KeyValueConfig{
+ Bucket: stateBucket,
+ Storage: ParseJetstreamStorageType(stateCfg.Storage),
+ Replicas: stateCfg.Replicas,
+ }
+}
+
// BuildAuditKVConfig builds a jetstream.KeyValueConfig from audit config values.
func BuildAuditKVConfig(
namespace string,
diff --git a/internal/config/types.go b/internal/config/types.go
index 61764d7f..afbcbfb6 100644
--- a/internal/config/types.go
+++ b/internal/config/types.go
@@ -93,6 +93,7 @@ type NATS struct {
Audit NATSAudit `mapstructure:"audit,omitempty"`
Registry NATSRegistry `mapstructure:"registry,omitempty"`
Facts NATSFacts `mapstructure:"facts,omitempty"`
+ State NATSState `mapstructure:"state,omitempty"`
}
// NATSAudit configuration for the audit log KV bucket.
@@ -123,6 +124,14 @@ type NATSFacts struct {
Replicas int `mapstructure:"replicas"`
}
+// NATSState configuration for the agent state KV bucket (drain flags, timeline events).
+type NATSState struct {
+ // Bucket is the KV bucket name for persistent agent state.
+ Bucket string `mapstructure:"bucket"`
+ Storage string `mapstructure:"storage"` // "file" or "memory"
+ Replicas int `mapstructure:"replicas"`
+}
+
// NATSServer configuration settings for the embedded NATS server.
type NATSServer struct {
// Host the server will bind to.
@@ -258,6 +267,13 @@ type AgentFacts struct {
Interval string `mapstructure:"interval"` // e.g. "5m", "1h"
}
+// AgentConditions holds threshold configuration for node conditions.
+type AgentConditions struct {
+ MemoryPressureThreshold int `mapstructure:"memory_pressure_threshold"`
+ HighLoadMultiplier float64 `mapstructure:"high_load_multiplier"`
+ DiskPressureThreshold int `mapstructure:"disk_pressure_threshold"`
+}
+
// AgentConfig configuration settings.
type AgentConfig struct {
// NATS connection settings for the agent.
@@ -274,4 +290,6 @@ type AgentConfig struct {
MaxJobs int `mapstructure:"max_jobs"`
// Labels are key-value pairs for label-based routing (e.g., role: web, env: prod).
Labels map[string]string `mapstructure:"labels"`
+ // Conditions holds threshold settings for node condition evaluation.
+ Conditions AgentConditions `mapstructure:"conditions,omitempty"`
}
diff --git a/internal/job/client/agent.go b/internal/job/client/agent.go
index d0ab7cd8..9b9aec44 100644
--- a/internal/job/client/agent.go
+++ b/internal/job/client/agent.go
@@ -26,6 +26,8 @@ import (
"fmt"
"log/slog"
"regexp"
+ "sort"
+ "strings"
"time"
"github.com/nats-io/nats.go/jetstream"
@@ -164,6 +166,194 @@ func (c *Client) CreateOrUpdateConsumer(
return c.natsClient.CreateOrUpdateConsumerWithConfig(ctx, streamName, consumerConfig)
}
+// WriteAgentTimelineEvent writes an append-only timeline event
+// for an agent state transition.
+func (c *Client) WriteAgentTimelineEvent(
+ ctx context.Context,
+ hostname, event, message string,
+) error {
+ if c.stateKV == nil {
+ return fmt.Errorf("agent state bucket not configured")
+ }
+
+ now := time.Now()
+ key := fmt.Sprintf(
+ "timeline.%s.%s.%d",
+ job.SanitizeHostname(hostname),
+ event,
+ now.UnixNano(),
+ )
+
+ data, err := json.Marshal(job.TimelineEvent{
+ Timestamp: now,
+ Event: event,
+ Hostname: hostname,
+ Message: message,
+ })
+ if err != nil {
+ return fmt.Errorf("marshal timeline event: %w", err)
+ }
+
+ _, err = c.stateKV.Put(ctx, key, data)
+ if err != nil {
+ return fmt.Errorf("write timeline event: %w", err)
+ }
+
+ c.logger.Debug("wrote agent timeline event",
+ slog.String("hostname", hostname),
+ slog.String("event", event),
+ slog.String("key", key),
+ )
+
+ return nil
+}
+
+// GetAgentTimeline returns the hostname's timeline events sorted by timestamp (oldest first).
+func (c *Client) GetAgentTimeline(
+ ctx context.Context,
+ hostname string,
+) ([]job.TimelineEvent, error) {
+ if c.stateKV == nil {
+ return nil, fmt.Errorf("agent state bucket not configured")
+ }
+
+ prefix := "timeline." + job.SanitizeHostname(hostname) + "."
+
+ keys, err := c.stateKV.Keys(ctx)
+ if err != nil {
+		// Treat any error from Keys (including "no keys found") as an empty timeline.
+ return []job.TimelineEvent{}, nil
+ }
+
+ var events []job.TimelineEvent
+ for _, key := range keys {
+ if !strings.HasPrefix(key, prefix) {
+ continue
+ }
+
+ entry, err := c.stateKV.Get(ctx, key)
+ if err != nil {
+ continue
+ }
+
+ var te job.TimelineEvent
+ if err := json.Unmarshal(entry.Value(), &te); err != nil {
+ continue
+ }
+
+ events = append(events, te)
+ }
+
+ // Sort by timestamp
+ sort.Slice(events, func(i, j int) bool {
+ return events[i].Timestamp.Before(events[j].Timestamp)
+ })
+
+ return events, nil
+}
+
+// ComputeAgentState derives the agent state from the latest timeline event; an empty timeline or an unknown event yields Ready.
+func ComputeAgentState(
+ events []job.TimelineEvent,
+) string {
+ if len(events) == 0 {
+ return job.AgentStateReady
+ }
+
+ latest := events[len(events)-1]
+ switch latest.Event {
+ case "drain":
+ return job.AgentStateDraining
+ case "cordoned":
+ return job.AgentStateCordoned
+ case "undrain", "ready":
+ return job.AgentStateReady
+ default:
+ return job.AgentStateReady
+ }
+}
+
+// overlayDrainState checks if a drain flag exists for the agent and,
+// if so, overrides the reported state to Cordoned. The agent reports
+// its own view of state, but the operator may have drained it via the
+// API. Drain flags are stored in the agent-state KV bucket (no TTL).
+func (c *Client) overlayDrainState(
+ ctx context.Context,
+ info *job.AgentInfo,
+) {
+ if c.stateKV == nil {
+ return
+ }
+
+ key := "drain." + job.SanitizeHostname(info.Hostname)
+ _, err := c.stateKV.Get(ctx, key)
+ if err == nil {
+ info.State = job.AgentStateCordoned
+ }
+}
+
+// CheckDrainFlag reports whether the drain flag exists for the hostname; any Get error is treated as not drained.
+func (c *Client) CheckDrainFlag(
+ ctx context.Context,
+ hostname string,
+) bool {
+ if c.stateKV == nil {
+ return false
+ }
+
+ key := "drain." + job.SanitizeHostname(hostname)
+ _, err := c.stateKV.Get(ctx, key)
+ return err == nil
+}
+
+// SetDrainFlag writes the drain flag for an agent in the state KV bucket.
+// The agent detects this flag on heartbeat and stops accepting jobs.
+func (c *Client) SetDrainFlag(
+ ctx context.Context,
+ hostname string,
+) error {
+ if c.stateKV == nil {
+ return fmt.Errorf("agent state bucket not configured")
+ }
+
+ key := "drain." + job.SanitizeHostname(hostname)
+ _, err := c.stateKV.Put(ctx, key, []byte("1"))
+ if err != nil {
+ return fmt.Errorf("set drain flag: %w", err)
+ }
+
+ c.logger.Debug("set drain flag",
+ slog.String("hostname", hostname),
+ slog.String("key", key),
+ )
+
+ return nil
+}
+
+// DeleteDrainFlag removes the drain flag for an agent from the state KV bucket.
+// The agent detects this on heartbeat and resumes accepting jobs.
+func (c *Client) DeleteDrainFlag(
+ ctx context.Context,
+ hostname string,
+) error {
+ if c.stateKV == nil {
+ return fmt.Errorf("agent state bucket not configured")
+ }
+
+ key := "drain." + job.SanitizeHostname(hostname)
+ err := c.stateKV.Delete(ctx, key)
+ if err != nil {
+ return fmt.Errorf("delete drain flag: %w", err)
+ }
+
+ c.logger.Debug("deleted drain flag",
+ slog.String("hostname", hostname),
+ slog.String("key", key),
+ )
+
+ return nil
+}
+
// sanitizeKeyForNATS sanitizes a string for use as a NATS key.
func sanitizeKeyForNATS(
input string,
diff --git a/internal/job/client/agent_drain_public_test.go b/internal/job/client/agent_drain_public_test.go
new file mode 100644
index 00000000..9301403c
--- /dev/null
+++ b/internal/job/client/agent_drain_public_test.go
@@ -0,0 +1,353 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package client_test
+
+import (
+ "context"
+ "errors"
+ "log/slog"
+ "testing"
+ "time"
+
+ "github.com/golang/mock/gomock"
+ "github.com/stretchr/testify/suite"
+
+ "github.com/retr0h/osapi/internal/job"
+ "github.com/retr0h/osapi/internal/job/client"
+ jobmocks "github.com/retr0h/osapi/internal/job/mocks"
+)
+
+type AgentDrainPublicTestSuite struct {
+ suite.Suite
+
+ mockCtrl *gomock.Controller
+ mockNATSClient *jobmocks.MockNATSClient
+ mockKV *jobmocks.MockKeyValue
+ ctx context.Context
+}
+
+func (s *AgentDrainPublicTestSuite) SetupTest() {
+ s.mockCtrl = gomock.NewController(s.T())
+ s.mockNATSClient = jobmocks.NewMockNATSClient(s.mockCtrl)
+ s.mockKV = jobmocks.NewMockKeyValue(s.mockCtrl)
+ s.ctx = context.Background()
+}
+
+func (s *AgentDrainPublicTestSuite) TearDownTest() {
+ s.mockCtrl.Finish()
+}
+
+func (s *AgentDrainPublicTestSuite) newClientWithState(
+ stateKV *jobmocks.MockKeyValue,
+) *client.Client {
+ opts := &client.Options{
+ Timeout: 30 * time.Second,
+ KVBucket: s.mockKV,
+ StateKV: stateKV,
+ }
+ c, err := client.New(slog.Default(), s.mockNATSClient, opts)
+ s.Require().NoError(err)
+
+ return c
+}
+
+func (s *AgentDrainPublicTestSuite) newClientWithoutState() *client.Client {
+ opts := &client.Options{
+ Timeout: 30 * time.Second,
+ KVBucket: s.mockKV,
+ }
+ c, err := client.New(slog.Default(), s.mockNATSClient, opts)
+ s.Require().NoError(err)
+
+ return c
+}
+
+func (s *AgentDrainPublicTestSuite) TestCheckDrainFlag() {
+ tests := []struct {
+ name string
+ hostname string
+ useState bool
+ setupMocks func(*jobmocks.MockKeyValue)
+ expected bool
+ }{
+ {
+ name: "when drain flag exists returns true",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(entry, nil)
+ },
+ expected: true,
+ },
+ {
+ name: "when drain flag missing returns false",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ },
+ expected: false,
+ },
+ {
+ name: "when stateKV is nil returns false",
+ hostname: "server1",
+ useState: false,
+ expected: false,
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ var jobsClient *client.Client
+ if tt.useState {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ if tt.setupMocks != nil {
+ tt.setupMocks(stateKV)
+ }
+ jobsClient = s.newClientWithState(stateKV)
+ } else {
+ jobsClient = s.newClientWithoutState()
+ }
+
+ result := jobsClient.CheckDrainFlag(s.ctx, tt.hostname)
+ s.Equal(tt.expected, result)
+ })
+ }
+}
+
+func (s *AgentDrainPublicTestSuite) TestSetDrainFlag() {
+ tests := []struct {
+ name string
+ hostname string
+ useState bool
+ setupMocks func(*jobmocks.MockKeyValue)
+ expectError bool
+ errorMsg string
+ }{
+ {
+ name: "when write succeeds sets drain flag",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Put(gomock.Any(), "drain.server1", []byte("1")).
+ Return(uint64(1), nil)
+ },
+ },
+ {
+ name: "when KV put fails returns error",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Put(gomock.Any(), "drain.server1", []byte("1")).
+ Return(uint64(0), errors.New("kv connection failed"))
+ },
+ expectError: true,
+ errorMsg: "set drain flag",
+ },
+ {
+ name: "when stateKV is nil returns error",
+ hostname: "server1",
+ useState: false,
+ expectError: true,
+ errorMsg: "agent state bucket not configured",
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ var jobsClient *client.Client
+ if tt.useState {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ if tt.setupMocks != nil {
+ tt.setupMocks(stateKV)
+ }
+ jobsClient = s.newClientWithState(stateKV)
+ } else {
+ jobsClient = s.newClientWithoutState()
+ }
+
+ err := jobsClient.SetDrainFlag(s.ctx, tt.hostname)
+
+ if tt.expectError {
+ s.Error(err)
+ s.Contains(err.Error(), tt.errorMsg)
+ } else {
+ s.NoError(err)
+ }
+ })
+ }
+}
+
+func (s *AgentDrainPublicTestSuite) TestDeleteDrainFlag() {
+ tests := []struct {
+ name string
+ hostname string
+ useState bool
+ setupMocks func(*jobmocks.MockKeyValue)
+ expectError bool
+ errorMsg string
+ }{
+ {
+ name: "when delete succeeds removes drain flag",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Delete(gomock.Any(), "drain.server1").
+ Return(nil)
+ },
+ },
+ {
+ name: "when KV delete fails returns error",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Delete(gomock.Any(), "drain.server1").
+ Return(errors.New("kv connection failed"))
+ },
+ expectError: true,
+ errorMsg: "delete drain flag",
+ },
+ {
+ name: "when stateKV is nil returns error",
+ hostname: "server1",
+ useState: false,
+ expectError: true,
+ errorMsg: "agent state bucket not configured",
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ var jobsClient *client.Client
+ if tt.useState {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ if tt.setupMocks != nil {
+ tt.setupMocks(stateKV)
+ }
+ jobsClient = s.newClientWithState(stateKV)
+ } else {
+ jobsClient = s.newClientWithoutState()
+ }
+
+ err := jobsClient.DeleteDrainFlag(s.ctx, tt.hostname)
+
+ if tt.expectError {
+ s.Error(err)
+ s.Contains(err.Error(), tt.errorMsg)
+ } else {
+ s.NoError(err)
+ }
+ })
+ }
+}
+
+func (s *AgentDrainPublicTestSuite) TestOverlayDrainState() {
+ tests := []struct {
+ name string
+ useState bool
+ setupMocks func(*jobmocks.MockKeyValue)
+ expectedState string
+ }{
+ {
+ name: "when drain flag exists sets state to Cordoned",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(entry, nil)
+ },
+ expectedState: job.AgentStateCordoned,
+ },
+ {
+ name: "when drain flag missing keeps original state",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ },
+ expectedState: "",
+ },
+ {
+ name: "when stateKV is nil keeps original state",
+ useState: false,
+ expectedState: "",
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ registryKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+
+ // Set up the registry KV to return agent data
+ entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry.EXPECT().Value().Return(
+ []byte(`{"hostname":"server1","registered_at":"2026-01-01T00:00:00Z"}`),
+ )
+ registryKV.EXPECT().
+ Get(gomock.Any(), "agents.server1").
+ Return(entry, nil)
+
+ opts := &client.Options{
+ Timeout: 30 * time.Second,
+ KVBucket: s.mockKV,
+ RegistryKV: registryKV,
+ }
+
+ if tt.useState {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ if tt.setupMocks != nil {
+ tt.setupMocks(stateKV)
+ }
+ opts.StateKV = stateKV
+ // GetAgent also calls GetAgentTimeline which uses stateKV
+ stateKV.EXPECT().
+ Keys(gomock.Any()).
+ Return(nil, errors.New("nats: no keys found"))
+ }
+
+ jobsClient, err := client.New(
+ slog.Default(),
+ s.mockNATSClient,
+ opts,
+ )
+ s.Require().NoError(err)
+
+ info, err := jobsClient.GetAgent(s.ctx, "server1")
+ s.NoError(err)
+ s.Equal(tt.expectedState, info.State)
+ })
+ }
+}
+
+func TestAgentDrainPublicTestSuite(t *testing.T) {
+ suite.Run(t, new(AgentDrainPublicTestSuite))
+}
diff --git a/internal/job/client/agent_timeline_public_test.go b/internal/job/client/agent_timeline_public_test.go
new file mode 100644
index 00000000..c3be65a4
--- /dev/null
+++ b/internal/job/client/agent_timeline_public_test.go
@@ -0,0 +1,484 @@
+// Copyright (c) 2026 John Dewey
+
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to
+// deal in the Software without restriction, including without limitation the
+// rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+// sell copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+package client_test
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "log/slog"
+ "testing"
+ "time"
+
+ "github.com/golang/mock/gomock"
+ "github.com/stretchr/testify/suite"
+
+ "github.com/retr0h/osapi/internal/job"
+ "github.com/retr0h/osapi/internal/job/client"
+ jobmocks "github.com/retr0h/osapi/internal/job/mocks"
+)
+
+type AgentTimelinePublicTestSuite struct {
+ suite.Suite
+
+ mockCtrl *gomock.Controller
+ mockNATSClient *jobmocks.MockNATSClient
+ mockKV *jobmocks.MockKeyValue
+ ctx context.Context
+}
+
+func (s *AgentTimelinePublicTestSuite) SetupTest() {
+ s.mockCtrl = gomock.NewController(s.T())
+ s.mockNATSClient = jobmocks.NewMockNATSClient(s.mockCtrl)
+ s.mockKV = jobmocks.NewMockKeyValue(s.mockCtrl)
+ s.ctx = context.Background()
+}
+
+func (s *AgentTimelinePublicTestSuite) TearDownTest() {
+ s.mockCtrl.Finish()
+}
+
+func (s *AgentTimelinePublicTestSuite) newClientWithState(
+ stateKV *jobmocks.MockKeyValue,
+) *client.Client {
+ opts := &client.Options{
+ Timeout: 30 * time.Second,
+ KVBucket: s.mockKV,
+ StateKV: stateKV,
+ }
+ c, err := client.New(slog.Default(), s.mockNATSClient, opts)
+ s.Require().NoError(err)
+
+ return c
+}
+
+func (s *AgentTimelinePublicTestSuite) newClientWithoutState() *client.Client {
+ opts := &client.Options{
+ Timeout: 30 * time.Second,
+ KVBucket: s.mockKV,
+ }
+ c, err := client.New(slog.Default(), s.mockNATSClient, opts)
+ s.Require().NoError(err)
+
+ return c
+}
+
+func (s *AgentTimelinePublicTestSuite) TestWriteAgentTimelineEvent() {
+ tests := []struct {
+ name string
+ hostname string
+ event string
+ message string
+ useState bool
+ setupMocks func(*jobmocks.MockKeyValue)
+ expectError bool
+ errorMsg string
+ }{
+ {
+ name: "when write succeeds stores timeline event",
+ hostname: "server1",
+ event: "drain",
+ message: "node marked for drain",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Put(gomock.Any(), gomock.Any(), gomock.Any()).
+ DoAndReturn(func(
+ _ context.Context,
+ key string,
+ data []byte,
+ ) (uint64, error) {
+ s.Contains(key, "timeline.server1.drain.")
+
+ var te job.TimelineEvent
+ err := json.Unmarshal(data, &te)
+ s.NoError(err)
+ s.Equal("drain", te.Event)
+ s.Equal("server1", te.Hostname)
+ s.Equal("node marked for drain", te.Message)
+ s.NotZero(te.Timestamp)
+
+ return 1, nil
+ })
+ },
+ },
+ {
+ name: "when KV put fails returns error",
+ hostname: "server1",
+ event: "drain",
+ message: "drain requested",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Put(gomock.Any(), gomock.Any(), gomock.Any()).
+ Return(uint64(0), errors.New("kv connection failed"))
+ },
+ expectError: true,
+ errorMsg: "write timeline event",
+ },
+ {
+ name: "when stateKV is nil returns error",
+ hostname: "server1",
+ event: "drain",
+ message: "drain requested",
+ useState: false,
+ expectError: true,
+ errorMsg: "agent state bucket not configured",
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ var jobsClient *client.Client
+ if tt.useState {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ if tt.setupMocks != nil {
+ tt.setupMocks(stateKV)
+ }
+ jobsClient = s.newClientWithState(stateKV)
+ } else {
+ jobsClient = s.newClientWithoutState()
+ }
+
+ err := jobsClient.WriteAgentTimelineEvent(
+ s.ctx,
+ tt.hostname,
+ tt.event,
+ tt.message,
+ )
+
+ if tt.expectError {
+ s.Error(err)
+ s.Contains(err.Error(), tt.errorMsg)
+ } else {
+ s.NoError(err)
+ }
+ })
+ }
+}
+
+func (s *AgentTimelinePublicTestSuite) TestGetAgentTimeline() {
+ now := time.Now()
+ earlier := now.Add(-10 * time.Minute)
+ later := now.Add(10 * time.Minute)
+
+ tests := []struct {
+ name string
+ hostname string
+ useState bool
+ setupMocks func(*jobmocks.MockKeyValue)
+ expectError bool
+ errorMsg string
+ expectedCount int
+ validateFunc func([]job.TimelineEvent)
+ }{
+ {
+ name: "when events exist returns sorted events",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return([]string{
+ "timeline.server1.drain.1000000000",
+ "timeline.server1.undrain.2000000000",
+ "agents.server1",
+ }, nil)
+
+ drainEvent, _ := json.Marshal(job.TimelineEvent{
+ Timestamp: later,
+ Event: "drain",
+ Hostname: "server1",
+ Message: "drain requested",
+ })
+ entry1 := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry1.EXPECT().Value().Return(drainEvent)
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.drain.1000000000").
+ Return(entry1, nil)
+
+ undrainEvent, _ := json.Marshal(job.TimelineEvent{
+ Timestamp: earlier,
+ Event: "undrain",
+ Hostname: "server1",
+ Message: "undrain requested",
+ })
+ entry2 := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry2.EXPECT().Value().Return(undrainEvent)
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.undrain.2000000000").
+ Return(entry2, nil)
+ },
+ expectedCount: 2,
+ validateFunc: func(events []job.TimelineEvent) {
+ // Should be sorted by timestamp (earlier first)
+ s.Equal("undrain", events[0].Event)
+ s.Equal("drain", events[1].Event)
+ },
+ },
+ {
+ name: "when no keys found returns empty slice",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return(nil, errors.New("nats: no keys found"))
+ },
+ expectedCount: 0,
+ },
+ {
+ name: "when stateKV is nil returns error",
+ hostname: "server1",
+ useState: false,
+ expectError: true,
+ errorMsg: "agent state bucket not configured",
+ },
+ {
+ name: "when Get fails for a key skips it",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return([]string{
+ "timeline.server1.drain.1000000000",
+ "timeline.server1.undrain.2000000000",
+ }, nil)
+
+ drainEvent, _ := json.Marshal(job.TimelineEvent{
+ Timestamp: now,
+ Event: "drain",
+ Hostname: "server1",
+ Message: "drain requested",
+ })
+ entry1 := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry1.EXPECT().Value().Return(drainEvent)
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.drain.1000000000").
+ Return(entry1, nil)
+
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.undrain.2000000000").
+ Return(nil, errors.New("key not found"))
+ },
+ expectedCount: 1,
+ validateFunc: func(events []job.TimelineEvent) {
+ s.Equal("drain", events[0].Event)
+ },
+ },
+ {
+ name: "when unmarshal fails for a key skips it",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return([]string{
+ "timeline.server1.drain.1000000000",
+ "timeline.server1.undrain.2000000000",
+ }, nil)
+
+ drainEvent, _ := json.Marshal(job.TimelineEvent{
+ Timestamp: now,
+ Event: "drain",
+ Hostname: "server1",
+ Message: "drain requested",
+ })
+ entry1 := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry1.EXPECT().Value().Return(drainEvent)
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.drain.1000000000").
+ Return(entry1, nil)
+
+ entry2 := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry2.EXPECT().Value().Return([]byte("invalid json"))
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.undrain.2000000000").
+ Return(entry2, nil)
+ },
+ expectedCount: 1,
+ validateFunc: func(events []job.TimelineEvent) {
+ s.Equal("drain", events[0].Event)
+ },
+ },
+ {
+ name: "when keys exist for other hostnames filters them out",
+ hostname: "server1",
+ useState: true,
+ setupMocks: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return([]string{
+ "timeline.server1.drain.1000000000",
+ "timeline.server2.drain.2000000000",
+ "agents.server1",
+ }, nil)
+
+ drainEvent, _ := json.Marshal(job.TimelineEvent{
+ Timestamp: now,
+ Event: "drain",
+ Hostname: "server1",
+ Message: "drain requested",
+ })
+ entry1 := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry1.EXPECT().Value().Return(drainEvent)
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.drain.1000000000").
+ Return(entry1, nil)
+ },
+ expectedCount: 1,
+ validateFunc: func(events []job.TimelineEvent) {
+ s.Equal("server1", events[0].Hostname)
+ },
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ var jobsClient *client.Client
+ if tt.useState {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ if tt.setupMocks != nil {
+ tt.setupMocks(stateKV)
+ }
+ jobsClient = s.newClientWithState(stateKV)
+ } else {
+ jobsClient = s.newClientWithoutState()
+ }
+
+ events, err := jobsClient.GetAgentTimeline(s.ctx, tt.hostname)
+
+ if tt.expectError {
+ s.Error(err)
+ s.Contains(err.Error(), tt.errorMsg)
+ } else {
+ s.NoError(err)
+ s.Len(events, tt.expectedCount)
+ if tt.validateFunc != nil {
+ tt.validateFunc(events)
+ }
+ }
+ })
+ }
+}
+
+func (s *AgentTimelinePublicTestSuite) TestComputeAgentState() {
+ tests := []struct {
+ name string
+ events []job.TimelineEvent
+ expectedState string
+ }{
+ {
+ name: "when no events returns Ready",
+ events: []job.TimelineEvent{},
+ expectedState: job.AgentStateReady,
+ },
+ {
+ name: "when nil events returns Ready",
+ events: nil,
+ expectedState: job.AgentStateReady,
+ },
+ {
+ name: "when latest event is drain returns Draining",
+ events: []job.TimelineEvent{
+ {
+ Timestamp: time.Now(),
+ Event: "drain",
+ Hostname: "server1",
+ Message: "drain requested",
+ },
+ },
+ expectedState: job.AgentStateDraining,
+ },
+ {
+ name: "when latest event is cordoned returns Cordoned",
+ events: []job.TimelineEvent{
+ {
+ Timestamp: time.Now(),
+ Event: "cordoned",
+ Hostname: "server1",
+ Message: "node cordoned",
+ },
+ },
+ expectedState: job.AgentStateCordoned,
+ },
+ {
+ name: "when latest event is undrain returns Ready",
+ events: []job.TimelineEvent{
+ {
+ Timestamp: time.Now().Add(-10 * time.Minute),
+ Event: "drain",
+ Hostname: "server1",
+ Message: "drain requested",
+ },
+ {
+ Timestamp: time.Now(),
+ Event: "undrain",
+ Hostname: "server1",
+ Message: "undrain requested",
+ },
+ },
+ expectedState: job.AgentStateReady,
+ },
+ {
+ name: "when latest event is ready returns Ready",
+ events: []job.TimelineEvent{
+ {
+ Timestamp: time.Now().Add(-10 * time.Minute),
+ Event: "drain",
+ Hostname: "server1",
+ Message: "drain requested",
+ },
+ {
+ Timestamp: time.Now(),
+ Event: "ready",
+ Hostname: "server1",
+ Message: "agent ready",
+ },
+ },
+ expectedState: job.AgentStateReady,
+ },
+ {
+ name: "when latest event is unknown returns Ready",
+ events: []job.TimelineEvent{
+ {
+ Timestamp: time.Now(),
+ Event: "something-unexpected",
+ Hostname: "server1",
+ Message: "unknown event",
+ },
+ },
+ expectedState: job.AgentStateReady,
+ },
+ }
+
+ for _, tt := range tests {
+ s.Run(tt.name, func() {
+ state := client.ComputeAgentState(tt.events)
+ s.Equal(tt.expectedState, state)
+ })
+ }
+}
+
+func TestAgentTimelinePublicTestSuite(t *testing.T) {
+ suite.Run(t, new(AgentTimelinePublicTestSuite))
+}
diff --git a/internal/job/client/client.go b/internal/job/client/client.go
index da57a3b7..5096913b 100644
--- a/internal/job/client/client.go
+++ b/internal/job/client/client.go
@@ -42,6 +42,7 @@ type Client struct {
kv jetstream.KeyValue
registryKV jetstream.KeyValue
factsKV jetstream.KeyValue
+ stateKV jetstream.KeyValue
timeout time.Duration
streamName string
}
@@ -56,6 +57,8 @@ type Options struct {
RegistryKV jetstream.KeyValue
// FactsKV is the KV bucket for agent facts (optional).
FactsKV jetstream.KeyValue
+ // StateKV is the KV bucket for persistent agent state (drain flags, timeline).
+ StateKV jetstream.KeyValue
// StreamName is the JetStream stream name (used to derive DLQ name).
StreamName string
}
@@ -79,6 +82,7 @@ func New(
kv: opts.KVBucket,
registryKV: opts.RegistryKV,
factsKV: opts.FactsKV,
+ stateKV: opts.StateKV,
streamName: opts.StreamName,
timeout: opts.Timeout,
}, nil
diff --git a/internal/job/client/query.go b/internal/job/client/query.go
index f715c712..f3c02ca5 100644
--- a/internal/job/client/query.go
+++ b/internal/job/client/query.go
@@ -24,6 +24,7 @@ import (
"context"
"encoding/json"
"fmt"
+ "strings"
"github.com/retr0h/osapi/internal/job"
"github.com/retr0h/osapi/internal/provider/network/dns"
@@ -401,6 +402,10 @@ func (c *Client) ListAgents(
agents := make([]job.AgentInfo, 0, len(keys))
for _, key := range keys {
+ if !strings.HasPrefix(key, "agents.") {
+ continue
+ }
+
entry, err := c.registryKV.Get(ctx, key)
if err != nil {
continue
@@ -413,6 +418,8 @@ func (c *Client) ListAgents(
info := agentInfoFromRegistration(®)
c.mergeFacts(ctx, &info)
+ c.overlayDrainState(ctx, &info)
+
agents = append(agents, info)
}
@@ -441,6 +448,13 @@ func (c *Client) GetAgent(
info := agentInfoFromRegistration(®)
c.mergeFacts(ctx, &info)
+ c.overlayDrainState(ctx, &info)
+
+ timeline, err := c.GetAgentTimeline(ctx, hostname)
+ if err == nil && len(timeline) > 0 {
+ info.Timeline = timeline
+ }
+
return &info, nil
}
@@ -489,5 +503,7 @@ func agentInfoFromRegistration(
LoadAverages: reg.LoadAverages,
MemoryStats: reg.MemoryStats,
AgentVersion: reg.AgentVersion,
+ Conditions: reg.Conditions,
+ State: reg.State,
}
}
diff --git a/internal/job/client/query_public_test.go b/internal/job/client/query_public_test.go
index 6c07b949..d5394087 100644
--- a/internal/job/client/query_public_test.go
+++ b/internal/job/client/query_public_test.go
@@ -1133,6 +1133,7 @@ func (s *QueryPublicTestSuite) TestListAgents() {
tests := []struct {
name string
setupMockKV func(*jobmocks.MockKeyValue)
+ setupStateKV func(*jobmocks.MockKeyValue)
setupMockFactsKV func(*jobmocks.MockKeyValue)
useRegistryKV bool
useFactsKV bool
@@ -1185,6 +1186,14 @@ func (s *QueryPublicTestSuite) TestListAgents() {
Get(gomock.Any(), "agents.server2").
Return(entry2, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server2").
+ Return(nil, errors.New("key not found"))
+ },
expectedCount: 2,
validateFunc: func(agents []job.AgentInfo) {
s.Equal("server1", agents[0].Hostname)
@@ -1228,6 +1237,11 @@ func (s *QueryPublicTestSuite) TestListAgents() {
Get(gomock.Any(), "agents.server2").
Return(nil, errors.New("key not found"))
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ },
expectedCount: 1,
},
{
@@ -1252,6 +1266,11 @@ func (s *QueryPublicTestSuite) TestListAgents() {
Get(gomock.Any(), "agents.server2").
Return(entry2, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ },
expectedCount: 1,
},
{
@@ -1273,6 +1292,11 @@ func (s *QueryPublicTestSuite) TestListAgents() {
Get(gomock.Any(), "agents.server1").
Return(entry, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ },
setupMockFactsKV: func(kv *jobmocks.MockKeyValue) {
factsEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
factsEntry.EXPECT().Value().Return(
@@ -1317,6 +1341,11 @@ func (s *QueryPublicTestSuite) TestListAgents() {
Get(gomock.Any(), "agents.server1").
Return(entry, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ },
expectedCount: 1,
validateFunc: func(agents []job.AgentInfo) {
s.Equal("server1", agents[0].Hostname)
@@ -1347,6 +1376,11 @@ func (s *QueryPublicTestSuite) TestListAgents() {
Get(gomock.Any(), "agents.server1").
Return(entry, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ },
setupMockFactsKV: func(kv *jobmocks.MockKeyValue) {
kv.EXPECT().
Get(gomock.Any(), "facts.server1").
@@ -1378,6 +1412,11 @@ func (s *QueryPublicTestSuite) TestListAgents() {
Get(gomock.Any(), "agents.server1").
Return(entry, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ },
setupMockFactsKV: func(kv *jobmocks.MockKeyValue) {
factsEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
factsEntry.EXPECT().Value().Return([]byte(`not valid json`))
@@ -1409,6 +1448,11 @@ func (s *QueryPublicTestSuite) TestListAgents() {
if tt.useRegistryKV {
opts.RegistryKV = registryKV
}
+ if tt.setupStateKV != nil {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ tt.setupStateKV(stateKV)
+ opts.StateKV = stateKV
+ }
if tt.useFactsKV {
factsKV := jobmocks.NewMockKeyValue(s.mockCtrl)
if tt.setupMockFactsKV != nil {
@@ -1444,6 +1488,7 @@ func (s *QueryPublicTestSuite) TestGetAgent() {
name string
hostname string
setupMockKV func(*jobmocks.MockKeyValue)
+ setupStateKV func(*jobmocks.MockKeyValue)
setupMockFactsKV func(*jobmocks.MockKeyValue)
useRegistryKV bool
useFactsKV bool
@@ -1473,6 +1518,14 @@ func (s *QueryPublicTestSuite) TestGetAgent() {
Get(gomock.Any(), "agents.server1").
Return(entry, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return(nil, errors.New("nats: no keys found"))
+ },
validateFunc: func(info *job.AgentInfo) {
s.Equal("server1", info.Hostname)
s.Equal(map[string]string{"group": "web"}, info.Labels)
@@ -1524,6 +1577,14 @@ func (s *QueryPublicTestSuite) TestGetAgent() {
Get(gomock.Any(), "agents.server1").
Return(entry, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return(nil, errors.New("nats: no keys found"))
+ },
setupMockFactsKV: func(kv *jobmocks.MockKeyValue) {
factsEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
factsEntry.EXPECT().Value().Return(
@@ -1565,6 +1626,14 @@ func (s *QueryPublicTestSuite) TestGetAgent() {
Get(gomock.Any(), "agents.server1").
Return(entry, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return(nil, errors.New("nats: no keys found"))
+ },
validateFunc: func(info *job.AgentInfo) {
s.Equal("server1", info.Hostname)
s.Empty(info.Architecture)
@@ -1591,6 +1660,14 @@ func (s *QueryPublicTestSuite) TestGetAgent() {
Get(gomock.Any(), "agents.server1").
Return(entry, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return(nil, errors.New("nats: no keys found"))
+ },
setupMockFactsKV: func(kv *jobmocks.MockKeyValue) {
kv.EXPECT().
Get(gomock.Any(), "facts.server1").
@@ -1602,6 +1679,96 @@ func (s *QueryPublicTestSuite) TestGetAgent() {
s.Empty(info.KernelVersion)
},
},
+ {
+ name: "when timeline events exist includes timeline in response",
+ hostname: "server1",
+ useRegistryKV: true,
+ setupMockKV: func(kv *jobmocks.MockKeyValue) {
+ entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry.EXPECT().Value().Return(
+ []byte(
+ `{"hostname":"server1","registered_at":"2026-01-01T00:00:00Z"}`,
+ ),
+ )
+ kv.EXPECT().
+ Get(gomock.Any(), "agents.server1").
+ Return(entry, nil)
+ },
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+
+ // GetAgentTimeline calls Keys then Get for matching keys
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return([]string{
+ "agents.server1",
+ "timeline.server1.drain.1000000000",
+ "timeline.server1.undrain.2000000000",
+ }, nil)
+
+ drainEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ drainEntry.EXPECT().Value().Return(
+ []byte(
+ `{"timestamp":"2026-01-01T01:00:00Z","event":"drain","hostname":"server1","message":"node draining"}`,
+ ),
+ )
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.drain.1000000000").
+ Return(drainEntry, nil)
+
+ undrainEntry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ undrainEntry.EXPECT().Value().Return(
+ []byte(
+ `{"timestamp":"2026-01-01T02:00:00Z","event":"undrain","hostname":"server1","message":"node undrained"}`,
+ ),
+ )
+ kv.EXPECT().
+ Get(gomock.Any(), "timeline.server1.undrain.2000000000").
+ Return(undrainEntry, nil)
+ },
+ validateFunc: func(info *job.AgentInfo) {
+ s.Equal("server1", info.Hostname)
+ s.Len(info.Timeline, 2)
+ s.Equal("drain", info.Timeline[0].Event)
+ s.Equal("node draining", info.Timeline[0].Message)
+ s.Equal("undrain", info.Timeline[1].Event)
+ s.Equal("node undrained", info.Timeline[1].Message)
+ },
+ },
+ {
+ name: "when conditions and state set includes them in response",
+ hostname: "server1",
+ useRegistryKV: true,
+ setupMockKV: func(kv *jobmocks.MockKeyValue) {
+ entry := jobmocks.NewMockKeyValueEntry(s.mockCtrl)
+ entry.EXPECT().Value().Return(
+ []byte(
+ `{"hostname":"server1","registered_at":"2026-01-01T00:00:00Z","state":"Draining","conditions":[{"type":"DiskPressure","status":true,"reason":"disk usage 92%","last_transition_time":"2026-01-01T00:00:00Z"}]}`,
+ ),
+ )
+ kv.EXPECT().
+ Get(gomock.Any(), "agents.server1").
+ Return(entry, nil)
+ },
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ kv.EXPECT().
+ Keys(gomock.Any()).
+ Return(nil, errors.New("nats: no keys found"))
+ },
+ validateFunc: func(info *job.AgentInfo) {
+ s.Equal("server1", info.Hostname)
+ s.Equal("Draining", info.State)
+ s.Len(info.Conditions, 1)
+ s.Equal("DiskPressure", info.Conditions[0].Type)
+ s.True(info.Conditions[0].Status)
+ s.Equal("disk usage 92%", info.Conditions[0].Reason)
+ },
+ },
}
for _, tt := range tests {
@@ -1618,6 +1785,11 @@ func (s *QueryPublicTestSuite) TestGetAgent() {
if tt.useRegistryKV {
opts.RegistryKV = registryKV
}
+ if tt.setupStateKV != nil {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ tt.setupStateKV(stateKV)
+ opts.StateKV = stateKV
+ }
if tt.useFactsKV {
factsKV := jobmocks.NewMockKeyValue(s.mockCtrl)
if tt.setupMockFactsKV != nil {
@@ -1730,6 +1902,7 @@ func (s *QueryPublicTestSuite) TestQueryNodeDiskBroadcast() {
timeout time.Duration
opts *publishAndCollectMockOpts
setupRegistryKV func(*jobmocks.MockKeyValue)
+ setupStateKV func(*jobmocks.MockKeyValue)
expectError bool
errorContains string
expectedCount int
@@ -1771,6 +1944,14 @@ func (s *QueryPublicTestSuite) TestQueryNodeDiskBroadcast() {
Get(gomock.Any(), "agents.server2").
Return(entry2, nil)
},
+ setupStateKV: func(kv *jobmocks.MockKeyValue) {
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server1").
+ Return(nil, errors.New("key not found"))
+ kv.EXPECT().
+ Get(gomock.Any(), "drain.server2").
+ Return(nil, errors.New("key not found"))
+ },
expectedCount: 2,
},
{
@@ -1834,6 +2015,12 @@ func (s *QueryPublicTestSuite) TestQueryNodeDiskBroadcast() {
opts.RegistryKV = mockRegistryKV
}
+ if tt.setupStateKV != nil {
+ stateKV := jobmocks.NewMockKeyValue(s.mockCtrl)
+ tt.setupStateKV(stateKV)
+ opts.StateKV = stateKV
+ }
+
jobsClient, err := client.New(slog.Default(), s.mockNATSClient, opts)
s.Require().NoError(err)
diff --git a/internal/job/client/types.go b/internal/job/client/types.go
index 4da2e789..5723b5dc 100644
--- a/internal/job/client/types.go
+++ b/internal/job/client/types.go
@@ -228,6 +228,30 @@ type JobClient interface {
hostname string,
) (*job.AgentInfo, error)
+ // Agent timeline
+ WriteAgentTimelineEvent(
+ ctx context.Context,
+ hostname, event, message string,
+ ) error
+ GetAgentTimeline(
+ ctx context.Context,
+ hostname string,
+ ) ([]job.TimelineEvent, error)
+
+ // Agent drain flag
+ CheckDrainFlag(
+ ctx context.Context,
+ hostname string,
+ ) bool
+ SetDrainFlag(
+ ctx context.Context,
+ hostname string,
+ ) error
+ DeleteDrainFlag(
+ ctx context.Context,
+ hostname string,
+ ) error
+
// Job deletion
DeleteJob(
ctx context.Context,
diff --git a/internal/job/mocks/job_client.gen.go b/internal/job/mocks/job_client.gen.go
index 2ea4cb88..60a0265d 100644
--- a/internal/job/mocks/job_client.gen.go
+++ b/internal/job/mocks/job_client.gen.go
@@ -44,6 +44,20 @@ func (m *MockJobClient) EXPECT() *MockJobClientMockRecorder {
return m.recorder
}
+// CheckDrainFlag mocks base method.
+func (m *MockJobClient) CheckDrainFlag(arg0 context.Context, arg1 string) bool {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "CheckDrainFlag", arg0, arg1)
+ ret0, _ := ret[0].(bool)
+ return ret0
+}
+
+// CheckDrainFlag indicates an expected call of CheckDrainFlag.
+func (mr *MockJobClientMockRecorder) CheckDrainFlag(arg0, arg1 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CheckDrainFlag", reflect.TypeOf((*MockJobClient)(nil).CheckDrainFlag), arg0, arg1)
+}
+
// ConsumeJobs mocks base method.
func (m *MockJobClient) ConsumeJobs(arg0 context.Context, arg1, arg2 string, arg3 func(jetstream.Msg) error, arg4 *client.ConsumeOptions) error {
m.ctrl.T.Helper()
@@ -87,6 +101,20 @@ func (mr *MockJobClientMockRecorder) CreateOrUpdateConsumer(arg0, arg1, arg2 int
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "CreateOrUpdateConsumer", reflect.TypeOf((*MockJobClient)(nil).CreateOrUpdateConsumer), arg0, arg1, arg2)
}
+// DeleteDrainFlag mocks base method.
+func (m *MockJobClient) DeleteDrainFlag(arg0 context.Context, arg1 string) error {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "DeleteDrainFlag", arg0, arg1)
+ ret0, _ := ret[0].(error)
+ return ret0
+}
+
+// DeleteDrainFlag indicates an expected call of DeleteDrainFlag.
+func (mr *MockJobClientMockRecorder) DeleteDrainFlag(arg0, arg1 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "DeleteDrainFlag", reflect.TypeOf((*MockJobClient)(nil).DeleteDrainFlag), arg0, arg1)
+}
+
// DeleteJob mocks base method.
func (m *MockJobClient) DeleteJob(arg0 context.Context, arg1 string) error {
m.ctrl.T.Helper()
@@ -116,6 +144,21 @@ func (mr *MockJobClientMockRecorder) GetAgent(arg0, arg1 interface{}) *gomock.Ca
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAgent", reflect.TypeOf((*MockJobClient)(nil).GetAgent), arg0, arg1)
}
+// GetAgentTimeline mocks base method.
+func (m *MockJobClient) GetAgentTimeline(arg0 context.Context, arg1 string) ([]job.TimelineEvent, error) {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "GetAgentTimeline", arg0, arg1)
+ ret0, _ := ret[0].([]job.TimelineEvent)
+ ret1, _ := ret[1].(error)
+ return ret0, ret1
+}
+
+// GetAgentTimeline indicates an expected call of GetAgentTimeline.
+func (mr *MockJobClientMockRecorder) GetAgentTimeline(arg0, arg1 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetAgentTimeline", reflect.TypeOf((*MockJobClient)(nil).GetAgentTimeline), arg0, arg1)
+}
+
// GetJobData mocks base method.
func (m *MockJobClient) GetJobData(arg0 context.Context, arg1 string) ([]byte, error) {
m.ctrl.T.Helper()
@@ -763,6 +806,34 @@ func (mr *MockJobClientMockRecorder) RetryJob(arg0, arg1, arg2 interface{}) *gom
return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "RetryJob", reflect.TypeOf((*MockJobClient)(nil).RetryJob), arg0, arg1, arg2)
}
+// SetDrainFlag mocks base method.
+func (m *MockJobClient) SetDrainFlag(arg0 context.Context, arg1 string) error {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "SetDrainFlag", arg0, arg1)
+ ret0, _ := ret[0].(error)
+ return ret0
+}
+
+// SetDrainFlag indicates an expected call of SetDrainFlag.
+func (mr *MockJobClientMockRecorder) SetDrainFlag(arg0, arg1 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "SetDrainFlag", reflect.TypeOf((*MockJobClient)(nil).SetDrainFlag), arg0, arg1)
+}
+
+// WriteAgentTimelineEvent mocks base method.
+func (m *MockJobClient) WriteAgentTimelineEvent(arg0 context.Context, arg1, arg2, arg3 string) error {
+ m.ctrl.T.Helper()
+ ret := m.ctrl.Call(m, "WriteAgentTimelineEvent", arg0, arg1, arg2, arg3)
+ ret0, _ := ret[0].(error)
+ return ret0
+}
+
+// WriteAgentTimelineEvent indicates an expected call of WriteAgentTimelineEvent.
+func (mr *MockJobClientMockRecorder) WriteAgentTimelineEvent(arg0, arg1, arg2, arg3 interface{}) *gomock.Call {
+ mr.mock.ctrl.T.Helper()
+ return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "WriteAgentTimelineEvent", reflect.TypeOf((*MockJobClient)(nil).WriteAgentTimelineEvent), arg0, arg1, arg2, arg3)
+}
+
// WriteJobResponse mocks base method.
func (m *MockJobClient) WriteJobResponse(arg0 context.Context, arg1, arg2 string, arg3 []byte, arg4, arg5 string, arg6 *bool) error {
m.ctrl.T.Helper()
diff --git a/internal/job/subjects.go b/internal/job/subjects.go
index b6bab149..0085bf64 100644
--- a/internal/job/subjects.go
+++ b/internal/job/subjects.go
@@ -339,10 +339,19 @@ func CountExpectedAgents(
switch routingType {
case BroadcastHost:
- return len(agents)
+ count := 0
+ for i := range agents {
+ if agents[i].State != AgentStateCordoned && agents[i].State != AgentStateDraining {
+ count++
+ }
+ }
+ return count
case "label":
count := 0
for i := range agents {
+ if agents[i].State == AgentStateCordoned || agents[i].State == AgentStateDraining {
+ continue
+ }
if agentVal, ok := agents[i].Labels[key]; ok {
if agentVal == value || strings.HasPrefix(agentVal, value+".") {
count++
diff --git a/internal/job/subjects_public_test.go b/internal/job/subjects_public_test.go
index 55b0cc6a..b16315ea 100644
--- a/internal/job/subjects_public_test.go
+++ b/internal/job/subjects_public_test.go
@@ -956,6 +956,51 @@ func (suite *SubjectsPublicTestSuite) TestCountExpectedAgents() {
target: "_any",
want: 0,
},
+ {
+ name: "when _all excludes cordoned agents",
+ agents: []job.AgentInfo{
+ {Hostname: "web-01"},
+ {Hostname: "web-02", State: job.AgentStateCordoned},
+ {Hostname: "web-03"},
+ },
+ target: "_all",
+ want: 2,
+ },
+ {
+ name: "when _all excludes draining agents",
+ agents: []job.AgentInfo{
+ {Hostname: "web-01"},
+ {Hostname: "web-02", State: job.AgentStateDraining},
+ },
+ target: "_all",
+ want: 1,
+ },
+ {
+ name: "when label match excludes cordoned agents",
+ agents: []job.AgentInfo{
+ {
+ Hostname: "web-01",
+ Labels: map[string]string{"group": "web.dev"},
+ State: job.AgentStateCordoned,
+ },
+ {Hostname: "web-02", Labels: map[string]string{"group": "web.dev"}},
+ },
+ target: "group:web",
+ want: 1,
+ },
+ {
+ name: "when label match excludes draining agents",
+ agents: []job.AgentInfo{
+ {
+ Hostname: "web-01",
+ Labels: map[string]string{"group": "web.dev"},
+ State: job.AgentStateDraining,
+ },
+ {Hostname: "web-02", Labels: map[string]string{"group": "web.dev"}},
+ },
+ target: "group:web",
+ want: 1,
+ },
}
for _, tt := range tests {
diff --git a/internal/job/types.go b/internal/job/types.go
index 77c87403..753a9be3 100644
--- a/internal/job/types.go
+++ b/internal/job/types.go
@@ -270,6 +270,28 @@ type FactsRegistration struct {
Facts map[string]any `json:"facts,omitempty"`
}
+// Condition type constants.
+const (
+ ConditionMemoryPressure = "MemoryPressure"
+ ConditionHighLoad = "HighLoad"
+ ConditionDiskPressure = "DiskPressure"
+)
+
+// Agent state constants.
+const (
+ AgentStateReady = "Ready"
+ AgentStateDraining = "Draining"
+ AgentStateCordoned = "Cordoned"
+)
+
+// Condition represents a node condition evaluated agent-side.
+type Condition struct {
+ Type string `json:"type"`
+ Status bool `json:"status"`
+ Reason string `json:"reason,omitempty"`
+ LastTransitionTime time.Time `json:"last_transition_time"`
+}
+
// AgentRegistration represents an agent's registration entry in the KV registry.
type AgentRegistration struct {
// Hostname is the hostname of the agent.
@@ -290,6 +312,10 @@ type AgentRegistration struct {
MemoryStats *mem.Stats `json:"memory_stats,omitempty"`
// AgentVersion is the version of the agent binary.
AgentVersion string `json:"agent_version,omitempty"`
+ // Conditions contains the evaluated node conditions.
+ Conditions []Condition `json:"conditions,omitempty"`
+ // State is the agent's scheduling state (Ready, Draining, Cordoned).
+ State string `json:"state,omitempty"`
}
// AgentInfo represents information about an active agent.
@@ -328,6 +354,12 @@ type AgentInfo struct {
Interfaces []NetworkInterface `json:"interfaces,omitempty"`
// Facts contains arbitrary key-value facts collected by the agent.
Facts map[string]any `json:"facts,omitempty"`
+ // Conditions contains the evaluated node conditions.
+ Conditions []Condition `json:"conditions,omitempty"`
+ // State is the agent's scheduling state (Ready, Draining, Cordoned).
+ State string `json:"state,omitempty"`
+ // Timeline contains the chronological sequence of state transition events.
+ Timeline []TimelineEvent `json:"timeline,omitempty"`
}
// NodeDiskResponse represents the response for node.disk.get operations.
diff --git a/internal/provider/node/mem/darwin_get_vm.go b/internal/provider/node/mem/darwin_get_vm.go
index 819dd072..6df705a1 100644
--- a/internal/provider/node/mem/darwin_get_vm.go
+++ b/internal/provider/node/mem/darwin_get_vm.go
@@ -30,8 +30,9 @@ func (d *Darwin) GetStats() (*Stats, error) {
}
return &Stats{
- Total: memInfo.Total,
- Free: memInfo.Free,
- Cached: memInfo.Cached,
+ Total: memInfo.Total,
+ Available: memInfo.Available,
+ Free: memInfo.Free,
+ Cached: memInfo.Cached,
}, nil
}
diff --git a/internal/provider/node/mem/types.go b/internal/provider/node/mem/types.go
index 2f242555..8a592d80 100644
--- a/internal/provider/node/mem/types.go
+++ b/internal/provider/node/mem/types.go
@@ -30,6 +30,8 @@ type Provider interface {
type Stats struct {
// Total memory in bytes
Total uint64
+ // Available memory in bytes (free + reclaimable)
+ Available uint64
// Free memory in bytes
Free uint64
// Cached memory in bytes
diff --git a/internal/provider/node/mem/ubuntu_get_vm.go b/internal/provider/node/mem/ubuntu_get_vm.go
index ae4e1982..84411367 100644
--- a/internal/provider/node/mem/ubuntu_get_vm.go
+++ b/internal/provider/node/mem/ubuntu_get_vm.go
@@ -30,8 +30,9 @@ func (u *Ubuntu) GetStats() (*Stats, error) {
}
return &Stats{
- Total: memInfo.Total,
- Free: memInfo.Free,
- Cached: memInfo.Cached,
+ Total: memInfo.Total,
+ Available: memInfo.Available,
+ Free: memInfo.Free,
+ Cached: memInfo.Cached,
}, nil
}