Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ All notable changes to this project will be documented in this file.

### Changes

- Device agents
- Reduce agent CPU usage: the agent still fetches the full config from the controller every 5 seconds, but only applies it to the device when the config has changed or after a 60-second timeout

## [v0.14.0](https://github.com/malbeclabs/doublezero/compare/client/v0.13.0...client/v0.14.0) - 2026-03-24

### Breaking
Expand Down
92 changes: 65 additions & 27 deletions controlplane/agent/cmd/agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ package main

import (
"context"
"crypto/sha256"
"encoding/hex"
"flag"
"fmt"
"log"
Expand All @@ -20,52 +22,51 @@ import (
)

var (
localDevicePubkey = flag.String("pubkey", "frtyt4WKYudUpqTsvJzwN6Bd4btYxrkaYNhBNAaUVGWn", "This device's public key on the doublezero network")
controllerAddress = flag.String("controller", "18.116.166.35:7000", "The DoubleZero controller IP address and port to connect to")
device = flag.String("device", "127.0.0.1:9543", "IP Address and port of the Arist EOS API. Should always be the local switch at 127.0.0.1:9543.")
sleepIntervalInSeconds = flag.Float64("sleep-interval-in-seconds", 5, "How long to sleep in between polls")
controllerTimeoutInSeconds = flag.Float64("controller-timeout-in-seconds", 30, "How long to wait for a response from the controller before giving up")
maxLockAge = flag.Int("max-lock-age-in-seconds", 3600, "If agent detects a config lock that older than the specified age, it will force unlock.")
verbose = flag.Bool("verbose", false, "Enable verbose logging")
showVersion = flag.Bool("version", false, "Print the version of the doublezero-agent and exit")
metricsEnable = flag.Bool("metrics-enable", false, "Enable prometheus metrics")
metricsAddr = flag.String("metrics-addr", ":8080", "Address to listen on for prometheus metrics")
localDevicePubkey = flag.String("pubkey", "frtyt4WKYudUpqTsvJzwN6Bd4btYxrkaYNhBNAaUVGWn", "This device's public key on the doublezero network")
controllerAddress = flag.String("controller", "18.116.166.35:7000", "The DoubleZero controller IP address and port to connect to")
device = flag.String("device", "127.0.0.1:9543", "IP Address and port of the Arist EOS API. Should always be the local switch at 127.0.0.1:9543.")
sleepIntervalInSeconds = flag.Float64("sleep-interval-in-seconds", 5, "How long to sleep in between polls")
controllerTimeoutInSeconds = flag.Float64("controller-timeout-in-seconds", 30, "How long to wait for a response from the controller before giving up")
// Upper bound, in seconds, on how long an unchanged config may go without being re-applied to the device.
configCacheTimeoutInSeconds = flag.Int("config-cache-timeout-in-seconds", 60, "Force re-applying the config after this many seconds, even if its hash is unchanged (the config is still fetched on every poll)")
maxLockAge = flag.Int("max-lock-age-in-seconds", 3600, "If agent detects a config lock that older than the specified age, it will force unlock.")
verbose = flag.Bool("verbose", false, "Enable verbose logging")
showVersion = flag.Bool("version", false, "Print the version of the doublezero-agent and exit")
metricsEnable = flag.Bool("metrics-enable", false, "Enable prometheus metrics")
metricsAddr = flag.String("metrics-addr", ":8080", "Address to listen on for prometheus metrics")

// set by LDFLAGS
version = "dev"
commit = "none"
date = "unknown"
)

func pollControllerAndConfigureDevice(ctx context.Context, dzclient pb.ControllerClient, eapiClient *arista.EAPIClient, pubkey string, verbose *bool, maxLockAge int, agentVersion string, agentCommit string, agentDate string) error {
var err error

// The dz controller needs to know what BGP sessions we have configured locally
var neighborIpMap map[string][]string
neighborIpMap, err = eapiClient.GetBgpNeighbors(ctx)
if err != nil {
log.Println("pollControllerAndConfigureDevice: eapiClient.GetBgpNeighbors returned error:", err)
agent.ErrorsBgpNeighbors.Inc()
}
// computeChecksum returns the SHA-256 digest of data, rendered as a
// lowercase hexadecimal string.
func computeChecksum(data string) string {
	digest := sha256.New()
	// hash.Hash.Write never returns an error, so the result is not checked.
	digest.Write([]byte(data))
	return hex.EncodeToString(digest.Sum(nil))
}

var configText string
// fetchConfigFromController requests this device's config from the controller
// via agent.GetConfigFromServer and returns the config text together with its
// checksum as produced by computeChecksum.
//
// neighborIpMap is forwarded to the controller unchanged. If the request
// fails, the error is logged, agent.ErrorsGetConfig is incremented, and both
// returned strings are empty. When verbose is non-nil and true, the received
// config is written to the log.
func fetchConfigFromController(ctx context.Context, dzclient pb.ControllerClient, pubkey string, neighborIpMap map[string][]string, verbose *bool, agentVersion string, agentCommit string, agentDate string) (configText string, configHash string, err error) {
	configText, err = agent.GetConfigFromServer(ctx, dzclient, pubkey, neighborIpMap, controllerTimeoutInSeconds, agentVersion, agentCommit, agentDate)
	if err != nil {
		log.Printf("fetchConfigFromController failed to call agent.GetConfigFromServer: %q", err)
		agent.ErrorsGetConfig.Inc()
		// Do not hand back a partially populated result alongside an error.
		return "", "", err
	}

	// Guard the pointer so a caller passing nil gets quiet logging, not a panic.
	if verbose != nil && *verbose {
		log.Printf("controller returned the following config: '%s'", configText)
	}

	configHash = computeChecksum(configText)
	return configText, configHash, nil
}

func applyConfig(ctx context.Context, eapiClient *arista.EAPIClient, configText string, maxLockAge int) error {
if configText == "" {
// Controller returned empty config
return nil
}

_, err = eapiClient.AddConfigToDevice(ctx, configText, nil, maxLockAge) // 3rd arg (diffCmd) is only used for testing
_, err := eapiClient.AddConfigToDevice(ctx, configText, nil, maxLockAge)
if err != nil {
agent.ErrorsApplyConfig.Inc()
return err
Expand Down Expand Up @@ -121,15 +122,52 @@ func main() {
client := aristapb.NewEapiMgrServiceClient(clientConn)
eapiClient = arista.NewEAPIClient(slog.Default(), client)

var cachedConfigHash string
var configCacheTime time.Time
configCacheTimeout := time.Duration(*configCacheTimeoutInSeconds) * time.Second

for {
select {
case <-ctx.Done():
return
case <-ticker.C:
err := pollControllerAndConfigureDevice(ctx, dzclient, eapiClient, *localDevicePubkey, verbose, *maxLockAge, version, commit, date)
neighborIpMap, err := eapiClient.GetBgpNeighbors(ctx)
if err != nil {
log.Println("ERROR: eapiClient.GetBgpNeighbors returned", err)
agent.ErrorsBgpNeighbors.Inc()
}

// Fetch config every 5 seconds
configText, configHash, err := fetchConfigFromController(ctx, dzclient, *localDevicePubkey, neighborIpMap, verbose, version, commit, date)
if err != nil {
log.Println("ERROR: fetchConfigFromController returned", err)
continue
}

// Only apply if config changed or timeout elapsed
shouldApply := false
if cachedConfigHash == "" {
// First run
shouldApply = true
} else if configHash != cachedConfigHash {
// Config changed
shouldApply = true
} else if time.Since(configCacheTime) >= configCacheTimeout {
// Force apply after timeout
shouldApply = true
}

if !shouldApply {
continue
}

err = applyConfig(ctx, eapiClient, configText, *maxLockAge)
if err != nil {
log.Println("ERROR: pollAndConfigureDevice returned", err)
log.Println("ERROR: applyConfig returned", err)
continue
}
cachedConfigHash = configHash
configCacheTime = time.Now()
}
}
}
Loading
Loading