From 23aebabfb817a3b550a6a9e455d3d2e1dc8155b6 Mon Sep 17 00:00:00 2001 From: Fredrik Skogman Date: Tue, 17 Feb 2026 10:49:58 +0100 Subject: [PATCH] Update the cache semantics. The current logic is wrong. If the controller is restarted, deployments that happened before it started will never be deleted. This PR changes the logic to only check for existing entries to prevent duplicate calls, rather than requiring an entry to exist in order to trigger a delete operation. --- go.mod | 4 +-- go.sum | 2 -- internal/controller/controller.go | 50 ++++++++++++++++++++----------- 3 files changed, 35 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index b5dea87..889361b 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,9 @@ module github.com/github/deployment-tracker go 1.25.4 require ( + github.com/bradleyfalzon/ghinstallation/v2 v2.17.0 github.com/prometheus/client_golang v1.23.2 + golang.org/x/time v0.14.0 k8s.io/api v0.35.0 k8s.io/apimachinery v0.35.0 k8s.io/client-go v0.35.0 @@ -11,7 +13,6 @@ require ( github.com/beorn7/perks v1.0.1 // indirect - github.com/bradleyfalzon/ghinstallation/v2 v2.17.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/davecgh/go-spew v1.1.1 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect @@ -45,7 +46,6 @@ require ( golang.org/x/sys v0.38.0 // indirect golang.org/x/term v0.37.0 // indirect golang.org/x/text v0.31.0 // indirect - golang.org/x/time v0.14.0 // indirect google.golang.org/protobuf v1.36.8 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index a94e27b..52a5a1f 100644 --- a/go.sum +++ b/go.sum @@ -118,8 +118,6 @@ golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= 
-golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/time v0.14.0 h1:MRx4UaLrDotUKUdCIqzPC48t1Y9hANFKIRpNx+Te8PI= golang.org/x/time v0.14.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= diff --git a/internal/controller/controller.go b/internal/controller/controller.go index 9ac951b..1104cea 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -7,7 +7,6 @@ import ( "log/slog" "slices" "strings" - "sync" "time" "github.com/github/deployment-tracker/pkg/deploymentrecord" @@ -15,6 +14,7 @@ import ( "github.com/github/deployment-tracker/pkg/metrics" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" + amcache "k8s.io/apimachinery/pkg/util/cache" "k8s.io/client-go/metadata" corev1 "k8s.io/api/core/v1" @@ -37,6 +37,12 @@ const ( RuntimeRiskAnnotationKey = "github.com/runtime-risks" ) +type ttlCache interface { + Get(k any) (any, bool) + Set(k any, v any, ttl time.Duration) + Delete(k any) +} + // PodEvent represents a pod event to be processed. type PodEvent struct { Key string @@ -60,7 +66,7 @@ type Controller struct { // best effort cache to avoid redundant posts // post requests are idempotent, so if this cache fails due to // restarts or other events, nothing will break. - observedDeployments sync.Map + observedDeployments ttlCache } // New creates a new deployment tracker controller. 
@@ -96,12 +102,13 @@ func New(clientset kubernetes.Interface, metadataClient metadata.Interface, name } cntrl := &Controller{ - clientset: clientset, - metadataClient: metadataClient, - podInformer: podInformer, - workqueue: queue, - apiClient: apiClient, - cfg: cfg, + clientset: clientset, + metadataClient: metadataClient, + podInformer: podInformer, + workqueue: queue, + apiClient: apiClient, + cfg: cfg, + observedDeployments: amcache.NewExpiring(), } // Add event handlers to the informer @@ -395,6 +402,8 @@ func (c *Controller) deploymentExists(ctx context.Context, namespace, name strin // recordContainer records a single container's deployment info. func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, container corev1.Container, status, eventType string, runtimeRisks []deploymentrecord.RuntimeRisk) error { + var cacheKey string + dn := getARDeploymentName(pod, container, c.cfg.Template) digest := getContainerDigest(pod, container.Name) @@ -409,12 +418,11 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta return nil } - cacheKey := getCacheKey(dn, digest) - // Check if we've already recorded this deployment switch status { case deploymentrecord.StatusDeployed: - if _, exists := c.observedDeployments.Load(cacheKey); exists { + cacheKey = getCacheKey(EventCreated, dn, digest) + if _, exists := c.observedDeployments.Get(cacheKey); exists { slog.Debug("Deployment already observed, skipping post", "deployment_name", dn, "digest", digest, @@ -422,9 +430,9 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta return nil } case deploymentrecord.StatusDecommissioned: - // For delete, check if we've seen it - if not, no need to decommission - if _, exists := c.observedDeployments.Load(cacheKey); !exists { - slog.Debug("Deployment not in cache, skipping decommission", + cacheKey = getCacheKey(EventDeleted, dn, digest) + if _, exists := c.observedDeployments.Get(cacheKey); exists { + 
slog.Debug("Deployment already deleted, skipping post", "deployment_name", dn, "digest", digest, ) @@ -488,8 +496,16 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta // Update cache after successful post switch status { case deploymentrecord.StatusDeployed: - c.observedDeployments.Store(cacheKey, true) + cacheKey = getCacheKey(EventCreated, dn, digest) + c.observedDeployments.Set(cacheKey, true, 2*time.Minute) + // If there was a previous delete event, remove that + cacheKey = getCacheKey(EventDeleted, dn, digest) + c.observedDeployments.Delete(cacheKey) case deploymentrecord.StatusDecommissioned: + cacheKey = getCacheKey(EventDeleted, dn, digest) + c.observedDeployments.Set(cacheKey, true, 2*time.Minute) + // If there was a previous created event, remove that + cacheKey = getCacheKey(EventCreated, dn, digest) c.observedDeployments.Delete(cacheKey) default: return fmt.Errorf("invalid status: %s", status) @@ -586,8 +602,8 @@ func (c *Controller) getOwnerMetadata(ctx context.Context, namespace string, own return obj, nil } -func getCacheKey(dn, digest string) string { - return dn + "||" + digest +func getCacheKey(ev, dn, digest string) string { + return ev + "||" + dn + "||" + digest } // createInformerFactory creates a shared informer factory with the given resync period.