diff --git a/Dockerfile b/Dockerfile index 1b76f93..76802ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # 1.25.5-alpine3.23 -FROM golang@sha256:26111811bc967321e7b6f852e914d14bede324cd1accb7f81811929a6a57fea9 AS builder +FROM golang@sha256:d9b2e14101f27ec8d09674cd01186798d227bb0daec90e032aeb1cd22ac0f029 AS builder WORKDIR /app COPY go.mod go.sum ./ diff --git a/README.md b/README.md index b4e4df5..28fa011 100644 --- a/README.md +++ b/README.md @@ -154,6 +154,8 @@ The metrics exposed beyond the default Prometheus metrics are: * `deptracker_post_record_hard_fail`: the number of failures to persist a record via the HTTP API (either an irrecoverable error or all retries are exhausted). +* `deptracker_post_record_client_error`: the number of client errors (HTTP 4xx + other than 429); these are never retried or reprocessed. ## License diff --git a/internal/controller/controller.go b/internal/controller/controller.go index 9a0495c..97ed8ec 100644 --- a/internal/controller/controller.go +++ b/internal/controller/controller.go @@ -323,6 +323,20 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta ) if err := c.apiClient.PostOne(ctx, record); err != nil { + // Client errors are non-retryable: log a warning and return nil instead of the error. + var clientErr *deploymentrecord.ClientError + if errors.As(err, &clientErr) { + slog.Warn("Failed to post record", + "event_type", eventType, + "name", record.Name, + "deployment_name", record.DeploymentName, + "status", record.Status, + "digest", record.Digest, + "error", err, + ) + return nil + } + slog.Error("Failed to post record", "event_type", eventType, "name", record.Name, diff --git a/pkg/deploymentrecord/client.go b/pkg/deploymentrecord/client.go index d71cf13..4c2dd93 100644 --- a/pkg/deploymentrecord/client.go +++ b/pkg/deploymentrecord/client.go @@ -65,6 +65,19 @@ func WithAPIToken(token string) ClientOption { } } +// ClientError represents a client error that cannot be retried. 
+type ClientError struct { + err error +} + +func (c *ClientError) Error() string { + return fmt.Sprintf("client_error: %v", c.err) +} + +func (c *ClientError) Unwrap() error { + return c.err +} + // PostOne posts a single deployment record to the GitHub deployment // records API. func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error { @@ -129,11 +142,11 @@ func (c *Client) PostOne(ctx context.Context, record *DeploymentRecord) error { // Don't retry on client errors (4xx) except for 429 // (rate limit) if resp.StatusCode >= 400 && resp.StatusCode < 500 && resp.StatusCode != 429 { - metrics.PostDeploymentRecordHardFail.Inc() - slog.Error("irrecoverable error, aborting", + metrics.PostDeploymentRecordClientError.Inc() + slog.Warn("client error, aborting", "attempt", attempt, "error", lastErr) - return lastErr + return &ClientError{err: lastErr} } metrics.PostDeploymentRecordSoftFail.Inc() } diff --git a/pkg/metrics/prom.go b/pkg/metrics/prom.go index cdcb51d..3d28cb4 100644 --- a/pkg/metrics/prom.go +++ b/pkg/metrics/prom.go @@ -64,4 +64,12 @@ var ( Help: "The total number of hard post failures", }, ) + + //nolint: revive + PostDeploymentRecordClientError = promauto.NewCounter( + prometheus.CounterOpts{ + Name: "deptracker_post_record_client_error", + Help: "The total number of non-retryable client failures", + }, + ) )