diff --git a/Makefile b/Makefile index 68d5f781..2b9ae194 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,9 @@ UPSTREAM_SRCS = $(SERVER_DIR)/upstream_connection.cc $(SERVER_DIR)/pool_partitio # Rate limit layer sources RATE_LIMIT_SRCS = $(SERVER_DIR)/token_bucket.cc $(SERVER_DIR)/rate_limit_zone.cc $(SERVER_DIR)/rate_limiter.cc +# Circuit breaker layer sources +CIRCUIT_BREAKER_SRCS = $(SERVER_DIR)/circuit_breaker_window.cc $(SERVER_DIR)/circuit_breaker_slice.cc $(SERVER_DIR)/retry_budget.cc $(SERVER_DIR)/circuit_breaker_host.cc $(SERVER_DIR)/circuit_breaker_manager.cc + # CLI layer sources CLI_SRCS = $(SERVER_DIR)/cli_parser.cc $(SERVER_DIR)/signal_handler.cc $(SERVER_DIR)/pid_file.cc $(SERVER_DIR)/daemonizer.cc @@ -122,7 +125,7 @@ NGHTTP2_SRC = $(THIRD_PARTY_DIR)/nghttp2/nghttp2_alpn.c \ NGHTTP2_OBJ = $(NGHTTP2_SRC:.c=.o) # Server library sources (shared between test and production binaries) -LIB_SRCS = $(REACTOR_SRCS) $(NETWORK_SRCS) $(SERVER_SRCS) $(THREAD_POOL_SRCS) $(FOUNDATION_SRCS) $(HTTP_SRCS) $(HTTP2_SRCS) $(WS_SRCS) $(TLS_SRCS) $(UPSTREAM_SRCS) $(RATE_LIMIT_SRCS) $(CLI_SRCS) $(UTIL_SRCS) +LIB_SRCS = $(REACTOR_SRCS) $(NETWORK_SRCS) $(SERVER_SRCS) $(THREAD_POOL_SRCS) $(FOUNDATION_SRCS) $(HTTP_SRCS) $(HTTP2_SRCS) $(WS_SRCS) $(TLS_SRCS) $(UPSTREAM_SRCS) $(RATE_LIMIT_SRCS) $(CIRCUIT_BREAKER_SRCS) $(CLI_SRCS) $(UTIL_SRCS) # Test binary sources TEST_SRCS = $(LIB_SRCS) $(TEST_DIR)/test_framework.cc $(TEST_DIR)/run_test.cc @@ -142,11 +145,12 @@ WS_HEADERS = $(LIB_DIR)/ws/websocket_connection.h $(LIB_DIR)/ws/websocket_frame. 
TLS_HEADERS = $(LIB_DIR)/tls/tls_context.h $(LIB_DIR)/tls/tls_connection.h $(LIB_DIR)/tls/tls_client_context.h UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/upstream_host_pool.h $(LIB_DIR)/upstream/pool_partition.h $(LIB_DIR)/upstream/upstream_connection.h $(LIB_DIR)/upstream/upstream_lease.h $(LIB_DIR)/upstream/upstream_http_codec.h $(LIB_DIR)/upstream/http_request_serializer.h $(LIB_DIR)/upstream/header_rewriter.h $(LIB_DIR)/upstream/retry_policy.h $(LIB_DIR)/upstream/proxy_transaction.h $(LIB_DIR)/upstream/proxy_handler.h $(LIB_DIR)/upstream/upstream_response.h $(LIB_DIR)/upstream/upstream_callbacks.h RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h +CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h 
$(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_components_test.h $(TEST_DIR)/circuit_breaker_integration_test.h $(TEST_DIR)/circuit_breaker_retry_budget_test.h $(TEST_DIR)/circuit_breaker_wait_queue_drain_test.h $(TEST_DIR)/circuit_breaker_observability_test.h $(TEST_DIR)/circuit_breaker_reload_test.h # All headers combined -HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) +HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) # Default target .DEFAULT_GOAL := all @@ -238,6 +242,11 @@ test_rate_limit: $(TARGET) @echo "Running rate limit tests only..." ./$(TARGET) rate_limit +# Run only circuit breaker tests +test_circuit_breaker: $(TARGET) + @echo "Running circuit breaker tests only..." 
+ ./$(TARGET) circuit_breaker + # Display help information help: @echo "Reactor Server C++ - Makefile Help" @@ -318,4 +327,4 @@ help: # Build only the production server binary server: $(SERVER_TARGET) -.PHONY: all clean test server test_basic test_stress test_race test_config test_http test_ws test_tls test_cli test_http2 test_upstream test_proxy test_rate_limit help +.PHONY: all clean test server test_basic test_stress test_race test_config test_http test_ws test_tls test_cli test_http2 test_upstream test_proxy test_rate_limit test_circuit_breaker help diff --git a/docs/circuit_breaker.md b/docs/circuit_breaker.md new file mode 100644 index 00000000..ef3a5ef0 --- /dev/null +++ b/docs/circuit_breaker.md @@ -0,0 +1,149 @@ +# Circuit Breaker + +Per-upstream circuit breaking for the gateway, preventing cascading failures when a backend becomes unhealthy. Follows the resilience4j three-state machine (`CLOSED` → `OPEN` → `HALF_OPEN` → `CLOSED`), trips on either consecutive-failure or failure-rate thresholds, and short-circuits checkouts with `503 Service Unavailable` while the circuit is open. A separate **retry budget** caps the fraction of concurrent upstream work that may be retries, bounding the retry-storm amplification factor even when individual retries pass the breaker gate. + +--- + +## Overview + +- **Per-dispatcher slices.** One `CircuitBreakerSlice` per dispatcher partition for each upstream. Hot-path `TryAcquire` / `Report*` calls are lock-free — each slice is dispatcher-thread-pinned. +- **Three states.** `CLOSED` = normal traffic. `OPEN` = all requests short-circuited with 503 for the exponential-backoff open duration. `HALF_OPEN` = a bounded number of probe requests are admitted to test recovery; on success, closes; on failure, re-trips with longer backoff. +- **Dual trip paths.** Either `consecutive_failures >= N` OR `failure_rate >= P%` over a sliding window (subject to `minimum_volume`). 
+- **Retry budget.** Host-level cap: `max(retry_budget_min_concurrency, (in_flight - retries_in_flight) * retry_budget_percent / 100)`. Retries that exceed the cap receive `503` + `X-Retry-Budget-Exhausted: 1` instead of going to the upstream. +- **Wait-queue drain on trip.** On every `CLOSED → OPEN` transition, the corresponding pool partition's wait queue is drained immediately with `503 + X-Circuit-Breaker: open` — queued waiters don't have to wait out the full open window. +- **Dry-run mode.** `dry_run=true` computes decisions and logs them, but still admits traffic. Useful for staging a breaker in production without risk. +- **Hot-reload.** Breaker-field edits (thresholds, window, probe budget, retry budget tuning, enabled toggle) apply live on SIGHUP — no restart required. Topology edits (host/port/pool/proxy/tls) still require a restart. + +--- + +## Configuration + +Each `upstream` entry accepts a nested `circuit_breaker` block: + +```json +{ + "upstreams": [ + { + "name": "orders", + "host": "orders-backend", + "port": 8080, + "circuit_breaker": { + "enabled": true, + "dry_run": false, + "consecutive_failure_threshold": 5, + "failure_rate_threshold": 50, + "minimum_volume": 20, + "window_seconds": 10, + "permitted_half_open_calls": 3, + "base_open_duration_ms": 5000, + "max_open_duration_ms": 60000, + "retry_budget_percent": 20, + "retry_budget_min_concurrency": 3 + } + } + ] +} +``` + +### Fields + +| Field | Type | Default | Meaning | +|---|---|---|---| +| `enabled` | bool | `false` | Master switch. When false, the slice is a zero-overhead no-op on the hot path. | +| `dry_run` | bool | `false` | Shadow mode: log would-reject decisions but admit traffic. Both the state machine and the retry budget honor this flag. | +| `consecutive_failure_threshold` | int | `5` | Trip when N consecutive failures are observed in `CLOSED`. Upper bound 10,000. 
| +| `failure_rate_threshold` | int | `50` | Trip when `(failures / total) * 100 >= this` over the rolling window, provided `total >= minimum_volume`. 0-100. | +| `minimum_volume` | int | `20` | Minimum calls-in-window before rate-based trip is even considered. Upper bound 10,000,000. | +| `window_seconds` | int | `10` | Rolling window duration for the rate trip. >= 1. | +| `permitted_half_open_calls` | int | `3` | Probe admissions allowed per `HALF_OPEN` cycle. A single success flips to `CLOSED`; a single failure re-trips to `OPEN`. Upper bound 1,000. | +| `base_open_duration_ms` | int | `5000` | Initial open duration on first trip. Subsequent trips use `min(base << consecutive_trips, max)`. | +| `max_open_duration_ms` | int | `60000` | Ceiling for the exponential-backoff open duration. | +| `retry_budget_percent` | int | `20` | Retries capped at this % of non-retry in-flight traffic to the same host. 0-100. | +| `retry_budget_min_concurrency` | int | `3` | Floor for the retry cap — always allow at least this many concurrent retries regardless of traffic level. | + +### Defaults (when `circuit_breaker` block is absent) + +`enabled=false`. The breaker is fully opt-in. No behavioral change from a pre-breaker gateway configuration. + +--- + +## Client-facing responses + +Two distinct `503` variants, keyed off the reject source: + +**Circuit-open reject** — breaker is `OPEN` or in `HALF_OPEN`-full: +``` +HTTP/1.1 503 Service Unavailable +Retry-After: 5 +X-Circuit-Breaker: open # or half_open +X-Upstream-Host: orders-backend:8080 +Connection: close +``` + +- `Retry-After` derivation: + - `OPEN`: derived from the stored `open_until` deadline (time remaining until next probe). + - `HALF_OPEN`: derived from the *next* open duration (`base << consecutive_trips`) — reflects what the backoff would be if the in-flight probes fail. Base alone would under-report after multiple trips. + - Both paths: ceil-divide the millisecond value to seconds, capped at 3600s. 
+- `X-Circuit-Breaker` distinguishes the two reject paths so operators can tell "backoff active" from "probing, no capacity left". + +**Retry-budget reject** — every retry attempt rejected because the host's budget is exhausted: +``` +HTTP/1.1 503 Service Unavailable +X-Retry-Budget-Exhausted: 1 +Connection: close +``` + +No `Retry-After` (the budget has no recovery clock — it depends on concurrent traffic). No `X-Circuit-Breaker` header (this reject path is orthogonal to the state machine). + +Both responses are **terminal**: the retry loop never retries a circuit-open or retry-budget-exhausted outcome. + +--- + +## Hot reload + +All `circuit_breaker` fields on existing upstream services are hot-reloadable via `SIGHUP`. Reload semantics: + +| Edit | Behavior | +|---|---| +| Threshold change (failures, rate, window, probe budget, open durations) | Applied on the next `TryAcquire` / `Report*` call on each slice. Live state (`CLOSED`/`OPEN`/`HALF_OPEN`) is preserved. | +| `enabled=true → false` | Live state reset to `CLOSED`; hot path short-circuits to `ADMITTED`. No transition callback fired. | +| `enabled=false → true` | Live state reset to `CLOSED`. The transition callback (wired at startup) re-engages for future trips. | +| `window_seconds` change | Rolling window reset. In-flight reports admitted pre-reload are invalidated (by `closed_gen_` bump); `consecutive_failures_` reset so stale counts can't trip the fresh window. In-flight `HALF_OPEN` probes are NOT invalidated (separate `halfopen_gen_` counter) — probe cycles complete normally. | +| `retry_budget_percent` / `retry_budget_min_concurrency` | Applied immediately (atomic stores). In-flight counters preserved. | + +Topology edits (`host`, `port`, `pool.*`, `proxy.*`, `tls.*`) still require a restart; the gateway logs `"Reload: upstream topology changes require a restart to take effect"` and keeps the old pool alive. Breaker edits on the same reload are still applied live. 
+ +--- + +## Observability + +### Logs + +| Event | Level | Sample | +|---|---|---| +| `CLOSED → OPEN` trip | `warn` | `circuit breaker tripped service=orders host=orders-backend:8080 partition=0 trigger=consecutive consecutive_failures=5 window_total=12 window_fail_rate=41 open_for_ms=5000 consecutive_trips=1` | +| `OPEN → HALF_OPEN` | `info` | `circuit breaker half-open ... probes_allowed=3` | +| `HALF_OPEN → CLOSED` | `info` | `circuit breaker closed ... probes_succeeded=3` | +| `HALF_OPEN → OPEN` re-trip | `warn` | `circuit breaker re-tripped ... trigger=probe_fail consecutive_trips=2 open_for_ms=10000` | +| Reject (first of cycle) | `info` | `circuit breaker rejected ... state=open` | +| Reject (subsequent) | `debug` | Same, at debug. | +| Reject (dry-run) | `info` | `[dry-run] circuit breaker would reject ...` | +| Retry budget exhausted | `warn` | `retry budget exhausted service=orders in_flight=45 retries_in_flight=9 cap=9 client_fd=... attempt=1` | +| Reload applied | `info` | `circuit breaker config applied service=orders enabled=true window_s=10 fail_rate=50 consec_threshold=5` | +| Wait-queue drain on trip | `info` | `PoolPartition draining wait queue on breaker trip: orders-backend:8080 queue_size=3` | + +### Snapshot API + +`CircuitBreakerManager::SnapshotAll()` returns one `CircuitBreakerHostSnapshot` per upstream with per-slice rows (`state`, `trips`, `rejected`, `probe_successes`, `probe_failures`) plus host-level aggregates (`total_trips`, `total_rejected`, `open_partitions`, `half_open_partitions`, `retries_in_flight`, `retries_rejected`, `in_flight`). A `/admin/breakers` HTTP endpoint that JSON-serializes this snapshot is **planned but not yet exposed** — the API is ready for future wiring. + +--- + +## Design notes + +- **Dispatcher affinity.** Slices are pinned to their dispatcher thread — no CAS on the hot path. The trade-off: skewed request distribution across dispatchers can cause one partition to trip while another stays `CLOSED`. 
Uniform hashing keeps this mild in practice. +- **Lazy `HALF_OPEN`.** The transition from `OPEN` happens on the next inbound `TryAcquire` once the open deadline elapses — no background timer. Envoy and resilience4j use the same model. +- **Generation tokens.** Every admission is stamped with a per-domain generation counter (`closed_gen_` or `halfopen_gen_`, depending on state). `Report*` drops stale-generation completions so pre-transition requests can't pollute a fresh cycle. Window resizes bump only `closed_gen_` so in-flight probes aren't stranded. +- **Retry budget CAS.** `TryConsumeRetry` uses `compare_exchange_weak` to serialize concurrent retry admissions. A plain load-check-add would let N callers all observe `current < cap` and all increment past the cap. +- **Non-retry denominator.** The budget base is `in_flight - retries_in_flight`, not raw `in_flight`. Retries count in both terms but subtract out here so admitting a retry doesn't inflate its own cap. + +For the full design document (motivations, trade-offs, failure modes, revision history, test strategy), see [.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md](../.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md). diff --git a/include/circuit_breaker/circuit_breaker_host.h b/include/circuit_breaker/circuit_breaker_host.h new file mode 100644 index 00000000..67211667 --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_host.h @@ -0,0 +1,118 @@ +#pragma once + +#include "common.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "circuit_breaker/retry_budget.h" +// , , provided by common.h + +class Dispatcher; + +namespace circuit_breaker { + +// Observability snapshot of a single host, aggregated across all its +// partition slices. Safe to call from any thread (relaxed reads of +// atomic counters). Per-slice rows let dashboards detect skewed +// failure distribution across dispatchers. 
+struct CircuitBreakerHostSnapshot { + std::string service_name; + std::string host; + int port = 0; + + struct SliceRow { + size_t dispatcher_index = 0; + State state = State::CLOSED; + int64_t trips = 0; + int64_t rejected = 0; + int64_t probe_successes = 0; + int64_t probe_failures = 0; + }; + std::vector<SliceRow> slices; + + // Aggregates across slices. + int64_t total_trips = 0; + int64_t total_rejected = 0; + int open_partitions = 0; + int half_open_partitions = 0; + + // Retry budget state (per-host, shared across partitions). + int64_t retries_in_flight = 0; + int64_t retries_rejected = 0; + int64_t in_flight = 0; +}; + +// Per-upstream-service aggregation layer. Owns: +// - N CircuitBreakerSlice instances (one per dispatcher partition, +// each pinned to its dispatcher for lock-free hot-path access). +// - One RetryBudget (shared across partitions — retry %-of-in-flight +// is a host-level metric, not per-dispatcher). +// +// Lifetime: constructed by CircuitBreakerManager at server start, lives +// for the server's lifetime. `service_name`, `host`, `port`, and the +// slice vector are never mutated post-construction (keys are stable for +// lock-free map lookup in the manager). +class CircuitBreakerHost { +public: + // `partition_count` must equal the number of dispatcher partitions + // in the server — typically NetServer's socket worker count or + // upstream pool's partition count. One slice is created per + // partition up-front. + CircuitBreakerHost(std::string service_name, + std::string host, + int port, + size_t partition_count, + const CircuitBreakerConfig& config); + + CircuitBreakerHost(const CircuitBreakerHost&) = delete; + CircuitBreakerHost& operator=(const CircuitBreakerHost&) = delete; + + // Hot-path lookup — returns nullptr only if `dispatcher_index` is + // out of range (programming error). Caller must invoke the + // returned slice's methods on its owning dispatcher thread.
+ CircuitBreakerSlice* GetSlice(size_t dispatcher_index); + + // Owned retry budget. Never null for the host's lifetime; safe to + // cache the pointer. Shared across all partitions of this host. + RetryBudget* GetRetryBudget() { return retry_budget_.get(); } + const RetryBudget* GetRetryBudget() const { return retry_budget_.get(); } + + // Aggregate snapshot across all slices + retry budget. Reads are + // relaxed atomic — eventually consistent across threads, which is + // fine for dashboards. + CircuitBreakerHostSnapshot Snapshot() const; + + // Apply a new config to every slice. Because each slice is pinned + // to its dispatcher thread, the call is dispatched per-partition — + // the caller provides the dispatcher list in the same order used at + // construction. If `dispatchers.size() != slices_.size()`, the + // method logs an error and returns without applying. + // + // The retry-budget sub-fields (percent, min_concurrency) are + // updated immediately (atomic stores, any thread) as part of this + // call — they don't need dispatcher routing. + void Reload(const std::vector>& dispatchers, + const CircuitBreakerConfig& new_config); + + // Install a transition callback on every slice. Uniform callback + // across partitions — callers that need partition-specific behavior + // can read `slice->dispatcher_index()` inside the callback. + // Must be called before live traffic; thread-safety depends on + // slice-dispatcher affinity at the Reload layer. + void SetTransitionCallbackOnAllSlices(StateTransitionCallback cb); + + // Accessors. 
+ const std::string& service_name() const { return service_name_; } + const std::string& host() const { return host_; } + int port() const { return port_; } + size_t partition_count() const { return slices_.size(); } + +private: + std::string service_name_; + std::string host_; + int port_; + CircuitBreakerConfig config_; + std::vector<std::unique_ptr<CircuitBreakerSlice>> slices_; + std::unique_ptr<RetryBudget> retry_budget_; +}; + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/circuit_breaker_manager.h b/include/circuit_breaker/circuit_breaker_manager.h new file mode 100644 index 00000000..b4b32f06 --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_manager.h @@ -0,0 +1,80 @@ +#pragma once + +#include "common.h" +#include "circuit_breaker/circuit_breaker_host.h" +// <string>, <vector>, <memory>, <unordered_map>, <mutex> provided by common.h + +class Dispatcher; + +namespace circuit_breaker { + +// Top-level circuit-breaker orchestrator. Mirrors the shape of +// RateLimitManager: one instance lives on HttpServer, built once at +// MarkServerReady, survives for the server's lifetime. +// +// Ownership (per design §3.1): +// HttpServer +// ├── upstream_manager_ (declared FIRST, destructs last) +// └── circuit_breaker_manager_ (declared SECOND, destructs first) +// +// CircuitBreakerManager +// └── hosts_: unordered_map<std::string, std::unique_ptr<CircuitBreakerHost>> +// +// `hosts_` is built once in the constructor — keys are never added or +// removed at runtime (topology is restart-only per the existing +// upstream policy). This makes GetHost lock-free after construction, +// which is critical for the hot path. +// +// Hot-reload: only `circuit_breaker` sub-fields on EXISTING +// upstream services can be live-reloaded. New or removed service names +// log a warn and are skipped — the caller (HttpServer::Reload) still +// fires the "restart required" diagnostic in that case.
+class CircuitBreakerManager { +public: + // Builds one CircuitBreakerHost per upstream in `upstreams` — even + // when upstreams[i].circuit_breaker.enabled is false — so a later + // reload that flips enabled to true can take effect without + // re-wiring transition callbacks (disabled slices hold the callback + // but never invoke it). + // + // `partition_count` must match the server's dispatcher partition + // count (upstream pool / NetServer worker count). `dispatchers` + // captures the dispatcher list so Reload can route per-slice work. + CircuitBreakerManager( + const std::vector& upstreams, + size_t partition_count, + std::vector<std::shared_ptr<Dispatcher>> dispatchers); + + CircuitBreakerManager(const CircuitBreakerManager&) = delete; + CircuitBreakerManager& operator=(const CircuitBreakerManager&) = delete; + + // Hot-path lookup — returns nullptr for unknown service names. + // Thread-safe (post-construction `hosts_` is read-only). + CircuitBreakerHost* GetHost(const std::string& service_name); + const CircuitBreakerHost* GetHost(const std::string& service_name) const; + + // Apply breaker-field edits to EXISTING upstream services. Topology + // changes (new/removed service names) are logged at warn and + // skipped — HttpServer::Reload is the only layer that warns about + // topology, and this manager trusts that signal. Serialized by + // reload_mtx_ so concurrent Reload calls queue cleanly; the hot + // path does NOT take this lock. + void Reload(const std::vector& new_upstreams); + + // Observability — snapshots every host. Safe from any thread. + std::vector<CircuitBreakerHostSnapshot> SnapshotAll() const; + + // Test/admin helpers. + size_t host_count() const { return hosts_.size(); } + +private: + // Post-construction read-only — keys and unique_ptr values never + // change, so lookups don't need a lock. + std::unordered_map<std::string, std::unique_ptr<CircuitBreakerHost>> hosts_; + std::vector<std::shared_ptr<Dispatcher>> dispatchers_; + + // Serializes concurrent Reload calls. NOT taken on the hot path. 
+ mutable std::mutex reload_mtx_; +}; + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h new file mode 100644 index 00000000..d6899bae --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -0,0 +1,281 @@ +#pragma once + +#include "common.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_window.h" +// , , provided by common.h + +namespace circuit_breaker { + +// One per-dispatcher slice of the breaker state for a given upstream host. +// Dispatcher-thread-local for hot-path correctness: TryAcquire, ReportSuccess, +// ReportFailure must only be called on the dispatcher that owns this slice. +// +// Observability counters (`trips_`, `rejected_`, etc.) are atomic so other +// threads can snapshot them without synchronization. Everything else is +// plain (no atomics) — single-writer, single-reader. +class CircuitBreakerSlice { +public: + // `time_source` defaults to steady_clock::now. Tests inject a mock clock. + using TimeSource = std::function; + + CircuitBreakerSlice(std::string host_label, + size_t dispatcher_index, + const CircuitBreakerConfig& config, + TimeSource time_source = nullptr); + + // Non-copyable, non-movable: slices are pinned in a Host's vector and + // callbacks capture raw pointers. + CircuitBreakerSlice(const CircuitBreakerSlice&) = delete; + CircuitBreakerSlice& operator=(const CircuitBreakerSlice&) = delete; + + // Return value of TryAcquire. `generation` is a monotonically-increasing + // token identifying which state-machine cycle the admission belongs to. + // Callers MUST pass it back to Report*() unchanged so the slice can drop + // late completions that belong to a prior cycle (crossed a state + // transition or a Reload()-reset boundary). 
Without this, stale + // completions can pollute the bookkeeping of a fresh CLOSED/HALF_OPEN + // cycle (e.g., a pre-toggle failure incrementing the post-toggle + // consecutive_failures_, or a pre-CLOSED'-cycle success wiping a + // legitimate post-CLOSED' counter). + struct Admission { + Decision decision; + uint64_t generation; + }; + + // Hot-path decision. Consults state + (if applicable) advances OPEN→HALF_OPEN + // and reserves a probe slot. Increments `rejected_` on REJECTED_OPEN* + // (both enforce and dry-run). Emits reject log on dispatcher thread. + // Returned generation must be threaded to the paired Report*(). + Admission TryAcquire(); + + // Outcome reporting. `probe` is true iff the paired TryAcquire returned + // ADMITTED_PROBE. `admission_generation` is the generation returned by + // the paired TryAcquire — reports from a stale generation are silently + // dropped (observability counters still update so the outcome is not + // lost from dashboards). Report* may trigger state transitions and fire + // the transition callback. + void ReportSuccess(bool probe, uint64_t admission_generation); + void ReportFailure(FailureKind kind, bool probe, uint64_t admission_generation); + + // Neutral completion — the admission never exercised the upstream. + // Use when the request was terminated locally before reaching the + // upstream (POOL_EXHAUSTED after admission, shutdown draining, client + // disconnect, RESULT_PARSE_ERROR self-attributable). Must NOT be used + // for upstream outcomes — those go to ReportSuccess / ReportFailure. + // + // For probe=true (HALF_OPEN admission): returns the probe slot to the + // cycle — decrements `half_open_inflight_` AND `half_open_admitted_` + // so a replacement probe can still exercise the upstream within this + // cycle's budget. Without this path, a probe that dies locally leaks + // its slot forever, eventually wedging the slice in HALF_OPEN. 
+ // + // For probe=false (CLOSED admission): no-op — CLOSED admissions have + // no slot to release. The bool matches ReportSuccess/ReportFailure so + // callers can use the same dispatch pattern. + void ReportNeutral(bool probe, uint64_t admission_generation); + + // Apply a new config (called on this slice's dispatcher thread). + // Preserves live state (CLOSED/OPEN/HALF_OPEN). Resets window if + // window_seconds changed. + void Reload(const CircuitBreakerConfig& new_config); + + // Install or replace the state-transition callback. Safe to call before + // any traffic (startup wiring) OR after a hot-reload flips enabled=false→true. + // Callers must invoke on this slice's dispatcher thread. + void SetTransitionCallback(StateTransitionCallback cb); + + // Observability — safe from any thread. + State CurrentState() const { return state_.load(std::memory_order_acquire); } + int64_t Trips() const { return trips_.load(std::memory_order_relaxed); } + int64_t Rejected() const { return rejected_.load(std::memory_order_relaxed); } + int64_t ProbeSuccesses() const { return probe_successes_.load(std::memory_order_relaxed); } + int64_t ProbeFailures() const { return probe_failures_.load(std::memory_order_relaxed); } + // Rejections specifically caused by HALF_OPEN being out of probe slots + // (subset of `Rejected()`). Lets dashboards distinguish "backoff has not + // elapsed" from "probing, no capacity left". + int64_t RejectedHalfOpenFull() const { + return rejected_half_open_full_.load(std::memory_order_relaxed); + } + // Number of Report* calls silently dropped because their admission + // generation no longer matches the relevant per-domain counter + // (closed_gen_ for non-probe, halfopen_gen_ for probe). These are + // reports of requests admitted before a state transition or a + // Reload()-reset. Useful for detecting mis-threaded admission tokens. 
+ int64_t ReportsStaleGeneration() const { + return reports_stale_generation_.load(std::memory_order_relaxed); + } + + // **Test-only** accessor for the generation that the current state's + // next admission would receive. Returns `halfopen_gen_` when state is + // HALF_OPEN (probe admissions use that counter), otherwise `closed_gen_` + // (non-probe admissions use that counter). This matches what TryAcquire + // would stamp on a new admission right now. + // + // Production callers MUST use the generation returned by TryAcquire + // (racy otherwise — these getters are not atomic). Tests use it as + // ergonomic shorthand for "admission just happened in the current + // cycle", bypassing the need to thread a token per synthetic Report*. + uint64_t CurrentGenerationForTesting() const { + return (state_.load(std::memory_order_acquire) == State::HALF_OPEN) + ? halfopen_gen_ : closed_gen_; + } + // Explicit per-domain getters for tests that cross state transitions + // while holding a captured generation from a specific domain. + uint64_t CurrentClosedGenForTesting() const { return closed_gen_; } + uint64_t CurrentHalfOpenGenForTesting() const { return halfopen_gen_; } + + const std::string& host_label() const { return host_label_; } + size_t dispatcher_index() const { return dispatcher_index_; } + + // Read-only view of the live config. Dispatcher-thread-owned for + // writes (Reload only mutates here); readers on other threads get a + // potentially-torn read, which is acceptable for observability hints + // like Retry-After clamping. + const CircuitBreakerConfig& config() const { return config_; } + + // Current open_until time. Used by ProxyTransaction to compute + // Retry-After. Returns zero ns when not OPEN. + std::chrono::steady_clock::time_point OpenUntil() const; + + // Convenience predicate: whether OpenUntil() currently holds a + // non-zero deadline. Avoids callers hand-rolling the zero-epoch + // check against `time_since_epoch().count() > 0`. 
+ bool IsOpenDeadlineSet() const { + return open_until_steady_ns_.load(std::memory_order_relaxed) > 0; + } + + // Expected next open-duration in milliseconds if the slice re-trips + // from its current state. Computed from base_open_duration_ms + // shifted by the current `consecutive_trips_` count and clamped by + // max_open_duration_ms. Used by the Retry-After hint path for + // HALF_OPEN rejections, where there's no stored deadline but the + // next OPEN window (if the probe cycle fails) will follow the + // exponential-backoff curve — base alone would under-report after + // multiple trips. + // + // Safe from any thread (atomic load of consecutive_trips_ + plain + // reads of config_ fields). Config fields are dispatcher-owned but + // a slightly-torn read is fine for an observability hint. + int64_t NextOpenDurationMs() const; + +private: + // Logging label: "service=X host=Y:Z partition=N" built once. + std::string host_label_; + size_t dispatcher_index_; + CircuitBreakerConfig config_; + + TimeSource time_source_; + + // Hot-path state — state_ written on dispatcher, read by observers. + std::atomic state_{State::CLOSED}; + // Nanoseconds since steady_clock epoch — 0 when not OPEN. + std::atomic open_until_steady_ns_{0}; + // Count of consecutive trips (OPEN entries) since last CLOSED — + // drives exponential backoff of open duration. + std::atomic consecutive_trips_{0}; + + // Dispatcher-thread-only (no atomics). + int consecutive_failures_ = 0; + CircuitBreakerWindow window_; + int half_open_inflight_ = 0; + int half_open_successes_ = 0; + bool half_open_saw_failure_ = false; + // Total probes admitted in the CURRENT HALF_OPEN cycle. Never decrements + // within a cycle; resets on every cycle entry (TransitionOpenToHalfOpen) + // and cycle exit (TransitionHalfOpenToClosed / TripHalfOpenToOpen). This + // is what caps the cycle's probe budget — NOT half_open_inflight_, which + // can free slots as probes complete. 
Gating on inflight would let an
+ // early-completing probe's slot be reused, causing the cycle to admit
+ // more than permitted_half_open_calls total probes. The close check
+ // (successes >= snapshot) could then fire while a late-admitted probe
+ // is still running; its eventual failure would drop as stale (generation
+ // bumped by the transition) and the breaker would falsely mark an
+ // unhealthy host recovered.
+ int half_open_admitted_ = 0;
+ // Probe budget for the CURRENT HALF_OPEN cycle. Snapshotted from
+ // config_.permitted_half_open_calls at the moment TransitionOpenToHalfOpen
+ // fires. A live Reload() may lower (or raise) the config field mid-cycle;
+ // the snapshot ensures TryAcquire's slot gate and ReportSuccess's close
+ // check both operate against the budget that was in effect when the probes
+ // were admitted — preventing early close or indefinitely-open behaviour.
+ int half_open_permitted_snapshot_ = 0;
+
+ // Observability counters.
+ std::atomic<int64_t> trips_{0};
+ std::atomic<int64_t> rejected_{0};
+ std::atomic<int64_t> rejected_half_open_full_{0};
+ std::atomic<int64_t> probe_successes_{0};
+ std::atomic<int64_t> probe_failures_{0};
+
+ // One-shot flag: true after the slice has emitted a higher-level
+ // (info) log for the first rejection in the current OPEN/HALF_OPEN
+ // cycle. Reset on transition to CLOSED and on each fresh trip. Keeps
+ // per-request reject logs at debug while still surfacing the first
+ // post-trip reject in default-warn operator logs. Dispatcher-thread only.
+ bool first_reject_logged_for_open_ = false;
+
+ // Monotonic generation counters — one per admission domain. TryAcquire
+ // stamps the admission with the domain's current value; Report* compares
+ // against it and drops reports whose admission no longer matches a live
+ // cycle.
Split into two counters so operations that reset ONE domain
+ // (e.g., window_seconds reload wipes the CLOSED rate window) don't
+ // invalidate admissions in the OTHER domain (HALF_OPEN probes) — which
+ // would strand probe capacity and wedge the slice in HALF_OPEN.
+ //
+ // Dispatcher-thread only — plain ints (no atomics needed).
+ //
+ // closed_gen_ bumps on: TripClosedToOpen (CLOSED cycle ends),
+ // Reload enabled-toggle reset,
+ // Reload window_seconds change (rate-window wipe).
+ // halfopen_gen_ bumps on: TripHalfOpenToOpen (HALF_OPEN cycle ends),
+ // TransitionHalfOpenToClosed (HALF_OPEN cycle ends on success),
+ // Reload enabled-toggle reset.
+ //
+ // Initial value 1 (so 0 can be a "not-applicable" sentinel for
+ // admissions returned from disabled slices or the REJECTED_* paths).
+ uint64_t closed_gen_ = 1;
+ uint64_t halfopen_gen_ = 1;
+
+ // Rejections silently dropped because their admission generation no
+ // longer matches the live domain counter (closed_gen_ / halfopen_gen_).
+ // Observability only; lets dashboards see
+ // how often the generation guard fires.
+ std::atomic<int64_t> reports_stale_generation_{0};
+
+ StateTransitionCallback transition_cb_;
+
+ // Internal transitions (dispatcher-thread).
+ // `now` is threaded through from ReportFailure so the window_total /
+ // window_fail_rate fields in the trip log reflect the SAME sliding-window
+ // view that ShouldTripClosed just saw — a fresh Now() here can cross a
+ // bucket boundary (especially with window_seconds=1 or under a dispatcher
+ // stall) and trigger Window::Advance's full-reset, zeroing the bucket that
+ // holds the failure which actually tripped the breaker.
+ void TripClosedToOpen(const char* trigger,
+ std::chrono::steady_clock::time_point now);
+ void TransitionOpenToHalfOpen();
+ void TransitionHalfOpenToClosed();
+ void TripHalfOpenToOpen(const char* trigger);
+
+ // Emit the correct reject log line, bump counters, and return the matching
+ // Decision (enforce or dry-run).
Used by both the OPEN (backoff active) + // and HALF_OPEN-full paths — keeps the three loggers/counters consistent. + Decision RejectWithLog(const char* state_label, bool half_open_full); + + // Compute open duration for the current consecutive_trips_ value: + // min(base * 2^consecutive_trips, max). Always >= base_open_duration_ms. + std::chrono::nanoseconds ComputeOpenDuration() const; + + // Check whether CLOSED trip conditions are met. Called after every failure. + // Takes `now` as a parameter so the caller can record the failure and + // evaluate the trip against THE SAME timestamp — otherwise a clock tick + // between AddFailure() and ShouldTripClosed() can advance the ring and + // wipe the just-recorded failure (critical when window_seconds is small: + // with window=1, a 1-second delta triggers the full-reset path). + bool ShouldTripClosed(std::chrono::steady_clock::time_point now); + + std::chrono::steady_clock::time_point Now() const; +}; + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/circuit_breaker_state.h b/include/circuit_breaker/circuit_breaker_state.h new file mode 100644 index 00000000..92872f8b --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_state.h @@ -0,0 +1,70 @@ +#pragma once + +#include "common.h" +// , , provided by common.h + +// Circuit breaker state machine and classification enums. Used by +// CircuitBreakerSlice, CircuitBreakerHost, CircuitBreakerManager, and +// ProxyTransaction to talk about state, admission decisions, and +// failure kinds. +// +// Three-state resilience4j-style machine: +// +// CLOSED ──trip── OPEN ──(open_until elapsed)── HALF_OPEN ──success── CLOSED +// │ +// failure +// ▼ +// OPEN +namespace circuit_breaker { + +enum class State : uint8_t { + CLOSED = 0, + OPEN = 1, + HALF_OPEN = 2, +}; + +// Result of CircuitBreakerSlice::TryAcquire. Callers branch on this enum +// only — they never read the CircuitBreakerConfig directly. 
Dry-run policy
+// is encoded in the decision, not in a separate flag.
+enum class Decision : uint8_t {
+ ADMITTED, // CLOSED — proceed to pool
+ ADMITTED_PROBE, // HALF_OPEN probe slot consumed — proceed, tag as probe
+ REJECTED_OPEN, // OPEN (or HALF_OPEN-full); ENFORCE — drop with 503
+ REJECTED_OPEN_DRYRUN, // Shadow mode: slice would reject but operator asked
+ // for pass-through. Caller proceeds to pool. Counters
+ // and log already updated by TryAcquire.
+};
+
+// Failure classification. Only these kinds feed ReportFailure — 4xx and
+// local-capacity issues (POOL_EXHAUSTED, QUEUE_TIMEOUT, shutdown) are NOT
+// reported as failures.
+enum class FailureKind : uint8_t {
+ CONNECT_FAILURE,
+ RESPONSE_5XX,
+ RESPONSE_TIMEOUT,
+ UPSTREAM_DISCONNECT,
+};
+
+// Callback fired on every slice state transition. Runs on the slice's
+// owning dispatcher thread. Callers can compare old/new to key off a
+// specific edge (e.g. CLOSED→OPEN fires wait-queue drain).
+// `trigger` is a short static string such as "consecutive" / "rate" /
+// "probe_success" / "probe_fail" / "open_elapsed" for logging.
+//
+// TODO(post-v1): once a snapshot / admin JSON endpoint lands, convert
+// `trigger` to an `enum class TransitionTrigger` so the valid set is
+// compile-time checked rather than string-compared. See design doc §15.8.
+using StateTransitionCallback =
+ std::function<void(State old_state, State new_state, const char* trigger)>;
+
+// Convert a state to a short lowercase label for logging.
+inline const char* StateName(State s) { + switch (s) { + case State::CLOSED: return "closed"; + case State::OPEN: return "open"; + case State::HALF_OPEN: return "half_open"; + } + return "unknown"; +} + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/circuit_breaker_window.h b/include/circuit_breaker/circuit_breaker_window.h new file mode 100644 index 00000000..12679bcd --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_window.h @@ -0,0 +1,59 @@ +#pragma once + +#include "common.h" +// , provided by common.h + +namespace circuit_breaker { + +// Time-bucketed sliding window. One bucket per second; ring indexed by +// `epoch_sec % window_seconds`. Advances lazily on every Add* call: +// when the incoming `now` is ahead of the recorded head, all buckets +// that have aged out of the window are zeroed before the new increment. +// +// Dispatcher-thread-local by design — NO synchronization. Used from +// CircuitBreakerSlice, which is owned by a single dispatcher. +class CircuitBreakerWindow { +public: + explicit CircuitBreakerWindow(int window_seconds); + + // Record one outcome at `now`. Advances the ring if needed. + void AddSuccess(std::chrono::steady_clock::time_point now); + void AddFailure(std::chrono::steady_clock::time_point now); + + // Observed counts across the current window. `now` is used to expire + // stale buckets before reading. + int64_t TotalCount(std::chrono::steady_clock::time_point now); + int64_t FailureCount(std::chrono::steady_clock::time_point now); + + // Reset the ring to zero. Called on state transitions that should + // start a fresh observation (e.g. HALF_OPEN → CLOSED). + void Reset(); + + // Reinitialize for a new window size (config reload). Resets buckets. 
+ void Resize(int new_window_seconds);
+
+ int window_seconds() const { return window_seconds_; }
+
+private:
+ struct Bucket {
+ int64_t total = 0;
+ int64_t failures = 0;
+ };
+
+ int window_seconds_;
+ std::vector<Bucket> buckets_;
+
+ // Epoch-seconds of the most recent observation. Used to compute how
+ // many buckets need to be zeroed on advance.
+ int64_t head_epoch_sec_ = -1;
+
+ // Advance the ring if `now_sec` is newer than `head_epoch_sec_`,
+ // zeroing any buckets that aged out.
+ void Advance(int64_t now_sec);
+
+ // Convert a steady_clock time_point to epoch-seconds (we only
+ // care about relative seconds; steady_clock is monotonic).
+ static int64_t ToEpochSec(std::chrono::steady_clock::time_point now);
+};
+
+} // namespace circuit_breaker
diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h
new file mode 100644
index 00000000..f8392013
--- /dev/null
+++ b/include/circuit_breaker/retry_budget.h
@@ -0,0 +1,151 @@
+#pragma once
+
+#include "common.h"
+// <atomic>, <cstdint> provided by common.h
+
+namespace circuit_breaker {
+
+// Retry budget — orthogonal to the breaker state machine.
+//
+// Problem: even when the circuit is CLOSED, a cascading failure on a
+// healthy-looking upstream can be amplified by per-request retries. If
+// 100 requests are in flight and each retries once, the upstream sees
+// 200. If each retries twice, 300. A sick-but-not-dead upstream gets
+// tipped over by the retry multiplier itself.
+//
+// Fix: cap concurrent retries as a fraction of concurrent non-retry
+// traffic plus a floor for low-volume correctness.
+//
+// allowed_retries = max(min_concurrency,
+// (in_flight - retries_in_flight) * percent / 100)
+//
+// The subtraction is load-bearing: callers hold TrackInFlight() for
+// BOTH first attempts and retries (so the guard's RAII paired with
+// ReleaseRetry doesn't need a second counter on the hot path).
+// Without subtracting retries, admitting a retry increases in_flight +// which increases the cap, and in steady state the effective ratio +// converges above the configured percent of original traffic. +// +// The retry budget is PER-HOST (one instance owned by CircuitBreakerHost, +// shared across its partitions — the percent math is about aggregate +// upstream load, not per-dispatcher slicing). All counters are atomic +// relaxed — snapshots can be slightly stale, which is fine for a +// capacity gate on a retry storm. +// +// Usage: +// 1. On every attempt (first or retry), call TrackInFlight() and keep +// the returned guard alive until the attempt completes. The guard +// decrements in_flight_ in its destructor. +// 2. Before issuing a retry attempt, call TryConsumeRetry(). Proceed +// if it returns true; reject as RETRY_BUDGET_EXHAUSTED if false. +// 3. When the retried attempt completes, call ReleaseRetry(). +class RetryBudget { +public: + // `percent` — cap retries at this % of in-flight (0-100). + // `min_concurrency` — always allow at least this many concurrent + // retries regardless of in_flight; ensures low-volume correctness + // (without it, a 20% budget allows 0 retries when in_flight < 5). + RetryBudget(int percent, int min_concurrency); + + // Non-copyable, non-movable. Lifetime-stable under its owner + // (CircuitBreakerHost). + RetryBudget(const RetryBudget&) = delete; + RetryBudget& operator=(const RetryBudget&) = delete; + + // RAII guard — decrements in_flight_ on destruction. Move-only. 
+ class InFlightGuard {
+ public:
+ InFlightGuard() = default;
+ explicit InFlightGuard(std::atomic<int64_t>* counter) : counter_(counter) {}
+ ~InFlightGuard() {
+ if (counter_) counter_->fetch_sub(1, std::memory_order_relaxed);
+ }
+ InFlightGuard(InFlightGuard&& o) noexcept : counter_(o.counter_) {
+ o.counter_ = nullptr;
+ }
+ InFlightGuard& operator=(InFlightGuard&& o) noexcept {
+ if (this != &o) {
+ if (counter_) counter_->fetch_sub(1, std::memory_order_relaxed);
+ counter_ = o.counter_;
+ o.counter_ = nullptr;
+ }
+ return *this;
+ }
+ InFlightGuard(const InFlightGuard&) = delete;
+ InFlightGuard& operator=(const InFlightGuard&) = delete;
+
+ private:
+ std::atomic<int64_t>* counter_ = nullptr;
+ };
+
+ // Call on every upstream attempt entry (first try OR retry). The
+ // returned guard MUST outlive the attempt — typically stored as a
+ // ProxyTransaction member. Never returns an empty guard.
+ InFlightGuard TrackInFlight();
+
+ // Call BEFORE issuing a retry attempt. Returns true if the retry
+ // fits under the budget (retries_in_flight < cap); caller must pair
+ // a true return with a matching ReleaseRetry when the retry
+ // completes. Returns false if over budget — caller must NOT retry
+ // and must NOT call ReleaseRetry.
+ //
+ // The cap is computed against a freshly-loaded in_flight snapshot:
+ // cap = max(min_concurrency, in_flight * percent / 100)
+ bool TryConsumeRetry();
+
+ // Call when a consumed retry attempt finishes. Must be paired with a
+ // prior successful TryConsumeRetry.
+ void ReleaseRetry();
+
+ // Apply new tuning. Thread-safe (atomics). Preserves in-flight counters
+ // — only the admission formula changes.
+ void Reload(int percent, int min_concurrency);
+
+ // Observability — safe from any thread, relaxed.
+ int64_t InFlight() const {
+ return in_flight_.load(std::memory_order_relaxed);
+ }
+ // Compute the current effective retry cap for observability / log
+ // enrichment.
Uses the same formula as TryConsumeRetry but without
+ // mutating retries_in_flight_. Returns the point-in-time cap against
+ // which a would-be retry admission would be compared. Slightly racy
+ // (separate loads of in_flight_ and retries_in_flight_ aren't atomic
+ // relative to each other), but the result is for dashboards / logs
+ // where a one-entry drift is noise.
+ int64_t ComputeCap() const {
+ int64_t in_flight = in_flight_.load(std::memory_order_relaxed);
+ int64_t retries = retries_in_flight_.load(std::memory_order_relaxed);
+ int pct = percent_.load(std::memory_order_relaxed);
+ int min_conc = min_concurrency_.load(std::memory_order_relaxed);
+ int64_t non_retry = in_flight - retries;
+ if (non_retry < 0) non_retry = 0;
+ int64_t pct_cap = (non_retry * pct) / 100;
+ return pct_cap > min_conc ? pct_cap : min_conc;
+ }
+ int64_t RetriesInFlight() const {
+ return retries_in_flight_.load(std::memory_order_relaxed);
+ }
+ int64_t RetriesRejected() const {
+ return retries_rejected_.load(std::memory_order_relaxed);
+ }
+
+ int percent() const { return percent_.load(std::memory_order_relaxed); }
+ int min_concurrency() const {
+ return min_concurrency_.load(std::memory_order_relaxed);
+ }
+
+private:
+ // Tuning — atomic so Reload() is lock-free.
+ std::atomic<int> percent_;
+ std::atomic<int> min_concurrency_;
+
+ // Counters (relaxed — admission decisions tolerate slightly stale
+ // reads; correctness depends on each guard's fetch_sub pairing with
+ // its increment, which holds under relaxed because they touch the
+ // same atomic).
+ std::atomic<int64_t> in_flight_{0};
+ std::atomic<int64_t> retries_in_flight_{0};
+ std::atomic<int64_t> retries_rejected_{0};
+};
+
+} // namespace circuit_breaker
diff --git a/include/config/config_loader.h b/include/config/config_loader.h
index ba13f62a..2a76c3b8 100644
--- a/include/config/config_loader.h
+++ b/include/config/config_loader.h
@@ -3,6 +3,7 @@
 #include "config/server_config.h"
 #include <stdexcept>
 #include <string>
+#include <unordered_set>
 class ConfigLoader {
 public:
@@ -27,6 +28,42 @@ class ConfigLoader {
 // Throws std::invalid_argument if validation fails.
 static void Validate(const ServerConfig& config);
+ // Validate ONLY the fields that are live-reloadable without a
+ // restart — today this is the per-upstream circuit_breaker block
+ // plus a duplicate-name check.
+ //
+ // Used by the SIGHUP reload path, which downgrades the full
+ // `Validate()` failure to a warn because most of its rules cover
+ // restart-only fields. That downgrade is unsafe for live-
+ // reloadable fields: an invalid breaker threshold would be
+ // pushed into live slices even though the same value would be
+ // rejected at startup. Call this BEFORE applying a reloaded
+ // config and abort the reload if it throws.
+ //
+ // Scope of CB-field validation:
+ // `live_upstream_names` lists service names CURRENTLY known to
+ // the running server. CB fields are validated only for entries
+ // whose name is in this set, because
+ // `CircuitBreakerManager::Reload` only applies CB changes to
+ // pre-existing hosts (new/removed names are restart-only and
+ // skipped with a warn). Validating CB blocks for not-yet-
+ // running entries would block otherwise-safe reloads — e.g. a
+ // reload that stages a new upstream with an intentionally
+ // placeholder breaker block would abort even though the live
+ // server would never apply it. Pass an empty set when no
+ // upstreams are running yet (only the duplicate-name check
+ // runs in that case).
+ //
+ // Duplicate-name rejection runs unconditionally on the new
+ // config's upstream list: even for new/renamed entries, the
+ // file itself is malformed if names collide.
+ //
+ // Throws std::invalid_argument with a message identifying the
+ // offending upstream and field.
+ static void ValidateHotReloadable(
+ const ServerConfig& config,
+ const std::unordered_set<std::string>& live_upstream_names);
+
 // Return a ServerConfig with all default values.
 static ServerConfig Default();
diff --git a/include/config/server_config.h b/include/config/server_config.h
index bff3ffc4..ee879f28 100644
--- a/include/config/server_config.h
+++ b/include/config/server_config.h
@@ -138,6 +138,52 @@ struct ProxyConfig {
 bool operator!=(const ProxyConfig& o) const { return !(*this == o); }
 };
+struct CircuitBreakerConfig {
+ bool enabled = false; // Opt-in; off by default
+ bool dry_run = false; // Compute + log, but do not reject
+
+ // Trip conditions (ORed). Either alone is sufficient.
+ int consecutive_failure_threshold = 5; // Trip after N consecutive failures
+ int failure_rate_threshold = 50; // Trip when fail_rate >= N percent
+ int minimum_volume = 20; // Required window volume before
+ // failure_rate is consulted
+ int window_seconds = 10; // Sliding-window duration
+
+ // HALF_OPEN admission
+ int permitted_half_open_calls = 5;
+
+ // Recovery timing. open_duration = min(base * 2^consecutive_trips, max).
+ int base_open_duration_ms = 5000;
+ int max_open_duration_ms = 60000;
+
+ // Safety valve (future-proof for load-balanced services; no-op v1).
+ int max_ejection_percent_per_host_set = 50;
+
+ // Retry budget (orthogonal to the breaker). Caps concurrent retries to
+ // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100).
+ // Wired into the request path via ProxyTransaction's retry-budget
+ // gate in MaybeRetry; also read by
+ // CircuitBreakerHost to construct its owned RetryBudget.
+ int retry_budget_percent = 20; + int retry_budget_min_concurrency = 3; + + bool operator==(const CircuitBreakerConfig& o) const { + return enabled == o.enabled && + dry_run == o.dry_run && + consecutive_failure_threshold == o.consecutive_failure_threshold && + failure_rate_threshold == o.failure_rate_threshold && + minimum_volume == o.minimum_volume && + window_seconds == o.window_seconds && + permitted_half_open_calls == o.permitted_half_open_calls && + base_open_duration_ms == o.base_open_duration_ms && + max_open_duration_ms == o.max_open_duration_ms && + max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set && + retry_budget_percent == o.retry_budget_percent && + retry_budget_min_concurrency == o.retry_budget_min_concurrency; + } + bool operator!=(const CircuitBreakerConfig& o) const { return !(*this == o); } +}; + struct UpstreamConfig { std::string name; std::string host; @@ -145,7 +191,18 @@ struct UpstreamConfig { UpstreamTlsConfig tls; UpstreamPoolConfig pool; ProxyConfig proxy; + CircuitBreakerConfig circuit_breaker; + // Excludes `circuit_breaker` — breaker fields are live-reloadable via + // `CircuitBreakerManager::Reload`, which `HttpServer::Reload` invokes on + // every reload. Topology fields (name, host, port, tls, pool, + // proxy) remain restart-only; a mismatch here triggers the + // "restart required" warning in the outer reload. + // + // Contract: a config pair that differs ONLY in circuit_breaker fields + // must compare EQUAL so the outer reload doesn't fire a spurious warn. + // Any future field whose propagation path is wired into a live + // `*Manager::Reload` should be removed from this operator symmetrically. 
bool operator==(const UpstreamConfig& o) const {
 return name == o.name && host == o.host && port == o.port &&
 tls == o.tls && pool == o.pool && proxy == o.proxy;
diff --git a/include/http/http_server.h b/include/http/http_server.h
index bffbd854..f595d1c3 100644
--- a/include/http/http_server.h
+++ b/include/http/http_server.h
@@ -22,6 +22,10 @@
 class UpstreamManager;
 class ProxyHandler;
+namespace circuit_breaker {
+class CircuitBreakerManager;
+}
+
 class HttpServer {
 public:
 // Snapshot of server runtime statistics. All values are approximate
@@ -336,6 +340,16 @@ class HttpServer {
 std::vector<UpstreamConfig> upstream_configs_;
 std::unique_ptr<UpstreamManager> upstream_manager_;
+ // Circuit breaker — declared AFTER upstream_manager_ so destruction
+ // order is breaker-FIRST, pool-SECOND (design §3.1). On shutdown the
+ // breaker's slices may still be consulted by in-flight
+ // ProxyTransactions until they drain; destroying the breaker first
+ // (before the pool) is safe because UpstreamManager's outstanding
+ // breaker_manager_ pointer is checked against null on every lookup.
+ // Destroying the pool first would leave breaker slices holding
+ // dangling references.
+ std::unique_ptr<circuit_breaker::CircuitBreakerManager> circuit_breaker_manager_;
+
 // Rate limiting
 RateLimitConfig rate_limit_config_;
 std::unique_ptr<RateLimitManager> rate_limit_manager_;
diff --git a/include/upstream/pool_partition.h b/include/upstream/pool_partition.h
index 4c33a0cd..a6d904b2 100644
--- a/include/upstream/pool_partition.h
+++ b/include/upstream/pool_partition.h
@@ -25,6 +25,11 @@ class PoolPartition {
 static constexpr int CHECKOUT_CONNECT_TIMEOUT = -3;
 static constexpr int CHECKOUT_SHUTTING_DOWN = -4;
 static constexpr int CHECKOUT_QUEUE_TIMEOUT = -5;
+ // Delivered to wait-queue waiters drained on a breaker trip by
+ // DrainWaitQueueOnTrip. ProxyTransaction::OnCheckoutError maps
+ // this to RESULT_CIRCUIT_OPEN so the queued client gets the same
+ // circuit-open response a fresh requester would get.
+ static constexpr int CHECKOUT_CIRCUIT_OPEN = -6;
 PoolPartition(std::shared_ptr<Dispatcher> dispatcher,
 const std::string& upstream_host,
 int upstream_port,
@@ -85,6 +90,28 @@ class PoolPartition {
 // completion. Same pattern as ScheduleInitiateShutdown.
 void ScheduleForceCloseActive();
+ // Drain the wait queue on a CLOSED → OPEN breaker trip.
+ //
+ // Every live waiter receives CHECKOUT_CIRCUIT_OPEN (mapped by
+ // ProxyTransaction::OnCheckoutError to RESULT_CIRCUIT_OPEN, emitting
+ // the §12.1 circuit-open response). Cancelled waiters are dropped
+ // silently — the transaction already tore its side down via the
+ // framework abort hook. Does NOT set shutting_down_ (this is a
+ // transient drain, not a shutdown); the partition keeps its
+ // connections for HALF_OPEN probing when the open window elapses.
+ //
+ // Dispatcher-thread-only. The breaker's transition callback fires
+ // on the slice's owning dispatcher thread — the SAME dispatcher
+ // that owns this partition (one slice ↔ one partition by
+ // dispatcher_index). No enqueue needed.
+ //
+ // Rationale: without this drain, a queued waiter admitted by
+ // ConsultBreaker just before the trip would wait out the full
+ // `open_duration_ms` (up to 60s by default) before the pool's
+ // queue timeout rejects it. That's a visible latency spike for
+ // clients who are about to be served 503 anyway.
+ void DrainWaitQueueOnTrip();
+
 bool IsShuttingDown() const { return shutting_down_; }
 // Stats (dispatcher-thread-only reads)
diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h
index 6e25c689..ccda6d24 100644
--- a/include/upstream/proxy_transaction.h
+++ b/include/upstream/proxy_transaction.h
@@ -6,6 +6,7 @@
 #include "upstream/header_rewriter.h"
 #include "upstream/retry_policy.h"
 #include "config/server_config.h" // ProxyConfig (stored by value)
+#include "circuit_breaker/retry_budget.h" // RetryBudget::InFlightGuard (member-by-value)
 #include "http/http_callbacks.h"
 #include "http/http_response.h"
 // , , , , , provided by common.h
@@ -15,16 +16,28 @@
 class UpstreamManager;
 class ConnectionHandler;
 class Dispatcher;
+namespace circuit_breaker {
+class CircuitBreakerSlice;
+} // RetryBudget already defined via retry_budget.h
+
 class ProxyTransaction : public std::enable_shared_from_this<ProxyTransaction> {
 public:
 // Result codes for internal state tracking
- static constexpr int RESULT_SUCCESS = 0;
- static constexpr int RESULT_CHECKOUT_FAILED = -1; // Upstream connect failure → 502
- static constexpr int RESULT_SEND_FAILED = -2;
- static constexpr int RESULT_PARSE_ERROR = -3;
- static constexpr int RESULT_RESPONSE_TIMEOUT = -4;
+ static constexpr int RESULT_SUCCESS = 0;
+ static constexpr int RESULT_CHECKOUT_FAILED = -1; // Upstream connect failure → 502
+ static constexpr int RESULT_SEND_FAILED = -2;
+ static constexpr int RESULT_PARSE_ERROR = -3;
+ static constexpr int RESULT_RESPONSE_TIMEOUT = -4;
 static constexpr int RESULT_UPSTREAM_DISCONNECT = -5;
- static constexpr int RESULT_POOL_EXHAUSTED = -6; // Local capacity → 503
+ static constexpr int RESULT_POOL_EXHAUSTED = -6; // Local capacity → 503
+ // Circuit breaker rejected this attempt before it touched the upstream.
+ // Carries Retry-After + X-Circuit-Breaker headers (§12.1).
+ // Terminal — retry loop MUST NOT retry this outcome (§8).
+ static constexpr int RESULT_CIRCUIT_OPEN = -7; + // Retry budget exhausted. No Retry-After; distinct header + // X-Retry-Budget-Exhausted so operators can tell the two 503s apart + // from circuit-open rejects. + static constexpr int RESULT_RETRY_BUDGET_EXHAUSTED = -8; // Constructor copies all needed fields from client_request (method, path, // query, headers, body, params, dispatcher_index, client_ip, client_tls, @@ -145,6 +158,47 @@ class ProxyTransaction : public std::enable_shared_from_this { // Timing std::chrono::steady_clock::time_point start_time_; + // Circuit breaker integration — resolved once in Start() from + // `service_name_` + `dispatcher_index_`. Null when there's no + // CircuitBreakerManager attached (server has no upstreams, or the + // breaker is being built lazily) — the breaker is simply skipped in + // that case. Lifetime: the slice is owned by CircuitBreakerHost in + // CircuitBreakerManager on HttpServer, which outlives this transaction. + circuit_breaker::CircuitBreakerSlice* slice_ = nullptr; + + // Per-host retry budget, resolved alongside `slice_` in Start() from + // the same CircuitBreakerHost. Null when there's no breaker attached + // for this service — in that case the transaction skips budget + // tracking entirely. Lifetime: the budget is owned by the host, + // which outlives this transaction (destruction order guaranteed by + // HttpServer member declaration). + circuit_breaker::RetryBudget* retry_budget_ = nullptr; + + // Per-attempt in-flight tracker. Held for the duration of each + // attempt (first try and retries alike). Replaced on every + // AttemptCheckout — move-assignment decrements the counter for the + // prior attempt and increments for the new one, so a retrying + // transaction stays at a single in_flight unit. Default-constructed + // guard is empty (counter_ = nullptr): used when retry_budget_ is + // null or before the first ConsultBreaker admission. 
+ circuit_breaker::RetryBudget::InFlightGuard inflight_guard_; + + // Per-ATTEMPT admission state. Reset on each call to ConsultBreaker(); + // paired Report*() calls thread the `generation` back so the slice + // can drop stale completions across state transitions (see + // CircuitBreakerSlice::Admission doc). generation_==0 is a sentinel + // for "no admission held" — slice domain gens start at 1 so a 0-gen + // report always drops safely. + uint64_t admission_generation_ = 0; + bool is_probe_ = false; + + // Retry-budget token held by this transaction's current retry + // attempt (attempt_ > 0). Set true after a successful + // TryConsumeRetry in MaybeRetry; cleared by ReleaseRetryToken in + // Cleanup. Dry-run rejects proceed but the flag stays false — no + // token was consumed, so no ReleaseRetry is required. + bool retry_token_held_ = false; + // Internal methods void AttemptCheckout(); void OnCheckoutReady(UpstreamLease lease); @@ -170,6 +224,60 @@ class ProxyTransaction : public std::enable_shared_from_this { void ArmResponseTimeout(int explicit_budget_ms = 0); void ClearResponseTimeout(); - // Error response factory (maps result codes to HTTP responses) + // Error response factory (maps result codes to HTTP responses). + // Circuit-open and retry-budget responses need richer context + // (Retry-After from slice_, distinguishing header), so they have + // dedicated factories below — MakeErrorResponse falls back to a + // plain 503 for those codes if called generically. 
static HttpResponse MakeErrorResponse(int result_code); + + // Emit the circuit-open response (design §12.1): + // 503 + Retry-After (seconds until slice->OpenUntil()) + // + X-Circuit-Breaker: open + // + X-Upstream-Host: service:host:port + HttpResponse MakeCircuitOpenResponse() const; + + // Emit the retry-budget-exhausted response (design §12.2): + // 503 + X-Retry-Budget-Exhausted: 1 + static HttpResponse MakeRetryBudgetResponse(); + + // Breaker helpers — gate and outcome classification. + // + // ConsultBreaker: call at the top of AttemptCheckout. Populates + // admission_generation_ and is_probe_ on admission; delivers the + // circuit-open response and returns false on reject. Dry-run admits + // and returns true (slice already counted the would-reject). + // Returns true if the caller should proceed to CheckoutAsync. + bool ConsultBreaker(); + + // ReportBreakerOutcome: classify a result_code into + // success/failure/neutral (per design §7) and call slice->Report* + // with admission_generation_. Clears admission_generation_ so a + // double-report is impossible. + // + // failure_kind is ignored unless the outcome is a FailureKind-bearing + // result; the caller passes the appropriate kind for 5xx vs disconnect + // vs timeout since the slice treats them differently only for logs. + void ReportBreakerOutcome(int result_code); + + // ReleaseBreakerAdmissionNeutral: release the admission slot without + // counting a success or failure. Used when the transaction is aborted + // locally (Cancel() on client disconnect, cancelled_ early-return + // after checkout, etc.) before an upstream health signal was observed. + // + // Without this, a HALF_OPEN probe slot is stranded if the client + // disconnects mid-probe — the slice stays in half_open_full until an + // external reset. No-op if admission_generation_ == 0. Clears + // admission_generation_ so a following ReportBreakerOutcome is a + // no-op. 
+ void ReleaseBreakerAdmissionNeutral(); + + // Release the retry-budget token held by this attempt, if any. + // Idempotent via the retry_token_held_ flag — called from Cleanup + // between attempts (so the next retry's TryConsumeRetry sees a + // freshly-released counter) AND from the destructor / Cancel as + // safety nets. No-op when no budget was attached or no token was + // consumed (e.g. first attempt, or dry-run reject that didn't + // consume). + void ReleaseRetryToken(); }; diff --git a/include/upstream/upstream_manager.h b/include/upstream/upstream_manager.h index c308cbd3..346bc4d5 100644 --- a/include/upstream/upstream_manager.h +++ b/include/upstream/upstream_manager.h @@ -9,6 +9,10 @@ class TlsClientContext; +namespace circuit_breaker { +class CircuitBreakerManager; +} + class UpstreamManager { public: UpstreamManager(const std::vector& upstreams, @@ -59,6 +63,33 @@ class UpstreamManager { // Check if an upstream service is configured bool HasUpstream(const std::string& service_name) const; + // Look up the PoolPartition for (service_name, dispatcher_index). + // Returns nullptr if service is unknown or dispatcher_index is out + // of range. Used by the circuit-breaker transition callback (wired + // in HttpServer::MarkServerReady) to drain the wait queue on a + // CLOSED → OPEN trip. Must be called on the dispatcher thread + // identified by `dispatcher_index` — the returned partition's + // DrainWaitQueueOnTrip is dispatcher-thread-only. + PoolPartition* GetPoolPartition(const std::string& service_name, + size_t dispatcher_index); + + // Install a non-owning pointer to the server's CircuitBreakerManager. + // Called once from HttpServer::MarkServerReady after both managers are + // constructed (§3.1). Lifetime guarantee: the CircuitBreakerManager + // is declared AFTER upstream_manager_ on HttpServer, so it destructs + // FIRST — UpstreamManager never reads through a dangling pointer on + // shutdown. Passing nullptr is allowed (detaches). 
+ void AttachCircuitBreakerManager(circuit_breaker::CircuitBreakerManager* mgr) { + breaker_manager_.store(mgr, std::memory_order_release); + } + + // Returns the attached breaker manager, or nullptr if no manager is + // attached. Safe from any thread (atomic load, acquire so any + // Attach-time publication is visible). + circuit_breaker::CircuitBreakerManager* GetCircuitBreakerManager() const { + return breaker_manager_.load(std::memory_order_acquire); + } + private: // service_name → host pool. Built once at construction, never modified. std::unordered_map> pools_; @@ -73,6 +104,14 @@ class UpstreamManager { // reject new checkouts before per-partition shutdown tasks execute. std::atomic shutting_down_{false}; + // Non-owning pointer to the circuit-breaker manager, installed by + // HttpServer::MarkServerReady after both managers exist. Atomic so + // late-arriving hot-path reads in ProxyTransaction see either a + // coherent pointer or nullptr (never torn). Owned by HttpServer; + // lifetime outlives UpstreamManager (breaker destructs first — + // §3.1 ownership). Default nullptr — breaker is an opt-in layer. 
+ std::atomic breaker_manager_{nullptr}; + // Manager-owned atomic counter: total outstanding connections std::atomic outstanding_conns_{0}; diff --git a/server/circuit_breaker_host.cc b/server/circuit_breaker_host.cc new file mode 100644 index 00000000..4523d3be --- /dev/null +++ b/server/circuit_breaker_host.cc @@ -0,0 +1,142 @@ +#include "circuit_breaker/circuit_breaker_host.h" +#include "dispatcher.h" +#include "log/logger.h" + +namespace circuit_breaker { + +CircuitBreakerHost::CircuitBreakerHost(std::string service_name, + std::string host, + int port, + size_t partition_count, + const CircuitBreakerConfig& config) + : service_name_(std::move(service_name)), + host_(std::move(host)), + port_(port), + config_(config), + retry_budget_(std::make_unique( + config.retry_budget_percent, + config.retry_budget_min_concurrency)) { + // Clamp partition_count — a zero-partition host would be unusable + // (no slices to dispatch to). Tests or misuse may pass 0; log and + // clamp to 1 so the host is at least consistent. + if (partition_count == 0) { + logging::Get()->error( + "CircuitBreakerHost({}, {}:{}) constructed with 0 partitions; " + "clamping to 1", + service_name_, host_, port_); + partition_count = 1; + } + + slices_.reserve(partition_count); + for (size_t i = 0; i < partition_count; ++i) { + // Per-slice label for logs — lets operators grep logs for a + // specific host:partition pair. Key=value form matches the + // format documented in circuit_breaker_slice.h:host_label_. 
+ std::string label = "service=" + service_name_ + + " host=" + host_ + ":" + std::to_string(port_) + + " partition=" + std::to_string(i); + slices_.emplace_back(std::make_unique( + std::move(label), i, config_)); + } + logging::Get()->debug( + "CircuitBreakerHost created service={} host={}:{} partitions={} " + "enabled={} retry_budget={}%,min={}", + service_name_, host_, port_, partition_count, + config_.enabled, + config_.retry_budget_percent, + config_.retry_budget_min_concurrency); +} + +CircuitBreakerSlice* CircuitBreakerHost::GetSlice(size_t dispatcher_index) { + if (dispatcher_index >= slices_.size()) return nullptr; + return slices_[dispatcher_index].get(); +} + +CircuitBreakerHostSnapshot CircuitBreakerHost::Snapshot() const { + CircuitBreakerHostSnapshot snap; + snap.service_name = service_name_; + snap.host = host_; + snap.port = port_; + snap.slices.reserve(slices_.size()); + + for (const auto& slice : slices_) { + CircuitBreakerHostSnapshot::SliceRow row; + row.dispatcher_index = slice->dispatcher_index(); + row.state = slice->CurrentState(); + row.trips = slice->Trips(); + row.rejected = slice->Rejected(); + row.probe_successes = slice->ProbeSuccesses(); + row.probe_failures = slice->ProbeFailures(); + + snap.total_trips += row.trips; + snap.total_rejected += row.rejected; + if (row.state == State::OPEN) ++snap.open_partitions; + else if (row.state == State::HALF_OPEN) ++snap.half_open_partitions; + + snap.slices.push_back(row); + } + + // Retry budget aggregate (host-level, not per-partition). + snap.retries_in_flight = retry_budget_->RetriesInFlight(); + snap.retries_rejected = retry_budget_->RetriesRejected(); + snap.in_flight = retry_budget_->InFlight(); + + return snap; +} + +void CircuitBreakerHost::Reload( + const std::vector>& dispatchers, + const CircuitBreakerConfig& new_config) { + // Dispatcher list must match the slice count one-for-one — the + // slice at index i lives on dispatcher i. 
A size mismatch is a + // programming error (topology changed post-construction, which is + // restart-only); log and bail rather than mis-dispatching. + if (dispatchers.size() != slices_.size()) { + logging::Get()->error( + "CircuitBreakerHost::Reload({}:{}) dispatcher count mismatch: " + "got {}, expected {} — reload skipped", + service_name_, host_, dispatchers.size(), slices_.size()); + return; + } + + // Update host-level retry budget fields immediately — atomic stores, + // no dispatcher routing needed. RetryBudget::Reload clamps internally. + retry_budget_->Reload(new_config.retry_budget_percent, + new_config.retry_budget_min_concurrency); + + // Enqueue per-slice Reload on each owning dispatcher. The slice is + // dispatcher-thread-local for mutation, so the config swap must + // happen there. Passing slice as raw pointer is safe: slices_ is + // owned by `this` (the host), which outlives the manager's reload + // (enforced by CircuitBreakerManager's lifetime). + for (size_t i = 0; i < slices_.size(); ++i) { + CircuitBreakerSlice* slice = slices_[i].get(); + auto& dispatcher = dispatchers[i]; + if (!dispatcher) { + logging::Get()->error( + "CircuitBreakerHost::Reload({}:{}) null dispatcher at index {}", + service_name_, host_, i); + continue; + } + dispatcher->EnQueue([slice, new_config]() { + slice->Reload(new_config); + }); + } + + // Save the new config for future Snapshot() / construction-like + // operations. Other threads never read config_ directly. + config_ = new_config; +} + +void CircuitBreakerHost::SetTransitionCallbackOnAllSlices( + StateTransitionCallback cb) { + for (auto& slice : slices_) { + // Copy the callback so each slice owns its own std::function. + // Passing by value into SetTransitionCallback gives each slice + // an independent copy, avoiding cross-partition std::function + // data races. 
+ slice->SetTransitionCallback(cb); + } +} + +} // namespace circuit_breaker diff --git a/server/circuit_breaker_manager.cc b/server/circuit_breaker_manager.cc new file mode 100644 index 00000000..9e4934a3 --- /dev/null +++ b/server/circuit_breaker_manager.cc @@ -0,0 +1,124 @@ +#include "circuit_breaker/circuit_breaker_manager.h" +#include "log/logger.h" +#include + +namespace circuit_breaker { + +CircuitBreakerManager::CircuitBreakerManager( + const std::vector& upstreams, + size_t partition_count, + std::vector> dispatchers) + : dispatchers_(std::move(dispatchers)) { + // Invariant (production path): slices are indexed by dispatcher, + // so partition_count must match dispatcher count. Any divergence + // would cause every subsequent host->Reload() to silently skip + // (size-mismatch guard in CircuitBreakerHost::Reload) — fail + // loudly at startup instead of on reload. + // + // Exception: pure unit tests that don't exercise Reload pass an + // empty dispatcher list; skip the check in that case so those + // tests can continue to allocate slices without wiring up live + // dispatchers. + if (!dispatchers_.empty() && partition_count != dispatchers_.size()) { + logging::Get()->critical( + "CircuitBreakerManager: partition_count ({}) != dispatcher count " + "({}) — topology mismatch", + partition_count, dispatchers_.size()); + throw std::invalid_argument( + "CircuitBreakerManager: partition_count must equal dispatcher count"); + } + + // Build one Host per upstream regardless of .circuit_breaker.enabled. + // Disabled hosts still need a live Slice so a later reload can flip + // them on without re-wiring transition callbacks (design §3.1). + hosts_.reserve(upstreams.size()); + for (const auto& u : upstreams) { + if (u.name.empty()) { + // ConfigLoader::Validate rejects empty names upstream, but + // defense in depth — skip rather than insert an unreachable + // host with an empty key that would shadow future lookups. 
+ logging::Get()->error( + "CircuitBreakerManager: skipping upstream with empty name"); + continue; + } + auto [it, inserted] = hosts_.emplace( + u.name, + std::make_unique( + u.name, u.host, u.port, partition_count, u.circuit_breaker)); + if (!inserted) { + // Duplicate service name — shouldn't happen (Validate checks + // uniqueness), but log so the collision is visible rather + // than silently dropping the second entry. + logging::Get()->error( + "CircuitBreakerManager: duplicate upstream name '{}' ignored", + u.name); + } + } + logging::Get()->info( + "CircuitBreakerManager initialized hosts={} partitions={}", + hosts_.size(), partition_count); +} + +CircuitBreakerHost* CircuitBreakerManager::GetHost( + const std::string& service_name) { + auto it = hosts_.find(service_name); + return it == hosts_.end() ? nullptr : it->second.get(); +} + +const CircuitBreakerHost* CircuitBreakerManager::GetHost( + const std::string& service_name) const { + auto it = hosts_.find(service_name); + return it == hosts_.end() ? nullptr : it->second.get(); +} + +void CircuitBreakerManager::Reload( + const std::vector& new_upstreams) { + // Serialize with any other Reload calls. Hot path doesn't take this. + std::lock_guard lk(reload_mtx_); + + // Detect topology changes (added / removed service names) so we can + // log and skip — the authoritative "restart required" warning lives + // in HttpServer::Reload; we just honor the "existing hosts only" + // contract by applying breaker fields to matching names and nothing + // else. + std::unordered_set new_names; + new_names.reserve(new_upstreams.size()); + for (const auto& u : new_upstreams) new_names.insert(u.name); + + for (const auto& u : new_upstreams) { + auto* host = GetHost(u.name); + if (!host) { + // New service name — topology change, skip. The outer + // reload layer warns. 
+ logging::Get()->warn( + "CircuitBreakerManager::Reload: new upstream '{}' requires " + "restart (ignored)", + u.name); + continue; + } + host->Reload(dispatchers_, u.circuit_breaker); + } + + // Log removals without touching the hosts (their removal also + // requires a restart). + for (const auto& [name, _] : hosts_) { + if (new_names.find(name) == new_names.end()) { + logging::Get()->warn( + "CircuitBreakerManager::Reload: removed upstream '{}' requires " + "restart (ignored)", + name); + } + } +} + +std::vector +CircuitBreakerManager::SnapshotAll() const { + std::vector snapshots; + snapshots.reserve(hosts_.size()); + for (const auto& [_, host] : hosts_) { + snapshots.push_back(host->Snapshot()); + } + return snapshots; +} + +} // namespace circuit_breaker diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc new file mode 100644 index 00000000..e6bd1c93 --- /dev/null +++ b/server/circuit_breaker_slice.cc @@ -0,0 +1,675 @@ +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" + +namespace circuit_breaker { + +CircuitBreakerSlice::CircuitBreakerSlice(std::string host_label, + size_t dispatcher_index, + const CircuitBreakerConfig& config, + TimeSource time_source) + : host_label_(std::move(host_label)), + dispatcher_index_(dispatcher_index), + config_(config), + time_source_(std::move(time_source)), + window_(config.window_seconds) { +} + +std::chrono::steady_clock::time_point CircuitBreakerSlice::Now() const { + if (time_source_) return time_source_(); + return std::chrono::steady_clock::now(); +} + +std::chrono::steady_clock::time_point CircuitBreakerSlice::OpenUntil() const { + int64_t ns = open_until_steady_ns_.load(std::memory_order_acquire); + if (ns == 0) return std::chrono::steady_clock::time_point{}; + return std::chrono::steady_clock::time_point(std::chrono::nanoseconds(ns)); +} + +// Cap the left-shift exponent used to compute open duration. 
`1 << 30` already +// covers ~12.4 days of base open duration even before the `max_open_duration_ms` +// clamp — higher shift amounts would invoke undefined behavior on `int`. +static constexpr int MAX_OPEN_DURATION_SHIFT = 30; + +// Scale factor for integer percent math: `fails * PERCENT_SCALE >= threshold * total`. +static constexpr int PERCENT_SCALE = 100; + +std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { + // Duration = base << consecutive_trips_ (shift expresses 2^n exponential). + // `consecutive_trips_` is the number of trips observed BEFORE this one, so + // the first trip uses 2^0 = 1x base, the second trip uses 2x, etc. + // Callers must increment consecutive_trips_ AFTER calling this method. + int trips = consecutive_trips_.load(std::memory_order_relaxed); + if (trips > MAX_OPEN_DURATION_SHIFT) trips = MAX_OPEN_DURATION_SHIFT; + // Clamp base/max for programmatic callers that bypass ConfigLoader::Validate + // (same hardening as CircuitBreakerWindow's ctor and the HALF_OPEN probe + // budget snapshot). Without these clamps: + // - base_open_duration_ms <= 0: `base_ms << trips` is <= 0 → open_until + // <= now → next TryAcquire immediately drains OPEN→HALF_OPEN, + // disabling the backoff entirely. + // - max_open_duration_ms < base_open_duration_ms: the overflow/clamp + // branch (`scaled_ms > max_ms`) fires on every trip, pinning the + // duration to a value smaller than base — same "no meaningful + // backoff" effect. + // Clamp floors: base >= 1ms, max >= base. + int64_t base_ms = config_.base_open_duration_ms > 0 + ? config_.base_open_duration_ms : 1; + int64_t max_ms = config_.max_open_duration_ms >= base_ms + ? 
config_.max_open_duration_ms : base_ms; + int64_t scaled_ms = base_ms << trips; + if (scaled_ms < base_ms /* overflow */ || scaled_ms > max_ms) { + scaled_ms = max_ms; + } + return std::chrono::milliseconds(scaled_ms); +} + +int64_t CircuitBreakerSlice::NextOpenDurationMs() const { + return std::chrono::duration_cast( + ComputeOpenDuration()).count(); +} + +bool CircuitBreakerSlice::ShouldTripClosed( + std::chrono::steady_clock::time_point now) { + if (consecutive_failures_ >= config_.consecutive_failure_threshold) { + return true; + } + int64_t total = window_.TotalCount(now); + if (total < config_.minimum_volume) return false; + int64_t fails = window_.FailureCount(now); + // Integer percent math: fails * PERCENT_SCALE >= threshold_pct * total. + return (fails * PERCENT_SCALE) >= + (static_cast(config_.failure_rate_threshold) * total); +} + +void CircuitBreakerSlice::TripClosedToOpen( + const char* trigger, std::chrono::steady_clock::time_point now) { + // `now` is the same time_point the caller (ReportFailure) passed to + // AddFailure/ShouldTripClosed — reusing it keeps the trip log's + // window_total/window_fail_rate consistent with the rate check that + // fired the trip. Calling Now() fresh here would risk crossing a + // bucket boundary and logging window_total=0 for the very failure + // that tripped the breaker. + // + // Capture pre-reset observability context BEFORE mutating state. + // §11.1 log format asks for consecutive_failures + window_total + + // window_fail_rate at the trip event so operators can distinguish a + // "100 consecutive bad responses" trip from a "55% failure rate over + // a wide call window" trip — two very different operational stories + // that the `trigger` string alone doesn't fully capture. + int consec_at_trip = consecutive_failures_; + int64_t window_total = window_.TotalCount(now); + int64_t window_failures = window_.FailureCount(now); + int window_fail_rate_pct = + (window_total > 0) + ? 
static_cast((window_failures * 100) / window_total) + : 0; + + auto duration = ComputeOpenDuration(); // uses current consecutive_trips_ + consecutive_trips_.fetch_add(1, std::memory_order_relaxed); + auto open_until = now + duration; + int64_t open_until_ns = + std::chrono::duration_cast( + open_until.time_since_epoch()).count(); + + open_until_steady_ns_.store(open_until_ns, std::memory_order_release); + state_.store(State::OPEN, std::memory_order_release); + + // Reset on-trip bookkeeping. + consecutive_failures_ = 0; + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + half_open_admitted_ = 0; + first_reject_logged_for_open_ = false; + // Bump closed_gen_: non-probe admissions from the closing CLOSED cycle + // are now stale. Late Report(false, ...) calls for those requests drop. + // halfopen_gen_ is NOT bumped — OPEN holds no HALF_OPEN admissions. + ++closed_gen_; + + trips_.fetch_add(1, std::memory_order_relaxed); + + logging::Get()->warn( + "circuit breaker tripped {} trigger={} consecutive_failures={} " + "window_total={} window_fail_rate={} open_for_ms={} consecutive_trips={}", + host_label_, trigger, consec_at_trip, + window_total, window_fail_rate_pct, + std::chrono::duration_cast(duration).count(), + consecutive_trips_.load(std::memory_order_relaxed)); + + if (transition_cb_) transition_cb_(State::CLOSED, State::OPEN, trigger); +} + +void CircuitBreakerSlice::TransitionOpenToHalfOpen() { + state_.store(State::HALF_OPEN, std::memory_order_release); + // Clear open_until_steady_ns_ per the OpenUntil() contract ("zero when + // not OPEN"). Leaving a stale deadline here would cause + // ProxyTransaction::MakeCircuitOpenResponse to compute a Retry-After + // from a past time_point (negative delta → floor at 1s, misleading for + // a reject in the HALF_OPEN probe-budget-full path). Retry-After for + // HALF_OPEN rejects is computed fresh by callers when needed. 
+ open_until_steady_ns_.store(0, std::memory_order_release); + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + half_open_admitted_ = 0; + // Snapshot the probe budget for this cycle. A live Reload() during this + // HALF_OPEN episode may lower or raise config_.permitted_half_open_calls, + // but TryAcquire's slot gate (Case B) and ReportSuccess's close check must + // both operate against the budget that was in effect when probes were + // admitted. Without the snapshot: lowering the limit causes premature close + // (first success satisfies the reduced count → TransitionHalfOpenToClosed + // bumps halfopen_gen_ → remaining admitted probes become stale → their + // failures are silently dropped and the breaker falsely closes). + // + // Clamp to a minimum of 1. ConfigLoader::Validate() enforces >= 1 on the + // production path, but programmatic callers (tests, future direct users) + // that bypass validation could set permitted_half_open_calls <= 0. With + // snapshot=0, TryAcquire's Case B check (`inflight >= snapshot`) is + // immediately true for every probe → no probe ever admitted → no probe + // ever completes → half_open_inflight_ stays at 0 forever → slice is + // permanently stuck in HALF_OPEN rejecting all traffic. Matches the + // symmetric clamp in CircuitBreakerWindow's ctor. + int permitted = config_.permitted_half_open_calls; + half_open_permitted_snapshot_ = permitted > 0 ? permitted : 1; + // Reset the info-log "first reject" breadcrumb so the first rejection + // observed in the HALF_OPEN phase surfaces at info, not debug. HALF_OPEN + // rejection (recovery attempt failing or probe budget full) is + // operationally distinct from OPEN rejection (still backing off) and + // deserves its own breadcrumb in default-warn operator logs. + first_reject_logged_for_open_ = false; + // NOTE: neither closed_gen_ nor halfopen_gen_ is bumped here. 
No + // admissions are made in OPEN — the previous HALF_OPEN cycle (if any) + // already bumped halfopen_gen_ on its exit (TripHalfOpenToOpen) or on + // cycle-complete (TransitionHalfOpenToClosed), so any latent stale + // probes are already tagged. Bumping again would be redundant. + + logging::Get()->info( + "circuit breaker half-open {} probes_allowed={}", + host_label_, half_open_permitted_snapshot_); + + if (transition_cb_) { + transition_cb_(State::OPEN, State::HALF_OPEN, "open_elapsed"); + } +} + +void CircuitBreakerSlice::TransitionHalfOpenToClosed() { + // Capture actual probes-succeeded BEFORE resetting — the log then reflects + // reality instead of the configured target (the two are equal at the moment + // of transition today, but relying on that is brittle if the transition + // logic ever changes). + int probes_succeeded = half_open_successes_; + + state_.store(State::CLOSED, std::memory_order_release); + open_until_steady_ns_.store(0, std::memory_order_release); + consecutive_trips_.store(0, std::memory_order_relaxed); + consecutive_failures_ = 0; + window_.Reset(); + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + half_open_admitted_ = 0; + first_reject_logged_for_open_ = false; + // Bump halfopen_gen_: the just-completed HALF_OPEN cycle's probe + // admissions are now stale. closed_gen_ is NOT bumped — pre-trip + // CLOSED admissions were already invalidated by TripClosedToOpen + // when we left CLOSED. 
+ ++halfopen_gen_; + + logging::Get()->info( + "circuit breaker closed {} probes_succeeded={}", + host_label_, probes_succeeded); + + if (transition_cb_) { + transition_cb_(State::HALF_OPEN, State::CLOSED, "probe_success"); + } +} + +void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { + auto duration = ComputeOpenDuration(); // uses current consecutive_trips_ + consecutive_trips_.fetch_add(1, std::memory_order_relaxed); + auto now = Now(); + auto open_until = now + duration; + int64_t open_until_ns = + std::chrono::duration_cast( + open_until.time_since_epoch()).count(); + + open_until_steady_ns_.store(open_until_ns, std::memory_order_release); + state_.store(State::OPEN, std::memory_order_release); + + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + half_open_admitted_ = 0; + first_reject_logged_for_open_ = false; + // Bump halfopen_gen_: probe admissions from the closing HALF_OPEN + // cycle are now stale. closed_gen_ is NOT bumped — no CLOSED + // admissions are outstanding (we came from HALF_OPEN, not CLOSED). + ++halfopen_gen_; + + trips_.fetch_add(1, std::memory_order_relaxed); + + logging::Get()->warn( + "circuit breaker re-tripped {} trigger={} open_for_ms={} consecutive_trips={}", + host_label_, trigger, + std::chrono::duration_cast(duration).count(), + consecutive_trips_.load(std::memory_order_relaxed)); + + if (transition_cb_) transition_cb_(State::HALF_OPEN, State::OPEN, trigger); +} + +CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { + // Disabled fast path — zero overhead when config.enabled=false. + // Use generation 0 (sentinel) since the slice won't consult it on report. + if (!config_.enabled) { + return Admission{Decision::ADMITTED, /*generation=*/0}; + } + + State s = state_.load(std::memory_order_acquire); + + if (s == State::OPEN) { + // Check whether the open window has elapsed. 
+ int64_t open_until_ns = + open_until_steady_ns_.load(std::memory_order_acquire); + int64_t now_ns = std::chrono::duration_cast( + Now().time_since_epoch()).count(); + if (now_ns >= open_until_ns) { + // Transition OPEN → HALF_OPEN on this thread. Because slices are + // dispatcher-thread-pinned, no CAS is needed (a plain store is + // safe under the single-writer invariant). + TransitionOpenToHalfOpen(); + s = State::HALF_OPEN; + } else { + // Rejected admissions get generation 0 — callers must not call + // Report* for a rejected admission, and 0 always compares stale + // (domain gens start at 1), so an accidental Report would drop + // safely rather than mutating state. + return Admission{RejectWithLog("open", /*half_open_full=*/false), + /*generation=*/0}; + } + } + + if (s == State::HALF_OPEN) { + // Case A: a sibling probe already failed. Short-circuit remaining + // admissions — the breaker is guaranteed to re-trip once in-flight + // probes drain. This is operationally DIFFERENT from "budget + // exhausted" (case B): probe slots may still be free, we just know + // using them can't change the outcome. Track it with its own log + // label and do NOT bump `rejected_half_open_full_` — that counter + // is specifically "probing, no capacity left" for dashboards. + if (half_open_saw_failure_) { + return Admission{RejectWithLog("half_open_recovery_failing", + /*half_open_full=*/false), + /*generation=*/0}; + } + // Case B: probe budget exhausted for this cycle. "No capacity" — bump + // the dedicated counter so dashboards can tell this apart from + // saw_failure rejects. + // + // Gate on `half_open_admitted_` (total cycle admissions, never + // decrements), NOT on `half_open_inflight_`. Inflight drops when a + // probe completes, so gating on it would reuse the freed slot and let + // the cycle admit more than `snapshot` total probes. 
Consequences of + // that bug: the close check `successes >= snapshot` could fire before + // ALL admitted probes have reported (the reused-slot probe is still + // in flight); TransitionHalfOpenToClosed would bump halfopen_gen_; + // the late probe's failure would drop as stale — falsely marking an + // unhealthy host recovered. + // + // Use the cycle snapshot so a live Reload() that lowers + // permitted_half_open_calls mid-cycle doesn't change how many probes + // were promised to this cycle. + if (half_open_admitted_ >= half_open_permitted_snapshot_) { + return Admission{RejectWithLog("half_open_full", + /*half_open_full=*/true), + /*generation=*/0}; + } + half_open_admitted_++; + half_open_inflight_++; + // Probe admission — stamp with halfopen_gen_. + return Admission{Decision::ADMITTED_PROBE, halfopen_gen_}; + } + + // CLOSED: fast path — stamp with closed_gen_. + return Admission{Decision::ADMITTED, closed_gen_}; +} + +Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, + bool half_open_full) { + rejected_.fetch_add(1, std::memory_order_relaxed); + if (half_open_full) { + rejected_half_open_full_.fetch_add(1, std::memory_order_relaxed); + } + // First reject in this OPEN/HALF_OPEN cycle is info — gives operators + // looking at a flurry of 503s a single high-level breadcrumb in default- + // warn logs without flooding them. Subsequent rejects are debug. 
+ const bool first = !first_reject_logged_for_open_; + if (first) first_reject_logged_for_open_ = true; + + if (config_.dry_run) { + if (first) { + logging::Get()->info( + "[dry-run] circuit breaker would reject {} state={}", + host_label_, state_label); + } else { + logging::Get()->debug( + "[dry-run] circuit breaker would reject {} state={}", + host_label_, state_label); + } + return Decision::REJECTED_OPEN_DRYRUN; + } + if (first) { + logging::Get()->info( + "circuit breaker rejecting {} state={} (first reject this cycle)", + host_label_, state_label); + } else { + logging::Get()->debug( + "circuit breaker rejected {} state={}", host_label_, state_label); + } + return Decision::REJECTED_OPEN; +} + +void CircuitBreakerSlice::ReportSuccess(bool probe, + uint64_t admission_generation) { + if (!config_.enabled) return; + + if (probe) { + // Record the completed-probe outcome for observability regardless of + // current state — this is a signal about upstream behavior, not a + // signal about our state machine. + probe_successes_.fetch_add(1, std::memory_order_relaxed); + + // Generation guard: drop reports for probes admitted before the + // current HALF_OPEN cycle. Probes use halfopen_gen_ exclusively — + // so a window_seconds reload (bumps closed_gen_, NOT halfopen_gen_) + // does NOT invalidate in-flight probes, which would otherwise + // strand half_open_inflight_ at its pre-reload value and wedge the + // slice in HALF_OPEN/half_open_full. + if (admission_generation != halfopen_gen_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + + // Stale probe defense: we admitted this probe in HALF_OPEN, but the + // slice may have transitioned out (e.g., `Reload()` flipped enabled, + // `TransitionHalfOpenToClosed` already fired on sibling probes, or an + // operator toggle transitioned us to CLOSED via Reload()). + // Only touch HALF_OPEN bookkeeping / fire transitions when state is + // STILL HALF_OPEN.
+ if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + + if (half_open_inflight_ > 0) half_open_inflight_--; + if (half_open_saw_failure_) { + // A sibling probe already failed; whichever probe finishes last + // transitions to OPEN. Handle here only if this is the last probe. + if (half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } + return; + } + half_open_successes_++; + // Use the cycle snapshot so a mid-cycle Reload() that lowers the + // limit doesn't close the breaker early (before all admitted probes + // have reported back), silently dropping the remaining probes' failures. + if (half_open_successes_ >= half_open_permitted_snapshot_) { + TransitionHalfOpenToClosed(); + } + return; + } + + // Non-probe success path — checked against closed_gen_. + if (admission_generation != closed_gen_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Only meaningful when state is CLOSED. If the slice has since + // transitioned (e.g., other requests in this burst tripped it), this + // late outcome must NOT retroactively reset `consecutive_failures_` or + // pollute the window — a fresh CLOSED cycle after recovery would start + // with bogus success history. (Transitions bump `closed_gen_`, so the + // guard above catches this too; the state check is a direct guard for + // observability clarity.) + if (state_.load(std::memory_order_acquire) != State::CLOSED) return; + consecutive_failures_ = 0; + window_.AddSuccess(Now()); +} + +void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, + uint64_t admission_generation) { + (void)kind; // Kind is used by higher layers for logging; slice itself + // treats all failures the same way for trip math. + if (!config_.enabled) return; + + if (probe) { + probe_failures_.fetch_add(1, std::memory_order_relaxed); + + // Probes use halfopen_gen_ — see matching comment in ReportSuccess. 
+ if (admission_generation != halfopen_gen_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + + // Stale probe defense — see matching comment in ReportSuccess above. + if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + + if (half_open_inflight_ > 0) half_open_inflight_--; + half_open_saw_failure_ = true; + // On the last probe (or if all remaining complete) transition OPEN. + if (half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } + return; + } + + // Non-probe failure path — checked against closed_gen_. + if (admission_generation != closed_gen_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Only count when CLOSED. Late failures from requests admitted in CLOSED + // but completing after a trip must NOT re-enter `TripClosedToOpen` — + // doing so double-increments `consecutive_trips_` (inflating + // open_duration) and fires a spurious CLOSED→OPEN transition edge that + // downstream consumers (wait-queue drain, snapshot telemetry) would see + // as a ghost trip. (Again, the generation guard above catches this too; + // keep the state check for observability clarity.) + if (state_.load(std::memory_order_acquire) != State::CLOSED) return; + + consecutive_failures_++; + // Capture Now() once and reuse for both the record and the trip check. + // Separate Now() calls can cross a second boundary, letting TotalCount's + // internal Advance() zero the bucket we just wrote — with window_seconds=1, + // a 1-second delta trips the Advance full-reset path and the just-recorded + // failure disappears from the ring, missing a rate trip that should fire. + auto now = Now(); + window_.AddFailure(now); + + if (ShouldTripClosed(now)) { + const char* trigger = + (consecutive_failures_ >= config_.consecutive_failure_threshold) + ? "consecutive" : "rate"; + // Thread `now` through so the trip log's window stats reflect the + // same view ShouldTripClosed just used. 
+ TripClosedToOpen(trigger, now); + } +} + +void CircuitBreakerSlice::ReportNeutral(bool probe, + uint64_t admission_generation) { + if (!config_.enabled) return; + if (!probe) { + // CLOSED-state admission: no slot to release. The bool parameter + // exists for API symmetry with ReportSuccess/ReportFailure; a + // neutral outcome in CLOSED simply means the breaker records + // nothing (which matches pre-neutral behavior — POOL_EXHAUSTED, + // shutdown, and similar local terminations were already "ignored" + // on the CLOSED path). + return; + } + + // Probe: gate on halfopen_gen_ + current state, matching the other + // Report* paths. Stale (pre-transition or pre-reload) neutral + // completions drop silently into the stale-generation counter. + if (admission_generation != halfopen_gen_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + + // Return the slot to the cycle. Decrement BOTH inflight and admitted: + // - inflight so the last-probe re-trip logic below fires correctly, + // - admitted so a replacement probe can still be admitted within + // this cycle's budget (the whole point of a neutral release — + // the upstream wasn't actually exercised by this admission). + if (half_open_inflight_ > 0) half_open_inflight_--; + if (half_open_admitted_ > 0) half_open_admitted_--; + + // If an earlier sibling probe failed and this neutral release drains + // the last in-flight probe, the cycle must re-trip — otherwise the + // slice would wedge in HALF_OPEN with saw_failure=true, rejecting all + // future admissions via Case A forever. Mirrors the failure-path + // last-probe trigger. 
+ if (half_open_saw_failure_ && half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } +} + +void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { + const bool enabled_changed = (config_.enabled != new_config.enabled); + const bool window_changed = + (config_.window_seconds != new_config.window_seconds); + // Snapshot the OLD dry_run before config_ is overwritten — used at + // the end of Reload to detect a true→false flip and signal the + // host to drain any waiters that accumulated during shadow mode. + const bool old_dry_run = config_.dry_run; + + config_ = new_config; + if (window_changed) { + // Resize wipes the failure-rate ring buckets. Without bumping + // closed_gen_ here, late completions from pre-reload CLOSED + // admissions would pass the generation guard and repopulate the + // freshly empty window — mixing pre-reload and post-reload traffic + // in the rate-trip calc. + // + // CRUCIALLY: we bump ONLY closed_gen_, NOT halfopen_gen_. + // window_seconds affects only the CLOSED rate window. Bumping + // halfopen_gen_ too (as prior fix did) would invalidate in-flight + // probes, whose late reports could no longer decrement + // half_open_inflight_ or honor saw_failure/TripHalfOpenToOpen — + // wedging the slice in HALF_OPEN/half_open_full with full probe + // slots until another reset. Probe bookkeeping is untouched by + // Resize, so preserving halfopen_gen_ keeps probes live. + // + // Skip when enabled_changed is also true: the full-reset branch + // below bumps both generations as part of its larger reset. + window_.Resize(new_config.window_seconds); + if (!enabled_changed) { + // Reset consecutive_failures_ alongside the window wipe. + // Both are CLOSED-domain state from the same observation cycle. + // Bumping closed_gen_ drops all pre-reload CLOSED reports + // (correct — they must not seed the fresh window). 
But if + // consecutive_failures_ is NOT also reset, those dropped reports + // can no longer clear or advance the counter either, so the + // leftover count becomes an orphaned value that mis-fires future + // trip evaluations (spurious trip: pre-reload success was going + // to clear the counter but got dropped, so the next real failure + // crosses the threshold using a stale count). + consecutive_failures_ = 0; + ++closed_gen_; + } + } + + if (enabled_changed) { + // Toggling `enabled` is an operator intent to start fresh, not a + // runtime state transition. Without this reset: + // - Disabling while OPEN and re-enabling later would resume the + // OPEN state and reject requests even though the operator + // explicitly turned the breaker off and back on. + // - Disabling while HALF_OPEN with in-flight probes would leave + // inconsistent bookkeeping (inflight > 0, state=HALF_OPEN) that + // a subsequent enable would interpret as live probes. + // - Disabling mid-CLOSED-cycle and re-enabling would trip on the + // very next failure because consecutive_failures_ persisted. + // Matches design doc §10.1 (enabled→disabled / disabled→enabled + // transitions both get a clean CLOSED start). + // + // Silent reset — no transition callback. The change is operator- + // initiated configuration, not a runtime state signal; firing the + // callback would cause PoolPartition::DrainWaitQueueOnTrip-style + // consumers (the wait-queue drain transition callback) to spuriously + // drain waiters on a config edit. + state_.store(State::CLOSED, std::memory_order_release); + open_until_steady_ns_.store(0, std::memory_order_release); + consecutive_trips_.store(0, std::memory_order_relaxed); + consecutive_failures_ = 0; + window_.Reset(); + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; + // Fresh generations for BOTH domains: this is a full reset. 
+ // Both pre-toggle non-probe admissions (closed_gen) and in-flight + // probes (halfopen_gen) are invalidated — their late reports + // silently drop, preserving clean-restart semantics. + ++closed_gen_; + ++halfopen_gen_; + } + // When `enabled` is unchanged: live state preserved — operator expects + // new thresholds to apply to the next evaluation, not to reset an + // in-progress trip. + + logging::Get()->info( + "circuit breaker config applied {} enabled={} window_s={} " + "fail_rate={} consec_threshold={}{}", + host_label_, new_config.enabled, new_config.window_seconds, + new_config.failure_rate_threshold, + new_config.consecutive_failure_threshold, + enabled_changed ? " (enabled toggled — state reset to CLOSED)" : ""); + + // dry_run true→false on a slice that's STILL OPEN: enforcement is + // back on, but the OPEN→OPEN intra-state config edit doesn't fire + // any natural transition callback. The pool partition may have + // queued waiters from the shadow-mode period (the original + // CLOSED→OPEN drain was skipped because dry_run was true at the + // time). Without flushing them now, those queued requests will + // eventually dispatch to the unhealthy upstream once a pool slot + // frees, defeating the just-re-enabled enforcement. + // + // Signal the host via a synthetic OPEN→OPEN transition callback + // with trigger="dry_run_disabled". The HttpServer-installed + // callback recognizes this special trigger and drains the + // partition queue. Real state transitions never reuse the same + // old/new state with this trigger string, so there's no overlap. + // + // IMPORTANT — why this does NOT fire in HALF_OPEN: HALF_OPEN + // queues can mix two admission kinds that share a partition wait + // slot but differ on slice bookkeeping: + // (a) Valid probes admitted within permitted_half_open_calls — + // admission_generation_ = current halfopen_gen_, holding a + // real half_open_inflight_/admitted_ slot. 
These drive + // recovery on a healthy upstream and must NOT be disrupted + // by an operator config flip. + // (b) Dry-run-admitted shadow requests (half_open_full / + // half_open_recovery_failing paths) — admission_generation_ + // = 0 (RejectWithLog sentinel). Their outcomes drop as + // stale-gen on report, so they never influence the slice's + // state machine and are bounded by pool queue size. + // DrainWaitQueueOnTrip is partition-wide and can't tell (a) from + // (b); draining would 503 valid probes (delaying/preventing + // recovery) to also drop the harmless (b). We accept the small + // bounded leak of (b) as the lesser evil. + // + // State is dispatcher-thread-only here; a plain load is sufficient. + if (old_dry_run && !new_config.dry_run && + state_.load(std::memory_order_acquire) == State::OPEN && + transition_cb_) { + logging::Get()->info( + "circuit breaker dry_run disabled while OPEN {} — " + "flushing wait queue", host_label_); + transition_cb_(State::OPEN, State::OPEN, "dry_run_disabled"); + } +} + +void CircuitBreakerSlice::SetTransitionCallback(StateTransitionCallback cb) { + transition_cb_ = std::move(cb); +} + +} // namespace circuit_breaker diff --git a/server/circuit_breaker_window.cc b/server/circuit_breaker_window.cc new file mode 100644 index 00000000..776c00ec --- /dev/null +++ b/server/circuit_breaker_window.cc @@ -0,0 +1,99 @@ +#include "circuit_breaker/circuit_breaker_window.h" + +namespace circuit_breaker { + +// Map an epoch-second value into a non-negative bucket index. C++ built-in `%` +// can return a negative result when the dividend is negative — and while +// `steady_clock::time_since_epoch()` is zero-based on all mainstream +// libstdc++/libc++ implementations, the standard does not strictly guarantee a +// non-negative epoch across every implementation. 
The extra `+ w` and second +// `% w` costs a single add + mod on the slow (negative) branch, zero observable +// overhead on the common positive branch after the compiler eliminates the +// redundant math. +static inline size_t BucketIndex(int64_t epoch_sec, int window_seconds) { + const int64_t w = window_seconds; + return static_cast(((epoch_sec % w) + w) % w); +} + +CircuitBreakerWindow::CircuitBreakerWindow(int window_seconds) + // Clamp to a minimum of 1 bucket. ConfigLoader::Validate() rejects + // window_seconds <= 0 on the production path, but the constructor is a + // public API and programmatic callers (tests, future direct users) may + // bypass that validation. Without the clamp, BucketIndex() does `% 0` on + // the first Add/TotalCount and crashes; negative values violate the ring + // math. Matches Resize()'s clamp so the two entry points are symmetric. + : window_seconds_(window_seconds > 0 ? window_seconds : 1), + buckets_(static_cast(window_seconds_)) { +} + +int64_t CircuitBreakerWindow::ToEpochSec( + std::chrono::steady_clock::time_point now) { + return std::chrono::duration_cast( + now.time_since_epoch()).count(); +} + +void CircuitBreakerWindow::Advance(int64_t now_sec) { + if (head_epoch_sec_ < 0) { + head_epoch_sec_ = now_sec; + return; + } + if (now_sec <= head_epoch_sec_) return; + int64_t delta = now_sec - head_epoch_sec_; + // If delta exceeds window size, everything is stale — full reset. + if (delta >= window_seconds_) { + for (auto& b : buckets_) { b.total = 0; b.failures = 0; } + } else { + // Zero buckets from head+1..now_sec inclusive. 
+ for (int64_t s = head_epoch_sec_ + 1; s <= now_sec; ++s) { + size_t idx = BucketIndex(s, window_seconds_); + buckets_[idx].total = 0; + buckets_[idx].failures = 0; + } + } + head_epoch_sec_ = now_sec; +} + +void CircuitBreakerWindow::AddSuccess( + std::chrono::steady_clock::time_point now) { + int64_t now_sec = ToEpochSec(now); + Advance(now_sec); + buckets_[BucketIndex(now_sec, window_seconds_)].total++; +} + +void CircuitBreakerWindow::AddFailure( + std::chrono::steady_clock::time_point now) { + int64_t now_sec = ToEpochSec(now); + Advance(now_sec); + size_t idx = BucketIndex(now_sec, window_seconds_); + buckets_[idx].total++; + buckets_[idx].failures++; +} + +int64_t CircuitBreakerWindow::TotalCount( + std::chrono::steady_clock::time_point now) { + Advance(ToEpochSec(now)); + int64_t sum = 0; + for (const auto& b : buckets_) sum += b.total; + return sum; +} + +int64_t CircuitBreakerWindow::FailureCount( + std::chrono::steady_clock::time_point now) { + Advance(ToEpochSec(now)); + int64_t sum = 0; + for (const auto& b : buckets_) sum += b.failures; + return sum; +} + +void CircuitBreakerWindow::Reset() { + for (auto& b : buckets_) { b.total = 0; b.failures = 0; } + head_epoch_sec_ = -1; +} + +void CircuitBreakerWindow::Resize(int new_window_seconds) { + window_seconds_ = new_window_seconds > 0 ? 
new_window_seconds : 1; + buckets_.assign(static_cast(window_seconds_), Bucket{}); + head_epoch_sec_ = -1; +} + +} // namespace circuit_breaker diff --git a/server/config_loader.cc b/server/config_loader.cc index 80e9312f..38fb2fb4 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -267,6 +267,60 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { } } + if (item.contains("circuit_breaker")) { + if (!item["circuit_breaker"].is_object()) + throw std::runtime_error("upstream circuit_breaker must be an object"); + auto& cb = item["circuit_breaker"]; + // Strict integer accessor: rejects float/bool/string inputs + // that nlohmann's default value() would silently coerce + // (e.g., 1.9 → 1, true → 1). Without this, malformed configs + // pass Validate() and change breaker behavior in production. + auto cb_int = [&cb](const char* name, int default_val) -> int { + if (!cb.contains(name)) return default_val; + const auto& v = cb[name]; + if (!v.is_number_integer()) { + throw std::invalid_argument( + std::string("circuit_breaker.") + name + + " must be an integer"); + } + return v.get(); + }; + auto cb_bool = [&cb](const char* name, bool default_val) -> bool { + if (!cb.contains(name)) return default_val; + const auto& v = cb[name]; + if (!v.is_boolean()) { + throw std::invalid_argument( + std::string("circuit_breaker.") + name + + " must be a boolean"); + } + return v.get(); + }; + upstream.circuit_breaker.enabled = + cb_bool("enabled", false); + upstream.circuit_breaker.dry_run = + cb_bool("dry_run", false); + upstream.circuit_breaker.consecutive_failure_threshold = + cb_int("consecutive_failure_threshold", 5); + upstream.circuit_breaker.failure_rate_threshold = + cb_int("failure_rate_threshold", 50); + upstream.circuit_breaker.minimum_volume = + cb_int("minimum_volume", 20); + upstream.circuit_breaker.window_seconds = + cb_int("window_seconds", 10); + upstream.circuit_breaker.permitted_half_open_calls = + 
cb_int("permitted_half_open_calls", 5); + upstream.circuit_breaker.base_open_duration_ms = + cb_int("base_open_duration_ms", 5000); + upstream.circuit_breaker.max_open_duration_ms = + cb_int("max_open_duration_ms", 60000); + upstream.circuit_breaker.max_ejection_percent_per_host_set = + cb_int("max_ejection_percent_per_host_set", 50); + upstream.circuit_breaker.retry_budget_percent = + cb_int("retry_budget_percent", 20); + upstream.circuit_breaker.retry_budget_min_concurrency = + cb_int("retry_budget_min_concurrency", 3); + } + config.upstreams.push_back(std::move(upstream)); } } @@ -507,6 +561,115 @@ void ConfigLoader::ApplyEnvOverrides(ServerConfig& config) { if (val) config.rate_limit.status_code = EnvToInt(val, "REACTOR_RATE_LIMIT_STATUS_CODE"); } +void ConfigLoader::ValidateHotReloadable( + const ServerConfig& config, + const std::unordered_set& live_upstream_names) { + // Mirrors the circuit_breaker validation block in Validate(). + // Kept in lock-step with that block — any rule added there for a + // hot-reloadable field must be added here too, or the SIGHUP + // reload path would silently accept values the startup path + // rejects (which is exactly the regression this helper exists + // to prevent). + + // Reject duplicate upstream service names BEFORE the per-upstream + // CB validation. Even for new/renamed entries, the file is + // malformed if names collide: `CircuitBreakerManager::Reload` + // iterates the new upstream list and applies each entry's + // `circuit_breaker` block to GetHost(name); duplicates would + // silently overwrite (last-write wins). Startup's full Validate() + // rejects the file outright; the hot-reload path must match. + // This rule runs UNCONDITIONALLY on the new config — it doesn't + // depend on `live_upstream_names`. 
+ { + std::unordered_set seen; + seen.reserve(config.upstreams.size()); + for (size_t i = 0; i < config.upstreams.size(); ++i) { + const auto& name = config.upstreams[i].name; + if (!seen.insert(name).second) { + throw std::invalid_argument( + "upstreams[" + std::to_string(i) + + "] duplicate service name '" + name + + "' (upstream service names must be unique)"); + } + } + } + + for (size_t i = 0; i < config.upstreams.size(); ++i) { + const auto& u = config.upstreams[i]; + const std::string idx = "upstreams[" + std::to_string(i) + "]"; + + // CB-field validation is scoped to upstreams that are LIVE in + // the running server. CircuitBreakerManager::Reload only + // applies CB changes to pre-existing hosts — new/renamed + // entries are restart-only and skipped with a warn — so + // validating their CB blocks here would block otherwise-safe + // reloads (e.g. a reload that stages a new upstream alongside + // a log-level edit would abort even though the live server + // would never apply the new upstream's CB block). + // + // The empty-set case (no live upstreams yet) is handled by + // the same check: every entry is "new", so every entry is + // skipped — only the duplicate-name check runs. 
+ if (live_upstream_names.find(u.name) == live_upstream_names.end()) { + continue; + } + const auto& cb = u.circuit_breaker; + if (cb.consecutive_failure_threshold < 1 || + cb.consecutive_failure_threshold > 10000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.consecutive_failure_threshold must be in [1, 10000]"); + } + if (cb.failure_rate_threshold < 0 || cb.failure_rate_threshold > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.failure_rate_threshold must be in [0, 100]"); + } + if (cb.minimum_volume < 1 || cb.minimum_volume > 10000000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.minimum_volume must be in [1, 10000000]"); + } + if (cb.window_seconds < 1 || cb.window_seconds > 3600) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.window_seconds must be in [1, 3600]"); + } + if (cb.permitted_half_open_calls < 1 || + cb.permitted_half_open_calls > 1000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.permitted_half_open_calls must be in [1, 1000]"); + } + if (cb.base_open_duration_ms < 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.base_open_duration_ms must be >= 100"); + } + if (cb.max_open_duration_ms < cb.base_open_duration_ms) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_open_duration_ms must be >= base_open_duration_ms"); + } + if (cb.max_ejection_percent_per_host_set < 0 || + cb.max_ejection_percent_per_host_set > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); + } + if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); + } + if (cb.retry_budget_min_concurrency < 0) { + throw 
std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); + } + } +} + void ConfigLoader::Validate(const ServerConfig& config) { // Validate bind_host is a strict dotted-quad IPv4 address. // Use inet_pton (not inet_addr) to reject legacy shorthand forms @@ -811,6 +974,69 @@ void ConfigLoader::Validate(const ServerConfig& config) { idx + " ('" + u.name + "'): proxy.retry.max_retries must be >= 0 and <= 10"); } + + // Circuit breaker validation. + // + // Upper bounds on counting fields are generous — they exist to + // catch pathological configs (typo like "10_000_000_000" or a + // missing unit conversion), not to constrain legitimate tuning. + // Defaults are 5 / 20 / 5; limits are 1000× to 50000× the defaults. + { + const auto& cb = u.circuit_breaker; + if (cb.consecutive_failure_threshold < 1 || + cb.consecutive_failure_threshold > 10000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.consecutive_failure_threshold must be in [1, 10000]"); + } + if (cb.failure_rate_threshold < 0 || cb.failure_rate_threshold > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.failure_rate_threshold must be in [0, 100]"); + } + if (cb.minimum_volume < 1 || cb.minimum_volume > 10000000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.minimum_volume must be in [1, 10000000]"); + } + if (cb.window_seconds < 1 || cb.window_seconds > 3600) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.window_seconds must be in [1, 3600]"); + } + if (cb.permitted_half_open_calls < 1 || + cb.permitted_half_open_calls > 1000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.permitted_half_open_calls must be in [1, 1000]"); + } + if (cb.base_open_duration_ms < 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.base_open_duration_ms must be >= 
100"); + } + if (cb.max_open_duration_ms < cb.base_open_duration_ms) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_open_duration_ms must be >= base_open_duration_ms"); + } + if (cb.max_ejection_percent_per_host_set < 0 || + cb.max_ejection_percent_per_host_set > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); + } + if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); + } + if (cb.retry_budget_min_concurrency < 0) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); + } + } // Validate method names — reject unknowns and duplicates. // Duplicates would cause RouteAsync to throw at startup. { @@ -1073,6 +1299,31 @@ std::string ConfigLoader::ToJson(const ServerConfig& config) { uj["proxy"] = pj; } + // Always serialize circuit_breaker — same rationale as proxy block. 
+ if (u.circuit_breaker != CircuitBreakerConfig{}) { + nlohmann::json cbj; + cbj["enabled"] = u.circuit_breaker.enabled; + cbj["dry_run"] = u.circuit_breaker.dry_run; + cbj["consecutive_failure_threshold"] = + u.circuit_breaker.consecutive_failure_threshold; + cbj["failure_rate_threshold"] = + u.circuit_breaker.failure_rate_threshold; + cbj["minimum_volume"] = u.circuit_breaker.minimum_volume; + cbj["window_seconds"] = u.circuit_breaker.window_seconds; + cbj["permitted_half_open_calls"] = + u.circuit_breaker.permitted_half_open_calls; + cbj["base_open_duration_ms"] = + u.circuit_breaker.base_open_duration_ms; + cbj["max_open_duration_ms"] = + u.circuit_breaker.max_open_duration_ms; + cbj["max_ejection_percent_per_host_set"] = + u.circuit_breaker.max_ejection_percent_per_host_set; + cbj["retry_budget_percent"] = + u.circuit_breaker.retry_budget_percent; + cbj["retry_budget_min_concurrency"] = + u.circuit_breaker.retry_budget_min_concurrency; + uj["circuit_breaker"] = cbj; + } j["upstreams"].push_back(uj); } diff --git a/server/http_server.cc b/server/http_server.cc index b9edda92..67575de7 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -6,6 +6,10 @@ #include "http2/http2_constants.h" #include "upstream/upstream_manager.h" #include "upstream/proxy_handler.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "upstream/pool_partition.h" #include "log/logger.h" #include "log/log_utils.h" #include @@ -410,6 +414,140 @@ void HttpServer::MarkServerReady() { throw; } + // Circuit breaker — built alongside the pool. One host per + // configured upstream (regardless of enabled), with one slice + // per dispatcher so hot-path TryAcquire is lock-free. Attached + // to UpstreamManager via a non-owning pointer so ProxyTransaction + // can reach it on the hot path via upstream_manager_-> + // GetCircuitBreakerManager(). 
The manager is declared AFTER + // upstream_manager_ on HttpServer (see header) so teardown runs + // breaker-first, which matches the dangling-pointer safety rule + // in UpstreamManager::breaker_manager_. + try { + circuit_breaker_manager_ = + std::make_unique( + upstream_configs_, dispatchers.size(), dispatchers); + upstream_manager_->AttachCircuitBreakerManager( + circuit_breaker_manager_.get()); + + // Wire CLOSED→OPEN transition callbacks for every slice of every + // host — regardless of `enabled=false`, per design §3.1 R3-1. A + // disabled slice never fires transitions (TryAcquire short- + // circuits to ADMITTED); wiring the callback costs nothing but + // lets a live reload flip enable=false→true without re-wiring. + // + // The callback routes trip events to the corresponding + // PoolPartition's DrainWaitQueueOnTrip so queued waiters fail + // fast with CHECKOUT_CIRCUIT_OPEN instead of waiting out the + // open window. Each slice gets a distinct callback that + // captures its (service, dispatcher_index) pair — we can't use + // SetTransitionCallbackOnAllSlices because that would install a + // single callback across slices that need different partition + // lookups. + // + // Safe to capture raw `UpstreamManager*`: CircuitBreakerManager + // destructs BEFORE UpstreamManager (§3.1 ownership), and slice + // callbacks only fire on dispatcher threads which are stopped + // before either manager is destroyed. So any live callback + // invocation sees a valid UpstreamManager. 
+ UpstreamManager* um = upstream_manager_.get(); + for (const auto& u : upstream_configs_) { + auto* host = circuit_breaker_manager_->GetHost(u.name); + if (!host) continue; + std::string service = u.name; + for (size_t i = 0; i < host->partition_count(); ++i) { + auto* slice = host->GetSlice(i); + if (!slice) continue; + // Capture the slice pointer so the callback can read + // the LIVE `dry_run` flag on every fire — operators + // can toggle dry_run via SIGHUP, and the drain + // decision must reflect the current setting, not a + // snapshot from server startup. Slice lifetime is + // tied to the manager (declared after upstream + // manager → destructs first), so the raw pointer + // outlives every possible callback invocation. + auto* slice_ptr = slice; + slice->SetTransitionCallback( + [um, service, i, slice_ptr]( + circuit_breaker::State old_s, + circuit_breaker::State new_s, + const char* trigger) { + // Three drain triggers, all entering OPEN: + // CLOSED→OPEN : fresh trip; queued non- + // probe waiters need CHECKOUT_CIRCUIT_OPEN + // instead of waiting out the full open + // window. + // HALF_OPEN→OPEN : probe cycle re-tripped; + // probe admissions passed ConsultBreaker + // before CheckoutAsync, so saturated + // pools can leave them queued. Without + // draining they eventually dispatch to a + // known-bad upstream. + // OPEN→OPEN with trigger="dry_run_disabled" + // : synthetic signal from + // CircuitBreakerSlice::Reload when + // dry_run flips true→false on a slice + // that's still OPEN. The earlier trip + // skipped the drain (shadow mode); now + // enforcement is back on, queued + // waiters from that period must be + // flushed before the pool services + // them. Real transitions never use this + // trigger string with old==new==OPEN, + // so there's no overlap with normal + // state-machine signals. 
+ // (The slice intentionally does NOT + // fire this signal in HALF_OPEN — see + // CircuitBreakerSlice::Reload for why + // valid probes must not be flushed.) + const bool normal_trip = + new_s == circuit_breaker::State::OPEN && + (old_s == circuit_breaker::State::CLOSED || + old_s == circuit_breaker::State::HALF_OPEN); + const bool dry_run_disable_drain = + old_s == circuit_breaker::State::OPEN && + new_s == circuit_breaker::State::OPEN && + trigger != nullptr && + std::strcmp(trigger, + "dry_run_disabled") == 0; + if (!normal_trip && !dry_run_disable_drain) { + return; + } + // Dry-run shadow-mode contract: the slice + // log-but-admits would-reject decisions, so + // the wait-queue drain — which would + // deliver hard 503s (CHECKOUT_CIRCUIT_OPEN + // → RESULT_CIRCUIT_OPEN) to queued + // waiters — must also be a no-op while + // dry_run is true. Note: when this fires + // via the dry_run_disabled trigger, the + // slice's config_.dry_run was already + // updated to false in Reload BEFORE the + // synthetic callback, so this guard + // correctly does NOT skip the drain in + // that case. + if (slice_ptr && slice_ptr->config().dry_run) { + logging::Get()->info( + "[dry-run] circuit breaker would drain " + "wait queue on trip — skipping (shadow " + "mode) service={} partition={}", + service, i); + return; + } + if (auto* part = um->GetPoolPartition( + service, i)) { + part->DrainWaitQueueOnTrip(); + } + }); + } + } + } catch (...) { + logging::Get()->error( + "Circuit breaker init failed, stopping server"); + net_server_.Stop(); + throw; + } + // Ensure the timer cadence is fast enough for upstream connect timeouts. // SetDeadline stores a ms-precision deadline, but TimerHandler only fires // at the timer scan interval. If connect_timeout_ms < current interval, @@ -3451,8 +3589,16 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // field changes (timeouts, limits, log level). 
validation_copy.http2.enabled = http2_enabled_ && new_config.http2.enabled; - // Upstream configs are restart-only — clear them so staged edits - // in the config file don't block live-safe field reloads. + // Upstream configs are RESTART-ONLY for topology fields, but the + // per-upstream `circuit_breaker` block is HOT-RELOADABLE — clearing + // upstreams entirely from validation_copy would skip CB-field + // validation here. Instead: clear the topology-restart-only + // path (the full Validate would reject those) and run a separate + // ValidateHotReloadable on the original new_config so live- + // reloadable CB rules (range checks, duplicate names) are + // enforced symmetrically with the SIGHUP path in main.cc. + // Without this, in-process callers using HttpServer::Reload + // directly would bypass the gate that the CLI path enforces. validation_copy.upstreams.clear(); // Rate limit config IS live-reloadable and MUST be validated. // Unlike upstreams (restart-only), rate_limit changes are applied @@ -3465,6 +3611,29 @@ bool HttpServer::Reload(const ServerConfig& new_config) { logging::Get()->error("Reload() rejected invalid config: {}", e.what()); return false; } + // Strict gate for hot-reloadable CB fields + duplicate names. + // Mirrors main.cc::ReloadConfig — both entry points must reject + // invalid CB tuning before it reaches live slices. + // + // CB validation is scoped to existing upstream names: only + // those entries get applied via CircuitBreakerManager::Reload, + // so validating CB blocks for new/renamed entries would + // block otherwise-safe reloads. `upstream_configs_` is the + // post-Start snapshot of running upstreams. 
+ { + std::unordered_set<std::string> live_names; + live_names.reserve(upstream_configs_.size()); + for (const auto& u : upstream_configs_) { + live_names.insert(u.name); + } + try { + ConfigLoader::ValidateHotReloadable(new_config, live_names); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Reload() rejected invalid config: {}", + e.what()); + return false; + } + } } // Three-phase update to prevent mid-reload connections from seeing @@ -3645,11 +3814,51 @@ bool HttpServer::Reload(const ServerConfig& new_config) { rate_limit_manager_->Reload(new_config.rate_limit); } - // Upstream pool changes require a restart — pools are built once in Start() - // and cannot be rebuilt at runtime without a full drain cycle. + // Circuit breaker reload — live-propagates breaker-field edits on + // existing upstream services. CircuitBreakerManager::Reload is + // idempotent (atomic stores to unchanged values), so calling it + // unconditionally costs nothing when the operator didn't edit any + // breaker fields. Topology changes (added / removed service names) + // are logged as warn + skipped inside the manager; the outer + // restart-required warning still fires via the upstreams-inequality + // check below. After this call, update the breaker slices on every + // partition via per-dispatcher EnQueue — the manager handles that + // routing internally. The topology check itself now only diffs non- + // breaker fields (UpstreamConfig::operator== excludes circuit_breaker), + // so a CB-only SIGHUP is a clean hot reload with no spurious warn. + if (circuit_breaker_manager_) { + circuit_breaker_manager_->Reload(new_config.upstreams); + } + + // Upstream topology changes (host/port/pool/proxy/tls) require a + // restart — pools are built once in Start() and cannot be rebuilt + // at runtime without a full drain cycle. 
The equality operator on + // UpstreamConfig deliberately excludes `circuit_breaker` so a CB- + // only edit doesn't trigger this warning (the reload above already + // applied the new breaker settings to live slices). + // + // When topology DIFFERS, we deliberately DO NOT copy the staged + // config into `upstream_configs_`: subsequent reloads (including + // the timer-cadence recomputation above) read from this vector to + // match live pool state. Adopting staged-but-inactive topology + // values would silently widen the dispatcher timer past the active + // pool timeouts — e.g. staging `pool.connect_timeout_ms=10000` + // (restart required) then reloading any unrelated field would + // recompute cadence from 10s while the live pool still uses 3s, + // firing connect-timeouts late. The CB-field portion of the edit + // was already applied live via `circuit_breaker_manager_->Reload` + // above, so the live slices carry the new tuning regardless of + // whether `upstream_configs_` shows it. + // + // When topology MATCHES (the common case, including CB-only + // edits), adopt the new snapshot as the fresh baseline so CB- + // field edits persist for later reload diffs. if (new_config.upstreams != upstream_configs_) { - logging::Get()->warn("Reload: upstream configuration changes require a " - "restart to take effect (ignored)"); + logging::Get()->warn("Reload: upstream topology changes require a " + "restart to take effect (circuit-breaker " + "field edits, if any, were applied live)"); + } else { + upstream_configs_ = new_config.upstreams; } return true; diff --git a/server/main.cc b/server/main.cc index 06dd2551..e0fa7790 100644 --- a/server/main.cc +++ b/server/main.cc @@ -328,7 +328,41 @@ static bool ReloadConfig(const std::string& config_path, } } } + // Hot-reloadable fields (today: per-upstream `circuit_breaker.*` + // on existing services + duplicate-name uniqueness across the + // new file) are the only ones that go LIVE on a SIGHUP reload. 
+ // Validate them strictly — a bad value here would be pushed into + // running slices and keep running until an operator-driven + // restart fixes the config file. Hard-reject so operators see + // the error immediately instead of discovering drift the next + // time the startup path rejects the same file. + // + // CB validation is scoped to existing upstream names — + // CircuitBreakerManager::Reload only applies CB changes to those. + // New/renamed upstreams are restart-only; their CB blocks are + // skipped here so an intentional placeholder doesn't block other + // live-safe edits in the same reload (log/rate-limit/breaker + // edits on existing services). + { + std::unordered_set<std::string> live_names; + live_names.reserve(current_config.upstreams.size()); + for (const auto& u : current_config.upstreams) { + live_names.insert(u.name); + } + try { + ConfigLoader::ValidateHotReloadable(new_config, live_names); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Config reload rejected: {}", e.what()); + reopen_existing_logs(); + return false; + } + } + // Warn about restart-required field issues (not applied during reload). + // Full Validate() includes both hot-reloadable rules (already checked + // above) and restart-only rules; by the time we reach this point the + // hot-reloadable subset is known valid, so any exception thrown here + // is from restart-only rules and is legitimately a warn, not an error. try { ConfigLoader::Validate(new_config); } catch (const std::invalid_argument& e) { @@ -427,6 +461,19 @@ static bool ReloadConfig(const std::string& config_path, auto saved_tls = current_config.tls; auto saved_workers = current_config.worker_threads; auto saved_h2_enabled = current_config.http2.enabled; + // Preserve upstreams for the same reason: HttpServer::Reload treats + // the whole upstream block as restart-required (see http_server.cc + // upstream_configs_ comparison), and that internal copy never changes + // post-startup. 
If we overwrote current_config.upstreams here, a + // breaker-only edit would stage into current_config while the live + // server keeps running the startup values — /stats and other + // current_config consumers would report phantom state, and subsequent + // identical reloads could produce inconsistent diagnostics. Pin to + // the running values until CircuitBreakerManager::Reload implements + // live `circuit_breaker` propagation (the only upstream sub-field that + // becomes hot-reloadable); at that point this save becomes a + // partial-field save excluding circuit_breaker. + auto saved_upstreams = current_config.upstreams; current_config = new_config; @@ -435,6 +482,7 @@ static bool ReloadConfig(const std::string& config_path, current_config.tls = saved_tls; current_config.worker_threads = saved_workers; current_config.http2.enabled = saved_h2_enabled; + current_config.upstreams = std::move(saved_upstreams); // Commit file-backed state only after full success — a failed reload // must not flip this flag or future reloads lose the defaults+env fallback. diff --git a/server/pool_partition.cc b/server/pool_partition.cc index 819c941d..a0ba866c 100644 --- a/server/pool_partition.cc +++ b/server/pool_partition.cc @@ -549,6 +549,41 @@ void PoolPartition::InitiateShutdown() { MaybeSignalDrain(); } +void PoolPartition::DrainWaitQueueOnTrip() { + // Hoist alive_ — a waiter's error_callback may synchronously trigger + // a request completion path that tears down the partition (e.g. the + // test harness). Same pattern used by InitiateShutdown. + auto alive = alive_; + + if (shutting_down_) { + // Already draining via InitiateShutdown — that path will send + // CHECKOUT_SHUTTING_DOWN to every waiter. Don't double-fire. 
+ return; + } + + if (wait_queue_.empty()) return; + + logging::Get()->info( + "PoolPartition draining wait queue on breaker trip: {}:{} " + "queue_size={}", + upstream_host_, upstream_port_, wait_queue_.size()); + + while (!wait_queue_.empty()) { + auto entry = std::move(wait_queue_.front()); + wait_queue_.pop_front(); + // Cancelled waiters have no callback to fire — the transaction + // already tore its side down via the framework abort hook. + if (IsEntryCancelled(entry)) { + continue; + } + // CHECKOUT_CIRCUIT_OPEN — ProxyTransaction::OnCheckoutError maps + // to RESULT_CIRCUIT_OPEN and delivers MakeCircuitOpenResponse() + // without touching the breaker (our own reject, don't feed back). + entry.error_callback(CHECKOUT_CIRCUIT_OPEN); + if (!alive->load(std::memory_order_acquire)) return; + } +} + void PoolPartition::ForceCloseActive() { // Collect transports + borrower callbacks, then move to zombie, then // close transports, then notify borrowers. This ordering ensures: diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 18aa6193..d3e8bd82 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -2,6 +2,9 @@ #include "upstream/upstream_manager.h" #include "upstream/upstream_connection.h" #include "upstream/http_request_serializer.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" #include "connection_handler.h" #include "dispatcher.h" // config/server_config.h provided by proxy_transaction.h (ProxyConfig stored by value) @@ -110,12 +113,109 @@ void ProxyTransaction::Start() { upstream_host_, upstream_port_, method_, upstream_path); + // Resolve the circuit-breaker slice once. Null when no breaker is + // attached (server has no upstreams configured), or when the + // service/dispatcher pair is out of + // range. 
In any null case the breaker is simply bypassed — the + // transaction proceeds as if circuit breaking were disabled. + if (upstream_manager_ && dispatcher_index_ >= 0) { + auto* cbm = upstream_manager_->GetCircuitBreakerManager(); + if (cbm) { + auto* host = cbm->GetHost(service_name_); + if (host) { + slice_ = host->GetSlice(static_cast<size_t>(dispatcher_index_)); + // Cache the retry-budget pointer unconditionally when + // the host exists — usage at each attempt is gated by + // the live `slice_->config().enabled` flag so that + // SIGHUP toggles take effect on the next retry within + // a running transaction. Resolution-time gating would + // miss the flip in either direction. + retry_budget_ = host->GetRetryBudget(); + } + } + } + + AttemptCheckout(); } void ProxyTransaction::AttemptCheckout() { state_ = State::CHECKOUT_PENDING; + // Circuit breaker gate — consulted before every attempt (first try and + // retries both). Each attempt gets a fresh admission stamped with the + // slice's current generation. If the slice rejects with REJECTED_OPEN, + // ConsultBreaker delivers the §12.1 response and returns false; the + // retry loop treats RESULT_CIRCUIT_OPEN as terminal (§8) so a rejected + // retry produces a single 503 to the client, not a nested retry. + // Dry-run reject logs inside TryAcquire and returns ADMITTED through + // the decision enum (REJECTED_OPEN_DRYRUN), so ConsultBreaker proceeds. + if (!ConsultBreaker()) { + return; + } + + // Retry-budget gate for retry attempts (attempt_ > 0). Gating here + // rather than in MaybeRetry means a delayed retry holds no token + // during its backoff sleep — the budget's `retries_in_flight` + // reflects only retries that are actually about to reach (or are + // reaching) the upstream, matching the "aggregate upstream load" + // semantics of the %-of-in-flight cap. 
+ // + // Live-check `slice_->config().enabled` at each attempt — the + // cached `retry_budget_` pointer is resolved once in Start(), but + // the `enabled` flag is the documented live master switch. A + // SIGHUP flipping enabled=true→false mid-flight must stop + // enforcing the budget on subsequent retries; enabled=false→true + // mid-flight must start. Gating at the pointer level would miss + // both directions. + // + // The `!retry_token_held_` guard is defensive — Cleanup() between + // retry attempts always releases the prior token. + bool breaker_live_enabled = slice_ && slice_->config().enabled; + if (retry_budget_ && breaker_live_enabled && + attempt_ > 0 && !retry_token_held_) { + bool is_dry_run = slice_->config().dry_run; + if (retry_budget_->TryConsumeRetry()) { + retry_token_held_ = true; + } else if (is_dry_run) { + logging::Get()->info( + "ProxyTransaction retry budget would-reject (dry-run) " + "client_fd={} service={} attempt={}", + client_fd_, service_name_, attempt_); + } else { + logging::Get()->warn( + "retry budget exhausted service={} in_flight={} " + "retries_in_flight={} cap={} client_fd={} attempt={}", + service_name_, + retry_budget_->InFlight(), + retry_budget_->RetriesInFlight(), + retry_budget_->ComputeCap(), + client_fd_, attempt_); + // CRITICAL: release the slice admission before bailing. + // ConsultBreaker() already admitted this attempt — in + // HALF_OPEN that means a probe slot was reserved + // (half_open_inflight_ / half_open_admitted_ both + // incremented). Returning here without releasing would + // strand that slot forever, wedging the slice in + // half_open_full until an operator-driven reload/reset. + // Neutral release decrements both counters for probes; + // no-op for non-probe (CLOSED) admissions, matching the + // general "local cause, no upstream signal" semantic. 
+ ReleaseBreakerAdmissionNeutral(); + state_ = State::FAILED; + DeliverResponse(MakeRetryBudgetResponse()); + return; + } + } + + // Track this attempt against the host-level retry budget's + // in_flight counter. Gated by the live `enabled` flag so disabling + // the breaker mid-flight stops tracking immediately; enabling it + // starts tracking at the next attempt. No-op when retry_budget_ + // is null (no breaker manager / unknown host). + if (retry_budget_ && breaker_live_enabled) { + inflight_guard_ = retry_budget_->TrackInFlight(); + } + auto self = shared_from_this(); // Lazily allocate the shared cancel token so the pool can drop @@ -149,6 +249,11 @@ void ProxyTransaction::OnCheckoutReady(UpstreamLease lease) { // returns to the pool for another request to use, instead of // sitting idle attached to a torn-down transaction. lease.Release(); + // Release the breaker admission neutrally — the upstream was + // never exercised, and stranding the slot would wedge a + // HALF_OPEN probe cycle. Cancel() may already have released; + // the helper is no-op in that case. + ReleaseBreakerAdmissionNeutral(); return; } if (state_ != State::CHECKOUT_PENDING) { @@ -224,21 +329,72 @@ void ProxyTransaction::OnCheckoutError(int error_code) { // Only retry actual network connect failures. Pool saturation // (POOL_EXHAUSTED, QUEUE_TIMEOUT) and shutdown should fail fast — // retrying under backpressure amplifies load on an already-stressed - // pool and stretches client latency with no benefit. + // pool and stretches client latency with no benefit. A breaker-drain + // reject (CHECKOUT_CIRCUIT_OPEN from the wait-queue drain) is also + // terminal: the + // client gets the same circuit-open response a fresh requester + // would, and the retry loop must not retry it. + // + // Breaker reporting: connect failures (both timeout and refused) are + // upstream-health signals → ReportFailure(CONNECT_FAILURE). 
Local + // capacity (POOL_EXHAUSTED, QUEUE_TIMEOUT) and shutdown are NOT + // reported — they don't imply upstream unhealthiness (design §7). + // CHECKOUT_CIRCUIT_OPEN is also not reported to the breaker (would + // be a feedback loop — our own reject counting against the upstream). + // // Import error codes from PoolPartition: - // CHECKOUT_CONNECT_FAILED = -2 → retryable - // CHECKOUT_CONNECT_TIMEOUT = -3 → retryable - // CHECKOUT_POOL_EXHAUSTED = -1 → not retryable - // CHECKOUT_QUEUE_TIMEOUT = -5 → not retryable - // CHECKOUT_SHUTTING_DOWN = -4 → not retryable + // CHECKOUT_CONNECT_FAILED = -2 → retryable, report CONNECT_FAILURE + // CHECKOUT_CONNECT_TIMEOUT = -3 → retryable, report CONNECT_FAILURE + // CHECKOUT_POOL_EXHAUSTED = -1 → not retryable, neutral-release probe + // CHECKOUT_QUEUE_TIMEOUT = -5 → not retryable, neutral-release probe + // CHECKOUT_SHUTTING_DOWN = -4 → not retryable, neutral-release probe + // CHECKOUT_CIRCUIT_OPEN = -6 → not retryable, do NOT report static constexpr int CONNECT_FAILED = -2; static constexpr int CONNECT_TIMEOUT = -3; + static constexpr int CIRCUIT_OPEN = -6; + + if (error_code == CIRCUIT_OPEN) { + // Drain path: breaker tripped while this transaction was queued. + // Do NOT Report success/failure to the slice — our own reject + // must not feed back into the failure math. Emit the §12.1 + // circuit-open response directly. + logging::Get()->info( + "ProxyTransaction checkout drained by circuit breaker " + "client_fd={} service={}", + client_fd_, service_name_); + // Neutral-release the slice admission instead of just clearing + // admission_generation_. Three drain paths reach here: + // CLOSED→OPEN : closed_gen_ was bumped by the trip; our + // generation is now stale → ReportNeutral + // drops as stale-gen. No state mutation. Safe. 
+ // HALF_OPEN→OPEN : halfopen_gen_ was bumped by the trip AND + // half_open_inflight_/admitted_ reset to 0 by + // TransitionOpenToHalfOpen's sibling path → + // ReportNeutral drops as stale-gen. Safe. + // (Any future same-cycle drain without a generation bump): + // admission_generation_ is still current → + // ReportNeutral correctly returns the slot, + // preventing half_open_inflight_/admitted_ + // from leaking and wedging the slice in + // half_open_full until the next reset. + // ReleaseBreakerAdmissionNeutral clears admission_generation_ + // internally, so Cleanup/destructor won't double-report. + ReleaseBreakerAdmissionNeutral(); + DeliverResponse(MakeCircuitOpenResponse()); + return; + } if (error_code == CONNECT_FAILED || error_code == CONNECT_TIMEOUT) { + // Report connect failure to the breaker BEFORE retrying — + // otherwise the retry's ConsultBreaker might admit against a + // stale success count, delaying trip detection. + ReportBreakerOutcome(RESULT_CHECKOUT_FAILED); MaybeRetry(RetryPolicy::RetryCondition::CONNECT_FAILURE); } else { // Pool exhaustion, queue timeout, or shutdown — local capacity issue. // Use RESULT_POOL_EXHAUSTED → 503 (not 502 which implies upstream failure). + // Release the breaker slot neutrally — admission never reached upstream. + ReportBreakerOutcome(RESULT_POOL_EXHAUSTED); OnError(RESULT_POOL_EXHAUSTED, "Pool checkout failed (local capacity, error=" + std::to_string(error_code) + ")"); @@ -263,6 +419,13 @@ void ProxyTransaction::SendUpstreamRequest() { logging::Get()->warn("ProxyTransaction stale connection before send " "client_fd={} service={} attempt={}", client_fd_, service_name_, attempt_); + // Report to the breaker BEFORE retrying — MaybeRetry's + // AttemptCheckout will overwrite admission_generation_ on the + // next ConsultBreaker. 
Without this call, a probe in HALF_OPEN + // would leak its slot and the slice could stall in + // half_open_full; in CLOSED, the failure would be under-counted + // until the last retry ran through OnError. + ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); MaybeRetry(RetryPolicy::RetryCondition::UPSTREAM_DISCONNECT); return; } @@ -340,6 +503,8 @@ void ProxyTransaction::OnUpstreamData( "state={} attempt={}", client_fd_, service_name_, upstream_fd, static_cast(state_), attempt_); + // Report BEFORE retry — see stale-connection path above for why. + ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); MaybeRetry(RetryPolicy::RetryCondition::UPSTREAM_DISCONNECT); return; } @@ -517,10 +682,20 @@ void ProxyTransaction::OnResponseComplete() { "service={} status={} attempt={}", client_fd_, service_name_, response.status_code, attempt_); + // Report failure BEFORE MaybeRetry — the retry's fresh + // ConsultBreaker must see the just-added failure in the window + // (and potentially reject if this was the trip-causing call). + // Pass a synthetic RESULT_CHECKOUT_FAILED-like signal; the + // classifier maps 5xx → FailureKind::RESPONSE_5XX. + ReportBreakerOutcome(/* sentinel */ -1000); MaybeRetry(RetryPolicy::RetryCondition::RESPONSE_5XX); return; } + // 2xx / 3xx / 4xx: upstream is healthy (from the breaker's + // perspective — 4xx is a client-side problem). Report success. + ReportBreakerOutcome(RESULT_SUCCESS); + state_ = State::COMPLETE; auto duration = std::chrono::duration_cast( @@ -550,8 +725,19 @@ void ProxyTransaction::OnError(int result_code, client_fd_, service_name_, result_code, attempt_, duration.count(), log_message); + // Report the outcome if an admission is still held. 
Most error paths + // call ReportBreakerOutcome themselves BEFORE reaching OnError (so a + // retry's ConsultBreaker sees the fresh signal) — this is a safety + // net for error paths that skipped reporting, e.g., RESULT_SEND_FAILED + // and RESULT_RESPONSE_TIMEOUT from the on-upstream-data paths. + // ReportBreakerOutcome is idempotent: it clears admission_generation_ + // on the first call so a double-call drops harmlessly. + ReportBreakerOutcome(result_code); + state_ = State::FAILED; - HttpResponse error_response = MakeErrorResponse(result_code); + HttpResponse error_response = (result_code == RESULT_CIRCUIT_OPEN) + ? MakeCircuitOpenResponse() + : MakeErrorResponse(result_code); DeliverResponse(std::move(error_response)); } @@ -569,7 +755,14 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { client_fd_, service_name_, attempt_, static_cast(condition)); - // Release old lease, clear callbacks, poison if tainted + // Release old lease, clear callbacks, poison if tainted. + // Cleanup also releases any retry token held by the previous + // retry attempt so the next TryConsumeRetry in AttemptCheckout + // sees a fresh counter. The retry-budget gate itself now lives + // at the top of AttemptCheckout — that way a delayed retry + // doesn't hold a token during its backoff sleep, which would + // otherwise pollute the budget's retries_in_flight with + // queued-but-sleeping work that hasn't reached the upstream. Cleanup(); codec_.Reset(); // Re-apply request method after reset — llhttp_init() zeroes @@ -734,6 +927,25 @@ void ProxyTransaction::Cancel() { if (state_ != State::INIT && state_ != State::CHECKOUT_PENDING) { poison_connection_ = true; } + // Release any held breaker admission neutrally. Cancel() is always + // a LOCAL termination — client disconnect, framework-level abort, + // H2 stream reset, etc. 
Even when we poisoned a pooled connection + // mid-request, counting that as an upstream-health failure would + // trip the breaker against a backend that may be perfectly healthy + // (browser cancels, user-initiated timeouts, etc. are all common + // causes). The reviewer guidance is explicit: client-initiated + // aborts must be neutral from the breaker's perspective. + // + // Trade-off: in HALF_OPEN, ReportNeutral on a probe decrements + // both inflight and admitted, so a cancelled probe makes the slot + // eligible for a replacement admission in the same cycle. That is + // the documented design contract of ReportNeutral ("the upstream + // wasn't actually exercised by this admission" from the breaker's + // decision-math point of view — we didn't observe a success or + // failure), and it is acceptable: probes that genuinely succeed + // or fail still close / re-trip the cycle normally, and a broken + // upstream under cancel-spam will still fail those real probes. + ReleaseBreakerAdmissionNeutral(); // Release the upstream lease back to the pool (or destroy it if // poisoned) and clear transport callbacks so any in-flight upstream // bytes land harmlessly. @@ -741,6 +953,22 @@ void ProxyTransaction::Cancel() { } void ProxyTransaction::Cleanup() { + // Release any retry-budget token held by the attempt that just + // ended. Must happen BEFORE the next TryConsumeRetry in MaybeRetry + // so the new attempt sees accurate retries_in_flight. Idempotent + // via the retry_token_held_ flag. + ReleaseRetryToken(); + + // Release the in-flight guard from the just-ended attempt. If + // MaybeRetry schedules a delayed backoff, the gap between Cleanup + // and the eventual AttemptCheckout (which would move-assign a + // fresh guard) holds the old slot in `retry_budget_->in_flight_` + // for the entire backoff sleep. That inflates the effective + // denominator of the percent-cap formula, weakening the budget + // exactly during retry storms. 
Move-assign from a default + // (empty) guard decrements the old counter immediately. + inflight_guard_ = circuit_breaker::RetryBudget::InFlightGuard{}; + if (lease_) { auto* conn = lease_.Get(); if (conn) { @@ -851,6 +1079,13 @@ void ProxyTransaction::ArmResponseTimeout(int explicit_budget_ms) { if (self->state_ == State::SENDING_REQUEST || self->state_ == State::AWAITING_RESPONSE || self->state_ == State::RECEIVING_BODY) { + // Report BEFORE retry — MaybeRetry's AttemptCheckout will + // overwrite admission_generation_ on the next + // ConsultBreaker, stranding the current attempt's + // admission (probe slot leaks in HALF_OPEN; CLOSED + // under-counts the failure until the last retry hits + // OnError). + self->ReportBreakerOutcome(RESULT_RESPONSE_TIMEOUT); self->MaybeRetry(RetryPolicy::RetryCondition::RESPONSE_TIMEOUT); } else { self->OnError(RESULT_RESPONSE_TIMEOUT, "Response timeout"); @@ -886,6 +1121,32 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { if (result_code == RESULT_POOL_EXHAUSTED) { return HttpResponse::ServiceUnavailable(); } + if (result_code == RESULT_RETRY_BUDGET_EXHAUSTED) { + return MakeRetryBudgetResponse(); + } + if (result_code == RESULT_CIRCUIT_OPEN) { + // The static factory has no `this`, so it cannot build the + // fully §12.1-compliant response (Retry-After derived from + // slice state, X-Upstream-Host). All in-class paths for + // CIRCUIT_OPEN use the non-static MakeCircuitOpenResponse() + // — reaching this branch means a future caller forgot that + // rule. Log loudly so the mistake shows up in logs instead + // of producing a stealth regression against the contract. + // + // Still emit `X-Circuit-Breaker: open` + `Connection: close` + // so the response remains self-identifying as a circuit-open + // reject. Clients inspecting that header will correctly back + // off via their own client-side logic rather than treating + // this as an anonymous 503. 
+ logging::Get()->error( + "ProxyTransaction::MakeErrorResponse(RESULT_CIRCUIT_OPEN) " + "invoked from static context — use MakeCircuitOpenResponse() " + "to emit §12.1-compliant headers"); + HttpResponse resp = HttpResponse::ServiceUnavailable(); + resp.Header("X-Circuit-Breaker", "open"); + resp.Header("Connection", "close"); + return resp; + } if (result_code == RESULT_CHECKOUT_FAILED || result_code == RESULT_SEND_FAILED || result_code == RESULT_PARSE_ERROR || @@ -894,3 +1155,225 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { } return HttpResponse::InternalError(); } + +HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { + // TryAcquire() returns REJECTED_OPEN for three distinct situations: + // * True OPEN: slice is in OPEN state, IsOpenDeadlineSet() is true, + // Retry-After reflects remaining backoff from OpenUntil(). + // * HALF_OPEN reject (half_open_full or half_open_recovery_failing): + // slice transitioned HALF_OPEN via TransitionOpenToHalfOpen, which + // clears open_until. IsOpenDeadlineSet() is false. These rejects + // wait on the in-flight probe cycle completing (success → CLOSED, + // failure → re-trip with fresh backoff). Retry-After = 1 in this + // branch would under-report the likely wait on a re-trip; ceil to + // base_open_duration_ms as a conservative hint (the worst case is + // re-trip + fresh backoff window). + // Emit a distinct X-Circuit-Breaker label for observability so + // operators can separate "true OPEN" from "HALF_OPEN recovery back- + // pressure" on dashboards. + int retry_after_secs = 1; + const char* breaker_label = "open"; + // Absolute sanity ceiling — independent of config. Protects against + // ridiculous programmatic values that might slip past validation. + static constexpr int RETRY_AFTER_ABS_MAX_SECS = 3600; // 1 hour + if (slice_) { + if (slice_->IsOpenDeadlineSet()) { + // True OPEN — Retry-After from the actual stored deadline. 
+ // The deadline is authoritative: it's what the slice will + // actually honor, regardless of any subsequent config + // reload that might lower max_open_duration_ms. Clamping + // below the stored deadline would tell well-behaved clients + // to retry early and bounce on more 503s until the original + // deadline elapses. + auto open_until = slice_->OpenUntil(); + auto now = std::chrono::steady_clock::now(); + auto ms_remaining = std::chrono::duration_cast<std::chrono::milliseconds>( + open_until - now).count(); + // Ceiling-round to seconds so we never advertise a window + // shorter than the actual remaining backoff. + int64_t diff = (ms_remaining + 999) / 1000; + if (diff < 1) diff = 1; + if (diff > RETRY_AFTER_ABS_MAX_SECS) diff = RETRY_AFTER_ABS_MAX_SECS; + retry_after_secs = static_cast<int>(diff); + breaker_label = "open"; + } else if (slice_->CurrentState() == + circuit_breaker::State::HALF_OPEN) { + // HALF_OPEN reject — no deadline to read. Hint with the + // NEXT expected open duration (base << consecutive_trips_, + // clamped by max_open_duration_ms) rather than base alone: + // after multiple trips, exponential backoff has already + // grown the OPEN window, and advertising bare base would + // tell clients to retry far earlier than the breaker will + // admit even in the worst case (probe cycle fails, slice + // re-trips into the larger backoff). + int64_t next_ms = slice_->NextOpenDurationMs(); + int hint = static_cast<int>( + std::max<int64_t>(1, (next_ms + 999) / 1000)); + retry_after_secs = std::min(hint, RETRY_AFTER_ABS_MAX_SECS); + breaker_label = "half_open"; + } + // Any other state (CLOSED): shouldn't reach here — ConsultBreaker + // only calls this on REJECTED_OPEN. Fall through with the + // conservative defaults (Retry-After=1, label="open") so a + // regression can't silently emit Retry-After=0. 
+ } + + HttpResponse resp; + resp.Status(HttpStatus::SERVICE_UNAVAILABLE); + resp.Text("Upstream circuit breaker is open; please retry later.\n"); + resp.Header("Retry-After", std::to_string(retry_after_secs)); + resp.Header("X-Circuit-Breaker", breaker_label); + // Hint operators (not clients) at which upstream tripped. Useful + // when a gateway fronts multiple backends; without this header, a + // 503 is opaque. + resp.Header("X-Upstream-Host", + upstream_host_ + ":" + std::to_string(upstream_port_)); + resp.Header("Connection", "close"); + return resp; +} + +HttpResponse ProxyTransaction::MakeRetryBudgetResponse() { + HttpResponse resp; + resp.Status(HttpStatus::SERVICE_UNAVAILABLE); + resp.Text("Upstream retry budget exhausted.\n"); + resp.Header("X-Retry-Budget-Exhausted", "1"); + resp.Header("Connection", "close"); + return resp; +} + +bool ProxyTransaction::ConsultBreaker() { + if (!slice_) { + // No breaker attached for this service. Proceed as if the + // breaker layer didn't exist. admission_generation_ stays 0 so + // any accidental ReportBreakerOutcome call is a no-op. + is_probe_ = false; + admission_generation_ = 0; + return true; + } + auto admission = slice_->TryAcquire(); + + // Stash the admission metadata for the paired Report*() call. Note + // we record this EVEN for REJECTED_OPEN (where generation_==0 is a + // sentinel) — it's harmless and keeps the branches simpler. + admission_generation_ = admission.generation; + is_probe_ = (admission.decision == + circuit_breaker::Decision::ADMITTED_PROBE); + + if (admission.decision == circuit_breaker::Decision::REJECTED_OPEN) { + // Hard reject — slice counted it, logged it, and we must not + // touch the upstream. Emit §12.1 response and DO NOT Report + // back (would create a feedback loop — our own reject counting + // as a failure against the already-OPEN slice). 
+ state_ = State::FAILED; + logging::Get()->info( + "ProxyTransaction circuit-open reject client_fd={} service={} " + "attempt={}", + client_fd_, service_name_, attempt_); + DeliverResponse(MakeCircuitOpenResponse()); + // Clear admission_generation_ — there's nothing to Report. + admission_generation_ = 0; + return false; + } + + // REJECTED_OPEN_DRYRUN: slice logged the would-reject and counted + // it; caller proceeds to the upstream. Fall through as admitted. + // ADMITTED / ADMITTED_PROBE: proceed. + return true; +} + +void ProxyTransaction::ReleaseRetryToken() { + if (retry_token_held_ && retry_budget_) { + retry_budget_->ReleaseRetry(); + } + retry_token_held_ = false; +} + +void ProxyTransaction::ReleaseBreakerAdmissionNeutral() { + if (!slice_ || admission_generation_ == 0) return; + + uint64_t gen = admission_generation_; + admission_generation_ = 0; + bool probe = is_probe_; + is_probe_ = false; + + // Neutral release — no upstream health signal. Decrements the + // per-partition inflight (CLOSED) or the HALF_OPEN probe admitted + // counter, so a cancelled probe doesn't wedge the slice in + // half_open_full. + slice_->ReportNeutral(probe, gen); +} + +void ProxyTransaction::ReportBreakerOutcome(int result_code) { + // No slice, or already reported: bail. admission_generation_==0 is + // the sentinel — slice domain generations start at 1, so a 0 gen + // would be rejected as stale anyway; the early return just avoids + // an unnecessary atomic load. The Report* methods themselves are + // idempotent against stale gens, but we also must not increment a + // probe_*/rejected_ counter for a non-event. + if (!slice_ || admission_generation_ == 0) return; + + // Capture + clear in one go so concurrent / re-entrant calls bail. 
+ uint64_t gen = admission_generation_; + admission_generation_ = 0; + bool probe = is_probe_; + is_probe_ = false; + + using circuit_breaker::FailureKind; + + // Synthetic sentinel for the OnResponseComplete 5xx path — maps to + // RESPONSE_5XX without needing a new public result code. Callers + // other than OnResponseComplete never use this value. + static constexpr int SENTINEL_5XX = -1000; + + switch (result_code) { + case RESULT_SUCCESS: + slice_->ReportSuccess(probe, gen); + return; + + case SENTINEL_5XX: + slice_->ReportFailure(FailureKind::RESPONSE_5XX, probe, gen); + return; + + case RESULT_CHECKOUT_FAILED: + slice_->ReportFailure(FailureKind::CONNECT_FAILURE, probe, gen); + return; + + case RESULT_RESPONSE_TIMEOUT: + slice_->ReportFailure(FailureKind::RESPONSE_TIMEOUT, probe, gen); + return; + + case RESULT_UPSTREAM_DISCONNECT: + case RESULT_SEND_FAILED: + slice_->ReportFailure(FailureKind::UPSTREAM_DISCONNECT, probe, gen); + return; + + case RESULT_POOL_EXHAUSTED: + case RESULT_PARSE_ERROR: + // Local outcomes — no upstream health signal. Release the + // admission slot neutrally so a probe doesn't leak the + // HALF_OPEN slot. + slice_->ReportNeutral(probe, gen); + return; + + case RESULT_CIRCUIT_OPEN: + case RESULT_RETRY_BUDGET_EXHAUSTED: + // Our own rejects — MUST NOT feed back into the slice. + // These paths should not reach ReportBreakerOutcome (both + // clear admission_generation_ before delivering), but the + // defensive branch keeps the class-wide invariant: these + // outcomes are invisible to the breaker. + return; + + default: + // Unknown result code — log and neutral-release to keep the + // probe bookkeeping consistent. A runtime log here is + // cheaper than a slice stuck in HALF_OPEN forever because a + // new result code slipped through unclassified. 
+ logging::Get()->error( + "ReportBreakerOutcome: unclassified result_code={} " + "service={} — releasing neutrally", + result_code, service_name_); + slice_->ReportNeutral(probe, gen); + return; + } +} diff --git a/server/retry_budget.cc b/server/retry_budget.cc new file mode 100644 index 00000000..9723d949 --- /dev/null +++ b/server/retry_budget.cc @@ -0,0 +1,97 @@ +#include "circuit_breaker/retry_budget.h" + +namespace circuit_breaker { + +namespace { + +// Clamp floors for direct-ctor / Reload callers that bypass +// ConfigLoader::Validate(). Mirrors the hardening elsewhere in the +// circuit-breaker code (window ctor, probe budget snapshot, +// ComputeOpenDuration) so programmatic callers can't disable the +// budget by passing pathological values. +// percent < 0 → 0 (pure min_concurrency floor, no %-based cap) +// percent > 100 → 100 (retries capped at total in_flight) +// min_concurrency < 0 → 0 (no floor) +int ClampPercent(int p) { + if (p < 0) return 0; + if (p > 100) return 100; + return p; +} +int ClampMinConcurrency(int m) { + return m < 0 ? 0 : m; +} + +} // namespace + +RetryBudget::RetryBudget(int percent, int min_concurrency) + : percent_(ClampPercent(percent)), + min_concurrency_(ClampMinConcurrency(min_concurrency)) {} + +RetryBudget::InFlightGuard RetryBudget::TrackInFlight() { + in_flight_.fetch_add(1, std::memory_order_relaxed); + return InFlightGuard(&in_flight_); +} + +bool RetryBudget::TryConsumeRetry() { + // Snapshot tuning + both in-flight counters once so the cap is + // computed against a consistent slice. Retrying the cap math inside + // the CAS loop would just churn without improving accuracy + // (in_flight is inherently a moving target). 
+ int64_t in_flight = in_flight_.load(std::memory_order_relaxed); + int64_t retries_in_flight = retries_in_flight_.load(std::memory_order_relaxed); + int pct = percent_.load(std::memory_order_relaxed); + int min_conc = min_concurrency_.load(std::memory_order_relaxed); + + // cap = max(min_concurrency, (in_flight - retries_in_flight) * percent / 100) + // + // Subtracting retries from the in_flight base prevents the budget + // from self-inflating: callers hold TrackInFlight() for BOTH first- + // attempts and retries (per the documented API), so admitting a + // retry increases in_flight_. Using the raw in_flight as the base + // would then increase the cap, which in steady state converges + // above the configured percentage of ORIGINAL traffic (e.g. a 20% + // budget with retries counted in would allow ~25% of originals to + // retry simultaneously; at higher percents the amplification grows + // faster). + // + // Floor the subtraction at 0: `retries_in_flight > in_flight` is + // transiently possible under racing increments (retry admitted and + // in_flight guard observed before first-attempt guard's pair) — + // clamp rather than letting the multiply go negative. + int64_t non_retry_in_flight = in_flight - retries_in_flight; + if (non_retry_in_flight < 0) non_retry_in_flight = 0; + int64_t pct_cap = (non_retry_in_flight * pct) / 100; + int64_t cap = pct_cap > min_conc ? pct_cap : min_conc; + + // Atomically reserve a slot: load current, verify under cap, CAS up + // by 1. Separate load + fetch_add would let N concurrent callers + // all observe current < cap and all increment past the cap — under + // the cross-dispatcher load the retry budget is meant to protect + // against, the gate would stop bounding anything. 
+ int64_t current = retries_in_flight; + while (current < cap) { + if (retries_in_flight_.compare_exchange_weak( + current, current + 1, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + return true; + } + // CAS failure — `current` was updated with the latest value; + // loop re-evaluates against cap. Spurious wakeups on weak CAS + // are also handled by the retry. + } + retries_rejected_.fetch_add(1, std::memory_order_relaxed); + return false; +} + +void RetryBudget::ReleaseRetry() { + retries_in_flight_.fetch_sub(1, std::memory_order_relaxed); +} + +void RetryBudget::Reload(int percent, int min_concurrency) { + percent_.store(ClampPercent(percent), std::memory_order_relaxed); + min_concurrency_.store(ClampMinConcurrency(min_concurrency), + std::memory_order_relaxed); +} + +} // namespace circuit_breaker diff --git a/server/upstream_manager.cc b/server/upstream_manager.cc index 9cd5a284..c4a4314f 100644 --- a/server/upstream_manager.cc +++ b/server/upstream_manager.cc @@ -296,3 +296,13 @@ Dispatcher* UpstreamManager::GetDispatcherForIndex(size_t index) const { bool UpstreamManager::HasUpstream(const std::string& service_name) const { return pools_.find(service_name) != pools_.end(); } + +PoolPartition* UpstreamManager::GetPoolPartition( + const std::string& service_name, + size_t dispatcher_index) { + auto it = pools_.find(service_name); + if (it == pools_.end()) { + return nullptr; + } + return it->second->GetPartition(dispatcher_index); +} diff --git a/test/circuit_breaker_components_test.h b/test/circuit_breaker_components_test.h new file mode 100644 index 00000000..36285b16 --- /dev/null +++ b/test/circuit_breaker_components_test.h @@ -0,0 +1,507 @@ +#pragma once + +#include "test_framework.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "circuit_breaker/retry_budget.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include 
"circuit_breaker/circuit_breaker_manager.h" +#include "dispatcher.h" + +#include +#include +#include +#include + +// Circuit-breaker component unit tests: RetryBudget, CircuitBreakerHost, +// CircuitBreakerManager. +// +// These tests exercise the standalone data structures without any +// integration into the request path (covered by the integration suite). +// Every test constructs the object under test in isolation — no live +// dispatchers, no network I/O. A minimal Dispatcher is instantiated only +// where CircuitBreakerHost::Reload needs one to enqueue per-slice Reload +// calls. +namespace CircuitBreakerComponentsTests { + +using circuit_breaker::CircuitBreakerHost; +using circuit_breaker::CircuitBreakerHostSnapshot; +using circuit_breaker::CircuitBreakerManager; +using circuit_breaker::Decision; +using circuit_breaker::FailureKind; +using circuit_breaker::RetryBudget; +using circuit_breaker::State; + +static CircuitBreakerConfig DefaultCbConfig() { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 50; + cb.minimum_volume = 20; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 3; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + cb.retry_budget_percent = 20; + cb.retry_budget_min_concurrency = 3; + return cb; +} + +// ============================================================================ +// RetryBudget tests +// ============================================================================ + +// Min-concurrency floor: with tiny in_flight, min_concurrency still permits +// the configured floor of concurrent retries (otherwise a 20% budget allows 0 +// retries when in_flight < 5 — useless in low-volume services). +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] RetryBudget: min_concurrency floor permits retries..." + << std::endl; + try { + // percent=20, min=3. Even with 0 in_flight, 3 retries allowed. 
+ RetryBudget rb(20, 3); + + // Without any in_flight, min floor is what gates us. + bool r1 = rb.TryConsumeRetry(); // 1/3 + bool r2 = rb.TryConsumeRetry(); // 2/3 + bool r3 = rb.TryConsumeRetry(); // 3/3 + bool r4 = rb.TryConsumeRetry(); // over → rejected + + bool pass = r1 && r2 && r3 && !r4 && + rb.RetriesInFlight() == 3 && + rb.RetriesRejected() == 1; + + rb.ReleaseRetry(); rb.ReleaseRetry(); rb.ReleaseRetry(); + pass = pass && rb.RetriesInFlight() == 0; + + TestFramework::RecordTest("RetryBudget min_concurrency floor", pass, + pass ? "" : "r1=" + std::to_string(r1) + + " r2=" + std::to_string(r2) + + " r3=" + std::to_string(r3) + + " r4=" + std::to_string(r4) + + " inflight=" + std::to_string(rb.RetriesInFlight()) + + " rejected=" + std::to_string(rb.RetriesRejected()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget min_concurrency floor", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Percent-based cap scales with in_flight. +// percent=20, min=0, in_flight=50 → cap = 10 retries. +void TestRetryBudgetPercentCap() { + std::cout << "\n[TEST] RetryBudget: percent cap scales with in_flight..." + << std::endl; + try { + RetryBudget rb(20, 0); // no min floor — pure percent + + // Push in_flight to 50 via guards that we intentionally keep + // alive. Per the documented API, callers hold TrackInFlight() + // for BOTH first attempts and retries — but TryConsumeRetry + // subtracts retries_in_flight from the base so the budget + // doesn't self-inflate as retries are admitted. + std::vector guards; + for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight()); + + // With 50 non-retry in-flight and 20% budget the first + // admission is against cap=10, but each admission shrinks the + // non-retry base by 1. The admission count converges at r + // where r >= floor((50-r) * 20 / 100). Solving: r = 8. 
The + // pre-fix formula (cap computed from raw in_flight) would + // admit 10, drifting the effective ratio above 20% of + // originals. + int admitted = 0; + for (int i = 0; i < 20; ++i) { + if (rb.TryConsumeRetry()) ++admitted; + } + bool cap_hit = admitted == 8; + bool rejected_count = rb.RetriesRejected() == 12; + + // Release guards — in_flight drops to 0; future TryConsumeRetry with + // min=0 and in_flight=0 rejects everything. + for (auto& g : guards) (void)std::move(g); + guards.clear(); + for (int i = 0; i < admitted; ++i) rb.ReleaseRetry(); + + bool pass = cap_hit && rejected_count && rb.InFlight() == 0 && + rb.RetriesInFlight() == 0; + TestFramework::RecordTest("RetryBudget percent cap", pass, + pass ? "" : "admitted=" + std::to_string(admitted) + + " rejected=" + std::to_string(rb.RetriesRejected()) + + " inflight=" + std::to_string(rb.InFlight()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget percent cap", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// TrackInFlight guards must be RAII-safe: destroying the guard decrements +// in_flight_; moving the guard transfers ownership; self-move safe. +void TestRetryBudgetInFlightGuardRaii() { + std::cout << "\n[TEST] RetryBudget: InFlightGuard RAII..." << std::endl; + try { + RetryBudget rb(20, 3); + + bool zero_init = rb.InFlight() == 0; + { + auto g = rb.TrackInFlight(); + bool one_after_track = rb.InFlight() == 1; + + // Move-construct: counter transfers, original is empty. + auto g2 = std::move(g); + bool still_one_after_move = rb.InFlight() == 1; + // g is now empty, destroying it decrements nothing. + (void)g; + + // g2 goes out of scope next. 
+ if (!zero_init || !one_after_track || !still_one_after_move) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, "mid-test state wrong", + TestFramework::TestCategory::OTHER); + return; + } + } + bool zero_after_drop = rb.InFlight() == 0; + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + zero_after_drop, + zero_after_drop ? "" : "in_flight not zero after guard drop", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Reload updates tuning atomically without resetting in-flight counters — +// the admission formula changes, outstanding retries keep running. +void TestRetryBudgetReloadPreservesCounters() { + std::cout << "\n[TEST] RetryBudget: Reload preserves in-flight..." + << std::endl; + try { + RetryBudget rb(20, 3); + bool r1 = rb.TryConsumeRetry(); // 1/3 + + // Tighten tuning mid-flight. + rb.Reload(10, 1); + + // Outstanding retry is still tracked. + bool inflight_preserved = rb.RetriesInFlight() == 1; + + // New tuning applies — min=1, so 1/1 retry allowed max. + // Current retries_in_flight=1 already, next attempt rejects. + bool r2 = rb.TryConsumeRetry(); + + rb.ReleaseRetry(); + bool cleanup_ok = rb.RetriesInFlight() == 0; + + bool pass = r1 && inflight_preserved && !r2 && cleanup_ok; + TestFramework::RecordTest("RetryBudget Reload preserves counters", pass, + pass ? 
"" : "r1=" + std::to_string(r1) + + " inflight_preserved=" + std::to_string(inflight_preserved) + + " r2=" + std::to_string(r2) + + " cleanup_ok=" + std::to_string(cleanup_ok), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget Reload preserves counters", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Clamp guards: negative percent / negative min_concurrency are clamped at +// construction (mirrors ConfigLoader::Validate — programmatic callers that +// bypass validation get safe defaults). +void TestRetryBudgetClampsInvalidTuning() { + std::cout << "\n[TEST] RetryBudget: clamps invalid tuning..." << std::endl; + try { + RetryBudget rb(-50, -10); + bool clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + // Over-max percent clamps to 100. + RetryBudget rb2(500, 5); + bool over_clamped = rb2.percent() == 100; + + // Reload also clamps. + rb.Reload(-1, -1); + bool reload_clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + bool pass = clamped && over_clamped && reload_clamped; + TestFramework::RecordTest("RetryBudget clamps invalid tuning", pass, + pass ? "" : + "clamped=" + std::to_string(clamped) + + " over_clamped=" + std::to_string(over_clamped) + + " reload_clamped=" + std::to_string(reload_clamped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget clamps invalid tuning", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// CircuitBreakerHost tests +// ============================================================================ + +// Host creates partition_count slices, GetSlice looks up by index, out-of- +// range returns nullptr (not a crash). +void TestHostCreatesSlicesAndGetSlice() { + std::cout << "\n[TEST] CircuitBreakerHost: creates slices + GetSlice..." 
+ << std::endl; + try { + auto cb = DefaultCbConfig(); + CircuitBreakerHost host("svc", "10.0.0.1", 8080, 4, cb); + + bool count_ok = host.partition_count() == 4; + bool slice0 = host.GetSlice(0) != nullptr; + bool slice3 = host.GetSlice(3) != nullptr; + bool slice4_null = host.GetSlice(4) == nullptr; // out of range + bool slice_big_null = host.GetSlice(100) == nullptr; + + // Retry budget always present. + bool rb_present = host.GetRetryBudget() != nullptr; + + // Field getters. + bool fields_ok = host.service_name() == "svc" && + host.host() == "10.0.0.1" && + host.port() == 8080; + + bool pass = count_ok && slice0 && slice3 && slice4_null && + slice_big_null && rb_present && fields_ok; + TestFramework::RecordTest("CircuitBreakerHost GetSlice", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost GetSlice", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Host Snapshot aggregates counters across slices and rolls up states. +void TestHostSnapshotAggregates() { + std::cout << "\n[TEST] CircuitBreakerHost: Snapshot aggregates..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + CircuitBreakerHost host("svc", "h", 80, 3, cb); + + // Trip slice 0 and 2 → 2 open_partitions, 1 closed. 
+ for (int p : {0, 2}) { + auto* s = host.GetSlice(p); + for (int i = 0; i < 2; ++i) { + auto a = s->TryAcquire(); + s->ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + } + + auto snap = host.Snapshot(); + + bool rows_ok = snap.slices.size() == 3; + bool total_trips = snap.total_trips == 2; + bool open = snap.open_partitions == 2; + bool halfopen = snap.half_open_partitions == 0; + bool svc_ok = snap.service_name == "svc" && + snap.host == "h" && snap.port == 80; + + bool pass = rows_ok && total_trips && open && halfopen && svc_ok; + TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", pass, + pass ? "" : + "rows=" + std::to_string(snap.slices.size()) + + " trips=" + std::to_string(snap.total_trips) + + " open=" + std::to_string(snap.open_partitions), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Host Reload with mismatched dispatcher count logs error and does nothing. +// Uses an empty dispatcher vector — the mismatch path must NOT dereference. +void TestHostReloadDispatcherMismatchIsSafe() { + std::cout << "\n[TEST] CircuitBreakerHost: Reload dispatcher mismatch..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + CircuitBreakerHost host("svc", "h", 80, 3, cb); + + auto new_cb = cb; + new_cb.failure_rate_threshold = 80; + + // Mismatch: 0 dispatchers vs 3 slices. Must not crash, must not + // apply (retry budget atomics should stay at old values). + std::vector> empty; + host.Reload(empty, new_cb); + + // Retry budget fields should be unchanged — Reload bailed early. + bool rb_unchanged = + host.GetRetryBudget()->percent() == cb.retry_budget_percent && + host.GetRetryBudget()->min_concurrency() == + cb.retry_budget_min_concurrency; + + TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", + rb_unchanged, + rb_unchanged ? 
"" : "retry budget incorrectly updated on bail", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// CircuitBreakerManager tests +// ============================================================================ + +// Manager builds one host per upstream (regardless of enabled). GetHost +// returns non-null for known names and null for unknown. +void TestManagerGetHostLookup() { + std::cout << "\n[TEST] CircuitBreakerManager: GetHost lookup..." + << std::endl; + try { + std::vector upstreams(2); + upstreams[0].name = "svc-a"; + upstreams[0].host = "10.0.0.1"; + upstreams[0].port = 8080; + upstreams[0].circuit_breaker = DefaultCbConfig(); + upstreams[1].name = "svc-b"; + upstreams[1].host = "10.0.0.2"; + upstreams[1].port = 9090; + upstreams[1].circuit_breaker = DefaultCbConfig(); + upstreams[1].circuit_breaker.enabled = false; // disabled still built + + CircuitBreakerManager mgr(upstreams, 4, {}); + + bool count_ok = mgr.host_count() == 2; + auto* a = mgr.GetHost("svc-a"); + auto* b = mgr.GetHost("svc-b"); + auto* unknown = mgr.GetHost("nope"); + + bool a_ok = a != nullptr && a->port() == 8080 && + a->partition_count() == 4; + bool b_ok = b != nullptr && b->port() == 9090 && + b->partition_count() == 4; + bool unknown_null = unknown == nullptr; + + bool pass = count_ok && a_ok && b_ok && unknown_null; + TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", pass, + pass ? 
"" : + "count_ok=" + std::to_string(count_ok) + + " a=" + std::to_string(a_ok) + + " b=" + std::to_string(b_ok) + + " unknown_null=" + std::to_string(unknown_null), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// SnapshotAll returns one entry per host; topology-preserved Reload logs and +// skips new/removed names without crashing. +void TestManagerSnapshotAllAndReloadSkipsTopologyChanges() { + std::cout << "\n[TEST] CircuitBreakerManager: SnapshotAll + Reload skips topology..." + << std::endl; + try { + std::vector upstreams(1); + upstreams[0].name = "svc-a"; + upstreams[0].host = "h"; + upstreams[0].port = 80; + upstreams[0].circuit_breaker = DefaultCbConfig(); + + CircuitBreakerManager mgr(upstreams, 2, {}); + + auto snaps = mgr.SnapshotAll(); + bool one_snapshot = snaps.size() == 1; + bool snap_name_ok = snaps[0].service_name == "svc-a"; + + // Reload with a NEW name + REMOVED existing name — both must log + // warn and do nothing (topology is restart-only). + std::vector new_upstreams(1); + new_upstreams[0].name = "svc-NEW"; + new_upstreams[0].host = "h"; + new_upstreams[0].port = 80; + new_upstreams[0].circuit_breaker = DefaultCbConfig(); + + mgr.Reload(new_upstreams); + + // Manager must still only know about svc-a (the original). + bool original_preserved = mgr.GetHost("svc-a") != nullptr; + bool new_not_added = mgr.GetHost("svc-NEW") == nullptr; + bool count_stable = mgr.host_count() == 1; + + bool pass = one_snapshot && snap_name_ok && original_preserved && + new_not_added && count_stable; + TestFramework::RecordTest( + "CircuitBreakerManager SnapshotAll + topology-skip", pass, + pass ? 
"" : + "one_snap=" + std::to_string(one_snapshot) + + " name_ok=" + std::to_string(snap_name_ok) + + " preserved=" + std::to_string(original_preserved) + + " new_not_added=" + std::to_string(new_not_added) + + " count=" + std::to_string(mgr.host_count()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CircuitBreakerManager SnapshotAll + topology-skip", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Empty-name upstream is skipped defensively (ConfigLoader::Validate rejects +// empty names, but manager must not blow up if something slips through). +void TestManagerSkipsEmptyNameUpstream() { + std::cout << "\n[TEST] CircuitBreakerManager: skips empty-name upstream..." + << std::endl; + try { + std::vector upstreams(2); + upstreams[0].name = ""; // defensive — should be skipped + upstreams[0].host = "h"; + upstreams[0].port = 80; + upstreams[0].circuit_breaker = DefaultCbConfig(); + upstreams[1].name = "svc-b"; + upstreams[1].host = "h"; + upstreams[1].port = 81; + upstreams[1].circuit_breaker = DefaultCbConfig(); + + CircuitBreakerManager mgr(upstreams, 2, {}); + + bool pass = mgr.host_count() == 1 && + mgr.GetHost("svc-b") != nullptr && + mgr.GetHost("") == nullptr; + TestFramework::RecordTest( + "CircuitBreakerManager skips empty-name upstream", pass, + pass ? "" : "count=" + std::to_string(mgr.host_count()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CircuitBreakerManager skips empty-name upstream", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Run all circuit-breaker component unit tests. 
+void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - COMPONENT UNIT TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestRetryBudgetMinConcurrencyFloor(); + TestRetryBudgetPercentCap(); + TestRetryBudgetInFlightGuardRaii(); + TestRetryBudgetReloadPreservesCounters(); + TestRetryBudgetClampsInvalidTuning(); + + TestHostCreatesSlicesAndGetSlice(); + TestHostSnapshotAggregates(); + TestHostReloadDispatcherMismatchIsSafe(); + + TestManagerGetHostLookup(); + TestManagerSnapshotAllAndReloadSkipsTopologyChanges(); + TestManagerSkipsEmptyNameUpstream(); +} + +} // namespace CircuitBreakerComponentsTests diff --git a/test/circuit_breaker_integration_test.h b/test/circuit_breaker_integration_test.h new file mode 100644 index 00000000..10e72e5b --- /dev/null +++ b/test/circuit_breaker_integration_test.h @@ -0,0 +1,1213 @@ +#pragma once + +// Integration tests: circuit breaker wired into ProxyTransaction + +// UpstreamManager + HttpServer. Exercises the full request path end-to-end. +// +// Strategy: use a backend that returns 5xx on every request so repeated hits +// trip the breaker via the consecutive-failure threshold. 5xx responses are +// the cheapest way to accumulate failures (no connect timeouts to wait for). +// Low thresholds keep tests fast. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" + +#include +#include +#include + +namespace CircuitBreakerIntegrationTests { + +using circuit_breaker::State; + +// Shared helper: build an upstream config that proxies /echo → backend and +// has a breaker configured with low thresholds for fast trip. 
+static UpstreamConfig MakeBreakerUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + // Exact-match route — simpler than prefix patterns for integration tests. + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + // No retries — keeps the test deterministic: one request = one attempt. + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + // Disable the rate-based trip path — we drive everything through + // consecutive failures to keep the test count predictable. + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 500; // short so recovery test is quick + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: Breaker trips on consecutive 5xx responses and emits circuit-open +// headers on the rejected request. +// --------------------------------------------------------------------------- +void TestBreakerTripsAfterConsecutiveFailures() { + std::cout << "\n[TEST] CB Integration: breaker trips after consecutive 5xx..." + << std::endl; + try { + // Backend always returns 502 — gateway classifies the response as + // FailureKind::RESPONSE_5XX and reports to the breaker on every attempt. 
+ HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. // single thread → single breaker partition exercised + gw.upstreams.push_back( + MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Hit the failing backend threshold times — each 502 from backend + // propagates to the client as 502 (gateway pass-through) AND counts + // as a RESPONSE_5XX failure in the breaker. + for (int i = 0; i < 3; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Integration: trip after consecutive failures", false, + "pre-trip request " + std::to_string(i) + " expected 502, got: " + + r.substr(0, 32)); + return; + } + } + + // Next request must be rejected by the breaker (not proxied). The + // response is 503 with X-Circuit-Breaker: open and Retry-After. 
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + bool has_retry_after = + r.find("Retry-After:") != std::string::npos || + r.find("retry-after:") != std::string::npos; + bool has_upstream_host = + r.find("X-Upstream-Host:") != std::string::npos || + r.find("x-upstream-host:") != std::string::npos; + + bool pass = is_503 && has_breaker_header && has_retry_after && + has_upstream_host; + TestFramework::RecordTest( + "CB Integration: trip after consecutive failures", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " breaker_hdr=" + std::to_string(has_breaker_header) + + " retry_after=" + std::to_string(has_retry_after) + + " upstream_host=" + std::to_string(has_upstream_host) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: trip after consecutive failures", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: When circuit_breaker.enabled=false, the breaker is bypassed entirely. +// The same failure pattern that would trip an enabled breaker must leave the +// pass-through path untouched — every request still reaches the backend. +// --------------------------------------------------------------------------- +void TestBreakerDisabledPassesThrough() { + std::cout << "\n[TEST] CB Integration: disabled breaker passes through..." 
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/false, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 10 requests — with breaker disabled, all 10 reach backend.
+ for (int i = 0; i < 10; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (!TestHttpClient::HasStatus(r, 502)) {
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", false,
+ "request " + std::to_string(i) + " expected 502, got: " +
+ r.substr(0, 32));
+ return;
+ }
+ }
+
+ bool all_hit = backend_hits.load() == 10;
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", all_hit,
+ all_hit ? "" :
+ "expected 10 backend hits, got " + std::to_string(backend_hits.load()));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 3: 2xx responses are reported as success — they reset the
+// consecutive-failure counter so the breaker doesn't trip on interleaved
+// success/failure traffic.
+// ---------------------------------------------------------------------------
+void TestSuccessResetsConsecutiveFailureCounter() {
+ std::cout << "\n[TEST] CB Integration: 2xx success resets consecutive-failure counter..."
+ << std::endl;
+ try {
+ std::atomic<bool> fail_mode{true};
+ HttpServer backend("127.0.0.1", 0);
+ // Backend must serve /fail — that's the exact-match route the
+ // proxy forwards (MakeBreakerUpstream sets route_prefix="/fail",
+ // strip_prefix=false). A different backend path would leave
+ // the gateway 404-ing every request without ever exercising
+ // the proxy, and the CLOSED-state assertion below would pass
+ // for the wrong reason.
+ backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) {
+ if (fail_mode.load()) {
+ resp.Status(502).Body("err", "text/plain");
+ } else {
+ resp.Status(200).Body("ok", "text/plain");
+ }
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails.
+ // With reset semantics, consecutive_failures_ never exceeds 2 → no trip.
+ for (int i = 0; i < 2; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL
+ }
+ fail_mode.store(false);
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset
+ fail_mode.store(true);
+ for (int i = 0; i < 2; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL
+ }
+
+ // Inspect the breaker's state directly. The slice must be CLOSED
+ // AND must have observed activity — without the second check, a
+ // gateway that 404's every request (e.g. because the proxy route
+ // doesn't match) would also pass trivially.
+ auto* cbm = gateway.GetUpstreamManager() ?
+ gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr;
+ auto* host = cbm ? cbm->GetHost("svc") : nullptr;
+ auto* slice = host ? host->GetSlice(0) : nullptr;
+ bool still_closed = slice && slice->CurrentState() == State::CLOSED;
+ // No trip fired: total_trips should be zero for this slice.
+ int64_t trips = slice ? slice->Trips() : -1;
+ bool no_trips = (trips == 0);
+
+ bool pass = still_closed && no_trips;
+ TestFramework::RecordTest(
+ "CB Integration: success resets consecutive counter", pass,
+ pass ? "" :
+ "state=" + std::to_string(static_cast<int>(
+ slice ? slice->CurrentState() : State::CLOSED)) +
+ " trips=" + std::to_string(trips));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: success resets consecutive counter", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 4: After the trip, the live slice state is OPEN. Verifies the
+// integration actually drives the slice state machine (not just the response).
+// ---------------------------------------------------------------------------
+void TestTripDrivesSliceState() {
+ std::cout << "\n[TEST] CB Integration: trip drives slice state to OPEN..."
+ << std::endl;
+ try {
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 3 failures → trip.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+
+ // worker_threads=1 pins all 3 failing requests to dispatcher 0;
+ // still check the aggregate snapshot for robustness — at least
+ // one partition must be OPEN with exactly one trip recorded.
+ auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager();
+ auto* host = cbm->GetHost("svc");
+ auto snap = host->Snapshot();
+ bool at_least_one_open = snap.open_partitions >= 1;
+ bool one_trip = snap.total_trips == 1;
+ // Sanity: the tripped partition should be the one that saw all 3
+ // failures (consecutive trip is single-slice, not cross-slice).
+ bool single_partition_tripped = snap.open_partitions == 1;
+
+ bool pass = at_least_one_open && one_trip && single_partition_tripped;
+ TestFramework::RecordTest(
+ "CB Integration: trip drives slice state to OPEN", pass,
+ pass ?
"" :
+ "at_least_one_open=" + std::to_string(at_least_one_open) +
+ " one_trip=" + std::to_string(one_trip) +
+ " single_partition=" + std::to_string(single_partition_tripped) +
+ " (open_partitions=" + std::to_string(snap.open_partitions) +
+ ", total_trips=" + std::to_string(snap.total_trips) + ")");
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: trip drives slice state to OPEN", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 5: Breaker-rejected requests do NOT hit the backend. After the trip,
+// subsequent requests must be served locally (503) without any upstream I/O.
+// Prevents regression where the gate leaked admissions to a known-bad upstream.
+// ---------------------------------------------------------------------------
+void TestOpenBreakerShortCircuitsUpstreamCall() {
+ std::cout << "\n[TEST] CB Integration: OPEN breaker short-circuits upstream call..."
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 3 failing requests to trip.
+ for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + int hits_at_trip = backend_hits.load(); + + // 5 more requests — all should be rejected locally. + for (int i = 0; i < 5; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + int hits_after = backend_hits.load(); + + // Backend hits must not grow during the post-trip burst. + bool no_leak = hits_after == hits_at_trip; + TestFramework::RecordTest( + "CB Integration: OPEN short-circuits upstream call", no_leak, + no_leak ? "" : + "backend hits grew from " + std::to_string(hits_at_trip) + + " to " + std::to_string(hits_after) + " after trip"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: OPEN short-circuits upstream call", false, e.what()); + } +} + +// Sanity check: verify the bare proxy setup works without the breaker +// before blaming the breaker integration. +void TestBareProxyWorks() { + std::cout << "\n[TEST] CB Integration: bare proxy (sanity)..." 
<< std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u; + u.name = "svc"; + u.host = "127.0.0.1"; + u.port = backend_port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.proxy.route_prefix = "/fail"; + u.proxy.response_timeout_ms = 5000; + u.circuit_breaker.enabled = true; // sanity + breaker enabled + u.circuit_breaker.consecutive_failure_threshold = 3; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 500; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + bool pass = TestHttpClient::HasStatus(r, 502); + TestFramework::RecordTest( + "CB Integration: bare proxy sanity", pass, + pass ? "" : "expected 502, got: " + r.substr(0, 128)); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Integration: bare proxy sanity", + false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 7: Retry-After header carries a sensible value — within [1, configured +// max_open_duration_ms / 1000], and in the right ballpark of OpenUntil()-now. 
+// --------------------------------------------------------------------------- +void TestRetryAfterHeaderValue() { + std::cout << "\n[TEST] CB Integration: Retry-After value correctness..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // base_open_duration 2000ms, max 60_000ms — Retry-After should + // ceiling-round and fall inside [1, 60]. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 2000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + // Capture the open-rejection response. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + + // Extract Retry-After integer value (case-insensitive header). + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Contract: value ≥ 1 and ≤ max_open_duration_ms / 1000 (60). 
+ // For base_open_duration 2000ms the remaining-seconds at this
+ // moment is ≤ 2 (probably 1 or 2 after ceiling), so the upper
+ // sanity bound is generous but still rules out 300/3600-class
+ // buggy fallbacks.
+ bool in_range = (retry_after >= 1 && retry_after <= 60);
+ bool reasonable = (retry_after >= 1 && retry_after <= 3);
+
+ bool pass = is_503 && in_range && reasonable;
+ TestFramework::RecordTest(
+ "CB Integration: Retry-After value in range", pass,
+ pass ? "" :
+ "is_503=" + std::to_string(is_503) +
+ " retry_after=" + std::to_string(retry_after) +
+ " body=" + r.substr(0, 256));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: Retry-After value in range", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 8: Retry loop is terminal on CIRCUIT_OPEN — even with max_retries=3,
+// a request that hits an OPEN breaker gets exactly ONE 503 (no retry-flavored
+// second 503). Ensures ReportBreakerOutcome doesn't feed the reject back into
+// the breaker and MaybeRetry stays out.
+// ---------------------------------------------------------------------------
+void TestCircuitOpenTerminalForRetry() {
+ std::cout << "\n[TEST] CB Integration: CIRCUIT_OPEN terminal for retry loop..."
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // Retries enabled on 5xx — if the breaker reject leaked into
+ // MaybeRetry, the test would see extra backend hits after the
+ // trip.
Long open window so the breaker stays OPEN for the + // duration of the post-trip assertion (no HALF_OPEN probe + // admission racing the test). + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. Each pre-trip request may retry up to 3 + // times (all failing 5xx), so backend sees up to 3*threshold=12 + // hits. That's acceptable — we just care about post-trip behavior. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + } + int pre_trip_hits = backend_hits.load(); + + // Post-trip request: expect a single 503 and NO new backend hits. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + int post_trip_hits = backend_hits.load(); + bool no_new_hits = (post_trip_hits == pre_trip_hits); + + bool pass = is_503 && no_new_hits; + TestFramework::RecordTest( + "CB Integration: CIRCUIT_OPEN terminal for retry", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " pre=" + std::to_string(pre_trip_hits) + + " post=" + std::to_string(post_trip_hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: CIRCUIT_OPEN terminal for retry", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 9: Dry-run mode — dry_run=true forwards rejected requests to the +// upstream (pass-through) but still increments the rejected_ counter so +// operators can observe the would-reject rate without production impact. 
+// ---------------------------------------------------------------------------
+void TestDryRunPassthrough() {
+ std::cout << "\n[TEST] CB Integration: dry-run passthrough..." << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ u.circuit_breaker.dry_run = true; // would-reject, but still forward
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip thresholds with 5 requests. All should reach backend (502),
+ // not a 503 — dry-run never short-circuits.
+ for (int i = 0; i < 5; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (!TestHttpClient::HasStatus(r, 502)) {
+ TestFramework::RecordTest(
+ "CB Integration: dry-run passthrough", false,
+ "request " + std::to_string(i) +
+ " expected 502, got: " + r.substr(0, 64));
+ return;
+ }
+ }
+
+ bool all_hit = (backend_hits.load() == 5);
+
+ // Verify the slice observed trips/rejected even though traffic passed.
+ auto* mgr = gateway.GetUpstreamManager() ?
+ gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t trips = 0, rejected = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + trips = snap.total_trips; + rejected = snap.total_rejected; + } + } + // At least one trip fired (consecutive_threshold=3 → slice + // transitioned at least once during the run), and the post-trip + // requests were counted as would-reject (rejected > 0). + bool observed = (trips >= 1) && (rejected >= 1); + + bool pass = all_hit && observed; + TestFramework::RecordTest( + "CB Integration: dry-run passthrough", pass, + pass ? "" : + "hits=" + std::to_string(backend_hits.load()) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 10: HALF_OPEN → CLOSED recovery round-trip through the proxy. Trip the +// breaker, wait for the open window to elapse, then serve success responses +// and assert the slice transitions back to CLOSED (consecutive_successes +// crosses the threshold — default 2 from DefaultCbConfig / integration config). +// --------------------------------------------------------------------------- +void TestHalfOpenRecoveryRoundTrip() { + std::cout << "\n[TEST] CB Integration: HALF_OPEN → CLOSED recovery..." 
+ << std::endl;
+ try {
+ std::atomic<bool> fail_mode{true};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) {
+ if (fail_mode.load()) {
+ resp.Status(502).Body("err", "text/plain");
+ } else {
+ resp.Status(200).Body("ok", "text/plain");
+ }
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ // Short open duration so recovery path finishes quickly.
+ u.circuit_breaker.base_open_duration_ms = 300;
+ u.circuit_breaker.max_open_duration_ms = 1000;
+ // Two probes needed to close (default permitted_half_open_calls=2).
+ u.circuit_breaker.permitted_half_open_calls = 2;
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip by hitting the failing backend.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+
+ // Flip backend to success and wait for the open window to elapse.
+ fail_mode.store(false);
+ std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+ // Probe the proxy — each successful 200 advances HALF_OPEN toward
+ // CLOSED. Do more than permitted_half_open_calls; some will be
+ // rejected as half_open_full but the ones that are admitted will
+ // close the breaker.
+ bool saw_success = false;
+ for (int i = 0; i < 8; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (TestHttpClient::HasStatus(r, 200)) saw_success = true;
+ // Small gap between probes — HALF_OPEN only admits permitted
+ // probes per cycle; spacing lets subsequent probes observe a
+ // possibly-closed breaker.
+ std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // Verify slice aggregate: at least one CLOSED transition observed + // (probe_successes >= 1 and total_trips == 1 — we only tripped once). + auto* mgr = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t probe_succ = 0; + int open_parts = 0, half_open_parts = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + probe_succ = 0; + for (const auto& row : snap.slices) { + probe_succ += row.probe_successes; + } + open_parts = snap.open_partitions; + half_open_parts = snap.half_open_partitions; + } + } + + // Recovery complete: saw at least one 200 through the breaker, + // at least one probe success counted, and no partition still + // stuck in OPEN (HALF_OPEN may still linger on the unused slice, + // which is fine for a 2-partition setup). + bool pass = saw_success && (probe_succ >= 1) && (open_parts == 0); + TestFramework::RecordTest( + "CB Integration: HALF_OPEN → CLOSED recovery", pass, + pass ? "" : + "saw_success=" + std::to_string(saw_success) + + " probe_succ=" + std::to_string(probe_succ) + + " open_parts=" + std::to_string(open_parts) + + " half_open_parts=" + std::to_string(half_open_parts)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN → CLOSED recovery", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 11: Retry-After ceils the config cap from a non-second-aligned +// max_open_duration_ms (e.g. 1500ms → 2s, not 1s). Floor-rounding the cap +// would clamp the advertised retry window below what the breaker honors, +// causing well-behaved clients to re-hit the 503. 
+// --------------------------------------------------------------------------- +void TestRetryAfterCapCeilsNonAlignedMax() { + std::cout << "\n[TEST] CB Integration: Retry-After cap ceils non-aligned max..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // Configure a non-second-aligned max backoff. base = 1500ms so + // the actual OpenUntil-now at trip time is ~1.5s, which ceil- + // rounds to 2s. If cfg_cap_secs floor-rounded max_open_duration + // (1500ms → 1s), the clamp would drop Retry-After to 1s even + // though the breaker would keep rejecting through the second + // half of that window. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 1500; + u.circuit_breaker.max_open_duration_ms = 1500; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Expectation: Retry-After is in [1, 2] — cfg_cap_secs 
ceil- + // rounds 1500ms to 2s, and the remaining-time ceil-rounds to + // 2 at the moment of trip (may be 1 if enough wall-clock has + // elapsed between trip and response). Critically it must NEVER + // be zero or exceed 2 (clamped to the 2s cap). + bool in_range = (retry_after >= 1 && retry_after <= 2); + TestFramework::RecordTest( + "CB Integration: Retry-After ceils non-aligned cap", in_range, + in_range ? "" : + "retry_after=" + std::to_string(retry_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: Retry-After ceils non-aligned cap", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 12: Retried failures are reported BEFORE the retry fires. With retries +// enabled on 5xx, each attempt's outcome must be counted against the breaker; +// otherwise the slice trips only after the final retry exhausts, under- +// counting failures and potentially never tripping if retries mask enough of +// them. Verifies the trip still happens within the expected number of client +// requests once reporting is attached to the retry path. +// --------------------------------------------------------------------------- +void TestRetriedFailuresCountTowardTrip() { + std::cout << "\n[TEST] CB Integration: retried failures count toward trip..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // Retries on 5xx enabled. threshold=3 — with retry_on_5xx, each + // client request produces 1 + max_retries=3 = 4 upstream + // attempts, each reporting RESPONSE_5XX via the ReportBreakerOutcome + // path that this fix patches in. 
The breaker must trip after + // at most 3 upstream failure reports (which the first client + // request alone produces). + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // One client request → 4 upstream attempts → 4 RESPONSE_5XX + // reports. Threshold=3 should trip during this single request. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + // Second client request must hit the OPEN breaker → 503. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + + bool pass = is_503 && has_breaker_header; + TestFramework::RecordTest( + "CB Integration: retried failures count toward trip", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " breaker_hdr=" + std::to_string(has_breaker_header) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: retried failures count toward trip", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 13: HALF_OPEN rejects emit a distinct X-Circuit-Breaker label. +// TryAcquire returns REJECTED_OPEN for three situations (true OPEN, +// half_open_full, half_open_recovery_failing). When the slice is in +// HALF_OPEN, OpenUntil is cleared and a generic MakeCircuitOpenResponse +// would fall back to Retry-After=1 + X-Circuit-Breaker:open — misleading +// clients. 
The fix emits X-Circuit-Breaker:half_open for HALF_OPEN rejects +// with a more conservative Retry-After hint. +// +// Strategy: trip the breaker, wait for the open window to elapse so the +// slice transitions HALF_OPEN on the next admission attempt, then flood +// concurrent requests so some hit half_open_full. +// --------------------------------------------------------------------------- +void TestHalfOpenRejectLabel() { + std::cout << "\n[TEST] CB Integration: HALF_OPEN reject label..." + << std::endl; + try { + // Backend hangs to keep probes in-flight so later concurrent + // requests hit half_open_full. + std::atomic hang{false}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { + if (hang.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(600)); + } + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 200; + u.circuit_breaker.max_open_duration_ms = 500; + u.circuit_breaker.permitted_half_open_calls = 1; // tiny budget + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + // Wait for the open window to elapse so the next admission + // flips the slice to HALF_OPEN. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + + // Flip backend to hang so the probe occupies the single probe + // slot while we fire sibling requests that must hit half_open_full. 
+ hang.store(true); + + std::atomic saw_half_open{false}; + std::atomic saw_open{false}; + auto probe = [&](int id) { + (void)id; + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); + if (!TestHttpClient::HasStatus(r, 503)) return; + if (r.find("X-Circuit-Breaker: half_open") != std::string::npos || + r.find("x-circuit-breaker: half_open") != std::string::npos) { + saw_half_open.store(true); + } + if (r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos) { + // We want to distinguish the labels; the "open" substring + // also matches "half_open". Only count true "open" if + // "half_open" didn't appear in THIS response. + if (r.find("half_open") == std::string::npos) { + saw_open.store(true); + } + } + }; + + std::vector threads; + for (int i = 0; i < 6; ++i) { + threads.emplace_back(probe, i); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + for (auto& t : threads) t.join(); + + // Pass if at least one HALF_OPEN-labelled reject was observed. + // saw_open may or may not be observed (some rejects could have + // hit between cycles) — the key contract is that HALF_OPEN + // rejects no longer get the plain "open" label. + bool pass = saw_half_open.load(); + TestFramework::RecordTest( + "CB Integration: HALF_OPEN reject label", pass, + pass ? "" : + "saw_half_open=" + std::to_string(saw_half_open.load()) + + " saw_open=" + std::to_string(saw_open.load())); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN reject label", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 14: HALF_OPEN Retry-After reflects the current exponential backoff, +// not just base_open_duration_ms. 
After multiple trips the next OPEN window +// (base << consecutive_trips_, clamped by max) can exceed 1 second; the old +// base-only hint (ceil(base/1000) = 1s for base=100ms) would under-report +// the worst-case wait, which this test must fail for. +// +// Strategy: keep the backend failing and drive MULTIPLE re-trips by letting +// the OPEN window elapse and single probe fail each cycle. Successful +// recoveries must be avoided — TransitionHalfOpenToClosed resets +// consecutive_trips_ to 0, which hides the exponential hint. +// --------------------------------------------------------------------------- +void TestHalfOpenRetryAfterScalesWithBackoff() { + std::cout << "\n[TEST] CB Integration: HALF_OPEN Retry-After exponential..." + << std::endl; + try { + // Backend fails fast by default. When `hang` is set, the + // handler blocks — used at the end to pin the probe slot so + // a concurrent request observes HALF_OPEN rejection. + std::atomic hang{false}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { + if (hang.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(1500)); + } + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; // pin all traffic to slice[0] + gw.http2.enabled = false; + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/2); + u.circuit_breaker.base_open_duration_ms = 100; // config minimum + u.circuit_breaker.max_open_duration_ms = 8000; // cap at 8s + u.circuit_breaker.permitted_half_open_calls = 1; // single probe + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto* cbm = gateway.GetUpstreamManager() ? 
+ gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; + auto* host = cbm ? cbm->GetHost("svc") : nullptr; + auto* slice = host ? host->GetSlice(0) : nullptr; + if (!slice) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", + false, "slice lookup failed"); + return; + } + + // Initial trip: 2 consecutive failures with threshold=2. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + // Drive consecutive_trips_ up by letting successive OPEN windows + // elapse and probes fail (no recovery → no reset). Stop when + // NextOpenDurationMs crosses 1000ms, which is the threshold + // where the HALF_OPEN Retry-After hint starts exceeding the + // base-only value (ceil(100ms)=1s). + // + // The slice re-trips on each failed probe; each trip doubles + // the open duration. We run ~8 cycles with safety margin which + // is comfortably past the trip count needed for Retry-After>=2. + for (int cycle = 0; cycle < 8; ++cycle) { + // Wait past the current open window. Upper bound: max=8s, + // so 1200ms is plenty for the first few short cycles, and + // we re-check after each request anyway. + int64_t next_ms = slice->NextOpenDurationMs(); + // Current OPEN window is the one stored BEFORE the upcoming + // re-trip — we don't have that directly, so sleep past the + // NEXT duration as an over-approximation (next is always >= + // current). This ensures OPEN has elapsed. + auto sleep_ms = std::max(next_ms + 50, 200); + if (sleep_ms > 2000) sleep_ms = 2000; // cap per cycle + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); + + // One request — it should admit as a probe (HALF_OPEN), + // the backend fails fast (502), probe fails → re-trip with + // consecutive_trips_++ and fresh OPEN. + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + // Bail early once the exponential hint crosses 1s → the + // subsequent HALF_OPEN reject will carry Retry-After >= 2. 
+ if (slice->NextOpenDurationMs() >= 2000) break; + } + + int64_t next_open_ms = slice->NextOpenDurationMs(); + if (next_open_ms < 2000) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", + false, + "setup failed: next_open_ms=" + std::to_string(next_open_ms) + + " (need >= 2000 to distinguish from base-only hint)"); + return; + } + + // Now trigger a HALF_OPEN reject: wait for current OPEN to + // elapse, start a hanging probe (pins the slot), then fire a + // sibling request — it must see half_open_full with the + // exponential Retry-After. + int64_t post_wait_ms = next_open_ms + 100; + if (post_wait_ms > 4000) post_wait_ms = 4000; + std::this_thread::sleep_for(std::chrono::milliseconds(post_wait_ms)); + + hang.store(true); + std::thread probe([&]() { + TestHttpClient::HttpGet(gw_port, "/fail", 3500); + }); + // Let the probe get admitted and start hanging. + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); + hang.store(false); + probe.join(); + + bool is_half_open = + r.find("X-Circuit-Breaker: half_open") != std::string::npos || + r.find("x-circuit-breaker: half_open") != std::string::npos; + + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Post-fix: Retry-After = ceil(next_open_ms / 1000) >= 2. + // Pre-fix (base-only): Retry-After = ceil(base/1000) = 1. + // Asserting >= 2 fails the pre-fix implementation. 
+ bool retry_after_ok = (retry_after >= 2 && retry_after <= 8); + bool pass = is_half_open && retry_after_ok; + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", pass, + pass ? "" : + "is_half_open=" + std::to_string(is_half_open) + + " retry_after=" + std::to_string(retry_after) + + " next_open_ms=" + std::to_string(next_open_ms)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", + false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - INTEGRATION TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestBareProxyWorks(); + TestBreakerTripsAfterConsecutiveFailures(); + TestBreakerDisabledPassesThrough(); + TestSuccessResetsConsecutiveFailureCounter(); + TestTripDrivesSliceState(); + TestOpenBreakerShortCircuitsUpstreamCall(); + TestRetryAfterHeaderValue(); + TestCircuitOpenTerminalForRetry(); + TestDryRunPassthrough(); + TestHalfOpenRecoveryRoundTrip(); + TestRetryAfterCapCeilsNonAlignedMax(); + TestRetriedFailuresCountTowardTrip(); + TestHalfOpenRejectLabel(); + TestHalfOpenRetryAfterScalesWithBackoff(); +} + +} // namespace CircuitBreakerIntegrationTests diff --git a/test/circuit_breaker_observability_test.h b/test/circuit_breaker_observability_test.h new file mode 100644 index 00000000..42694a67 --- /dev/null +++ b/test/circuit_breaker_observability_test.h @@ -0,0 +1,405 @@ +#pragma once + +// Observability integration tests: observability — counter accuracy, snapshot +// API correctness, and log emission. +// +// Phases 2-6 each added counters and log lines as a side effect of their +// functional work. This suite locks those in as regressions: +// +// * Counters (§11.2): trips, rejected, probe_successes, probe_failures, +// retries_rejected surface through CircuitBreakerManager::SnapshotAll. 
+// * Snapshot API (§11.3): per-slice rows aggregate into host-level +// totals; host-level fields (retries_in_flight / retries_rejected / +// in_flight) reflect the owning RetryBudget. +// * Logs (§11.1): the CLOSED→OPEN trip emits the full-context message +// including trigger, consecutive_failures, window_total, +// window_fail_rate, open_for_ms, and consecutive_trips. +// +// The log-emission test attaches a spdlog ring-buffer sink to the logger +// for the duration of the test, triggers a trip, then asserts the +// captured messages contain the expected fields. No log file I/O. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" +#include "spdlog/sinks/ringbuffer_sink.h" + +#include +#include +#include +#include +#include +#include + +namespace CircuitBreakerObservabilityTests { + +using circuit_breaker::State; + +static UpstreamConfig MakeObservUpstream(const std::string& name, + const std::string& host, + int port, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + 
u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration — keep the slice OPEN so post-trip assertions + // don't race a HALF_OPEN transition. + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: Snapshot API reflects per-slice trip/rejected counters and +// host-level aggregates. Drives N+1 requests against a backend that always +// 502s (N to trip, 1 more that the OPEN slice short-circuits) and asserts +// the snapshot shows total_trips >= 1, total_rejected >= 1, +// open_partitions >= 1. +// --------------------------------------------------------------------------- +void TestSnapshotReflectsCounters() { + std::cout << "\n[TEST] CB Observability: snapshot reflects counters..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeObservUpstream("svc", "127.0.0.1", backend_port, + /*threshold=*/3); + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip (3 failures), then 2 more to accumulate rejected counter. 
+ for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (!cbm) { + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", false, + "no circuit breaker manager attached"); + return; + } + auto snaps = cbm->SnapshotAll(); + bool found = false; + int64_t trips = 0, rejected = 0, probe_s = 0, probe_f = 0; + int open_parts = 0; + for (const auto& s : snaps) { + if (s.service_name == "svc") { + trips = s.total_trips; + rejected = s.total_rejected; + open_parts = s.open_partitions; + for (const auto& row : s.slices) { + probe_s += row.probe_successes; + probe_f += row.probe_failures; + } + found = true; + break; + } + } + + bool pass = found + && trips >= 1 + && rejected >= 2 // 2 post-trip short-circuits + && open_parts >= 1 + && probe_s == 0 // never entered HALF_OPEN + && probe_f == 0; + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", pass, + pass ? "" : + "found=" + std::to_string(found) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected) + + " open_parts=" + std::to_string(open_parts) + + " probe_s=" + std::to_string(probe_s) + + " probe_f=" + std::to_string(probe_f)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The CLOSED→OPEN trip log emits the §11.1 full-context message. +// Attaches a spdlog ringbuffer_sink to the shared logger, triggers a trip, +// then inspects the captured messages for the key tokens. The sink is +// removed before the test returns so it doesn't affect later tests. 
+// --------------------------------------------------------------------------- +void TestTripLogEmission() { + std::cout << "\n[TEST] CB Observability: trip log emission..." << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeObservUpstream("svc-log", "127.0.0.1", backend_port, + /*threshold=*/2); + gw.upstreams.push_back(u); + + // `HttpServer` construction calls `logging::Init()` which rebuilds + // the default logger via `spdlog::set_default_logger`. Any sink + // attached BEFORE that point lands on a stale logger. Attach the + // ringbuffer sink AFTER the last HttpServer construction so it + // captures the live logger's output. + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // Drive exactly threshold=2 failures to trip. + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + // Give the dispatcher a breath to emit + the sink to settle. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + // Scan for the trip message. Look for the static prefix plus the + // §11.1 field tokens. + bool saw_tripped = false; + bool has_trigger = false; + bool has_consec_failures = false; + bool has_window_total = false; + bool has_fail_rate = false; + bool has_open_for_ms = false; + bool has_consec_trips = false; + for (const auto& msg : messages) { + if (msg.find("circuit breaker tripped") == std::string::npos) { + continue; + } + saw_tripped = true; + if (msg.find("trigger=") != std::string::npos) has_trigger = true; + if (msg.find("consecutive_failures=") != std::string::npos) + has_consec_failures = true; + if (msg.find("window_total=") != std::string::npos) + has_window_total = true; + if (msg.find("window_fail_rate=") != std::string::npos) + has_fail_rate = true; + if (msg.find("open_for_ms=") != std::string::npos) + has_open_for_ms = true; + if (msg.find("consecutive_trips=") != std::string::npos) + has_consec_trips = true; + } + + bool pass = saw_tripped && has_trigger && has_consec_failures && + has_window_total && has_fail_rate && + has_open_for_ms && has_consec_trips; + TestFramework::RecordTest( + "CB Observability: trip log emission", pass, + pass ? 
"" : + "saw_tripped=" + std::to_string(saw_tripped) + + " trigger=" + std::to_string(has_trigger) + + " consec_failures=" + std::to_string(has_consec_failures) + + " window_total=" + std::to_string(has_window_total) + + " fail_rate=" + std::to_string(has_fail_rate) + + " open_for_ms=" + std::to_string(has_open_for_ms) + + " consec_trips=" + std::to_string(has_consec_trips)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: trip log emission", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Retry-budget observability — the exhausted log carries the +// §11.1 fields (service, in_flight, retries_in_flight, cap), and the +// host snapshot reflects retries_rejected. +// --------------------------------------------------------------------------- +void TestRetryBudgetObservability() { + std::cout << "\n[TEST] CB Observability: retry budget observability..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // Budget: zero percent AND zero floor → every retry rejected. + auto u = MakeObservUpstream("svc-budget", "127.0.0.1", backend_port, + /*threshold=*/10000); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.retry_budget_percent = 0; + u.circuit_breaker.retry_budget_min_concurrency = 0; + gw.upstreams.push_back(u); + + // Attach the ringbuffer AFTER gateway construction — see + // TestTripLogEmission for rationale (HttpServer's ctor + // replaces the default logger via logging::Init, detaching + // any previously-attached sinks). 
+ HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // One client request: first attempt hits backend (502), retry + // blocked by budget → 503 + X-Retry-Budget-Exhausted. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + bool saw_exhausted = false; + bool has_service = false; + bool has_inflight = false; + bool has_retries_inflight = false; + bool has_cap = false; + for (const auto& msg : messages) { + if (msg.find("retry budget exhausted") == std::string::npos) { + continue; + } + saw_exhausted = true; + if (msg.find("service=") != std::string::npos) has_service = true; + if (msg.find("in_flight=") != std::string::npos) + has_inflight = true; + if (msg.find("retries_in_flight=") != std::string::npos) + has_retries_inflight = true; + if (msg.find("cap=") != std::string::npos) has_cap = true; + } + + // Snapshot: retries_rejected must be >= 1 (every rejection increments). + int64_t retries_rejected = 0; + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (cbm) { + for (const auto& s : cbm->SnapshotAll()) { + if (s.service_name == "svc-budget") { + // Host aggregate — single host, so the sum is the + // host's retries_rejected. The snapshot doesn't yet + // expose that directly — derive from RetryBudget + // via the host getter. 
+ auto* host = cbm->GetHost("svc-budget"); + if (host) { + retries_rejected = + host->GetRetryBudget()->RetriesRejected(); + } + break; + } + } + } + + bool pass = saw_exhausted && has_service && has_inflight && + has_retries_inflight && has_cap && + retries_rejected >= 1; + TestFramework::RecordTest( + "CB Observability: retry budget observability", pass, + pass ? "" : + "saw_exhausted=" + std::to_string(saw_exhausted) + + " service=" + std::to_string(has_service) + + " inflight=" + std::to_string(has_inflight) + + " retries_inflight=" + std::to_string(has_retries_inflight) + + " cap=" + std::to_string(has_cap) + + " retries_rejected=" + std::to_string(retries_rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: retry budget observability", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - OBSERVABILITY TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestSnapshotReflectsCounters(); + TestTripLogEmission(); + TestRetryBudgetObservability(); +} + +} // namespace CircuitBreakerObservabilityTests diff --git a/test/circuit_breaker_reload_test.h b/test/circuit_breaker_reload_test.h new file mode 100644 index 00000000..5b63e6b4 --- /dev/null +++ b/test/circuit_breaker_reload_test.h @@ -0,0 +1,594 @@ +#pragma once + +// Reload integration tests: hot-reload of circuit-breaker fields. +// +// UpstreamConfig::operator== now excludes `circuit_breaker` — a CB-only +// SIGHUP is a clean reload that propagates via HttpServer::Reload → +// CircuitBreakerManager::Reload → per-host per-slice Reload enqueued on +// each owning dispatcher. +// +// Topology fields (host, port, pool, proxy, tls) remain restart-only. 
+// +// Strategy: construct a gateway with an enabled breaker, capture the +// initial slice config, call HttpServer::Reload with an edited +// CircuitBreakerConfig, and verify the slice's live config reflects the +// edit. The reload-log capture also verifies the manager-level log lines +// ("CircuitBreakerManager::Reload: new/removed upstream ...") fire for +// topology-change SIGHUPs. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" +#include "spdlog/sinks/ringbuffer_sink.h" + +#include +#include +#include +#include +#include + +namespace CircuitBreakerReloadTests { + +static UpstreamConfig MakeReloadUpstream(const std::string& name, + const std::string& host, + int port) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.consecutive_failure_threshold = 3; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 5000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: CB-only SIGHUP propagates to live slice config. 
+// +// Build gateway with threshold=3. Reload with threshold=7. Verify the +// slice's live config().consecutive_failure_threshold flipped to 7. +// --------------------------------------------------------------------------- +void TestCbReloadPropagatesToSlice() { + std::cout << "\n[TEST] CB Reload: reload propagates to slice..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* host = cbm->GetHost("svc"); + auto* slice = host->GetSlice(0); + int threshold_before = slice->config().consecutive_failure_threshold; + int window_before = slice->config().window_seconds; + + // Build reloaded config with modified CB fields only. + ServerConfig reloaded = gw; + reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 7; + reloaded.upstreams[0].circuit_breaker.window_seconds = 20; + + bool ok = gateway.Reload(reloaded); + // Reload enqueues per-slice updates on the owning dispatcher — + // brief sleep to let the dispatcher execute the queued Slice::Reload. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + int threshold_after = slice->config().consecutive_failure_threshold; + int window_after = slice->config().window_seconds; + + bool pass = ok && threshold_before == 3 && window_before == 10 + && threshold_after == 7 && window_after == 20; + TestFramework::RecordTest( + "CB Reload: reload propagates to slice", pass, + pass ? 
"" : + "ok=" + std::to_string(ok) + + " threshold_before=" + std::to_string(threshold_before) + + " threshold_after=" + std::to_string(threshold_after) + + " window_before=" + std::to_string(window_before) + + " window_after=" + std::to_string(window_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: reload propagates to slice", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: CB-only reload does NOT emit the topology "restart required" +// warning. UpstreamConfig::operator== excludes circuit_breaker so a +// CB-only edit doesn't make the outer config != comparison true — the +// warning fires only on topology-field changes (host, port, pool, proxy, +// tls), which remain restart-only. +// --------------------------------------------------------------------------- +void TestCbOnlyReloadNoRestartWarn() { + std::cout << "\n[TEST] CB Reload: CB-only reload emits no restart warn..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + // Attach ringbuffer sink AFTER gateway ctor (logging::Init + // rebuilds the default logger). See the observability test for rationale. 
+ auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + ServerConfig reloaded = gw; + reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 9; + + gateway.Reload(reloaded); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_topology_warn = false; + bool saw_cb_config_applied = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("upstream topology changes require a restart") != + std::string::npos) { + saw_topology_warn = true; + } + if (msg.find("circuit breaker config applied") != + std::string::npos) { + saw_cb_config_applied = true; + } + } + + bool pass = !saw_topology_warn && saw_cb_config_applied; + TestFramework::RecordTest( + "CB Reload: CB-only reload emits no restart warn", pass, + pass ? "" : + "saw_topology_warn=" + std::to_string(saw_topology_warn) + + " saw_cb_config_applied=" + std::to_string(saw_cb_config_applied)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: CB-only reload emits no restart warn", false, + e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Topology change (pool field edit) STILL emits the restart warn +// — the exclusion of circuit_breaker from operator== must NOT compromise +// the restart-required signal for unreloadable fields. 
+// --------------------------------------------------------------------------- +void TestTopologyChangeStillEmitsRestartWarn() { + std::cout << "\n[TEST] CB Reload: topology change still warns..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + ServerConfig reloaded = gw; + // Topology-level edit that operator== still detects. + reloaded.upstreams[0].pool.max_connections = 16; + // Also flip a breaker field so we verify BOTH happen on the + // same reload (live CB edit + topology warn). 
+ reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 5; + + gateway.Reload(reloaded); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_topology_warn = false; + bool saw_cb_config_applied = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("upstream topology changes require a restart") != + std::string::npos) { + saw_topology_warn = true; + } + if (msg.find("circuit breaker config applied") != + std::string::npos) { + saw_cb_config_applied = true; + } + } + + bool pass = saw_topology_warn && saw_cb_config_applied; + TestFramework::RecordTest( + "CB Reload: topology change still warns", pass, + pass ? "" : + "saw_topology_warn=" + std::to_string(saw_topology_warn) + + " saw_cb_config_applied=" + std::to_string(saw_cb_config_applied)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: topology change still warns", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 4: Disable → enable toggle via reload. A CB-only reload that sets +// `enabled=false` must make the slice short-circuit admissions; a +// subsequent reload flipping `enabled=true` must re-engage the state +// machine without requiring a restart. Verifies the "wire transition +// callbacks for ALL upstreams regardless of enabled" design (§3.1 R3-1). +// --------------------------------------------------------------------------- +void TestReloadDisableThenEnable() { + std::cout << "\n[TEST] CB Reload: reload disable→enable..." 
<< std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + + // Start: enabled=true. + bool enabled_before = slice->config().enabled; + + // Reload to enabled=false. + ServerConfig disabled = gw; + disabled.upstreams[0].circuit_breaker.enabled = false; + gateway.Reload(disabled); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bool disabled_after = !slice->config().enabled; + + // Reload back to enabled=true with a new threshold. + ServerConfig reenabled = gw; + reenabled.upstreams[0].circuit_breaker.enabled = true; + reenabled.upstreams[0].circuit_breaker.consecutive_failure_threshold = 11; + gateway.Reload(reenabled); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bool enabled_again = slice->config().enabled; + int threshold_after = slice->config().consecutive_failure_threshold; + + bool pass = enabled_before && disabled_after && + enabled_again && threshold_after == 11; + TestFramework::RecordTest( + "CB Reload: reload disable→enable", pass, + pass ? 
"" : + "enabled_before=" + std::to_string(enabled_before) + + " disabled_after=" + std::to_string(disabled_after) + + " enabled_again=" + std::to_string(enabled_again) + + " threshold_after=" + std::to_string(threshold_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: reload disable→enable", false, e.what()); + } +} + +// Regression: a SIGHUP carrying an invalid CB threshold (e.g. +// `consecutive_failure_threshold = 0`) on an EXISTING upstream must +// be hard-rejected. The downgrade-to-warn behavior of the wider +// `Validate()` call would otherwise push the bad value into live +// slices even though startup rejects the same file. +void TestReloadRejectsInvalidCbField() { + std::cout << "\n[TEST] CB Reload: invalid CB tuning is hard-rejected..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + // Build an invalid reload — threshold below the [1, 10000] range. + ServerConfig invalid = gw; + invalid.upstreams[0].circuit_breaker.consecutive_failure_threshold = 0; + + bool reload_returned = gateway.Reload(invalid); + // The slice's threshold must NOT have been pushed live. + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + int live_threshold = slice->config().consecutive_failure_threshold; + + bool pass = reload_returned == false && live_threshold == 3; + TestFramework::RecordTest( + "CB Reload: invalid CB tuning is hard-rejected", pass, + pass ? 
"" : + "reload_returned=" + std::to_string(reload_returned) + + " live_threshold=" + std::to_string(live_threshold) + + " (expected reload=false, threshold=3 unchanged)"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: invalid CB tuning is hard-rejected", false, e.what()); + } +} + +// Regression: with `dry_run=true`, the CLOSED→OPEN transition callback +// must NOT drain the partition wait queue (shadow-mode contract: log +// would-reject decisions, admit traffic). The breaker's dry_run check +// inside the transition callback covers this; the regression we lock +// in is the log-emitted breadcrumb plus the absence of CHECKOUT_CIRCUIT_OPEN +// to queued waiters. +void TestDryRunDoesNotDrainOnTrip() { + std::cout << "\n[TEST] CB Reload: dry-run skips wait-queue drain on trip..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u = MakeReloadUpstream("svc", "127.0.0.1", backend_port); + u.circuit_breaker.dry_run = true; + u.circuit_breaker.consecutive_failure_threshold = 2; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + 
} + } guard{logger, ring, prev_level}; + + int gw_port = gw_runner.GetPort(); + // Trip the breaker via 2 failures. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_dryrun_drain_skip = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("[dry-run] circuit breaker would drain wait queue") != + std::string::npos) { + saw_dryrun_drain_skip = true; + break; + } + } + + TestFramework::RecordTest( + "CB Reload: dry-run skips wait-queue drain on trip", + saw_dryrun_drain_skip, + saw_dryrun_drain_skip ? "" : + "expected '[dry-run] circuit breaker would drain wait queue' log line"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: dry-run skips wait-queue drain on trip", false, e.what()); + } +} + +// Regression: when `dry_run` flips true→false on a slice that's +// currently OPEN, `Slice::Reload` fires a synthetic OPEN→OPEN +// transition with trigger="dry_run_disabled". The HttpServer-installed +// callback recognizes it and drains the partition queue so shadow-mode +// waiters don't leak through to the upstream once enforcement is back on. +void TestDryRunDisableOnOpenTriggersDrainSignal() { + std::cout << "\n[TEST] CB Reload: dry_run disable on OPEN triggers drain..." 
+ << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u = MakeReloadUpstream("svc", "127.0.0.1", backend_port); + u.circuit_breaker.dry_run = true; + u.circuit_breaker.consecutive_failure_threshold = 2; + u.circuit_breaker.base_open_duration_ms = 60000; // long open window + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker (dry-run still records the trip; state goes OPEN). + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + bool was_open = slice->CurrentState() == circuit_breaker::State::OPEN; + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // Reload with dry_run=false, everything else same. + ServerConfig disable_dry = gw; + disable_dry.upstreams[0].circuit_breaker.dry_run = false; + gateway.Reload(disable_dry); + std::this_thread::sleep_for(std::chrono::milliseconds(150)); + + // The synthetic-callback fire path emits a slice-side log line. 
+ bool saw_flush_log = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("dry_run disabled while OPEN") != std::string::npos && + msg.find("flushing wait queue") != std::string::npos) { + saw_flush_log = true; + break; + } + } + bool live_dry_run = slice->config().dry_run; + bool still_open = slice->CurrentState() == circuit_breaker::State::OPEN; + + bool pass = was_open && !live_dry_run && saw_flush_log && still_open; + TestFramework::RecordTest( + "CB Reload: dry_run disable on OPEN triggers drain", pass, + pass ? "" : + "was_open=" + std::to_string(was_open) + + " live_dry_run=" + std::to_string(live_dry_run) + + " saw_flush_log=" + std::to_string(saw_flush_log) + + " still_open=" + std::to_string(still_open)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: dry_run disable on OPEN triggers drain", false, + e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - HOT-RELOAD TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestCbReloadPropagatesToSlice(); + TestCbOnlyReloadNoRestartWarn(); + TestTopologyChangeStillEmitsRestartWarn(); + TestReloadDisableThenEnable(); + TestReloadRejectsInvalidCbField(); + TestDryRunDoesNotDrainOnTrip(); + TestDryRunDisableOnOpenTriggersDrainSignal(); +} + +} // namespace CircuitBreakerReloadTests diff --git a/test/circuit_breaker_retry_budget_test.h b/test/circuit_breaker_retry_budget_test.h new file mode 100644 index 00000000..608a0602 --- /dev/null +++ b/test/circuit_breaker_retry_budget_test.h @@ -0,0 +1,367 @@ +#pragma once + +// Retry-budget integration tests: retry budget wired into ProxyTransaction. +// +// The component suite covers the RetryBudget math (CAS, non-retry +// denominator, min-concurrency floor) as unit tests against the +// RetryBudget class in isolation. 
This suite tests the INTEGRATION: +// ProxyTransaction resolves +// `retry_budget_` from the same CircuitBreakerHost as `slice_`, tracks +// every attempt's in_flight via the RAII guard, and consults +// `TryConsumeRetry` before each retry. Exhaustion emits the §12.2 +// response (503 + `X-Retry-Budget-Exhausted: 1`) and does NOT feed +// back into the slice's failure math. +// +// Strategy: backends that always 502 with `retry_on_5xx=true` drive the +// retry path. A near-zero retry-budget (`percent=0, min_concurrency=0`) +// rejects every retry deterministically without needing concurrent +// client load. The circuit-breaker consecutive-failure threshold is +// raised well above the retry count so the breaker stays CLOSED — the +// budget gate is tested in isolation from the state machine. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include + +namespace CircuitBreakerRetryBudgetTests { + +// Upstream config that always proxies /fail, with the circuit breaker +// enabled so `retry_budget_` is resolved on `slice_`'s host. Breaker +// thresholds intentionally unreachable for these tests — we want the +// retry-budget gate fired in isolation, not co-tripping the state +// machine. 
+static UpstreamConfig MakeRetryBudgetUpstream(const std::string& name, + const std::string& host, + int port, + int retry_budget_percent, + int retry_budget_min_concurrency, + bool dry_run = false) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 16; + u.pool.max_idle_connections = 8; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.dry_run = dry_run; + // Breaker thresholds unreachable — we don't want the state machine + // tripping during a retry-budget test. + u.circuit_breaker.consecutive_failure_threshold = 10000; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + + u.circuit_breaker.retry_budget_percent = retry_budget_percent; + u.circuit_breaker.retry_budget_min_concurrency = retry_budget_min_concurrency; + return u; +} + +static bool HasRetryBudgetHeader(const std::string& response) { + return response.find("X-Retry-Budget-Exhausted: 1") != std::string::npos || + response.find("x-retry-budget-exhausted: 1") != std::string::npos; +} + +// --------------------------------------------------------------------------- +// Test 1: A retry attempt rejected by the retry-budget gate delivers 503 + +// X-Retry-Budget-Exhausted instead of the upstream's 5xx. Verifies that +// `TryConsumeRetry` runs BEFORE the retry executes and that +// `MakeRetryBudgetResponse` is emitted through the standard DeliverResponse +// path. +// +// retry_budget_percent=0 + retry_budget_min_concurrency=0 → cap = 0. 
Every +// retry attempt's TryConsumeRetry returns false. First attempt is +// unaffected (budget only gates retries), so the backend is hit exactly +// once per client request; the retry is short-circuited locally. +// --------------------------------------------------------------------------- +void TestRetryBudgetRejectsRetry() { + std::cout << "\n[TEST] CB Retry Budget: retry budget rejects retry..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_budget_hdr = HasRetryBudgetHeader(r); + // Backend should have been hit exactly once (the first attempt); + // every retry was short-circuited by the budget gate. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_backend_hit = (hits == 1); + + bool pass = is_503 && has_budget_hdr && single_backend_hit; + TestFramework::RecordTest( + "CB Retry Budget: retry budget rejects retry", pass, + pass ? 
"" : + "is_503=" + std::to_string(is_503) + + " budget_hdr=" + std::to_string(has_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget rejects retry", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The min-concurrency floor admits retries even when the %-based +// cap would be zero. With percent=0 + min_concurrency=5, a single sequential +// client request's retry chain (1 first + 3 retries = 4 backend hits) all +// fit under the floor and proceed normally to the upstream — no 503, no +// X-Retry-Budget-Exhausted, and the client sees the final 5xx response. +// +// This is the symmetric test to Test 1: same near-zero %-cap, but a floor +// large enough that retries aren't budget-gated. Proves the floor is +// consulted (retries admitted) instead of the %-cap (retries rejected). +// --------------------------------------------------------------------------- +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] CB Retry Budget: retry budget min-concurrency floor..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // percent=0 → no %-based capacity. min_concurrency=5 → floor + // admits up to 5 concurrent retries, easily covering the 3 + // sequential retries from a single client request. 
+ auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/5); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Client sees the upstream's final 502 — no local 503, no + // X-Retry-Budget-Exhausted. + bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + // 1 first attempt + 3 retries admitted by the floor = 4 backend hits. + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_retries_proceeded = (hits == 4); + + bool pass = is_502 && no_budget_hdr && all_retries_proceeded; + TestFramework::RecordTest( + "CB Retry Budget: retry budget min-concurrency floor", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget min-concurrency floor", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Dry-run bypasses the retry-budget gate. +// +// With percent=0 + min_concurrency=0 (same as Test 1), TryConsumeRetry +// returns false for every retry. But `circuit_breaker.dry_run=true` +// switches the rejection path to a log-and-proceed: no token is +// consumed, retry_token_held_ stays false, and AttemptCheckout runs as +// though the budget was unlimited. +// +// Result: the client sees the upstream's 502 response (because the +// retries actually fire), NOT a 503 + X-Retry-Budget-Exhausted. 
+// --------------------------------------------------------------------------- +void TestRetryBudgetDryRunPassthrough() { + std::cout << "\n[TEST] CB Retry Budget: retry budget dry-run passthrough..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0, + /*dry_run=*/true); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Retries proceeded despite would-reject decisions — the client + // sees the upstream's final 502, not our local 503. + bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_attempts_ran = (hits == 3); // 1 first + 2 retries + + bool pass = is_502 && no_budget_hdr && all_attempts_ran; + TestFramework::RecordTest( + "CB Retry Budget: retry budget dry-run passthrough", pass, + pass ? 
"" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 4: First attempts are NOT budget-gated. +// +// The retry-budget cap applies only to retries (attempt_ > 0). First +// attempts call TrackInFlight (which only ever increments) but skip +// TryConsumeRetry entirely. With percent=0 + min_concurrency=0 and a +// backend that always 200s, every client request must succeed — if the +// gate accidentally ran on first attempts, we'd see 503s here. +// +// Guards against a regression where TryConsumeRetry is called before +// the `attempt_ > 0` gate, or where the gate is placed in +// AttemptCheckout instead of MaybeRetry. +// --------------------------------------------------------------------------- +void TestFirstAttemptsNotGated() { + std::cout << "\n[TEST] CB Retry Budget: first attempts not gated..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(200).Body("ok", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + // No retries — every request is a first attempt. 
+ u.proxy.retry.max_retries = 0; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + int client_count = 5; + int successes = 0; + for (int i = 0; i < client_count; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (TestHttpClient::HasStatus(r, 200)) ++successes; + if (HasRetryBudgetHeader(r)) { + // Any X-Retry-Budget-Exhausted on a first-attempt-only + // path is a bug. Record and bail. + TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", false, + "unexpected X-Retry-Budget-Exhausted on first-attempt path " + "i=" + std::to_string(i)); + return; + } + } + + int hits = backend_hits.load(std::memory_order_relaxed); + bool pass = (successes == client_count) && (hits == client_count); + TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", pass, + pass ? "" : + "successes=" + std::to_string(successes) + + "/" + std::to_string(client_count) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - RETRY BUDGET INTEGRATION TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestRetryBudgetRejectsRetry(); + TestRetryBudgetMinConcurrencyFloor(); + TestRetryBudgetDryRunPassthrough(); + TestFirstAttemptsNotGated(); +} + +} // namespace CircuitBreakerRetryBudgetTests diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h new file mode 100644 index 00000000..bed54da0 --- /dev/null +++ b/test/circuit_breaker_test.h @@ -0,0 +1,2070 @@ +#pragma once + +#include "test_framework.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_window.h" +#include 
"circuit_breaker/circuit_breaker_slice.h" + +#include +#include +#include + +namespace CircuitBreakerTests { + +using circuit_breaker::CircuitBreakerSlice; +using circuit_breaker::CircuitBreakerWindow; +using circuit_breaker::Decision; +using circuit_breaker::FailureKind; +using circuit_breaker::State; + +// A simple mock clock that advances only when the test tells it to. +class MockClock { +public: + std::chrono::steady_clock::time_point now{ + // Choose a non-zero base so 0 is distinguishable from "not OPEN". + std::chrono::steady_clock::time_point(std::chrono::seconds(1'000'000)) + }; + void Advance(std::chrono::milliseconds ms) { now += ms; } + void AdvanceSec(int seconds) { now += std::chrono::seconds(seconds); } + std::chrono::steady_clock::time_point operator()() const { return now; } +}; + +// Build a config with default values — tests override specific fields. +static CircuitBreakerConfig DefaultEnabledConfig() { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 50; + cb.minimum_volume = 20; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + return cb; +} + +// ============================================================================ +// State machine tests +// ============================================================================ + +void TestDisabledFastPath() { + std::cout << "\n[TEST] CB: Disabled fast path..." << std::endl; + try { + CircuitBreakerConfig cb; // enabled=false by default + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + bool pass = slice.TryAcquire().decision == Decision::ADMITTED && + slice.CurrentState() == State::CLOSED; + + // Reporting 100 failures must not trip. 
+ for (int i = 0; i < 100; ++i) {
+ slice.ReportFailure(FailureKind::CONNECT_FAILURE, false, slice.CurrentGenerationForTesting());
+ }
+ pass = pass && slice.CurrentState() == State::CLOSED &&
+ slice.Trips() == 0;
+
+ TestFramework::RecordTest("CB: disabled fast path", pass, "",
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CB: disabled fast path", false, e.what(),
+ TestFramework::TestCategory::OTHER);
+ }
+}
+
+void TestClosedStaysClosedBelowConsecutiveThreshold() {
+ std::cout << "\n[TEST] CB: 4 failures below threshold..." << std::endl;
+ try {
+ auto cb = DefaultEnabledConfig();
+ auto clock = std::make_shared<MockClock>();
+ CircuitBreakerSlice slice("svc:h:p p=0", 0, cb,
+ [clock]() { return clock->now; });
+
+ for (int i = 0; i < 4; ++i) {
+ slice.ReportFailure(FailureKind::CONNECT_FAILURE, false, slice.CurrentGenerationForTesting());
+ }
+ bool pass = slice.CurrentState() == State::CLOSED &&
+ slice.TryAcquire().decision == Decision::ADMITTED &&
+ slice.Trips() == 0;
+ TestFramework::RecordTest("CB: 4 failures below threshold", pass, "",
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CB: 4 failures below threshold", false,
+ e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+void TestConsecutiveFailureTrip() {
+ std::cout << "\n[TEST] CB: 5 consecutive failures trip..."
<< std::endl;
+ try {
+ auto cb = DefaultEnabledConfig();
+ auto clock = std::make_shared<MockClock>();
+ CircuitBreakerSlice slice("svc:h:p p=0", 0, cb,
+ [clock]() { return clock->now; });
+
+ for (int i = 0; i < 5; ++i) {
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting());
+ }
+ bool pass = slice.CurrentState() == State::OPEN &&
+ slice.Trips() == 1 &&
+ slice.TryAcquire().decision == Decision::REJECTED_OPEN;
+ TestFramework::RecordTest("CB: 5 consecutive failures trip", pass, "",
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CB: 5 consecutive failures trip", false,
+ e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+void TestFailureRateTrip() {
+ std::cout << "\n[TEST] CB: failure-rate trip (50% of 20)..." << std::endl;
+ try {
+ auto cb = DefaultEnabledConfig();
+ cb.consecutive_failure_threshold = 1000; // disable consec path
+ auto clock = std::make_shared<MockClock>();
+ CircuitBreakerSlice slice("svc:h:p p=0", 0, cb,
+ [clock]() { return clock->now; });
+
+ // Alternate 10 failures and 10 successes within the same second —
+ // ratio = 50%, total = 20 (>= minimum_volume).
+ for (int i = 0; i < 10; ++i) {
+ slice.ReportSuccess(false, slice.CurrentGenerationForTesting());
+ }
+ // A success between-failures clears consecutive_failures_, confirming
+ // only rate path can trip here.
+ for (int i = 0; i < 9; ++i) {
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting());
+ }
+ // Still CLOSED — 9/19 < 50%.
+ bool pass_pre = slice.CurrentState() == State::CLOSED;
+ // 10th failure brings ratio to 10/20 = 50% exactly — tripper.
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + bool pass = pass_pre && slice.CurrentState() == State::OPEN && + slice.Trips() == 1; + TestFramework::RecordTest("CB: failure-rate trip (50% of 20)", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: failure-rate trip (50% of 20)", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestMinimumVolumeGate() { + std::cout << "\n[TEST] CB: minimum_volume gate..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.minimum_volume = 20; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // 19 total calls, all failures — should NOT trip (below volume). + for (int i = 0; i < 19; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + bool pass = slice.CurrentState() == State::CLOSED && slice.Trips() == 0; + TestFramework::RecordTest("CB: minimum_volume gate", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: minimum_volume gate", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +void TestOpenBeforeDurationStaysOpen() { + std::cout << "\n[TEST] CB: OPEN rejects before elapsed..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + // Advance less than base_open_duration_ms (5000ms). 
+ clock->Advance(std::chrono::milliseconds(2000)); + Decision d = slice.TryAcquire().decision; + bool pass = d == Decision::REJECTED_OPEN && + slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: OPEN rejects before elapsed", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: OPEN rejects before elapsed", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestOpenToHalfOpenAfterDuration() { + std::cout << "\n[TEST] CB: OPEN → HALF_OPEN after duration..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + Decision d = slice.TryAcquire().decision; + bool pass = d == Decision::ADMITTED_PROBE && + slice.CurrentState() == State::HALF_OPEN; + TestFramework::RecordTest("CB: OPEN -> HALF_OPEN after duration", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: OPEN -> HALF_OPEN after duration", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenAllProbesSucceed() { + std::cout << "\n[TEST] CB: HALF_OPEN 5 probe successes close..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Take 5 probes; report success on each. 
+ for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + Decision d = slice.TryAcquire().decision; + if (d != Decision::ADMITTED_PROBE) { + TestFramework::RecordTest( + "CB: HALF_OPEN 5 probe successes close", false, + "probe " + std::to_string(i) + " not ADMITTED_PROBE", + TestFramework::TestCategory::OTHER); + return; + } + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); + } + bool pass = slice.CurrentState() == State::CLOSED && + slice.ProbeSuccesses() == 5; + TestFramework::RecordTest("CB: HALF_OPEN 5 probe successes close", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN 5 probe successes close", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenProbeFailureReopens() { + std::cout << "\n[TEST] CB: HALF_OPEN single probe fail re-opens..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Take 1 probe, fail it. 
+ Decision d = slice.TryAcquire().decision; + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); + bool pass = d == Decision::ADMITTED_PROBE && + slice.CurrentState() == State::OPEN && + slice.Trips() == 2 && // initial trip + re-trip + slice.ProbeFailures() == 1; + TestFramework::RecordTest("CB: HALF_OPEN probe fail re-opens", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN probe fail re-opens", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenExhaustedSlotsRejected() { + std::cout << "\n[TEST] CB: HALF_OPEN over capacity rejects..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + // Take 5 probes but DON'T report outcomes yet. + for (int i = 0; i < 5; ++i) slice.TryAcquire(); + // 6th TryAcquire must reject (all slots taken). + Decision d = slice.TryAcquire().decision; + bool pass = d == Decision::REJECTED_OPEN; + TestFramework::RecordTest("CB: HALF_OPEN over capacity rejects", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN over capacity rejects", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestExponentialBackoff() { + std::cout << "\n[TEST] CB: exponential backoff progression..." 
<< std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + cb.max_open_duration_ms = 8000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto trip_then_probe_fail = [&]() { + // Reach OPEN. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + }; + auto measure_open_ms = [&]() { + // open_until - now at the instant of the trip. + auto open_until = slice.OpenUntil(); + auto remaining = open_until - clock->now; + return std::chrono::duration_cast( + remaining).count(); + }; + + // Trip 1 — expect ~1000ms. + trip_then_probe_fail(); + int64_t d1 = measure_open_ms(); + // Move to HALF_OPEN and fail the probe → trip 2. + clock->Advance(std::chrono::milliseconds(d1 + 1)); + slice.TryAcquire(); // HALF_OPEN, ADMITTED_PROBE + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); + int64_t d2 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d2 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); + int64_t d3 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d3 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); + int64_t d4 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d4 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); + int64_t d5 = measure_open_ms(); + + // Expect 1000, 2000, 4000, 8000, 8000 (capped). 
+ bool pass = d1 == 1000 && d2 == 2000 && d3 == 4000 && + d4 == 8000 && d5 == 8000; + std::string err = "d1=" + std::to_string(d1) + " d2=" + std::to_string(d2) + + " d3=" + std::to_string(d3) + " d4=" + std::to_string(d4) + + " d5=" + std::to_string(d5); + TestFramework::RecordTest("CB: exponential backoff", + pass, pass ? "" : err, TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: exponential backoff", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +void TestResetOnClose() { + std::cout << "\n[TEST] CB: consecutive_trips resets on close..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip 1. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(1001)); + // Move to HALF_OPEN. + for (int i = 0; i < 5; ++i) { + slice.TryAcquire(); + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); + } + // Now CLOSED. Trip again — expect base_duration again (not doubled). + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + auto open_until = slice.OpenUntil(); + auto remaining = open_until - clock->now; + int64_t d_after_close = std::chrono::duration_cast< + std::chrono::milliseconds>(remaining).count(); + bool pass = d_after_close == 1000; + TestFramework::RecordTest("CB: trips reset on close", pass, + pass ? 
"" : "expected 1000ms, got " + std::to_string(d_after_close), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: trips reset on close", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// Window tests +// ============================================================================ + +void TestWindowBucketByCurrentSecond() { + std::cout << "\n[TEST] CB Window: bucket by current second..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddSuccess(t0); + w.AddFailure(t0); + w.AddFailure(t0); + bool pass = w.TotalCount(t0) == 3 && w.FailureCount(t0) == 2; + TestFramework::RecordTest("CB Window: bucket by current second", pass, + "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: bucket by current second", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowAdvanceSkipsStale() { + std::cout << "\n[TEST] CB Window: advance skips stale..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); // bucket 100%10 = 0 + auto t1 = t0 + std::chrono::seconds(15); // beyond window + // After long idle, incoming record should see zero history. + bool pre = w.TotalCount(t1) == 0; + w.AddSuccess(t1); + bool pass = pre && w.TotalCount(t1) == 1 && w.FailureCount(t1) == 0; + TestFramework::RecordTest("CB Window: advance skips stale", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: advance skips stale", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowPartialExpiry() { + std::cout << "\n[TEST] CB Window: partial expiry..." 
<< std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); // sec 100 + auto t1 = t0 + std::chrono::seconds(5); + w.AddFailure(t1); // sec 105 + auto t2 = t0 + std::chrono::seconds(11); + // sec 100 is now out of window (100 + 10 <= 111 - 1 = 110). So: + // bucket 0 (sec 100 or sec 110) would have been zeroed when advancing + // from head=105 past sec 110. + bool pass = w.TotalCount(t2) == 1 && w.FailureCount(t2) == 1; + TestFramework::RecordTest("CB Window: partial expiry", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: partial expiry", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowReset() { + std::cout << "\n[TEST] CB Window: reset clears..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); w.AddSuccess(t0); w.AddFailure(t0); + w.Reset(); + bool pass = w.TotalCount(t0) == 0 && w.FailureCount(t0) == 0; + TestFramework::RecordTest("CB Window: reset clears", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: reset clears", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// Dry-run + Reload + Edge cases +// ============================================================================ + +void TestDryRunAdmits() { + std::cout << "\n[TEST] CB: dry_run admits through OPEN..." 
<< std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.dry_run = true; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + // OPEN + dry_run → REJECTED_OPEN_DRYRUN (caller proceeds). + Decision d = slice.TryAcquire().decision; + bool pass = d == Decision::REJECTED_OPEN_DRYRUN && + slice.CurrentState() == State::OPEN && + slice.Rejected() == 1; + TestFramework::RecordTest("CB: dry_run admits through OPEN", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: dry_run admits through OPEN", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestReloadPreservesState() { + std::cout << "\n[TEST] CB: reload preserves live state..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + // OPEN at this point. + auto cb2 = cb; + cb2.consecutive_failure_threshold = 2; // tighter + cb2.window_seconds = 30; // triggers ring resize + slice.Reload(cb2); + // Still OPEN immediately after reload — live state preserved. + bool pass = slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: reload preserves live state", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: reload preserves live state", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestConsecutiveThresholdOne() { + std::cout << "\n[TEST] CB: threshold=1 single failure trips..." 
<< std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 1; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + bool pass = slice.CurrentState() == State::OPEN && slice.Trips() == 1; + TestFramework::RecordTest("CB: threshold=1 single failure trips", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: threshold=1 single failure trips", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestSuccessClearsConsecutive() { + std::cout << "\n[TEST] CB: success clears consecutive..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 4; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + slice.ReportSuccess(false, slice.CurrentGenerationForTesting()); // resets consecutive + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + // consecutive is back to 1, no trip. 
+ bool pass = slice.CurrentState() == State::CLOSED; + TestFramework::RecordTest("CB: success clears consecutive", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: success clears consecutive", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// Regression tests — critical bugs caught in code review +// ============================================================================ + +// BUG: late non-probe failure after trip re-entered TripClosedToOpen, inflating +// consecutive_trips_ (→ longer backoff) and firing a spurious CLOSED→OPEN +// transition edge. Fix: guard ReportFailure(probe=false) on state_ == CLOSED. +void TestLateFailureAfterTripDoesNotInflateBackoff() { + std::cout << "\n[TEST] CB: late failure after trip does not inflate backoff..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + cb.max_open_duration_ms = 60000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Admit 10 requests in CLOSED. Slice state is single-threaded so + // admission + bookkeeping is serialized by the event loop — but in + // production the outcomes for those admitted requests can arrive after + // the slice has already tripped. + for (int i = 0; i < 10; ++i) { + Decision d = slice.TryAcquire().decision; + if (d != Decision::ADMITTED) { + TestFramework::RecordTest("CB: late failure after trip", + false, "admission i=" + std::to_string(i) + " not ADMITTED", + TestFramework::TestCategory::OTHER); + return; + } + } + // Report 5 failures — trip at the 5th. 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + if (slice.CurrentState() != State::OPEN) { + TestFramework::RecordTest("CB: late failure after trip", false, + "expected OPEN after 5 failures", + TestFramework::TestCategory::OTHER); + return; + } + int64_t trips_after_first_trip = slice.Trips(); + // Capture open_until immediately post-trip. + auto open_until_initial = slice.OpenUntil(); + + // Now the remaining 5 in-flight requests land with late failures. + // Before the fix, each of these would go through the CLOSED path, + // climb consecutive_failures_, and trigger another TripClosedToOpen + // even though state is already OPEN. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + int64_t trips_after_late = slice.Trips(); + auto open_until_after_late = slice.OpenUntil(); + + bool pass = slice.CurrentState() == State::OPEN && + trips_after_late == trips_after_first_trip && // no ghost trip + open_until_after_late == open_until_initial; // backoff unchanged + TestFramework::RecordTest( + "CB: late failure after trip does not inflate backoff", + pass, pass ? "" : + "trips: " + std::to_string(trips_after_first_trip) + + " → " + std::to_string(trips_after_late), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: late failure after trip does not inflate backoff", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG: late non-probe success after trip would reset consecutive_failures_ +// and pollute the sliding window (pretending a fresh CLOSED cycle observed +// successes). Fix: guard ReportSuccess(probe=false) on state_ == CLOSED. +void TestLateSuccessAfterTripIgnored() { + std::cout << "\n[TEST] CB: late success after trip ignored..." 
<< std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + // Slice is OPEN now. A late success arrives — must not change state. + State pre = slice.CurrentState(); + slice.ReportSuccess(false, slice.CurrentGenerationForTesting()); + bool pass = pre == State::OPEN && slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: late success after trip ignored", pass, + "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: late success after trip ignored", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG: HALF_OPEN admission kept accepting probes after the first probe +// failure (only enforcing `inflight < permitted`), so under load a failed +// recovery cycle could keep leaking traffic indefinitely instead of re-OPENing +// after the in-flight probes drained. Fix: short-circuit on saw_failure. +void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { + std::cout << "\n[TEST] CB: HALF_OPEN stops admitting after probe fail..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 5; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip the breaker. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes. Report failure on the first (but NOT the second yet + // — leave 1 in-flight so we can observe the short-circuit). 
+ Decision d1 = slice.TryAcquire().decision; // ADMITTED_PROBE, inflight=1 + Decision d2 = slice.TryAcquire().decision; // ADMITTED_PROBE, inflight=2 + if (d1 != Decision::ADMITTED_PROBE || d2 != Decision::ADMITTED_PROBE) { + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + false, "probes not admitted as expected", + TestFramework::TestCategory::OTHER); + return; + } + // Fail the first probe — inflight drops to 1, saw_failure=true. + // Last-probe trip does not yet fire (inflight is still 1). + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); + + // State must still be HALF_OPEN (final probe not yet completed). + State mid = slice.CurrentState(); + + // Subsequent TryAcquire — BEFORE fix this would succeed because + // inflight (1) < permitted (5). AFTER fix it short-circuits because + // saw_failure is set. + Decision d3 = slice.TryAcquire().decision; + + bool pass = mid == State::HALF_OPEN && + d3 == Decision::REJECTED_OPEN; + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + pass, pass ? "" : "expected REJECTED_OPEN on 3rd TryAcquire", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Verifies the dedicated HALF_OPEN-full counter is bumped separately from the +// generic `rejected_` counter, so observability snapshots can distinguish +// "open, backoff not elapsed" from "probing, no slots left". +void TestHalfOpenFullCounterSeparate() { + std::cout << "\n[TEST] CB: HALF_OPEN_FULL counter separate..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 2; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip → OPEN reject increments generic counter only. 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + slice.TryAcquire(); // REJECTED_OPEN (backoff active) + int64_t rejected_open_only = slice.Rejected(); + int64_t half_open_full_open_only = slice.RejectedHalfOpenFull(); + + // Elapse backoff → HALF_OPEN. Fill the probe budget, then a 3rd + // TryAcquire rejects with half_open_full, incrementing both counters. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + slice.TryAcquire(); // probe 1 admitted + slice.TryAcquire(); // probe 2 admitted (budget full) + slice.TryAcquire(); // REJECTED (full) + int64_t rejected_total = slice.Rejected(); + int64_t half_open_full_total = slice.RejectedHalfOpenFull(); + + bool pass = rejected_open_only == 1 && + half_open_full_open_only == 0 && + rejected_total == 2 && // 1 OPEN + 1 HALF_OPEN_FULL + half_open_full_total == 1; // only the HALF_OPEN one + TestFramework::RecordTest("CB: HALF_OPEN_FULL counter separate", + pass, pass ? "" : + "rej=" + std::to_string(rejected_total) + + " hof=" + std::to_string(half_open_full_total), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN_FULL counter separate", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 2, P2): Reload preserved stale state across enabled +// toggles. Disabling while OPEN and re-enabling later resumed the OPEN state, +// rejecting requests despite an explicit operator off→on cycle. Disabling +// after accumulated consecutive failures would re-trip on the very next +// failure. Fix: reset state to CLOSED whenever enabled toggles. +void TestReloadResetsStateOnEnabledToggleWhileOpen() { + std::cout << "\n[TEST] CB: reload resets state on enabled toggle (while OPEN)..." 
+ << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Drive to OPEN. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + if (slice.CurrentState() != State::OPEN) { + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", false, + "precondition: slice not OPEN", + TestFramework::TestCategory::OTHER); + return; + } + + // Disable via reload — state must reset to CLOSED. + auto disabled = cb; + disabled.enabled = false; + slice.Reload(disabled); + bool disabled_closed = slice.CurrentState() == State::CLOSED; + + // Re-enable via reload — state must remain CLOSED (no stale OPEN). + slice.Reload(cb); + bool reenabled_closed = slice.CurrentState() == State::CLOSED; + + // And the slice must NOT insta-trip on a single failure (pre-fix, + // consecutive_failures_ could have persisted ≥ threshold). + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + bool one_fail_no_trip = slice.CurrentState() == State::CLOSED; + + bool pass = disabled_closed && reenabled_closed && one_fail_no_trip; + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", pass, + pass ? "" : "disabled_closed=" + std::to_string(disabled_closed) + + " reenabled_closed=" + std::to_string(reenabled_closed) + + " one_fail_no_trip=" + std::to_string(one_fail_no_trip), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 2, P2, variant): if disable happens while +// consecutive_failures_ has accumulated but not yet tripped, re-enable would +// inherit that count and trip early on the next failure. 
+void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { + std::cout << "\n[TEST] CB: reload clears consecutive_failures on enable toggle..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 5; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // 4 failures — just under threshold. State still CLOSED. + for (int i = 0; i < 4; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + if (slice.CurrentState() != State::CLOSED) { + TestFramework::RecordTest( + "CB: reload clears consecutive_failures", false, + "precondition: slice not CLOSED", + TestFramework::TestCategory::OTHER); + return; + } + + // Disable then re-enable. + auto disabled = cb; disabled.enabled = false; + slice.Reload(disabled); + slice.Reload(cb); + + // A single failure post-reenable must NOT trip — consecutive_failures_ + // should have been reset to 0, not preserved at 4. + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + bool pass = slice.CurrentState() == State::CLOSED; + TestFramework::RecordTest( + "CB: reload clears consecutive_failures on enable toggle", + pass, + pass ? "" : "expected CLOSED after 1 post-reenable failure, got " + + std::string(circuit_breaker::StateName(slice.CurrentState())), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload clears consecutive_failures on enable toggle", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Threshold-change-only reload (enabled unchanged) MUST preserve live state +// per design §10. Regression guard for fix #1. +void TestReloadThresholdChangePreservesState() { + std::cout << "\n[TEST] CB: reload preserves state when only thresholds change..." 
+ << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + // OPEN. Reload with a tighter threshold but enabled unchanged. + auto tighter = cb; + tighter.consecutive_failure_threshold = 2; + slice.Reload(tighter); + // State must remain OPEN — live state preservation. + bool pass = slice.CurrentState() == State::OPEN; + TestFramework::RecordTest( + "CB: reload preserves state on threshold-only change", + pass, pass ? "" : "expected OPEN, got " + + std::string(circuit_breaker::StateName(slice.CurrentState())), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload preserves state on threshold-only change", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 2, P3): saw_failure short-circuit incorrectly bumped the +// HALF_OPEN_FULL counter, polluting dashboards that need to distinguish +// "probing, no capacity left" from "recovery attempt is failing". +void TestSawFailureDoesNotBumpHalfOpenFullCounter() { + std::cout << "\n[TEST] CB: saw_failure reject does not bump HALF_OPEN_FULL..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 5; // plenty of capacity + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes, fail the first — saw_failure=true, inflight=1. 
+ slice.TryAcquire(); // probe 1 admitted + slice.TryAcquire(); // probe 2 admitted + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); + + int64_t hof_before = slice.RejectedHalfOpenFull(); + // Reject via saw_failure short-circuit (capacity is NOT exhausted — + // only 1 probe actually in flight, and permitted is 5). + Decision d = slice.TryAcquire().decision; + int64_t hof_after = slice.RejectedHalfOpenFull(); + + // Still REJECTED_OPEN (same client-visible outcome), but + // RejectedHalfOpenFull must NOT be incremented — this is a + // "recovery failing" reject, not a capacity reject. + bool pass = d == Decision::REJECTED_OPEN && + hof_before == 0 && + hof_after == 0; + TestFramework::RecordTest( + "CB: saw_failure reject does not bump HALF_OPEN_FULL", + pass, pass ? "" : "hof_before=" + std::to_string(hof_before) + + " hof_after=" + std::to_string(hof_after), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: saw_failure reject does not bump HALF_OPEN_FULL", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 3, P2): TransitionOpenToHalfOpen deliberately left +// `open_until_steady_ns_` populated, violating the documented OpenUntil() +// contract ("zero when not OPEN"). A consumer computing Retry-After +// from a HALF_OPEN slice would compute (stale_deadline - now), which is +// negative once HALF_OPEN begins. +void TestOpenUntilZeroWhenHalfOpen() { + std::cout << "\n[TEST] CB: OpenUntil() zero in HALF_OPEN..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip → OPEN. OpenUntil() must be non-zero (contract: zero iff NOT OPEN). 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, + slice.CurrentGenerationForTesting()); + } + auto open_ns = slice.OpenUntil(); + bool open_nonzero = open_ns != std::chrono::steady_clock::time_point{}; + + // Elapse backoff → HALF_OPEN via TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + auto a = slice.TryAcquire(); + bool halfopen = slice.CurrentState() == State::HALF_OPEN && + a.decision == Decision::ADMITTED_PROBE; + + // Contract: OpenUntil() zero now that state != OPEN. + auto halfopen_ns = slice.OpenUntil(); + bool halfopen_zero = halfopen_ns == std::chrono::steady_clock::time_point{}; + + bool pass = open_nonzero && halfopen && halfopen_zero; + TestFramework::RecordTest( + "CB: OpenUntil() zero in HALF_OPEN", + pass, pass ? "" : + "open_nonzero=" + std::to_string(open_nonzero) + + " halfopen=" + std::to_string(halfopen) + + " halfopen_zero=" + std::to_string(halfopen_zero), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: OpenUntil() zero in HALF_OPEN", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 3, P1): Reload reset the state on enabled toggle but +// gave Report* no way to distinguish pre-toggle admissions from post-toggle +// ones. Stale completions then polluted the fresh CLOSED cycle. Fixed with +// a generation token captured at admission and checked at report. +void TestStaleGenerationReportsDroppedAfterReloadToggle() { + std::cout << "\n[TEST] CB: stale-generation reports dropped after reload toggle..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 3; // make insta-trip detection easy + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Admit 3 requests in the original CLOSED cycle (generation = A). 
+ auto a1 = slice.TryAcquire(); + auto a2 = slice.TryAcquire(); + auto a3 = slice.TryAcquire(); + uint64_t gen_A = a1.generation; + bool same_gen_pre = a2.generation == gen_A && a3.generation == gen_A; + + // Operator toggles: disable then re-enable → fresh CLOSED cycle. + auto disabled = cb; disabled.enabled = false; + slice.Reload(disabled); + slice.Reload(cb); + // After toggle, state is CLOSED and generation has advanced. + uint64_t gen_B = slice.CurrentGenerationForTesting(); + bool generation_advanced = gen_B != gen_A; + + // Late failures from the pre-toggle cycle arrive. Without the fix, + // these would increment consecutive_failures_ and trip the fresh + // cycle IMMEDIATELY (threshold=3, 3 late failures). + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + + // Fresh cycle must be untouched. + bool state_still_closed = slice.CurrentState() == State::CLOSED; + bool stale_counter_bumped = slice.ReportsStaleGeneration() == 3; + + // A fresh post-toggle admission + 3 REAL failures should still trip — + // so the guard didn't over-drop. + auto fresh = slice.TryAcquire(); + for (int i = 0; i < 3; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, fresh.generation); + } + bool fresh_trips = slice.CurrentState() == State::OPEN; + + bool pass = same_gen_pre && generation_advanced && + state_still_closed && stale_counter_bumped && fresh_trips; + TestFramework::RecordTest( + "CB: stale-generation reports dropped after reload toggle", + pass, pass ? 
"" : + "same_gen_pre=" + std::to_string(same_gen_pre) + + " gen_advanced=" + std::to_string(generation_advanced) + + " state_closed=" + std::to_string(state_still_closed) + + " stale_cnt=" + std::to_string(slice.ReportsStaleGeneration()) + + " fresh_trips=" + std::to_string(fresh_trips), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: stale-generation reports dropped after reload toggle", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Generation also advances across state transitions (not just Reload), so +// a report admitted in CLOSED cycle A that completes after OPEN → HALF_OPEN +// → CLOSED cycle B is dropped instead of polluting cycle B's counters. +void TestStaleGenerationReportsDroppedAcrossStateTransitions() { + std::cout << "\n[TEST] CB: stale reports dropped across CLOSED->OPEN->CLOSED..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // CLOSED cycle A — admit a request, capture its generation. + auto admit_A = slice.TryAcquire(); + uint64_t gen_A = admit_A.generation; + + // Drive to OPEN, then HALF_OPEN, then CLOSED (cycle B) via probe success. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, + slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + auto p = slice.TryAcquire(); // probe + slice.ReportSuccess(true, p.generation); + } + bool cycleB_closed = slice.CurrentState() == State::CLOSED; + uint64_t gen_B = slice.CurrentGenerationForTesting(); + bool gen_advanced = gen_B > gen_A; + + // Now the original cycle-A request finally reports a success. 
In a + // world without the generation guard, this would reset cycle B's + // (freshly-zero) consecutive_failures_ and add to cycle B's window, + // polluting fresh telemetry. + int64_t stale_before = slice.ReportsStaleGeneration(); + slice.ReportSuccess(false, gen_A); + int64_t stale_after = slice.ReportsStaleGeneration(); + bool dropped = stale_after == stale_before + 1; + + bool pass = cycleB_closed && gen_advanced && dropped; + TestFramework::RecordTest( + "CB: stale reports dropped across CLOSED->OPEN->CLOSED", + pass, pass ? "" : + "cycleB_closed=" + std::to_string(cycleB_closed) + + " gen_advanced=" + std::to_string(gen_advanced) + + " dropped=" + std::to_string(dropped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: stale reports dropped across CLOSED->OPEN->CLOSED", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 4, P2): Reload that resizes the rolling window without +// toggling enabled cleared the window buckets but left generation_ unchanged. +// Late reports from pre-reload admissions would carry the still-current +// generation, pass the guard, and re-populate the freshly empty window — +// mixing pre-reload and post-reload traffic. A pre-reload + post-reload +// failure pair could satisfy minimum_volume / failure_rate immediately on +// what should be a fresh observation cycle. +void TestWindowResizeAdvancesGeneration() { + std::cout << "\n[TEST] CB: window resize advances generation..." << std::endl; + try { + // Use rate-trip path only (high consec threshold disables that path), + // a low minimum_volume so 2 failures suffice, and a high + // failure_rate_threshold so the trip relies on the rate calc. 
+ CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consecutive path + cb.failure_rate_threshold = 50; + cb.minimum_volume = 2; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Pre-reload: admit a request and capture its generation. + auto admit_pre = slice.TryAcquire(); + uint64_t gen_pre = admit_pre.generation; + + // Reload: change window_seconds but keep enabled=true. Window is + // resized (cleared) and generation MUST advance so the pre-reload + // admission's late report doesn't seed the new window. + auto resized = cb; + resized.window_seconds = 30; + slice.Reload(resized); + + uint64_t gen_post = slice.CurrentGenerationForTesting(); + bool gen_advanced = gen_post != gen_pre; + + // The pre-reload admission completes (failure). Without the fix, + // this would add one failure to the freshly-empty window. Then + // a post-reload admission's failure brings total=2 >= minimum_volume, + // failures=2/2=100% >= 50% → IMMEDIATE TRIP on a fresh window. + // With the fix, the pre-reload report is dropped (counted as stale). + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + + int64_t stale_after_pre = slice.ReportsStaleGeneration(); + + // Now a real post-reload admission and failure — single failure in + // a fresh window of size 30s. total=1, below minimum_volume=2 → no trip. + auto admit_post = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, admit_post.generation); + + bool state_still_closed = slice.CurrentState() == State::CLOSED; + bool stale_dropped = stale_after_pre == 1; + + bool pass = gen_advanced && state_still_closed && stale_dropped; + TestFramework::RecordTest( + "CB: window resize advances generation", + pass, pass ? 
"" : + "gen_advanced=" + std::to_string(gen_advanced) + + " state_closed=" + std::to_string(state_still_closed) + + " stale_count=" + std::to_string(slice.ReportsStaleGeneration()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize advances generation", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Regression guard: a reload that changes only thresholds (no window resize, +// no enabled toggle) MUST preserve generation. Operator intent is "apply new +// thresholds to existing observations" — the round-4 fix's window-resize +// generation bump must NOT trigger here. +void TestThresholdOnlyReloadDoesNotAdvanceGeneration() { + std::cout << "\n[TEST] CB: threshold-only reload preserves generation..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto admit = slice.TryAcquire(); + uint64_t gen_pre = admit.generation; + + // Tighten thresholds; same enabled, same window_seconds. + auto tightened = cb; + tightened.consecutive_failure_threshold = 2; + tightened.failure_rate_threshold = 30; + slice.Reload(tightened); + + uint64_t gen_post = slice.CurrentGenerationForTesting(); + bool gen_preserved = gen_post == gen_pre; + + // The pre-reload admission's report should NOT be dropped — operator + // wants the new thresholds applied to existing in-flight observations. + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + bool stale_zero = slice.ReportsStaleGeneration() == 0; + + bool pass = gen_preserved && stale_zero; + TestFramework::RecordTest( + "CB: threshold-only reload preserves generation", + pass, pass ? 
"" : + "gen_preserved=" + std::to_string(gen_preserved) + + " stale_zero=" + std::to_string(stale_zero), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: threshold-only reload preserves generation", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 5, P1): Reload with window_seconds change while the +// slice is HALF_OPEN used to bump the single `generation_`, invalidating +// every in-flight probe. Those probes' late Report* calls then dropped +// WITHOUT decrementing half_open_inflight_, wedging the slice in HALF_OPEN +// with all probe slots stuck "in flight" forever — subsequent TryAcquires +// rejected with half_open_full indefinitely until another full reset. +// +// Fix: split generation into closed_gen_ (non-probe, CLOSED-state data) +// and halfopen_gen_ (probe, HALF_OPEN-state data). window_seconds reload +// bumps only closed_gen_ because it only resets CLOSED-state data. +void TestWindowResizeDuringHalfOpenDoesNotStrandProbes() { + std::cout << "\n[TEST] CB: window resize during HALF_OPEN preserves probes..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 3; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Drive to HALF_OPEN. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, + slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit all 3 probes (capture their admission tokens). + auto p1 = slice.TryAcquire(); + auto p2 = slice.TryAcquire(); + auto p3 = slice.TryAcquire(); + bool all_admitted_probe = p1.decision == Decision::ADMITTED_PROBE && + p2.decision == Decision::ADMITTED_PROBE && + p3.decision == Decision::ADMITTED_PROBE; + + // Reload window_seconds (enabled unchanged). 
PRE-fix: bumps single + // generation, invalidates p1/p2/p3 probes → stranded. POST-fix: + // bumps only closed_gen_, probe tokens still match halfopen_gen_. + auto resized = cb; + resized.window_seconds = 30; + slice.Reload(resized); + + // closed_gen advanced, halfopen_gen preserved. + bool closed_gen_advanced = slice.CurrentClosedGenForTesting() != + p1.generation; // p1 was admitted in HALF_OPEN + // but let's check against gen + // we'd have captured in CLOSED + // Actually, directly: probes tokens must still match halfopen_gen_. + bool probe_gen_preserved = + p1.generation == slice.CurrentHalfOpenGenForTesting() && + p2.generation == slice.CurrentHalfOpenGenForTesting() && + p3.generation == slice.CurrentHalfOpenGenForTesting(); + + // Probes report success — each must be accepted and advance the + // HALF_OPEN → CLOSED transition. + slice.ReportSuccess(true, p1.generation); + slice.ReportSuccess(true, p2.generation); + slice.ReportSuccess(true, p3.generation); + + // After 3 probe successes at permitted_half_open_calls=3, slice + // MUST have transitioned to CLOSED. Pre-fix: probes dropped, no + // progression, still HALF_OPEN with inflight stuck at 3. + bool closed_now = slice.CurrentState() == State::CLOSED; + // None of the probes were dropped as stale. + bool no_stale_drops = slice.ReportsStaleGeneration() == 0; + // All 3 probe successes counted. + bool all_probes_counted = slice.ProbeSuccesses() == 3; + + bool pass = all_admitted_probe && probe_gen_preserved && + closed_now && no_stale_drops && all_probes_counted; + (void)closed_gen_advanced; // (informational only) + + TestFramework::RecordTest( + "CB: window resize during HALF_OPEN preserves probes", + pass, pass ? 
"" : + "admitted=" + std::to_string(all_admitted_probe) + + " probe_gen_preserved=" + std::to_string(probe_gen_preserved) + + " closed_now=" + std::to_string(closed_now) + + " stale=" + std::to_string(slice.ReportsStaleGeneration()) + + " probe_success=" + std::to_string(slice.ProbeSuccesses()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize during HALF_OPEN preserves probes", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Companion guard: window_seconds reload MUST still invalidate pre-reload +// CLOSED (non-probe) admissions. Ensures the split-gen didn't weaken the +// round-4 fix. +void TestWindowResizeStillInvalidatesClosedAdmissions() { + std::cout << "\n[TEST] CB: window resize invalidates CLOSED admissions..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.failure_rate_threshold = 50; + cb.minimum_volume = 2; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto admit_pre = slice.TryAcquire(); + uint64_t gen_pre = admit_pre.generation; + + auto resized = cb; resized.window_seconds = 30; + slice.Reload(resized); + + // Pre-reload CLOSED admission reports — must drop as stale. + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + bool dropped_stale = slice.ReportsStaleGeneration() == 1; + + // And state must remain CLOSED (pre-reload failure did NOT seed window). 
+ auto admit_post = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, admit_post.generation); + bool still_closed = slice.CurrentState() == State::CLOSED; + + bool pass = dropped_stale && still_closed; + TestFramework::RecordTest( + "CB: window resize invalidates CLOSED admissions", + pass, pass ? "" : + "dropped=" + std::to_string(dropped_stale) + + " closed=" + std::to_string(still_closed), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize invalidates CLOSED admissions", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 7, P2): Reload() lowering permitted_half_open_calls +// while a HALF_OPEN cycle is active could close the breaker early and +// discard failures from already-admitted probes. +// +// Scenario (5-probe cycle reloaded down to 1): +// TransitionOpenToHalfOpen: snapshot=5, admit 5 probes. +// Reload: permitted_half_open_calls → 1. +// First success arrives → half_open_successes_=1 ≥ NEW limit (1) +// → TransitionHalfOpenToClosed() fires → halfopen_gen_ bumped. +// Remaining 4 admitted probes are now stale → their failures DROPPED. +// Breaker falsely closes even though 4 probes have not reported yet. +// +// Fix: snapshot config_.permitted_half_open_calls into +// half_open_permitted_snapshot_ at TransitionOpenToHalfOpen time. +// TryAcquire (slot gate) and ReportSuccess (close check) both use the +// snapshot so the cycle budget is frozen for its lifetime. +void TestHalfOpenBudgetFrozenAcrossReload() { + std::cout << "\n[TEST] CB: HALF_OPEN budget frozen across mid-cycle reload..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 100; // disable rate-trip + cb.minimum_volume = 1000; // disable rate-trip + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; // exactly 2 probes for clean drain + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip the breaker. + for (int i = 0; i < 5; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool is_open = slice.CurrentState() == State::OPEN; + + // Advance past open_until → OPEN→HALF_OPEN on next TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit both probes (budget=2; snapshot set to 2 at TransitionOpenToHalfOpen). + auto a0 = slice.TryAcquire(); + auto a1 = slice.TryAcquire(); + bool both_probes = (a0.decision == Decision::ADMITTED_PROBE) && + (a1.decision == Decision::ADMITTED_PROBE); + bool is_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // Lower the limit to 1 mid-cycle. + auto lowered = cb; + lowered.permitted_half_open_calls = 1; + slice.Reload(lowered); + + // First probe succeeds. + // Without fix: successes(1) >= NEW config(1) → TransitionHalfOpenToClosed + // → halfopen_gen_ bumped → second probe's failure DROPPED + // → breaker falsely CLOSED. + // With fix: successes(1) >= snapshot(2) is false → stays HALF_OPEN. + slice.ReportSuccess(true, a0.generation); + bool not_closed_after_one = slice.CurrentState() == State::HALF_OPEN; + + // Second probe fails. inflight drops to 0 → TripHalfOpenToOpen fires. 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, true, a1.generation); + bool retripped = slice.CurrentState() == State::OPEN; + + bool pass = is_open && both_probes && is_halfopen && + not_closed_after_one && retripped; + TestFramework::RecordTest( + "CB: HALF_OPEN budget frozen across mid-cycle reload", + pass, pass ? "" : + "is_open=" + std::to_string(is_open) + + " both_probes=" + std::to_string(both_probes) + + " is_halfopen=" + std::to_string(is_halfopen) + + " not_closed_after_one=" + std::to_string(not_closed_after_one) + + " retripped=" + std::to_string(retripped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN budget frozen across mid-cycle reload", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 6, P2): Reload with window_seconds change preserved +// consecutive_failures_ while bumping closed_gen_. Pre-reload CLOSED +// reports are correctly blocked (stale gen), but they can no longer +// clear or advance consecutive_failures_ either. The counter becomes an +// orphaned relic from a prior observation cycle: +// +// Scenario: 4 consecutive failures (threshold=5), reload window_seconds. +// Pre-reload success arrives → stale gen → DROPPED. +// Without fix: consecutive_failures_ stays at 4. +// Next real failure: consecutive_failures_ = 5 → SPURIOUS TRIP. +// +// Fix: reset consecutive_failures_ = 0 in the same branch that clears +// the window on resize. Both are CLOSED-domain state from the same +// observation cycle; invalidating one without resetting the other leaves +// an inconsistent counter. +void TestWindowResizeResetConsecutiveFailures() { + std::cout << "\n[TEST] CB: window resize resets consecutive_failures_..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 100; // rate-trip disabled (100% threshold) + cb.minimum_volume = 1000; // rate-trip disabled (high volume gate) + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Accumulate 4 consecutive failures (one below the threshold of 5). + for (int i = 0; i < 4; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool pre_reload_closed = slice.CurrentState() == State::CLOSED; + + // Capture a pre-reload admission. + auto pre_admit = slice.TryAcquire(); + uint64_t pre_gen = pre_admit.generation; + + // Window-only reload: wipes the rate window, bumps closed_gen_, + // and (with the fix) resets consecutive_failures_ to 0. + auto resized = cb; + resized.window_seconds = 30; + slice.Reload(resized); + + // Pre-reload success arrives late — must be dropped (stale gen). + slice.ReportSuccess(false, pre_gen); + bool stale_dropped = slice.ReportsStaleGeneration() == 1; + + // Verify consecutive_failures_ was reset: one real post-reload failure + // must NOT trip the breaker (counter is 1/5, not 5/5). + auto post_admit = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, post_admit.generation); + bool no_spurious_trip = slice.CurrentState() == State::CLOSED; + + bool pass = pre_reload_closed && stale_dropped && no_spurious_trip; + TestFramework::RecordTest( + "CB: window resize resets consecutive_failures_", + pass, pass ? 
"" : + "pre_reload_closed=" + std::to_string(pre_reload_closed) + + " stale_dropped=" + std::to_string(stale_dropped) + + " no_spurious_trip=" + std::to_string(no_spurious_trip), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize resets consecutive_failures_", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 9, P2-1): ReportFailure captured Now() separately in +// AddFailure() and ShouldTripClosed()'s internal TotalCount/FailureCount +// calls. If a second boundary elapsed between the two calls, Advance() could +// wipe the just-recorded failure — with window_seconds=1, the 1-second delta +// hits the delta >= window_seconds full-reset path and the failure +// disappears before the trip evaluation runs. Fix: capture Now() once in +// ReportFailure and thread it through ShouldTripClosed(now), AddFailure(now). +// +// Regression test injects a time source that returns T on the first call +// and T+1s on every subsequent call, simulating the boundary crossing. +// Post-fix, ReportFailure only calls Now() once — the fix is effective. +// Pre-fix, the second Now() call inside ShouldTripClosed would advance the +// ring and wipe the failure → no trip. +void TestReportFailureUsesOneTimestampAcrossTripEval() { + std::cout << "\n[TEST] CB: ReportFailure uses single timestamp for trip eval..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.failure_rate_threshold = 100; // rate=100% to trip on fail + cb.minimum_volume = 1; // single failure suffices + cb.window_seconds = 1; // boundary-sensitive + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + // Time source returns base on call #1 and base+1s on every call after. 
+ // This simulates a clock tick between AddFailure (call 1) and any
+ // subsequent Now() inside ShouldTripClosed (call 2+).
+ auto base = std::chrono::steady_clock::time_point(
+ std::chrono::seconds(1'000'000));
+ int call_count = 0;
+ auto time_source = [&call_count, base]() {
+ int n = call_count++;
+ return n == 0 ? base : base + std::chrono::seconds(1);
+ };
+ CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, time_source);
+
+ // Admit + fail one request.
+ // Pre-fix trace (BUGGY): AddFailure(base) records in bucket[0]. Then
+ // ShouldTripClosed()'s internal TotalCount(base+1s) calls Advance
+ // → delta=1 >= window=1 → full reset wipes the bucket → total=0 <
+ // minimum_volume=1 → NO TRIP. Rate trip missed.
+ // Post-fix: ReportFailure captures Now() once (=base), passes to
+ // AddFailure(base) AND ShouldTripClosed(base). Ring stays aligned;
+ // total=1, failures=1 → rate fires → TRIP to OPEN.
+ auto a = slice.TryAcquire();
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation);
+
+ bool pass = slice.CurrentState() == State::OPEN;
+ TestFramework::RecordTest(
+ "CB: ReportFailure uses single timestamp for trip eval",
+ pass, pass ? "" :
+ "expected OPEN, got state=" +
+ std::to_string(static_cast<int>(slice.CurrentState())),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB: ReportFailure uses single timestamp for trip eval",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// BUG (review round 8, P2): CircuitBreakerWindow's constructor allocated
+// `max(1, window_seconds)` buckets but stored the RAW window_seconds_ value.
+// Programmatic callers bypassing ConfigLoader::Validate() (tests, future
+// direct users) that passed window_seconds <= 0 would trigger BucketIndex's
+// `% window_seconds_` on the first Add*/TotalCount call — dividing by zero
+// for 0, or violating ring math for negatives. Resize() already clamped.
+// Fix: constructor applies the same clamp so both entry points are symmetric. +void TestWindowNonPositiveWindowSizeClamp() { + std::cout << "\n[TEST] CB: window ctor clamps non-positive sizes..." + << std::endl; + try { + // Zero would have crashed on % 0 before the fix. + CircuitBreakerWindow w0(0); + auto t = std::chrono::steady_clock::time_point(std::chrono::seconds(1000)); + w0.AddSuccess(t); + w0.AddFailure(t); + bool zero_ok = (w0.TotalCount(t) == 2) && (w0.FailureCount(t) == 1); + + // Negative values would have violated the ring math. + CircuitBreakerWindow wn(-5); + wn.AddSuccess(t); + bool negative_ok = wn.TotalCount(t) == 1; + + bool pass = zero_ok && negative_ok; + TestFramework::RecordTest( + "CB: window ctor clamps non-positive sizes", + pass, pass ? "" : + "zero_ok=" + std::to_string(zero_ok) + + " negative_ok=" + std::to_string(negative_ok), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window ctor clamps non-positive sizes", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 9, P3): CircuitBreakerSlice copied permitted_half_open_calls +// into the HALF_OPEN snapshot verbatim. For programmatic callers bypassing +// ConfigLoader::Validate() (same class as the window ctor clamp), a zero or +// negative budget would permanently wedge the breaker in HALF_OPEN: +// TryAcquire (HALF_OPEN, case B): half_open_inflight_(0) >= snapshot(0) +// → every probe rejected as half_open_full → no probe ever admitted +// → no report ever fires → half_open_inflight_ stays at 0 forever. +// +// Fix: clamp the snapshot to min 1 at TransitionOpenToHalfOpen. Symmetric +// with CircuitBreakerWindow's constructor clamp from round 8. +void TestHalfOpenClampsNonPositiveProbeBudget() { + std::cout << "\n[TEST] CB: HALF_OPEN clamps non-positive probe budget..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 0; // bypasses Validate() — direct ctor + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + + // Advance past open_until → OPEN→HALF_OPEN on next TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // First TryAcquire triggers the transition. With the clamp, snapshot=1 + // and this probe is admitted. Without the clamp, snapshot=0 → rejected + // as half_open_full → breaker stuck forever. + auto a0 = slice.TryAcquire(); + bool probe_admitted = a0.decision == Decision::ADMITTED_PROBE; + + // A successful probe closes the cycle (successes(1) >= snapshot(1)). + // Without the clamp this branch would never execute. + if (probe_admitted) { + slice.ReportSuccess(true, a0.generation); + } + bool recovered = slice.CurrentState() == State::CLOSED; + + bool pass = probe_admitted && recovered; + TestFramework::RecordTest( + "CB: HALF_OPEN clamps non-positive probe budget", + pass, pass ? "" : + "probe_admitted=" + std::to_string(probe_admitted) + + " recovered=" + std::to_string(recovered), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN clamps non-positive probe budget", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 10, P1): TryAcquire gated HALF_OPEN admission on +// half_open_inflight_, so a probe slot was reused once an earlier probe +// completed. 
With permitted_half_open_calls=2: +// +// admit A → inflight=1, admitted=1 +// admit B → inflight=2, admitted=2 +// Report success on A → inflight=1, successes=1 +// admit C → inflight(1) < snapshot(2) → ACCEPTED (BUG: 3rd admission) +// Report success on B → inflight=0, successes=2 +// successes(2) >= snapshot(2) → TransitionHalfOpenToClosed fires +// → halfopen_gen_ bumped → C's eventual failure DROPPED as stale +// → breaker falsely marked recovered despite the probe failing. +// +// Fix: gate on half_open_admitted_ (total cycle admissions, never +// decrements) instead of half_open_inflight_. The cycle can admit at most +// `snapshot` probes total, regardless of how quickly earlier probes drain. +void TestHalfOpenDoesNotReuseProbeSlots() { + std::cout << "\n[TEST] CB: HALF_OPEN does not reuse probe slots..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes (budget=2). + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + bool both_admitted = a.decision == Decision::ADMITTED_PROBE && + b.decision == Decision::ADMITTED_PROBE; + + // Report success on A — freeing its inflight slot. + slice.ReportSuccess(true, a.generation); + bool still_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // Third admission attempt. With the fix: admitted(2) >= snapshot(2) + // → REJECTED. 
Without the fix: inflight(1) < snapshot(2) → ADMITTED, + // creating a ghost probe. + auto c = slice.TryAcquire(); + bool third_rejected = c.decision == Decision::REJECTED_OPEN; + + // Close the cycle by succeeding B. + slice.ReportSuccess(true, b.generation); + bool closed = slice.CurrentState() == State::CLOSED; + + // Verify no stale-generation reports accumulated — if the 3rd admission + // had slipped through, its (dropped) report after the close would have + // bumped this counter. Since the admission is now rejected up front, + // this should stay zero. + bool no_stale_reports = slice.ReportsStaleGeneration() == 0; + + bool pass = both_admitted && still_halfopen && third_rejected && + closed && no_stale_reports; + TestFramework::RecordTest( + "CB: HALF_OPEN does not reuse probe slots", + pass, pass ? "" : + "both_admitted=" + std::to_string(both_admitted) + + " still_halfopen=" + std::to_string(still_halfopen) + + " third_rejected=" + std::to_string(third_rejected) + + " closed=" + std::to_string(closed) + + " no_stale_reports=" + std::to_string(no_stale_reports), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN does not reuse probe slots", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 11, P1): Admission contract has ReportSuccess and +// ReportFailure but no path for probes that complete without touching the +// upstream (POOL_EXHAUSTED after probe admission, shutdown, client +// disconnect, PARSE_ERROR). Following the §7 "don't report these as +// failures" contract strictly, such probes would leak their inflight slot +// forever — once half_open_admitted_ reaches snapshot, all further +// admissions reject as half_open_full and nothing ever drains the cycle, +// wedging the slice in HALF_OPEN. 
+// +// Fix: ReportNeutral decrements BOTH inflight (so the last-probe re-trip +// still fires) and admitted (so a replacement probe can still exercise +// the upstream within the cycle budget). No touch to successes / fails. +void TestReportNeutralReleasesProbeSlot() { + std::cout << "\n[TEST] CB: ReportNeutral releases probe slot..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN, advance past backoff, fully consume probe budget. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + bool both_probes = a.decision == Decision::ADMITTED_PROBE && + b.decision == Decision::ADMITTED_PROBE; + + // Budget full: 3rd admission rejected. + auto pre_release = slice.TryAcquire(); + bool budget_full_before = pre_release.decision == Decision::REJECTED_OPEN; + + // Neutral-release A: slot returns, replacement probe fits within budget. + slice.ReportNeutral(true, a.generation); + + auto c = slice.TryAcquire(); + bool replacement_admitted = c.decision == Decision::ADMITTED_PROBE; + + // Cycle completes cleanly via B + C successes → CLOSED. + slice.ReportSuccess(true, b.generation); + slice.ReportSuccess(true, c.generation); + bool closed = slice.CurrentState() == State::CLOSED; + + // Neutral release must NOT have bumped probe_failures / probe_successes. 
+ bool counters_clean = slice.ProbeSuccesses() == 2 && + slice.ProbeFailures() == 0; + + bool pass = both_probes && budget_full_before && + replacement_admitted && closed && counters_clean; + TestFramework::RecordTest( + "CB: ReportNeutral releases probe slot", + pass, pass ? "" : + "both_probes=" + std::to_string(both_probes) + + " budget_full_before=" + std::to_string(budget_full_before) + + " replacement_admitted=" + std::to_string(replacement_admitted) + + " closed=" + std::to_string(closed) + + " counters_clean=" + std::to_string(counters_clean), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ReportNeutral releases probe slot", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Companion: a neutral release that drains the last in-flight probe AFTER +// a sibling failure must still trigger the HALF_OPEN→OPEN re-trip. Without +// this last-probe hook in ReportNeutral, the slice would wedge in HALF_OPEN +// with saw_failure=true rejecting every admission via Case A. +void TestReportNeutralLastProbeAfterFailureReTrips() { + std::cout << "\n[TEST] CB: ReportNeutral re-trips as last probe after sibling fail..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + + // A fails → saw_failure=true, inflight=1 (B still running), no re-trip yet. + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, a.generation); + bool still_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // B neutral-releases → last in-flight drains. With the fix, the + // sibling-failure + last-probe hook fires TripHalfOpenToOpen. + slice.ReportNeutral(true, b.generation); + bool retripped = slice.CurrentState() == State::OPEN; + + bool pass = still_halfopen && retripped; + TestFramework::RecordTest( + "CB: ReportNeutral re-trips as last probe after sibling fail", + pass, pass ? "" : + "still_halfopen=" + std::to_string(still_halfopen) + + " retripped=" + std::to_string(retripped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ReportNeutral re-trips as last probe after sibling fail", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 12, P2): ComputeOpenDuration read base/max durations +// straight from config_, so a programmatic caller bypassing +// ConfigLoader::Validate() with base_open_duration_ms <= 0 or max < base +// would compute scaled_ms <= 0. 
open_until = now + 0 → next TryAcquire +// sees now_ns >= open_until_ns → transition to HALF_OPEN immediately. +// The breaker never actually backed off. Fix: clamp base to >= 1ms and +// max to >= base at the compute site, matching the window and probe +// budget clamps. +void TestComputeOpenDurationClampsInvalidBase() { + std::cout << "\n[TEST] CB: ComputeOpenDuration clamps invalid base/max..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 1; + cb.base_open_duration_ms = 0; // bypass — would kill backoff + cb.max_open_duration_ms = 0; // bypass — would kill backoff + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool is_open = slice.CurrentState() == State::OPEN; + + // Immediate TryAcquire: clock hasn't moved, so if the clamp holds + // (open_until >= now + 1ms), this MUST reject as "open" (not drain + // to HALF_OPEN). Without the fix, scaled_ms=0 → open_until==now → + // admission path immediately transitions to HALF_OPEN. + auto immediate = slice.TryAcquire(); + bool rejected_as_open = immediate.decision == Decision::REJECTED_OPEN; + bool still_open = slice.CurrentState() == State::OPEN; + + bool pass = is_open && rejected_as_open && still_open; + TestFramework::RecordTest( + "CB: ComputeOpenDuration clamps invalid base/max", + pass, pass ? 
"" : + "is_open=" + std::to_string(is_open) + + " rejected_as_open=" + std::to_string(rejected_as_open) + + " still_open=" + std::to_string(still_open), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ComputeOpenDuration clamps invalid base/max", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestTransitionCallbackInvoked() { + std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + int closed_to_open = 0; + int open_to_halfopen = 0; + int halfopen_to_closed = 0; + slice.SetTransitionCallback( + [&](State o, State n, const char*) { + if (o == State::CLOSED && n == State::OPEN) closed_to_open++; + else if (o == State::OPEN && n == State::HALF_OPEN) open_to_halfopen++; + else if (o == State::HALF_OPEN && n == State::CLOSED) halfopen_to_closed++; + }); + + // Full cycle. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + slice.TryAcquire(); + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); + } + bool pass = closed_to_open == 1 && open_to_halfopen == 1 && + halfopen_to_closed == 1; + TestFramework::RecordTest("CB: transition callback invoked", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: transition callback invoked", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Run all circuit breaker unit tests. 
// Registry of every circuit-breaker unit test. Ordering is roughly
// chronological by review round; later entries regression-test fixes
// from rounds 8-12 described above.
void RunAllTests() {
  std::cout << "\n" << std::string(60, '=') << std::endl;
  std::cout << "CIRCUIT BREAKER - UNIT TESTS" << std::endl;
  std::cout << std::string(60, '=') << std::endl;

  TestDisabledFastPath();
  TestClosedStaysClosedBelowConsecutiveThreshold();
  TestConsecutiveFailureTrip();
  TestFailureRateTrip();
  TestMinimumVolumeGate();
  TestOpenBeforeDurationStaysOpen();
  TestOpenToHalfOpenAfterDuration();
  TestHalfOpenAllProbesSucceed();
  TestHalfOpenProbeFailureReopens();
  TestHalfOpenExhaustedSlotsRejected();
  TestExponentialBackoff();
  TestResetOnClose();
  TestWindowBucketByCurrentSecond();
  TestWindowAdvanceSkipsStale();
  TestWindowPartialExpiry();
  TestWindowReset();
  TestDryRunAdmits();
  TestReloadPreservesState();
  TestConsecutiveThresholdOne();
  TestSuccessClearsConsecutive();
  TestLateFailureAfterTripDoesNotInflateBackoff();
  TestLateSuccessAfterTripIgnored();
  TestHalfOpenStopsAdmittingAfterFirstProbeFailure();
  TestHalfOpenFullCounterSeparate();
  TestReloadResetsStateOnEnabledToggleWhileOpen();
  TestReloadResetsConsecutiveFailuresOnEnabledToggle();
  TestReloadThresholdChangePreservesState();
  TestSawFailureDoesNotBumpHalfOpenFullCounter();
  TestOpenUntilZeroWhenHalfOpen();
  TestStaleGenerationReportsDroppedAfterReloadToggle();
  TestStaleGenerationReportsDroppedAcrossStateTransitions();
  TestWindowResizeAdvancesGeneration();
  TestThresholdOnlyReloadDoesNotAdvanceGeneration();
  TestWindowResizeDuringHalfOpenDoesNotStrandProbes();
  TestWindowResizeStillInvalidatesClosedAdmissions();
  TestWindowResizeResetConsecutiveFailures();
  TestHalfOpenBudgetFrozenAcrossReload();
  TestWindowNonPositiveWindowSizeClamp();
  TestReportFailureUsesOneTimestampAcrossTripEval();
  TestHalfOpenClampsNonPositiveProbeBudget();
  TestHalfOpenDoesNotReuseProbeSlots();
  TestReportNeutralReleasesProbeSlot();
  TestReportNeutralLastProbeAfterFailureReTrips();
  TestComputeOpenDurationClampsInvalidBase();
  TestTransitionCallbackInvoked();
}

} // namespace CircuitBreakerTests
diff --git a/test/circuit_breaker_wait_queue_drain_test.h b/test/circuit_breaker_wait_queue_drain_test.h
new file mode 100644
index 00000000..d2200094
--- /dev/null
+++ b/test/circuit_breaker_wait_queue_drain_test.h
@@ -0,0 +1,261 @@
#pragma once

// Wait-queue-drain integration tests: wait-queue drain on CLOSED → OPEN trip.
//
// The integration suite covers "new requests after a trip hit
// REJECTED_OPEN". This suite covers the orthogonal case: a request that passed ConsultBreaker
// pre-trip and is waiting in the pool's bounded wait queue when the trip
// fires. Without the drain, that waiter would sit until either the pool
// frees a slot (and then re-hit the upstream — pointless traffic) or the
// queue-timeout / open-duration elapses (up to 60s latency spike).
//
// Mechanism tested: `HttpServer::MarkServerReady` installs a transition
// callback on every slice that routes CLOSED → OPEN to the corresponding
// `PoolPartition::DrainWaitQueueOnTrip()`. Each waiter receives
// `CHECKOUT_CIRCUIT_OPEN`, which `ProxyTransaction::OnCheckoutError` maps
// to the standard circuit-open response (503 + `X-Circuit-Breaker: open`).
//
// Strategy: gate concurrency via a 1-connection pool. The first request
// hangs at the backend long enough to let a second request queue behind
// it. When the first's response lands (502), the breaker trips and the
// drain fires, causing the queued request to receive 503 + circuit-open
// headers instead of the backend's 502 (which would happen if the drain
// were missing and the queued request proceeded).
+ +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include +#include + +namespace CircuitBreakerWaitQueueDrainTests { + +static UpstreamConfig MakeDrainTripUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + // Single connection per partition — forces the second concurrent + // request to queue behind the first. Since tests run with + // worker_threads=1, one partition exists and it has exactly one + // connection slot. + u.pool.max_connections = 1; + u.pool.max_idle_connections = 1; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 5000; + u.proxy.retry.max_retries = 0; // Deterministic — no retry confounds. + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = 1; // Trip on first 5xx. + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration so the drain is unambiguously the thing that + // surfaces the 503 to the queued client — not a timer-driven + // HALF_OPEN recovery admitting a subsequent attempt. + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: CLOSED→OPEN trip drains queued waiter with 503 + X-Circuit-Breaker. +// +// Request A takes the single pool slot and hangs at the backend for ~300ms. +// Request B queues (pool exhausted). 
At t≈300ms, A's backend response +// arrives: 502 → slice trip → transition callback → DrainWaitQueueOnTrip → +// B's error_callback fires with CHECKOUT_CIRCUIT_OPEN. B's client receives +// 503 + `X-Circuit-Breaker: open`. +// +// Pre-fix (no drain): B waits ~300ms for A's slot to free, then hits the +// backend itself, gets 502, client sees 502 — NOT 503 and NOT +// X-Circuit-Breaker: open. The assertion `is_503 && has_breaker_header` +// fails without the drain wiring. +// --------------------------------------------------------------------------- +void TestWaitQueueDrainedOnTrip() { + std::cout << "\n[TEST] CB Wait-Queue Drain: wait queue drained on trip..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + // Delay so the gateway's pool holds the connection long + // enough for a second client request to queue on it. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; // Single partition → single wait queue. + gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/true)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Launch A first (takes the one connection), then B 50ms later + // so B is guaranteed to enter the wait queue. 
+ std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // A unambiguously hits the backend (owns the slot) and sees 502. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + // B must see the circuit-open short-circuit from the drain — + // NOT a 502 from the backend, which is what happens without + // the drain wiring. + bool b_is_503 = TestHttpClient::HasStatus(rb, 503); + bool b_has_breaker_hdr = + rb.find("X-Circuit-Breaker: open") != std::string::npos || + rb.find("x-circuit-breaker: open") != std::string::npos; + // Exactly one backend hit — B was drained before making it to + // the upstream. Without the drain, backend_hits would be 2. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_hit = (hits == 1); + + bool pass = a_is_502 && b_is_503 && b_has_breaker_hdr && single_hit; + TestFramework::RecordTest( + "CB Wait-Queue Drain: wait queue drained on trip", pass, + pass ? "" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_503=" + std::to_string(b_is_503) + + " b_breaker_hdr=" + std::to_string(b_has_breaker_hdr) + + " backend_hits=" + std::to_string(hits) + + " rb_head=" + rb.substr(0, 200)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Wait-Queue Drain: wait queue drained on trip", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: With the breaker disabled, the drain does NOT fire — the queued +// waiter proceeds to the upstream as it would absent the circuit-breaker +// layer entirely. 
+// +// Same setup as Test 1 but `circuit_breaker.enabled=false`. Disabled slices +// short-circuit in TryAcquire and never invoke transition callbacks, so +// DrainWaitQueueOnTrip is never called. Request B must hit the backend +// (backend_hits == 2) and receive the upstream's 502 — NOT a 503. +// --------------------------------------------------------------------------- +void TestDisabledBreakerDoesNotDrain() { + std::cout << "\n[TEST] CB Wait-Queue Drain: disabled breaker does not drain..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/false)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // Both reach the backend — disabled breaker = no drain. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + bool b_is_502 = TestHttpClient::HasStatus(rb, 502); + // Neither should carry the circuit-open header. 
+ bool no_breaker_on_a = + ra.find("X-Circuit-Breaker") == std::string::npos && + ra.find("x-circuit-breaker") == std::string::npos; + bool no_breaker_on_b = + rb.find("X-Circuit-Breaker") == std::string::npos && + rb.find("x-circuit-breaker") == std::string::npos; + int hits = backend_hits.load(std::memory_order_relaxed); + bool two_hits = (hits == 2); + + bool pass = a_is_502 && b_is_502 && no_breaker_on_a && + no_breaker_on_b && two_hits; + TestFramework::RecordTest( + "CB Wait-Queue Drain: disabled breaker does not drain", pass, + pass ? "" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_502=" + std::to_string(b_is_502) + + " no_breaker_on_a=" + std::to_string(no_breaker_on_a) + + " no_breaker_on_b=" + std::to_string(no_breaker_on_b) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Wait-Queue Drain: disabled breaker does not drain", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - WAIT-QUEUE DRAIN ON TRIP TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestWaitQueueDrainedOnTrip(); + TestDisabledBreakerDoesNotDrain(); +} + +} // namespace CircuitBreakerWaitQueueDrainTests diff --git a/test/config_test.h b/test/config_test.h index cfb90c7a..778f464b 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -348,6 +348,274 @@ namespace ConfigTests { } } + // Test 9: Circuit breaker defaults + void TestCircuitBreakerDefaults() { + std::cout << "\n[TEST] Circuit Breaker Defaults..." 
<< std::endl; + try { + CircuitBreakerConfig cb; // value-initialized defaults + bool pass = cb.enabled == false && + cb.dry_run == false && + cb.consecutive_failure_threshold == 5 && + cb.failure_rate_threshold == 50 && + cb.minimum_volume == 20 && + cb.window_seconds == 10 && + cb.permitted_half_open_calls == 5 && + cb.base_open_duration_ms == 5000 && + cb.max_open_duration_ms == 60000 && + cb.max_ejection_percent_per_host_set == 50 && + cb.retry_budget_percent == 20 && + cb.retry_budget_min_concurrency == 3; + TestFramework::RecordTest("Circuit Breaker Defaults", pass, + pass ? "" : "default value mismatch", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker Defaults", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 10: Circuit breaker JSON parsing (populated block) + void TestCircuitBreakerJsonParse() { + std::cout << "\n[TEST] Circuit Breaker JSON Parse..." << std::endl; + try { + std::string json = R"({ + "upstreams": [{ + "name": "svc", + "host": "10.0.0.1", + "port": 8080, + "circuit_breaker": { + "enabled": true, + "dry_run": true, + "consecutive_failure_threshold": 7, + "failure_rate_threshold": 75, + "minimum_volume": 50, + "window_seconds": 30, + "permitted_half_open_calls": 3, + "base_open_duration_ms": 2000, + "max_open_duration_ms": 120000, + "max_ejection_percent_per_host_set": 33, + "retry_budget_percent": 10, + "retry_budget_min_concurrency": 5 + } + }] + })"; + ServerConfig config = ConfigLoader::LoadFromString(json); + const auto& cb = config.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && cb.dry_run == true && + cb.consecutive_failure_threshold == 7 && + cb.failure_rate_threshold == 75 && + cb.minimum_volume == 50 && + cb.window_seconds == 30 && + cb.permitted_half_open_calls == 3 && + cb.base_open_duration_ms == 2000 && + cb.max_open_duration_ms == 120000 && + cb.max_ejection_percent_per_host_set == 33 && + 
cb.retry_budget_percent == 10 && + cb.retry_budget_min_concurrency == 5; + TestFramework::RecordTest("Circuit Breaker JSON Parse", pass, + pass ? "" : "parsed values mismatch", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Parse", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 11: Circuit breaker JSON partial block uses defaults for missing fields + void TestCircuitBreakerJsonPartial() { + std::cout << "\n[TEST] Circuit Breaker JSON Partial..." << std::endl; + try { + std::string json = R"({ + "upstreams": [{ + "name": "svc", "host": "10.0.0.1", "port": 8080, + "circuit_breaker": {"enabled": true} + }] + })"; + ServerConfig config = ConfigLoader::LoadFromString(json); + const auto& cb = config.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && + cb.consecutive_failure_threshold == 5 && + cb.window_seconds == 10; + TestFramework::RecordTest("Circuit Breaker JSON Partial", pass, + pass ? "" : "expected defaults for unset fields", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Partial", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 12: Round-trip via ToJson() preserves circuit_breaker + void TestCircuitBreakerJsonRoundTrip() { + std::cout << "\n[TEST] Circuit Breaker JSON Round-Trip..." 
<< std::endl; + try { + ServerConfig in; + UpstreamConfig u; + u.name = "svc"; u.host = "10.0.0.1"; u.port = 8080; + u.circuit_breaker.enabled = true; + u.circuit_breaker.window_seconds = 25; + u.circuit_breaker.failure_rate_threshold = 42; + in.upstreams.push_back(u); + + std::string serialized = ConfigLoader::ToJson(in); + ServerConfig out = ConfigLoader::LoadFromString(serialized); + + const auto& cb = out.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && cb.window_seconds == 25 && + cb.failure_rate_threshold == 42; + TestFramework::RecordTest("Circuit Breaker JSON Round-Trip", pass, + pass ? "" : "round-trip lost fields", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Round-Trip", false, + e.what(), TestFramework::TestCategory::OTHER); + } + } + + // Helper: assert a circuit_breaker JSON override is rejected by Validate(). + static void ExpectValidationFailure(const std::string& name, + const std::string& cb_json_override, + const std::string& expected_substr) { + std::string json = std::string(R"({ + "upstreams": [{ + "name": "svc", "host": "10.0.0.1", "port": 8080, + "circuit_breaker": )") + cb_json_override + R"( + }] + })"; + try { + ServerConfig config = ConfigLoader::LoadFromString(json); + ConfigLoader::Validate(config); + TestFramework::RecordTest(name, false, + "expected validation failure containing: " + expected_substr, + TestFramework::TestCategory::OTHER); + } catch (const std::invalid_argument& e) { + std::string msg(e.what()); + bool pass = msg.find(expected_substr) != std::string::npos; + TestFramework::RecordTest(name, pass, + pass ? 
"" : std::string("wrong error: ") + msg, + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest(name, false, + std::string("wrong exception type: ") + e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 13: Validation rejects bad circuit_breaker fields + void TestCircuitBreakerValidation() { + std::cout << "\n[TEST] Circuit Breaker Validation..." << std::endl; + ExpectValidationFailure("CB Validation: consecutive_failure_threshold<1", + R"({"consecutive_failure_threshold": 0})", + "consecutive_failure_threshold must be in [1, 10000]"); + ExpectValidationFailure("CB Validation: failure_rate_threshold>100", + R"({"failure_rate_threshold": 101})", + "failure_rate_threshold must be in [0, 100]"); + ExpectValidationFailure("CB Validation: minimum_volume<1", + R"({"minimum_volume": 0})", + "minimum_volume must be in [1, 10000000]"); + ExpectValidationFailure("CB Validation: window_seconds<1", + R"({"window_seconds": 0})", + "window_seconds must be in [1, 3600]"); + ExpectValidationFailure("CB Validation: window_seconds>3600", + R"({"window_seconds": 3601})", + "window_seconds must be in [1, 3600]"); + ExpectValidationFailure("CB Validation: base_open_duration_ms<100", + R"({"base_open_duration_ms": 50})", + "base_open_duration_ms must be >= 100"); + ExpectValidationFailure("CB Validation: max= base_open_duration_ms"); + ExpectValidationFailure("CB Validation: retry_budget_percent>100", + R"({"retry_budget_percent": 200})", + "retry_budget_percent must be in [0, 100]"); + ExpectValidationFailure("CB Validation: retry_budget_min_concurrency<0", + R"({"retry_budget_min_concurrency": -1})", + "retry_budget_min_concurrency must be >= 0"); + ExpectValidationFailure("CB Validation: max_ejection_percent>100", + R"({"max_ejection_percent_per_host_set": 150})", + "max_ejection_percent_per_host_set must be in [0, 100]"); + ExpectValidationFailure("CB Validation: permitted_half_open_calls<1", + 
R"({"permitted_half_open_calls": 0})", + "permitted_half_open_calls must be in [1, 1000]"); + // Upper-bound regressions — pathological configs must be rejected. + ExpectValidationFailure("CB Validation: consecutive_failure_threshold>10000", + R"({"consecutive_failure_threshold": 10001})", + "consecutive_failure_threshold must be in [1, 10000]"); + ExpectValidationFailure("CB Validation: minimum_volume>10000000", + R"({"minimum_volume": 10000001})", + "minimum_volume must be in [1, 10000000]"); + ExpectValidationFailure("CB Validation: permitted_half_open_calls>1000", + R"({"permitted_half_open_calls": 1001})", + "permitted_half_open_calls must be in [1, 1000]"); + // Type-strictness guards: nlohmann's value() silently coerces + // float/bool to int (1.9 → 1, true → 1). Rejecting at parse time is + // safer than letting malformed configs pass Validate() and change + // production breaker behavior. + ExpectValidationFailure("CB Validation: float rejected for int field", + R"({"window_seconds": 1.9})", + "circuit_breaker.window_seconds must be an integer"); + ExpectValidationFailure("CB Validation: bool rejected for int field", + R"({"consecutive_failure_threshold": true})", + "circuit_breaker.consecutive_failure_threshold must be an integer"); + ExpectValidationFailure("CB Validation: int rejected for bool field", + R"({"enabled": 1})", + "circuit_breaker.enabled must be a boolean"); + } + + // UpstreamConfig::operator== EXCLUDES circuit_breaker. + // CircuitBreakerManager::Reload is wired in HttpServer::Reload, so a + // CB-only SIGHUP is a clean hot reload. Excluding circuit_breaker from + // the equality check ensures the outer reload doesn't fire a spurious + // "restart required" warning on a pure CB-fields edit. + // Topology fields (name, host, port, tls, pool, proxy) remain + // restart-only and must still trigger inequality. 
+ void TestCircuitBreakerEquality() { + std::cout << "\n[TEST] Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)..." << std::endl; + try { + UpstreamConfig a; + a.name = "svc"; a.host = "h"; a.port = 80; + UpstreamConfig b = a; + + // Default equal. + bool equal_default = (a == b); + + // Circuit-breaker-only edit must NOT break equality — breaker + // fields are live-reloadable via CircuitBreakerManager::Reload. + b.circuit_breaker.enabled = true; + b.circuit_breaker.window_seconds = 30; + bool cb_edit_invisible = (a == b); + + // CircuitBreakerConfig::operator== still detects the field diff + // (CircuitBreakerManager::Reload relies on this inner comparison). + bool cb_fields_differ = (a.circuit_breaker != b.circuit_breaker); + + // Topology changes still make configs unequal. + UpstreamConfig c = a; + c.host = "different"; + bool topology_changed = (a != c); + + UpstreamConfig d = a; + d.port = 9999; + bool port_change_detected = (a != d); + + bool pass = equal_default && cb_edit_invisible && + cb_fields_differ && topology_changed && + port_change_detected; + TestFramework::RecordTest("Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)", + pass, + pass ? 
"" : + "equal_default=" + std::to_string(equal_default) + + " cb_edit_invisible=" + std::to_string(cb_edit_invisible) + + " cb_fields_differ=" + std::to_string(cb_fields_differ) + + " topology_changed=" + std::to_string(topology_changed) + + " port_change_detected=" + std::to_string(port_change_detected), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)", + false, e.what(), TestFramework::TestCategory::OTHER); + } + } + // Run all config tests void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; @@ -362,6 +630,14 @@ namespace ConfigTests { TestValidationTlsNoCert(); TestEnvOverrides(); TestMissingFile(); + + // Circuit breaker config tests + TestCircuitBreakerDefaults(); + TestCircuitBreakerJsonParse(); + TestCircuitBreakerJsonPartial(); + TestCircuitBreakerJsonRoundTrip(); + TestCircuitBreakerValidation(); + TestCircuitBreakerEquality(); } } // namespace ConfigTests diff --git a/test/run_test.cc b/test/run_test.cc index 4edb0139..0419c6ee 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -13,6 +13,13 @@ #include "upstream_pool_test.h" #include "proxy_test.h" #include "rate_limit_test.h" +#include "circuit_breaker_test.h" +#include "circuit_breaker_components_test.h" +#include "circuit_breaker_integration_test.h" +#include "circuit_breaker_retry_budget_test.h" +#include "circuit_breaker_wait_queue_drain_test.h" +#include "circuit_breaker_observability_test.h" +#include "circuit_breaker_reload_test.h" #include "test_framework.h" #include #include @@ -77,6 +84,28 @@ void RunAllTest(){ // Run rate limit tests RateLimitTests::RunAllTests(); + // Run circuit breaker tests + CircuitBreakerTests::RunAllTests(); + + // Run circuit-breaker component unit tests (RetryBudget / Host / Manager) + CircuitBreakerComponentsTests::RunAllTests(); + + // Run circuit-breaker integration tests (end-to-end through + // 
ProxyTransaction + UpstreamManager + HttpServer) + CircuitBreakerIntegrationTests::RunAllTests(); + + // Run circuit-breaker retry-budget integration tests + CircuitBreakerRetryBudgetTests::RunAllTests(); + + // Run circuit-breaker wait-queue-drain-on-trip tests + CircuitBreakerWaitQueueDrainTests::RunAllTests(); + + // Run circuit-breaker observability tests + CircuitBreakerObservabilityTests::RunAllTests(); + + // Run circuit-breaker hot-reload tests + CircuitBreakerReloadTests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -155,6 +184,15 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); + // Run circuit-breaker tests (unit + components + integration + retry-budget + drain + observability + reload) + }else if(mode == "circuit_breaker" || mode == "-B"){ + CircuitBreakerTests::RunAllTests(); + CircuitBreakerComponentsTests::RunAllTests(); + CircuitBreakerIntegrationTests::RunAllTests(); + CircuitBreakerRetryBudgetTests::RunAllTests(); + CircuitBreakerWaitQueueDrainTests::RunAllTests(); + CircuitBreakerObservabilityTests::RunAllTests(); + CircuitBreakerReloadTests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]);